Compare commits
2 commits
61bbd5cd1f
...
e55cacd224
Author | SHA1 | Date | |
---|---|---|---|
e55cacd224 | |||
4e0e931839 |
36
crawler.js
36
crawler.js
|
@ -5,6 +5,7 @@ var fs = require('fs');
|
||||||
|
|
||||||
var plsStop = 0;
|
var plsStop = 0;
|
||||||
var pagesVisited = {};
|
var pagesVisited = {};
|
||||||
|
var printOptions = [];
|
||||||
|
|
||||||
// set a default starting page (which is the title of Grundgesetz for arbitrary reasons) but take input from CLI
|
// set a default starting page (which is the title of Grundgesetz for arbitrary reasons) but take input from CLI
|
||||||
var argv = process.argv.slice(2);
|
var argv = process.argv.slice(2);
|
||||||
|
@ -23,11 +24,11 @@ crawl();
|
||||||
|
|
||||||
function crawl() {
|
function crawl() {
|
||||||
if (plsStop === 1) {
|
if (plsStop === 1) {
|
||||||
crawl();
|
return;
|
||||||
}
|
}
|
||||||
if (pageToVisit in pagesVisited) {
|
if (pageToVisit in pagesVisited) {
|
||||||
// We've already visited this page, so repeat the crawl
|
// We've already visited this page, so repeat the crawl
|
||||||
crawl();
|
//continue;
|
||||||
} else {
|
} else {
|
||||||
// New page we haven't visited
|
// New page we haven't visited
|
||||||
visitPage(pageToVisit, crawl);
|
visitPage(pageToVisit, crawl);
|
||||||
|
@ -55,14 +56,25 @@ console.log("Visiting page " + url);
|
||||||
|
|
||||||
// save page
|
// save page
|
||||||
innerhtml = $('div.kapitel');
|
innerhtml = $('div.kapitel');
|
||||||
// console.log("Content: " + innerhtml);
|
// download
|
||||||
fs.appendFileSync('beckOK.html', innerhtml + '\n');
|
var options = {
|
||||||
|
url: baseUrl + "/Print/CurrentDoc" + url + "&printdialogmode=ParentChapter&actionname=&gesamtversionpath=&exportFormat=print",
|
||||||
|
headers: {
|
||||||
|
"__RequestVerificationToken": token
|
||||||
|
}
|
||||||
|
};
|
||||||
|
printOptions.push(options);
|
||||||
|
|
||||||
// prepare next page
|
// prepare next page
|
||||||
var nextPage = $('#next').attr('href');
|
var nextPage = $('#next').attr('href');
|
||||||
console.log("Next up: " + nextPage);
|
console.log("Next up: " + nextPage);
|
||||||
if (nextPage){
|
if (nextPage){
|
||||||
pageToVisit = baseUrl + nextPage;
|
pageToVisit = {
|
||||||
|
url: baseUrl + nextPage,
|
||||||
|
headers: {
|
||||||
|
"__RequestVerificationToken": token
|
||||||
|
}
|
||||||
|
};
|
||||||
callback();
|
callback();
|
||||||
} else {
|
} else {
|
||||||
plsStop = 1;
|
plsStop = 1;
|
||||||
|
@ -71,8 +83,18 @@ console.log("Visiting page " + url);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function savePage($,callback){
|
function savePage(callback){
|
||||||
console.log($);
|
while(printOptions)
|
||||||
|
var options = printOptions.pop();
|
||||||
|
request(options, function(error, response, body){
|
||||||
|
if(response.statusCode === 200) {
|
||||||
|
// Parse the document body
|
||||||
|
var $ = cheerio.load(body);
|
||||||
|
console.log("Page title: " + $('title').text());
|
||||||
|
innerhtml = $('body');
|
||||||
|
fs.appendFileSync('beckOK.html', innerhtml + '\n');
|
||||||
|
};
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue