// 101 lines | 2.5 KiB | JavaScript
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var fs = require('fs');

// Crawl state shared by the functions below.
var plsStop = 0;        // set to 1 by visitPage once a page has no "#next" link
var pagesVisited = {};  // keys are the pages that have already been requested
var printOptions = [];  // request option objects for the print downloads, filled by visitPage

// Set a default starting page (the title page of the Grundgesetz commentary,
// chosen arbitrarily), but let the first CLI argument override it.
var argv = process.argv.slice(2);
var START_URL = "https://beck-online.beck.de/?vpath=bibdata/komm/BeckOKGG_44/cont/BECKOKGG.glUD1.htm";
if (argv[0]){
  START_URL = argv[0];
}
console.log(START_URL);

// Scheme + host of the start page; relative links found on pages are appended to it.
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

// Declared explicitly: a bare assignment would create an implicit global and
// throws a ReferenceError as soon as the file runs in strict mode.
var pageToVisit = START_URL;
crawl();
// Function declarations (hoisted, so the crawl() call above can use them):
/**
 * Starts (or continues) the crawl with the page currently held in the shared
 * `pageToVisit` variable.
 *
 * Does nothing when `plsStop` is 1 or when the page is already recorded in
 * `pagesVisited`; otherwise hands the page to visitPage with crawl itself as
 * the continuation.
 */
function crawl() {
  if (plsStop === 1) {
    return;
  }

  // pageToVisit is a URL string for the first page and a request options
  // object ({url, headers}) afterwards. Using the object itself as a lookup
  // key would coerce it to "[object Object]" and make every later page look
  // already visited, so the lookup always uses the URL string.
  var pageUrl = (pageToVisit && typeof pageToVisit === 'object') ? pageToVisit.url : pageToVisit;

  // Own-property check so inherited names such as "constructor" never match.
  if (Object.prototype.hasOwnProperty.call(pagesVisited, pageUrl)) {
    // Already visited: stop here instead of requesting the page again.
    return;
  }

  // New page we haven't visited
  visitPage(pageToVisit, crawl);
}
|
|
|
|
|
|
/**
 * Requests one page, queues the options for its print download in
 * `printOptions`, and prepares the next page of the crawl.
 *
 * When the page has a "#next" link, `pageToVisit` is set to a request options
 * object for it and `callback` is invoked; otherwise `plsStop` is set to 1.
 * On a request error or a non-200 status nothing further happens, which ends
 * the crawl.
 *
 * @param {string|Object} url - URL string, or a `request` options object with
 *   a `url` property (as produced by this function for follow-up pages).
 * @param {Function} callback - called without arguments once `pageToVisit`
 *   points at the next page.
 */
function visitPage(url, callback){
  // The target may be an options object; everything that needs text (log
  // output, visited key, print URL) must use its URL string, otherwise it
  // degrades to "[object Object]".
  var pageUrl = (url && typeof url === 'object') ? url.url : url;

  console.log("Visiting page " + pageUrl);
  pagesVisited[pageUrl] = true;

  request(url, function(error, response, body) {
    if(error) {
      console.log("Error: " + error);
      // No response object exists on a failed request, so reading
      // response.statusCode below would throw.
      return;
    }

    // Check status code (200 is HTTP OK)
    console.log("Status code: " + response.statusCode);
    if(response.statusCode !== 200) {
      return;
    }

    // Parse the document body
    var $ = cheerio.load(body);
    console.log("Page title: " + $('title').text());

    // The verification token embedded in the page is sent along as a header
    // with the print request and with the request for the next page.
    var token = $("[name='__RequestVerificationToken']").attr('value');
    console.log(token);

    // Queue the print download. Only the page's query string ("?vpath=...")
    // is appended to the print endpoint; appending the whole page URL would
    // produce an address with the host embedded in the path.
    // NOTE(review): assumes the print endpoint takes the same query string as
    // the page itself - confirm against the site.
    var withoutHash = String(pageUrl).split('#')[0];
    var queryStart = withoutHash.indexOf('?');
    var pageQuery = queryStart === -1 ? '?' : withoutHash.slice(queryStart);
    var options = {
      url: baseUrl + "/Print/CurrentDoc" + pageQuery + "&printdialogmode=ParentChapter&actionname=&gesamtversionpath=&exportFormat=print",
      headers: {
        "__RequestVerificationToken": token
      }
    };
    printOptions.push(options);

    // prepare next page
    var nextPage = $('#next').attr('href');
    console.log("Next up: " + nextPage);
    if (nextPage){
      pageToVisit = {
        url: baseUrl + nextPage,
        headers: {
          "__RequestVerificationToken": token
        }
      };
      callback();
    } else {
      // Last page reached: tell crawl() to stop.
      plsStop = 1;
    }
  });
}
|
|
|
|
/**
 * Downloads every queued print request in `printOptions` and appends the
 * inner HTML of each response's <body> to 'beckOK.html'.
 *
 * The queue is drained from the front, one request at a time, so the file
 * receives the pages in the order they were queued. Failed or non-200
 * requests are logged and skipped.
 *
 * NOTE(review): no call to savePage is visible in this file - confirm where
 * it is meant to be triggered (e.g. once the crawl has finished).
 *
 * @param {Function} [callback] - called without arguments after the queue has
 *   been fully processed.
 */
function savePage(callback){
  // Processes the first queued entry, then re-enters itself from the request
  // callback until the queue is empty.
  function saveNext() {
    // An array is always truthy, so emptiness has to be tested via length.
    if (printOptions.length === 0) {
      if (typeof callback === 'function') {
        callback();
      }
      return;
    }

    var options = printOptions.shift();
    request(options, function(error, response, body){
      if (error) {
        // No response object exists on a failed request.
        console.log("Error: " + error);
      } else if (response.statusCode === 200) {
        // Parse the document body
        var $ = cheerio.load(body);
        console.log("Page title: " + $('title').text());
        // .html() yields the markup as a string; concatenating the cheerio
        // object itself would write "[object Object]".
        fs.appendFileSync('beckOK.html', ($('body').html() || '') + '\n');
      } else {
        console.log("Status code: " + response.statusCode);
      }
      saveNext();
    });
  }

  saveNext();
}
|
|
|
|
|