var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var fs = require('fs');

// Set to 1 as soon as the crawl must not continue (no next link, request
// error, or unexpected HTTP status).
var plsStop = 0;
// Map of page URL (string) -> true for every page that has been requested.
var pagesVisited = {};
// Request options for the print version of every page crawled so far, in
// crawl order. Consumed by savePage().
var printOptions = [];

// Set a default starting page (which is the title of Grundgesetz for
// arbitrary reasons) but take input from the CLI.
var argv = process.argv.slice(2);
var START_URL = "https://beck-online.beck.de/?vpath=bibdata/komm/BeckOKGG_44/cont/BECKOKGG.glUD1.htm";
if (argv[0]) {
  START_URL = argv[0];
}
console.log(START_URL);

var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

// Next page to request: either a plain URL string (the start page) or a
// request options object ({ url, headers }) for every following page.
var pageToVisit = START_URL;

// NOTE(review): savePage() is not called anywhere in the code visible here,
// so the collected printOptions are never downloaded. Wire it up once the
// print URL below has been confirmed.
crawl();

// function declarations:

/**
 * Returns the URL string of a page reference.
 *
 * @param {string|Object} page - A URL string or a request options object
 *   with a `url` property.
 * @returns {string} The URL used to identify the page in pagesVisited.
 */
function pageKey(page) {
  return typeof page === 'string' ? page : page.url;
}

/**
 * Visits `pageToVisit` unless the crawl was stopped or the page has already
 * been requested. visitPage() calls this function again after each page.
 */
function crawl() {
  if (plsStop === 1) {
    return;
  }
  // Compare by URL string: pageToVisit is an object after the first page and
  // would otherwise always collapse to the key "[object Object]".
  if (Object.prototype.hasOwnProperty.call(pagesVisited, pageKey(pageToVisit))) {
    // The "next" link points at a page we already requested; following it
    // again would loop forever, so the crawl ends here.
    return;
  }
  // New page we haven't visited
  visitPage(pageToVisit, crawl);
}

/**
 * Requests one page, queues the request options for its print version and
 * schedules the page behind the `#next` link.
 *
 * @param {string|Object} url - URL string or request options object of the
 *   page to fetch.
 * @param {Function} callback - Invoked (without arguments) after
 *   `pageToVisit` has been updated to the next page.
 */
function visitPage(url, callback) {
  var pageUrl = pageKey(url);
  console.log("Visiting page " + pageUrl);
  pagesVisited[pageUrl] = true;

  request(url, function(error, response, body) {
    if (error) {
      // `response` is undefined on a transport error, so stop here instead
      // of dereferencing it below.
      console.log("Error: " + error);
      plsStop = 1;
      return;
    }

    // Check status code (200 is HTTP OK)
    console.log("Status code: " + response.statusCode);
    if (response.statusCode !== 200) {
      // Nothing to parse; without a next link the crawl cannot continue.
      plsStop = 1;
      return;
    }

    // Parse the document body
    var $ = cheerio.load(body);
    console.log("Page title: " + $('title').text());

    // The token read from the page is sent back as a header on every
    // follow-up request.
    var token = $("[name='__RequestVerificationToken']").attr('value');
    console.log(token);

    // Queue the download of the print version.
    // NOTE(review): this appends the complete page URL to "/Print/CurrentDoc"
    // exactly as the original code did for the start page; the endpoint
    // possibly expects only the "?vpath=..." query part - confirm against
    // the server before relying on savePage().
    printOptions.push({
      url: baseUrl + "/Print/CurrentDoc" + pageUrl + "&printdialogmode=ParentChapter&actionname=&gesamtversionpath=&exportFormat=print",
      headers: {
        "__RequestVerificationToken": token
      }
    });

    // prepare next page
    var nextPage = $('#next').attr('href');
    console.log("Next up: " + nextPage);
    if (nextPage) {
      pageToVisit = {
        url: baseUrl + nextPage,
        headers: {
          "__RequestVerificationToken": token
        }
      };
      callback();
    } else {
      plsStop = 1;
    }
  });
}

/**
 * Downloads every queued print page, one after another in crawl order, and
 * appends the string form of its `body` selection to `beckOK.html`.
 *
 * Pages that fail to download or do not answer with HTTP 200 are skipped.
 *
 * @param {Function} [callback] - Invoked (without arguments) once the queue
 *   is empty.
 */
function savePage(callback) {
  if (printOptions.length === 0) {
    if (typeof callback === 'function') {
      callback();
    }
    return;
  }

  // Take from the front so the output file keeps the crawl order.
  var options = printOptions.shift();
  request(options, function(error, response, body) {
    if (error) {
      console.log("Error: " + error);
    } else if (response.statusCode === 200) {
      // Parse the document body
      var $ = cheerio.load(body);
      console.log("Page title: " + $('title').text());
      // String concatenation relies on cheerio's string conversion of the
      // selection, as in the original code.
      fs.appendFileSync('beckOK.html', $('body') + '\n');
    }
    // Continue with the next queued page only after this one is written.
    savePage(callback);
  });
}