Something broke
parent
c9cbf2f9b7
commit
0266be624a
88
crawler.js
88
crawler.js
|
@ -2,38 +2,68 @@ var request = require('request');
|
|||
var cheerio = require('cheerio');
|
||||
var URL = require('url-parse');
|
||||
|
||||
var pageToVisit = "http://beck-online.beck.de/?vpath=bibdata/komm/BeckOKGG_44/cont/BECKOKGG.glUD1.htm";
|
||||
console.log("Visiting page " + pageToVisit);
|
||||
request(pageToVisit, function(error, response, body) {
|
||||
if(error) {
|
||||
console.log("Error: " + error);
|
||||
}
|
||||
// Check status code (200 is HTTP OK)
|
||||
console.log("Status code: " + response.statusCode);
|
||||
if(response.statusCode === 200) {
|
||||
// Parse the document body
|
||||
var $ = cheerio.load(body);
|
||||
console.log("Page title: " + $('#dk2next').attr('href'));
|
||||
collectInternalLinks($);
|
||||
}
|
||||
});
|
||||
var plsStop = 0;
|
||||
var pagesVisited = {};
|
||||
|
||||
function collectInternalLinks($) {
|
||||
var allRelativeLinks = [];
|
||||
var allAbsoluteLinks = [];
|
||||
// set a default starting page (which is the title of Grundgesetz for arbitrary reasons) but take input from CLI
|
||||
var argv = process.argv.slice(2);
|
||||
var START_URL = "https://beck-online.beck.de/?vpath=bibdata/komm/BeckOKGG_44/cont/BECKOKGG.glUD1.htm";
|
||||
if (argv[0]){
|
||||
START_URL = argv[0];
|
||||
}
|
||||
console.log(START_URL);
|
||||
var url = new URL(START_URL);
|
||||
var baseUrl = url.protocol + "//" + url.hostname;
|
||||
|
||||
var relativeLinks = $("a[href^='/']");
|
||||
relativeLinks.each(function() {
|
||||
allRelativeLinks.push($(this).attr('href'));
|
||||
pageToVisit = START_URL;
|
||||
crawl();
|
||||
|
||||
});
|
||||
// function declarations:
|
||||
|
||||
var absoluteLinks = $("a[href^='http']");
|
||||
absoluteLinks.each(function() {
|
||||
allAbsoluteLinks.push($(this).attr('href'));
|
||||
});
|
||||
|
||||
console.log("Found " + allRelativeLinks.length + " relative links");
|
||||
console.log("Found " + allAbsoluteLinks.length + " absolute links");
|
||||
/**
 * Run one step of the crawl using the shared `pageToVisit`, `pagesVisited`
 * and `plsStop` variables.
 *
 * Logs the stop flag, then either ends the crawl (flag set, or the current
 * page was already visited) or records the page as visited and hands it to
 * `visitPage`, passing itself as the continuation.
 */
function crawl() {
  console.log(plsStop);
  if (plsStop === 1) {
    // Stop was requested: return. Calling crawl() again here, as before,
    // recursed without changing any state until the call stack overflowed.
    return;
  }
  if (pageToVisit in pagesVisited) {
    // Only a single pending page is tracked (there is no queue to advance),
    // so re-running crawl() could never make progress. End the crawl instead.
    console.log("Already visited " + pageToVisit + ", stopping");
    return;
  }
  // Record the page before requesting it so the visited check above can
  // actually trigger if the crawl is ever pointed back at the same address.
  pagesVisited[pageToVisit] = true;
  visitPage(pageToVisit, crawl);
}
|
||||
|
||||
|
||||
/**
 * Request one page, log its title and its "next" link, and advance the crawl.
 *
 * On a 200 response the element with id `dk2next` is looked up. If it has an
 * `href`, the shared `pageToVisit` is set to `baseUrl + href` and `callback`
 * is invoked. Otherwise the shared `plsStop` flag is set to 1. On a request
 * error or a non-200 status nothing further is scheduled.
 *
 * @param {string} pageUrl - Address of the page to request.
 * @param {Function} callback - Invoked with no arguments after the shared
 *   `pageToVisit` has been advanced to the next page.
 */
function visitPage(pageUrl, callback) {
  console.log("Visiting page " + pageUrl);
  request(pageUrl, function(error, response, body) {
    if (error) {
      console.log("Error: " + error);
      // Bail out: previously execution fell through and read
      // response.statusCode, which throws when the request produced no response.
      return;
    }
    // Check status code (200 is HTTP OK)
    console.log("Status code: " + response.statusCode);
    if (response.statusCode !== 200) {
      return;
    }
    // Parse the document body
    var $ = cheerio.load(body);
    console.log("Page title: " + $('title').text());
    // TODO: hand the parsed page to savePage once it does real work.
    // prepare next page
    console.log("Next up: " + $("#dk2next").text());
    var nextPage = $('#dk2next').attr('href');
    console.log("Next up: " + nextPage);
    if (nextPage) {
      // The parameter is deliberately not named `pageToVisit`: with that name
      // this assignment only changed the local copy, so the continuation kept
      // re-reading the old shared URL and requested the same page again.
      pageToVisit = baseUrl + nextPage;
      callback();
    } else {
      plsStop = 1;
    }
  });
}
|
||||
|
||||
/**
 * Placeholder for persisting a page: currently it only writes its first
 * argument to the console.
 *
 * @param {*} $ - Value to log (named after the cheerio handle used elsewhere
 *   in this file; presumably the parsed page — confirm against the caller).
 * @param {Function} callback - Accepted but not used by this stub.
 */
function savePage($, callback) {
  var parsedPage = $;
  console.log(parsedPage);
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue