diff --git a/crawler.js b/crawler.js
new file mode 100644
index 0000000..afe9915
--- /dev/null
+++ b/crawler.js
@@ -0,0 +1,50 @@
+var request = require('request');
+var cheerio = require('cheerio');
+var URL = require('url-parse');
+
+// Entry page for the crawl.
+var pageToVisit = "https://beck-online.beck.de/?vpath=bibdata/komm/BeckOKGG_44/cont/BECKOKGG.glUD1.htm";
+console.log("Visiting page " + pageToVisit);
+request(pageToVisit, function(error, response, body) {
+  if(error) {
+    console.log("Error: " + error);
+    // On a failed request `response` is undefined, so stop here instead of
+    // crashing on `response.statusCode` below.
+    return;
+  }
+  // Check status code (200 is HTTP OK)
+  console.log("Status code: " + response.statusCode);
+  if(response.statusCode === 200) {
+    // Parse the document body
+    var $ = cheerio.load(body);
+    // NOTE: despite the "Page title" label, this logs the href of #dk2next.
+    console.log("Page title: " + $('#dk2next').attr('href'));
+    collectInternalLinks($);
+  }
+});
+
+/**
+ * Count the relative and absolute links on a parsed page and log both totals.
+ *
+ * @param {Function} $ - cheerio instance returned by `cheerio.load`.
+ */
+function collectInternalLinks($) {
+  var allRelativeLinks = [];
+  var allAbsoluteLinks = [];
+
+  // Anchors whose href starts with "/" are counted as relative links.
+  var relativeLinks = $("a[href^='/']");
+  relativeLinks.each(function() {
+    allRelativeLinks.push($(this).attr('href'));
+  });
+
+  // Anchors whose href starts with "http" are counted as absolute links.
+  var absoluteLinks = $("a[href^='http']");
+  absoluteLinks.each(function() {
+    allAbsoluteLinks.push($(this).attr('href'));
+  });
+
+  console.log("Found " + allRelativeLinks.length + " relative links");
+  console.log("Found " + allAbsoluteLinks.length + " absolute links");
+}
+
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..81063cb
--- /dev/null
+++ b/package.json
@@ -0,0 +1,13 @@
+{
+  "name": "simple-webcrawler-javascript",
+  "version": "0.0.0",
+  "description": "A simple webcrawler written in JavaScript to learn it.",
+  "main": "crawler.js",
+  "author": "Stephen",
+  "license": "ISC",
+  "dependencies": {
+    "cheerio": "^0.19.0",
+    "url-parse": "^1.0.5",
+    "request": "^2.65.0"
+  }
+}