Compare commits
2 Commits
94e20da74d
...
61bbd5cd1f
Author | SHA1 | Date |
---|---|---|
Wurzelkoch | 61bbd5cd1f | |
Wurzelkoch | 2fddbd00f3 | |
13
crawler.js
13
crawler.js
|
@@ -1,6 +1,7 @@
|
|||
var request = require('request');
|
||||
var cheerio = require('cheerio');
|
||||
var URL = require('url-parse');
|
||||
var fs = require('fs');
|
||||
|
||||
var plsStop = 0;
|
||||
var pagesVisited = {};
|
||||
|
@@ -21,7 +22,6 @@ crawl();
|
|||
// function declarations:
|
||||
|
||||
function crawl() {
|
||||
console.log(plsStop);
|
||||
if (plsStop === 1) {
|
||||
crawl();
|
||||
}
|
||||
|
@@ -48,7 +48,16 @@ console.log("Visiting page " + url);
|
|||
// Parse the document body
|
||||
var $ = cheerio.load(body);
|
||||
console.log("Page title: " + $('title').text());
|
||||
// savePage($,crawl);
|
||||
|
||||
// get real document
|
||||
var token = $("[name='__RequestVerificationToken']").attr('value');
|
||||
console.log(token);
|
||||
|
||||
// save page
|
||||
innerhtml = $('div.kapitel');
|
||||
// console.log("Content: " + innerhtml);
|
||||
fs.appendFileSync('beckOK.html', innerhtml + '\n');
|
||||
|
||||
// prepare next page
|
||||
var nextPage = $('#next').attr('href');
|
||||
console.log("Next up: " + nextPage);
|
||||
|
|
|
@@ -1,9 +1,9 @@
|
|||
{
|
||||
"name": "simple-webcrawler-javascript",
|
||||
"name": "BeckOK-scrawler-javascript",
|
||||
"version": "0.0.0",
|
||||
"description": "A simple webcrawler written in JavaScript to learn it.",
|
||||
"description": "A webcrawler written in JavaScript to get BeckOK law books.",
|
||||
"main": "crawler.js",
|
||||
"author": "Stephen",
|
||||
"author": "Gandalf",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"cheerio": "^0.19.0",
|
||||
|
|
Loading…
Reference in New Issue