Commit ccbdfdc4 authored by Alain Takoudjou's avatar Alain Takoudjou

Initial crawler script for mynij

parent 1e7cd1fb
var fs = require("fs");
var args = require("yargs")
.usage("Usage : crawler.js -link website_link -f file_name")
.demandOption(['link'])
.demandOption(['file'])
.alias("f", "file")
.describe("file", "name of the file the results will be saved into")
.alias("l", "link")
.describe("link", "link to the website to be made into a site map")
.alias("d", "depth")
.describe("depth", "crawling depth")
.nargs("link", 1)
.nargs("file", 1)
.nargs("depth", 1)
.argv;
fs.open(args.file, 'w', function(err, file){
if (err) throw err;
}
);
var depth = 3,
count = 1,
link = args.link,
builder = require("xmlbuilder"),
readline = require('readline'),
SCrawler = require("simplecrawler"),
url_list = [],
crawler = new SCrawler(link);
if (args.depth) depth = args.depth;
crawler.interval = 250;
crawler.maxConcurrency = 5;
crawler.maxDepth = depth;
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
var mime = response.headers["content-type"];
if (mime.startsWith("text/html") || mime.startsWith("text/plain")) {
readline.cursorTo(process.stdout, 0);
process.stdout.write(count + "");
count+=1;
url_list.push({
"loc": queueItem.url
});
}
});
// Fire callback
crawler.on("complete", function() {
readline.cursorTo(process.stdout, 0);
console.log("crawled " + url_list.length + " urls");
var xml_dict = {
"urlset": {
"@xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9",
"url": url_list
}
};
var xml = builder.create(xml_dict, { encoding: 'utf-8' });
fs.appendFile(args.file, xml.end({ pretty: true }), function(err){if(err) throw err;});
});
crawler.on("fetcherror", function(queueItem, response) {
console.log("");
console.log("Error " + response.statusCode + " while fetching " + queueItem.url);
});
// Start Crawl
crawler.start();
{
"name": "jscrawler",
"version": "1.0.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
"ansi-regex": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz",
"integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg=="
},
"ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
"requires": {
"color-convert": "^2.0.1"
}
},
"async": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/async/-/async-3.2.0.tgz",
"integrity": "sha512-TR2mEZFVOj2pLStYxLht7TyfuRzaydfpxr3k9RpHIzMgw7A64dzsdqCxH1WJyQdoe8T10nDXd9wnEigmiuHIZw=="
},
"cliui": {
"version": "7.0.4",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz",
"integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==",
"requires": {
"string-width": "^4.2.0",
"strip-ansi": "^6.0.0",
"wrap-ansi": "^7.0.0"
}
},
"color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
"requires": {
"color-name": "~1.1.4"
}
},
"color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA=="
},
"emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A=="
},
"escalade": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz",
"integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw=="
},
"get-caller-file": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg=="
},
"iconv-lite": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.5.2.tgz",
"integrity": "sha512-kERHXvpSaB4aU3eANwidg79K8FlrN77m8G9V+0vOR3HYaRifrlwMEpT7ZBJqLSEIHnEgJTHcWK82wwLwwKwtag==",
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
},
"is-fullwidth-code-point": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="
},
"require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I="
},
"robots-parser": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/robots-parser/-/robots-parser-2.1.1.tgz",
"integrity": "sha512-6yWEYSdhK3bAEcYY0In3wgSBK70BiQoJArzdjZKCP/35b3gKIYu5Lc0qQqsoxjoLVebVoJiKK4VWGc5+oxvWBQ=="
},
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
},
"simplecrawler": {
"version": "1.1.9",
"resolved": "https://registry.npmjs.org/simplecrawler/-/simplecrawler-1.1.9.tgz",
"integrity": "sha512-IY5YmeJWvfc1zpy9so1p/EknCqNum3Y9tmnzuLWZqKEwbntGXPGvN0SOtr+XqT4BHjfek2C12g3Tg1yK7Hoh8g==",
"requires": {
"async": "^3.1.0",
"iconv-lite": "^0.5.0",
"robots-parser": "^2.1.1",
"urijs": "^1.19.1"
}
},
"string-width": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.0.tgz",
"integrity": "sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==",
"requires": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
"strip-ansi": "^6.0.0"
}
},
"strip-ansi": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz",
"integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==",
"requires": {
"ansi-regex": "^5.0.0"
}
},
"urijs": {
"version": "1.19.5",
"resolved": "https://registry.npmjs.org/urijs/-/urijs-1.19.5.tgz",
"integrity": "sha512-48z9VGWwdCV5KfizHsE05DWS5fhK6gFlx5MjO7xu0Krc5FGPWzjlXEVV0nPMrdVuP7xmMHiPZ2HoYZwKOFTZOg=="
},
"wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
"requires": {
"ansi-styles": "^4.0.0",
"string-width": "^4.1.0",
"strip-ansi": "^6.0.0"
}
},
"xmlbuilder": {
"version": "15.1.1",
"resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-15.1.1.tgz",
"integrity": "sha512-yMqGBqtXyeN1e3TGYvgNgDVZ3j84W4cwkOXQswghol6APgZWaff9lnbvN7MHYJOiXsvGPXtjTYJEiC9J2wv9Eg=="
},
"y18n": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.5.tgz",
"integrity": "sha512-hsRUr4FFrvhhRH12wOdfs38Gy7k2FFzB9qgN9v3aLykRq0dRcdcpz5C9FxdS2NuhOrI/628b/KSTJ3rwHysYSg=="
},
"yargs": {
"version": "16.2.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz",
"integrity": "sha512-D1mvvtDG0L5ft/jGWkLpG1+m0eQxOfaBvTNELraWj22wSVUMWxZUvYgJYcKh6jGGIkJFhH4IZPQhR4TKpc8mBw==",
"requires": {
"cliui": "^7.0.2",
"escalade": "^3.1.1",
"get-caller-file": "^2.0.5",
"require-directory": "^2.1.1",
"string-width": "^4.2.0",
"y18n": "^5.0.5",
"yargs-parser": "^20.2.2"
}
},
"yargs-parser": {
"version": "20.2.4",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.4.tgz",
"integrity": "sha512-WOkpgNhPTlE73h4VFAFsOnomJVaovO8VqLDzy5saChRBFQFBoMYirowyW+Q9HB4HFF4Z7VZTiG3iSzJJA29yRA=="
}
}
}
{
"name": "jscrawler",
"version": "1.0.0",
"description": "",
"main": "crawler.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"simplecrawler": "^1.1.9",
"xmlbuilder": "^15.1.1",
"yargs": "^16.2.0"
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment