Commit d1f914ac authored by Xiaowu Zhang

parameter

parent 507c1beb
...@@ -17,6 +17,10 @@ var args = require("yargs") ...@@ -17,6 +17,10 @@ var args = require("yargs")
.nargs("link", 1) .nargs("link", 1)
.nargs("file", 1) .nargs("file", 1)
.nargs("depth", 1) .nargs("depth", 1)
.nargs("include_html", 1)
.nargs("include_js", 1)
.nargs("include_css", 1)
.nargs("include_header", 1)
.argv; .argv;
fs.open(args.file, 'w', function(err, file){ fs.open(args.file, 'w', function(err, file){
...@@ -26,6 +30,10 @@ fs.open(args.file, 'w', function(err, file){ ...@@ -26,6 +30,10 @@ fs.open(args.file, 'w', function(err, file){
var depth = 3, var depth = 3,
count = 1, count = 1,
include_html = true,
include_js = false,
include_css = false,
include_header = false,
link = args.link, link = args.link,
builder = require("xmlbuilder"), builder = require("xmlbuilder"),
readline = require('readline'), readline = require('readline'),
...@@ -33,8 +41,24 @@ var depth = 3, ...@@ -33,8 +41,24 @@ var depth = 3,
url_list = [], url_list = [],
crawler = new SCrawler(link); crawler = new SCrawler(link);
if (args.depth) depth = args.depth; if (args.depth) {
depth = args.depth;
}
if (args.include_html) {
include_html = (args.include_html === "True");
}
if (args.include_js) {
include_js = (args.include_js === "True");
}
if (args.include_css) {
include_css = (args.include_css === "True");
}
if (args.include_header) {
include_header = (args.include_header === "True");
}
crawler.interval = 250; crawler.interval = 250;
crawler.maxConcurrency = 5; crawler.maxConcurrency = 5;
crawler.maxDepth = depth; crawler.maxDepth = depth;
...@@ -43,13 +67,20 @@ crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) { ...@@ -43,13 +67,20 @@ crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
readline.cursorTo(process.stdout, 0); readline.cursorTo(process.stdout, 0);
process.stdout.write(count + ""); process.stdout.write(count + "");
count+=1; count+=1;
url_list.push({ if (include_header) {
"loc": queueItem.url, url_list.push({
"stateData": queueItem.stateData, "loc": queueItem.url,
"referrer": queueItem.referrer "stateData": queueItem.stateData,
}); "referrer": queueItem.referrer
});
} else {
url_list.push({
"loc": queueItem.url
});
}
}); });
// Fire callback // Fire callback
crawler.on("complete", function() { crawler.on("complete", function() {
readline.cursorTo(process.stdout, 0); readline.cursorTo(process.stdout, 0);
...@@ -67,29 +98,38 @@ crawler.on("complete", function() { ...@@ -67,29 +98,38 @@ crawler.on("complete", function() {
crawler.on("fetcherror", function(queueItem, response) { crawler.on("fetcherror", function(queueItem, response) {
console.log("Error " + response.statusCode + " while fetching " + queueItem.url); console.log("Error " + response.statusCode + " while fetching " + queueItem.url);
url_list.push({ if (include_header) {
"loc": queueItem.url, url_list.push({
"stateData": queueItem.stateData, "loc": queueItem.url,
"referrer": queueItem.referrer "stateData": queueItem.stateData,
}); "referrer": queueItem.referrer
});
}
}); });
crawler.discoverResources = function(buffer, queueItem) { crawler.discoverResources = function(buffer, queueItem) {
var $ = cheerio.load(buffer.toString("utf8")); var $ = cheerio.load(buffer.toString("utf8"));
var link_list = [];
var tag_a = $("a[href]").map(function () { if (include_html) {
link_list = link_list.concat($("a[href]").map(function () {
return $(this).attr("href"); return $(this).attr("href");
}).get(); }).get())
}
var tag_link = $("link[href]").map(function () { if (include_css) {
return $(this).attr("href"); console.log('************************');
}).get(); console.log(include_css);
link_list = link_list.concat($("link[href]").map(function () {
return $(this).attr("href");
}).get());
}
var tag_script = $("script[src]").map(function () { if (include_js) {
return $(this).attr("src"); link_list = link_list.concat($("script[src]").map(function () {
}).get(); return $(this).attr("src");
return tag_a.concat(tag_link).concat(tag_script); }).get())
}
return link_list;
}; };
// Start Crawl // Start Crawl
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment