forked from vecna/trackmap
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect_included_url.js
99 lines (82 loc) · 3.05 KB
/
collect_included_url.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// This is my first test in PhantomJS. I'm thinking it would be better
// to move to node.js + jsdom to render the JavaScript; I also need to
// dump the content of the HTML+JS+CSS to a file. If you have
// a better approach, open a GitHub issue: THANKS!
//
// Plus, I'm evaluating python-selenium instead of calling a subprocess
// here, but I need something stable that can be deployed on boxes besides
// Linux, and so... the best solution still has to be found.
// PhantomJS modules and script-wide state:
//   page          - the web page object used to load the target URL
//   system        - gives access to the command-line arguments
//   fs            - file-system access used to write the result files
//   landing_title - title of the loaded page, filled in once it has loaded
//   address       - the URL to load (first command-line argument)
var page = require('webpage').create(),
    system = require('system'),
    fs = require('fs'),
    landing_title,
    address;

// Both arguments are mandatory: system.args[1] is the URL and system.args[2]
// is the directory that every result file in this script is written into.
// Checking only for "no arguments at all" would let a call with just a URL
// through and produce paths starting with "undefined/".
if (system.args.length < 3) {
    console.log('Usage: collect_included_url.js <some URL> <a directory>');
    phantom.exit(1);
}
address = system.args[1];
// Remember the most recent resource error on the page object itself:
// the error message in page.reason and the affected URL in page.reason_url.
page.onResourceError = function (failure) {
    page.reason_url = failure.url;
    page.reason = failure.errorString;
};
// Append the URL of every requested resource, one per line, to the
// "__urls" file inside the output directory (system.args[2]).
page.onResourceRequested = function (request) {
    try {
        var urlsFile = system.args[2] + "/__urls";
        var entry = request.url + "\n";
        fs.write(urlsFile, entry, 'a+');
    } catch (ignored) {
        // Best effort: a failed write is silently dropped so that the
        // page load is not interrupted.
    }
};
// Append the full metadata of every received response, pretty-printed as
// JSON with a 4-space indent, to the "__responses" file inside the output
// directory (system.args[2]).
// NOTE(review): no separator is written between consecutive JSON documents;
// confirm that whatever reads this file copes with that.
page.onResourceReceived = function (response) {
    try {
        var responsesFile = system.args[2] + "/__responses";
        var dump = JSON.stringify(response, undefined, 4);
        fs.write(responsesFile, dump, 'a+');
    } catch (ignored) {
        // Best effort: a failed write is silently dropped.
    }
};
// Per-resource timeout, expressed in milliseconds (29 seconds).
page.settings.resourceTimeout = 29000;

// Record the URL of every request that timed out, one per line, in the
// "__failures" file inside the output directory (system.args[2]).
// The handler does not stop the run; the script carries on afterwards.
page.onResourceTimeout = function (timedOut) {
    // The error string will probably be 'Network timeout on resource'.
    try {
        var failuresFile = system.args[2] + "/__failures";
        var entry = timedOut.url + "\n";
        fs.write(failuresFile, entry, 'a+');
    } catch (ignored) {
        // Best effort: a failed write is silently dropped.
    }
};
// Load the target address and, once loading has finished, write into the
// output directory (system.args[2]):
//   __title       - the title of the loaded document
//   __object_<i>  - the outer HTML of the i-th <object> element
//   __debug.body  - the outer HTML of <body>, only if an <object> entry
//                   could not be retrieved as a string
// Nothing is written when the load fails. In every case the "lock" file in
// the output directory is removed and PhantomJS exits with status 0.
page.open(address, function (status) {
    var outDir = system.args[2];

    try {
        if (status === 'success') {
            landing_title = page.evaluate(function () {
                return document.title;
            });
            fs.write(outDir + "/__title", landing_title, 'w+');

            // Values returned from page.evaluate() must be JSON-serializable,
            // so DOM nodes cannot be handed back to this script. Extract the
            // markup inside the page context and return plain strings.
            var objectsHtml = page.evaluate(function () {
                var nodes = document.getElementsByTagName('object');
                var html = [];
                for (var n = 0; n < nodes.length; n++) {
                    html.push(nodes[n].outerHTML);
                }
                return html;
            }) || [];

            for (var i = 0; i < objectsHtml.length; i++) {
                if (typeof objectsHtml[i] === 'string') {
                    fs.write(outDir + "/__object_" + i, objectsHtml[i], 'w+');
                } else {
                    // Unexpected entry: keep the whole body for debugging.
                    fs.write(outDir + "/__debug.body",
                        page.evaluate(function () {
                            return document.getElementsByTagName('body')[0].outerHTML;
                        }), 'w+');
                }
            }
        }
    } finally {
        // Always release the lock and terminate, even if one of the writes
        // above threw; otherwise the exit call would never be reached.
        try {
            fs.remove(outDir + "/lock");
        } catch (ignored) {
            // A failure to remove the lock must not prevent the exit below.
        }
        phantom.exit();
    }
});