-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspiderable_server.js
executable file
·129 lines (114 loc) · 5.53 KB
/
spiderable_server.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
var fs = Npm.require('fs');
var child_process = Npm.require('child_process');
var querystring = Npm.require('querystring');
var urlParser = Npm.require('url');
// User-agent patterns for bots that should receive the statically
// rendered page even though they do not obey the _escaped_fragment_
// protocol. A request whose user agent matches any one of these
// regexps is served statically. Users may modify this array.
//
// The spiderable package originally aimed to avoid user-agent based
// tests altogether, but not enough bots support the _escaped_fragment_
// protocol, so a hardcoded list is needed here.
Spiderable.userAgentRegExps = [
  /^facebookexternalhit/i,
  /^linkedinbot/i,
  /^twitterbot/i
];
// Upper bound, in milliseconds, on how long phantomjs may run before we
// kill it.
var REQUEST_TIMEOUT = 30 * 1000;
// Upper bound, in bytes, on the size of the result HTML. node's default
// is 200k, which is too small for our docs.
var MAX_BUFFER = 10 * 1024 * 1024; // 10MB
// Exported for tests.
//
// Builds the URL that phantomjs should load for an incoming request.
//
//   siteAbsoluteUrl: absolute URL of the site root (may contain a path).
//   requestUrl: the request's URL (path plus optional query string); it
//     may carry an `_escaped_fragment_` query parameter.
//
// Returns a URL string made of the site URL with the request's path
// appended, the request's query parameters minus `_escaped_fragment_`,
// and -- when the escaped fragment is non-empty -- a `#!` hash rebuilt
// from it.
Spiderable._urlForPhantom = function (siteAbsoluteUrl, requestUrl) {
  // reassembling url without escaped fragment if exists
  var parsedUrl = urlParser.parse(requestUrl);
  var parsedQuery = querystring.parse(parsedUrl.query);
  var escapedFragment = parsedQuery['_escaped_fragment_'];
  delete parsedQuery['_escaped_fragment_'];

  var parsedAbsoluteUrl = urlParser.parse(siteAbsoluteUrl);

  // If the ROOT_URL contains a path, Meteor strips that path off of the
  // request's URL before we see it. So we concatenate the pathname from
  // the request's URL with the root URL's pathname to get the full
  // pathname.
  //
  // A request URL that has no path at all parses to a null pathname;
  // treat that as an empty path instead of throwing.
  var requestPath = parsedUrl.pathname || '';
  if (requestPath.charAt(0) === "/") {
    requestPath = requestPath.substring(1);
  }
  parsedAbsoluteUrl.pathname = urlParser.resolve(parsedAbsoluteUrl.pathname,
                                                 requestPath);

  parsedAbsoluteUrl.query = parsedQuery;
  // `url.format` will only use `query` if `search` is absent
  parsedAbsoluteUrl.search = null;

  if (escapedFragment !== undefined && escapedFragment !== null &&
      escapedFragment.length > 0) {
    var fragment;
    try {
      fragment = decodeURIComponent(escapedFragment);
    } catch (e) {
      if (!(e instanceof URIError)) {
        throw e;
      }
      // `querystring.parse` has already percent-decoded the value once, so
      // a literal '%' in the fragment is not a valid escape sequence here.
      // Keep the once-decoded text rather than failing the whole request.
      fragment = String(escapedFragment);
    }
    parsedAbsoluteUrl.hash = '!' + fragment;
  }

  return urlParser.format(parsedAbsoluteUrl);
};
// Source text of the script handed to phantomjs, read from the package
// asset "phantom_script.js" when this file is loaded.
var PHANTOM_SCRIPT = Assets.getText("phantom_script.js");

// Connect middleware. For requests that carry `_escaped_fragment_` in the
// query string, or whose user agent matches one of
// `Spiderable.userAgentRegExps`, run phantomjs against the page and reply
// with the HTML it prints. Every other request -- and every failure along
// the way -- is passed on to the next handler so the normal page is served.
WebApp.connectHandlers.use(function (req, res, next) {
  // _escaped_fragment_ comes from Google's AJAX crawling spec:
  // https://developers.google.com/webmasters/ajax-crawling/docs/specification
  var isSpiderRequest = /\?.*_escaped_fragment_=/.test(req.url) ||
    _.any(Spiderable.userAgentRegExps, function (re) {
      return re.test(req.headers['user-agent']);
    });
  if (!isSpiderRequest) {
    next();
    return;
  }

  // NOTE(review): the host part comes from the client-supplied Host
  // header, so phantomjs is pointed at whatever host the request names.
  // Consider building this from a configured site URL instead.
  var url;
  try {
    url = Spiderable._urlForPhantom('http://' + req.headers.host, req.url);
  } catch (e) {
    // A request URL we cannot rebuild is treated like any other rendering
    // failure: log it and serve the normal page.
    Meteor._debug("spiderable: could not build URL for phantomjs:", e);
    next();
    return;
  }

  // This string is going to be put into a bash script, so it's important
  // that 'url' (which comes from the network) can neither exploit phantomjs
  // or the bash script. JSON stringification should prevent it from
  // exploiting phantomjs, and since the output of JSON.stringify shouldn't
  // be able to contain newlines, it should be unable to exploit bash as
  // well.
  var phantomScript = "var url = " + JSON.stringify(url) + ";" +
        PHANTOM_SCRIPT;

  // The heredoc terminator is only recognized at the start of a line, so
  // make sure the script ends with a newline before "END" is appended.
  var heredocPadding = /\n$/.test(phantomScript) ? "" : "\n";

  // Allow override of phantomjs args via env var
  // We use one env var to try to keep env-var explosion under control.
  // We're not going to document this unless it is actually needed;
  // (if you find yourself needing this please let us know the use case!)
  var phantomJsArgs = process.env.METEOR_PKG_SPIDERABLE_PHANTOMJS_ARGS || '';

  // Default image loading to off (we don't need images)
  if (phantomJsArgs.indexOf("--load-images=") === -1) {
    phantomJsArgs += " --load-images=no";
  }

  // POODLE means SSLv3 is being turned off everywhere.
  // phantomjs currently defaults to SSLv3, and won't use TLS.
  // Use --ssl-protocol to set the default to TLSv1
  // (another option would be 'any', but really, we want to say >= TLSv1)
  // More info: https://groups.google.com/forum/#!topic/meteor-core/uZhT3AHwpsI
  if (phantomJsArgs.indexOf("--ssl-protocol=") === -1) {
    phantomJsArgs += " --ssl-protocol=TLSv1";
  }

  // Run phantomjs.
  //
  // Use '/dev/stdin' to avoid writing to a temporary file. We can't
  // just omit the file, as PhantomJS takes that to mean 'use a
  // REPL' and exits as soon as stdin closes.
  //
  // However, Node 0.8 broke the ability to open /dev/stdin in the
  // subprocess, so we can't just write our string to the process's stdin
  // directly; see https://gist.github.com/3751746 for the gory details. We
  // work around this with a bash heredoc. (We previously used a "cat |"
  // instead, but that meant we couldn't use exec and had to manage several
  // processes.)
  child_process.execFile(
    '/bin/bash',
    ['-c',
     ("exec phantomjs " + phantomJsArgs + " /dev/stdin <<'END'\n" +
      phantomScript + heredocPadding + "END\n")],
    {timeout: REQUEST_TIMEOUT, maxBuffer: MAX_BUFFER},
    Meteor.bindEnvironment(function (error, stdout, stderr) {
      if (!error && /<html/i.test(stdout)) {
        res.writeHead(200, {'Content-Type': 'text/html; charset=UTF-8'});
        res.end(stdout);
        return;
      }

      // phantomjs failed. Don't send the error, instead send the
      // normal page.
      if (error && error.code === 127) {
        Meteor._debug("spiderable: phantomjs not installed. Download and install from http://phantomjs.org/");
      } else {
        Meteor._debug("spiderable: phantomjs failed:", error, "\nstderr:", stderr);
      }
      next();
    }));
});