From 25a6133876659f3e08760f017d7473e1b0a65396 Mon Sep 17 00:00:00 2001 From: Andrew Balakirev Date: Mon, 9 Oct 2017 21:24:38 +0300 Subject: [PATCH] vow promised -> native Promise async/await instead of .then eslint errors fixed using more popular mkdirp closes #6 ref #1 --- .eslintrc | 3 +- lib/PhantomEnvironment.js | 930 +++++++++++++++++++------------------- package.json | 6 +- yarn.lock | 18 +- 4 files changed, 461 insertions(+), 496 deletions(-) diff --git a/.eslintrc b/.eslintrc index 14d346a..1298c9e 100644 --- a/.eslintrc +++ b/.eslintrc @@ -10,6 +10,7 @@ "max-len": [2, 120], "no-param-reassign": [2, {"props": false}], "prefer-template": [0], - "no-underscore-dangle": [2, {"allowAfterThis": true}] + "no-underscore-dangle": [2, {"allowAfterThis": true}], + "no-mixed-operators": [0] } } diff --git a/lib/PhantomEnvironment.js b/lib/PhantomEnvironment.js index d7a2e2f..e3a821b 100644 --- a/lib/PhantomEnvironment.js +++ b/lib/PhantomEnvironment.js @@ -1,18 +1,19 @@ - - -const _ = require('lodash'), - AbstractEnvironment = require('goose-abstract-environment'), - debugLib = require('debug'), - debug = debugLib('PhantomEnvironment'), - phantomError = debugLib('Phantom:error'), - debugParser = debugLib('RedParser'), - phantom = require('phantom'), - path = require('path'), - mkdir = require('mkdir-p'), - url = require('url'), - vowNode = require('vow-node'), - vow = require('vow'), - fs = require('fs'); +const _ = require('lodash'); +const AbstractEnvironment = require('goose-abstract-environment'); +const debugLib = require('debug'); +const phantom = require('phantom'); +const path = require('path'); +const mkdirp = require('mkdirp'); +const { parse: parseUrl } = require('url'); +const fs = require('fs'); + +const debug = debugLib('PhantomEnvironment'); +const phantomError = debugLib('Phantom:error'); +const debugParser = debugLib('RedParser'); + +function mkdir(...args) { + return new Promise((resolve, reject) => mkdirp(...args, e => (e ? reject(e) : resolve()))); +} /** * @typedef {object} Proxy @@ -52,6 +53,69 @@ const _ = require('lodash'), * @property {number} height */ +/** + * @param {ProxyIndicator} proxyIndicator + * @returns {Error} + */ +function createProxyError(proxyIndicator) { + let msg; + switch (proxyIndicator.type) { + case 'redirect': + msg = 'Proxy matched redirect'; + break; + case 'responseCode': + msg = 'Proxy matched response code'; + break; + case 'captcha': + msg = 'Captcha handled'; + break; + default: + throw new Error('Unsupported proxyIndicator'); + } + const err = new Error(msg); + err.proxyIndicator = proxyIndicator.type; + err.proxyLevel = proxyIndicator.level || 'medium'; + + return err; +} + +/** + * @param {string} currentUrl + * @param {string} redirectUri + * @returns {string} + * @private + */ +function getRedirectUrl(currentUrl, redirectUri) { + const parsedCurrentUrl = parseUrl(currentUrl); + const parsedRedirectUri = parseUrl(redirectUri); + const hostname = parsedRedirectUri.hostname || parsedCurrentUrl.hostname; + const protocol = parsedRedirectUri.protocol || parsedCurrentUrl.protocol; + + return protocol + '//' + hostname + parsedRedirectUri.path; +} + +/** + * @param {object} resource + * @returns {string} + * @private + */ +function extractRedirectUrl(resource) { + let redirectUrl; + if (resource.redirectUrl) { + redirectUrl = resource.redirectUrl; + } else { + const locationHeader = (resource.headers || []).find( + header => header.name && header.name.toLowerCase() === 'location', + ); + + if (locationHeader && locationHeader.value) { + redirectUrl = locationHeader.value; + } + } + + return redirectUrl ? getRedirectUrl(resource.url, redirectUrl) : ''; +} + /** * @typedef {object} PhantomEnvironmentOptions * @property {?number} timeout @@ -67,13 +131,14 @@ const _ = require('lodash'), * @property {?string} snapshotDir directory for snapshots * @property {?Proxy|Array.} proxy single proxy or proxy list * @property {Array.} proxyIndicators Indicators which say that proxy became unreachable - * @property {?function} proxyRotator proxy rotator function(proxyList, currentProxy) with context of this env. function should return Proxy from the list + * @property {?function} proxyRotator proxy rotator function(proxyList, currentProxy) with context of this env. function + * should return Proxy from the list * @property {?string|Array.} userAgent user agent or list of agents for setting to phantom * @property {?Screen} screen screen dimensions * @property {?Resources} resources white and black lists for loading resources on the page */ const defaultOptions = { - // Phantom options + // Phantom options timeout: 60 * 1000, weak: true, loadImages: false, @@ -83,12 +148,12 @@ const defaultOptions = { webSecurity: false, phantomPath: path.join(require.resolve('phantomjs-prebuilt'), '../../bin/'), - // Custom environment options + // Custom environment options snapshot: false, snapshotDir: 'snapshots', proxy: null, proxyRotator: null, - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12', + userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12', // eslint-disable-line max-len screen: { width: 1440, height: 900, @@ -102,9 +167,9 @@ const defaultOptions = { let port = 13200; class PhantomEnvironment extends AbstractEnvironment { - /** - * @param {PhantomEnvironmentOptions} options - */ + /** + * @param {PhantomEnvironmentOptions} options + */ constructor(options) { debug('Initializing...'); super(options); @@ -129,19 +194,18 @@ class PhantomEnvironment extends AbstractEnvironment { this._browserEnvInjected = false; } - prepare() { + async prepare() { debug('Preparing...'); - return AbstractEnvironment.prototype.prepare - .call(this) - .then(this._setup, this) - .then(this._setViewport, this) - .then(this._setUserAgent, this) - .then(this._setTimeout, this) - .then(this._handlePhantomEvents, this) - .then(this._rotateProxy, this) - .then(this._navigateTo.bind(this, this._url)) - .then(this._validateProxy, this) - .then(this._injectVendors, this); + await super.prepare(); + await this._setup(); + await this._setViewport(); + await this._setUserAgent(); + await this._setTimeout(); + await this._handlePhantomEvents(); + await this._rotateProxy(); + await this._navigateTo(this._url); + await this._validateProxy(); + await this._injectVendors(); } setProxy(proxy) { @@ -157,151 +221,134 @@ class PhantomEnvironment extends AbstractEnvironment { return this._options[name]; } - evaluateJs() { - const deferred = vow.defer(), - page = this._page, - args = Array.prototype.slice.call(arguments, 0); + evaluateJs(...args) { + return new Promise((resolve, reject) => { + const page = this._page; - const evalFunc = args.pop(); - if (typeof evalFunc !== 'function') { - throw new Error('You must pass function as last argument to PhantomEnvironment.evaluateJs'); - } - args.unshift(evalFunc, results => deferred.resolve(results)); + const evalFunc = args.pop(); + if (typeof evalFunc !== 'function') { + reject(new Error('You must pass function as last argument to PhantomEnvironment.evaluateJs')); + return; + } + args.unshift(evalFunc, results => resolve(results)); - page.evaluate(...args); - return deferred.promise(); + page.evaluate(...args); + }); } - /** - * Take screen snapshot - * @param {string} fileName - * @returns {Promise} - */ - snapshot(fileName) { + /** + * Take screen snapshot + * @param {string} fileName + * @returns {Promise} + */ + async snapshot(fileName) { const options = this._options; if (!options.snapshot) { - return vow.resolve(); + return; } - const screenShotFilePath = path.join(options.snapshotDir, this._getHostName(this._url)); + const screenShotFilePath = path.join(options.snapshotDir, parseUrl(this._url).hostname); const screenShotFileName = path.join(screenShotFilePath, fileName + '.png'); debug('.snapshot() to %s', screenShotFileName); - return vowNode - .invoke(mkdir, screenShotFilePath) - .then(() => { - const windowSize = { - left: 0, - top: 0, - width: options.screen.width, - height: options.screen.height, - }; - this._page.clipRect = windowSize; - debug('Doing snapshot with window size %o, filepath %s', windowSize, screenShotFileName); - this._page.render(screenShotFileName); - }) - .then(() => { - const deferred = vow.defer(); - - const interval = setInterval(() => { - if (fs.statSync(screenShotFilePath).size) { - clearInterval(interval); - clearTimeout(timeout); - deferred.resolve(); - } - }, 20); - - const timeout = setTimeout(() => { - clearInterval(interval); - deferred.reject(new Error('Snapshot timeout')); - }, 500); - - return deferred.promise(); - }); - } - - waitForPage(timeout) { - timeout = timeout || 5000; - - const deferred = vow.defer(); - const timeoutId = setTimeout(() => { - debug('Timeout %s has reached on page load', timeout); - this._navigationActions = []; - deferred.reject(new Error('Page navigation timeout')); - }, timeout); - - this._navigationActions.push((err) => { - clearTimeout(timeoutId); - if (err) { - deferred.reject(err); - } else { - deferred.resolve(); - } - }); - debug('Added page load callback'); + await mkdir(screenShotFilePath); + + const windowSize = { + left: 0, + top: 0, + width: options.screen.width, + height: options.screen.height, + }; + this._page.clipRect = windowSize; + debug('Doing snapshot with window size %o, filepath %s', windowSize, screenShotFileName); + this._page.render(screenShotFileName); + + await new Promise((resolve, reject) => { + let timeout; + + const interval = setInterval(() => { + if (fs.statSync(screenShotFilePath).size) { + clearInterval(interval); + clearTimeout(timeout); + resolve(); + } + }, 20); - return deferred.promise().then(this._injectVendors, this); + timeout = setTimeout(() => { + clearInterval(interval); + reject(new Error('Snapshot timeout')); + }, 500); + }); } - waitForQuery(uri, timeout) { - timeout = timeout || 5000; - - const deferred = vow.defer(); - const timeoutId = setTimeout(() => { - debug('Timeout %s has reached for waiting query %s', timeout, uri); - this._requestingActions = []; - deferred.reject(new Error('Waiting request timeout')); - }, timeout); + async waitForPage(timeout = 5000) { + await new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + debug('Timeout %s has reached on page load', timeout); + this._navigationActions = []; + reject(new Error('Page navigation timeout')); + }, timeout); - this._requestingActions.push({ - pattern: uri, - fn(err, results) { + this._navigationActions.push((err) => { clearTimeout(timeoutId); if (err) { - deferred.reject(err); + reject(err); } else { - deferred.resolve(results); + resolve(); } - }, + }); + debug('Added page load callback'); + }); + await this._injectVendors(); + } + + waitForQuery(uri, timeout = 5000) { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + debug('Timeout %s has reached for waiting query %s', timeout, uri); + this._requestingActions = []; + reject(new Error('Waiting request timeout')); + }, timeout); + + this._requestingActions.push({ + pattern: uri, + fn(err, results) { + clearTimeout(timeoutId); + if (err) { + reject(err); + } else { + resolve(results); + } + }, + }); + debug('Added request callback'); }); - debug('Added request callback'); - - return deferred.promise(); } back() { debug('Back'); this._page.goBack(); - return vow.resolve(); + return Promise.resolve(); } - mouseClick(selector) { - return this._getElementPosition(selector) - .then((position) => { - this._page.sendEvent('mousedown', position.x, position.y); - return position; - }) - .then((position) => { - this._page.sendEvent('mouseup', position.x, position.y); - }); + async mouseClick(selector) { + const position = await this._getElementPosition(selector); + this._page.sendEvent('mousedown', position.x, position.y); + this._page.sendEvent('mouseup', position.x, position.y); } - mousedown(selector) { - return this._getElementPosition(selector) - .then((position) => { - this._page.sendEvent('mousedown', position.x, position.y); - }); + async mousedown(selector) { + const position = await this._getElementPosition(selector); + this._page.sendEvent('mousedown', position.x, position.y); } - mouseup(selector) { - return this._getElementPosition(selector) - .then((position) => { - this._page.sendEvent('mouseup', position.x, position.y); - }); + async mouseup(selector) { + const position = await this._getElementPosition(selector); + this._page.sendEvent('mouseup', position.x, position.y); } - _getElementPosition(selector) { - return this.evaluateJs(selector, /* @covignore */ (selector) => { - const node = Sizzle(selector)[0]; + async _getElementPosition(selector) { + const position = await this.evaluateJs(selector, /* @covignore */ (selector) => { // eslint-disable-line no-shadow + const node = Sizzle(selector)[0]; // eslint-disable-line no-undef if (!node) { return null; } @@ -311,349 +358,325 @@ class PhantomEnvironment extends AbstractEnvironment { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2, }; - }).then((position) => { - if (!position) { - throw new Error('Position of element ' + selector + ' was not found'); - } - debug('Element position is %o', position); - return position; }); + + if (!position) { + throw new Error('Position of element ' + selector + ' was not found'); + } + debug('Element position is %o', position); + return position; } - /** - * Set up a fresh phantomjs page. - * @returns {Promise} - * @private - */ - _setup() { - return this._createInstance().then(this._createPage, this); + /** + * Set up a fresh phantomjs page. + * @returns {Promise} + * @private + */ + async _setup() { + await this._createInstance(); + await this._createPage(); } - /** - * Create a phantomjs instance. - * @returns {Promise} - * @private - */ + /** + * Create a phantomjs instance. + * @returns {Promise} + * @private + */ _createInstance() { - const options = this._options, - deferred = vow.defer(), - flags = []; - debug('.createInstance() creating Phantom instance with options %o', options); - flags.push('--load-images=' + options.loadImages); - flags.push('--ignore-ssl-errors=' + options.ignoreSslErrors); - flags.push('--ssl-protocol=' + options.sslProtocol); - flags.push('--web-security=' + options.webSecurity); - - if (options.cookiesFile !== null) { - flags.push('--cookies-file=' + options.cookiesFile); - } + return new Promise((resolve) => { + const options = this._options; + const flags = []; + debug('.createInstance() creating Phantom instance with options %o', options); + flags.push('--load-images=' + options.loadImages); + flags.push('--ignore-ssl-errors=' + options.ignoreSslErrors); + flags.push('--ssl-protocol=' + options.sslProtocol); + flags.push('--web-security=' + options.webSecurity); + + if (options.cookiesFile !== null) { + flags.push('--cookies-file=' + options.cookiesFile); + } - // dnode options for compilation on windows - let dnodeOpts = {}; - if (options.weak === false) { - dnodeOpts = { weak: false }; - } + // dnode options for compilation on windows + let dnodeOpts = {}; + if (options.weak === false) { + dnodeOpts = { weak: false }; + } - // combine flags, options and callback into args - const args = flags; - args.push({ - port: options.port || this._getPort(), - dnodeOpts, - path: options.phantomPath, - onExit: this._handleExit.bind(this), - }); - args.push((instance) => { - this._phantomJS = instance; - deferred.resolve(instance); + // combine flags, options and callback into args + const args = flags; + args.push({ + port: options.port || (port += 1), + dnodeOpts, + path: options.phantomPath, + onExit: this._handleExit.bind(this), + }); + args.push((instance) => { + this._phantomJS = instance; + resolve(instance); + }); + phantom.create(...args); }); - phantom.create(...args); - return deferred.promise(); } - _getPort() { - port++; - return port; - } - - /** - * Creates new page in phantom - * @returns {Promise} - */ + /** + * Creates new page in phantom + * @returns {Promise} + */ _createPage() { - debug('._createPage() has called'); - const deferred = vow.defer(); - this._phantomJS.createPage((page) => { - this._page = page; - debug('._createPage() phantom page created'); - deferred.resolve(page); + return new Promise((resolve) => { + debug('._createPage() has called'); + this._phantomJS.createPage((page) => { + this._page = page; + debug('._createPage() phantom page created'); + resolve(page); + }); }); - return deferred.promise(); } - /** - * Tear down a phantomjs instance. - */ + /** + * Tear down a phantomjs instance. + */ tearDown() { - debug('._tearDownInstance() tearing down'); - const phantom = this._phantomJS; - if (!phantom || !phantom.process) { - debug('Phantom process already exited, not killing'); - return vow.resolve(); - } + return new Promise((resolve) => { + debug('._tearDownInstance() tearing down'); + const phantomJs = this._phantomJS; + if (!phantomJs || !phantomJs.process) { + debug('Phantom process already exited, not killing'); + resolve(); + return; + } - const deferred = vow.defer(); - const pid = phantom.process.pid; + const pid = phantomJs.process.pid; - debug('Terminating phantom process gracefully, pid: ', pid); - if (this._page) { - this._page.close(); - delete this._page; - } + debug('Terminating phantom process gracefully, pid: ', pid); + if (this._page) { + this._page.close(); + delete this._page; + } - phantom.exit(); + phantomJs.exit(); - const timeout = setTimeout(() => { - const i = this._exitHanlers.indexOf(resolver); - if (i !== -1) { - this._exitHanlers.splice(i, 1); - } + let timeout; - debug('phantom time is out, kill it and go ahead'); - if (phantom.process) { - phantom.process.kill('SIGKILL'); + function resolver() { + clearTimeout(timeout); + resolve(); } - deferred.resolve(); - }, 5000); // 5 sec to die + timeout = setTimeout(() => { + const i = this._exitHanlers.indexOf(resolver); + if (i !== -1) { + this._exitHanlers.splice(i, 1); + } + + debug('phantom time is out, kill it and go ahead'); + if (phantomJs.process) { + phantomJs.process.kill('SIGKILL'); + } - function resolver() { - clearTimeout(timeout); - deferred.resolve(); - } + resolve(); + }, 5000); // 5 sec to die - this._exitHanlers.push(resolver); + this._exitHanlers.push(resolver); - delete this._phantomJS; - return deferred.promise(); + delete this._phantomJS; + }); } - /** - * Handles the phantom process ending/crashing unexpectedly. - * If an `onExit` handler has been bound then that will be called. Otherwise, the error will be re-thrown. - * @param {Number} code - * @param {String} [signal] - */ + /** + * Handles the phantom process ending/crashing unexpectedly. + * If an `onExit` handler has been bound then that will be called. Otherwise, the error will be re-thrown. + * @param {Number} code + * @param {String} [signal] + */ _handleExit(code, signal) { debug('Phantom exited with code ' + code + ' and signal ' + signal); - // delete this._phantomJS.process; + // delete this._phantomJS.process; - // otherwise, if we have a non-zero code we'll throw a better error message - // than the `phantom` lib would. + // otherwise, if we have a non-zero code we'll throw a better error message + // than the `phantom` lib would. if (code !== 0) { const err = new Error('The PhantomJS process ended unexpectedly'); err.code = code; err.signal = signal; - // throw err; + // throw err; } this._exitHanlers.forEach(handler => handler(code)); this._exitHanlers = []; } - /** - * Go to url - * @param url - * @returns {Promise} - * @private - */ + /** + * Go to url + * @param url + * @returns {Promise} + * @private + */ _navigateTo(url) { - const deferred = vow.defer(); + return new Promise((resolve, reject) => { + this._openPage(url, resolve, reject); + }); + } + + _openPage(url, resolve, reject) { debug('.goto() url: ' + url); - this._page.open(url, (status) => { + this._page.open(url, async (status) => { debug('.goto() page loaded: ' + status); if (status === 'success') { - return deferred.resolve(); + resolve(); + return; } - this._rotateProxy() - .then((proxy) => { - // cannot set new proxy - if (proxy === null) { - return deferred.reject(new Error('Page ' + this._url + ' was not loaded')); - } + try { + const proxy = await this._rotateProxy(); + // cannot set new proxy + if (proxy === null) { + reject(new Error(`Page ${url} was not loaded`)); + return; + } - // one more attempt to open page through the new proxy - return this._navigateTo(url); - }, e => deferred.reject(e)); + // one more attempt to open page through the new proxy + this._openPage(url, resolve, reject); + } catch (e) { + reject(e); + } }); - - return deferred.promise(); } - /** - * Set the viewport. - * - * @returns {Promise} - * @private - */ + /** + * Set the viewport. + * + * @returns {Promise} + * @private + */ _setViewport() { - const deferred = vow.defer(); - let screen = this._options.screen; - if (Array.isArray(screen)) { - screen = _.sample(screen); - } - const width = screen.width; - const height = screen.height; - debug('.viewport() to ' + width + ' x ' + height); - const viewport = { width, height }; - this._options.screen = viewport; - this._page.set('viewportSize', viewport, () => deferred.resolve()); - - return deferred.promise(); - } - - /** - * Set the user agent. - * - * @returns {Promise} - * @private - */ - _setUserAgent() { - const deferred = vow.defer(); - let userAgent = this._options.userAgent; - if (Array.isArray(userAgent)) { - userAgent = _.sample(this._options.userAgent); - } - debug('.userAgent() to ' + userAgent); - this._page.set('settings.userAgent', userAgent, () => deferred.resolve()); + return new Promise((resolve) => { + let screen = this._options.screen; + if (Array.isArray(screen)) { + screen = _.sample(screen); + } + const width = screen.width; + const height = screen.height; + debug('.viewport() to ' + width + ' x ' + height); + const viewport = { width, height }; + this._options.screen = viewport; + this._page.set('viewportSize', viewport, () => resolve()); + }); + } - return deferred.promise(); + /** + * Set the user agent. + * + * @returns {Promise} + * @private + */ + _setUserAgent() { + return new Promise((resolve) => { + let userAgent = this._options.userAgent; + if (Array.isArray(userAgent)) { + userAgent = _.sample(this._options.userAgent); + } + debug('.userAgent() to ' + userAgent); + this._page.set('settings.userAgent', userAgent, () => resolve()); + }); } - /** - * Set timeout. - * - * @returns {Promise} - * @private - */ + /** + * Set timeout. + * + * @returns {Promise} + * @private + */ _setTimeout() { - const deferred = vow.defer(); - const timeout = this._options.timeout; - debug('.timeout() to ' + timeout); - this._page.set('settings.resourceTimeout', timeout, () => deferred.resolve()); - - return deferred.promise(); + return new Promise((resolve) => { + const timeout = this._options.timeout; + debug('.timeout() to ' + timeout); + this._page.set('settings.resourceTimeout', timeout, () => resolve()); + }); } - /** - * @param {Error} error - */ + /** + * @param {Error} error + */ addProxyError(error) { this._proxyErrors.push(error); } - /** - * @returns {Array.} - */ + /** + * @returns {Array.} + */ getProxyErrors() { return this._proxyErrors; } - /** - * @param type - * @returns {Array.} - */ + /** + * @param type + * @returns {Array.} + */ getProxyIndicators(type) { return this._proxyIndicators.filter(item => item.type === type); } - /** - * @returns {Promise} - * @private - */ + /** + * @returns {Promise} + * @private + */ _validateProxy() { return this.getProxyErrors().length === 0 ? - Promise.resolve() : - Promise.reject(this.getProxyErrors().pop()); - } - - /** - * @param {ProxyIndicator} proxyIndicator - * @returns {Error} - */ - createProxyError(proxyIndicator) { - let msg; - switch (proxyIndicator.type) { - case 'redirect': - msg = 'Proxy matched redirect'; - break; - case 'responseCode': - msg = 'Proxy matched response code'; - break; - case 'captcha': - msg = 'Captcha handled'; - break; - default: - throw new Error('Unsupported proxyIndicator'); - } - const err = new Error(msg); - err.proxyIndicator = proxyIndicator.type; - err.proxyLevel = proxyIndicator.level || 'medium'; - - return err; + Promise.resolve() : + Promise.reject(this.getProxyErrors().pop()); } - /** - * Set a proxy from the proxy list (unset previous one) - * - * @returns {Promise} - * @private - */ - _rotateProxy() { + /** + * Set a proxy from the proxy list (unset previous one) + * + * @returns {Promise} + * @private + */ + async _rotateProxy() { const proxy = this._proxy; const currentProxy = this._proxyCurrent; - if (proxy == undefined) { - return vow.resolve(null); + if (!proxy) { + return null; } - if (Array.isArray(proxy)) { - this._removeUnavailableProxy(); - const promise = (typeof this._options.proxyRotator === 'function') ? - this._options.proxyRotator(proxy, currentProxy) : - vow.resolve(_.sample(proxy)); - - return promise - .then((foundProxy) => { - this._proxyErrors = []; - if (foundProxy == undefined) { - throw new Error('No proxy found'); - } - return this._applyProxy(foundProxy); - }); + + if (!Array.isArray(proxy)) { + return this._applyProxy(proxy); } - return this._applyProxy(proxy); + this._removeUnavailableProxy(); + const { proxyRotator } = this._options; + const foundProxy = typeof proxyRotator === 'function' + ? await proxyRotator(proxy, currentProxy) + : _.sample(proxy); + + this._proxyErrors = []; + if (!foundProxy) { + throw new Error('No proxy found'); + } + return this._applyProxy(foundProxy); } - /** - * Apply proxy to Phantom - * @private - */ + /** + * Apply proxy to Phantom + * @private + */ _applyProxy(proxy) { - const deferred = vow.defer(); - this._phantomJS.setProxy(proxy.host, proxy.port, 'manual', proxy.username, proxy.password, () => { - debug('Proxy applied %o', proxy); - this._proxyCurrent = proxy; - deferred.resolve(proxy); + return new Promise((resolve) => { + this._phantomJS.setProxy(proxy.host, proxy.port, 'manual', proxy.username, proxy.password, () => { + debug('Proxy applied %o', proxy); + this._proxyCurrent = proxy; + resolve(proxy); + }); }); - return deferred.promise(); } - /** - * Remove from proxy list one which doesn't work - * - * @returns {?Proxy} - * @private - */ + /** + * Remove from proxy list one which doesn't work + * + * @returns {?Proxy} + * @private + */ _removeUnavailableProxy() { const current = this._proxyCurrent; if (!Array.isArray(this._proxy) || this._proxy.length === 0 || current === null) { @@ -664,7 +687,7 @@ class PhantomEnvironment extends AbstractEnvironment { const index = this._proxy.findIndex(item => item.host === current.host && item.port === current.port); let proxy = null; if (index !== -1) { - // cut off old used proxy from the list + // cut off old used proxy from the list proxy = this._proxy.splice(index, 1); } return Array.isArray(proxy) ? proxy.pop() : null; @@ -675,12 +698,12 @@ class PhantomEnvironment extends AbstractEnvironment { debug('injecting file %s', filePath); this._page.injectJs(filePath); }); - return vow.resolve(); + return Promise.resolve(); } injectBrowserEnv() { if (this._browserEnvInjected) { - return vow.resolve(); + return Promise.resolve(); } debug('.inject()-ing browser env libs'); @@ -689,10 +712,10 @@ class PhantomEnvironment extends AbstractEnvironment { ]); } - /** - * @param {string} [urlPattern] - * @returns {boolean} - */ + /** + * @param {string} [urlPattern] + * @returns {boolean} + */ hasRedirect(urlPattern) { if (urlPattern === undefined) { return this._redirectUrls.length > 0; @@ -705,10 +728,10 @@ class PhantomEnvironment extends AbstractEnvironment { page.set('onError', (msg, trace) => { phantomError('%s, trace %o, fire %s errbacks', msg, trace, this._errbacks.length); - // this._errbacks.splice(0).forEach(errback => errback(msg, trace)); + // this._errbacks.splice(0).forEach(errback => errback(msg, trace)); }); - // todo: make it workable + // todo: make it workable page.set('onConsoleMessage', (msg) => { const regex = /^(\[GooseParser])(.+)/i; const found = msg.match(regex); @@ -730,7 +753,7 @@ class PhantomEnvironment extends AbstractEnvironment { actions.shift(); action.fn(null, url); } else { - i++; + i += 1; } } }); @@ -746,7 +769,7 @@ class PhantomEnvironment extends AbstractEnvironment { debug('Navigation error %s %s', resourceError.url, resourceError.errorString); const matched = this.getProxyIndicators('responseCode').find(item => item.code === resourceError.status); if (matched) { - this.addProxyError(this.createProxyError(matched)); + this.addProxyError(createProxyError(matched)); } }); @@ -762,83 +785,36 @@ class PhantomEnvironment extends AbstractEnvironment { } if (!allowed || blocked) { - console.log('[GooseParser] Resource ' + requestData.url.substr(0, 30) + ' was aborted'); + console.log( // eslint-disable-line no-console + '[GooseParser] Resource ' + requestData.url.substr(0, 30) + ' was aborted', + ); request.abort(); } - }, (requestData) => { - // todo: decide, remove or leave - // debug('Resource requested %s, %o', requestData.url, requestData); - }, this._options.resources.allowed, this._options.resources.denied); + }, () => {}, this._options.resources.allowed, this._options.resources.denied); page.set('onResourceReceived', (resource) => { - // debug('Resource recieved %o', resource); - // redirect has occurred + // redirect has occurred if ([302, 301].indexOf(resource.status) !== -1) { - const redirectUrl = this._extractRedirectUrl(resource) || ''; + const redirectUrl = extractRedirectUrl(resource) || ''; - // if current url matches with this._url or with the last redirect url from this._redirectUrls + // if current url matches with this._url or with the last redirect url from this._redirectUrls if ( - redirectUrl && - ( - resource.url === this._url || - resource.url === this._redirectUrls[this._redirectUrls.length - 1] - ) - ) { + redirectUrl && + ( + resource.url === this._url || + resource.url === this._redirectUrls[this._redirectUrls.length - 1] + ) + ) { debug('Redirect to %s', redirectUrl); this._redirectUrls.push(redirectUrl); } const matched = this.getProxyIndicators('redirect').find(item => redirectUrl.match(item.url)); if (matched) { - this.addProxyError(this.createProxyError(matched)); + this.addProxyError(createProxyError(matched)); } } }); } - - /** - * @param {object} resource - * @returns {string} - * @private - */ - _extractRedirectUrl(resource) { - let redirectUrl; - if (resource.redirectUrl) { - redirectUrl = resource.redirectUrl; - } else { - const locationHeader = (resource.headers || []).find(header => header.name && header.name.toLowerCase() === 'location'); - - if (locationHeader && locationHeader.value) { - redirectUrl = locationHeader.value; - } - } - - return redirectUrl ? this._getRedirectUrl(resource.url, redirectUrl) : ''; - } - - /** - * @param {string} uri - * @returns {string} - * @private - */ - _getHostName(uri) { - const parsed = url.parse(uri); - return parsed.hostname; - } - - /** - * @param {string} currentUrl - * @param {string} redirectUri - * @returns {string} - * @private - */ - _getRedirectUrl(currentUrl, redirectUri) { - currentUrl = url.parse(currentUrl); - redirectUri = url.parse(redirectUri); - const hostname = redirectUri.hostname || currentUrl.hostname; - const protocol = redirectUri.protocol || currentUrl.protocol; - - return protocol + '//' + hostname + redirectUri.path; - } } module.exports = PhantomEnvironment; diff --git a/package.json b/package.json index a2730fe..795b7cf 100644 --- a/package.json +++ b/package.json @@ -11,11 +11,9 @@ "debug": "^3.0.1", "goose-abstract-environment": "1.0.8", "lodash": "^4.17.4", - "mkdir-p": "^0.0.7", + "mkdirp": "^0.5.1", "phantom": "^0.8.4", - "phantomjs-prebuilt": "^2.1.7", - "vow": "^0.4.16", - "vow-node": "^0.3.0" + "phantomjs-prebuilt": "^2.1.7" }, "devDependencies": { "babel-cli": "^6.18.0", diff --git a/yarn.lock b/yarn.lock index d087cd2..72ceab1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1601,9 +1601,9 @@ globby@^5.0.0: pify "^2.0.0" pinkie-promise "^2.0.0" -goose-abstract-environment@1.0.6: - version "1.0.6" - resolved "https://registry.yarnpkg.com/goose-abstract-environment/-/goose-abstract-environment-1.0.6.tgz#64f28d091d21edf9da924dbbb8504d471b4d81ad" +goose-abstract-environment@1.0.8: + version "1.0.8" + resolved "https://registry.yarnpkg.com/goose-abstract-environment/-/goose-abstract-environment-1.0.8.tgz#8b3ea8aecb89c19f001af2ccdca98a97a7d9101f" dependencies: babel-polyfill "^6.23.0" debug "^3.0.1" @@ -2158,10 +2158,6 @@ minimist@^1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.0.tgz#a35008b20f41383eec1fb914f4cd5df79a264284" -mkdir-p@^0.0.7: - version "0.0.7" - resolved "https://registry.yarnpkg.com/mkdir-p/-/mkdir-p-0.0.7.tgz#24c5dbe26da3a99ef158a1eef9a5c2dd9de5683c" - mkdirp@0.5.0: version "0.5.0" resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.0.tgz#1d73076a6df986cd9344e15e71fcc05a4c9abf12" @@ -3046,18 +3042,12 @@ weak@^1.0.0: bindings "^1.2.1" nan "^2.0.5" -which@1.2.x, which@~1.2.10: +which@1.2.x, which@^1.2.10, which@^1.2.9, which@~1.2.10: version "1.2.14" resolved "https://registry.yarnpkg.com/which/-/which-1.2.14.tgz#9a87c4378f03e827cecaf1acdf56c736c01c14e5" dependencies: isexe "^2.0.0" -which@^1.2.10, which@^1.2.9: - version "1.3.0" - resolved "https://registry.yarnpkg.com/which/-/which-1.3.0.tgz#ff04bdfc010ee547d780bec38e1ac1c2777d253a" - dependencies: - isexe "^2.0.0" - wide-align@^1.1.0: version "1.1.2" resolved "https://registry.yarnpkg.com/wide-align/-/wide-align-1.1.2.tgz#571e0f1b0604636ebc0dfc21b0339bbe31341710"