Skip to content

Commit

Permalink
Context limit middleware (#45)
Browse files Browse the repository at this point in the history
* Base

* separated middlewares

* readme

* Too many requests error

* Now limit_context is a small service.

* Typos

* Typos

* Delete useless variable

* Version update
  • Loading branch information
MatthewZMSU authored Jul 4, 2024
1 parent ca1e25c commit 615340c
Show file tree
Hide file tree
Showing 10 changed files with 122 additions and 71 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ Here we list them all with their purpose.
* `VIEWPORT_HEIGHT = 720` - height of the browser's window
* `TOKEN_2CAPTCHA = undefined` - token of [2captcha service](https://2captcha.com)
* `STEALTH_BROWSING = true` - should the service use the [stealth browsing](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth) mode
* `MAX_CONCURRENT_CONTEXTS = undefined` - maximum number of browser contexts that may be open at the same time; when unset (or not a number) no limit is enforced

## Notes on memory usage
You need to explicitly close the browser tab once you don't need it (e.g. at the end of the parse method).
Expand Down
7 changes: 5 additions & 2 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ const mhtmlRouter = require('./routes/mhtml');
const harRouter = require('./routes/har');
const closeContextRouter = require('./routes/close_context');

const middlewares = require('./helpers/middlewares')
const middlewares = require('./helpers/middlewares');
const limitContext = require('./helpers/limit_context');
const loggers = require("./helpers/loggers");

const app = express();
Expand All @@ -36,12 +37,14 @@ const VIEWPORT_WIDTH = parseInt(process.env.VIEWPORT_WIDTH) || 1280;
// Browser window height; falls back to 720 when the variable is unset, non-numeric or 0
// (parseInt yields NaN or 0, both of which are falsy).
const VIEWPORT_HEIGHT = parseInt(process.env.VIEWPORT_HEIGHT) || 720;
// Raw 2captcha token; stays undefined when the variable is not set.
const TOKEN_2CAPTCHA = process.env.TOKEN_2CAPTCHA;
// Defaults to true when unset; otherwise true only for a case-insensitive "true".
const STEALTH_BROWSING = (process.env.STEALTH_BROWSING || "true").toLowerCase() === "true";
// The literal string "Infinity" maps to Infinity; anything else goes through parseInt,
// so an unset or non-numeric value becomes NaN.
const MAX_CONCURRENT_CONTEXTS = process.env.MAX_CONCURRENT_CONTEXTS === "Infinity" ? Infinity : parseInt(process.env.MAX_CONCURRENT_CONTEXTS);

// Configure the context limiter with the parsed limit, then set up logging.
limitContext.initContextCounter(MAX_CONCURRENT_CONTEXTS);
loggers.initLogger(LOG_LEVEL, LOG_FILE, LOGSTASH_HOST, LOGSTASH_PORT);

async function setupBrowser() {
try {
if (TOKEN_2CAPTCHA) { // If token is given then RecapcthaPlugin is activated
if (TOKEN_2CAPTCHA) { // If token is given then RecaptchaPlugin is activated
puppeteer.use(
RecaptchaPlugin({
provider: {
Expand Down
8 changes: 8 additions & 0 deletions helpers/exceptions.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ exports.ContextNotFoundError = class ContextNotFoundError extends Error {
this.name = "ContextNotFoundError";
}
}

exports.TooManyContextsError = class TooManyContextsError extends Error {
constructor(message="Could not create new context due to restriction", ...args) {
super(message, ...args);
this.message = message;
this.name = "TooManyContextsError";
}
}
18 changes: 18 additions & 0 deletions helpers/limit_context.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
let contextCounter = 0;

function incContextCounter() {}
exports.incContextCounter = incContextCounter; // Empty function or incrementer

function decContextCounter() {}
exports.decContextCounter = decContextCounter; // Empty function or decrementer

function canCreateContext() { return true; }
exports.canCreateContext = canCreateContext; // Truish function or checker if the context can be created

exports.initContextCounter = function (maxContextCounter) {
if (!isNaN(maxContextCounter)) {
exports.incContextCounter = () => { contextCounter++ };
exports.decContextCounter = () => { contextCounter-- };
exports.canCreateContext = () => { return contextCounter < maxContextCounter }
}
}
66 changes: 0 additions & 66 deletions helpers/middlewares.js

This file was deleted.

3 changes: 3 additions & 0 deletions helpers/middlewares/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Single entry point that re-exports every middleware of this package.
const logging = require('./logging');
const processException = require('./process_exception');

exports.logHTTPMiddleware = logging.logHTTPMiddleware;
exports.logExceptionMiddleware = logging.logExceptionMiddleware;
exports.processExceptionMiddleware = processException.processExceptionMiddleware;
30 changes: 30 additions & 0 deletions helpers/middlewares/logging.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
const loggers = require("../loggers");
const morgan = require("morgan");

/***
* Returns the middleware for logging HTTP request-response.
***/
exports.logHTTPMiddleware = function logHTTPMiddleware() {
const logger = loggers.getLogger();

return morgan(
loggers.HTTPFormat,
{
stream: {
write: (message) => logger.http(message),
},
}
);
}

/***
* Middleware for logging exceptions.
***/
exports.logExceptionMiddleware = async function logExceptionMiddleware(err, req, res, next) {
loggers.getLogger().error({
message: err,
contextId: req.query["contextId"],
pageId: req.query["pageId"],
});
next();
}
38 changes: 38 additions & 0 deletions helpers/middlewares/process_exception.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
const exceptions = require("../exceptions");

/***
* Middleware for processing exceptions.
***/
exports.processExceptionMiddleware = async function processExceptionMiddleware(err, req, res, next) {
if (res.headersSent) {
return next(err);
}

const contextId = err.contextId || req.query.contextId;
const pageId = err.pageId || req.query.pageId;
const errorMessage = err.message || 'Unknown error';

if (contextId) {
res.header('scrapy-puppeteer-service-context-id', contextId);
}

if (err instanceof exceptions.TooManyContextsError) {
res.status(429); // Too Many Requests
} else if (err.contextId) { // there was a context, but something went wrong
res.status(500);
} else { // No context. Possibly, our service was restarted
if (err instanceof exceptions.PageNotFoundError || err instanceof exceptions.ContextNotFoundError) {
res.status(422); // Unprocessable Entity
} else {
res.status(500);
}
}

res.send({
contextId,
pageId,
error: errorMessage
});

next(err);
}
20 changes: 18 additions & 2 deletions helpers/utils.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const exceptions = require("./exceptions");
const { proxyRequest } = require('puppeteer-proxy');
const limitContext = require('./limit_context');

const PROXY_URL_KEY = 'puppeteer-service-proxy-url'

Expand All @@ -26,6 +27,7 @@ exports.closeContexts = async function closeContexts(browser, contextIds) {
const closePromises = [];
for (const context of browser.browserContexts()) {
if (contextIds.includes(context.id)) {
limitContext.decContextCounter();
closePromises.push(context.close());
}
}
Expand Down Expand Up @@ -106,6 +108,20 @@ async function newPage(context) {
return page;
}

/***
 * Creates an incognito browser context while keeping the context counter in sync.
 *
 * @param browser - object whose createIncognitoBrowserContext method is called.
 * @param options - passed through unchanged to createIncognitoBrowserContext.
 * @returns the newly created context.
 * @throws {TooManyContextsError} when limitContext.canCreateContext() reports false.
 * Any error raised during creation is rethrown after the counter is rolled back.
 ***/
async function newContext(browser, options = {}) {
    const allowed = limitContext.canCreateContext();
    if (!allowed) {
        throw new exceptions.TooManyContextsError();
    }

    let context;
    try {
        // Count the context before awaiting, and undo the count if creation fails.
        limitContext.incContextCounter();
        context = await browser.createIncognitoBrowserContext(options);
    } catch (creationError) {
        limitContext.decContextCounter();
        throw creationError;
    }
    return context;
}

function getProxy(request) {
if ('body' in request && 'proxy' in request.body) {
return request.body.proxy;
Expand All @@ -127,12 +143,12 @@ exports.getBrowserPage = async function getBrowserPage(browser, request) {
}
const proxy = getProxy(request);
if (!proxy) {
const context = await browser.createIncognitoBrowserContext();
const context = await newContext(browser);
return newPage(context);
}
const { origin: proxyServer, username, password } = new URL(proxy);

const context = await browser.createIncognitoBrowserContext({ proxyServer });
const context = await newContext(browser, { proxyServer });
context[PROXY_URL_KEY] = proxy;
const page = await newPage(context);
if (username) {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "scrapy-puppeteer-service",
"version": "0.3.0",
"version": "0.3.1",
"private": true,
"scripts": {
"start": "node ./bin/www"
Expand Down

0 comments on commit 615340c

Please sign in to comment.