Skip to content

Commit

Permalink
feat: stopping the crawlers gracefully with BasicCrawler.stop() (#2792)

Browse files Browse the repository at this point in the history

Allows users to call `crawler.stop()` to stop the crawler gracefully. 

Closes #2777

---------

Co-authored-by: Martin Adámek <[email protected]>
  • Loading branch information
barjin and B4nan authored Jan 20, 2025
1 parent 53331e8 commit af2966f
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 1 deletion.
19 changes: 18 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
/**
* Allows to keep the crawler alive even if the {@apilink RequestQueue} gets empty.
* By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
* waiting for more requests to come. Use `crawler.teardown()` to exit the crawler.
* waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
*/
keepAlive?: boolean;

Expand Down Expand Up @@ -977,6 +977,23 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return stats;
}

/**
* Gracefully stops the current run of the crawler.
*
* All the tasks active at the time of calling this method will be allowed to finish.
*/
stop(message = 'The crawler has been gracefully stopped.'): void {
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
this.autoscaledPool
?.pause()
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
.then(async () => this.autoscaledPool?.abort())
.then(() => this.log.info(message))
.catch((err) => {
this.log.error('An error occurred when stopping the crawler:', err);
});
}

async getRequestQueue() {
if (!this.requestQueue && this.requestList) {
this.log.warningOnce(
Expand Down
7 changes: 7 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"actorSpecification": 1,
"name": "test-cheerio-stop-resume-ts",
"version": "0.0",
"buildTag": "latest",
"env": null
}
8 changes: 8 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"root": true,
"extends": "../../.eslintrc.json",
"parserOptions": {
"project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json",
"ecmaVersion": 2022
}
}
11 changes: 11 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:20-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
&& npm run build

# create final image
FROM apify/actor-node:20-beta
# copy only necessary files from the builder stage: the locally-packed crawlee
# packages, the manifest, and the compiled entry point
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# install only prod deps
# NOTE(review): `--only=prod` and `--no-optional` are deprecated npm aliases
# (modern spellings are `--omit=dev` / `--omit=optional`); kept as-is since the
# pinned base image's npm still honors them — confirm before upgrading the image.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional --no-audit \
&& npm update --no-audit \
&& echo "Installed NPM packages:" \
&& (npm list --only=prod --no-optional --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version

# run compiled code
CMD npm run start:prod
31 changes: 31 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/main.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { CheerioCrawler, Dataset } from '@crawlee/cheerio';
import { Actor } from 'apify';

// The e2e harness selects the storage backend via an env var: use the local
// storage implementation when requested, otherwise the default one.
if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
    // @ts-ignore
    await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() });
} else {
    await Actor.init();
}

// Counts handler invocations within the current run.
let handledRequests = 0;

const crawler = new CheerioCrawler();

crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => {
    const { url } = request;

    await enqueueLinks({
        globs: ['https://crawlee.dev/docs/**'],
    });

    const pageTitle = $('title').first().text();
    log.info(`URL: ${url} TITLE: ${pageTitle}`);
    await Dataset.pushData({ url, pageTitle });

    // From the 12th handled request onward, ask the crawler to wind down
    // gracefully — in-flight requests are still allowed to finish.
    handledRequests += 1;
    if (handledRequests > 11) crawler.stop();
});

await crawler.run(['https://crawlee.dev/docs/quick-start']);

// Second run resumes against the same (non-purged) request queue to exercise
// the stop/resume behavior.
handledRequests = 0;
await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false });

await Actor.exit({ exit: Actor.isAtHome() });
35 changes: 35 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"name": "test-cheerio-stop-resume-ts",
"version": "0.0.1",
"description": "Crawler Stop-Resume Test - TypeScript",
"dependencies": {
"apify": "next",
"@apify/storage-local": "^2.1.3",
"@crawlee/basic": "file:./packages/basic-crawler",
"@crawlee/browser-pool": "file:./packages/browser-pool",
"@crawlee/http": "file:./packages/http-crawler",
"@crawlee/cheerio": "file:./packages/cheerio-crawler",
"@crawlee/core": "file:./packages/core",
"@crawlee/memory-storage": "file:./packages/memory-storage",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
},
"overrides": {
"apify": {
"@crawlee/core": "file:./packages/core",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
}
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"typescript": "^5.0.0"
},
"scripts": {
"start": "tsc && node main.js",
"start:prod": "node main.js",
"build": "tsc"
},
"type": "module",
"license": "ISC"
}
9 changes: 9 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"target": "ES2022",
"lib": ["DOM"]
},
"include": ["./**/*.ts"]
}
12 changes: 12 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/test.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';

const testActorDirname = getActorTestDir(import.meta.url);
await initialize(testActorDirname);

const { stats, datasetItems } = await runActor(testActorDirname);

// Each run may still finish a handful of in-flight requests after stop() is
// called (at most ~10 extra per run), hence the loose upper bound.
await expect(stats.requestsFinished < 40, 'crawler.stop() works');

// Resuming with a non-purged queue must not revisit URLs, so every dataset
// item should have a unique URL.
const uniqueUrls = new Set(datasetItems.map((x) => x.url));
await expect(uniqueUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRQ: false }) works');

0 comments on commit af2966f

Please sign in to comment.