-
Notifications
You must be signed in to change notification settings - Fork 740
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: stopping the crawlers gracefully with `BasicCrawler.stop()` (#2792)
Allows users to call `crawler.stop()` to stop the crawler gracefully. Closes #2777
---------
Co-authored-by: Martin Adámek <[email protected]>
- Loading branch information
Showing 9 changed files with 159 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"actorSpecification": 1, | ||
"name": "test-cheerio-stop-resume-ts", | ||
"version": "0.0", | ||
"buildTag": "latest", | ||
"env": null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"root": true, | ||
"extends": "../../.eslintrc.json", | ||
"parserOptions": { | ||
"project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json", | ||
"ecmaVersion": 2022 | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
.idea | ||
.DS_Store | ||
node_modules | ||
package-lock.json | ||
apify_storage | ||
crawlee_storage | ||
storage | ||
main.d.ts | ||
main.d.ts.map | ||
main.js | ||
main.js.map |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# using multistage build, as we need dev deps to build the TS source code | ||
# stage 1 ("builder"): has dev dependencies installed and compiles the TypeScript sources | ||
FROM apify/actor-node:20-beta AS builder | ||
|
||
# copy all files, install all dependencies (including dev deps) and build the project | ||
COPY . ./ | ||
RUN npm install --include=dev \ | ||
&& npm run build | ||
|
||
# create final image | ||
# stage 2: runtime image that receives only the build output from the builder stage | ||
FROM apify/actor-node:20-beta | ||
# copy only necessary files | ||
# ./packages holds the local packages that package.json references through file:./packages/... paths | ||
COPY --from=builder /usr/src/app/packages ./packages | ||
COPY --from=builder /usr/src/app/package.json ./ | ||
# main.js is the compiled output produced by `npm run build` in the builder stage | ||
COPY --from=builder /usr/src/app/main.js ./ | ||
|
||
# install only prod deps | ||
# the listing step is wrapped in `|| true` so a non-zero exit from `npm list` does not fail the build | ||
RUN npm --quiet set progress=false \ | ||
&& npm install --only=prod --no-optional --no-audit \ | ||
&& npm update --no-audit \ | ||
&& echo "Installed NPM packages:" \ | ||
&& (npm list --only=prod --no-optional --all || true) \ | ||
&& echo "Node.js version:" \ | ||
&& node --version \ | ||
&& echo "NPM version:" \ | ||
&& npm --version | ||
|
||
# run compiled code | ||
# start:prod is the package.json script that runs `node main.js` without recompiling | ||
CMD npm run start:prod |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// E2E test actor: crawls the Crawlee docs with CheerioCrawler, stops the crawler from inside | ||
// the request handler via crawler.stop(), then runs the same crawler a second time without | ||
// purging the request queue. | ||
import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; | ||
import { Actor } from 'apify'; | ||
|
||
// When STORAGE_IMPLEMENTATION=LOCAL, initialize the Actor with the @apify/storage-local | ||
// storage (loaded lazily via dynamic import); otherwise initialize with the defaults. | ||
if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') { | ||
// @ts-ignore | ||
await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() }); | ||
} else { | ||
await Actor.init(); | ||
} | ||
|
||
// Number of requests handled in the current run; reset to 0 before the second run. | ||
let requestCount = 0; | ||
|
||
const crawler = new CheerioCrawler(); | ||
// Default handler: enqueue further docs links, store the page URL and title, and request a | ||
// stop once enough pages have been handled. | ||
crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => { | ||
const { url } = request; | ||
// Only links matching the Crawlee docs glob are enqueued. | ||
await enqueueLinks({ | ||
globs: ['https://crawlee.dev/docs/**'], | ||
}); | ||
|
||
const pageTitle = $('title').first().text(); | ||
log.info(`URL: ${url} TITLE: ${pageTitle}`); | ||
await Dataset.pushData({ url, pageTitle }); | ||
|
||
// Post-increment: the comparison sees the value before incrementing, so stop() is first | ||
// called on the 12th handled request of a run (when the counter was 11). | ||
// NOTE(review): requests already in flight presumably still finish after stop() - confirm. | ||
if (requestCount++ > 10) crawler.stop(); | ||
}); | ||
|
||
// First run: starts from the quick-start page and ends once the handler calls crawler.stop(). | ||
await crawler.run(['https://crawlee.dev/docs/quick-start']); | ||
|
||
// Second run: the counter is reset so the stop condition applies afresh. | ||
// NOTE(review): purgeRequestQueue: false presumably keeps the queue state from the first | ||
// run so already-handled URLs are not processed again - confirm against the crawler docs. | ||
requestCount = 0; | ||
await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false }); | ||
// NOTE(review): presumably the process is only force-exited when running on the Apify | ||
// platform (Actor.isAtHome()) - confirm. | ||
await Actor.exit({ exit: Actor.isAtHome() }); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
{ | ||
"name": "test-cheerio-stop-resume-ts", | ||
"version": "0.0.1", | ||
"description": "Crawler Stop-Resume Test - TypeScript", | ||
"dependencies": { | ||
"apify": "next", | ||
"@apify/storage-local": "^2.1.3", | ||
"@crawlee/basic": "file:./packages/basic-crawler", | ||
"@crawlee/browser-pool": "file:./packages/browser-pool", | ||
"@crawlee/http": "file:./packages/http-crawler", | ||
"@crawlee/cheerio": "file:./packages/cheerio-crawler", | ||
"@crawlee/core": "file:./packages/core", | ||
"@crawlee/memory-storage": "file:./packages/memory-storage", | ||
"@crawlee/types": "file:./packages/types", | ||
"@crawlee/utils": "file:./packages/utils" | ||
}, | ||
"overrides": { | ||
"apify": { | ||
"@crawlee/core": "file:./packages/core", | ||
"@crawlee/types": "file:./packages/types", | ||
"@crawlee/utils": "file:./packages/utils" | ||
} | ||
}, | ||
"devDependencies": { | ||
"@apify/tsconfig": "^0.1.0", | ||
"typescript": "^5.0.0" | ||
}, | ||
"scripts": { | ||
"start": "tsc && node main.js", | ||
"start:prod": "node main.js", | ||
"build": "tsc" | ||
}, | ||
"type": "module", | ||
"license": "ISC" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"extends": "@apify/tsconfig", | ||
"compilerOptions": { | ||
"module": "ES2022", | ||
"target": "ES2022", | ||
"lib": ["DOM"] | ||
}, | ||
"include": ["./**/*.ts"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
// E2E test: runs the stop/resume actor and checks that crawler.stop() bounded the number of | ||
// finished requests and that no URL appears twice in the dataset. | ||
import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; | ||
|
||
// Resolve the actor directory from this test file's URL and prepare it before the run. | ||
const testActorDirname = getActorTestDir(import.meta.url); | ||
await initialize(testActorDirname); | ||
|
||
// Run the actor and collect its run statistics and the stored dataset items. | ||
const { stats, datasetItems } = await runActor(testActorDirname); | ||
|
||
// Some extra requests are expected (at most 10 extra for each run). | ||
await expect(stats.requestsFinished < 40, 'crawler.stop() works'); | ||
|
||
// Every dataset item must have a distinct `url`: the Set size equals the item count only | ||
// when no URL was stored more than once across the runs. | ||
const visitedUrls = new Set(datasetItems.map((x) => x.url)); | ||
await expect(visitedUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRQ: false }) works'); |