diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 168282c8fe6c..0c87d8b36c18 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -294,7 +294,7 @@ export interface BasicCrawlerOptions this.autoscaledPool?.abort()) + .then(() => this.log.info(message)) + .catch((err) => { + this.log.error('An error occurred when stopping the crawler:', err); + }); + } + async getRequestQueue() { if (!this.requestQueue && this.requestList) { this.log.warningOnce( diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json b/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json new file mode 100644 index 000000000000..67b63ddeba6e --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json @@ -0,0 +1,7 @@ +{ + "actorSpecification": 1, + "name": "test-cheerio-stop-resume-ts", + "version": "0.0", + "buildTag": "latest", + "env": null +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json b/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json new file mode 100644 index 000000000000..20fde449cb45 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json @@ -0,0 +1,8 @@ +{ + "root": true, + "extends": "../../.eslintrc.json", + "parserOptions": { + "project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json", + "ecmaVersion": 2022 + } +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.gitignore b/test/e2e/cheerio-stop-resume-ts/actor/.gitignore new file mode 100644 index 000000000000..f2fc11c72bcc --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.gitignore @@ -0,0 +1,11 @@ +.idea +.DS_Store +node_modules +package-lock.json +apify_storage +crawlee_storage +storage +main.d.ts +main.d.ts.map +main.js +main.js.map diff --git a/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile b/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile new file mode 100644 index 
000000000000..59ba4ae8b5e8 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile @@ -0,0 +1,28 @@ +# using multistage build, as we need dev deps to build the TS source code +FROM apify/actor-node:20-beta AS builder + +# copy all files, install all dependencies (including dev deps) and build the project +COPY . ./ +RUN npm install --include=dev \ + && npm run build + +# create final image +FROM apify/actor-node:20-beta +# copy only necessary files +COPY --from=builder /usr/src/app/packages ./packages +COPY --from=builder /usr/src/app/package.json ./ +COPY --from=builder /usr/src/app/main.js ./ + +# install only prod deps +RUN npm --quiet set progress=false \ + && npm install --only=prod --no-optional --no-audit \ + && npm update --no-audit \ + && echo "Installed NPM packages:" \ + && (npm list --only=prod --no-optional --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version + +# run compiled code +CMD npm run start:prod diff --git a/test/e2e/cheerio-stop-resume-ts/actor/main.ts b/test/e2e/cheerio-stop-resume-ts/actor/main.ts new file mode 100644 index 000000000000..8f14b3068168 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/main.ts @@ -0,0 +1,31 @@ +import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; +import { Actor } from 'apify'; + +if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') { + // @ts-ignore + await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() }); +} else { + await Actor.init(); +} + +let requestCount = 0; + +const crawler = new CheerioCrawler(); +crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => { + const { url } = request; + await enqueueLinks({ + globs: ['https://crawlee.dev/docs/**'], + }); + + const pageTitle = $('title').first().text(); + log.info(`URL: ${url} TITLE: ${pageTitle}`); + await Dataset.pushData({ url, pageTitle }); + + if (requestCount++ > 10) crawler.stop(); +}); + +await 
crawler.run(['https://crawlee.dev/docs/quick-start']); + +requestCount = 0; +await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false }); +await Actor.exit({ exit: Actor.isAtHome() }); diff --git a/test/e2e/cheerio-stop-resume-ts/actor/package.json b/test/e2e/cheerio-stop-resume-ts/actor/package.json new file mode 100644 index 000000000000..cf307b836523 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/package.json @@ -0,0 +1,35 @@ +{ + "name": "test-cheerio-stop-resume-ts", + "version": "0.0.1", + "description": "Crawler Stop-Resume Test - TypeScript", + "dependencies": { + "apify": "next", + "@apify/storage-local": "^2.1.3", + "@crawlee/basic": "file:./packages/basic-crawler", + "@crawlee/browser-pool": "file:./packages/browser-pool", + "@crawlee/http": "file:./packages/http-crawler", + "@crawlee/cheerio": "file:./packages/cheerio-crawler", + "@crawlee/core": "file:./packages/core", + "@crawlee/memory-storage": "file:./packages/memory-storage", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils" + }, + "overrides": { + "apify": { + "@crawlee/core": "file:./packages/core", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils" + } + }, + "devDependencies": { + "@apify/tsconfig": "^0.1.0", + "typescript": "^5.0.0" + }, + "scripts": { + "start": "tsc && node main.js", + "start:prod": "node main.js", + "build": "tsc" + }, + "type": "module", + "license": "ISC" +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json b/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json new file mode 100644 index 000000000000..7a212668d291 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "@apify/tsconfig", + "compilerOptions": { + "module": "ES2022", + "target": "ES2022", + "lib": ["DOM"] + }, + "include": ["./**/*.ts"] +} diff --git a/test/e2e/cheerio-stop-resume-ts/test.mjs 
b/test/e2e/cheerio-stop-resume-ts/test.mjs new file mode 100644 index 000000000000..b118f15ad612 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/test.mjs @@ -0,0 +1,12 @@ +import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; +
+const testActorDirname = getActorTestDir(import.meta.url); +await initialize(testActorDirname); +
+const { stats, datasetItems } = await runActor(testActorDirname); +
+// Some extra requests are expected (at most 10 extra for each run). +await expect(stats.requestsFinished < 40, 'crawler.stop() works'); +
+const visitedUrls = new Set(datasetItems.map((x) => x.url)); +await expect(visitedUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRequestQueue: false }) works');