From 4aee02a4ad718d905bef66e9b9c8efdaaac1c858 Mon Sep 17 00:00:00 2001 From: muzafferkadir Date: Fri, 6 Sep 2024 16:58:43 +0300 Subject: [PATCH 1/2] feat: adding crawlee's EnqueueStrategy config --- src/config.ts | 9 +++++++++ src/core.ts | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/src/config.ts b/src/config.ts index 787744ce..13178e47 100644 --- a/src/config.ts +++ b/src/config.ts @@ -26,6 +26,15 @@ export const configSchema = z.object({ * @default "" */ exclude: z.string().or(z.array(z.string())).optional(), + /** + * Set Crawlee strategy to check certain parts of the URLs found. + * @example "same-origin" + * @default "same-hostname" + * @see https://crawlee.dev/api/core/enum/EnqueueStrategy + */ + crawlStrategy: z + .enum(["all", "same-origin", "same-hostname", "same-domain"]) + .optional(), /** * Selector to grab the inner text from * @example ".docs-builder-container" diff --git a/src/core.ts b/src/core.ts index c996f2bb..2a51ffb0 100644 --- a/src/core.ts +++ b/src/core.ts @@ -97,6 +97,10 @@ export async function crawl(config: Config) { typeof config.exclude === "string" ? [config.exclude] : config.exclude ?? [], + strategy: + typeof config.crawlStrategy === "string" + ? config.crawlStrategy + : undefined, }); }, // Comment this option to scrape the full website. From 72b0464ac6879b739c1b813ed94b45a616d2a8fb Mon Sep 17 00:00:00 2001 From: muzafferkadir Date: Wed, 11 Sep 2024 02:53:12 +0300 Subject: [PATCH 2/2] style: formatting --- CHANGELOG.md | 3 +-- src/core.ts | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac73b4e7..4ab0556d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,8 @@ # [1.5.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.4.0...v1.5.0) (2024-07-05) - ### Features -* git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) +- git clone depth limit in docker ([87767db](https://github.com/BuilderIO/gpt-crawler/commit/87767dbda99b3259d44ec2c02dceb3a59bb2ca3c)) # [1.4.0](https://github.com/BuilderIO/gpt-crawler/compare/v1.3.0...v1.4.0) (2024-01-15) diff --git a/src/core.ts b/src/core.ts index 2a51ffb0..02c15e16 100644 --- a/src/core.ts +++ b/src/core.ts @@ -96,7 +96,7 @@ export async function crawl(config: Config) { exclude: typeof config.exclude === "string" ? [config.exclude] - : config.exclude ?? [], + : (config.exclude ?? []), strategy: typeof config.crawlStrategy === "string" ? config.crawlStrategy