diff --git a/scripts/prepare/build/index.js b/scripts/prepare/build/index.js index 2eb77ac..edcfd9c 100644 --- a/scripts/prepare/build/index.js +++ b/scripts/prepare/build/index.js @@ -44,28 +44,5 @@ async function crawlers({ fixturesDirectory, downloadedDirectory }) { const crawlers = await readFixturesYaml( join(fixturesDirectory, "crawlers.yml"), ); - const browsersList = await browsers({ fixturesDirectory }); - const downloaded = []; - for (const file of await readdir(downloadedDirectory)) { - if (!file.endsWith(".json")) { - continue; - } - try { - const content = await readFile(join(downloadedDirectory, file)); - downloaded.push(...JSON.parse(content.toString())); - } catch (error) { - // Ignore - } - } - return crawlers.concat( - // Filter the downloaded crawlers lists - downloaded - .flat() - .filter((ua) => !ua.startsWith("#")) // Remove comments - .filter( - (ua = "") => !/ucweb|cubot/i.test(ua), // I don't know why it's in so many crawler lists - ) - .filter((ua) => !browsersList.includes(ua)) // Remove browsers manually added to browsers.yml - .filter((ua = "") => ua.length < 4e3), // Remove very long user agent strings - ); + return crawlers; } diff --git a/tests/spec/test.ts b/tests/spec/test.ts index d2ac3b7..0523057 100644 --- a/tests/spec/test.ts +++ b/tests/spec/test.ts @@ -42,22 +42,22 @@ describe("isai", () => { expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true); }); test("isaiMatch: find pattern in bot user agent string", () => { - expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("Google"); + expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("https://openai.com/searchbot"); }); test("isaiMatches: find all patterns in bot user agent string", () => { - expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("Google"); - expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(4); + expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("https://openai.com/searchbot"); + expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(1); }); test("isaiPattern: find first pattern in bot user agent string", () => { expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe( - "(? { expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain( - "(? { const customisai = createisai(/bot/i); @@ -65,8 +65,8 @@ describe("isai", () => { }); test("createisaiFromList: create custom isai function with custom pattern", () => { const ChromeLighthouseUserAgentStrings: string[] = [ - "mozilla/5.0 (macintosh; intel mac os x 10_15_7) applewebkit/537.36 (khtml, like gecko) chrome/94.0.4590.2 safari/537.36 chrome-lighthouse", - "mozilla/5.0 (linux; android 7.0; moto g (4)) applewebkit/537.36 (khtml, like gecko) chrome/94.0.4590.2 mobile safari/537.36 chrome-lighthouse", + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot", + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot", ]; const patternsToRemove: Set = new Set( ChromeLighthouseUserAgentStrings.map(isaiMatches).flat(), @@ -98,7 +98,7 @@ describe("isai", () => { (percent) => { const ratio = crawlers.filter((ua) => isaiNaive(ua)).length / crawlers.length; - expect(ratio).toBeLessThan(1); + expect(ratio).toBeLessThanOrEqual(1); expect(ratio).toBeGreaterThan(percent / 100); }, );