Skip to content

Commit

Permalink
clearer scope check (#615)
Browse files Browse the repository at this point in the history
Split isInScope into two methods: a protected, synchronous getScope(), used
for link extraction (no need for async, as we know the seed is already set),
which returns the url and isOOS flag (or false if out of scope);
and a simpler, public async isInScope(), which just returns a bool but
also ensures the seed exists.
  • Loading branch information
ikreymer authored Jun 18, 2024
1 parent ac722cc commit 6329b19
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 10 deletions.
23 changes: 15 additions & 8 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ export class Crawler {
}
}

async isInScope(
protected getScope(
{
seedId,
url,
Expand All @@ -620,13 +620,25 @@ export class Crawler {
}: { seedId: number; url: string; depth: number; extraHops: number },
logDetails = {},
) {
return this.seeds[seedId].isIncluded(url, depth, extraHops, logDetails);
}

async isInScope(
{
seedId,
url,
depth,
extraHops,
}: { seedId: number; url: string; depth: number; extraHops: number },
logDetails = {},
): Promise<boolean> {
const seed = await this.crawlState.getSeedAt(
this.seeds,
this.numOriginalSeeds,
seedId,
);

return seed.isIncluded(url, depth, extraHops, logDetails);
return !!seed.isIncluded(url, depth, extraHops, logDetails);
}

async setupPage({
Expand Down Expand Up @@ -2014,7 +2026,7 @@ self.__bx_behaviors.selectMainBehavior();
const newExtraHops = extraHops + 1;

for (const possibleUrl of urls) {
const res = await this.isInScope(
const res = this.getScope(
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId },
logDetails,
);
Expand All @@ -2023,11 +2035,6 @@ self.__bx_behaviors.selectMainBehavior();
continue;
}

if (res === true) {
logger.warn("Invalid scope response: true", logDetails, "links");
continue;
}

const { url, isOOS } = res;

if (url) {
Expand Down
9 changes: 7 additions & 2 deletions src/util/seeds.ts
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,12 @@ export class ScopedSeed {
return depth >= this.maxDepth;
}

isIncluded(url: string, depth: number, extraHops = 0, logDetails = {}) {
isIncluded(
url: string,
depth: number,
extraHops = 0,
logDetails = {},
): { url: string; isOOS: boolean } | false {
if (depth > this.maxDepth) {
return false;
}
Expand All @@ -231,7 +236,7 @@ export class ScopedSeed {
url = urlParsed.href;

if (url === this.url) {
return true;
return { url, isOOS: false };
}

// skip already crawled
Expand Down

0 comments on commit 6329b19

Please sign in to comment.