Skip to content

Commit

Permalink
Add option to load custom behaviors from git repo
Browse files Browse the repository at this point in the history
Git repos must be prefixed with git+ to be recognized by the
crawler, and can optionally use branch and path query parameters
to specify which branch to use and the relative filepath to a
directory within the repository, respectively.
  • Loading branch information
tw4l committed Nov 8, 2024
1 parent d045096 commit 297c184
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 3 deletions.
6 changes: 4 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -564,8 +564,10 @@ class ArgParser {

customBehaviors: {
describe:
"Custom behavior files to inject. Values can be URLs, paths to individual behavior files, or paths" +
" to a directory of behavior files",
"Custom behavior files to inject. Valid values: URL to file, path to file, path to directory" +
" of behaviors, URL to Git repo of behaviors (prefixed with git+, optionally specify branch and" +
" relative path to a directory within repo as branch and path query parameters, e.g." +
" git+https://git.example.com/repo.git?branch=dev&path=some/dir",
type: "array",
default: [],
},
Expand Down
42 changes: 41 additions & 1 deletion src/util/file_reader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ import fsp from "fs/promises";
import path from "path";
import crypto from "crypto";
import { fetch } from "undici";
import util from "util";
import { exec as execCallback, execFile } from "child_process";

import { logger } from "./logger.js";

const exec = util.promisify(execCallback);

const MAX_DEPTH = 2;

// Add .ts to allowed extensions when we can support it
Expand All @@ -23,7 +27,11 @@ export async function collectCustomBehaviors(
const collectedSources: FileSources = [];

for (const fileSource of sources) {
if (fileSource.startsWith("http")) {
if (fileSource.startsWith("git+")) {
const newSources = await collectGitBehaviors(fileSource);
collectedSources.push(...newSources);
}
else if (fileSource.startsWith("http")) {
const newSources = await collectOnlineBehavior(fileSource);
collectedSources.push(...newSources);
} else {
Expand All @@ -35,6 +43,38 @@ export async function collectCustomBehaviors(
return collectedSources;
}

/**
 * Splits the URL part of a `git+<url>` custom-behaviors source into the plain
 * repository URL and the crawler-specific `branch` / `path` query parameters.
 *
 * The two parameters are removed from the returned repository URL because
 * they are instructions for the crawler, not part of the address git should
 * clone from.
 *
 * @param url - Repository URL with the `git+` prefix already removed.
 * @returns The clone URL plus the requested branch and relative path
 *   (empty strings when not specified).
 * @throws TypeError if `url` is not a valid absolute URL.
 */
function parseGitBehaviorUrl(url: string): {
  repoUrl: string;
  branch: string;
  relPath: string;
} {
  const parsed = new URL(url);
  const branch = parsed.searchParams.get("branch") ?? "";
  const relPath = parsed.searchParams.get("path") ?? "";

  parsed.searchParams.delete("branch");
  parsed.searchParams.delete("path");

  return { repoUrl: parsed.toString(), branch, relPath };
}

/**
 * Clones a Git repository of custom behaviors into a fresh directory under
 * /tmp and collects the behavior files found in it.
 *
 * @param gitUrl - Source of the form `git+<repo-url>[?branch=<name>&path=<dir>]`.
 *   `branch` selects the branch to clone, `path` a directory inside the
 *   repository to collect from (defaults to the repository root).
 * @returns The collected behavior sources, or an empty list if the URL is
 *   invalid, `path` points outside the repository, or the clone / collection
 *   fails. Failures are logged rather than thrown.
 */
async function collectGitBehaviors(gitUrl: string): Promise<FileSources> {
  const GIT_PREFIX = "git+";
  // Strip only the leading marker; a later "git+" inside the URL is left intact.
  const url = gitUrl.startsWith(GIT_PREFIX)
    ? gitUrl.slice(GIT_PREFIX.length)
    : gitUrl;

  const tmpDir = `/tmp/behaviors-repo-${Date.now()}`;

  try {
    const { repoUrl, branch, relPath } = parseGitBehaviorUrl(url);

    // path.join() normalizes ".." segments, so a prefix check is enough to
    // ensure the requested directory stays inside the cloned repository.
    const pathToCollect = path.join(tmpDir, relPath);
    if (
      pathToCollect !== tmpDir &&
      !pathToCollect.startsWith(tmpDir + path.sep)
    ) {
      throw new Error(`Path "${relPath}" is outside of the cloned repository`);
    }

    const cloneArgs = ["clone"];
    if (branch) {
      cloneArgs.push("-b", branch, "--single-branch");
    }
    // "--" ends option parsing so the URL can never be read as a git flag.
    cloneArgs.push("--", repoUrl, tmpDir);

    // execFile passes arguments directly to git without a shell, so
    // characters such as "&", ";" or "$(...)" in the user-supplied URL or
    // branch name cannot inject additional commands.
    await util.promisify(execFile)("git", cloneArgs);

    return await collectLocalPathBehaviors(pathToCollect);
  } catch (e) {
    logger.error(
      "Error downloading custom behaviors from Git repo",
      { url, error: e },
      "behavior",
    );
    // Always hand the caller an iterable result, since it spreads the
    // returned list into its own collection.
    return [];
  }
}

async function collectOnlineBehavior(url: string): Promise<FileSources> {
const filename = crypto.randomBytes(4).toString("hex") + ".js";
const behaviorFilepath = `/app/behaviors/${filename}`;
Expand Down
35 changes: 35 additions & 0 deletions tests/custom-behavior.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,41 @@ test("test mixed custom behavior sources", async () => {
).toBe(true);
});

// Runs a crawl whose custom behaviors are loaded from a Git repository
// (branch and sub-directory selected via query parameters) and checks the
// crawl log to confirm each behavior ran only on the page it targets.
test("test custom behaviors from git repo", async () => {
  // The git+ URL is single-quoted: it contains "?" and "&", which the shell
  // would otherwise treat as a glob character and a background operator,
  // cutting the command in two before --scopeType.
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors 'git+https://github.com/webrecorder/browsertrix-crawler.git?branch=custom-behaviors-testing&path=tests/custom-behaviors' --scopeType page",
  );

  const log = res.toString();

  // custom behavior ran for specs.webrecorder.net
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat","page":"https://specs.webrecorder.net/","workerid":0}}',
    ) > 0,
  ).toBe(true);

  // but not for example.org
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
    ) > 0,
  ).toBe(false);

  // example.org was handled by the built-in autoscroll behavior instead
  expect(
    log.indexOf(
      '{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
    ) > 0,
  ).toBe(true);

  // another custom behavior ran for old.webrecorder.net
  expect(
    log.indexOf(
      '{"state":{},"msg":"test-stat-2","page":"https://old.webrecorder.net/","workerid":0}}',
    ) > 0,
  ).toBe(true);
});

test("test invalid behavior exit", async () => {
let status = 0;

Expand Down

0 comments on commit 297c184

Please sign in to comment.