Skip to content

Commit

Permalink
Merge branch 'main' into url-behaviors-new
Browse files: browse the repository at this point in the history
  • Loading branch information
ikreymer committed Nov 5, 2024
2 parents b7a6b00 + e5bab8e commit d9205bd
Show file tree
Hide file tree
Showing 22 changed files with 200 additions and 149 deletions.
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.3.3",
"version": "1.3.4",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand All @@ -17,7 +17,7 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@webrecorder/wabac": "^2.20.0-beta.4",
"@webrecorder/wabac": "^2.20.0",
"browsertrix-behaviors": "^0.6.4",
"client-zip": "^2.4.5",
"fetch-socks": "^1.3.0",
Expand All @@ -30,7 +30,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^23.5.1",
"puppeteer-core": "^23.6.0",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
Expand Down
6 changes: 4 additions & 2 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ export class Crawler {
finalExit = false;
uploadAndDeleteLocal = false;
done = false;
postCrawling = false;

textInPages = false;

Expand Down Expand Up @@ -1536,12 +1537,13 @@ self.__bx_behaviors.selectMainBehavior();
}

async postCrawl() {
this.postCrawling = true;
logger.info("Crawling done");

if (this.params.combineWARC && !this.params.dryRun) {
await this.combineWARC();
}

logger.info("Crawling done");

if (
(this.params.generateCDX || this.params.generateWACZ) &&
!this.params.dryRun
Expand Down
36 changes: 11 additions & 25 deletions src/util/browser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { Readable } from "node:stream";
import os from "os";
import path from "path";

import { LogContext, logger } from "./logger.js";
import { formatErr, LogContext, logger } from "./logger.js";
import { initStorage } from "./storage.js";

import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
Expand Down Expand Up @@ -126,7 +126,7 @@ export class Browser {
? undefined
: (target) => this.targetFilter(target),
};
await this._init(launchOpts, ondisconnect, recording);
await this._init(launchOpts, ondisconnect);
}

targetFilter(target: Target) {
Expand Down Expand Up @@ -392,17 +392,14 @@ export class Browser {
launchOpts: PuppeteerLaunchOptions,
// eslint-disable-next-line @typescript-eslint/ban-types
ondisconnect: Function | null = null,
recording: boolean,
) {
this.browser = await puppeteer.launch(launchOpts);

const target = this.browser.target();

this.firstCDP = await target.createCDPSession();

if (recording) {
await this.serviceWorkerFetch();
}
await this.browserContextFetch();

if (ondisconnect) {
this.browser.on("disconnected", (err) => ondisconnect(err));
Expand Down Expand Up @@ -479,35 +476,24 @@ export class Browser {
return { page, cdp };
}

async serviceWorkerFetch() {
async browserContextFetch() {
if (!this.firstCDP) {
return;
}

this.firstCDP.on("Fetch.requestPaused", async (params) => {
const { frameId, requestId, networkId, request } = params;
const { frameId, requestId, request } = params;

const { url } = request;

if (!this.firstCDP) {
throw new Error("CDP missing");
}

if (networkId) {
try {
await this.firstCDP.send("Fetch.continueResponse", { requestId });
} catch (e) {
logger.warn(
"continueResponse failed",
{ url: request.url },
"recorder",
);
}
return;
}

let foundRecorder = null;

for (const recorder of this.recorders) {
if (recorder.swUrls.has(request.url)) {
if (recorder.swUrls.has(url)) {
recorder.swFrameIds.add(frameId);
}

Expand All @@ -520,16 +506,16 @@ export class Browser {
if (!foundRecorder) {
logger.warn(
"Skipping URL from unknown frame",
{ url: request.url, frameId },
{ url, frameId },
"recorder",
);

try {
await this.firstCDP.send("Fetch.continueResponse", { requestId });
} catch (e) {
logger.warn(
logger.debug(
"continueResponse failed",
{ url: request.url },
{ url, ...formatErr(e), from: "serviceWorker" },
"recorder",
);
}
Expand Down
Loading

0 comments on commit d9205bd

Please sign in to comment.