Skip to content

Commit

Permalink
Support for dynamic page loading + multi WACZ loading optimization (#220
Browse files Browse the repository at this point in the history
)

When dealing with a multi-WACZ collection specified via a JSON manifest,
loading all WACZ files on init does not scale.
This PR provides an initial optimization to:
- Avoid loading all WACZ files on init for multi-WACZ collections; if pagesQueryUrl is
provided, also avoid loading the textIndex for all WACZs.
- Support querying pages by an exact-match 'url' param via
pagesQueryUrl to determine whether matching pages exist and which WACZ files they're
in, and load only those files.
- Support for preloadResources to automatically load a subset of WACZ
files (such as those that have no pages, for patches).
- Support loading initial pages in initialPages list
- Track if page is a seed via isSeed, store isSeed in pages data, also store waczhash.
- Handling of crawl id in initial pages and wacz files to build a mapping of
seed-page-for-crawl to always load the WACZ that contains seed pages.
- Fall back to loading all WACZs if there are fewer than 3 files in total.
- Dynamic search interface via pagesQueryUrl with 'search', 'page',
'pageSize' params.
- bump wombat to 3.8.8
- bump wabac version to 2.21.0
  • Loading branch information
ikreymer authored Feb 14, 2025
1 parent 1259d09 commit fca2a3c
Show file tree
Hide file tree
Showing 7 changed files with 310 additions and 42 deletions.
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@webrecorder/wabac",
"version": "2.20.8",
"version": "2.21.0",
"main": "index.js",
"type": "module",
"exports": {
Expand All @@ -19,7 +19,7 @@
"@peculiar/asn1-schema": "^2.3.3",
"@peculiar/x509": "^1.9.2",
"@types/js-levenshtein": "^1.1.3",
"@webrecorder/wombat": "^3.8.7",
"@webrecorder/wombat": "^3.8.8",
"acorn": "^8.10.0",
"auto-js-ipfs": "^2.1.1",
"base64-js": "^1.5.1",
Expand Down
26 changes: 25 additions & 1 deletion src/api.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { Path } from "path-parser";
import { getCollData } from "./utils";
import { type SWCollections } from "./swmain";
import { MultiWACZ } from "./wacz/multiwacz";

// [TODO]
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand Down Expand Up @@ -126,6 +127,9 @@ class API {
data.pages = await coll.store.getAllPages();
data.lists = await coll.store.db.getAll("pageLists");
data.curatedPages = await coll.store.db.getAll("curatedPages");
if (coll.store instanceof MultiWACZ) {
data.canQueryPages = !!coll.store.pagesQueryUrl;
}
} else {
data.pages = [];
data.lists = [];
Expand Down Expand Up @@ -293,8 +297,28 @@ class API {
if (!coll) {
return { error: "collection_not_found" };
}
let total = undefined;
if (coll.store instanceof MultiWACZ) {
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const search = params._query.get("search");
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const page = Number(params._query.get("page")) || 1;
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const pageSize = Number(params._query.get("pageSize")) || 25;
if (search || page > 1) {
const { pages, total } = await coll.store.queryPages(
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
search,
page,
pageSize,
);
return { pages, total };
} else {
total = coll.store.totalPages;
}
}
const pages = await coll.store.getAllPages();
return { pages };
return { pages, total };
}

case "textIndex": {
Expand Down
8 changes: 8 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ export type PageEntry = {

timestamp?: string;

mime?: string;
depth?: number;
status?: number;
favIconUrl?: string;
wacz?: string;
waczhash?: string;
isSeed?: boolean;

pos?: number;
// [TODO]
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand Down
Loading

0 comments on commit fca2a3c

Please sign in to comment.