From 5dc61415193bf2bbb012efc84f0514a5982a10f9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 11 Feb 2025 18:34:20 -0800 Subject: [PATCH] support alwaysLoad param --- src/api.ts | 6 +- src/wacz/multiwacz.ts | 130 ++++++++++++++++++++++++++++++------------ src/wacz/waczfile.ts | 7 ++- 3 files changed, 102 insertions(+), 41 deletions(-) diff --git a/src/api.ts b/src/api.ts index 06dcc5a..e8cf90c 100644 --- a/src/api.ts +++ b/src/api.ts @@ -301,10 +301,12 @@ class API { // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. const q = params._query.get("q"); // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. - const limit = Number(params._query.get("limit")) || 25; + const page = Number(params._query.get("page")) || 1; + // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. 
+ const pageSize = Number(params._query.get("pageSize")) || 25; if (q) { // eslint-disable-next-line @typescript-eslint/no-unsafe-argument - const pages = await coll.store.queryPages(q, limit); + const pages = await coll.store.queryPages(q, page, pageSize); return { pages }; } } diff --git a/src/wacz/multiwacz.ts b/src/wacz/multiwacz.ts index 407a89a..06fe73e 100644 --- a/src/wacz/multiwacz.ts +++ b/src/wacz/multiwacz.ts @@ -58,6 +58,12 @@ export type IDXLine = { loaded: boolean; }; +export type AlwaysLoadData = { + wacz: string; + crawlId: string; + hasPages: boolean; +} + interface MDBType extends ADBType { ziplines: { key: [string, string]; @@ -95,6 +101,9 @@ export class MultiWACZ pagesQuery = ""; + alwaysLoadNoPages: string[] = []; + alwaysLoadByCrawl: Map<string, string[]> = new Map(); + constructor( config: WACZCollConfig, sourceLoader: BaseLoader, @@ -915,6 +924,7 @@ export class MultiWACZ name, hash, path, + crawlId, parent, loader = null, // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -923,7 +933,7 @@ export class MultiWACZ hash = await this.computeFileHash(waczname, hash); - const file = this.addWACZFile({ waczname, hash, path, parent, loader }); + const file = this.addWACZFile({ waczname, hash, crawlId, path, parent, loader }); if (!this.pagesQuery) { await file.init(); @@ -964,17 +974,18 @@ export class MultiWACZ // @ts-expect-error [TODO] - TS4111 - Property 'resources' comes from an index signature, so it must be accessed with ['resources']. 
const files = json.resources.map( - (res: { path: string; name: string; hash: string }) => { + (res: { path: string; name: string; hash: string, crawlId?: string }) => { const path = parent.getLoadPath(res.path); - const name = parent.getName(res.name).split("/")[1]; + const name = parent.getName(res.name); const hash = res.hash; - return { name, hash, path }; + const crawlId = res.crawlId; + return { name, hash, path, crawlId }; }, ); - for (const { name, hash, path } of files) { + for (const { name, hash, path, crawlId } of files) { if (!this.waczfiles[name]) { - promises.push(this.addNewWACZ({ name, hash, path, parent })); + promises.push(this.addNewWACZ({ name, hash, path, parent, crawlId })); } else if (this.waczfiles[name].path !== path) { // [TODO] // eslint-disable-next-line @typescript-eslint/no-unsafe-argument @@ -986,12 +997,28 @@ export class MultiWACZ await Promise.allSettled(promises); } + if (json["alwaysLoad"]) { + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + this.initAlwaysLoadData(json["alwaysLoad"]); + } + if (json["pages"]) { // eslint-disable-next-line @typescript-eslint/no-unsafe-argument await this.addInitialPages(json["pages"]); } } + initAlwaysLoadData(alwaysLoad: AlwaysLoadData[]) { + for (const {wacz, crawlId, hasPages} of alwaysLoad) { + if (!hasPages) { + this.alwaysLoadNoPages.push(wacz); + } + if (crawlId) { + this.alwaysLoadByCrawl.set(crawlId, [wacz]); + } + } + } + // eslint-disable-next-line @typescript-eslint/no-explicit-any async addInitialPages(pagesImport: Record<string, any>[]) { const pages: PageEntry[] = []; @@ -1134,29 +1161,20 @@ export class MultiWACZ } } - const foundMap = new Map(); + const waczFilesToTry: string[] = await this.getWACZFilesToTry(request, waczname); - let waczFilesToTry: Record<string, WACZFile> | null = null; - - if (this.pagesQuery) { - if ( - request.destination === "document" || - request.destination === "iframe" - ) { - const res = await this.getWACZFilesForPagesQuery(request.url); - if (res) { - 
waczFilesToTry = res; - } - } - } else { - waczFilesToTry = this.waczfiles; - } - - if (!waczFilesToTry) { + if (!waczFilesToTry.length) { return null; } - for (const [name, file] of Object.entries(waczFilesToTry)) { + const foundMap = new Map(); + + for (const name of waczFilesToTry) { + const file = this.waczfiles[name]; + if (!file) { + continue; + } + if (file.fileType !== WACZ_LEAF) { continue; } @@ -1239,16 +1257,17 @@ export class MultiWACZ return await handleAuthNeeded(e, this.config); } } - async queryPages( - urlPrefix: string, - limit = 25, + search: string, + page = 1, + pageSize = 25, // eslint-disable-next-line @typescript-eslint/no-explicit-any ): Promise<Record<string, any>[]> { const params = new URLSearchParams(); - params.set("urlPrefix", urlPrefix); - params.set("pageSize", limit + ""); + params.set("search", search); + params.set("page", page + ""); + params.set("pageSize", pageSize + ""); const res = await fetch(this.pagesQuery + "?" + params.toString(), { headers: this.sourceLoader?.headers, }); @@ -1278,7 +1297,46 @@ export class MultiWACZ return pages; } - async getWACZFilesForPagesQuery(requestUrl: string) { + async getWACZFilesToTry(request: ArchiveRequest, waczname: string | null) { + let names : string[] = []; + + // always try WACZ files with no pages + if (this.alwaysLoadNoPages.length) { + names = [...this.alwaysLoadNoPages]; + } + + // if top-level doc, and has page query, query for which WACZ files should be tried + if (this.pagesQuery && ( + request.destination === "document" || + request.destination === "iframe" + )) { + const res = await this.getWACZFilesForPagesQuery(request.url); + if (res) { + names = [...names, ...res]; + return names; + } + } + + // if already has a WACZ file, try others from same crawl + if (waczname) { + const file = this.waczfiles[waczname]; + if (file?.crawlId) { + const res = this.alwaysLoadByCrawl.get(file.crawlId); + if (res) { + names = [...names, ...res]; + } + } + } + + // finally if 3 or fewer WACZ files, just try 
all of them + if (!names.length && Object.keys(this.waczfiles).length <= 3) { + names = Object.keys(this.waczfiles); + } + + return names; + } + + async getWACZFilesForPagesQuery(requestUrl: string) : Promise<string[] | null> { const params = new URLSearchParams(); const url = new URL(requestUrl); url.search = ""; @@ -1295,13 +1353,9 @@ export class MultiWACZ if (!json) { return null; } - const selectFiles: Record<string, WACZFile> = {}; - json.items.forEach((x: { filename: string }) => { - const file = this.waczfiles[x.filename]; - if (file) { - selectFiles[x.filename] = file; - } - }); + const items: {filename: string}[] = json.items; + const selectFiles = items.map((x: {filename: string}) => x.filename); + return selectFiles; } diff --git a/src/wacz/waczfile.ts b/src/wacz/waczfile.ts index c0058c1..ec754ee 100644 --- a/src/wacz/waczfile.ts +++ b/src/wacz/waczfile.ts @@ -35,6 +35,7 @@ export type WACZFileInitOptions = { path?: string; parent?: WACZLoadSource | null; fileType?: WACZType; + crawlId?: string; indexType?: IndexType; // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -47,13 +48,14 @@ export type WACZFileInitOptions = { export type WACZFileOptions = WACZFileInitOptions & { waczname: string; hash: string; -}; +} // ========================================================================== export class WACZFile implements WACZLoadSource { waczname?: string; hash?: string; path?: string; + crawlId?: string; parent: WACZLoadSource | null; fileType: WACZType; indexType: IndexType; @@ -74,6 +76,7 @@ export class WACZFile implements WACZLoadSource { indexType = INDEX_NOT_LOADED, nonSurt = false, loader = null, + crawlId, }: WACZFileInitOptions) { this.waczname = waczname; this.hash = hash; @@ -85,6 +88,7 @@ export class WACZFile implements WACZLoadSource { this.indexType = indexType; this.fileType = fileType; this.nonSurt = nonSurt; + this.crawlId = crawlId; } markAsMultiWACZ() { @@ -143,6 +147,7 @@ export class WACZFile implements WACZLoadSource { waczname: 
this.waczname, hash: this.hash, path: this.path, + crawlId: this.crawlId, entries: this.entries, indexType: this.indexType, nonSurt: this.nonSurt,