diff --git a/package.json b/package.json index fa6e8a19..7e136459 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@webrecorder/wabac", - "version": "2.20.8", + "version": "2.21.0", "main": "index.js", "type": "module", "exports": { @@ -19,7 +19,7 @@ "@peculiar/asn1-schema": "^2.3.3", "@peculiar/x509": "^1.9.2", "@types/js-levenshtein": "^1.1.3", - "@webrecorder/wombat": "^3.8.7", + "@webrecorder/wombat": "^3.8.8", "acorn": "^8.10.0", "auto-js-ipfs": "^2.1.1", "base64-js": "^1.5.1", diff --git a/src/api.ts b/src/api.ts index 4bcd718b..511d3255 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,6 +1,7 @@ import { Path } from "path-parser"; import { getCollData } from "./utils"; import { type SWCollections } from "./swmain"; +import { MultiWACZ } from "./wacz/multiwacz"; // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -126,6 +127,9 @@ class API { data.pages = await coll.store.getAllPages(); data.lists = await coll.store.db.getAll("pageLists"); data.curatedPages = await coll.store.db.getAll("curatedPages"); + if (coll.store instanceof MultiWACZ) { + data.canQueryPages = !!coll.store.pagesQueryUrl; + } } else { data.pages = []; data.lists = []; @@ -293,8 +297,28 @@ class API { if (!coll) { return { error: "collection_not_found" }; } + let total = undefined; + if (coll.store instanceof MultiWACZ) { + // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. + const search = params._query.get("search"); + // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. + const page = Number(params._query.get("page")) || 1; + // @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query']. + const pageSize = Number(params._query.get("pageSize")) || 25; + if (search || page > 1) { + const { pages, total } = await coll.store.queryPages( + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + search, + page, + pageSize, + ); + return { pages, total }; + } else { + total = coll.store.totalPages; + } + } const pages = await coll.store.getAllPages(); - return { pages }; + return { pages, total }; } case "textIndex": { diff --git a/src/types.ts b/src/types.ts index 5afd3c31..612cd833 100644 --- a/src/types.ts +++ b/src/types.ts @@ -57,6 +57,14 @@ export type PageEntry = { timestamp?: string; + mime?: string; + depth?: number; + status?: number; + favIconUrl?: string; + wacz?: string; + waczhash?: string; + isSeed?: boolean; + pos?: number; // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any diff --git a/src/wacz/multiwacz.ts b/src/wacz/multiwacz.ts index 27a7f34a..cbe3a764 100644 --- a/src/wacz/multiwacz.ts +++ b/src/wacz/multiwacz.ts @@ -38,7 +38,11 @@ import { import { type ArchiveResponse } from "../response"; import { type ArchiveRequest } from "../request"; import { type LoadWACZEntry } from "./ziprangereader"; -import { type RemoteResourceEntry, type WACZCollConfig } from "../types"; +import { + type PageEntry, + type RemoteResourceEntry, + type WACZCollConfig, +} from "../types"; const MAX_BLOCKS = 3; @@ -54,6 +58,11 @@ export type IDXLine = { loaded: boolean; }; +export type PreloadResources = { + name: string; + crawlId: string; +}; + interface MDBType extends ADBType { ziplines: { key: [string, string]; @@ -89,6 +98,13 @@ export class MultiWACZ // eslint-disable-next-line @typescript-eslint/no-explicit-any fuzzyUrlRules: { match: RegExp; replace: any }[]; + pagesQueryUrl = ""; + + totalPages?: number = undefined; + + preloadResources: string[] = []; + seedPageWACZs: Map> = new Map>(); + constructor( config: WACZCollConfig, sourceLoader: BaseLoader, @@ -218,9 +234,10 @@ export class MultiWACZ } addWACZFile(file: WACZFileOptions) { - this.waczfiles[file.waczname] = new WACZFile(file); + const waczfile = new WACZFile(file); + this.waczfiles[file.waczname] = waczfile; this.waczNameForHash[file.hash] = file.waczname; - return this.waczfiles[file.waczname]; + return waczfile; } override async init() { @@ -410,6 +427,10 @@ export class MultiWACZ throw new Error("unknown waczfile: " + waczname); } + if (!this.waczfiles[waczname].entries) { + await this.waczfiles[waczname].init(); + } + if (this.waczfiles[waczname].indexType) { return { indexType: this.waczfiles[waczname].indexType, isNew: false }; } @@ -904,59 +925,82 @@ export class MultiWACZ name, hash, path, + crawlId, parent, loader = null, - }: WACZFileInitOptions & { name: string }) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + }: WACZFileInitOptions & { name: string }): Promise> { const waczname = name || path || ""; hash = await this.computeFileHash(waczname, hash); - const file = this.addWACZFile({ waczname, hash, path, parent, loader }); - - // @ts-expect-error [TODO] - TS2532 - Object is possibly 'undefined'. - await file.init(); + const file = this.addWACZFile({ + waczname, + hash, + crawlId, + path, + parent, + loader, + }); + + if (!this.pagesQueryUrl) { + await file.init(); + } - // @ts-expect-error [TODO] - TS2532 - Object is possibly 'undefined'. await file.save(this.db, true); - // @ts-expect-error [TODO] - TS2345 - Argument of type 'WACZFile | undefined' is not assignable to parameter of type 'WACZFile'. - const importer = new WACZImporter(this, file, !parent); - - // [TODO] - // eslint-disable-next-line @typescript-eslint/no-unsafe-return - return await importer.load(); + if (!this.pagesQueryUrl) { + const importer = new WACZImporter(this, file, !parent); + // eslint-disable-next-line @typescript-eslint/no-unsafe-return + return await importer.load(); + } else { + return {}; + } } async loadWACZFiles( // [TODO] - // eslint-disable-next-line @typescript-eslint/no-explicit-any - json: Record, + + json: { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + resources: any; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + initialPages: any; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + preloadResources: any; + totalPages: number; + }, parent: WACZLoadSource = this, ) { - const promises: Promise[] = []; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const promises: Promise[] = []; const update = async (name: string, path: string) => { - // @ts-expect-error [TODO] - TS2532 - Object is possibly 'undefined'. - await this.waczfiles[name].init(path); - // @ts-expect-error [TODO] - TS2532 - Object is possibly 'undefined'. - await this.waczfiles[name].save(this.db, true); + const waczfile = this.waczfiles[name]; + if (!waczfile) { + return; + } + if (!this.pagesQueryUrl) { + waczfile.path = path; + } else { + await waczfile.init(path); + } + await waczfile.save(this.db, true); }; - // @ts-expect-error [TODO] - TS4111 - Property 'resources' comes from an index signature, so it must be accessed with ['resources']. const files = json.resources.map( - (res: { path: string; name: string; hash: string }) => { + (res: { path: string; name: string; hash: string; crawlId?: string }) => { const path = parent.getLoadPath(res.path); const name = parent.getName(res.name); const hash = res.hash; - return { name, hash, path }; + const crawlId = res.crawlId; + return { name, hash, path, crawlId }; }, ); - for (const { name, hash, path } of files) { + for (const { name, hash, path, crawlId } of files) { if (!this.waczfiles[name]) { - // [TODO] - // eslint-disable-next-line @typescript-eslint/no-unsafe-argument - promises.push(this.addNewWACZ({ name, hash, path, parent })); + promises.push(this.addNewWACZ({ name, hash, path, parent, crawlId })); } else if (this.waczfiles[name].path !== path) { // [TODO] // eslint-disable-next-line @typescript-eslint/no-unsafe-argument @@ -967,6 +1011,67 @@ export class MultiWACZ if (promises.length) { await Promise.allSettled(promises); } + + if (json.preloadResources) { + for (const { name } of json.preloadResources) { + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + this.preloadResources.push(name); + } + } + + if (json.initialPages) { + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + await this.addInitialPages(json.initialPages); + } + + if (!isNaN(json.totalPages)) { + this.totalPages = json.totalPages; + } + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + async addInitialPages(pagesImport: Record[]) { + const pages: PageEntry[] = []; + for (const { + id, + url, + title, + ts, + mime, + status, + depth, + favIconUrl, + filename, + isSeed, + crawl_id, + } of pagesImport) { + const file = this.waczfiles[filename]; + const waczhash = file ? file.hash : ""; + pages.push({ + id, + url, + title, + ts, + mime, + status, + depth, + favIconUrl, + wacz: filename, + waczhash, + isSeed, + }); + if (isSeed) { + const set: Set = + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + this.seedPageWACZs.get(crawl_id) || new Set(); + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + set.add(filename); + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + this.seedPageWACZs.set(crawl_id, set); + } + } + + return await this.addPages(pages); } async getTextIndex() { @@ -976,7 +1081,7 @@ export class MultiWACZ const keys = Object.keys(this.waczfiles); - if (!this.textIndex || !keys.length) { + if (this.pagesQueryUrl || !this.textIndex || !keys.length) { return new Response("", { headers }); } @@ -1078,9 +1183,23 @@ export class MultiWACZ } } + const waczFilesToTry: string[] = await this.getWACZFilesToTry( + request, + waczname, + ); + + if (!waczFilesToTry.length) { + return null; + } + const foundMap = new Map(); - for (const [name, file] of Object.entries(this.waczfiles)) { + for (const name of waczFilesToTry) { + const file = this.waczfiles[name]; + if (!file) { + continue; + } + if (file.fileType !== WACZ_LEAF) { continue; } @@ -1164,6 +1283,113 @@ export class MultiWACZ } } + async queryPages( + search = "", + page = 1, + pageSize = 25, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ): Promise<{ pages: Record[]; total: number }> { + const params = new URLSearchParams(); + if (search) { + params.set("search", search); + } + params.set("page", page + ""); + params.set("pageSize", pageSize + ""); + const res = await fetch(this.pagesQueryUrl + "?" + params.toString(), { + headers: this.sourceLoader?.headers, + }); + if (res.status !== 200) { + return { pages: [], total: 0 }; + } + const json = await res.json(); + if (!json) { + return { pages: [], total: 0 }; + } + + const total = json.total; + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const pages: Record[] = json.items.map((x: any) => { + x.wacz = x.filename; + const file = this.waczfiles[x.filename]; + if (file) { + x.waczhash = file.hash; + } + if (typeof x.ts === "string") { + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + x.ts = new Date(x.ts).getTime(); + } + // eslint-disable-next-line @typescript-eslint/no-unsafe-return + return x; + }); + + return { pages, total }; + } + + async getWACZFilesToTry(request: ArchiveRequest, waczname: string | null) { + let names: string[] = []; + + // always try WACZ files with no pages + if (this.preloadResources.length) { + names = [...this.preloadResources]; + } + + // if top-level doc, and has page query, query for which WACZ files should be tried + if ( + this.pagesQueryUrl && + (request.destination === "document" || request.destination === "iframe") + ) { + const res = await this.getWACZFilesForPagesQuery(request.url); + if (res) { + names = [...names, ...res]; + return names; + } + } + + // if already has a WACZ files, try others from same crawl + if (waczname) { + const file = this.waczfiles[waczname]; + if (file?.crawlId) { + const res = this.seedPageWACZs.get(file.crawlId); + if (res) { + names = [...names, ...res.values()]; + } + } + } + + // finally if 3 or less WACZ files, just try all of them + if (!names.length && Object.keys(this.waczfiles).length <= 3) { + names = Object.keys(this.waczfiles); + } + + return names; + } + + async getWACZFilesForPagesQuery( + requestUrl: string, + ): Promise { + const params = new URLSearchParams(); + const url = new URL(requestUrl); + url.search = ""; + url.hash = ""; + params.set("url", url.href); + params.set("pageSize", "10"); + const res = await fetch(this.pagesQueryUrl + "?" + params.toString(), { + headers: this.sourceLoader?.headers, + }); + if (res.status !== 200) { + return null; + } + const json = await res.json(); + if (!json) { + return null; + } + const items: { filename: string }[] = json.items; + const selectFiles = items.map((x: { filename: string }) => x.filename); + + return selectFiles; + } + async checkUpdates() { if (this.rootSourceType === "json") { await this.loadFromJSON(); @@ -1188,6 +1414,10 @@ export class MultiWACZ const data = await response.json(); + if (data.pagesQueryUrl) { + this.pagesQueryUrl = data.pagesQueryUrl; + } + switch (data.profile) { case "data-package": case "wacz-package": diff --git a/src/wacz/waczfile.ts b/src/wacz/waczfile.ts index 1038d294..99a2e064 100644 --- a/src/wacz/waczfile.ts +++ b/src/wacz/waczfile.ts @@ -35,6 +35,7 @@ export type WACZFileInitOptions = { path?: string; parent?: WACZLoadSource | null; fileType?: WACZType; + crawlId?: string; indexType?: IndexType; // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -54,10 +55,11 @@ export class WACZFile implements WACZLoadSource { waczname?: string; hash?: string; path?: string; + crawlId?: string; parent: WACZLoadSource | null; fileType: WACZType; indexType: IndexType; - // [TODO] + // eslint-disable-next-line @typescript-eslint/no-explicit-any entries: Record | null; nonSurt: boolean; @@ -74,6 +76,7 @@ export class WACZFile implements WACZLoadSource { indexType = INDEX_NOT_LOADED, nonSurt = false, loader = null, + crawlId, }: WACZFileInitOptions) { this.waczname = waczname; this.hash = hash; @@ -85,6 +88,7 @@ export class WACZFile implements WACZLoadSource { this.indexType = indexType; this.fileType = fileType; this.nonSurt = nonSurt; + this.crawlId = crawlId; } markAsMultiWACZ() { @@ -143,13 +147,13 @@ export class WACZFile implements WACZLoadSource { waczname: this.waczname, hash: this.hash, path: this.path, + crawlId: this.crawlId, entries: this.entries, indexType: this.indexType, nonSurt: this.nonSurt, }; } - // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any async save(db: any, always = false) { const zipreader = this.zipreader; diff --git a/src/wacz/waczimporter.ts b/src/wacz/waczimporter.ts index 19377dfe..20f6e063 100644 --- a/src/wacz/waczimporter.ts +++ b/src/wacz/waczimporter.ts @@ -149,7 +149,7 @@ export class WACZImporter { case "multi-wacz-package": // [TODO] - // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + // eslint-disable-next-line @typescript-eslint/no-unsafe-return return await this.loadMultiWACZPackage(root); default: @@ -159,9 +159,11 @@ export class WACZImporter { // [TODO] // eslint-disable-next-line @typescript-eslint/no-explicit-any - async loadMultiWACZPackage(root: Record) { + async loadMultiWACZPackage(root: any) { this.file.markAsMultiWACZ(); + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument await this.store.loadWACZFiles(root, this.file); + // eslint-disable-next-line @typescript-eslint/no-unsafe-return return root; } diff --git a/yarn.lock b/yarn.lock index 8434c9b1..9f10ec0c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -896,10 +896,10 @@ resolved "https://registry.yarnpkg.com/@webpack-cli/serve/-/serve-1.5.2.tgz#ea584b637ff63c5a477f6f21604b5a205b72c9ec" integrity sha512-vgJ5OLWadI8aKjDlOH3rb+dYyPd2GTZuQC/Tihjct6F9GpXGZINo3Y/IVuZVTM1eDQB+/AOsjPUWH/WySDaXvw== -"@webrecorder/wombat@^3.8.7": - version "3.8.7" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.7.tgz#51c7465c589e0020be064121127c7c10a38ec21f" - integrity sha512-bW5V7cBweTkTazOIN8oZZGwHLevsGNv1luY3t0RYdEZhs5BDpTmUHN33zEbrXDOiPUlY3N3I8+73VA+PuxihoQ== +"@webrecorder/wombat@^3.8.8": + version "3.8.8" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.8.tgz#aab4dd8eea6d6cb17bfefb7ee1802e7b45b11ed7" + integrity sha512-XkJOZAyHrdXNkAVoISQEh/NHzaBMekQZfWqes/k2vYkW6v9DmZ0wjP7Kf6MHCuajKX8uSH+caB2tv1kJgvnv3Q== dependencies: warcio "^2.4.0"