Skip to content

Commit

Permalink
support alwaysLoad param
Browse files Browse the repository at this point in the history
  • Loading branch information
ikreymer committed Feb 12, 2025
1 parent 480b6b4 commit 5dc6141
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 41 deletions.
6 changes: 4 additions & 2 deletions src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -301,10 +301,12 @@ class API {
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const q = params._query.get("q");
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const limit = Number(params._query.get("limit")) || 25;
const page = Number(params._query.get("page")) || 1;
// @ts-expect-error [TODO] - TS4111 - Property '_query' comes from an index signature, so it must be accessed with ['_query'].
const pageSize = Number(params._query.get("pageSize")) || 25;
if (q) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
const pages = await coll.store.queryPages(q, limit);
const pages = await coll.store.queryPages(q, page, pageSize);
return { pages };
}
}
Expand Down
130 changes: 92 additions & 38 deletions src/wacz/multiwacz.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ export type IDXLine = {
loaded: boolean;
};

/**
 * One entry of the `alwaysLoad` list found in a multi-WACZ JSON source.
 */
export type AlwaysLoadData = {
  /** Name of the WACZ file; later used as a key into the loaded WACZ files. */
  wacz: string;
  /** Crawl identifier used to group the WACZ file with others from the same crawl. */
  crawlId: string;
  /** When false, the WACZ file is recorded as one to try for every request. */
  hasPages: boolean;
};

interface MDBType extends ADBType {
ziplines: {
key: [string, string];
Expand Down Expand Up @@ -95,6 +101,9 @@ export class MultiWACZ

pagesQuery = "";

alwaysLoadNoPages: string[] = [];
alwaysLoadByCrawl: Map<string, string[]> = new Map<string, string[]>();

constructor(
config: WACZCollConfig,
sourceLoader: BaseLoader,
Expand Down Expand Up @@ -915,6 +924,7 @@ export class MultiWACZ
name,
hash,
path,
crawlId,
parent,
loader = null,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand All @@ -923,7 +933,7 @@ export class MultiWACZ

hash = await this.computeFileHash(waczname, hash);

const file = this.addWACZFile({ waczname, hash, path, parent, loader });
const file = this.addWACZFile({ waczname, hash, crawlId, path, parent, loader });

if (!this.pagesQuery) {
await file.init();
Expand Down Expand Up @@ -964,17 +974,18 @@ export class MultiWACZ

// @ts-expect-error [TODO] - TS4111 - Property 'resources' comes from an index signature, so it must be accessed with ['resources'].
const files = json.resources.map(
(res: { path: string; name: string; hash: string }) => {
(res: { path: string; name: string; hash: string, crawlId?: string }) => {
const path = parent.getLoadPath(res.path);
const name = parent.getName(res.name).split("/")[1];
const name = parent.getName(res.name);
const hash = res.hash;
return { name, hash, path };
const crawlId = res.crawlId;
return { name, hash, path, crawlId };
},
);

for (const { name, hash, path } of files) {
for (const { name, hash, path, crawlId } of files) {
if (!this.waczfiles[name]) {
promises.push(this.addNewWACZ({ name, hash, path, parent }));
promises.push(this.addNewWACZ({ name, hash, path, parent, crawlId }));
} else if (this.waczfiles[name].path !== path) {
// [TODO]
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
Expand All @@ -986,12 +997,28 @@ export class MultiWACZ
await Promise.allSettled(promises);
}

if (json["alwaysLoad"]) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
this.initAlwaysLoadData(json["alwaysLoad"]);
}

if (json["pages"]) {
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
await this.addInitialPages(json["pages"]);
}
}

/**
 * Record the `alwaysLoad` entries so they can be consulted when choosing
 * which WACZ files to try for a request.
 *
 * - Entries with `hasPages === false` are added to `alwaysLoadNoPages`.
 * - Entries with a non-empty `crawlId` are grouped in `alwaysLoadByCrawl`,
 *   keyed by that crawl id.
 *
 * Names already recorded are skipped, so calling this more than once with
 * overlapping data does not create duplicates.
 *
 * @param alwaysLoad entries to record
 */
initAlwaysLoadData(alwaysLoad: AlwaysLoadData[]) {
  for (const { wacz, crawlId, hasPages } of alwaysLoad) {
    if (!hasPages && !this.alwaysLoadNoPages.includes(wacz)) {
      this.alwaysLoadNoPages.push(wacz);
    }

    if (!crawlId) {
      continue;
    }

    // Append to the crawl's list instead of replacing it: several WACZ files
    // may share one crawlId, and overwriting would keep only the last one.
    const waczsForCrawl = this.alwaysLoadByCrawl.get(crawlId);
    if (!waczsForCrawl) {
      this.alwaysLoadByCrawl.set(crawlId, [wacz]);
    } else if (!waczsForCrawl.includes(wacz)) {
      waczsForCrawl.push(wacz);
    }
  }
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
async addInitialPages(pagesImport: Record<string, any>[]) {
const pages: PageEntry[] = [];
Expand Down Expand Up @@ -1134,29 +1161,20 @@ export class MultiWACZ
}
}

const foundMap = new Map();
const waczFilesToTry: string[] = await this.getWACZFilesToTry(request, waczname);

let waczFilesToTry: Record<string, WACZFile> | null = null;

if (this.pagesQuery) {
if (
request.destination === "document" ||
request.destination === "iframe"
) {
const res = await this.getWACZFilesForPagesQuery(request.url);
if (res) {
waczFilesToTry = res;
}
}
} else {
waczFilesToTry = this.waczfiles;
}

if (!waczFilesToTry) {
if (!waczFilesToTry.length) {
return null;
}

for (const [name, file] of Object.entries(waczFilesToTry)) {
const foundMap = new Map();

for (const name of waczFilesToTry) {
const file = this.waczfiles[name];
if (!file) {
continue;
}

if (file.fileType !== WACZ_LEAF) {
continue;
}
Expand Down Expand Up @@ -1239,16 +1257,17 @@ export class MultiWACZ
return await handleAuthNeeded(e, this.config);
}
}


async queryPages(
urlPrefix: string,
limit = 25,
search: string,
page = 1,
pageSize = 25,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
): Promise<Record<string, any>[]> {
const params = new URLSearchParams();
params.set("urlPrefix", urlPrefix);
params.set("pageSize", limit + "");
params.set("search", search);
params.set("page", page + "");
params.set("pageSize", pageSize + "");
const res = await fetch(this.pagesQuery + "?" + params.toString(), {
headers: this.sourceLoader?.headers,
});
Expand Down Expand Up @@ -1278,7 +1297,46 @@ export class MultiWACZ
return pages;
}

async getWACZFilesForPagesQuery(requestUrl: string) {
/**
 * Build the ordered list of WACZ file names to try for a request.
 *
 * Candidates are gathered in this order, and each name appears at most once:
 *  1. every name in `alwaysLoadNoPages`;
 *  2. if `pagesQuery` is set and the request destination is `document` or
 *     `iframe`, the names returned by `getWACZFilesForPagesQuery()`; when
 *     that query yields a (non-null) result the list is returned right away;
 *  3. if `waczname` refers to a loaded file with a `crawlId`, the names
 *     registered for that crawl in `alwaysLoadByCrawl`;
 *  4. if nothing was collected and 3 or fewer WACZ files are loaded, all of
 *     the loaded file names.
 *
 * @param request the request being resolved
 * @param waczname name of the WACZ file already associated with the request, if any
 * @returns names of WACZ files to try; may be empty
 */
async getWACZFilesToTry(
  request: ArchiveRequest,
  waczname: string | null,
): Promise<string[]> {
  // A Set keeps insertion order while dropping repeated names, so the same
  // WACZ file is never tried twice when it is listed by more than one source.
  // always try WACZ files with no pages
  const names = new Set<string>(this.alwaysLoadNoPages);

  // if top-level doc, and has a pages query, query for which WACZ files should be tried
  const isTopLevelDoc =
    request.destination === "document" || request.destination === "iframe";

  if (this.pagesQuery && isTopLevelDoc) {
    const res = await this.getWACZFilesForPagesQuery(request.url);
    if (res) {
      for (const name of res) {
        names.add(name);
      }
      return Array.from(names);
    }
  }

  // if a WACZ file is already known, also try the others from the same crawl
  if (waczname) {
    const crawlId = this.waczfiles[waczname]?.crawlId;
    const sameCrawl = crawlId
      ? this.alwaysLoadByCrawl.get(crawlId)
      : undefined;
    if (sameCrawl) {
      for (const name of sameCrawl) {
        names.add(name);
      }
    }
  }

  // finally, if there are 3 or fewer WACZ files, just try all of them
  if (!names.size) {
    const allNames = Object.keys(this.waczfiles);
    if (allNames.length <= 3) {
      return allNames;
    }
  }

  return Array.from(names);
}

async getWACZFilesForPagesQuery(requestUrl: string) : Promise<string[] | null> {
const params = new URLSearchParams();
const url = new URL(requestUrl);
url.search = "";
Expand All @@ -1295,13 +1353,9 @@ export class MultiWACZ
if (!json) {
return null;
}
const selectFiles: Record<string, WACZFile> = {};
json.items.forEach((x: { filename: string }) => {
const file = this.waczfiles[x.filename];
if (file) {
selectFiles[x.filename] = file;
}
});
const items: {filename: string}[] = json.items;
const selectFiles = items.map((x: {filename: string}) => x.filename);

return selectFiles;
}

Expand Down
7 changes: 6 additions & 1 deletion src/wacz/waczfile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export type WACZFileInitOptions = {
path?: string;
parent?: WACZLoadSource | null;
fileType?: WACZType;
crawlId?: string;
indexType?: IndexType;
// [TODO]
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand All @@ -47,13 +48,14 @@ export type WACZFileInitOptions = {
export type WACZFileOptions = WACZFileInitOptions & {
waczname: string;
hash: string;
};
}

// ==========================================================================
export class WACZFile implements WACZLoadSource {
waczname?: string;
hash?: string;
path?: string;
crawlId?: string;
parent: WACZLoadSource | null;
fileType: WACZType;
indexType: IndexType;
Expand All @@ -74,6 +76,7 @@ export class WACZFile implements WACZLoadSource {
indexType = INDEX_NOT_LOADED,
nonSurt = false,
loader = null,
crawlId,
}: WACZFileInitOptions) {
this.waczname = waczname;
this.hash = hash;
Expand All @@ -85,6 +88,7 @@ export class WACZFile implements WACZLoadSource {
this.indexType = indexType;
this.fileType = fileType;
this.nonSurt = nonSurt;
this.crawlId = crawlId;
}

markAsMultiWACZ() {
Expand Down Expand Up @@ -143,6 +147,7 @@ export class WACZFile implements WACZLoadSource {
waczname: this.waczname,
hash: this.hash,
path: this.path,
crawlId: this.crawlId,
entries: this.entries,
indexType: this.indexType,
nonSurt: this.nonSurt,
Expand Down

0 comments on commit 5dc6141

Please sign in to comment.