Skip to content

Commit

Permalink
[Chore] Reorganize engine.ts (#536)
Browse files Browse the repository at this point in the history
This PR reorganizes `engine.ts` to keep it better organized.
- We move `getToolCallFromOutputMessage()` from `engine.ts` to
`support.ts` since it does not depend on any engine-related fields and
is stateless
- We move `asyncLoadTokenizer()` from `engine.ts` to `cache_util.ts` for
the same reason
- For `engine.ts`, we break down the functions into sections
  • Loading branch information
CharlieFRuan committed Aug 10, 2024
1 parent 3345417 commit d21f00e
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 194 deletions.
50 changes: 48 additions & 2 deletions src/cache_util.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import * as tvmjs from "tvmjs";
import { AppConfig, ModelRecord, prebuiltAppConfig } from "./config";
import {
AppConfig,
ChatConfig,
ModelRecord,
prebuiltAppConfig,
} from "./config";
import { cleanModelUrl } from "./support";
import { ModelNotFoundError } from "./error";
import { ModelNotFoundError, UnsupportedTokenizerFilesError } from "./error";
import { Tokenizer } from "@mlc-ai/web-tokenizers";

function findModelRecord(modelId: string, appConfig?: AppConfig): ModelRecord {
const matchedItem = appConfig?.model_list.find(
Expand Down Expand Up @@ -101,3 +107,43 @@ export async function deleteModelWasmInCache(
}
await wasmCache.deleteInCache(modelRecord.model_lib);
}

/**
 * Load the tokenizer for a model, fetching tokenizer files through the model
 * artifact cache (IndexedDB or the Cache API, per `appConfig.useIndexedDBCache`).
 *
 * Prefers `tokenizer.json` (Huggingface tokenizer); falls back to
 * `tokenizer.model` (SentencePiece) with a warning, since the latter may miss
 * mappings from `added_tokens.json` / `tokenizer_config.json`.
 *
 * @param baseUrl The link to which we can find tokenizer files, usually is a `ModelRecord.model`.
 * @param config A ChatConfig, usually loaded from `mlc-chat-config.json` in `baseUrl`.
 * @param appConfig An AppConfig, usually `webllm.prebuiltAppConfig` if not defined by user.
 * @param logger Logging function, console.log by default.
 * @returns The instantiated `Tokenizer`.
 * @throws {UnsupportedTokenizerFilesError} If `config.tokenizer_files` contains
 *   neither `tokenizer.json` nor `tokenizer.model`.
 */
export async function asyncLoadTokenizer(
  baseUrl: string,
  config: ChatConfig,
  appConfig: AppConfig,
  logger: (msg: string) => void = console.log,
): Promise<Tokenizer> {
  let modelCache: tvmjs.ArtifactCacheTemplate;
  if (appConfig.useIndexedDBCache) {
    modelCache = new tvmjs.ArtifactIndexedDBCache("webllm/model");
  } else {
    modelCache = new tvmjs.ArtifactCache("webllm/model");
  }

  // Resolve `fileName` against `baseUrl` and fetch its bytes through the cache.
  const fetchTokenizerFile = async (fileName: string): Promise<ArrayBuffer> => {
    const url = new URL(fileName, baseUrl).href;
    return await modelCache.fetchWithCache(url, "arraybuffer");
  };

  if (config.tokenizer_files.includes("tokenizer.json")) {
    return Tokenizer.fromJSON(await fetchTokenizerFile("tokenizer.json"));
  } else if (config.tokenizer_files.includes("tokenizer.model")) {
    logger(
      "Using `tokenizer.model` since we cannot locate `tokenizer.json`.\n" +
        "It is recommended to use `tokenizer.json` to ensure all token mappings are included, " +
        "since currently, files like `added_tokens.json`, `tokenizer_config.json` are ignored.\n" +
        "Consider converting `tokenizer.model` to `tokenizer.json` by compiling the model " +
        "with MLC again, or see if MLC's huggingface provides this file.",
    );
    return Tokenizer.fromSentencePiece(await fetchTokenizerFile("tokenizer.model"));
  }
  throw new UnsupportedTokenizerFilesError(config.tokenizer_files);
}
Loading

0 comments on commit d21f00e

Please sign in to comment.