diff --git a/package-lock.json b/package-lock.json
index e33d27da..70a66922 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,6 +8,9 @@
       "name": "@mlc-ai/web-llm",
       "version": "0.2.38",
       "license": "Apache-2.0",
+      "dependencies": {
+        "loglevel": "^1.9.1"
+      },
       "devDependencies": {
         "@mlc-ai/web-tokenizers": "^0.1.3",
         "@rollup/plugin-commonjs": "^20.0.0",
@@ -6116,6 +6119,18 @@
       "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
       "dev": true
     },
+    "node_modules/loglevel": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/loglevel/-/loglevel-1.9.1.tgz",
+      "integrity": "sha512-hP3I3kCrDIMuRwAwHltphhDM1r8i55H33GgqjXbrisuJhF4kRhW1dNuxsRklp4bXl8DSdLaNLuiL4A/LWRfxvg==",
+      "engines": {
+        "node": ">= 0.6.0"
+      },
+      "funding": {
+        "type": "tidelift",
+        "url": "https://tidelift.com/funding/github/npm/loglevel"
+      }
+    },
     "node_modules/lru-cache": {
       "version": "6.0.0",
       "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
diff --git a/package.json b/package.json
index df4a331e..1184654c 100644
--- a/package.json
+++ b/package.json
@@ -51,5 +51,8 @@
     "tslib": "^2.3.1",
     "tvmjs": "file:./tvm_home/web",
     "typescript": "^4.9.5"
+  },
+  "dependencies": {
+    "loglevel": "^1.9.1"
   }
 }
diff --git a/src/config.ts b/src/config.ts
index 0a5b3fd1..b758baf7 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -1,7 +1,7 @@
 /* eslint-disable @typescript-eslint/no-non-null-assertion */
-
+import log from "loglevel";
 import { ResponseFormat } from "./openai_api_protocols";
-import { LogitProcessor, InitProgressCallback } from "./types";
+import { LogitProcessor, InitProgressCallback, LogLevel } from "./types";
 
 /**
  * Conversation template config
@@ -26,6 +26,8 @@ export enum Role {
   assistant = "assistant",
 }
 
+export const DefaultLogLevel: LogLevel = "WARN";
+
 /**
  * Place holders that can be used in role templates.
 * For example, a role template of
@@ -91,6 +93,7 @@ export interface MLCEngineConfig {
   appConfig?: AppConfig;
   initProgressCallback?: InitProgressCallback;
   logitProcessorRegistry?: Map<string, LogitProcessor>;
+  logLevel: LogLevel;
 }
 
 /**
@@ -167,16 +170,14 @@ export function postInitAndCheckGenerationConfigValues(
     !_hasValue(config.presence_penalty)
   ) {
     config.presence_penalty = 0.0;
-    console.log(
-      "Only frequency_penalty is set; we default presence_penaty to 0.",
-    );
+    log.warn("Only frequency_penalty is set; we default presence_penaty to 0.");
   }
   if (
     _hasValue(config.presence_penalty) &&
     !_hasValue(config.frequency_penalty)
   ) {
     config.frequency_penalty = 0.0;
-    console.log(
+    log.warn(
       "Only presence_penalty is set; we default frequency_penalty to 0.",
     );
   }
diff --git a/src/engine.ts b/src/engine.ts
index 3c76a721..65823ea7 100644
--- a/src/engine.ts
+++ b/src/engine.ts
@@ -1,4 +1,5 @@
 import * as tvmjs from "tvmjs";
+import log from "loglevel";
 import { Tokenizer } from "@mlc-ai/web-tokenizers";
 import * as API from "./openai_api_protocols/apis";
 import {
@@ -10,6 +11,7 @@ import {
   postInitAndCheckGenerationConfigValues,
   Role,
   MLCEngineConfig,
+  DefaultLogLevel,
 } from "./config";
 import { LLMChatPipeline } from "./llm_chat";
 import {
@@ -30,6 +32,7 @@ import {
   MLCEngineInterface,
   GenerateProgressCallback,
   LogitProcessor,
+  LogLevel,
 } from "./types";
 import {
   Conversation,
@@ -61,6 +64,7 @@ export async function CreateMLCEngine(
   engineConfig?: MLCEngineConfig,
 ): Promise<MLCEngine> {
   const engine = new MLCEngine();
+  engine.setLogLevel(engineConfig?.logLevel || DefaultLogLevel);
   engine.setInitProgressCallback(engineConfig?.initProgressCallback);
   engine.setLogitProcessorRegistry(engineConfig?.logitProcessorRegistry);
   await engine.reload(modelId, engineConfig?.chatOpts, engineConfig?.appConfig);
@@ -76,7 +80,7 @@ export class MLCEngine implements MLCEngineInterface {
   public chat: API.Chat;
 
   private currentModelId?: string = undefined; // Model current loaded, undefined if nothing is loaded
-  private logger: (msg: string) => void = console.log;
+  private logger: (msg: string) => void = log.info;
   private logitProcessorRegistry?: Map<string, LogitProcessor>;
   private logitProcessor?: LogitProcessor;
   private pipeline?: LLMChatPipeline;
@@ -238,7 +242,7 @@ export class MLCEngine implements MLCEngineInterface {
     let deviceLostInReload = false;
     gpuDetectOutput.device.lost.then((info: any) => {
       if (this.deviceLostIsError) {
-        console.error(
+        log.error(
           `Device was lost during reload. This can happen due to insufficient memory or other GPU constraints. Detailed error: ${info}. Please try to reload WebLLM with a less resource-intensive model.`,
         );
         this.unload();
@@ -291,7 +295,7 @@ export class MLCEngine implements MLCEngineInterface {
     streamInterval = 1,
     genConfig?: GenerationConfig,
   ): Promise<string> {
-    console.log(
+    log.warn(
       "WARNING: `generate()` will soon be deprecated. " +
         "Please use `engine.chat.completions.create()` instead. " +
         "For multi-round chatting, see `examples/multi-round-chat` on how to use " +
@@ -579,7 +583,7 @@ export class MLCEngine implements MLCEngineInterface {
       gpuDetectOutput.device.limits.maxStorageBufferBindingSize;
     const defaultMaxStorageBufferBindingSize = 1 << 30; // 1GB
     if (maxStorageBufferBindingSize < defaultMaxStorageBufferBindingSize) {
-      console.log(
+      log.warn(
         `WARNING: the current maxStorageBufferBindingSize ` +
           `(${computeMB(maxStorageBufferBindingSize)}) ` +
           `may only work for a limited number of models, e.g.: \n` +
@@ -636,6 +640,15 @@ export class MLCEngine implements MLCEngineInterface {
     return this.getPipeline().getMessage();
   }
 
+  /**
+   * Set MLCEngine logging output level
+   *
+   * @param logLevel The new log level
+   */
+  setLogLevel(logLevel: LogLevel) {
+    log.setLevel(logLevel);
+  }
+
   /**
    * Get a new Conversation object based on the chat completion request.
    *
@@ -792,7 +805,7 @@ export class MLCEngine implements MLCEngineInterface {
       this.resetChat();
       this.getPipeline().setConversation(newConv);
     } else {
-      console.log("Multiround chatting, reuse KVCache.");
+      log.info("Multiround chatting, reuse KVCache.");
     }
 
     // 2. Treat the last message as the usual input
diff --git a/src/extension_service_worker.ts b/src/extension_service_worker.ts
index 3e6aae71..1c575253 100644
--- a/src/extension_service_worker.ts
+++ b/src/extension_service_worker.ts
@@ -1,7 +1,8 @@
 import * as tvmjs from "tvmjs";
+import log from "loglevel";
 import { AppConfig, ChatOptions, MLCEngineConfig } from "./config";
 import { ReloadParams, WorkerRequest } from "./message";
-import { MLCEngineInterface } from "./types";
+import { LogLevel, MLCEngineInterface } from "./types";
 import {
   ChatWorker,
   MLCEngineWorkerHandler,
@@ -88,7 +89,7 @@ export class ServiceWorkerMLCEngineHandler extends MLCEngineWorkerHandler {
       areChatOptionsEqual(this.chatOpts, params.chatOpts) &&
       areAppConfigsEqual(this.appConfig, params.appConfig)
     ) {
-      console.log("Already loaded the model. Skip loading");
+      log.info("Already loaded the model. Skip loading");
       const gpuDetectOutput = await tvmjs.detectGPUDevice();
       if (gpuDetectOutput == undefined) {
         throw Error("Cannot find WebGPU in the environment");
       }
@@ -140,6 +141,9 @@ export async function CreateServiceWorkerMLCEngine(
   keepAliveMs = 10000,
 ): Promise<ServiceWorkerMLCEngine> {
   const serviceWorkerMLCEngine = new ServiceWorkerMLCEngine(keepAliveMs);
+  if (engineConfig?.logLevel) {
+    serviceWorkerMLCEngine.setLogLevel(engineConfig.logLevel);
+  }
   serviceWorkerMLCEngine.setInitProgressCallback(
     engineConfig?.initProgressCallback,
   );
diff --git a/src/index.ts b/src/index.ts
index 9f0387bd..0b6672f1 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -14,6 +14,7 @@ export {
   InitProgressReport,
   MLCEngineInterface,
   LogitProcessor,
+  LogLevel,
 } from "./types";
 
 export { MLCEngine, CreateMLCEngine } from "./engine";
diff --git a/src/llm_chat.ts b/src/llm_chat.ts
index 945e1b5b..129d7e92 100644
--- a/src/llm_chat.ts
+++ b/src/llm_chat.ts
@@ -1,6 +1,7 @@
 /* eslint-disable @typescript-eslint/no-non-null-assertion */
 /* eslint-disable no-prototype-builtins */
 import * as tvmjs from "tvmjs";
+import log from "loglevel";
 import { Tokenizer } from "@mlc-ai/web-tokenizers";
 import { ChatConfig, GenerationConfig, Role } from "./config";
 import { getConversation, Conversation } from "./conversation";
@@ -72,9 +73,6 @@ export class LLMChatPipeline {
   private curRoundDecodingTotalTokens = 0;
   private curRoundPrefillTotalTokens = 0;
 
-  // logger
-  private logger = console.log;
-
   // LogitProcessor
   private logitProcessor?: LogitProcessor = undefined;
 
@@ -154,7 +152,7 @@ export class LLMChatPipeline {
 
     // 4. Read in compilation configurations from metadata
     this.prefillChunkSize = metadata.prefill_chunk_size;
-    this.logger("Using prefillChunkSize: ", this.prefillChunkSize);
+    log.info("Using prefillChunkSize: ", this.prefillChunkSize);
     if (this.prefillChunkSize <= 0) {
       throw Error("Prefill chunk size needs to be positive.");
     }
@@ -164,14 +162,14 @@ export class LLMChatPipeline {
       metadata.sliding_window_size != -1
     ) {
       this.slidingWindowSize = metadata.sliding_window_size;
-      this.logger("Using slidingWindowSize: ", this.slidingWindowSize);
+      log.info("Using slidingWindowSize: ", this.slidingWindowSize);
       // Parse attention sink size
       if (
         metadata.hasOwnProperty("attention_sink_size") &&
         metadata.attention_sink_size >= 0
       ) {
         this.attentionSinkSize = metadata.attention_sink_size;
-        this.logger("Using attentionSinkSize: ", this.attentionSinkSize);
+        log.info("Using attentionSinkSize: ", this.attentionSinkSize);
       } else {
         throw Error(
           "Need to specify non-negative attention_sink_size if using sliding window. " +
@@ -184,7 +182,7 @@ export class LLMChatPipeline {
       metadata.context_window_size != -1
     ) {
       this.maxWindowLength = metadata.context_window_size;
-      this.logger("Using maxWindowLength: ", this.maxWindowLength);
+      log.info("Using maxWindowLength: ", this.maxWindowLength);
     } else {
       throw Error(
         "Need to specify either sliding window size or max window size.",
       );
     }
@@ -905,7 +903,7 @@ export class LLMChatPipeline {
     }
 
     // need shift window and re-encode
-    this.logger("need shift window");
+    log.info("need shift window");
     this.filledKVCacheLength = 0;
     this.resetKVCache();
 
@@ -1056,8 +1054,8 @@ export class LLMChatPipeline {
       `decoding-time=${((decodingEnd - decodingStart) / 1000).toFixed(4)} sec`;
 
     // simply log tokens for eyeballing.
-    console.log("Logits:");
-    console.log(logitsOnCPU.toArray());
-    console.log(msg);
+    log.info("Logits:");
+    log.info(logitsOnCPU.toArray());
+    log.info(msg);
   }
 }
diff --git a/src/service_worker.ts b/src/service_worker.ts
index 0cd2707d..eba3e564 100644
--- a/src/service_worker.ts
+++ b/src/service_worker.ts
@@ -1,7 +1,8 @@
 import * as tvmjs from "tvmjs";
+import log from "loglevel";
 import { AppConfig, ChatOptions, MLCEngineConfig } from "./config";
 import { ReloadParams, WorkerRequest, WorkerResponse } from "./message";
-import { MLCEngineInterface, InitProgressReport } from "./types";
+import { MLCEngineInterface, InitProgressReport, LogLevel } from "./types";
 import {
   MLCEngineWorkerHandler,
   WebWorkerMLCEngine,
@@ -90,7 +91,7 @@ export class ServiceWorkerMLCEngineHandler extends MLCEngineWorkerHandler {
     onError?: () => void,
   ): void {
     const msg = event.data as WorkerRequest;
-    console.debug(
+    log.trace(
      `ServiceWorker message: [${msg.kind}] ${JSON.stringify(msg.content)}`,
     );
 
@@ -114,7 +115,7 @@ export class ServiceWorkerMLCEngineHandler extends MLCEngineWorkerHandler {
       areChatOptionsEqual(this.chatOpts, params.chatOpts) &&
       areAppConfigsEqual(this.appConfig, params.appConfig)
     ) {
-      console.log("Already loaded the model. Skip loading");
+      log.info("Already loaded the model. Skip loading");
       const gpuDetectOutput = await tvmjs.detectGPUDevice();
       if (gpuDetectOutput == undefined) {
         throw Error("Cannot find WebGPU in the environment");
       }
@@ -206,6 +207,9 @@ export async function CreateServiceWorkerMLCEngine(
     );
   }
   const serviceWorkerMLCEngine = new ServiceWorkerMLCEngine(serviceWorker);
+  if (engineConfig?.logLevel) {
+    serviceWorkerMLCEngine.setLogLevel(engineConfig.logLevel);
+  }
   serviceWorkerMLCEngine.setInitProgressCallback(
     engineConfig?.initProgressCallback,
   );
@@ -234,7 +238,7 @@ export class ServiceWorkerMLCEngine extends WebWorkerMLCEngine {
       "message",
       (event: MessageEvent) => {
         const msg = event.data;
-        console.debug(
+        log.trace(
           `MLC client message: [${msg.kind}] ${JSON.stringify(msg.content)}`,
         );
         try {
@@ -246,7 +250,7 @@
         } catch (err: any) {
           // This is expected to throw if user has multiple windows open
           if (!err.message.startsWith("return from a unknown uuid")) {
-            console.error("CreateWebServiceWorkerMLCEngine.onmessage", err);
+            log.error("CreateWebServiceWorkerMLCEngine.onmessage", err);
           }
         }
       },
@@ -255,7 +259,7 @@
     setInterval(() => {
       this.worker.postMessage({ kind: "keepAlive", uuid: crypto.randomUUID() });
       this.missedHeatbeat += 1;
-      console.debug("missedHeatbeat", this.missedHeatbeat);
+      log.trace("missedHeatbeat", this.missedHeatbeat);
     }, keepAliveMs);
   }
 
diff --git a/src/types.ts b/src/types.ts
index e2dfcec1..910c08e7 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -194,4 +194,21 @@ export interface MLCEngineInterface {
     inputIds: Array<number>,
     isPrefill: boolean,
   ): Promise<number>;
+
+  /**
+   * Set MLCEngine logging output level
+   *
+   * @param logLevel The new log level
+   */
+  setLogLevel(logLevel: LogLevel): void;
 }
+
+export const LOG_LEVELS = {
+  TRACE: 0,
+  DEBUG: 1,
+  INFO: 2,
+  WARN: 3,
+  ERROR: 4,
+  SILENT: 5,
+};
+export type LogLevel = keyof typeof LOG_LEVELS;
diff --git a/src/web_worker.ts b/src/web_worker.ts
index cc1c95b7..2ff79f95 100644
--- a/src/web_worker.ts
+++ b/src/web_worker.ts
@@ -9,6 +9,7 @@ import {
   GenerateProgressCallback,
   InitProgressCallback,
   InitProgressReport,
+  LogLevel,
 } from "./types";
 import {
   ChatCompletionRequest,
@@ -31,6 +32,7 @@ import {
   WorkerResponse,
   WorkerRequest,
 } from "./message";
+import log from "loglevel";
 
 export interface PostMessageHandler {
   postMessage: (message: any) => void;
@@ -624,4 +626,8 @@ export class WebWorkerMLCEngine implements MLCEngineInterface {
       }
     }
   }
+
+  setLogLevel(logLevel: LogLevel) {
+    log.setLevel(logLevel);
+  }
 }
diff --git a/utils/vram_requirements/src/vram_requirements.ts b/utils/vram_requirements/src/vram_requirements.ts
index 2e5c2a8c..14988fcc 100644
--- a/utils/vram_requirements/src/vram_requirements.ts
+++ b/utils/vram_requirements/src/vram_requirements.ts
@@ -1,6 +1,7 @@
 import ModelRecord from "@mlc-ai/web-llm";
-import appConfig from "./app-config"; // Modify this to inspect vram requirement for models of choice
+import appConfig from "./app-config"; // Modify this to inspect vram requirement for models of choice
 import * as tvmjs from "tvmjs";
+import log from "loglevel";
 
 function setLabel(id: string, text: string) {
   const label = document.getElementById(id);
@@ -14,16 +15,16 @@ interface AppConfig {
   model_list: Array<ModelRecord>;
 }
 
-let dtypeBytesMap = new Map([
+const dtypeBytesMap = new Map([
   ["uint32", 4],
   ["uint16", 2],
   ["float32", 4],
-  ["float16", 4]
+  ["float16", 4],
 ]);
 
 async function main() {
-  let config: AppConfig = appConfig;
-  let report: string = "";
+  const config: AppConfig = appConfig;
+  let report = "";
   for (let i = 0; i < config.model_list.length; ++i) {
     // 1. Read each model record
     const modelRecord: ModelRecord = config.model_list[i];
@@ -36,7 +37,7 @@ async function main() {
     const tvm = await tvmjs.instantiate(
       new Uint8Array(wasmSource),
       tvmjs.createPolyfillWASI(),
-      console.log
+      log.info,
     );
     const gpuDetectOutput = await tvmjs.detectGPUDevice();
     if (gpuDetectOutput == undefined) {
@@ -45,14 +46,17 @@ async function main() {
     tvm.initWebGPU(gpuDetectOutput.device);
     tvm.beginScope();
     const vm = tvm.detachFromCurrentScope(
-      tvm.createVirtualMachine(tvm.webgpu())
+      tvm.createVirtualMachine(tvm.webgpu()),
     );
     // 4. Get metadata from the vm
     let fgetMetadata: any;
     try {
       fgetMetadata = vm.getFunction("_metadata");
     } catch (err) {
-      console.error("The wasm needs to have function `_metadata` to inspect vram requirement.", err);
+      log.error(
+        "The wasm needs to have function `_metadata` to inspect vram requirement.",
+        err,
+      );
     }
     const ret_value = fgetMetadata();
     const metadataStr = tvm.detachFromCurrentScope(ret_value).toString();
@@ -65,33 +69,38 @@ async function main() {
         // Possible to have shape -1 signifying a dynamic shape -- we disregard them
         const dtypeBytes = dtypeBytesMap.get(param.dtype);
         if (dtypeBytes === undefined) {
-          throw Error("Cannot find size of " + param.dtype + ", add it to `dtypeBytesMap`.")
+          throw Error(
+            "Cannot find size of " +
+              param.dtype +
+              ", add it to `dtypeBytesMap`.",
+          );
         }
         const numParams = param.shape.reduce((a: number, b: number) => a * b);
         paramBytes += numParams * dtypeBytes;
       } else {
-        console.log(`${model_id}'s ${param.name} has dynamic shape; excluded from vRAM calculation.`)
+        log.info(
+          `${model_id}'s ${param.name} has dynamic shape; excluded from vRAM calculation.`,
+        );
       }
     });
     // 5.2. Get maximum bytes needed for temporary buffer across all functions
-    let maxTempFuncBytes: number = 0;
+    let maxTempFuncBytes = 0;
     Object.entries(metadata.memory_usage).forEach(([funcName, funcBytes]) => {
       if (typeof funcBytes !== "number") {
-        throw Error("`memory_usage` expects entry `funcName: funcBytes`.")
+        throw Error("`memory_usage` expects entry `funcName: funcBytes`.");
       }
       maxTempFuncBytes = Math.max(maxTempFuncBytes, funcBytes);
-    })
+    });
     // 5.3. Get kv cache bytes
     const kv_cache_bytes: number = metadata.kv_cache_bytes;
     // 5.4. Get total vRAM needed
     const totalBytes = paramBytes + maxTempFuncBytes + kv_cache_bytes;
     // 6. Report vRAM Requirement
-    report += (
+    report +=
       `totalBytes: ${(totalBytes / 1024 / 1024).toFixed(2)} MB\n` +
       `paramBytes: ${(paramBytes / 1024 / 1024).toFixed(2)} MB\n` +
       `maxTempFuncBytes: ${(maxTempFuncBytes / 1024 / 1024).toFixed(2)} MB\n` +
-      `kv_cache_bytes: ${(kv_cache_bytes / 1024 / 1024).toFixed(2)} MB\n\n`
-    );
+      `kv_cache_bytes: ${(kv_cache_bytes / 1024 / 1024).toFixed(2)} MB\n\n`;
     // 7. Dispose everything
     tvm.endScope();
     vm.dispose();
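
For reference, a minimal usage sketch of the logging surface this patch introduces (`logLevel` in `MLCEngineConfig`, `setLogLevel()` on the engine, and the exported `LogLevel` type). The model ID below is only a placeholder and the surrounding setup is an assumption, not part of the diff:

```ts
import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function demo() {
  // "MODEL_ID" is a placeholder; use any model record from the app config.
  // logLevel is the new MLCEngineConfig field; when omitted, CreateMLCEngine
  // falls back to DefaultLogLevel ("WARN").
  const engine = await CreateMLCEngine("MODEL_ID", { logLevel: "INFO" });

  // The level can also be changed later through the new MLCEngineInterface
  // method, which forwards to loglevel's setLevel().
  engine.setLogLevel("DEBUG");
}
```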