From cfbf0d47755cc9365e7621748c1a9be4aefb5f0d Mon Sep 17 00:00:00 2001
From: Jesse Luoto
Date: Sat, 24 Aug 2024 10:52:40 +0300
Subject: [PATCH 1/2] Fix typo: disableGPU != disableGpu

This made local inference try to use the GPU even when the user
provided the `--disableGpu` flag.
---
 src/commands/local.ts                 |  4 +++-
 src/plugins/local-llm-rename/llama.ts | 18 +++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/commands/local.ts b/src/commands/local.ts
index 8ad2296..b24a0a9 100644
--- a/src/commands/local.ts
+++ b/src/commands/local.ts
@@ -25,9 +25,11 @@ export const local = cli()
       verbose.enabled = true;
     }
 
+    verbose.log("Starting local inference with options: ", opts);
+
     const prompt = await llama({
       model: opts.model,
-      disableGPU: opts.disableGPU,
+      disableGpu: opts.disableGpu,
       seed: opts.seed ? parseInt(opts.seed) : undefined
     });
     await unminify(filename, opts.outputDir, [
diff --git a/src/plugins/local-llm-rename/llama.ts b/src/plugins/local-llm-rename/llama.ts
index 48cc493..f81eb99 100644
--- a/src/plugins/local-llm-rename/llama.ts
+++ b/src/plugins/local-llm-rename/llama.ts
@@ -1,6 +1,12 @@
-import { getLlama, LlamaChatSession, LlamaGrammar } from "node-llama-cpp";
+import {
+  getLlama,
+  LlamaChatSession,
+  LlamaGrammar,
+  LlamaModelOptions
+} from "node-llama-cpp";
 import { Gbnf } from "./gbnf.js";
 import { getModelPath, getModelWrapper } from "../../local-models.js";
+import { verbose } from "../../verbose.js";
 
 export type Prompt = (
   systemPrompt: string,
@@ -13,13 +19,15 @@ const IS_CI = process.env["CI"] === "true";
 export async function llama(opts: {
   seed?: number;
   model: string;
-  disableGPU?: boolean;
+  disableGpu?: boolean;
 }): Promise<Prompt> {
   const llama = await getLlama();
-  const model = await llama.loadModel({
+  const modelOpts: LlamaModelOptions = {
     modelPath: getModelPath(opts?.model),
-    gpuLayers: (opts?.disableGPU ?? IS_CI) ? 0 : undefined
-  });
+    gpuLayers: (opts?.disableGpu ?? IS_CI) ? 0 : undefined
+  };
+  verbose.log("Loading model with options", modelOpts);
+  const model = await llama.loadModel(modelOpts);
   const context = await model.createContext({
     seed: opts?.seed
   });

From 7fb58d4dceb3a833068d0b89ce37c4ad00e1ea9f Mon Sep 17 00:00:00 2001
From: Jesse Luoto
Date: Sat, 24 Aug 2024 10:55:37 +0300
Subject: [PATCH 2/2] Disable the GPU altogether when requested

Setting gpuLayers to 0 seemed to do most of the work, but inference
still used some of the available GPU, which was weird. Let's see if
this messes up the CI test times.
---
 src/plugins/local-llm-rename/llama.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/plugins/local-llm-rename/llama.ts b/src/plugins/local-llm-rename/llama.ts
index f81eb99..717fc76 100644
--- a/src/plugins/local-llm-rename/llama.ts
+++ b/src/plugins/local-llm-rename/llama.ts
@@ -21,10 +21,11 @@ export async function llama(opts: {
   model: string;
   disableGpu?: boolean;
 }): Promise<Prompt> {
-  const llama = await getLlama();
+  const disableGpu = opts.disableGpu ?? IS_CI;
+  const llama = await getLlama({ gpu: disableGpu ? false : "auto" });
   const modelOpts: LlamaModelOptions = {
     modelPath: getModelPath(opts?.model),
-    gpuLayers: (opts?.disableGpu ?? IS_CI) ? 0 : undefined
+    gpuLayers: disableGpu ? 0 : undefined
   };
   verbose.log("Loading model with options", modelOpts);
   const model = await llama.loadModel(modelOpts);
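
Note for reviewers: a minimal sketch of how the CPU-only path is exercised once both
patches are applied. The import path, model id, and seed values below are illustrative
assumptions; only `llama()`, `disableGpu`, and the CI fallback come from the patches.

  // Hypothetical caller, mirroring what the `local` command does when the
  // --disableGpu flag is passed: getLlama({ gpu: false }) is requested and
  // gpuLayers is forced to 0, so the whole run stays on the CPU.
  // With disableGpu omitted, CI="true" in the environment has the same effect.
  import { llama } from "./plugins/local-llm-rename/llama.js"; // assumed path, relative to src/

  const prompt = await llama({
    model: "2b",        // placeholder model id, resolved via getModelPath()
    disableGpu: true,   // same option the --disableGpu CLI flag sets
    seed: 1
  });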