From cfbf0d47755cc9365e7621748c1a9be4aefb5f0d Mon Sep 17 00:00:00 2001
From: Jesse Luoto
Date: Sat, 24 Aug 2024 10:52:40 +0300
Subject: [PATCH 1/2] Fix typo: disableGPU != disableGpu

This made local inference try to use the GPU even when the user
provided the `--disableGpu` flag.
---
 src/commands/local.ts                 |  4 +++-
 src/plugins/local-llm-rename/llama.ts | 18 +++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/commands/local.ts b/src/commands/local.ts
index 8ad2296..b24a0a9 100644
--- a/src/commands/local.ts
+++ b/src/commands/local.ts
@@ -25,9 +25,11 @@ export const local = cli()
       verbose.enabled = true;
     }
 
+    verbose.log("Starting local inference with options: ", opts);
+
     const prompt = await llama({
       model: opts.model,
-      disableGPU: opts.disableGPU,
+      disableGpu: opts.disableGpu,
       seed: opts.seed ? parseInt(opts.seed) : undefined
     });
     await unminify(filename, opts.outputDir, [
diff --git a/src/plugins/local-llm-rename/llama.ts b/src/plugins/local-llm-rename/llama.ts
index 48cc493..f81eb99 100644
--- a/src/plugins/local-llm-rename/llama.ts
+++ b/src/plugins/local-llm-rename/llama.ts
@@ -1,6 +1,12 @@
-import { getLlama, LlamaChatSession, LlamaGrammar } from "node-llama-cpp";
+import {
+  getLlama,
+  LlamaChatSession,
+  LlamaGrammar,
+  LlamaModelOptions
+} from "node-llama-cpp";
 import { Gbnf } from "./gbnf.js";
 import { getModelPath, getModelWrapper } from "../../local-models.js";
+import { verbose } from "../../verbose.js";
 
 export type Prompt = (
   systemPrompt: string,
@@ -13,13 +19,15 @@ const IS_CI = process.env["CI"] === "true";
 export async function llama(opts: {
   seed?: number;
   model: string;
-  disableGPU?: boolean;
+  disableGpu?: boolean;
 }): Promise<Prompt> {
   const llama = await getLlama();
-  const model = await llama.loadModel({
+  const modelOpts: LlamaModelOptions = {
     modelPath: getModelPath(opts?.model),
-    gpuLayers: (opts?.disableGPU ?? IS_CI) ? 0 : undefined
-  });
+    gpuLayers: (opts?.disableGpu ?? IS_CI) ? 0 : undefined
+  };
+  verbose.log("Loading model with options", modelOpts);
+  const model = await llama.loadModel(modelOpts);
   const context = await model.createContext({
     seed: opts?.seed
   });

From 7fb58d4dceb3a833068d0b89ce37c4ad00e1ea9f Mon Sep 17 00:00:00 2001
From: Jesse Luoto
Date: Sat, 24 Aug 2024 10:55:37 +0300
Subject: [PATCH 2/2] Disable the GPU altogether when requested

Setting gpuLayers to 0 seemed to do most of the work, but inference
still used some of the available GPU, which was weird. Let's see if
this messes up the CI test times.
---
 src/plugins/local-llm-rename/llama.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/plugins/local-llm-rename/llama.ts b/src/plugins/local-llm-rename/llama.ts
index f81eb99..717fc76 100644
--- a/src/plugins/local-llm-rename/llama.ts
+++ b/src/plugins/local-llm-rename/llama.ts
@@ -21,10 +21,11 @@ export async function llama(opts: {
   model: string;
   disableGpu?: boolean;
 }): Promise<Prompt> {
-  const llama = await getLlama();
+  const disableGpu = opts.disableGpu ?? IS_CI;
+  const llama = await getLlama({ gpu: disableGpu ? false : "auto" });
   const modelOpts: LlamaModelOptions = {
     modelPath: getModelPath(opts?.model),
-    gpuLayers: (opts?.disableGpu ?? IS_CI) ? 0 : undefined
+    gpuLayers: disableGpu ? 0 : undefined
   };
   verbose.log("Loading model with options", modelOpts);
   const model = await llama.loadModel(modelOpts);
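
Note for reviewers: a minimal sketch of how the CPU-only path is exercised once both
patches are applied. The import path, model id, and seed values below are illustrative
assumptions; only `llama()`, `disableGpu`, and the CI fallback come from the patches.

  // Hypothetical caller, mirroring what the `local` command does when the
  // --disableGpu flag is passed: getLlama({ gpu: false }) is requested and
  // gpuLayers is forced to 0, so the whole run stays on the CPU.
  // With disableGpu omitted, CI="true" in the environment has the same effect.
  import { llama } from "./plugins/local-llm-rename/llama.js"; // assumed path, relative to src/

  const prompt = await llama({
    model: "2b",        // placeholder model id, resolved via getModelPath()
    disableGpu: true,   // same option the --disableGpu CLI flag sets
    seed: 1
  });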