feat: sync llama.cpp (breaking change) #9

Merged: 5 commits, Aug 22, 2023
README.md (4 changes: 2 additions & 2 deletions)
@@ -41,10 +41,10 @@ python3 convert.py models/7B/ --outtype f16
make quantize

# quantize the model to 2-bits (using q2_k method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q2_k.bin q2_k
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q2_k.gguf q2_k

# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
```

## Usage
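Since this sync moves llama.cpp to the GGUF file format, the quantize examples above now produce `.gguf` files, and models loaded through llama.rn must be GGUF as well. Below is a minimal sketch of loading such a model from React Native, assuming llama.rn's `initLlama` API and a hypothetical local file path:

```ts
import { initLlama } from 'llama.rn'

// Hypothetical path; the example app obtains the file via DocumentPicker instead.
const modelPath = 'file:///path/to/ggml-model-q4_0.gguf'

async function loadModel() {
  // After this change the model file must be in GGUF format,
  // not the old ggml .bin format.
  const context = await initLlama({
    model: modelPath,
    n_ctx: 2048,
    use_mlock: true,
  })
  return context
}
```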
cpp/rn-llama.hpp (10 changes: 5 additions & 5 deletions)
@@ -217,7 +217,7 @@ struct llama_rn_context
grammar_parser::print_grammar(stderr, parsed_grammar);

{
auto it = params.logit_bias.find(llama_token_eos());
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail");
}
@@ -336,7 +336,7 @@ struct llama_rn_context
if (params.n_predict == 0)
{
has_next_token = false;
result.tok = llama_token_eos();
result.tok = llama_token_eos(ctx);
return result;
}

@@ -376,7 +376,7 @@ struct llama_rn_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

// Apply penalties
float nl_logit = logits[llama_token_nl()];
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -386,7 +386,7 @@
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl)
{
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}

if (grammar != nullptr) {
@@ -448,7 +448,7 @@ struct llama_rn_context
// decrement remaining sampling budget
--n_remain;

if (!embd.empty() && embd.back() == llama_token_eos())
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{
// stopping_word = llama_token_to_str(ctx, embd.back());
has_next_token = false;
example/App.tsx (2 changes: 1 addition & 1 deletion)
@@ -112,7 +112,7 @@
}

const handlePickModel = async () => {
DocumentPicker.pick({ type: ['public.archive'] })
DocumentPicker.pick() // TODO: Is there a way to filter gguf model files?
.then(async (res) => {
const [file] = res
if (file) handleInitContext(file)
@@ -220,7 +220,7 @@

const converted = convertJsonSchemaToGrammar({ schema, propOrder: { function: 0, arguments: 1 } })
// @ts-ignore
if (false) console.log('Converted grammar:', converted)

(GitHub Actions / lint warning on line 223 in example/App.tsx: Unexpected constant condition)
grammar = undefined
// Uncomment to test:
// grammar = converted
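For context, the grammar produced by `convertJsonSchemaToGrammar` above is meant to constrain sampling once the test line is uncommented. A rough sketch of that conversion on its own follows, assuming the helper is exported from llama.rn as it is used in the example app, and using a hypothetical schema:

```ts
import { convertJsonSchemaToGrammar } from 'llama.rn'

// Hypothetical schema describing a function-call payload.
const schema = {
  type: 'object',
  properties: {
    function: { type: 'string' },
    arguments: { type: 'object' },
  },
}

// Same call shape as in App.tsx above; returns a grammar string.
const grammar = convertJsonSchemaToGrammar({
  schema,
  propOrder: { function: 0, arguments: 1 },
})

// Assumed usage: pass the grammar into a completion call, e.g.
// await context.completion({ prompt: '...', grammar })
```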
example/ios/Podfile.lock (4 changes: 2 additions & 2 deletions)
@@ -8,7 +8,7 @@ PODS:
- hermes-engine/Pre-built (= 0.72.3)
- hermes-engine/Pre-built (0.72.3)
- libevent (2.1.12)
- llama-rn (0.1.4):
- llama-rn (0.1.5):
- RCT-Folly
- RCTRequired
- RCTTypeSafety
@@ -1223,7 +1223,7 @@ SPEC CHECKSUMS:
glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b
hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322
libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913
llama-rn: f8cb1160d9506a40743054510177149d24daf516
llama-rn: 2fc75a540ad1b89e773cb00f4b02c764e2b1b87a
RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1
RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18
RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3
ios/RNLlamaContext.mm (4 changes: 1 addition & 3 deletions)
@@ -63,8 +63,6 @@ + (instancetype)initWithParams:(NSDictionary *)params {
}
if (params[@"lora_base"]) defaultParams.lora_base = [params[@"lora_base"] UTF8String];

if (params[@"n_gqa"]) defaultParams.n_gqa = [params[@"n_gqa"] intValue];
if (params[@"rms_norm_eps"]) defaultParams.rms_norm_eps = [params[@"rms_norm_eps"] floatValue];
if (params[@"rope_freq_base"]) defaultParams.rope_freq_base = [params[@"rope_freq_base"] floatValue];
if (params[@"rope_freq_scale"]) defaultParams.rope_freq_scale = [params[@"rope_freq_scale"] floatValue];

@@ -177,7 +175,7 @@ - (NSDictionary *)completion:(NSDictionary *)params

self->llama->params.logit_bias.clear();
if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) {
self->llama->params.logit_bias[llama_token_eos()] = -INFINITY;
self->llama->params.logit_bias[llama_token_eos(self->llama->ctx)] = -INFINITY;
}

if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) {
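The native change above only swaps in the context-aware `llama_token_eos(ctx)` lookup; the options read from JavaScript stay the same. Below is a hedged sketch of the completion options this Objective-C code consumes (`ignore_eos` as a boolean, `logit_bias` as an array of `[tokenId, bias]` pairs), with hypothetical values:

```ts
// Hypothetical values; `context` would be the handle returned by initLlama.
const completionParams = {
  prompt: 'Hello',
  n_predict: 64,
  // Maps to logit_bias[llama_token_eos(ctx)] = -INFINITY on the native side,
  // so generation never stops on the EOS token.
  ignore_eos: true,
  // Additional per-token biases, read as [tokenId, bias] pairs.
  logit_bias: [[13, -1.0]],
}

// Assumed call shape:
// await context.completion(completionParams)
```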
llama.cpp (2 changes: 1 addition & 1 deletion)
Submodule llama.cpp updated 64 files
+4 −1 .gitignore
+11 −2 CMakeLists.txt
+20 −11 Makefile
+34 −19 README.md
+54 −20 build.zig
+22 −22 ci/run.sh
+20 −0 common/CMakeLists.txt
+95 −29 common/common.cpp
+25 −9 common/common.h
+0 −0 common/console.cpp
+0 −0 common/console.h
+0 −0 common/grammar-parser.cpp
+0 −0 common/grammar-parser.h
+282 −0 convert-falcon-hf-to-gguf.py
+266 −0 convert-gptneox-hf-to-gguf.py
+307 −0 convert-llama-7b-pth-to-gguf.py
+333 −0 convert-llama-ggmlv3-to-gguf.py
+327 −0 convert-llama-hf-to-gguf.py
+0 −13 convert-pth-to-ggml.py
+374 −646 convert.py
+3 −3 docs/token_generation_performance_tips.md
+1 −21 examples/CMakeLists.txt
+91 −89 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+3 −3 examples/embd-input/embd-input-lib.cpp
+1 −1 examples/embedding/embedding.cpp
+246 −0 examples/gguf/gguf.cpp
+1,133 −0 examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+1,111 −0 examples/gptneox-wip/falcon-main.cpp
+1,082 −0 examples/gptneox-wip/gptneox-main.cpp
+8 −0 examples/llama-bench/CMakeLists.txt
+969 −0 examples/llama-bench/llama-bench.cpp
+15 −24 examples/main/main.cpp
+1 −1 examples/metal/metal.cpp
+90 −25 examples/perplexity/perplexity.cpp
+1 −1 examples/quantize-stats/quantize-stats.cpp
+4 −4 examples/quantize/quantize.cpp
+4 −6 examples/save-load-state/save-load-state.cpp
+3 −4 examples/server/README.md
+5 −3 examples/server/deps.sh
+1,356 −918 examples/server/index.html.hpp
+158 −2 examples/server/public/index.html
+40 −50 examples/server/server.cpp
+50 −101 examples/simple/simple.cpp
+68 −70 examples/train-text-from-scratch/train-text-from-scratch.cpp
+12 −5 ggml-alloc.c
+12 −0 ggml-cuda.cu
+19 −23 ggml-cuda.h
+3 −0 ggml-metal.h
+15 −0 ggml-metal.m
+2 −1 ggml-metal.metal
+1,121 −135 ggml.c
+124 −4 ggml.h
+718 −0 gguf.py
+0 −553 llama-util.h
+2,257 −1,395 llama.cpp
+141 −122 llama.h
+1 −0 models/.editorconfig
+ models/ggml-vocab-llama.gguf
+ models/ggml-vocab.bin
+30 −10 tests/CMakeLists.txt
+2 −1 tests/test-grammar-parser.cpp
+403 −0 tests/test-llama-grammar.cpp
+43 −17 tests/test-tokenizer-0.cpp
+131 −0 tests/test-tokenizer-1.cpp
scripts/bootstrap.sh (11 changes: 5 additions & 6 deletions)
@@ -10,25 +10,24 @@ cp ./llama.cpp/ggml-metal.m ./cpp/ggml-metal.m
cp ./llama.cpp/ggml-metal.metal ./cpp/ggml-metal.metal
cp ./llama.cpp/ggml-alloc.h ./cpp/ggml-alloc.h
cp ./llama.cpp/ggml-alloc.c ./cpp/ggml-alloc.c
cp ./llama.cpp/llama-util.h ./cpp/llama-util.h
cp ./llama.cpp/llama.h ./cpp/llama.h
cp ./llama.cpp/llama.cpp ./cpp/llama.cpp
cp ./llama.cpp/k_quants.h ./cpp/k_quants.h
cp ./llama.cpp/k_quants.c ./cpp/k_quants.c
cp ./llama.cpp/examples/common.h ./cpp/common.h
cp ./llama.cpp/examples/common.cpp ./cpp/common.cpp
cp ./llama.cpp/examples/grammar-parser.h ./cpp/grammar-parser.h
cp ./llama.cpp/examples/grammar-parser.cpp ./cpp/grammar-parser.cpp
cp ./llama.cpp/common/common.h ./cpp/common.h
cp ./llama.cpp/common/common.cpp ./cpp/common.cpp
cp ./llama.cpp/common/grammar-parser.h ./cpp/grammar-parser.h
cp ./llama.cpp/common/grammar-parser.cpp ./cpp/grammar-parser.cpp

# List of files to process
files=(
"./cpp/ggml.h"
"./cpp/ggml.c"
"./cpp/common.cpp"
"./cpp/ggml-metal.h"
"./cpp/ggml-metal.m"
"./cpp/llama.h"
"./cpp/llama.cpp"
"./cpp/llama-util.h"
"./cpp/k_quants.h"
"./cpp/k_quants.c"
"./cpp/ggml-alloc.h"
scripts/ggml-metal.m.patch (16 changes: 8 additions & 8 deletions)
@@ -1,9 +1,9 @@
--- ggml-metal-orig.m 2023-08-17 10:15:26
+++ ggml-metal.m 2023-08-17 10:14:18
--- ggml-metal-orig.m 2023-08-22 12:22:42
+++ ggml-metal.m 2023-08-22 12:22:43
@@ -218,13 +218,13 @@
#undef LM_GGML_METAL_ADD_KERNEL
}

- fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ // fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
@@ -17,20 +17,20 @@
+ // } else {
+ // fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+ // }

return ctx;
}
@@ -351,15 +351,15 @@
@@ -366,15 +366,15 @@
}
}

- fprintf(stderr, ", (%8.2f / %8.2f)",
- ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
- ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ // fprintf(stderr, ", (%8.2f / %8.2f)",
+ // ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+ // ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

- if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
- fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
- } else {
@@ -42,5 +42,5 @@
+ // fprintf(stderr, "\n");
+ // }
}

return true;
src/NativeRNLlama.ts (2 changes: 0 additions & 2 deletions)
@@ -21,8 +21,6 @@ export type NativeContextParams = {
lora?: string // lora_adaptor
lora_base?: string

n_gqa?: number
rms_norm_eps?: number
rope_freq_base?: number
rope_freq_scale?: number
}
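With `n_gqa` and `rms_norm_eps` dropped from `NativeContextParams`, the remaining per-context tuning options on the JS side are the RoPE frequency settings; GGUF models are expected to carry the removed values as metadata. A sketch of the trimmed-down params with placeholder values, assuming llama.rn's `initLlama` entry point:

```ts
import { initLlama } from 'llama.rn'

// Placeholder values, assuming the usual llama.rn context params.
async function createContext(modelPath: string) {
  return initLlama({
    model: modelPath,        // now expected to be a GGUF file
    n_ctx: 2048,
    rope_freq_base: 10000,   // RoPE tuning is still exposed
    rope_freq_scale: 1.0,
    // n_gqa and rms_norm_eps are no longer passed from JS after this sync.
  })
}
```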