feat: sync llama cpp #34

Merged: 3 commits, Dec 12, 2023
3 changes: 0 additions & 3 deletions android/src/main/java/com/rnllama/LlamaContext.java
@@ -52,8 +52,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
params.hasKey("use_mlock") ? params.getBoolean("use_mlock") : true,
// boolean use_mmap,
params.hasKey("use_mmap") ? params.getBoolean("use_mmap") : true,
// boolean memory_f16,
params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true,
// String lora,
params.hasKey("lora") ? params.getString("lora") : "",
// float lora_scaled,
@@ -285,7 +283,6 @@ protected static native long initContext(
int n_gpu_layers, // TODO: Support this
boolean use_mlock,
boolean use_mmap,
boolean memory_f16,
String lora,
float lora_scaled,
String lora_base,
3 changes: 0 additions & 3 deletions android/src/main/jni.cpp
@@ -129,7 +129,6 @@ Java_com_rnllama_LlamaContext_initContext(
jint n_gpu_layers, // TODO: Support this
jboolean use_mlock,
jboolean use_mmap,
jboolean memory_f16,
jstring lora_str,
jfloat lora_scaled,
jstring lora_base_str,
@@ -158,8 +157,6 @@ Java_com_rnllama_LlamaContext_initContext(
defaultParams.use_mlock = use_mlock;
defaultParams.use_mmap = use_mmap;

defaultParams.memory_f16 = memory_f16;

const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
if (lora_chars) {
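Note on the removed flag: upstream llama.cpp dropped the single memory_f16 toggle in favour of per-tensor KV cache type strings, which this sync picks up as the cache_type_k / cache_type_v fields added to gpt_params in cpp/common.h below. A minimal sketch of configuring the new fields, using only the member names visible in this diff:

    // Sketch only: uses the gpt_params members shown in this PR's cpp/common.h diff;
    // the actual JNI wiring in jni.cpp may differ.
    #include "common.h"

    static gpt_params make_params(bool use_mlock, bool use_mmap) {
        gpt_params params;
        params.use_mlock = use_mlock;
        params.use_mmap  = use_mmap;
        // memory_f16 no longer exists; KV cache precision is now a string per tensor,
        // e.g. "f16" or "f32".
        params.cache_type_k = "f16";
        params.cache_type_v = "f16";
        return params;
    }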
247 changes: 240 additions & 7 deletions cpp/common.cpp

Large diffs are not rendered by default.

31 changes: 29 additions & 2 deletions cpp/common.h
@@ -86,6 +86,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files

std::vector<llama_model_kv_override> kv_overrides;

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
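The new kv_overrides vector lets callers override GGUF metadata values at model load time. A rough sketch of filling one entry, assuming the llama_model_kv_override layout (fixed-size key buffer, type tag, value union) from the llama.cpp headers of this period; the exact member and enum names are not shown in this diff:

    // Sketch only: llama_model_kv_override member/enum names are assumed, not taken from this diff.
    #include <cstdint>
    #include <cstdio>
    #include "common.h"

    static void add_int_override(gpt_params & params, const char * key, int64_t value) {
        llama_model_kv_override kvo = {};
        std::snprintf(kvo.key, sizeof(kvo.key), "%s", key);  // fixed-size key buffer
        kvo.tag = LLAMA_KV_OVERRIDE_INT;                      // assumed enum value
        kvo.int_value = value;                                // assumed union member
        params.kv_overrides.push_back(kvo);
    }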
@@ -98,10 +100,10 @@ struct gpt_params {
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

@@ -121,10 +123,15 @@ struct gpt_params {
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
};

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -139,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);

void process_escapes(std::string& input);

//
// String parsing
//

std::string parse_samplers_input(std::string input);

//
// Model utils
//
@@ -200,6 +213,10 @@ std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);

// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);

//
// YAML utils
//
@@ -213,3 +230,13 @@ std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

//
// KV cache utils
//

// Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
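The two dump helpers above take a llama_kv_cache_view, which in upstream llama.cpp of this period is produced and refreshed by llama_kv_cache_view_init / llama_kv_cache_view_update and released with llama_kv_cache_view_free; those three calls are assumptions here, since only the dump declarations appear in this diff. A debugging sketch:

    // Sketch only: the llama_kv_cache_view_init/update/free calls are assumed from
    // upstream llama.cpp of this period; only dump_kv_cache_view* come from this diff.
    #include "common.h"
    #include "llama.h"

    static void debug_dump_kv(llama_context * ctx, int n_seq_max) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_seq_max);
        llama_kv_cache_view_update(ctx, &view);

        dump_kv_cache_view(view, 80);       // compact: sequence count per cell
        dump_kv_cache_view_seqs(view, 40);  // verbose: individual sequence ids per cell

        llama_kv_cache_view_free(&view);
    }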