fixed a few OOM errors with larger contexts - I cannot figure out why they happen, so I am forced to increase the buffer size.
LostRuins committed Apr 10, 2023
1 parent f53238f commit 69b85f5
Showing 5 changed files with 25 additions and 27 deletions.
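Background for the change: the eval functions patched below draw their intermediate tensors from a static scratch buffer that is grown on demand once the estimated per-token memory times the token count exceeds it. The sketch below is a simplified illustration of that pattern, not this repository's exact code; the helper name ensure_eval_buffer and the ~10% growth headroom are assumptions borrowed from the upstream ggml examples. The commit itself only raises the initial size from 256 MiB to 1 GiB as a stopgap, presumably because the per-token estimate undershoots at larger context lengths.

// Simplified sketch of the grow-on-demand scratch buffer pattern used by the
// eval functions in this commit (illustration only, not the project's code).
#include <cstdio>
#include <cstdlib>

static size_t buf_size = 1024u * 1024 * 1024; // raised from 256 MiB by this commit
static void * buf = malloc(buf_size);

// Grow the buffer if the estimated requirement exceeds the current size.
static bool ensure_eval_buffer(size_t mem_per_token, size_t n_tokens) {
    if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size) {
        // ~10% headroom over the estimate, mirroring the upstream ggml examples.
        const size_t buf_size_new = 1.1 * (mem_per_token * n_tokens);
        void * buf_new = realloc(buf, buf_size_new);
        if (buf_new == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size_new);
            return false;
        }
        buf = buf_new;
        buf_size = buf_size_new;
    }
    return true;
}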
22 changes: 5 additions & 17 deletions gpttype_adapter.cpp
@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 n_threads = params.n_threads = inputs.threads;
 n_batch = params.n_batch = inputs.batch_size;
 modelname = params.model = inputs.model_filename;
+params.memory_f16 = inputs.f16_kv;
+params.n_ctx = inputs.max_context_length;
+model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
 
 if (file_format == FileFormat::GPT2_1)
 {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 params.temp = inputs.temperature;
 params.repeat_last_n = inputs.rep_pen_range;
 params.repeat_penalty = inputs.rep_pen;
+params.n_ctx = inputs.max_context_length;
 params.n_batch = n_batch;
 params.n_threads = n_threads;
 
@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
 
 //truncate to front of the prompt if its too long
-int32_t nctx = 512;
-if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-{
-nctx = model_v1.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPTJ_3)
-{
-nctx = model_v2.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPT2_1)
-{
-nctx = model_gpt2_v1.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPT2_2)
-{
-nctx = model_gpt2_v2.hparams.n_ctx;
-}
+int32_t nctx = params.n_ctx;
 
 if (embd_inp.size() + params.n_predict > nctx)
 {
6 changes: 3 additions & 3 deletions koboldcpp.py
@@ -349,7 +349,7 @@ def main(args):
 mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
 modelname = os.path.abspath(ggml_selected_file)
 print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
 print("Load Model OK: " + str(loadok))
 
 if not loadok:
@@ -378,7 +378,7 @@ def main(args):
 RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 
 if __name__ == '__main__':
-print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+print("Welcome to KoboldCpp - Version 1.4") # just update version manually
 parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
 parser.add_argument("model_file", help="Model file to load", nargs="?")
 portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ def main(args):
 parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
 parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
 parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
 args = parser.parse_args()
 main(args)
10 changes: 8 additions & 2 deletions otherarch/gpt2_v1.cpp
@@ -36,17 +36,23 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 }
 }
 
+auto desiredMaxCtx = model.hparams.n_ctx;
+
 // load hparams
 {
 auto & hparams = model.hparams;
 
+
 fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.f16, sizeof(hparams.f16));
 
+//used to expand KV size if needed
+desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
+
 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
-const int n_ctx = hparams.n_ctx;
+const int n_ctx = desiredMaxCtx;
 const int n_vocab = hparams.n_vocab;
 
 ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
-const int n_ctx = hparams.n_ctx;
+const int n_ctx = desiredMaxCtx;
 
 const int n_mem = n_layer*n_ctx;
 const int n_elements = n_embd*n_mem;
11 changes: 7 additions & 4 deletions otherarch/gpt2_v2.cpp
@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 }
 }
 
+auto memory_type = GGML_TYPE_F16;
+
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
 ggml_type wtype = GGML_TYPE_COUNT;
@@ -242,9 +244,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
 const int n_mem = n_layer*n_ctx;
 const int n_elements = n_embd*n_mem;
-
-model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+
+model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
 
 const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
@@ -370,7 +372,8 @@ bool gpt2_eval(
 const int n_head = hparams.n_head;
 const int n_vocab = hparams.n_vocab;
 
-static size_t buf_size = 256u*1024*1024;
+//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+static size_t buf_size = 1024u*1024*1024;
 static void * buf = malloc(buf_size);
 
 if (mem_per_token > 0 && mem_per_token*N > buf_size) {
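For a sense of scale (an illustrative back-of-envelope estimate, not code from this repository), the KV cache affected by the memory_type change above and by the n_ctx propagation in gpttype_adapter.cpp grows linearly with context length, and storing it in F16 halves its footprint:

// Rough KV-cache size estimate: two tensors (K and V), each holding
// n_layer * n_ctx * n_embd elements at the chosen element size.
#include <cstddef>
#include <cstdio>

static size_t kv_cache_bytes(size_t n_layer, size_t n_ctx, size_t n_embd, size_t elem_size) {
    return 2 * n_layer * n_ctx * n_embd * elem_size;
}

int main() {
    const double mib = 1024.0 * 1024.0;
    // Example figures roughly in the range of GPT-2 XL: 48 layers, 1600-wide embeddings.
    printf("n_ctx=2048, f32: %.0f MiB\n", kv_cache_bytes(48, 2048, 1600, 4) / mib); // 1200 MiB
    printf("n_ctx=2048, f16: %.0f MiB\n", kv_cache_bytes(48, 2048, 1600, 2) / mib); // 600 MiB
    return 0;
}

The scratch buffer for intermediate tensors scales with context as well, which is presumably what still overflows the old 256 MiB default and why the todo above falls back to simply enlarging it.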
3 changes: 2 additions & 1 deletion otherarch/gptj_v2.cpp
@@ -378,7 +378,8 @@ bool gptj_eval(
 
 const int d_key = n_embd/n_head;
 
-static size_t buf_size = 256u*1024*1024;
+//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+static size_t buf_size = 1024u*1024*1024;
 static void * buf = malloc(buf_size);
 
 if (mem_per_token > 0 && mem_per_token*N > buf_size) {
