fixed a few OOM errors with larger contexts - I cannot figure out why they happen, so I am forced to increase the buffer size.
LostRuins committed Apr 10, 2023
1 parent f53238f commit 69b85f5
Showing 5 changed files with 25 additions and 27 deletions.
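Background for the change: the eval functions patched below draw their intermediate tensors from a static scratch buffer that is grown on demand once the estimated per-token memory times the token count exceeds it. The sketch below is a simplified illustration of that pattern, not this repository's exact code; the helper name ensure_eval_buffer and the ~10% growth headroom are assumptions borrowed from the upstream ggml examples. The commit itself only raises the initial size from 256 MiB to 1 GiB as a stopgap, presumably because the per-token estimate undershoots at larger context lengths.

// Simplified sketch of the grow-on-demand scratch buffer pattern used by the
// eval functions in this commit (illustration only, not the project's code).
#include <cstdio>
#include <cstdlib>

static size_t buf_size = 1024u * 1024 * 1024; // raised from 256 MiB by this commit
static void * buf = malloc(buf_size);

// Grow the buffer if the estimated requirement exceeds the current size.
static bool ensure_eval_buffer(size_t mem_per_token, size_t n_tokens) {
    if (mem_per_token > 0 && mem_per_token * n_tokens > buf_size) {
        // ~10% headroom over the estimate, mirroring the upstream ggml examples.
        const size_t buf_size_new = 1.1 * (mem_per_token * n_tokens);
        void * buf_new = realloc(buf, buf_size_new);
        if (buf_new == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size_new);
            return false;
        }
        buf = buf_new;
        buf_size = buf_size_new;
    }
    return true;
}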
22 changes: 5 additions & 17 deletions gpttype_adapter.cpp
@@ -49,6 +49,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 n_threads = params.n_threads = inputs.threads;
 n_batch = params.n_batch = inputs.batch_size;
 modelname = params.model = inputs.model_filename;
+params.memory_f16 = inputs.f16_kv;
+params.n_ctx = inputs.max_context_length;
+model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
 
 if (file_format == FileFormat::GPT2_1)
 {
@@ -153,6 +156,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 params.temp = inputs.temperature;
 params.repeat_last_n = inputs.rep_pen_range;
 params.repeat_penalty = inputs.rep_pen;
+params.n_ctx = inputs.max_context_length;
 params.n_batch = n_batch;
 params.n_threads = n_threads;
 
@@ -173,23 +177,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
 
 //truncate to front of the prompt if its too long
-int32_t nctx = 512;
-if(file_format == FileFormat::GPTJ_1||file_format == FileFormat::GPTJ_2)
-{
-nctx = model_v1.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPTJ_3)
-{
-nctx = model_v2.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPT2_1)
-{
-nctx = model_gpt2_v1.hparams.n_ctx;
-}
-else if(file_format==FileFormat::GPT2_2)
-{
-nctx = model_gpt2_v2.hparams.n_ctx;
-}
+int32_t nctx = params.n_ctx;
 
 if (embd_inp.size() + params.n_predict > nctx)
 {
6 changes: 3 additions & 3 deletions koboldcpp.py
@@ -349,7 +349,7 @@ def main(args):
 mdl_nparts = sum(1 for n in range(1, 9) if os.path.exists(f"{ggml_selected_file}.{n}")) + 1
 modelname = os.path.abspath(ggml_selected_file)
 print(f"Loading model: {modelname} \n[Parts: {mdl_nparts}, Threads: {args.threads}]")
-loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,args.usemmap)
+loadok = load_model(modelname,8,maxctx,mdl_nparts,args.threads,(not args.nommap))
 print("Load Model OK: " + str(loadok))
 
 if not loadok:
@@ -378,7 +378,7 @@ def main(args):
 RunServerMultiThreaded(args.host, args.port, embedded_kailite)
 
 if __name__ == '__main__':
-print("Welcome to KoboldCpp - Version 1.3") # just update version manually
+print("Welcome to KoboldCpp - Version 1.4") # just update version manually
 parser = argparse.ArgumentParser(description='Kobold llama.cpp server')
 parser.add_argument("model_file", help="Model file to load", nargs="?")
 portgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
@@ -396,6 +396,6 @@ def main(args):
 parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
 parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
 parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-parser.add_argument("--usemmap", help="Use mmap to load newer models (default false)", action='store_true')
+parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
 args = parser.parse_args()
 main(args)
10 changes: 8 additions & 2 deletions otherarch/gpt2_v1.cpp
@@ -36,17 +36,23 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 }
 }
 
+auto desiredMaxCtx = model.hparams.n_ctx;
+
 // load hparams
 {
 auto & hparams = model.hparams;
 
+
 fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
 fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
 fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
 fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 fin.read((char *) &hparams.f16, sizeof(hparams.f16));
 
+//used to expand KV size if needed
+desiredMaxCtx = std::max(hparams.n_ctx,desiredMaxCtx);
+
 printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -94,7 +100,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
-const int n_ctx = hparams.n_ctx;
+const int n_ctx = desiredMaxCtx;
 const int n_vocab = hparams.n_vocab;
 
 ctx_size += n_embd*ggml_v1_type_size(GGML_V1_TYPE_F32); // ln_f_g
@@ -215,7 +221,7 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
 
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
-const int n_ctx = hparams.n_ctx;
+const int n_ctx = desiredMaxCtx;
 
 const int n_mem = n_layer*n_ctx;
 const int n_elements = n_embd*n_mem;
11 changes: 7 additions & 4 deletions otherarch/gpt2_v2.cpp
@@ -81,6 +81,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 }
 }
 
+auto memory_type = GGML_TYPE_F16;
+
 // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 // in order to save memory and also to speed up the computation
 ggml_type wtype = GGML_TYPE_COUNT;
@@ -242,9 +244,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
 const int n_mem = n_layer*n_ctx;
 const int n_elements = n_embd*n_mem;
-
-model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+
+model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
 
 const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
@@ -370,7 +372,8 @@ bool gpt2_eval(
 const int n_head = hparams.n_head;
 const int n_vocab = hparams.n_vocab;
 
-static size_t buf_size = 256u*1024*1024;
+//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+static size_t buf_size = 1024u*1024*1024;
 static void * buf = malloc(buf_size);
 
 if (mem_per_token > 0 && mem_per_token*N > buf_size) {
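For a sense of scale (an illustrative back-of-envelope estimate, not code from this repository), the KV cache affected by the memory_type change above and by the n_ctx propagation in gpttype_adapter.cpp grows linearly with context length, and storing it in F16 halves its footprint:

// Rough KV-cache size estimate: two tensors (K and V), each holding
// n_layer * n_ctx * n_embd elements at the chosen element size.
#include <cstddef>
#include <cstdio>

static size_t kv_cache_bytes(size_t n_layer, size_t n_ctx, size_t n_embd, size_t elem_size) {
    return 2 * n_layer * n_ctx * n_embd * elem_size;
}

int main() {
    const double mib = 1024.0 * 1024.0;
    // Example figures roughly in the range of GPT-2 XL: 48 layers, 1600-wide embeddings.
    printf("n_ctx=2048, f32: %.0f MiB\n", kv_cache_bytes(48, 2048, 1600, 4) / mib); // 1200 MiB
    printf("n_ctx=2048, f16: %.0f MiB\n", kv_cache_bytes(48, 2048, 1600, 2) / mib); // 600 MiB
    return 0;
}

The scratch buffer for intermediate tensors scales with context as well, which is presumably what still overflows the old 256 MiB default and why the todo above falls back to simply enlarging it.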
3 changes: 2 additions & 1 deletion otherarch/gptj_v2.cpp
@@ -378,7 +378,8 @@ bool gptj_eval(
 
 const int d_key = n_embd/n_head;
 
-static size_t buf_size = 256u*1024*1024;
+//todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now
+static size_t buf_size = 1024u*1024*1024;
 static void * buf = malloc(buf_size);
 
 if (mem_per_token > 0 && mem_per_token*N > buf_size) {
