feat: sync llama.cpp (breaking change) #9

Merged: 5 commits, Aug 22, 2023
README.md (4 changes: 2 additions & 2 deletions)
@@ -41,10 +41,10 @@ python3 convert.py models/7B/ --outtype f16
make quantize

# quantize the model to 2-bits (using q2_k method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q2_k.bin q2_k
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q2_k.gguf q2_k

# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
```

## Usage
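Since this sync moves llama.cpp to the GGUF file format, the quantize examples above now produce `.gguf` files, and models loaded through llama.rn must be GGUF as well. Below is a minimal sketch of loading such a model from React Native, assuming llama.rn's `initLlama` API and a hypothetical local file path:

```ts
import { initLlama } from 'llama.rn'

// Hypothetical path; the example app obtains the file via DocumentPicker instead.
const modelPath = 'file:///path/to/ggml-model-q4_0.gguf'

async function loadModel() {
  // After this change the model file must be in GGUF format,
  // not the old ggml .bin format.
  const context = await initLlama({
    model: modelPath,
    n_ctx: 2048,
    use_mlock: true,
  })
  return context
}
```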
cpp/rn-llama.hpp (10 changes: 5 additions & 5 deletions)
@@ -217,7 +217,7 @@ struct llama_rn_context
grammar_parser::print_grammar(stderr, parsed_grammar);

{
auto it = params.logit_bias.find(llama_token_eos());
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail");
}
@@ -336,7 +336,7 @@ struct llama_rn_context
if (params.n_predict == 0)
{
has_next_token = false;
result.tok = llama_token_eos();
result.tok = llama_token_eos(ctx);
return result;
}

@@ -376,7 +376,7 @@ struct llama_rn_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

// Apply penalties
float nl_logit = logits[llama_token_nl()];
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -386,7 +386,7 @@
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl)
{
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}

if (grammar != nullptr) {
@@ -448,7 +448,7 @@ struct llama_rn_context
// decrement remaining sampling budget
--n_remain;

if (!embd.empty() && embd.back() == llama_token_eos())
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{
// stopping_word = llama_token_to_str(ctx, embd.back());
has_next_token = false;
example/App.tsx (2 changes: 1 addition & 1 deletion)
@@ -112,7 +112,7 @@
}

const handlePickModel = async () => {
DocumentPicker.pick({ type: ['public.archive'] })
DocumentPicker.pick() // TODO: Is there a way to filter gguf model files?
.then(async (res) => {
const [file] = res
if (file) handleInitContext(file)
@@ -220,7 +220,7 @@

const converted = convertJsonSchemaToGrammar({ schema, propOrder: { function: 0, arguments: 1 } })
// @ts-ignore
if (false) console.log('Converted grammar:', converted)

(GitHub Actions / lint warning on line 223 in example/App.tsx: Unexpected constant condition)
grammar = undefined
// Uncomment to test:
// grammar = converted
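For context, the grammar produced by `convertJsonSchemaToGrammar` above is meant to constrain sampling once the test line is uncommented. A rough sketch of that conversion on its own follows, assuming the helper is exported from llama.rn as it is used in the example app, and using a hypothetical schema:

```ts
import { convertJsonSchemaToGrammar } from 'llama.rn'

// Hypothetical schema describing a function-call payload.
const schema = {
  type: 'object',
  properties: {
    function: { type: 'string' },
    arguments: { type: 'object' },
  },
}

// Same call shape as in App.tsx above; returns a grammar string.
const grammar = convertJsonSchemaToGrammar({
  schema,
  propOrder: { function: 0, arguments: 1 },
})

// Assumed usage: pass the grammar into a completion call, e.g.
// await context.completion({ prompt: '...', grammar })
```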
example/ios/Podfile.lock (4 changes: 2 additions & 2 deletions)
@@ -8,7 +8,7 @@ PODS:
- hermes-engine/Pre-built (= 0.72.3)
- hermes-engine/Pre-built (0.72.3)
- libevent (2.1.12)
- llama-rn (0.1.4):
- llama-rn (0.1.5):
- RCT-Folly
- RCTRequired
- RCTTypeSafety
@@ -1223,7 +1223,7 @@ SPEC CHECKSUMS:
glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b
hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322
libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913
llama-rn: f8cb1160d9506a40743054510177149d24daf516
llama-rn: 2fc75a540ad1b89e773cb00f4b02c764e2b1b87a
RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1
RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18
RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3
ios/RNLlamaContext.mm (4 changes: 1 addition & 3 deletions)
@@ -63,8 +63,6 @@ + (instancetype)initWithParams:(NSDictionary *)params {
}
if (params[@"lora_base"]) defaultParams.lora_base = [params[@"lora_base"] UTF8String];

if (params[@"n_gqa"]) defaultParams.n_gqa = [params[@"n_gqa"] intValue];
if (params[@"rms_norm_eps"]) defaultParams.rms_norm_eps = [params[@"rms_norm_eps"] floatValue];
if (params[@"rope_freq_base"]) defaultParams.rope_freq_base = [params[@"rope_freq_base"] floatValue];
if (params[@"rope_freq_scale"]) defaultParams.rope_freq_scale = [params[@"rope_freq_scale"] floatValue];

@@ -177,7 +175,7 @@ - (NSDictionary *)completion:(NSDictionary *)params

self->llama->params.logit_bias.clear();
if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) {
self->llama->params.logit_bias[llama_token_eos()] = -INFINITY;
self->llama->params.logit_bias[llama_token_eos(self->llama->ctx)] = -INFINITY;
}

if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) {
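The native change above only swaps in the context-aware `llama_token_eos(ctx)` lookup; the options read from JavaScript stay the same. Below is a hedged sketch of the completion options this Objective-C code consumes (`ignore_eos` as a boolean, `logit_bias` as an array of `[tokenId, bias]` pairs), with hypothetical values:

```ts
// Hypothetical values; `context` would be the handle returned by initLlama.
const completionParams = {
  prompt: 'Hello',
  n_predict: 64,
  // Maps to logit_bias[llama_token_eos(ctx)] = -INFINITY on the native side,
  // so generation never stops on the EOS token.
  ignore_eos: true,
  // Additional per-token biases, read as [tokenId, bias] pairs.
  logit_bias: [[13, -1.0]],
}

// Assumed call shape:
// await context.completion(completionParams)
```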
llama.cpp (2 changes: 1 addition & 1 deletion)
Submodule llama.cpp updated 64 files
+4 −1 .gitignore
+11 −2 CMakeLists.txt
+20 −11 Makefile
+34 −19 README.md
+54 −20 build.zig
+22 −22 ci/run.sh
+20 −0 common/CMakeLists.txt
+95 −29 common/common.cpp
+25 −9 common/common.h
+0 −0 common/console.cpp
+0 −0 common/console.h
+0 −0 common/grammar-parser.cpp
+0 −0 common/grammar-parser.h
+282 −0 convert-falcon-hf-to-gguf.py
+266 −0 convert-gptneox-hf-to-gguf.py
+307 −0 convert-llama-7b-pth-to-gguf.py
+333 −0 convert-llama-ggmlv3-to-gguf.py
+327 −0 convert-llama-hf-to-gguf.py
+0 −13 convert-pth-to-ggml.py
+374 −646 convert.py
+3 −3 docs/token_generation_performance_tips.md
+1 −21 examples/CMakeLists.txt
+91 −89 examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+3 −3 examples/embd-input/embd-input-lib.cpp
+1 −1 examples/embedding/embedding.cpp
+246 −0 examples/gguf/gguf.cpp
+1,133 −0 examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+1,111 −0 examples/gptneox-wip/falcon-main.cpp
+1,082 −0 examples/gptneox-wip/gptneox-main.cpp
+8 −0 examples/llama-bench/CMakeLists.txt
+969 −0 examples/llama-bench/llama-bench.cpp
+15 −24 examples/main/main.cpp
+1 −1 examples/metal/metal.cpp
+90 −25 examples/perplexity/perplexity.cpp
+1 −1 examples/quantize-stats/quantize-stats.cpp
+4 −4 examples/quantize/quantize.cpp
+4 −6 examples/save-load-state/save-load-state.cpp
+3 −4 examples/server/README.md
+5 −3 examples/server/deps.sh
+1,356 −918 examples/server/index.html.hpp
+158 −2 examples/server/public/index.html
+40 −50 examples/server/server.cpp
+50 −101 examples/simple/simple.cpp
+68 −70 examples/train-text-from-scratch/train-text-from-scratch.cpp
+12 −5 ggml-alloc.c
+12 −0 ggml-cuda.cu
+19 −23 ggml-cuda.h
+3 −0 ggml-metal.h
+15 −0 ggml-metal.m
+2 −1 ggml-metal.metal
+1,121 −135 ggml.c
+124 −4 ggml.h
+718 −0 gguf.py
+0 −553 llama-util.h
+2,257 −1,395 llama.cpp
+141 −122 llama.h
+1 −0 models/.editorconfig
+ models/ggml-vocab-llama.gguf
+ models/ggml-vocab.bin
+30 −10 tests/CMakeLists.txt
+2 −1 tests/test-grammar-parser.cpp
+403 −0 tests/test-llama-grammar.cpp
+43 −17 tests/test-tokenizer-0.cpp
+131 −0 tests/test-tokenizer-1.cpp
scripts/bootstrap.sh (11 changes: 5 additions & 6 deletions)
@@ -10,25 +10,24 @@ cp ./llama.cpp/ggml-metal.m ./cpp/ggml-metal.m
cp ./llama.cpp/ggml-metal.metal ./cpp/ggml-metal.metal
cp ./llama.cpp/ggml-alloc.h ./cpp/ggml-alloc.h
cp ./llama.cpp/ggml-alloc.c ./cpp/ggml-alloc.c
cp ./llama.cpp/llama-util.h ./cpp/llama-util.h
cp ./llama.cpp/llama.h ./cpp/llama.h
cp ./llama.cpp/llama.cpp ./cpp/llama.cpp
cp ./llama.cpp/k_quants.h ./cpp/k_quants.h
cp ./llama.cpp/k_quants.c ./cpp/k_quants.c
cp ./llama.cpp/examples/common.h ./cpp/common.h
cp ./llama.cpp/examples/common.cpp ./cpp/common.cpp
cp ./llama.cpp/examples/grammar-parser.h ./cpp/grammar-parser.h
cp ./llama.cpp/examples/grammar-parser.cpp ./cpp/grammar-parser.cpp
cp ./llama.cpp/common/common.h ./cpp/common.h
cp ./llama.cpp/common/common.cpp ./cpp/common.cpp
cp ./llama.cpp/common/grammar-parser.h ./cpp/grammar-parser.h
cp ./llama.cpp/common/grammar-parser.cpp ./cpp/grammar-parser.cpp

# List of files to process
files=(
"./cpp/ggml.h"
"./cpp/ggml.c"
"./cpp/common.cpp"
"./cpp/ggml-metal.h"
"./cpp/ggml-metal.m"
"./cpp/llama.h"
"./cpp/llama.cpp"
"./cpp/llama-util.h"
"./cpp/k_quants.h"
"./cpp/k_quants.c"
"./cpp/ggml-alloc.h"
scripts/ggml-metal.m.patch (16 changes: 8 additions & 8 deletions)
@@ -1,9 +1,9 @@
--- ggml-metal-orig.m 2023-08-17 10:15:26
+++ ggml-metal.m 2023-08-17 10:14:18
--- ggml-metal-orig.m 2023-08-22 12:22:42
+++ ggml-metal.m 2023-08-22 12:22:43
@@ -218,13 +218,13 @@
#undef LM_GGML_METAL_ADD_KERNEL
}

- fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ // fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
@@ -17,20 +17,20 @@
+ // } else {
+ // fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+ // }

return ctx;
}
@@ -351,15 +351,15 @@
@@ -366,15 +366,15 @@
}
}

- fprintf(stderr, ", (%8.2f / %8.2f)",
- ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
- ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+ // fprintf(stderr, ", (%8.2f / %8.2f)",
+ // ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+ // ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

- if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
- fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
- } else {
@@ -42,5 +42,5 @@
+ // fprintf(stderr, "\n");
+ // }
}

return true;
src/NativeRNLlama.ts (2 changes: 0 additions & 2 deletions)
@@ -21,8 +21,6 @@ export type NativeContextParams = {
lora?: string // lora_adaptor
lora_base?: string

n_gqa?: number
rms_norm_eps?: number
rope_freq_base?: number
rope_freq_scale?: number
}
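With `n_gqa` and `rms_norm_eps` dropped from `NativeContextParams`, the remaining per-context tuning options on the JS side are the RoPE frequency settings; GGUF models are expected to carry the removed values as metadata. A sketch of the trimmed-down params with placeholder values, assuming llama.rn's `initLlama` entry point:

```ts
import { initLlama } from 'llama.rn'

// Placeholder values, assuming the usual llama.rn context params.
async function createContext(modelPath: string) {
  return initLlama({
    model: modelPath,        // now expected to be a GGUF file
    n_ctx: 2048,
    rope_freq_base: 10000,   // RoPE tuning is still exposed
    rope_freq_scale: 1.0,
    // n_gqa and rms_norm_eps are no longer passed from JS after this sync.
  })
}
```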