Skip to content

Commit 47cb93d

Browse files
committed
simplify conversion by using huggingface autotokenizer for everything; use models dir for gguf files only
1 parent 14f8c2a commit 47cb93d

File tree

10 files changed

+216
-319
lines changed

10 files changed

+216
-319
lines changed

.gitignore

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,3 @@
1-
.vscode
2-
3-
build
4-
models/*/*
5-
6-
compile_commands.json
7-
8-
.exrc
9-
.cache
10-
.DS_Store
11-
12-
__pycache__
13-
14-
models/*/
15-
16-
error_logs.txt
2+
build
3+
models

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,17 @@ endif()
8989

9090
add_subdirectory(ggml)
9191
add_subdirectory(examples)
92-
add_subdirectory(models)
9392

93+
# bert library
9494
add_library(bert bert.cpp bert.h)
95-
9695
target_include_directories(bert PUBLIC .)
9796
target_compile_features(bert PUBLIC cxx_std_20)
9897
target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
9998

10099
# for shared libraries
101100
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
102101
set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
102+
103+
# quantization
104+
add_executable(quantize quantize.cpp)
105+
target_link_libraries(quantize PRIVATE bert ggml)

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@ pip install -r requirements.txt
1515
To fetch models from `huggingface` and convert them to `gguf` format run the following
1616
```sh
1717
cd models
18-
python download.py BAAI/bge-base-en-v1.5 # or any other model
19-
python convert.py bge-base-en-v1.5 f16
20-
python convert.py bge-base-en-v1.5 f32
18+
python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
19+
python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
2120
```
2221

2322
### Build
@@ -46,10 +45,10 @@ make -C build -j
4645
All executables are placed in `build/bin`. To run inference on a given text, run
4746
```sh
4847
# CPU / CUDA
49-
build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
48+
build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
5049

5150
# Metal
52-
GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
51+
GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
5352
```
5453
To force CPU usage, add the flag `-c`.
5554

@@ -58,7 +57,7 @@ To force CPU usage, add the flag `-c`.
5857
You can also run everything through Python, which is particularly useful for batch inference. For instance,
5958
```python
6059
import bert
61-
mod = bert.BertModel('models/bge-base-en-v1.5/ggml-model-f16.gguf')
60+
mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
6261
emb = mod.embed(batch)
6362
```
6463
where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
@@ -67,6 +66,6 @@ where `batch` is a list of strings and `emb` is a `numpy` array of embedding vec
6766

6867
You can quantize models with the command
6968
```sh
70-
build/bin/quantize models/bge-base-en-v1.5/ggml-model-f32.gguf models/bge-base-en-v1.5/ggml-model-q8_0.gguf q8_0
69+
build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
7170
```
7271
or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.

bert.cpp

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,7 @@ Forked with gratitude from:
2626

2727
#define BERT_MAX_NODES 4096
2828

29-
// model keys
30-
31-
#define KEY_FTYPE "general.file_type"
32-
#define KEY_NAME "general.name"
33-
#define KEY_DESCRIPTION "general.description"
34-
35-
#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
36-
#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
37-
#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
38-
#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
39-
#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
40-
#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
41-
42-
const int verbosity = 0;
29+
const int verbosity = 1;
4330

4431
//
4532
// utilities to get data from a gguf file
@@ -73,9 +60,11 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
7360
return gguf_get_val_f32(ctx, i);
7461
}
7562

76-
static std::string get_str(const gguf_context * ctx, const std::string & key) {
77-
const int i = get_key_idx(ctx, key.c_str());
78-
63+
static std::string get_str(const gguf_context * ctx, const std::string & key, const std::string & def = "") {
64+
const int i = gguf_find_key(ctx, key.c_str());
65+
if (i == -1) {
66+
return def;
67+
}
7968
return gguf_get_val_str(ctx, i);
8069
}
8170

@@ -325,16 +314,22 @@ bert_tokens bert_tokenize(struct bert_ctx * ctx, bert_string text, uint64_t n_ma
325314
return tokens;
326315
}
327316

328-
bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug) {
317+
bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug = false) {
329318
const bert_token bos_id = ctx->vocab.bos_id;
330319
const bert_token eos_id = ctx->vocab.eos_id;
320+
321+
const std::string word_prefix = ctx->vocab.word_prefix;
331322
const std::string subword_prefix = ctx->vocab.subword_prefix;
332-
const std::string prefix = subword_prefix + subword_prefix;
323+
const uint32_t word_prefix_len = word_prefix.size();
324+
const uint32_t subword_prefix_len = subword_prefix.size();
333325

334326
bert_string str = "";
335327
for (const uint64_t &t : tokens) {
336328
std::string token = bert_vocab_id_to_token(ctx, t);
337-
bool subword = token.find(prefix) == 0;
329+
bool subword = (
330+
(subword_prefix_len > 0 && token.find(subword_prefix) == 0) ||
331+
(word_prefix_len > 0 && token.find(word_prefix) != 0)
332+
);
338333
if (debug) {
339334
if ((str.size() > 0) && !subword) {
340335
str += " ";
@@ -345,12 +340,12 @@ bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debu
345340
continue;
346341
}
347342
if (subword) {
348-
str += token.substr(2);
343+
str += token.substr(subword_prefix_len);
349344
} else {
350345
if (str.size() > 0) {
351346
str += " ";
352347
}
353-
str += token;
348+
str += token.substr(word_prefix_len);
354349
}
355350
}
356351
}
@@ -462,8 +457,11 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
462457
vocab.unk_id = get_i32(ctx_gguf, KEY_UNK_ID);
463458
vocab.bos_id = get_i32(ctx_gguf, KEY_BOS_ID);
464459
vocab.eos_id = get_i32(ctx_gguf, KEY_EOS_ID);
460+
461+
vocab.word_prefix = get_str(ctx_gguf, KEY_WORD_PREFIX);
465462
vocab.subword_prefix = get_str(ctx_gguf, KEY_SUBWORD_PREFIX);
466-
uint32_t prefix_len = vocab.subword_prefix.size();
463+
uint32_t word_prefix_len = vocab.word_prefix.size();
464+
uint32_t subword_prefix_len = vocab.subword_prefix.size();
467465

468466
const int token_idx = gguf_find_key(ctx_gguf, KEY_TOKEN_LIST);
469467
const int n_vocab = gguf_get_arr_n(ctx_gguf, token_idx);
@@ -472,20 +470,25 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
472470
std::string word = gguf_get_arr_str(ctx_gguf, token_idx, i);
473471
vocab.tokens.push_back(word);
474472

475-
if (word.find(vocab.subword_prefix) == 0) {
476-
vocab.subword_token_to_id[word.substr(prefix_len)] = i;
477-
vocab._id_to_subword_token[i] = word;
478-
}
473+
bool subword = (
474+
(subword_prefix_len > 0 && word.find(vocab.subword_prefix) == 0) ||
475+
(word_prefix_len > 0 && word.find(vocab.word_prefix) != 0)
476+
);
479477

480-
if (vocab.token_to_id.count(word) == 0) {
481-
vocab.token_to_id[word] = i;
478+
if (subword) {
479+
vocab.subword_token_to_id[word.substr(subword_prefix_len)] = i;
480+
vocab._id_to_subword_token[i] = word;
481+
} else {
482+
vocab.token_to_id[word.substr(word_prefix_len)] = i;
482483
vocab._id_to_token[i] = word;
483484
}
484485
}
485486

486487
if (verbosity >= 1) {
487488
fprintf(stderr, "%s: TOKENIZER\n", __func__);
488489
fprintf(stderr, "%s: vocab size: %d\n", __func__, n_vocab);
490+
fprintf(stderr, "%s: word_prefix: %s\n", __func__, vocab.word_prefix.c_str());
491+
fprintf(stderr, "%s: subword_prefix: %s\n", __func__, vocab.subword_prefix.c_str());
489492
fprintf(stderr, "%s: pad_id = %d\n", __func__, vocab.pad_id);
490493
fprintf(stderr, "%s: unk_id = %d\n", __func__, vocab.unk_id);
491494
fprintf(stderr, "%s: bos_id = %d\n", __func__, vocab.bos_id);
@@ -627,13 +630,6 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
627630
bert_layer & layer = model.layers[i];
628631
std::string pre = "encoder.layer." + std::to_string(i) + ".";
629632

630-
// normalization
631-
layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
632-
layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
633-
634-
layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
635-
layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
636-
637633
// attention
638634
layer.q_w = get_tensor(new_bert->ctx_data, pre + "attention.self.query.weight");
639635
layer.q_b = get_tensor(new_bert->ctx_data, pre + "attention.self.query.bias");
@@ -645,12 +641,18 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
645641
layer.o_w = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.weight");
646642
layer.o_b = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.bias");
647643

644+
layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
645+
layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
646+
648647
// ff
649648
layer.ff_i_w = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.weight");
650649
layer.ff_i_b = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.bias");
651650

652651
layer.ff_o_w = get_tensor(new_bert->ctx_data, pre + "output.dense.weight");
653652
layer.ff_o_b = get_tensor(new_bert->ctx_data, pre + "output.dense.bias");
653+
654+
layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
655+
layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
654656
}
655657
}
656658

bert.h

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,22 @@
1313
#include <fstream>
1414
#include <map>
1515

16+
// model keys
17+
18+
#define KEY_FTYPE "general.file_type"
19+
#define KEY_NAME "general.name"
20+
#define KEY_DESCRIPTION "general.description"
21+
22+
#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
23+
#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
24+
#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
25+
#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
26+
#define KEY_WORD_PREFIX "tokenizer.ggml.word_prefix"
27+
#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
28+
#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
29+
30+
// api
31+
1632
#define BERT_API __attribute__ ((visibility ("default")))
1733

1834
#ifdef __cplusplus
@@ -45,13 +61,6 @@ struct bert_hparams {
4561
};
4662

4763
struct bert_layer {
48-
// normalization
49-
struct ggml_tensor *ln_att_w;
50-
struct ggml_tensor *ln_att_b;
51-
52-
struct ggml_tensor *ln_out_w;
53-
struct ggml_tensor *ln_out_b;
54-
5564
// attention
5665
struct ggml_tensor *q_w;
5766
struct ggml_tensor *q_b;
@@ -63,19 +72,27 @@ struct bert_layer {
6372
struct ggml_tensor *o_w;
6473
struct ggml_tensor *o_b;
6574

75+
struct ggml_tensor *ln_att_w;
76+
struct ggml_tensor *ln_att_b;
77+
6678
// ff
6779
struct ggml_tensor *ff_i_w;
6880
struct ggml_tensor *ff_i_b;
6981

7082
struct ggml_tensor *ff_o_w;
7183
struct ggml_tensor *ff_o_b;
84+
85+
struct ggml_tensor *ln_out_w;
86+
struct ggml_tensor *ln_out_b;
7287
};
7388

7489
struct bert_vocab {
7590
bert_token pad_id;
7691
bert_token unk_id;
7792
bert_token bos_id;
7893
bert_token eos_id;
94+
95+
std::string word_prefix;
7996
std::string subword_prefix;
8097

8198
std::vector<std::string> tokens;

0 commit comments

Comments
 (0)