simplify conversion by using huggingface autotokenizer for everything; use models dir for gguf files only
iamlemec committed Feb 5, 2024
1 parent 14f8c2a commit 47cb93d
Showing 10 changed files with 216 additions and 319 deletions.
17 changes: 2 additions & 15 deletions .gitignore
@@ -1,16 +1,3 @@
 .vscode
-
-build
-models/*/*
-
-compile_commands.json
-
-.exrc
-.cache
-.DS_Store
-
-__pycache__
-
-models/*/
-
-error_logs.txt
+build
+models
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -89,14 +89,17 @@ endif()
 add_subdirectory(ggml)
 add_subdirectory(examples)
-add_subdirectory(models)
 
 # bert library
 add_library(bert bert.cpp bert.h)
-
 target_include_directories(bert PUBLIC .)
 target_compile_features(bert PUBLIC cxx_std_20)
 target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
+
+# for shared libraries
+set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 # quantization
 add_executable(quantize quantize.cpp)
 target_link_libraries(quantize PRIVATE bert ggml)
13 changes: 6 additions & 7 deletions README.md
@@ -15,9 +15,8 @@ pip install -r requirements.txt
 To fetch models from `huggingface` and convert them to `gguf` format run the following
 ```sh
-cd models
-python download.py BAAI/bge-base-en-v1.5 # or any other model
-python convert.py bge-base-en-v1.5 f16
-python convert.py bge-base-en-v1.5 f32
+python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
+python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
 ```
 
 ### Build
@@ -46,10 +45,10 @@ make -C build -j
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
 # CPU / CUDA
-build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
 
 # Metal
-GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
 ```
 To force CPU usage, add the flag `-c`.
 
@@ -58,7 +57,7 @@ To force CPU usage, add the flag `-c`.
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
 import bert
-mod = bert.BertModel('models/bge-base-en-v1.5/ggml-model-f16.gguf')
+mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
@@ -67,6 +66,6 @@ where `batch` is a list of strings and `emb` is a `numpy` array of embedding vec
 
 You can quantize models with the command
 ```sh
-build/bin/quantize models/bge-base-en-v1.5/ggml-model-f32.gguf models/bge-base-en-v1.5/ggml-model-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.
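As a quick illustration of the batch API referenced in the README hunk above, a minimal sketch: the example sentences are invented, and the (2, 768) output shape assumes a bge-base-sized model.

```python
import bert

# load a converted gguf model (path follows the README example above)
mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')

# embed a whole batch of strings in one call
batch = ["the quick brown fox", "jumps over the lazy dog"]
emb = mod.embed(batch)

print(emb.shape)  # (2, 768): one row of embeddings per input string
```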
74 changes: 38 additions & 36 deletions bert.cpp
@@ -26,20 +26,7 @@ Forked with gratitude from:
 
 #define BERT_MAX_NODES 4096
 
-// model keys
-
-#define KEY_FTYPE "general.file_type"
-#define KEY_NAME "general.name"
-#define KEY_DESCRIPTION "general.description"
-
-#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
-#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
-#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
-#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
-#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
-#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
-
-const int verbosity = 0;
+const int verbosity = 1;
 
 //
 // utilities to get data from a gguf file
@@ -73,9 +60,11 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
     return gguf_get_val_f32(ctx, i);
 }
 
-static std::string get_str(const gguf_context * ctx, const std::string & key) {
-    const int i = get_key_idx(ctx, key.c_str());
-
+static std::string get_str(const gguf_context * ctx, const std::string & key, const std::string & def = "") {
+    const int i = gguf_find_key(ctx, key.c_str());
+    if (i == -1) {
+        return def;
+    }
     return gguf_get_val_str(ctx, i);
 }
 
@@ -325,16 +314,22 @@ bert_tokens bert_tokenize(struct bert_ctx * ctx, bert_string text, uint64_t n_ma
     return tokens;
 }
 
-bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug) {
+bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug = false) {
     const bert_token bos_id = ctx->vocab.bos_id;
     const bert_token eos_id = ctx->vocab.eos_id;
 
+    const std::string word_prefix = ctx->vocab.word_prefix;
     const std::string subword_prefix = ctx->vocab.subword_prefix;
-    const std::string prefix = subword_prefix + subword_prefix;
+    const uint32_t word_prefix_len = word_prefix.size();
+    const uint32_t subword_prefix_len = subword_prefix.size();
+
     bert_string str = "";
     for (const uint64_t &t : tokens) {
         std::string token = bert_vocab_id_to_token(ctx, t);
-        bool subword = token.find(prefix) == 0;
+        bool subword = (
+            (subword_prefix_len > 0 && token.find(subword_prefix) == 0) ||
+            (word_prefix_len > 0 && token.find(word_prefix) != 0)
+        );
         if (debug) {
             if ((str.size() > 0) && !subword) {
                 str += " ";
@@ -345,12 +340,12 @@ bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debu
             continue;
         }
         if (subword) {
-            str += token.substr(2);
+            str += token.substr(subword_prefix_len);
         } else {
             if (str.size() > 0) {
                 str += " ";
             }
-            str += token;
+            str += token.substr(word_prefix_len);
         }
     }
 }
@@ -462,8 +457,11 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
         vocab.unk_id = get_i32(ctx_gguf, KEY_UNK_ID);
         vocab.bos_id = get_i32(ctx_gguf, KEY_BOS_ID);
         vocab.eos_id = get_i32(ctx_gguf, KEY_EOS_ID);
+
+        vocab.word_prefix = get_str(ctx_gguf, KEY_WORD_PREFIX);
         vocab.subword_prefix = get_str(ctx_gguf, KEY_SUBWORD_PREFIX);
-        uint32_t prefix_len = vocab.subword_prefix.size();
+        uint32_t word_prefix_len = vocab.word_prefix.size();
+        uint32_t subword_prefix_len = vocab.subword_prefix.size();
 
         const int token_idx = gguf_find_key(ctx_gguf, KEY_TOKEN_LIST);
         const int n_vocab = gguf_get_arr_n(ctx_gguf, token_idx);
@@ -472,20 +470,25 @@
             std::string word = gguf_get_arr_str(ctx_gguf, token_idx, i);
             vocab.tokens.push_back(word);
 
-            if (word.find(vocab.subword_prefix) == 0) {
-                vocab.subword_token_to_id[word.substr(prefix_len)] = i;
-                vocab._id_to_subword_token[i] = word;
-            }
-            if (vocab.token_to_id.count(word) == 0) {
-                vocab.token_to_id[word] = i;
+            bool subword = (
+                (subword_prefix_len > 0 && word.find(vocab.subword_prefix) == 0) ||
+                (word_prefix_len > 0 && word.find(vocab.word_prefix) != 0)
+            );
+
+            if (subword) {
+                vocab.subword_token_to_id[word.substr(subword_prefix_len)] = i;
+                vocab._id_to_subword_token[i] = word;
+            } else {
+                vocab.token_to_id[word.substr(word_prefix_len)] = i;
                 vocab._id_to_token[i] = word;
             }
         }
 
         if (verbosity >= 1) {
             fprintf(stderr, "%s: TOKENIZER\n", __func__);
             fprintf(stderr, "%s: vocab size: %d\n", __func__, n_vocab);
+            fprintf(stderr, "%s: word_prefix: %s\n", __func__, vocab.word_prefix.c_str());
             fprintf(stderr, "%s: subword_prefix: %s\n", __func__, vocab.subword_prefix.c_str());
             fprintf(stderr, "%s: pad_id = %d\n", __func__, vocab.pad_id);
             fprintf(stderr, "%s: unk_id = %d\n", __func__, vocab.unk_id);
             fprintf(stderr, "%s: bos_id = %d\n", __func__, vocab.bos_id);
@@ -627,13 +630,6 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
             bert_layer & layer = model.layers[i];
             std::string pre = "encoder.layer." + std::to_string(i) + ".";
 
-            // normalization
-            layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
-            layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
-
-            layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
-            layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
-
             // attention
             layer.q_w = get_tensor(new_bert->ctx_data, pre + "attention.self.query.weight");
             layer.q_b = get_tensor(new_bert->ctx_data, pre + "attention.self.query.bias");
@@ -645,12 +641,18 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
             layer.o_w = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.weight");
             layer.o_b = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.bias");
 
+            layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
+            layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
+
             // ff
             layer.ff_i_w = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.weight");
             layer.ff_i_b = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.bias");
 
             layer.ff_o_w = get_tensor(new_bert->ctx_data, pre + "output.dense.weight");
             layer.ff_o_b = get_tensor(new_bert->ctx_data, pre + "output.dense.bias");
+
+            layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
+            layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
         }
     }
 
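The new `subword` test above replaces a hard-coded `"##"`/`substr(2)` convention with prefixes read from gguf metadata. A minimal Python sketch of the same rule (the prefixes `"##"` and `"\u2581"` below are illustrative of WordPiece-style and SentencePiece-style vocabularies, not read from any model):

```python
def is_subword(token: str, word_prefix: str, subword_prefix: str) -> bool:
    # WordPiece-style vocabs mark subword continuations explicitly ("##ing"),
    # so a token is a subword iff it starts with the subword prefix.
    # SentencePiece-style vocabs mark word starts instead ("\u2581walk"),
    # so a token is a subword iff it lacks the word prefix.
    return ((len(subword_prefix) > 0 and token.startswith(subword_prefix)) or
            (len(word_prefix) > 0 and not token.startswith(word_prefix)))

# WordPiece-style: subword prefix "##", no word prefix
assert is_subword("##ing", "", "##")
assert not is_subword("walk", "", "##")

# SentencePiece-style: word prefix "\u2581", no subword prefix
assert is_subword("ing", "\u2581", "")
assert not is_subword("\u2581walk", "\u2581", "")
```

Storing both prefixes in the gguf metadata lets one code path cover vocabularies that mark subword continuations and vocabularies that mark word beginnings.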
31 changes: 24 additions & 7 deletions bert.h
@@ -13,6 +13,22 @@
 #include <fstream>
 #include <map>
 
+// model keys
+
+#define KEY_FTYPE "general.file_type"
+#define KEY_NAME "general.name"
+#define KEY_DESCRIPTION "general.description"
+
+#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
+#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
+#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
+#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
+#define KEY_WORD_PREFIX "tokenizer.ggml.word_prefix"
+#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
+#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
+
+// api
+
 #define BERT_API __attribute__ ((visibility ("default")))
 
 #ifdef __cplusplus
@@ -45,13 +61,6 @@ struct bert_hparams {
 };
 
 struct bert_layer {
-    // normalization
-    struct ggml_tensor *ln_att_w;
-    struct ggml_tensor *ln_att_b;
-
-    struct ggml_tensor *ln_out_w;
-    struct ggml_tensor *ln_out_b;
-
     // attention
     struct ggml_tensor *q_w;
     struct ggml_tensor *q_b;
@@ -63,19 +72,27 @@ struct bert_layer {
     struct ggml_tensor *o_w;
     struct ggml_tensor *o_b;
 
+    struct ggml_tensor *ln_att_w;
+    struct ggml_tensor *ln_att_b;
+
     // ff
     struct ggml_tensor *ff_i_w;
     struct ggml_tensor *ff_i_b;
 
     struct ggml_tensor *ff_o_w;
     struct ggml_tensor *ff_o_b;
+
+    struct ggml_tensor *ln_out_w;
+    struct ggml_tensor *ln_out_b;
 };
 
 struct bert_vocab {
     bert_token pad_id;
     bert_token unk_id;
     bert_token bos_id;
     bert_token eos_id;
+
+    std::string word_prefix;
     std::string subword_prefix;
 
     std::vector<std::string> tokens;
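The key constants above have a converter-side counterpart. A hedged sketch of emitting the two new prefix keys with the `gguf` Python package; the `GGUFWriter` calls are an assumption about that package's API, not code from this repo's convert.py, and the output path is made up:

```python
from gguf import GGUFWriter

# write only the tokenizer metadata this commit reads back via get_str();
# a real converter would also emit hparams, the token list, and tensors
writer = GGUFWriter("models/example-f16.gguf", arch="bert")
writer.add_string("tokenizer.ggml.word_prefix", "")       # KEY_WORD_PREFIX
writer.add_string("tokenizer.ggml.subword_prefix", "##")  # KEY_SUBWORD_PREFIX

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```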