simplify conversion by using huggingface autotokenizer for everything; use models dir for gguf files only
iamlemec committed Feb 5, 2024
1 parent 14f8c2a commit 47cb93d
Showing 10 changed files with 216 additions and 319 deletions.
17 changes: 2 additions & 15 deletions .gitignore
@@ -1,16 +1,3 @@
 .vscode
-
-build
-models/*/*
-
-compile_commands.json
-
-.exrc
-.cache
-.DS_Store
-
-__pycache__
-
-models/*/
-
-error_logs.txt
+build
+models
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -89,14 +89,17 @@ endif()
 add_subdirectory(ggml)
 add_subdirectory(examples)
-add_subdirectory(models)
 
 # bert library
 add_library(bert bert.cpp bert.h)
-
 target_include_directories(bert PUBLIC .)
 target_compile_features(bert PUBLIC cxx_std_20)
 target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
+
+# for shared libraries
+set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 # quantization
 add_executable(quantize quantize.cpp)
 target_link_libraries(quantize PRIVATE bert ggml)
13 changes: 6 additions & 7 deletions README.md
@@ -15,9 +15,8 @@ pip install -r requirements.txt
 To fetch models from `huggingface` and convert them to `gguf` format run the following
 ```sh
-cd models
-python download.py BAAI/bge-base-en-v1.5 # or any other model
-python convert.py bge-base-en-v1.5 f16
-python convert.py bge-base-en-v1.5 f32
+python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
+python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
 ```
 
 ### Build
@@ -46,10 +45,10 @@ make -C build -j
 All executables are placed in `build/bin`. To run inference on a given text, run
 ```sh
 # CPU / CUDA
-build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
 
 # Metal
-GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5/ggml-model-f16.gguf -p "Hello world"
+GGML_METAL_PATH_RESOURCES=build/bin/ build/bin/main -m models/bge-base-en-v1.5-f16.gguf -p "Hello world"
 ```
 To force CPU usage, add the flag `-c`.
 
@@ -58,7 +57,7 @@ To force CPU usage, add the flag `-c`.
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
 import bert
-mod = bert.BertModel('models/bge-base-en-v1.5/ggml-model-f16.gguf')
+mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
@@ -67,6 +66,6 @@ where `batch` is a list of strings and `emb` is a `numpy` array of embedding vec
 
 You can quantize models with the command
 ```sh
-build/bin/quantize models/bge-base-en-v1.5/ggml-model-f32.gguf models/bge-base-en-v1.5/ggml-model-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.
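As a quick illustration of the batch API referenced in the README hunk above, a minimal sketch: the example sentences are invented, and the (2, 768) output shape assumes a bge-base-sized model.

```python
import bert

# load a converted gguf model (path follows the README example above)
mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')

# embed a whole batch of strings in one call
batch = ["the quick brown fox", "jumps over the lazy dog"]
emb = mod.embed(batch)

print(emb.shape)  # (2, 768): one row of embeddings per input string
```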
74 changes: 38 additions & 36 deletions bert.cpp
@@ -26,20 +26,7 @@ Forked with gratitude from:
 
 #define BERT_MAX_NODES 4096
 
-// model keys
-
-#define KEY_FTYPE "general.file_type"
-#define KEY_NAME "general.name"
-#define KEY_DESCRIPTION "general.description"
-
-#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
-#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
-#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
-#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
-#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
-#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
-
-const int verbosity = 0;
+const int verbosity = 1;
 
 //
 // utilities to get data from a gguf file
@@ -73,9 +60,11 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
     return gguf_get_val_f32(ctx, i);
 }
 
-static std::string get_str(const gguf_context * ctx, const std::string & key) {
-    const int i = get_key_idx(ctx, key.c_str());
-
+static std::string get_str(const gguf_context * ctx, const std::string & key, const std::string & def = "") {
+    const int i = gguf_find_key(ctx, key.c_str());
+    if (i == -1) {
+        return def;
+    }
     return gguf_get_val_str(ctx, i);
 }
 
@@ -325,16 +314,22 @@ bert_tokens bert_tokenize(struct bert_ctx * ctx, bert_string text, uint64_t n_ma
     return tokens;
 }
 
-bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug) {
+bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debug = false) {
     const bert_token bos_id = ctx->vocab.bos_id;
     const bert_token eos_id = ctx->vocab.eos_id;
 
+    const std::string word_prefix = ctx->vocab.word_prefix;
     const std::string subword_prefix = ctx->vocab.subword_prefix;
-    const std::string prefix = subword_prefix + subword_prefix;
+    const uint32_t word_prefix_len = word_prefix.size();
+    const uint32_t subword_prefix_len = subword_prefix.size();
+
     bert_string str = "";
     for (const uint64_t &t : tokens) {
         std::string token = bert_vocab_id_to_token(ctx, t);
-        bool subword = token.find(prefix) == 0;
+        bool subword = (
+            (subword_prefix_len > 0 && token.find(subword_prefix) == 0) ||
+            (word_prefix_len > 0 && token.find(word_prefix) != 0)
+        );
         if (debug) {
             if ((str.size() > 0) && !subword) {
                 str += " ";
@@ -345,12 +340,12 @@ bert_string bert_detokenize(struct bert_ctx * ctx, bert_tokens tokens, bool debu
             continue;
         }
         if (subword) {
-            str += token.substr(2);
+            str += token.substr(subword_prefix_len);
         } else {
             if (str.size() > 0) {
                 str += " ";
             }
-            str += token;
+            str += token.substr(word_prefix_len);
         }
     }
 }
@@ -462,8 +457,11 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
         vocab.unk_id = get_i32(ctx_gguf, KEY_UNK_ID);
         vocab.bos_id = get_i32(ctx_gguf, KEY_BOS_ID);
         vocab.eos_id = get_i32(ctx_gguf, KEY_EOS_ID);
+
+        vocab.word_prefix = get_str(ctx_gguf, KEY_WORD_PREFIX);
         vocab.subword_prefix = get_str(ctx_gguf, KEY_SUBWORD_PREFIX);
-        uint32_t prefix_len = vocab.subword_prefix.size();
+        uint32_t word_prefix_len = vocab.word_prefix.size();
+        uint32_t subword_prefix_len = vocab.subword_prefix.size();
 
         const int token_idx = gguf_find_key(ctx_gguf, KEY_TOKEN_LIST);
         const int n_vocab = gguf_get_arr_n(ctx_gguf, token_idx);
@@ -472,20 +470,25 @@
             std::string word = gguf_get_arr_str(ctx_gguf, token_idx, i);
             vocab.tokens.push_back(word);
 
-            if (word.find(vocab.subword_prefix) == 0) {
-                vocab.subword_token_to_id[word.substr(prefix_len)] = i;
-                vocab._id_to_subword_token[i] = word;
-            }
-            if (vocab.token_to_id.count(word) == 0) {
-                vocab.token_to_id[word] = i;
+            bool subword = (
+                (subword_prefix_len > 0 && word.find(vocab.subword_prefix) == 0) ||
+                (word_prefix_len > 0 && word.find(vocab.word_prefix) != 0)
+            );
+
+            if (subword) {
+                vocab.subword_token_to_id[word.substr(subword_prefix_len)] = i;
+                vocab._id_to_subword_token[i] = word;
+            } else {
+                vocab.token_to_id[word.substr(word_prefix_len)] = i;
                 vocab._id_to_token[i] = word;
             }
         }
 
         if (verbosity >= 1) {
             fprintf(stderr, "%s: TOKENIZER\n", __func__);
             fprintf(stderr, "%s: vocab size: %d\n", __func__, n_vocab);
+            fprintf(stderr, "%s: word_prefix: %s\n", __func__, vocab.word_prefix.c_str());
             fprintf(stderr, "%s: subword_prefix: %s\n", __func__, vocab.subword_prefix.c_str());
             fprintf(stderr, "%s: pad_id = %d\n", __func__, vocab.pad_id);
             fprintf(stderr, "%s: unk_id = %d\n", __func__, vocab.unk_id);
             fprintf(stderr, "%s: bos_id = %d\n", __func__, vocab.bos_id);
@@ -627,13 +630,6 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
             bert_layer & layer = model.layers[i];
             std::string pre = "encoder.layer." + std::to_string(i) + ".";
 
-            // normalization
-            layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
-            layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
-
-            layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
-            layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
-
             // attention
             layer.q_w = get_tensor(new_bert->ctx_data, pre + "attention.self.query.weight");
             layer.q_b = get_tensor(new_bert->ctx_data, pre + "attention.self.query.bias");
@@ -645,12 +641,18 @@ struct bert_ctx * bert_load_from_file(const char *fname, bool use_cpu) {
             layer.o_w = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.weight");
             layer.o_b = get_tensor(new_bert->ctx_data, pre + "attention.output.dense.bias");
 
+            layer.ln_att_w = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.weight");
+            layer.ln_att_b = get_tensor(new_bert->ctx_data, pre + "attention.output.LayerNorm.bias");
+
             // ff
             layer.ff_i_w = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.weight");
             layer.ff_i_b = get_tensor(new_bert->ctx_data, pre + "intermediate.dense.bias");
 
             layer.ff_o_w = get_tensor(new_bert->ctx_data, pre + "output.dense.weight");
             layer.ff_o_b = get_tensor(new_bert->ctx_data, pre + "output.dense.bias");
+
+            layer.ln_out_w = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.weight");
+            layer.ln_out_b = get_tensor(new_bert->ctx_data, pre + "output.LayerNorm.bias");
         }
     }
 
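The new `subword` test above replaces a hard-coded `"##"`/`substr(2)` convention with prefixes read from gguf metadata. A minimal Python sketch of the same rule (the prefixes `"##"` and `"\u2581"` below are illustrative of WordPiece-style and SentencePiece-style vocabularies, not read from any model):

```python
def is_subword(token: str, word_prefix: str, subword_prefix: str) -> bool:
    # WordPiece-style vocabs mark subword continuations explicitly ("##ing"),
    # so a token is a subword iff it starts with the subword prefix.
    # SentencePiece-style vocabs mark word starts instead ("\u2581walk"),
    # so a token is a subword iff it lacks the word prefix.
    return ((len(subword_prefix) > 0 and token.startswith(subword_prefix)) or
            (len(word_prefix) > 0 and not token.startswith(word_prefix)))

# WordPiece-style: subword prefix "##", no word prefix
assert is_subword("##ing", "", "##")
assert not is_subword("walk", "", "##")

# SentencePiece-style: word prefix "\u2581", no subword prefix
assert is_subword("ing", "\u2581", "")
assert not is_subword("\u2581walk", "\u2581", "")
```

Storing both prefixes in the gguf metadata lets one code path cover vocabularies that mark subword continuations and vocabularies that mark word beginnings.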
31 changes: 24 additions & 7 deletions bert.h
@@ -13,6 +13,22 @@
 #include <fstream>
 #include <map>
 
+// model keys
+
+#define KEY_FTYPE "general.file_type"
+#define KEY_NAME "general.name"
+#define KEY_DESCRIPTION "general.description"
+
+#define KEY_PAD_ID "tokenizer.ggml.padding_token_id"
+#define KEY_UNK_ID "tokenizer.ggml.unknown_token_id"
+#define KEY_BOS_ID "tokenizer.ggml.bos_token_id"
+#define KEY_EOS_ID "tokenizer.ggml.eos_token_id"
+#define KEY_WORD_PREFIX "tokenizer.ggml.word_prefix"
+#define KEY_SUBWORD_PREFIX "tokenizer.ggml.subword_prefix"
+#define KEY_TOKEN_LIST "tokenizer.ggml.tokens"
+
+// api
+
 #define BERT_API __attribute__ ((visibility ("default")))
 
 #ifdef __cplusplus
@@ -45,13 +61,6 @@ struct bert_hparams {
 };
 
 struct bert_layer {
-    // normalization
-    struct ggml_tensor *ln_att_w;
-    struct ggml_tensor *ln_att_b;
-
-    struct ggml_tensor *ln_out_w;
-    struct ggml_tensor *ln_out_b;
-
     // attention
     struct ggml_tensor *q_w;
     struct ggml_tensor *q_b;
@@ -63,19 +72,27 @@ struct bert_layer {
     struct ggml_tensor *o_w;
     struct ggml_tensor *o_b;
 
+    struct ggml_tensor *ln_att_w;
+    struct ggml_tensor *ln_att_b;
+
     // ff
     struct ggml_tensor *ff_i_w;
     struct ggml_tensor *ff_i_b;
 
     struct ggml_tensor *ff_o_w;
     struct ggml_tensor *ff_o_b;
+
+    struct ggml_tensor *ln_out_w;
+    struct ggml_tensor *ln_out_b;
 };
 
 struct bert_vocab {
     bert_token pad_id;
     bert_token unk_id;
     bert_token bos_id;
     bert_token eos_id;
+
+    std::string word_prefix;
     std::string subword_prefix;
 
     std::vector<std::string> tokens;
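The key constants above have a converter-side counterpart. A hedged sketch of emitting the two new prefix keys with the `gguf` Python package; the `GGUFWriter` calls are an assumption about that package's API, not code from this repo's convert.py, and the output path is made up:

```python
from gguf import GGUFWriter

# write only the tokenizer metadata this commit reads back via get_str();
# a real converter would also emit hparams, the token list, and tensors
writer = GGUFWriter("models/example-f16.gguf", arch="bert")
writer.add_string("tokenizer.ggml.word_prefix", "")       # KEY_WORD_PREFIX
writer.add_string("tokenizer.ggml.subword_prefix", "##")  # KEY_SUBWORD_PREFIX

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```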