Commit a4ad764

overhaul directory structure
1 parent 47cb93d commit a4ad764

13 files changed: +120 additions, -348 deletions

CMakeLists.txt

Lines changed: 4 additions & 14 deletions
@@ -84,22 +84,12 @@ if (GGML_METAL)
 endif()
 
 #
-# Build libraries
+# build it
 #
 
+# add ggml and bert
 add_subdirectory(ggml)
-add_subdirectory(examples)
+add_subdirectory(src)
 
-# bert library
-add_library(bert bert.cpp bert.h)
-target_include_directories(bert PUBLIC .)
-target_compile_features(bert PUBLIC cxx_std_20)
-target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
-
-# for shared libraries
+# for shared library
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# quantization
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE bert ggml)

README.md

Lines changed: 16 additions & 23 deletions
@@ -8,37 +8,30 @@ This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp
 
 Fetch this repository then download submodules and install packages with
 ```sh
-git submodule update --init --recursive
+git submodule update --init
 pip install -r requirements.txt
 ```
 
-To fetch models from `huggingface` and convert them to `gguf` format run the following
+To fetch models from `huggingface` and convert them to `gguf` format, run something like the following (after creating the `models` directory)
 ```sh
-cd models
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
+python bert_cpp/convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf
 ```
+This will convert to `float16` by default. To convert to `float32`, add `f32` to the end of the command.
 
 ### Build
 
-To build the dynamic library for usage from Python
+To build the C++ library for CPU/CUDA/Metal, run the following
 ```sh
-cmake -B build .
-make -C build -j
-```
+# CPU
+cmake -B build . && make -C build -j
 
-If you're compiling for GPU, you should run
-```sh
-cmake -DGGML_CUBLAS=ON -B build .
-make -C build -j
-```
-On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+# CUDA
+cmake -DGGML_CUBLAS=ON -B build . && make -C build -j
 
-And for Apple Metal, you should run
-```sh
-cmake -DGGML_METAL=ON -B build .
-make -C build -j
+# Metal
+cmake -DGGML_METAL=ON -B build . && make -C build -j
 ```
+On some distros, when compiling with CUDA, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 ### Execute
 
@@ -56,16 +49,16 @@ To force CPU usage, add the flag `-c`.
 
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
-import bert
-mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
+from bert_cpp import BertModel
+mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
 
 ### Quantize
 
-You can quantize models with the command
+You can quantize models with the command (using the `f32` model as a base seems to work better)
 ```sh
-build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.

bert_cpp/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .convert import convert_hf
+from .model import BertModel
+from .bench import benchmark, load_jsonl
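
The new `bert_cpp` package re-exports its main entry points at the top level, so downstream code can import them directly. A minimal sketch of what that looks like, assuming the shared library has been built and using a hypothetical model path:

```python
from bert_cpp import BertModel, convert_hf, benchmark, load_jsonl

# load a converted GGUF model and embed a small batch of strings
mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
emb = mod.embed(['the cat sat on the mat', 'a feline rested on the rug'])
print(emb.shape)  # numpy array: one embedding vector per input string
```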

bert_cpp/bench.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# benchmarking BERT
+
+import re
+import json
+from itertools import chain
+from .model import BertModel
+
+# get batch indices
+def batch_indices(length, batch_size):
+    return [(i, min(i+batch_size, length)) for i in range(0, length, batch_size)]
+
+# split text into chunks
+def list_splitter(text, maxlen):
+    for i, j in batch_indices(len(text), maxlen):
+        yield text[i:j]
+
+# default paragraph splitter
+def text_splitter(text, delim, min_len=1, max_len=None):
+    if delim is not None:
+        paras = [p.strip() for p in re.split(delim, text)]
+    else:
+        paras = [text]
+    paras = [p for p in paras if len(p) >= min_len]
+    if max_len is not None:
+        paras = list(chain.from_iterable(
+            list_splitter(p, max_len) for p in paras
+        ))
+    return paras
+
+# generate loader for jsonl file
+def stream_jsonl(path, max_rows=None):
+    with open(path) as fid:
+        for i, line in enumerate(fid):
+            if max_rows is not None and i >= max_rows:
+                break
+            yield json.loads(line)
+
+# load column of jsonl file and chunkify
+def load_jsonl(wiki_path, max_rows=1024, column='text', min_len=32, max_len=None):
+    splitter = lambda s: text_splitter(s, '\n', min_len=min_len, max_len=max_len)
+    stream = stream_jsonl(wiki_path, max_rows=max_rows)
+    chunks = sum([splitter(d[column]) for d in stream], [])
+    return chunks
+
+# run benchmark for one model/data pair
+def benchmark(model, data, min_len=32, max_len=None, batch_size=32, max_rows=None, column='text', use_cpu=False):
+    if type(model) is str:
+        model = BertModel(model, batch_size=batch_size, use_cpu=use_cpu)
+    if type(data) is str:
+        data = load_jsonl(data, max_rows=max_rows, column=column, min_len=min_len, max_len=max_len)
+    return model.embed(data)
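
For reference, a short sketch of how `benchmark` and `load_jsonl` might be driven; the JSONL path and model path below are hypothetical, and the data file is assumed to have a `text` column:

```python
from bert_cpp import benchmark, load_jsonl

# chunk the 'text' column of a JSONL dump into paragraphs of at least 32 characters
chunks = load_jsonl('data/wiki.jsonl', max_rows=1024, column='text', min_len=32)

# embed the chunks with a converted model; a model path or an existing BertModel both work
emb = benchmark('models/bge-base-en-v1.5-f16.gguf', chunks, batch_size=32)
print(emb.shape)
```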

convert.py renamed to bert_cpp/convert.py

Lines changed: 22 additions & 16 deletions
@@ -1,11 +1,8 @@
 import sys
-import json
 import torch
 
-from pathlib import Path
-from gguf import GGUFWriter, GGMLQuantizationType, TokenType
+from gguf import GGUFWriter, GGMLQuantizationType
 from transformers import AutoModel, AutoTokenizer
-from sentencepiece import SentencePieceProcessor
 
 KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
 KEY_UNK_ID = 'tokenizer.ggml.unknown_token_id'
@@ -14,18 +11,7 @@
 KEY_WORD_PREFIX = 'tokenizer.ggml.word_prefix'
 KEY_SUBWORD_PREFIX = 'tokenizer.ggml.subword_prefix'
 
-# script usage
-if __name__ == '__main__':
-    # primay usage
-    if len(sys.argv) < 3:
-        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
-        sys.exit(1)
-
-    # output in the same directory as the model
-    repo_id = Path(sys.argv[1])
-    output_path = Path(sys.argv[2])
-    float_type = sys.argv[3].lower() if len(sys.argv) > 3 else 'f16'
-
+def convert_hf(repo_id, output_path, float_type='f16'):
     # convert to ggml quantization type
     if float_type not in ['f16', 'f32']:
         print(f'Float type must be f16 or f32, got: {float_type}')
@@ -127,3 +113,23 @@
     # print success
     print()
     print(f'GGML model written to {output_path}')
+
+# script usage
+if __name__ == '__main__':
+    # primary usage
+    if len(sys.argv) < 3:
+        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
+        sys.exit(1)
+
+    # output in the same directory as the model
+    repo_id = sys.argv[1]
+    output_path = sys.argv[2]
+
+    # get float type
+    if len(sys.argv) > 3:
+        kwargs = {'float_type': sys.argv[3].lower()}
+    else:
+        kwargs = {}
+
+    # convert to ggml
+    convert_hf(repo_id, output_path, **kwargs)
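
With the conversion logic factored into `convert_hf`, it can be called from Python as well as from the command line. A small sketch, with a hypothetical output path:

```python
from bert_cpp import convert_hf

# float16 is the default; pass float_type='f32' to write a float32 base model
convert_hf('BAAI/bge-base-en-v1.5', 'models/bge-base-en-v1.5-f32.gguf', float_type='f32')
```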

bert.py renamed to bert_cpp/model.py

Lines changed: 3 additions & 3 deletions
@@ -5,11 +5,11 @@
 from tqdm import tqdm
 
 LIB_DIR = os.path.dirname(__file__)
-LIB_PATH = os.path.join(LIB_DIR, 'build/libbert.so')
+LIB_PATH = os.path.join(LIB_DIR, '../build/src/libbert.so')
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
-outnull_file = open(os.devnull, "w")
-errnull_file = open(os.devnull, "w")
+outnull_file = open(os.devnull, 'w')
+errnull_file = open(os.devnull, 'w')
 
 class suppress_stdout_stderr():
     # NOTE: these must be "saved" here to avoid exceptions when using
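
The library path now resolves `libbert.so` relative to the package, under the `src` build directory. A hedged sketch of constructor options, based on the keyword arguments `bench.py` passes (the exact defaults are assumptions):

```python
from bert_cpp import BertModel

# batch_size and use_cpu mirror the keyword arguments used by bench.py
mod = BertModel('models/bge-base-en-v1.5-f16.gguf', batch_size=32, use_cpu=True)
emb = mod.embed(['force CPU inference for this sentence'])
```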

examples/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.
