Commit a4ad764

overhaul directory structure
1 parent 47cb93d commit a4ad764

13 files changed: +120 additions, -348 deletions

CMakeLists.txt

Lines changed: 4 additions & 14 deletions
@@ -84,22 +84,12 @@ if (GGML_METAL)
 endif()
 
 #
-# Build libraries
+# build it
 #
 
+# add ggml and bert
 add_subdirectory(ggml)
-add_subdirectory(examples)
+add_subdirectory(src)
 
-# bert library
-add_library(bert bert.cpp bert.h)
-target_include_directories(bert PUBLIC .)
-target_compile_features(bert PUBLIC cxx_std_20)
-target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
-
-# for shared libraries
+# for shared library
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# quantization
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE bert ggml)

README.md

Lines changed: 16 additions & 23 deletions
@@ -8,37 +8,30 @@ This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp
 
 Fetch this repository then download submodules and install packages with
 ```sh
-git submodule update --init --recursive
+git submodule update --init
 pip install -r requirements.txt
 ```
 
-To fetch models from `huggingface` and convert them to `gguf` format run the following
+To fetch models from `huggingface` and convert them to `gguf` format, run something like the following (after creating the `models` directory)
 ```sh
-cd models
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
+python bert_cpp/convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf
 ```
+This will convert to `float16` by default. To convert to `float32`, add `f32` to the end of the command.
 
 ### Build
 
-To build the dynamic library for usage from Python
+To build the C++ library for CPU/CUDA/Metal, run the following
 ```sh
-cmake -B build .
-make -C build -j
-```
+# CPU
+cmake -B build . && make -C build -j
 
-If you're compiling for GPU, you should run
-```sh
-cmake -DGGML_CUBLAS=ON -B build .
-make -C build -j
-```
-On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+# CUDA
+cmake -DGGML_CUBLAS=ON -B build . && make -C build -j
 
-And for Apple Metal, you should run
-```sh
-cmake -DGGML_METAL=ON -B build .
-make -C build -j
+# Metal
+cmake -DGGML_METAL=ON -B build . && make -C build -j
 ```
+On some distros, when compiling with CUDA, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 ### Execute
 
@@ -56,16 +49,16 @@ To force CPU usage, add the flag `-c`.
 
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
-import bert
-mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
+from bert_cpp import BertModel
+mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
 
 ### Quantize
 
-You can quantize models with the command
+You can quantize models with the command (using the `f32` model as a base seems to work better)
 ```sh
-build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.

bert_cpp/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .convert import convert_hf
+from .model import BertModel
+from .bench import benchmark, load_jsonl
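
The new `bert_cpp` package re-exports its main entry points at the top level, so downstream code can import them directly. A minimal sketch of what that looks like, assuming the shared library has been built and using a hypothetical model path:

```python
from bert_cpp import BertModel, convert_hf, benchmark, load_jsonl

# load a converted GGUF model and embed a small batch of strings
mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
emb = mod.embed(['the cat sat on the mat', 'a feline rested on the rug'])
print(emb.shape)  # numpy array: one embedding vector per input string
```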

bert_cpp/bench.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# benchmarking BERT
+
+import re
+import json
+from itertools import chain
+from .model import BertModel
+
+# get batch indices
+def batch_indices(length, batch_size):
+    return [(i, min(i+batch_size, length)) for i in range(0, length, batch_size)]
+
+# split text into chunks
+def list_splitter(text, maxlen):
+    for i, j in batch_indices(len(text), maxlen):
+        yield text[i:j]
+
+# default paragraph splitter
+def text_splitter(text, delim, min_len=1, max_len=None):
+    if delim is not None:
+        paras = [p.strip() for p in re.split(delim, text)]
+    else:
+        paras = [text]
+    paras = [p for p in paras if len(p) >= min_len]
+    if max_len is not None:
+        paras = list(chain.from_iterable(
+            list_splitter(p, max_len) for p in paras
+        ))
+    return paras
+
+# generate loader for jsonl file
+def stream_jsonl(path, max_rows=None):
+    with open(path) as fid:
+        for i, line in enumerate(fid):
+            if max_rows is not None and i >= max_rows:
+                break
+            yield json.loads(line)
+
+# load column of jsonl file and chunkify
+def load_jsonl(wiki_path, max_rows=1024, column='text', min_len=32, max_len=None):
+    splitter = lambda s: text_splitter(s, '\n', min_len=min_len, max_len=max_len)
+    stream = stream_jsonl(wiki_path, max_rows=max_rows)
+    chunks = sum([splitter(d[column]) for d in stream], [])
+    return chunks
+
+# run benchmark for one model/data pair
+def benchmark(model, data, min_len=32, max_len=None, batch_size=32, max_rows=None, column='text', use_cpu=False):
+    if type(model) is str:
+        model = BertModel(model, batch_size=batch_size, use_cpu=use_cpu)
+    if type(data) is str:
+        data = load_jsonl(data, max_rows=max_rows, column=column, min_len=min_len, max_len=max_len)
+    return model.embed(data)
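
For reference, a short sketch of how `benchmark` and `load_jsonl` might be driven; the JSONL path and model path below are hypothetical, and the data file is assumed to have a `text` column:

```python
from bert_cpp import benchmark, load_jsonl

# chunk the 'text' column of a JSONL dump into paragraphs of at least 32 characters
chunks = load_jsonl('data/wiki.jsonl', max_rows=1024, column='text', min_len=32)

# embed the chunks with a converted model; a model path or an existing BertModel both work
emb = benchmark('models/bge-base-en-v1.5-f16.gguf', chunks, batch_size=32)
print(emb.shape)
```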

convert.py renamed to bert_cpp/convert.py

Lines changed: 22 additions & 16 deletions
@@ -1,11 +1,8 @@
 import sys
-import json
 import torch
 
-from pathlib import Path
-from gguf import GGUFWriter, GGMLQuantizationType, TokenType
+from gguf import GGUFWriter, GGMLQuantizationType
 from transformers import AutoModel, AutoTokenizer
-from sentencepiece import SentencePieceProcessor
 
 KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
 KEY_UNK_ID = 'tokenizer.ggml.unknown_token_id'
@@ -14,18 +11,7 @@
 KEY_WORD_PREFIX = 'tokenizer.ggml.word_prefix'
 KEY_SUBWORD_PREFIX = 'tokenizer.ggml.subword_prefix'
 
-# script usage
-if __name__ == '__main__':
-    # primay usage
-    if len(sys.argv) < 3:
-        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
-        sys.exit(1)
-
-    # output in the same directory as the model
-    repo_id = Path(sys.argv[1])
-    output_path = Path(sys.argv[2])
-    float_type = sys.argv[3].lower() if len(sys.argv) > 3 else 'f16'
-
+def convert_hf(repo_id, output_path, float_type='f16'):
     # convert to ggml quantization type
     if float_type not in ['f16', 'f32']:
         print(f'Float type must be f16 or f32, got: {float_type}')
@@ -127,3 +113,23 @@
     # print success
     print()
     print(f'GGML model written to {output_path}')
+
+# script usage
+if __name__ == '__main__':
+    # primary usage
+    if len(sys.argv) < 3:
+        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
+        sys.exit(1)
+
+    # output in the same directory as the model
+    repo_id = sys.argv[1]
+    output_path = sys.argv[2]
+
+    # get float type
+    if len(sys.argv) > 3:
+        kwargs = {'float_type': sys.argv[3].lower()}
+    else:
+        kwargs = {}
+
+    # convert to ggml
+    convert_hf(repo_id, output_path, **kwargs)
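
With the conversion logic factored into `convert_hf`, it can be called from Python as well as from the command line. A small sketch, with a hypothetical output path:

```python
from bert_cpp import convert_hf

# float16 is the default; pass float_type='f32' to write a float32 base model
convert_hf('BAAI/bge-base-en-v1.5', 'models/bge-base-en-v1.5-f32.gguf', float_type='f32')
```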

bert.py renamed to bert_cpp/model.py

Lines changed: 3 additions & 3 deletions
@@ -5,11 +5,11 @@
 from tqdm import tqdm
 
 LIB_DIR = os.path.dirname(__file__)
-LIB_PATH = os.path.join(LIB_DIR, 'build/libbert.so')
+LIB_PATH = os.path.join(LIB_DIR, '../build/src/libbert.so')
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
-outnull_file = open(os.devnull, "w")
-errnull_file = open(os.devnull, "w")
+outnull_file = open(os.devnull, 'w')
+errnull_file = open(os.devnull, 'w')
 
 class suppress_stdout_stderr():
     # NOTE: these must be "saved" here to avoid exceptions when using
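
The library path now resolves `libbert.so` relative to the package, under the `src` build directory. A hedged sketch of constructor options, based on the keyword arguments `bench.py` passes (the exact defaults are assumptions):

```python
from bert_cpp import BertModel

# batch_size and use_cpu mirror the keyword arguments used by bench.py
mod = BertModel('models/bge-base-en-v1.5-f16.gguf', batch_size=32, use_cpu=True)
emb = mod.embed(['force CPU inference for this sentence'])
```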

examples/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
This file was deleted.
