overhaul directory structure
iamlemec committed Feb 5, 2024
1 parent 47cb93d commit a4ad764
Showing 13 changed files with 120 additions and 348 deletions.
18 changes: 4 additions & 14 deletions CMakeLists.txt
@@ -84,22 +84,12 @@ if (GGML_METAL)
endif()

#
# Build libraries
# build it
#

# add ggml and bert
add_subdirectory(ggml)
add_subdirectory(examples)
add_subdirectory(src)

# bert library
add_library(bert bert.cpp bert.h)
target_include_directories(bert PUBLIC .)
target_compile_features(bert PUBLIC cxx_std_20)
target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})

# for shared libraries
# for shared library
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)

# quantization
add_executable(quantize quantize.cpp)
target_link_libraries(quantize PRIVATE bert ggml)
39 changes: 16 additions & 23 deletions README.md
@@ -8,37 +8,30 @@ This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp

Fetch this repository, then download submodules and install packages with
```sh
git submodule update --init --recursive
git submodule update --init
pip install -r requirements.txt
```

To fetch models from `huggingface` and convert them to `gguf` format run the following
To fetch models from `huggingface` and convert them to `gguf` format, run something like the following (after creating the `models` directory)
```sh
cd models
python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
python bert_cpp/convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf
```
This converts to `float16` by default. To convert to `float32` instead, add `f32` to the end of the command.

### Build

To build the dynamic library for usage from Python
To build the C++ library for CPU/CUDA/Metal, run the following
```sh
cmake -B build .
make -C build -j
```
# CPU
cmake -B build . && make -C build -j

If you're compiling for GPU, you should run
```sh
cmake -DGGML_CUBLAS=ON -B build .
make -C build -j
```
On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
# CUDA
cmake -DGGML_CUBLAS=ON -B build . && make -C build -j

And for Apple Metal, you should run
```sh
cmake -DGGML_METAL=ON -B build .
make -C build -j
# Metal
cmake -DGGML_METAL=ON -B build . && make -C build -j
```
On some distros, when compiling with CUDA, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to point to your C++ compiler.
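For example (the compiler path below is illustrative; use whichever host compiler your CUDA toolkit supports):
```sh
CUDAHOSTCXX=/usr/bin/g++-12 cmake -DGGML_CUBLAS=ON -B build . && make -C build -j
```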

### Execute

@@ -56,16 +56,16 @@ To force CPU usage, add the flag `-c`.

You can also run everything through Python, which is particularly useful for batch inference. For instance,
```python
import bert
mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
from bert_cpp import BertModel
mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
emb = mod.embed(batch)
```
where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
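A minimal end-to-end sketch (the model path and sentences below are just placeholders):
```python
from bert_cpp import BertModel

# any list of strings works as a batch; these are placeholders
batch = [
    'The quick brown fox jumps over the lazy dog.',
    'Embedding models map sentences to dense vectors.',
]

mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
emb = mod.embed(batch)  # numpy array with one embedding per input string
print(emb.shape)
```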

### Quantize

You can quantize models with the command
You can quantize models with the command (using the `f32` model as a base seems to work better)
```sh
build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
```
or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.
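For instance, a `q4_0` variant of the same model would be produced with (paths illustrative):
```sh
build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q4_0.gguf q4_0
```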
3 changes: 3 additions & 0 deletions bert_cpp/__init__.py
@@ -0,0 +1,3 @@
from .convert import convert_hf
from .model import BertModel
from .bench import benchmark, load_jsonl
50 changes: 50 additions & 0 deletions bert_cpp/bench.py
@@ -0,0 +1,50 @@
# benchmarking BERT

import re
import json
from itertools import chain
from .model import BertModel

# get batch indices
def batch_indices(length, batch_size):
    return [(i, min(i+batch_size, length)) for i in range(0, length, batch_size)]

# split text into chunks
def list_splitter(text, maxlen):
    for i, j in batch_indices(len(text), maxlen):
        yield text[i:j]

# default paragraph splitter
def text_splitter(text, delim, min_len=1, max_len=None):
    if delim is not None:
        paras = [p.strip() for p in re.split(delim, text)]
    else:
        paras = [text]
    paras = [p for p in paras if len(p) >= min_len]
    if max_len is not None:
        paras = list(chain.from_iterable(
            list_splitter(p, max_len) for p in paras
        ))
    return paras

# generate loader for jsonl file
def stream_jsonl(path, max_rows=None):
    with open(path) as fid:
        for i, line in enumerate(fid):
            if max_rows is not None and i >= max_rows:
                break
            yield json.loads(line)

# load column of jsonl file and chunkify
def load_jsonl(wiki_path, max_rows=1024, column='text', min_len=32, max_len=None):
    splitter = lambda s: text_splitter(s, '\n', min_len=min_len, max_len=max_len)
    stream = stream_jsonl(wiki_path, max_rows=max_rows)
    chunks = sum([splitter(d[column]) for d in stream], [])
    return chunks

# run benchmark for one model/data pair
def benchmark(model, data, min_len=32, max_len=None, batch_size=32, max_rows=None, column='text', use_cpu=False):
    if type(model) is str:
        model = BertModel(model, batch_size=batch_size, use_cpu=use_cpu)
    if type(data) is str:
        data = load_jsonl(data, max_rows=max_rows, column=column, min_len=min_len, max_len=max_len)
    return model.embed(data)
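
# illustrative usage sketch (model and data paths are placeholders):
#
#   from bert_cpp import benchmark
#   emb = benchmark(
#       'models/bge-base-en-v1.5-f16.gguf',  # gguf model to embed with
#       'data/enwiki.jsonl',                 # jsonl file with a 'text' column
#       max_rows=1024, batch_size=32,
#   )
#   print(emb.shape)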
38 changes: 22 additions & 16 deletions convert.py → bert_cpp/convert.py
@@ -1,11 +1,8 @@
import sys
import json
import torch

from pathlib import Path
from gguf import GGUFWriter, GGMLQuantizationType, TokenType
from gguf import GGUFWriter, GGMLQuantizationType
from transformers import AutoModel, AutoTokenizer
from sentencepiece import SentencePieceProcessor

KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
KEY_UNK_ID = 'tokenizer.ggml.unknown_token_id'
@@ -14,18 +11,7 @@
KEY_WORD_PREFIX = 'tokenizer.ggml.word_prefix'
KEY_SUBWORD_PREFIX = 'tokenizer.ggml.subword_prefix'

# script usage
if __name__ == '__main__':
    # primary usage
    if len(sys.argv) < 3:
        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
        sys.exit(1)

    # output in the same directory as the model
    repo_id = Path(sys.argv[1])
    output_path = Path(sys.argv[2])
    float_type = sys.argv[3].lower() if len(sys.argv) > 3 else 'f16'

def convert_hf(repo_id, output_path, float_type='f16'):
    # convert to ggml quantization type
    if float_type not in ['f16', 'f32']:
        print(f'Float type must be f16 or f32, got: {float_type}')
@@ -127,3 +113,23 @@
    # print success
    print()
    print(f'GGML model written to {output_path}')

# script usage
if __name__ == '__main__':
    # primary usage
    if len(sys.argv) < 3:
        print('Usage: convert.py repo_id output_path [float-type=f16,f32]\n')
        sys.exit(1)

    # output in the same directory as the model
    repo_id = sys.argv[1]
    output_path = sys.argv[2]

    # get float type
    if len(sys.argv) > 3:
        kwargs = {'float_type': sys.argv[3].lower()}
    else:
        kwargs = {}

    # convert to ggml
    convert_hf(repo_id, output_path, **kwargs)
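
# illustrative usage from Python (output path is a placeholder):
#
#   from bert_cpp import convert_hf
#   convert_hf('BAAI/bge-base-en-v1.5', 'models/bge-base-en-v1.5-f32.gguf', float_type='f32')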
6 changes: 3 additions & 3 deletions bert.py → bert_cpp/model.py
@@ -5,11 +5,11 @@
from tqdm import tqdm

LIB_DIR = os.path.dirname(__file__)
LIB_PATH = os.path.join(LIB_DIR, 'build/libbert.so')
LIB_PATH = os.path.join(LIB_DIR, '../build/src/libbert.so')

# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
errnull_file = open(os.devnull, "w")
outnull_file = open(os.devnull, 'w')
errnull_file = open(os.devnull, 'w')

class suppress_stdout_stderr():
    # NOTE: these must be "saved" here to avoid exceptions when using
7 changes: 0 additions & 7 deletions examples/CMakeLists.txt

This file was deleted.

