diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a258af..60d45e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,22 +84,12 @@ if (GGML_METAL)
 endif()
 
 #
-# Build libraries
+# build it
 #
 
+# add ggml and bert
 add_subdirectory(ggml)
-add_subdirectory(examples)
+add_subdirectory(src)
 
-# bert library
-add_library(bert bert.cpp bert.h)
-target_include_directories(bert PUBLIC .)
-target_compile_features(bert PUBLIC cxx_std_20)
-target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
-
-# for shared libraries
+# for shared library
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# quantization
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE bert ggml)
diff --git a/README.md b/README.md
index 7a6eb08..1eb0deb 100644
--- a/README.md
+++ b/README.md
@@ -8,37 +8,30 @@ This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp
 
 Fetch this repository then download submodules and install packages with
 ```sh
-git submodule update --init --recursive
+git submodule update --init
 pip install -r requirements.txt
 ```
 
-To fetch models from `huggingface` and convert them to `gguf` format run the following
+To fetch models from `huggingface` and convert them to `gguf` format run something like the following (after creating the `models` directory)
 ```sh
-cd models
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
+python bert_cpp/convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf
 ```
+This will convert to `float16` by default. To do `float32` add `f32` to the end of the command.
 
 ### Build
 
-To build the dynamic library for usage from Python
+To build the C++ library for CPU/CUDA/Metal, run the following
 ```sh
-cmake -B build .
-make -C build -j
-```
+# CPU
+cmake -B build . && make -C build -j
 
-If you're compiling for GPU, you should run
-```sh
-cmake -DGGML_CUBLAS=ON -B build .
-make -C build -j
-```
-On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+# CUDA
+cmake -DGGML_CUBLAS=ON -B build . && make -C build -j
 
-And for Apple Metal, you should run
-```sh
-cmake -DGGML_METAL=ON -B build .
-make -C build -j
+# Metal
+cmake -DGGML_METAL=ON -B build . && make -C build -j
 ```
+On some distros, when compiling with CUDA, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 ### Execute
 
@@ -56,16 +49,16 @@ To force CPU usage, add the flag `-c`.
 
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
-import bert
-mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
+from bert_cpp import BertModel
+mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
 
 ### Quantize
 
-You can quantize models with the command
+You can quantize models with the command (using the `f32` model as a base seems to work better)
 ```sh
-build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.
diff --git a/bert_cpp/__init__.py b/bert_cpp/__init__.py
new file mode 100644
index 0000000..f518688
--- /dev/null
+++ b/bert_cpp/__init__.py
@@ -0,0 +1,3 @@
+from .convert import convert_hf
+from .model import BertModel
+from .bench import benchmark, load_jsonl
diff --git a/bert_cpp/bench.py b/bert_cpp/bench.py
new file mode 100644
index 0000000..9db740c
--- /dev/null
+++ b/bert_cpp/bench.py
@@ -0,0 +1,51 @@
+# benchmarking BERT
+
+import re
+import json
+from itertools import chain
+from .model import BertModel
+
+# get batch indices
+def batch_indices(length, batch_size):
+    return [(i, min(i+batch_size, length)) for i in range(0, length, batch_size)]
+
+# split text into chunks
+def list_splitter(text, maxlen):
+    for i, j in batch_indices(len(text), maxlen):
+        yield text[i:j]
+
+# default paragraph splitter
+def text_splitter(text, delim, min_len=1, max_len=None):
+    if delim is not None:
+        paras = [p.strip() for p in re.split(delim, text)]
+    else:
+        paras = [text]
+    paras = [p for p in paras if len(p) >= min_len]
+    if max_len is not None:
+        paras = list(chain.from_iterable(
+            list_splitter(p, max_len) for p in paras
+        ))
+    return paras
+
+# generate loader for jsonl file
+def stream_jsonl(path, max_rows=None):
+    with open(path) as fid:
+        for i, line in enumerate(fid):
+            if max_rows is not None and i >= max_rows:
+                break
+            yield json.loads(line)
+
+# load column of jsonl file and chunkify
+def load_jsonl(wiki_path, max_rows=1024, column='text', min_len=32, max_len=None):
+    splitter = lambda s: text_splitter(s, '\n', min_len=min_len, max_len=max_len)
+    stream = stream_jsonl(wiki_path, max_rows=max_rows)
+    chunks = sum([splitter(d[column]) for d in stream], [])
+    return chunks
+
+# run benchmark for one model/data pair
+def benchmark(model, data, min_len=32, max_len=None, batch_size=32, max_rows=None, column='text', use_cpu=False):
+    if type(model) is str:
+        model = BertModel(model, batch_size=batch_size, use_cpu=use_cpu)
+    if type(data) is str:
+        data = load_jsonl(data, max_rows=max_rows, column=column, min_len=min_len, max_len=max_len)
+    return model.embed(data)
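A quick usage sketch of the new bench helpers may be useful here; it is not part of the patch, and the JSONL path and model file below are made up for illustration:
```python
# illustrative only: paths and model names are hypothetical
from bert_cpp import BertModel, benchmark, load_jsonl

# chunk the 'text' column of a JSONL dump into paragraphs of 32-256 characters
chunks = load_jsonl('data/wiki.jsonl', max_rows=512, min_len=32, max_len=256)

# drive the model directly ...
mod = BertModel('models/bge-base-en-v1.5-f16.gguf', batch_size=32)
emb = mod.embed(chunks)

# ... or let benchmark() construct both from paths
emb = benchmark('models/bge-base-en-v1.5-f16.gguf', 'data/wiki.jsonl', max_rows=512, batch_size=32)
```
Passing paths to `benchmark` covers the common case; passing an already constructed `BertModel` avoids reloading the weights between runs.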
diff --git a/convert.py b/bert_cpp/convert.py
similarity index 91%
rename from convert.py
rename to bert_cpp/convert.py
index e34c58e..c2d33a7 100644
--- a/convert.py
+++ b/bert_cpp/convert.py
@@ -1,11 +1,8 @@
 import sys
-import json
 
 import torch
-from pathlib import Path
-from gguf import GGUFWriter, GGMLQuantizationType, TokenType
+from gguf import GGUFWriter, GGMLQuantizationType
 from transformers import AutoModel, AutoTokenizer
-from sentencepiece import SentencePieceProcessor
 
 KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
 KEY_UNK_ID = 'tokenizer.ggml.unknown_token_id'
@@ -14,18 +11,7 @@
 KEY_WORD_PREFIX = 'tokenizer.ggml.word_prefix'
 KEY_SUBWORD_PREFIX = 'tokenizer.ggml.subword_prefix'
 
-# script usage
-if __name__ == '__main__':
-    # primay usage
-    if len(sys.argv) < 3:
-        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
-        sys.exit(1)
-
-    # output in the same directory as the model
-    repo_id = Path(sys.argv[1])
-    output_path = Path(sys.argv[2])
-    float_type = sys.argv[3].lower() if len(sys.argv) > 3 else 'f16'
-
+def convert_hf(repo_id, output_path, float_type='f16'):
     # convert to ggml quantization type
     if float_type not in ['f16', 'f32']:
         print(f'Float type must be f16 or f32, got: {float_type}')
@@ -127,3 +113,23 @@
     # print success
     print()
     print(f'GGML model written to {output_path}')
+
+# script usage
+if __name__ == '__main__':
+    # primary usage
+    if len(sys.argv) < 3:
+        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
+        sys.exit(1)
+
+    # output in the same directory as the model
+    repo_id = sys.argv[1]
+    output_path = sys.argv[2]
+
+    # get float type
+    if len(sys.argv) > 3:
+        kwargs = {'float_type': sys.argv[3].lower()}
+    else:
+        kwargs = {}
+
+    # convert to ggml
+    convert_hf(repo_id, output_path, **kwargs)
diff --git a/bert.py b/bert_cpp/model.py
similarity index 98%
rename from bert.py
rename to bert_cpp/model.py
index bf78424..1dc0e01 100644
--- a/bert.py
+++ b/bert_cpp/model.py
@@ -5,11 +5,11 @@ from tqdm import tqdm
 
 LIB_DIR = os.path.dirname(__file__)
-LIB_PATH = os.path.join(LIB_DIR, 'build/libbert.so')
+LIB_PATH = os.path.join(LIB_DIR, '../build/src/libbert.so')
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
-outnull_file = open(os.devnull, "w")
-errnull_file = open(os.devnull, "w")
+outnull_file = open(os.devnull, 'w')
+errnull_file = open(os.devnull, 'w')
 
 class suppress_stdout_stderr():
     # NOTE: these must be "saved" here to avoid exceptions when using
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
deleted file mode 100644
index e2f6d48..0000000
--- a/examples/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories(${CMAKE_SOURCE_DIR}/)
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE bert ggml)
-
-add_executable(basic basic.cpp)
-target_link_libraries(basic PRIVATE bert ggml)
diff --git a/examples/basic.cpp b/examples/basic.cpp
deleted file mode 100644
index 3203767..0000000
--- a/examples/basic.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#include <cstdio>
-#include <string>
-#include <vector>
-
-//
-// definitions
-//
-
-#define BASIC_MAX_NODES 128
-
-typedef std::vector<float> basic_input;
-typedef std::vector<basic_input> basic_batch;
-
-//
-// data structures
-//
-
-struct basic_hparams {
-    int32_t n_size = 256;
-};
-
-struct basic_model {
-    basic_hparams hparams;
-    struct ggml_tensor *weights;
-};
-
-struct basic_ctx {
-    basic_model model;
-
-    struct ggml_context * ctx_data;
-    std::vector<uint8_t> buf_compute_meta;
-
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
-    ggml_backend_t backend = NULL;
-    ggml_allocr * compute_alloc = NULL;
-};
-
-//
-// helper functions
-
-static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
-    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
-    if (!cur) {
-        throw printf("%s: unable to find tensor %s\n", __func__, name.c_str());
-    }
-
-    return cur;
-}
-
-static void tensor_stats(ggml_tensor * t) {
-    int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
-    int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    printf(
-        "type = %s, dims = %d, shape = (%d, %d, %d, %d), backend = %d, src0 = %d, src1 = %d\n",
-        ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
-    );
-}
-
-//
-// model definition
-//
-
-ggml_cgraph * basic_build_graph(basic_ctx * ctx, basic_batch batch) {
-    const basic_model & model = ctx->model;
-    const basic_hparams & hparams = model.hparams;
-
-    // extract model params
-    const int n_size = hparams.n_size;
-    const int n_batch_size = batch.size();
-
-    // params for graph data
-    struct ggml_init_params params = {
-        /*.mem_size =*/ ctx->buf_compute_meta.size(),
-        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-        /*.no_alloc =*/ true,
-    };
-
-    // initialze computational graph
-    struct ggml_context * ctx_compute = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, BASIC_MAX_NODES, false);
-
-    // construct input tensors
-    struct ggml_tensor *input = ggml_new_tensor_2d(ctx_compute, GGML_TYPE_F32, n_size, n_batch_size);
-    ggml_set_name(input, "input");
-    ggml_allocr_alloc(ctx->compute_alloc, input);
-
-    // avoid writing input embeddings in memory measure mode
-    if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-        float * input_data = (float*)malloc(ggml_nbytes(input));
-        for (int ba = 0; ba < n_batch_size; ba++) {
-            for (int i = 0; i < n_size; i++) {
-                input_data[ba * n_size + i] = batch[ba][i];
-            }
-        }
-        ggml_backend_tensor_set(input, input_data, 0, ggml_nbytes(input));
-        free(input_data);
-    }
-
-    // the only computation
-    ggml_tensor * output = ggml_mul_mat(ctx_compute, input, model.weights); // [bs, ns] * [ns] -> [bs]
-
-    // build the graph
-    ggml_build_forward_expand(gf, output);
-
-    // free context
-    ggml_free(ctx_compute);
-
-    // return complete graph
-    return gf;
-}
-
-//
-// loading and setup
-//
-
-struct basic_ctx * basic_create_model() {
-    printf("%s: creating model\n", __func__);
-
-    // create context
-    basic_ctx * new_basic = new basic_ctx;
-    basic_model & model = new_basic->model;
-    basic_hparams & hparams = model.hparams;
-
-    // get hparams
-    const int32_t n_size = hparams.n_size;
-
-    // initialize advanced backend
-#ifdef GGML_USE_CUBLAS
-    new_basic->backend = ggml_backend_cuda_init(0);
-    if (!new_basic->backend) {
-        printf("%s: ggml_backend_cuda_init() failed\n", __func__);
-    } else {
-        printf("%s: BERT using CUDA backend\n", __func__);
-    }
-#endif
-
-    // fall back to CPU backend
-    if (!new_basic->backend) {
-        new_basic->backend = ggml_backend_cpu_init();
-        printf("%s: BERT using CPU backend\n", __func__);
-    }
-
-    // load tensors
-    {
-        struct ggml_init_params params = {
-            /*.mem_size =*/ 2 * ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc =*/ true,
-        };
-
-        new_basic->ctx_data = ggml_init(params);
-        if (!new_basic->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            free(new_basic);
-            return nullptr;
-        }
-
-        // add tensors to context
-        const char * name = "weights";
-        struct ggml_tensor * weights = ggml_new_tensor_1d(new_basic->ctx_data, GGML_TYPE_F32, n_size);
-        ggml_set_name(weights, name);
-        size_t weights_size = ggml_nbytes(weights);
-
-        // alloc memory and offload data
-        new_basic->params_buffer = ggml_backend_alloc_buffer(new_basic->backend, weights_size);
-        ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_basic->params_buffer);
-        ggml_allocr_alloc(alloc, weights);
-
-        // get local buffer
-        float * data;
-        std::vector<uint8_t> read_buf;
-        if (ggml_backend_buffer_is_host(new_basic->params_buffer)) {
-            data = (float*)weights->data;
-        } else {
-            read_buf.resize(weights_size);
-            data = reinterpret_cast<float*>(read_buf.data());
-        }
-
-        // fill in on host
-        for (int i = 0; i < n_size; i++) {
-            data[i] = 1.0;
-        }
-
-        // copy to device
-        if (!ggml_backend_buffer_is_host(new_basic->params_buffer)) {
-            ggml_backend_tensor_set(weights, data, 0, weights_size);
-        }
-
-        // free memory
-        ggml_allocr_free(alloc);
-    }
-
-    // use get_tensors to populate basic_model
-    model.weights = get_tensor(new_basic->ctx_data, "weights");
-
-    // allocate space for graph
-    {
-        // measure compute graph memory usage
-        new_basic->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_basic->compute_alloc = ggml_allocr_new_measure_from_backend(new_basic->backend);
-
-        // construct batch and compute graph
-        basic_input input(hparams.n_size);
-        basic_batch batch = {input, input};
-        ggml_cgraph * gf = basic_build_graph(new_basic, batch);
-
-        // get measurement results
-        size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_basic->compute_alloc, gf);
-        ggml_allocr_free(new_basic->compute_alloc);
-
-        // create real compute buffer and allocator
-        new_basic->compute_buffer = ggml_backend_alloc_buffer(new_basic->backend, compute_memory_buffer_size);
-        new_basic->compute_alloc = ggml_allocr_new_from_buffer(new_basic->compute_buffer);
-
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size / 1024.0 / 1024.0);
-    }
-
-    return new_basic;
-}
-
-void basic_free(basic_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    delete ctx;
-}
-
-//
-// model execution
-//
-
-void basic_forward_batch(basic_ctx * ctx, basic_batch batch, float * output) {
-    // reset alloc buffer to clean the memory from previous invocations
-    ggml_allocr_reset(ctx->compute_alloc);
-
-    // build the inference graph
-    ggml_cgraph * gf = basic_build_graph(ctx, batch);
-    ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
-
-    // compute the graph
-    ggml_backend_graph_compute(ctx->backend, gf);
-
-    // print graph info
-    printf("%s: compute done\n", __func__);
-    ggml_graph_print(gf);
-
-    // the last node is the embedding tensor
-    struct ggml_tensor * final = gf->nodes[gf->n_nodes - 1];
-    printf(
-        "%s: type = %s, ndim = %d, nelem = %d, nrows = %d\n",
-        __func__, ggml_type_name(final->type), ggml_n_dims(final), ggml_nelements(final), ggml_nrows(final)
-    );
-
-    // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(final, output, 0, ggml_nbytes(final));
-}
-
-float basic_forward_one(struct basic_ctx * ctx, basic_input input) {
-    basic_batch batch = {input};
-    float output;
-    basic_forward_batch(ctx, batch, &output);
-    return output;
-}
-
-int main(int argc, char ** argv) {
-    basic_input input(256);
-    for (int i = 0; i < 256; i++) {
-        input[i] = (float)i;
-    }
-    basic_batch batch = {input, input};
-    float output[2];
-
-    basic_ctx * ctx = basic_create_model();
-    basic_forward_batch(ctx, batch, output);
-
-    printf("output = %f %f\n", output[0], output[1]);
-    return 0;
-}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..6c47adc
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# bert.cpp main
+#
+
+# ggml includes
+include_directories(${CMAKE_SOURCE_DIR}/ggml/src ${CMAKE_SOURCE_DIR}/ggml/include/ggml)
+
+# bert library
+add_library(bert bert.cpp bert.h)
+target_compile_features(bert PUBLIC cxx_std_20)
+
+# for shared libraries
+set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(bert PRIVATE ggml)
+
+# main entry
+add_executable(main main.cpp)
+target_link_libraries(main PRIVATE bert ggml)
+
+# quantization
+add_executable(quantize quantize.cpp)
+target_link_libraries(quantize PRIVATE bert ggml)
diff --git a/bert.cpp b/src/bert.cpp
similarity index 100%
rename from bert.cpp
rename to src/bert.cpp
diff --git a/bert.h b/src/bert.h
similarity index 100%
rename from bert.h
rename to src/bert.h
diff --git a/examples/main.cpp b/src/main.cpp
similarity index 100%
rename from examples/main.cpp
rename to src/main.cpp
diff --git a/quantize.cpp b/src/quantize.cpp
similarity index 100%
rename from quantize.cpp
rename to src/quantize.cpp
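Putting the pieces together, the reorganized tree keeps the C++ build under `src/` and the Python tooling under `bert_cpp/`. A minimal end-to-end sketch, assuming the `models/` directory exists and the shared library was built to `build/src/libbert.so` as `model.py` expects (model name and paths are examples only):
```python
# illustrative only: model name and output paths are examples
from bert_cpp import convert_hf, BertModel

# pull a HuggingFace checkpoint and write it to GGUF (assumes models/ exists)
convert_hf('BAAI/bge-base-en-v1.5', 'models/bge-base-en-v1.5-f32.gguf', float_type='f32')

# load it through the shared library built at build/src/libbert.so
mod = BertModel('models/bge-base-en-v1.5-f32.gguf')
emb = mod.embed(['hello world', 'goodbye world'])
print(emb.shape)
```
Quantized files produced by `build/bin/quantize` should load the same way.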