diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a258af..60d45e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,22 +84,12 @@ if (GGML_METAL)
 endif()
 
 #
-# Build libraries
+# build it
 #
 
+# add ggml and bert
 add_subdirectory(ggml)
-add_subdirectory(examples)
+add_subdirectory(src)
 
-# bert library
-add_library(bert bert.cpp bert.h)
-target_include_directories(bert PUBLIC .)
-target_compile_features(bert PUBLIC cxx_std_20)
-target_link_libraries(bert PRIVATE ggml ${BERT_EXTRA_LIBS})
-
-# for shared libraries
+# for shared library
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# quantization
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE bert ggml)
diff --git a/README.md b/README.md
index 7a6eb08..1eb0deb 100644
--- a/README.md
+++ b/README.md
@@ -8,37 +8,30 @@ This repo is a fork of original [bert.cpp](https://github.com/skeskinen/bert.cpp
 
 Fetch this repository then download submodules and install packages with
 ```sh
-git submodule update --init --recursive
+git submodule update --init
 pip install -r requirements.txt
 ```
 
-To fetch models from `huggingface` and convert them to `gguf` format run the following
+To fetch models from `huggingface` and convert them to `gguf` format run something like the following (after creating the `models` directory)
 ```sh
-cd models
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf # f16 is default
-python convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f32.gguf f32 # optional
+python bert_cpp/convert.py BAAI/bge-base-en-v1.5 models/bge-base-en-v1.5-f16.gguf
 ```
+This will convert to `float16` by default. To do `float32` add `f32` to the end of the command.
 
 ### Build
 
-To build the dynamic library for usage from Python
+To build the C++ library for CPU/CUDA/Metal, run the following
 ```sh
-cmake -B build .
-make -C build -j
-```
+# CPU
+cmake -B build . && make -C build -j
 
-If you're compiling for GPU, you should run
-```sh
-cmake -DGGML_CUBLAS=ON -B build .
-make -C build -j
-```
-On some distros, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
+# CUDA
+cmake -DGGML_CUBLAS=ON -B build . && make -C build -j
 
-And for Apple Metal, you should run
-```sh
-cmake -DGGML_METAL=ON -B build .
-make -C build -j
+# Metal
+cmake -DGGML_METAL=ON -B build . && make -C build -j
 ```
+On some distros, when compiling with CUDA, you also need to specify the host C++ compiler. To do this, I suggest setting the `CUDAHOSTCXX` environment variable to your C++ bindir.
 
 ### Execute
 
@@ -56,16 +49,16 @@ To force CPU usage, add the flag `-c`.
 
 You can also run everything through Python, which is particularly useful for batch inference. For instance,
 ```python
-import bert
-mod = bert.BertModel('models/bge-base-en-v1.5-f16.gguf')
+from bert_cpp import BertModel
+mod = BertModel('models/bge-base-en-v1.5-f16.gguf')
 emb = mod.embed(batch)
 ```
 where `batch` is a list of strings and `emb` is a `numpy` array of embedding vectors.
 
 ### Quantize
 
-You can quantize models with the command
+You can quantize models with the command (using the `f32` model as a base seems to work better)
 ```sh
-build/bin/quantize models/bge-base-en-v1.5-f16.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
+build/bin/quantize models/bge-base-en-v1.5-f32.gguf models/bge-base-en-v1.5-q8_0.gguf q8_0
 ```
 or whatever your desired quantization level is. Currently supported values are: `q8_0`, `q5_0`, `q5_1`, `q4_0`, and `q4_1`. You can then pass these model files directly to `main` as above.
diff --git a/bert_cpp/__init__.py b/bert_cpp/__init__.py
new file mode 100644
index 0000000..f518688
--- /dev/null
+++ b/bert_cpp/__init__.py
@@ -0,0 +1,3 @@
+from .convert import convert_hf
+from .model import BertModel
+from .bench import benchmark, load_jsonl
diff --git a/bert_cpp/bench.py b/bert_cpp/bench.py
new file mode 100644
index 0000000..9db740c
--- /dev/null
+++ b/bert_cpp/bench.py
@@ -0,0 +1,51 @@
+# benchmarking BERT
+
+import re
+import json
+from itertools import chain
+from .model import BertModel
+
+# get batch indices
+def batch_indices(length, batch_size):
+    return [(i, min(i+batch_size, length)) for i in range(0, length, batch_size)]
+
+# split text into chunks
+def list_splitter(text, maxlen):
+    for i, j in batch_indices(len(text), maxlen):
+        yield text[i:j]
+
+# default paragraph splitter
+def text_splitter(text, delim, min_len=1, max_len=None):
+    if delim is not None:
+        paras = [p.strip() for p in re.split(delim, text)]
+    else:
+        paras = [text]
+    paras = [p for p in paras if len(p) >= min_len]
+    if max_len is not None:
+        paras = list(chain.from_iterable(
+            list_splitter(p, max_len) for p in paras
+        ))
+    return paras
+
+# generate loader for jsonl file
+def stream_jsonl(path, max_rows=None):
+    with open(path) as fid:
+        for i, line in enumerate(fid):
+            if max_rows is not None and i >= max_rows:
+                break
+            yield json.loads(line)
+
+# load column of jsonl file and chunkify
+def load_jsonl(wiki_path, max_rows=1024, column='text', min_len=32, max_len=None):
+    splitter = lambda s: text_splitter(s, '\n', min_len=min_len, max_len=max_len)
+    stream = stream_jsonl(wiki_path, max_rows=max_rows)
+    chunks = sum([splitter(d[column]) for d in stream], [])
+    return chunks
+
+# run benchmark for one model/data pair
+def benchmark(model, data, min_len=32, max_len=None, batch_size=32, max_rows=None, column='text', use_cpu=False):
+    if type(model) is str:
+        model = BertModel(model, batch_size=batch_size, use_cpu=use_cpu)
+    if type(data) is str:
+        data = load_jsonl(data, max_rows=max_rows, column=column, min_len=min_len, max_len=max_len)
+    return model.embed(data)
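A quick usage sketch of the new bench helpers may be useful here; it is not part of the patch, and the JSONL path and model file below are made up for illustration:
```python
# illustrative only: paths and model names are hypothetical
from bert_cpp import BertModel, benchmark, load_jsonl

# chunk the 'text' column of a JSONL dump into paragraphs of 32-256 characters
chunks = load_jsonl('data/wiki.jsonl', max_rows=512, min_len=32, max_len=256)

# drive the model directly ...
mod = BertModel('models/bge-base-en-v1.5-f16.gguf', batch_size=32)
emb = mod.embed(chunks)

# ... or let benchmark() construct both from paths
emb = benchmark('models/bge-base-en-v1.5-f16.gguf', 'data/wiki.jsonl', max_rows=512, batch_size=32)
```
Passing paths to `benchmark` covers the common case; passing an already constructed `BertModel` avoids reloading the weights between runs.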
diff --git a/convert.py b/bert_cpp/convert.py
similarity index 91%
rename from convert.py
rename to bert_cpp/convert.py
index e34c58e..c2d33a7 100644
--- a/convert.py
+++ b/bert_cpp/convert.py
@@ -1,11 +1,8 @@
 import sys
-import json
 
 import torch
-from pathlib import Path
-from gguf import GGUFWriter, GGMLQuantizationType, TokenType
+from gguf import GGUFWriter, GGMLQuantizationType
 from transformers import AutoModel, AutoTokenizer
-from sentencepiece import SentencePieceProcessor
 
 KEY_PAD_ID = 'tokenizer.ggml.padding_token_id'
 KEY_UNK_ID = 'tokenizer.ggml.unknown_token_id'
@@ -14,18 +11,7 @@
 KEY_WORD_PREFIX = 'tokenizer.ggml.word_prefix'
 KEY_SUBWORD_PREFIX = 'tokenizer.ggml.subword_prefix'
 
-# script usage
-if __name__ == '__main__':
-    # primay usage
-    if len(sys.argv) < 3:
-        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
-        sys.exit(1)
-
-    # output in the same directory as the model
-    repo_id = Path(sys.argv[1])
-    output_path = Path(sys.argv[2])
-    float_type = sys.argv[3].lower() if len(sys.argv) > 3 else 'f16'
-
+def convert_hf(repo_id, output_path, float_type='f16'):
     # convert to ggml quantization type
     if float_type not in ['f16', 'f32']:
         print(f'Float type must be f16 or f32, got: {float_type}')
@@ -127,3 +113,23 @@
     # print success
     print()
     print(f'GGML model written to {output_path}')
+
+# script usage
+if __name__ == '__main__':
+    # primary usage
+    if len(sys.argv) < 3:
+        print('Usage: convert-to-ggml.py repo_id output_path [float-type=f16,f32]\n')
+        sys.exit(1)
+
+    # output in the same directory as the model
+    repo_id = sys.argv[1]
+    output_path = sys.argv[2]
+
+    # get float type
+    if len(sys.argv) > 3:
+        kwargs = {'float_type': sys.argv[3].lower()}
+    else:
+        kwargs = {}
+
+    # convert to ggml
+    convert_hf(repo_id, output_path, **kwargs)
diff --git a/bert.py b/bert_cpp/model.py
similarity index 98%
rename from bert.py
rename to bert_cpp/model.py
index bf78424..1dc0e01 100644
--- a/bert.py
+++ b/bert_cpp/model.py
@@ -5,11 +5,11 @@ from tqdm import tqdm
 
 LIB_DIR = os.path.dirname(__file__)
-LIB_PATH = os.path.join(LIB_DIR, 'build/libbert.so')
+LIB_PATH = os.path.join(LIB_DIR, '../build/src/libbert.so')
 
 # Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
-outnull_file = open(os.devnull, "w")
-errnull_file = open(os.devnull, "w")
+outnull_file = open(os.devnull, 'w')
+errnull_file = open(os.devnull, 'w')
 
 class suppress_stdout_stderr():
     # NOTE: these must be "saved" here to avoid exceptions when using
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
deleted file mode 100644
index e2f6d48..0000000
--- a/examples/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories(${CMAKE_SOURCE_DIR}/)
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE bert ggml)
-
-add_executable(basic basic.cpp)
-target_link_libraries(basic PRIVATE bert ggml)
diff --git a/examples/basic.cpp b/examples/basic.cpp
deleted file mode 100644
index 3203767..0000000
--- a/examples/basic.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#include <cstdio>
-#include <string>
-#include <vector>
-
-//
-// definitions
-//
-
-#define BASIC_MAX_NODES 128
-
-typedef std::vector<float> basic_input;
-typedef std::vector<basic_input> basic_batch;
-
-//
-// data structures
-//
-
-struct basic_hparams {
-    int32_t n_size = 256;
-};
-
-struct basic_model {
-    basic_hparams hparams;
-    struct ggml_tensor *weights;
-};
-
-struct basic_ctx {
-    basic_model model;
-
-    struct ggml_context * ctx_data;
-    std::vector<uint8_t> buf_compute_meta;
-
-    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
-    ggml_backend_t backend = NULL;
-    ggml_allocr * compute_alloc = NULL;
-};
-
-//
-// helper functions
-
-static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
-    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
-    if (!cur) {
-        throw printf("%s: unable to find tensor %s\n", __func__, name.c_str());
-    }
-
-    return cur;
-}
-
-static void tensor_stats(ggml_tensor * t) {
-    int32_t src0 = t->src[0] ? t->src[0]->backend : -1;
-    int32_t src1 = t->src[1] ? t->src[1]->backend : -1;
-    printf(
-        "type = %s, dims = %d, shape = (%d, %d, %d, %d), backend = %d, src0 = %d, src1 = %d\n",
-        ggml_type_name(t->type), ggml_n_dims(t), t->ne[0], t->ne[1], t->ne[2], t->ne[3], t->backend, src0, src1
-    );
-}
-
-//
-// model definition
-//
-
-ggml_cgraph * basic_build_graph(basic_ctx * ctx, basic_batch batch) {
-    const basic_model & model = ctx->model;
-    const basic_hparams & hparams = model.hparams;
-
-    // extract model params
-    const int n_size = hparams.n_size;
-    const int n_batch_size = batch.size();
-
-    // params for graph data
-    struct ggml_init_params params = {
-        /*.mem_size =*/ ctx->buf_compute_meta.size(),
-        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-        /*.no_alloc =*/ true,
-    };
-
-    // initialze computational graph
-    struct ggml_context * ctx_compute = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, BASIC_MAX_NODES, false);
-
-    // construct input tensors
-    struct ggml_tensor *input = ggml_new_tensor_2d(ctx_compute, GGML_TYPE_F32, n_size, n_batch_size);
-    ggml_set_name(input, "input");
-    ggml_allocr_alloc(ctx->compute_alloc, input);
-
-    // avoid writing input embeddings in memory measure mode
-    if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-        float * input_data = (float*)malloc(ggml_nbytes(input));
-        for (int ba = 0; ba < n_batch_size; ba++) {
-            for (int i = 0; i < n_size; i++) {
-                input_data[ba * n_size + i] = batch[ba][i];
-            }
-        }
-        ggml_backend_tensor_set(input, input_data, 0, ggml_nbytes(input));
-        free(input_data);
-    }
-
-    // the only computation
-    ggml_tensor * output = ggml_mul_mat(ctx_compute, input, model.weights); // [bs, ns] * [ns] -> [bs]
-
-    // build the graph
-    ggml_build_forward_expand(gf, output);
-
-    // free context
-    ggml_free(ctx_compute);
-
-    // return complete graph
-    return gf;
-}
-
-//
-// loading and setup
-//
-
-struct basic_ctx * basic_create_model() {
-    printf("%s: creating model\n", __func__);
-
-    // create context
-    basic_ctx * new_basic = new basic_ctx;
-    basic_model & model = new_basic->model;
-    basic_hparams & hparams = model.hparams;
-
-    // get hparams
-    const int32_t n_size = hparams.n_size;
-
-    // initialize advanced backend
-#ifdef GGML_USE_CUBLAS
-    new_basic->backend = ggml_backend_cuda_init(0);
-    if (!new_basic->backend) {
-        printf("%s: ggml_backend_cuda_init() failed\n", __func__);
-    } else {
-        printf("%s: BERT using CUDA backend\n", __func__);
-    }
-#endif
-
-    // fall back to CPU backend
-    if (!new_basic->backend) {
-        new_basic->backend = ggml_backend_cpu_init();
-        printf("%s: BERT using CPU backend\n", __func__);
-    }
-
-    // load tensors
-    {
-        struct ggml_init_params params = {
-            /*.mem_size =*/ 2 * ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc =*/ true,
-        };
-
-        new_basic->ctx_data = ggml_init(params);
-        if (!new_basic->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            free(new_basic);
-            return nullptr;
-        }
-
-        // add tensors to context
-        const char * name = "weights";
-        struct ggml_tensor * weights = ggml_new_tensor_1d(new_basic->ctx_data, GGML_TYPE_F32, n_size);
-        ggml_set_name(weights, name);
-        size_t weights_size = ggml_nbytes(weights);
-
-        // alloc memory and offload data
-        new_basic->params_buffer = ggml_backend_alloc_buffer(new_basic->backend, weights_size);
-        ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_basic->params_buffer);
-        ggml_allocr_alloc(alloc, weights);
-
-        // get local buffer
-        float * data;
-        std::vector<uint8_t> read_buf;
-        if (ggml_backend_buffer_is_host(new_basic->params_buffer)) {
-            data = (float*)weights->data;
-        } else {
-            read_buf.resize(weights_size);
-            data = reinterpret_cast<float*>(read_buf.data());
-        }
-
-        // fill in on host
-        for (int i = 0; i < n_size; i++) {
-            data[i] = 1.0;
-        }
-
-        // copy to device
-        if (!ggml_backend_buffer_is_host(new_basic->params_buffer)) {
-            ggml_backend_tensor_set(weights, data, 0, weights_size);
-        }
-
-        // free memory
-        ggml_allocr_free(alloc);
-    }
-
-    // use get_tensors to populate basic_model
-    model.weights = get_tensor(new_basic->ctx_data, "weights");
-
-    // allocate space for graph
-    {
-        // measure compute graph memory usage
-        new_basic->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_basic->compute_alloc = ggml_allocr_new_measure_from_backend(new_basic->backend);
-
-        // construct batch and compute graph
-        basic_input input(hparams.n_size);
-        basic_batch batch = {input, input};
-        ggml_cgraph * gf = basic_build_graph(new_basic, batch);
-
-        // get measurement results
-        size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_basic->compute_alloc, gf);
-        ggml_allocr_free(new_basic->compute_alloc);
-
-        // create real compute buffer and allocator
-        new_basic->compute_buffer = ggml_backend_alloc_buffer(new_basic->backend, compute_memory_buffer_size);
-        new_basic->compute_alloc = ggml_allocr_new_from_buffer(new_basic->compute_buffer);
-
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size / 1024.0 / 1024.0);
-    }
-
-    return new_basic;
-}
-
-void basic_free(basic_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
-    delete ctx;
-}
-
-//
-// model execution
-//
-
-void basic_forward_batch(basic_ctx * ctx, basic_batch batch, float * output) {
-    // reset alloc buffer to clean the memory from previous invocations
-    ggml_allocr_reset(ctx->compute_alloc);
-
-    // build the inference graph
-    ggml_cgraph * gf = basic_build_graph(ctx, batch);
-    ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
-
-    // compute the graph
-    ggml_backend_graph_compute(ctx->backend, gf);
-
-    // print graph info
-    printf("%s: compute done\n", __func__);
-    ggml_graph_print(gf);
-
-    // the last node is the embedding tensor
-    struct ggml_tensor * final = gf->nodes[gf->n_nodes - 1];
-    printf(
-        "%s: type = %s, ndim = %d, nelem = %d, nrows = %d\n",
-        __func__, ggml_type_name(final->type), ggml_n_dims(final), ggml_nelements(final), ggml_nrows(final)
-    );
-
-    // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(final, output, 0, ggml_nbytes(final));
-}
-
-float basic_forward_one(struct basic_ctx * ctx, basic_input input) {
-    basic_batch batch = {input};
-    float output;
-    basic_forward_batch(ctx, batch, &output);
-    return output;
-}
-
-int main(int argc, char ** argv) {
-    basic_input input(256);
-    for (int i = 0; i < 256; i++) {
-        input[i] = (float)i;
-    }
-    basic_batch batch = {input, input};
-    float output[2];
-
-    basic_ctx * ctx = basic_create_model();
-    basic_forward_batch(ctx, batch, output);
-
-    printf("output = %f %f\n", output[0], output[1]);
-    return 0;
-}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..6c47adc
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# bert.cpp main
+#
+
+# ggml includes
+include_directories(${CMAKE_SOURCE_DIR}/ggml/src ${CMAKE_SOURCE_DIR}/ggml/include/ggml)
+
+# bert library
+add_library(bert bert.cpp bert.h)
+target_compile_features(bert PUBLIC cxx_std_20)
+
+# for shared libraries
+set_target_properties(bert PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(bert PRIVATE ggml)
+
+# main entry
+add_executable(main main.cpp)
+target_link_libraries(main PRIVATE bert ggml)
+
+# quantization
+add_executable(quantize quantize.cpp)
+target_link_libraries(quantize PRIVATE bert ggml)
diff --git a/bert.cpp b/src/bert.cpp
similarity index 100%
rename from bert.cpp
rename to src/bert.cpp
diff --git a/bert.h b/src/bert.h
similarity index 100%
rename from bert.h
rename to src/bert.h
diff --git a/examples/main.cpp b/src/main.cpp
similarity index 100%
rename from examples/main.cpp
rename to src/main.cpp
diff --git a/quantize.cpp b/src/quantize.cpp
similarity index 100%
rename from quantize.cpp
rename to src/quantize.cpp
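Putting the pieces together, the reorganized tree keeps the C++ build under `src/` and the Python tooling under `bert_cpp/`. A minimal end-to-end sketch, assuming the `models/` directory exists and the shared library was built to `build/src/libbert.so` as `model.py` expects (model name and paths are examples only):
```python
# illustrative only: model name and output paths are examples
from bert_cpp import convert_hf, BertModel

# pull a HuggingFace checkpoint and write it to GGUF (assumes models/ exists)
convert_hf('BAAI/bge-base-en-v1.5', 'models/bge-base-en-v1.5-f32.gguf', float_type='f32')

# load it through the shared library built at build/src/libbert.so
mod = BertModel('models/bge-base-en-v1.5-f32.gguf')
emb = mod.embed(['hello world', 'goodbye world'])
print(emb.shape)
```
Quantized files produced by `build/bin/quantize` should load the same way.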