feat: sync llama.cpp #71

Merged · 3 commits · Jul 27, 2024
android/src/main/CMakeLists.txt (3 additions, 0 deletions)

@@ -21,6 +21,9 @@ set(
${RNLLAMA_LIB_DIR}/unicode-data.cpp
${RNLLAMA_LIB_DIR}/unicode.cpp
${RNLLAMA_LIB_DIR}/llama.cpp
${RNLLAMA_LIB_DIR}/llama-vocab.cpp
${RNLLAMA_LIB_DIR}/llama-sampling.cpp
${RNLLAMA_LIB_DIR}/llama-grammar.cpp
${RNLLAMA_LIB_DIR}/sgemm.cpp
${RNLLAMA_LIB_DIR}/ggml-aarch64.c
${RNLLAMA_LIB_DIR}/rn-llama.hpp
android/src/main/java/com/rnllama/LlamaContext.java (0 additions, 3 deletions)

@@ -57,8 +57,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
params.hasKey("lora") ? params.getString("lora") : "",
// float lora_scaled,
params.hasKey("lora_scaled") ? (float) params.getDouble("lora_scaled") : 1.0f,
// String lora_base,
params.hasKey("lora_base") ? params.getString("lora_base") : "",
// float rope_freq_base,
params.hasKey("rope_freq_base") ? (float) params.getDouble("rope_freq_base") : 0.0f,
// float rope_freq_scale
@@ -312,7 +310,6 @@ protected static native long initContext(
boolean use_mmap,
String lora,
float lora_scaled,
String lora_base,
float rope_freq_base,
float rope_freq_scale
);
android/src/main/jni.cpp (0 additions, 4 deletions)

@@ -131,7 +131,6 @@ Java_com_rnllama_LlamaContext_initContext(
jboolean use_mmap,
jstring lora_str,
jfloat lora_scaled,
jstring lora_base_str,
jfloat rope_freq_base,
jfloat rope_freq_scale
) {
@@ -158,10 +157,8 @@ Java_com_rnllama_LlamaContext_initContext(
defaultParams.use_mmap = use_mmap;

const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
if (lora_chars != nullptr && lora_chars[0] != '\0') {
defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
defaultParams.lora_base = lora_base_chars;
defaultParams.use_mmap = false;
}

@@ -180,7 +177,6 @@ Java_com_rnllama_LlamaContext_initContext(

env->ReleaseStringUTFChars(model_path_str, model_path_chars);
env->ReleaseStringUTFChars(lora_str, lora_chars);
env->ReleaseStringUTFChars(lora_base_str, lora_base_chars);

return reinterpret_cast<jlong>(llama->ctx);
}
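Taken together, the Java and JNI changes drop the lora_base argument end to end: the native initContext now receives only an adapter path and its scale. A condensed sketch of what remains of the LoRA handling in the JNI entry point (limited to what the diff shows; anything else is illustrative only):

    // Condensed from jni.cpp after this change: one optional adapter, no base model path.
    const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
    if (lora_chars != nullptr && lora_chars[0] != '\0') {
        // lora_adapter holds (path, scale) pairs; the separate lora_base field no longer exists.
        defaultParams.lora_adapter.push_back({lora_chars, lora_scaled});
        defaultParams.use_mmap = false;   // mmap is turned off whenever an adapter is supplied
    }
    // ... model and context initialization ...
    env->ReleaseStringUTFChars(lora_str, lora_chars);   // the only LoRA string left to release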
cpp/common.cpp (11 additions, 10 deletions)

@@ -700,11 +700,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
return true;
}
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
CHECK_ARG
params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1280,6 +1275,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1589,9 +1585,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@@ -1682,6 +1677,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });

options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });

printf("usage: %s [options]\n", argv[0]);

for (const auto & o : options) {
@@ -2727,7 +2729,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3172,7 +3174,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
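The llama_chat_format_single hunk above fixes a subtle issue: some chat templates cannot render an empty message list, so the past-conversation prefix is now only formatted when there actually is a past conversation. A rough, self-contained sketch of the idea (simplified types; the fmt callable stands in for the llama_chat_apply_template wrapper in common.cpp):

    #include <functional>
    #include <string>
    #include <vector>

    struct chat_msg { std::string role, content; };   // stand-in for llama_chat_msg

    // Returns only the text contributed by new_msg, by rendering the conversation
    // with and without it and taking the suffix.
    std::string format_single_sketch(
            const std::function<std::string(const std::vector<chat_msg> &, bool)> & fmt,
            const std::vector<chat_msg> & past,
            const chat_msg & new_msg,
            bool add_ass) {
        // The fix in this PR: skip the template call entirely for an empty history.
        std::string fmt_past = past.empty() ? "" : fmt(past, /*add_ass=*/false);

        std::vector<chat_msg> all = past;
        all.push_back(new_msg);
        std::string fmt_all = fmt(all, add_ass);

        // Everything after the already-rendered prefix belongs to new_msg.
        return fmt_all.substr(fmt_past.size());
    }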
cpp/common.h (2 additions, 1 deletion)

@@ -139,7 +139,6 @@ struct gpt_params {

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter

std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -266,6 +265,8 @@ struct gpt_params {
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

bool spm_infill = false; // suffix/prefix/middle pattern for infill

std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};

void gpt_params_handle_hf_token(gpt_params & params);
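With lora_base removed from gpt_params, LoRA configuration reduces to the lora_adapter list of (path, scale) tuples, plus the new lora_outfile default consumed by export-lora. A small, hypothetical illustration of how that tuple list is walked (the std::get accesses mirror the yaml_dump hunk above; the helper name is made up):

    #include <cstdio>
    #include <string>
    #include <tuple>
    #include <vector>

    // Each entry pairs an adapter path with its user-defined scale.
    void print_lora_adapters(const std::vector<std::tuple<std::string, float>> & lora_adapter) {
        for (const auto & la : lora_adapter) {
            std::printf("  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
        }
    }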
cpp/ggml-aarch64.c (6 additions, 6 deletions)

@@ -392,7 +392,7 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -501,7 +501,7 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -613,7 +613,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved);
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
UNUSED(ncols_interleaved);
UNUSED(blocklen);

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;
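Every ARM fast-path guard in this file gains the same clause: && ! ((defined(_MSC_VER)) && ! defined(__clang__)). The effect is to keep these hand-written NEON/SVE kernels out of builds made with the Microsoft compiler proper while still allowing clang-cl, which defines _MSC_VER alongside __clang__; presumably because the kernels rely on GCC-style inline assembly that cl.exe does not accept on ARM64. A compressed illustration of the predicate (the RN_MSVC_ONLY macro name is made up for readability):

    // True only for MSVC proper (cl.exe); clang-cl defines both _MSC_VER and __clang__.
    #if defined(_MSC_VER) && !defined(__clang__)
    #define RN_MSVC_ONLY 1
    #else
    #define RN_MSVC_ONLY 0
    #endif

    // The diff spells this condition out inline on each guard, e.g.:
    #if defined(__ARM_NEON) && defined(__aarch64__) && !RN_MSVC_ONLY
    // ... optimized GEMV/GEMM kernel ...
    #endif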
cpp/ggml-alloc.c (18 additions, 24 deletions)

@@ -91,8 +91,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
LM_GGML_ASSERT(!"not enough space in the buffer");
return;
LM_GGML_ABORT("not enough space in the buffer");
}

void * addr = (char *)lm_ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offs
return;
}
}
LM_GGML_ASSERT(!"out of allocated_tensors");
LM_GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t offset, const struct lm_ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct lm_ggml_dyn_tallocr * alloc, size_t o
return;
}
}
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
LM_GGML_ASSERT(!"tensor not found");
LM_GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif

@@ -176,8 +174,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
LM_GGML_ASSERT(!"not enough space in the buffer");
LM_GGML_UNREACHABLE();
LM_GGML_ABORT("not enough space in the buffer");
}
}

@@ -443,7 +440,7 @@ void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) {
}
}

free(galloc->hash_set.keys);
lm_ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffers);
@@ -456,7 +453,7 @@ void lm_ggml_gallocr_free(lm_ggml_gallocr_t galloc) {
typedef struct lm_ggml_gallocr * lm_ggml_gallocr_t;

static struct hash_node * lm_ggml_gallocr_hash_get(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) {
size_t i = lm_ggml_hash_find_or_insert(galloc->hash_set, t);
size_t i = lm_ggml_hash_find_or_insert(&galloc->hash_set, t);
return &galloc->hash_values[i];
}

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

static void lm_ggml_gallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct lm_ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
lm_ggml_hash_set_reset(&galloc->hash_set);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void lm_ggml_gallocr_alloc_graph_impl(lm_ggml_gallocr_t galloc, struct lm
}

bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size;
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;

// initialize hash table
if (galloc->hash_set.size < hash_size) {
free(galloc->hash_set.keys);
free(galloc->hash_values);
galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(hash_size, sizeof(struct lm_ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
if (galloc->hash_set.size < min_hash_size) {
lm_ggml_hash_set_free(&galloc->hash_set);
galloc->hash_set = lm_ggml_hash_set_new(min_hash_size);
LM_GGML_ASSERT(galloc->hash_set.keys != NULL);

free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
LM_GGML_ASSERT(galloc->hash_values != NULL);
} else {
// reset hash table
memset(galloc->hash_set.keys, 0, sizeof(struct lm_ggml_tensor *) * galloc->hash_set.size);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}

// reset allocators
@@ -817,8 +812,7 @@ static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml
}

static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct tensor_alloc * talloc) {
lm_ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(buft, node);
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}

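The allocator hunks above replace direct manipulation of hash_set.keys with the ggml hash-set helpers (lm_ggml_hash_set_new, lm_ggml_hash_set_free, lm_ggml_hash_set_reset, lm_ggml_hash_set_find_or_insert) and size the table from the graph's node and leaf counts with 25% slack. A rough sketch of the reserve/reset flow, condensed from the hunks themselves (not a drop-in replacement):

    // In lm_ggml_gallocr_reserve_n: grow the hash set only when the graph needs it.
    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
    min_hash_size += min_hash_size / 4;                      // 25% margin against collisions

    if (galloc->hash_set.size < min_hash_size) {
        lm_ggml_hash_set_free(&galloc->hash_set);
        galloc->hash_set = lm_ggml_hash_set_new(min_hash_size);
        LM_GGML_ASSERT(galloc->hash_set.keys != NULL);

        free(galloc->hash_values);                           // keep the parallel array in sync
        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
        LM_GGML_ASSERT(galloc->hash_values != NULL);
    }

    // In lm_ggml_gallocr_alloc_graph_impl: clear both tables before each allocation pass.
    lm_ggml_hash_set_reset(&galloc->hash_set);
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);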