feat: sync llama.cpp #54

Merged 1 commit on May 4, 2024
377 changes: 208 additions & 169 deletions cpp/common.cpp

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions cpp/common.h
@@ -31,6 +31,8 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
@@ -103,7 +105,7 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -144,7 +146,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-bool kl_divergence = false; // compute KL-divergence
+bool kl_divergence = false; // compute KL divergence

bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@@ -159,6 +161,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
+bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -172,15 +175,20 @@ struct gpt_params {
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
+bool check_tensors = false; // validate tensor data

std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

// multimodal models (see examples/llava)
-std::string mmproj = ""; // path to multimodal projector
-std::string image = ""; // path to an image file
+std::string mmproj = ""; // path to multimodal projector
+std::vector<std::string> image; // path to image file(s)
};

+void gpt_params_handle_model_default(gpt_params & params);

+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -204,6 +212,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

//
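The model path default moves out of the struct initializer: `model` is now empty by default, and the header instead exposes a `DEFAULT_MODEL_PATH` macro plus a new `gpt_params_handle_model_default()` declaration. The helper's body lives in cpp/common.cpp, which is not rendered in this diff, so the following is only a rough sketch of how such a fallback could look; `gpt_params_stub` is a hypothetical stand-in for the real struct and the exact conditions are assumed, not taken from the diff.

#include <string>

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct gpt_params_stub {       // hypothetical stand-in for gpt_params
    std::string model;         // "" by default, as in the new header
    std::string model_url;     // remote model to download, if any
};

// Assumed behavior: only fall back to the hard-coded path when the caller
// supplied neither a local model path nor a download URL.
void gpt_params_handle_model_default(gpt_params_stub & params) {
    if (params.model.empty() && params.model_url.empty()) {
        params.model = DEFAULT_MODEL_PATH;
    }
}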
12 changes: 7 additions & 5 deletions cpp/ggml-backend.c
@@ -1784,12 +1784,14 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) {

void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) {
    // reset state for the next run
-   size_t hash_size = sched->hash_set.size;
-   memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-   memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-   memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+   if (!sched->is_reset) {
+       size_t hash_size = sched->hash_set.size;
+       memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+       memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+       memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

-   sched->is_reset = true;
+       sched->is_reset = true;
+   }
    sched->is_alloc = false;
}

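The reset path now checks the `is_reset` flag before zeroing the scheduler's hash table and tensor bookkeeping, so back-to-back resets pay for the memsets only once. A small illustrative caller follows; the wrapper name and the include are assumptions, while `lm_ggml_backend_sched_reset` and `lm_ggml_backend_sched_t` are the real identifiers from the diff.

#include "ggml-backend.h" // assumed header exposing lm_ggml_backend_sched_t

// Hypothetical helper showing the effect of the new guard.
static void reset_twice(lm_ggml_backend_sched_t sched) {
    lm_ggml_backend_sched_reset(sched); // clears state, sets is_reset = true
    lm_ggml_backend_sched_reset(sched); // memsets skipped; only is_alloc is cleared again
}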
8 changes: 7 additions & 1 deletion cpp/ggml-impl.h
@@ -11,6 +11,12 @@
#include <string.h> // memcpy
#include <math.h> // fabsf

+#undef MIN
+#undef MAX

+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))

#ifdef __cplusplus
extern "C" {
#endif
@@ -307,7 +313,7 @@ inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t

#endif // defined(__ARM_NEON)

-#if defined(__ARM_NEON) && !defined(__MSC_VER)
+#if defined(__ARM_NEON) && !defined(_MSC_VER)

#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
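Two small fixes land in cpp/ggml-impl.h: MIN/MAX are now defined here after an explicit #undef, and the NEON guard's `__MSC_VER` typo becomes `_MSC_VER`, the macro MSVC actually defines, so MSVC builds are genuinely excluded from the GCC-style fp16 conversion macros. A quick standalone sanity check of the helpers (not from the diff):

#include <assert.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
    // Classic function-like macros: arguments are evaluated more than once,
    // so avoid side effects such as MIN(i++, n).
    assert(MIN(3, 7) == 3);
    assert(MAX(3, 7) == 7);
    return 0;
}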