Skip to content

Commit

Permalink
Merge branch 'upstream' into concedo_experimental
Browse files Browse the repository at this point in the history
# Conflicts:
#	README.md
#	examples/server/README.md
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
  • Loading branch information
LostRuins committed Nov 14, 2024
2 parents bfa118e + 2a82891 commit df080b0
Show file tree
Hide file tree
Showing 41 changed files with 147,491 additions and 145,726 deletions.
2 changes: 1 addition & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ struct common_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
float defrag_thold = 0.1f; // KV cache defragmentation threshold

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
Expand Down
12 changes: 5 additions & 7 deletions examples/chat-persistent.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

CTX_SIZE=2048
Expand Down Expand Up @@ -129,15 +130,12 @@ while read -e line; do

printf ' '

# HACK get num tokens from debug message
# TODO get both messages in one go
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi

n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")

if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
Expand Down
26 changes: 24 additions & 2 deletions examples/convert_legacy_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,8 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "description" in base_model_entry:
self.gguf.add_base_model_description(key, base_model_entry["description"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
Expand All @@ -849,12 +851,32 @@ def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])

if metadata.datasets is not None:
self.gguf.add_dataset_count(len(metadata.datasets))
for key, dataset_entry in enumerate(metadata.datasets):
if "name" in dataset_entry:
self.gguf.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
self.gguf.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
self.gguf.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
self.gguf.add_dataset_organization(key, dataset_entry["organization"])
if "description" in dataset_entry:
self.gguf.add_dataset_description(key, dataset_entry["description"])
if "url" in dataset_entry:
self.gguf.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
self.gguf.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])

if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)

def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
Expand Down
106 changes: 97 additions & 9 deletions examples/server/public/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -200,23 +200,38 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<label class="input input-bordered flex items-center gap-2 mb-2">
<b>{{ key }}</b>
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<div class="collapse collapse-arrow bg-base-200 mb-2">
<input type="checkbox" />
<div class="collapse-title font-bold">Advanced config</div>
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
</template>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<label class="form-control mb-2">
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</div>
</details>
</div>

<!-- action buttons -->
Expand All @@ -229,6 +244,21 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
</dialog>
</div>

<!-- Template to be used by settings modal -->
<template id="settings-modal-numeric-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show the help message when hovering over the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>

<script src="./deps_markdown-it.js"></script>
<script type="module">
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
Expand All @@ -245,12 +275,48 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
systemMessage: 'You are a helpful assistant.',
// make sure these default values are in sync with `common.h`
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
// Help text for the settings, keyed by config key name.
// The app exposes a copy of this object as `configInfo`; the settings modal's
// numeric input looks up `configInfo[configKey]` and shows it when the label is
// hovered. A missing or empty entry falls back to '(no help message available)'.
const CONFIG_INFO = {
apiKey: '',
systemMessage: 'The starting message that defines how model should behave.',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
frequency_penalty: 'Limits tokens based on how often they appear in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of token per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
Expand All @@ -269,6 +335,12 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
{ props: ["source", "options", "plugins"] }
);

// Input field used by the settings modal for a single config value.
// The markup is taken from the <template id="settings-modal-numeric-input"> element on this page.
// Props:
//   configKey     - name of the setting; shown as the label and used as the lookup key
//   configDefault - object indexed by configKey to build the "Default: ..." placeholder
//   configInfo    - object indexed by configKey to fetch the hover help text
//   modelValue    - current value; the template emits 'update:modelValue' on input so the
//                   parent can bind it with v-model
const SettingsModalNumericInput = defineComponent({
template: document.getElementById('settings-modal-numeric-input').innerHTML,
props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
});

// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
Expand Down Expand Up @@ -359,6 +431,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalNumericInput,
},
data() {
return {
Expand All @@ -376,6 +449,7 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
// const
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
}
},
computed: {},
Expand Down Expand Up @@ -452,8 +526,22 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
stream: true,
cache_prompt: true,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
Expand Down
15 changes: 10 additions & 5 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -656,11 +656,16 @@ struct server_context {
}

bool validate_model_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};

const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);

return res > 0;
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
std::string template_key = "tokenizer.chat_template";
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res >= 0) {
llama_chat_message chat[] = {{"user", "test"}};
std::string tmpl = std::string(model_template.data(), model_template.size());
int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
return chat_res > 0;
}
return false;
}

void init() {
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/count-equal.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
const int64_t dne = GGML_PAD((ne + 4*nsm - 1) / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);

CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));

Expand Down
9 changes: 3 additions & 6 deletions ggml/src/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -596,17 +596,12 @@ @implementation GGMLMetalClass
ctx->kernels[i].pipeline = nil;
}

/*
GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(e, name, supported) \
if (supported) { \
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [device newComputePipelineStateWithFunction:metal_function error:&error]; \
GGML_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \
[metal_function release]; \
Expand Down Expand Up @@ -3046,6 +3041,8 @@ static void ggml_metal_encode_node(

bool use_vec_kernel = false;

// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
// avoided for now, mainly to keep the number of templates/kernels a bit lower
if (ne01 >= 4 || (ne00%128 != 0)) {
switch (src1->type) {
case GGML_TYPE_F16:
Expand Down
Loading

0 comments on commit df080b0

Please sign in to comment.