Skip to content

Commit

Permalink
Reuse query_batch across queries to reduce frequent memory allocation
Browse files: browse the repository at this point in the history
  • Loading branch information
gtygo committed Aug 9, 2024
1 parent fe6dc61 commit 88105b7
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions examples/retrieval/retrieval.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -253,20 +253,22 @@ int main(int argc, char ** argv) {
chunks[i].tokens.clear();
}

struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);

// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
printf("Enter query: ");
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);

struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0);

std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

llama_batch_free(query_batch);

llama_batch_clear(query_batch);

// compute cosine similarities
{
Expand All @@ -293,6 +295,7 @@ int main(int argc, char ** argv) {
}

// clean up
llama_batch_free(query_batch);
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
Expand Down

0 comments on commit 88105b7

Please sign in to comment.