wip on qwen2vl integration, updated msvc runtimes
LostRuins committed Dec 15, 2024
1 parent f456ed7 commit 00d154b
Showing 7 changed files with 130 additions and 22 deletions.
147 changes: 125 additions & 22 deletions gpttype_adapter.cpp
@@ -542,9 +542,10 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
std::vector<int32_t *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast) {
kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) {
int32_t seq_id = 0;
pos.resize(n_tokens);
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
@@ -560,23 +561,39 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;

if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
}
kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool return_all_logits) {
kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits) {
int32_t seq_id = 0;
int32_t n_tokens = tokens.size();
pos.resize(n_tokens);
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
seq_ids[n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens.data(),
@@ -586,11 +603,26 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);

if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
batch.logits[n_tokens - 1] = true;
}
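
Illustrative sketch (not part of the commit): how the flat M-RoPE position buffer built by the constructors above is laid out for plain text tokens when use_mrope is true. The buffer holds 4 * n_tokens entries; the first three sections (temporal, height, width) repeat the same sequential position for text, and the fourth section stays zero. The values n_tokens = 4 and npast = 10 below are arbitrary example inputs.

// Standalone C++ illustration of the position layout used by the mrope branch above.
#include <cstdio>
#include <cstdint>
#include <vector>

int main() {
    const int32_t n_tokens = 4;   // hypothetical batch size
    const int32_t npast    = 10;  // hypothetical tokens already in the KV cache
    std::vector<int32_t> pos(n_tokens * 4, 0);   // mirrors pos.resize(n_tokens * (use_mrope?4:1))

    // Same fill pattern as the constructor: only the first 3*n_tokens entries are set.
    for (int j = 0; j < n_tokens * 3; j++) {
        pos[j] = npast + (j % n_tokens);
    }

    // Prints three identical sections (10 11 12 13) and a zeroed fourth section.
    for (int s = 0; s < 4; s++) {
        printf("section %d:", s);
        for (int i = 0; i < n_tokens; i++) {
            printf(" %d", pos[s * n_tokens + i]);
        }
        printf("\n");
    }
    return 0;
}
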
@@ -687,7 +719,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
drafted_ids.push_back(embd[0]);
for(int i=0;i<speculative_chunk_amt;++i)
{
kcpp_embd_batch batch1 = kcpp_embd_batch(temp_embd, draft_npast, false);
kcpp_embd_batch batch1 = kcpp_embd_batch(temp_embd, draft_npast, false, false);
auto draftok = (llama_decode(draft_ctx, batch1.batch)==0);
if(!draftok)
{
@@ -706,7 +738,8 @@

std::vector<int> real_embd = drafted_ids;
real_embd.pop_back();
kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, true);
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true);
auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
if(!draftok)
{
@@ -1754,14 +1787,15 @@ static void load_grammar(const std::string & gammarstr)

static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);

for (int i = 0; i < num_img_tokens; i += n_batch) {
int n_eval = num_img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
float * embd = img_embd+i*n_embd;
kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past);
kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope);
if (llama_decode(ctx_llama, llava_batch.batch)) {
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
return false;
@@ -1770,6 +1804,70 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
}
return true;
}
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, float * image_embd, int num_img_tokens,
int n_batch, int * n_past) {
auto image_size = clip_get_load_image_size(clp_ctx);
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
auto img_tokens = num_img_tokens;
// llama_pos mrope_pos[img_tokens * 4];
std::vector<llama_pos> mrope_pos;
mrope_pos.resize(img_tokens * 4);

int st_pos_id = *n_past;

for (int y = 0; y < ph; y++)
{
for (int x = 0; x < pw; x++)
{
int i = y * pw + x;
mrope_pos[i] = st_pos_id;
mrope_pos[i + img_tokens] = st_pos_id + y;
mrope_pos[i + img_tokens * 2] = st_pos_id + x;
mrope_pos[i + img_tokens * 3] = 0;
}
}
st_pos_id += std::max(pw, ph);

int processed = 0;
std::vector<llama_pos> batch_mrope_pos;
batch_mrope_pos.resize(img_tokens * 4);

for (int i = 0; i < img_tokens; i += n_batch) {
int n_eval = img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}

// llama_pos batch_mrope_pos[n_eval * 4];
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embd+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};

if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
return false;
}
*n_past += n_eval;
processed += n_eval;
}
return true;
}
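
Illustrative sketch (not part of the commit): the per-patch M-RoPE coordinates that qwen2vl_eval_image_embed computes above, worked through for a tiny 2x3 patch grid. Each image token i stores four values in separate sections of the buffer: a shared temporal position, its row, its column, and an unused zero. The grid size and starting position below are made-up example values, not derived from a real image.

// Standalone C++ illustration of the M-RoPE position grid for image patches.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int ph = 2, pw = 3;           // hypothetical patch grid (rows x cols)
    const int img_tokens = ph * pw;
    int st_pos_id = 5;                  // hypothetical value of *n_past
    std::vector<int> mrope_pos(img_tokens * 4, 0);

    // Same assignment as the loop above: temporal / row / column / unused.
    for (int y = 0; y < ph; y++) {
        for (int x = 0; x < pw; x++) {
            int i = y * pw + x;
            mrope_pos[i]                  = st_pos_id;      // temporal: same for all patches
            mrope_pos[i + img_tokens]     = st_pos_id + y;  // height coordinate
            mrope_pos[i + img_tokens * 2] = st_pos_id + x;  // width coordinate
            mrope_pos[i + img_tokens * 3] = 0;              // fourth section unused
        }
    }
    st_pos_id += std::max(pw, ph);      // first text position after the image: 8

    for (int i = 0; i < img_tokens; i++) {
        printf("patch %d: t=%d y=%d x=%d\n",
               i, mrope_pos[i], mrope_pos[i + img_tokens], mrope_pos[i + img_tokens * 2]);
    }
    printf("next text position: %d\n", st_pos_id);
    return 0;
}
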


//given an old GGUF context and a new context that has some middle portion removed,
//find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
@@ -2160,11 +2258,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
}
ggml_cuda_set_mul_mat_q(inputs.use_mmq);
if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_data->flash_attn)
#endif
if((file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 || file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL) && !kcpp_data->flash_attn)
{
printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n");
printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n");
}
if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
{
printf("Qwen2VL detected! Mrope will be used!\n");
}
#endif
model_params.main_gpu = cu_parseinfo_maindevice;

#if defined(GGML_USE_CUDA)
@@ -3423,7 +3525,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
if(embd.size()!=1 || draft_ctx==nullptr || remaining_tokens<=speculative_chunk_amt || grammar!=nullptr || startedsampling==false) //for large batch, or if no draft model, PP/TG as usual
{
draft_used = false;
kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, false);
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, use_mrope, false);
evalres = (llama_decode(llama_ctx_v4, batch.batch)==0);
if(draft_ctx)
{
4 changes: 4 additions & 0 deletions model_adapter.cpp
@@ -315,6 +315,10 @@ void print_tok_vec(std::vector<float> &embd)
{
fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2;
}
else if(modelarch=="qwen2vl")
{
fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2VL;
}
else if(modelarch=="rwkv6")
{
fileformatmeta->model_architecture = GGUFArch::ARCH_RWKV;
1 change: 1 addition & 0 deletions model_adapter.h
@@ -59,6 +59,7 @@ enum GGUFArch
ARCH_SOLAR = 4,
ARCH_QWEN2 = 5,
ARCH_RWKV = 6,
ARCH_QWEN2VL = 7,
};

struct FileFormatExtraMeta
Binary file modified msvcp140.dll
Binary file modified msvcp140_codecvt_ids.dll
Binary file modified vcruntime140.dll
Binary file modified vcruntime140_1.dll
