wip on qwen2vl integration, updated msvc runtimes
LostRuins committed Dec 15, 2024
1 parent f456ed7 commit 00d154b
Showing 7 changed files with 130 additions and 22 deletions.
147 changes: 125 additions & 22 deletions gpttype_adapter.cpp
@@ -542,9 +542,10 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
std::vector<int32_t *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast) {
kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) {
int32_t seq_id = 0;
pos.resize(n_tokens);
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
@@ -560,23 +561,39 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;

if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
}
kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool return_all_logits) {
kcpp_embd_batch(std::vector<llama_token> & tokens, int32_t npast, bool use_mrope, bool return_all_logits) {
int32_t seq_id = 0;
int32_t n_tokens = tokens.size();
pos.resize(n_tokens);
pos.resize(n_tokens * (use_mrope?4:1));
std::fill(pos.begin(), pos.end(), 0);
n_seq_id.resize(n_tokens);
seq_ids.resize(n_tokens + 1);
logits.resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
seq_ids[n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens.data(),
@@ -586,11 +603,26 @@ struct kcpp_embd_batch { //duplicated from llava_embd_batch
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);

if(!use_mrope)
{
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = npast + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
}
else
{
for (int i = 0; i < n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = (return_all_logits?true:false);
}
for (int j = 0; j < batch.n_tokens * 3; j++) {
batch.pos[j] = npast + (j % batch.n_tokens);
}
}
batch.logits[n_tokens - 1] = true;
}
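
Illustrative sketch (not part of the commit): how the flat M-RoPE position buffer built by the constructors above is laid out for plain text tokens when use_mrope is true. The buffer holds 4 * n_tokens entries; the first three sections (temporal, height, width) repeat the same sequential position for text, and the fourth section stays zero. The values n_tokens = 4 and npast = 10 below are arbitrary example inputs.

// Standalone C++ illustration of the position layout used by the mrope branch above.
#include <cstdio>
#include <cstdint>
#include <vector>

int main() {
    const int32_t n_tokens = 4;   // hypothetical batch size
    const int32_t npast    = 10;  // hypothetical tokens already in the KV cache
    std::vector<int32_t> pos(n_tokens * 4, 0);   // mirrors pos.resize(n_tokens * (use_mrope?4:1))

    // Same fill pattern as the constructor: only the first 3*n_tokens entries are set.
    for (int j = 0; j < n_tokens * 3; j++) {
        pos[j] = npast + (j % n_tokens);
    }

    // Prints three identical sections (10 11 12 13) and a zeroed fourth section.
    for (int s = 0; s < 4; s++) {
        printf("section %d:", s);
        for (int i = 0; i < n_tokens; i++) {
            printf(" %d", pos[s * n_tokens + i]);
        }
        printf("\n");
    }
    return 0;
}
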
@@ -687,7 +719,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
drafted_ids.push_back(embd[0]);
for(int i=0;i<speculative_chunk_amt;++i)
{
kcpp_embd_batch batch1 = kcpp_embd_batch(temp_embd, draft_npast, false);
kcpp_embd_batch batch1 = kcpp_embd_batch(temp_embd, draft_npast, false, false);
auto draftok = (llama_decode(draft_ctx, batch1.batch)==0);
if(!draftok)
{
@@ -706,7 +738,8 @@

std::vector<int> real_embd = drafted_ids;
real_embd.pop_back();
kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, true);
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true);
auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
if(!draftok)
{
@@ -1754,14 +1787,15 @@ static void load_grammar(const std::string & gammarstr)

static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);

for (int i = 0; i < num_img_tokens; i += n_batch) {
int n_eval = num_img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
float * embd = img_embd+i*n_embd;
kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past);
kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope);
if (llama_decode(ctx_llama, llava_batch.batch)) {
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
return false;
@@ -1770,6 +1804,70 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num
}
return true;
}
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, float * image_embd, int num_img_tokens,
int n_batch, int * n_past) {
auto image_size = clip_get_load_image_size(clp_ctx);
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
auto img_tokens = num_img_tokens;
// llama_pos mrope_pos[img_tokens * 4];
std::vector<llama_pos> mrope_pos;
mrope_pos.resize(img_tokens * 4);

int st_pos_id = *n_past;

for (int y = 0; y < ph; y++)
{
for (int x = 0; x < pw; x++)
{
int i = y * pw + x;
mrope_pos[i] = st_pos_id;
mrope_pos[i + img_tokens] = st_pos_id + y;
mrope_pos[i + img_tokens * 2] = st_pos_id + x;
mrope_pos[i + img_tokens * 3] = 0;
}
}
st_pos_id += std::max(pw, ph);

int processed = 0;
std::vector<llama_pos> batch_mrope_pos;
batch_mrope_pos.resize(img_tokens * 4);

for (int i = 0; i < img_tokens; i += n_batch) {
int n_eval = img_tokens - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}

// llama_pos batch_mrope_pos[n_eval * 4];
std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embd+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};

if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "\n%s : failed to eval image\n", __func__);
return false;
}
*n_past += n_eval;
processed += n_eval;
}
return true;
}
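
Illustrative sketch (not part of the commit): the per-patch M-RoPE coordinates that qwen2vl_eval_image_embed computes above, worked through for a tiny 2x3 patch grid. Each image token i stores four values in separate sections of the buffer: a shared temporal position, its row, its column, and an unused zero. The grid size and starting position below are made-up example values, not derived from a real image.

// Standalone C++ illustration of the M-RoPE position grid for image patches.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int ph = 2, pw = 3;           // hypothetical patch grid (rows x cols)
    const int img_tokens = ph * pw;
    int st_pos_id = 5;                  // hypothetical value of *n_past
    std::vector<int> mrope_pos(img_tokens * 4, 0);

    // Same assignment as the loop above: temporal / row / column / unused.
    for (int y = 0; y < ph; y++) {
        for (int x = 0; x < pw; x++) {
            int i = y * pw + x;
            mrope_pos[i]                  = st_pos_id;      // temporal: same for all patches
            mrope_pos[i + img_tokens]     = st_pos_id + y;  // height coordinate
            mrope_pos[i + img_tokens * 2] = st_pos_id + x;  // width coordinate
            mrope_pos[i + img_tokens * 3] = 0;              // fourth section unused
        }
    }
    st_pos_id += std::max(pw, ph);      // first text position after the image: 8

    for (int i = 0; i < img_tokens; i++) {
        printf("patch %d: t=%d y=%d x=%d\n",
               i, mrope_pos[i], mrope_pos[i + img_tokens], mrope_pos[i + img_tokens * 2]);
    }
    printf("next text position: %d\n", st_pos_id);
    return 0;
}
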


//given an old GGUF context and a new context that has some middle portion removed,
//find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action
@@ -2160,11 +2258,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
}
ggml_cuda_set_mul_mat_q(inputs.use_mmq);
if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_data->flash_attn)
#endif
if((file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 || file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL) && !kcpp_data->flash_attn)
{
printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n");
printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n");
}
if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL)
{
printf("Qwen2VL detected! Mrope will be used!\n");
}
#endif
model_params.main_gpu = cu_parseinfo_maindevice;

#if defined(GGML_USE_CUDA)
@@ -3423,7 +3525,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
if(embd.size()!=1 || draft_ctx==nullptr || remaining_tokens<=speculative_chunk_amt || grammar!=nullptr || startedsampling==false) //for large batch, or if no draft model, PP/TG as usual
{
draft_used = false;
kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, false);
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, use_mrope, false);
evalres = (llama_decode(llama_ctx_v4, batch.batch)==0);
if(draft_ctx)
{
4 changes: 4 additions & 0 deletions model_adapter.cpp
@@ -315,6 +315,10 @@ void print_tok_vec(std::vector<float> &embd)
{
fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2;
}
else if(modelarch=="qwen2vl")
{
fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2VL;
}
else if(modelarch=="rwkv6")
{
fileformatmeta->model_architecture = GGUFArch::ARCH_RWKV;
1 change: 1 addition & 0 deletions model_adapter.h
@@ -59,6 +59,7 @@ enum GGUFArch
ARCH_SOLAR = 4,
ARCH_QWEN2 = 5,
ARCH_RWKV = 6,
ARCH_QWEN2VL = 7,
};

struct FileFormatExtraMeta
Binary file modified msvcp140.dll
Binary file modified msvcp140_codecvt_ids.dll
Binary file modified vcruntime140.dll
Binary file modified vcruntime140_1.dll
