Merge pull request #94 from emt0re0/stablelm-2-1.6b
feat: Stablelm 2 1.6b support
Showing 11 changed files with 100,471 additions and 14 deletions.
New file (68 lines): StableLM 2 chat demo (main).
#include <iostream>
#include "cmdline.h"
#include "models/stablelm/modeling_stablelm.hpp"
#include "models/stablelm/tokenization_stablelm.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/stablelm_vocab.mllm");
    cmdParser.add<string>("merge", 'm', "specify mllm merge file path", false, "../vocab/stablelm_merges.txt");
    cmdParser.add<string>("model", 'o', "specify mllm model path", false, "../models/stablelm-2-1.6b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);

    string vocab_path = cmdParser.get<string>("vocab");
    string merge_path = cmdParser.get<string>("merge");
    string model_path = cmdParser.get<string>("model");
    int tokens_limit = cmdParser.get<int>("limits");
    CPUBackend::cpu_threads = cmdParser.get<int>("thread");

    auto tokenizer = StableLMTokenizer(vocab_path, merge_path);

    // ChatML-style template wrapped around each user turn.
    string system_prompt_start = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n";
    string system_prompt_end = "<|im_end|>\n<|im_start|>assistant\n";

    StableLMConfig config(tokens_limit, "1.6B", HFHUBROPE);
    auto model = StableLMModel(config);
    model.load(model_path);

    vector<string> in_strs = {
        " Hello, who are you?",
        " What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};

    for (size_t i = 0; i < in_strs.size(); ++i) {
        const auto &in_str_origin = in_strs[i];
        auto in_str = system_prompt_start + in_str_origin + system_prompt_end;
        std::cout << "[Q] " << in_str_origin << std::endl;
        auto input_tensor = tokenizer.tokenize(in_str, i);
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto outputs = tokenizer.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
            if (out_token == 100278) { // end-of-turn token: stop generating
                break;
            }
            // Strip byte-level BPE markers: "Ċ" (newline) and "Ġ" (leading
            // space) are two bytes each in UTF-8, hence the replace length of 2.
            size_t pos = 0;
            while ((pos = out_string.find("Ċ", pos)) != std::string::npos) {
                out_string.replace(pos, 2, " ");
            }
            pos = 0;
            while ((pos = out_string.find("Ġ", pos)) != std::string::npos) {
                out_string.replace(pos, 2, " ");
            }
            std::cout << out_string << std::flush;
            // Append the sampled token to the input for the next decode step.
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
    }
    return 0;
}
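As an aside, the two marker-stripping loops above could be factored into a small helper; this is a hypothetical refactor sketch, not code from the PR (stripBPEMarkers is an invented name):

#include <initializer_list>
#include <string>

// Replace byte-level BPE markers with spaces. "Ċ" (the newline marker)
// and "Ġ" (the word-boundary marker) each occupy two UTF-8 bytes, hence
// the fixed replace length of 2.
static std::string stripBPEMarkers(std::string s) {
    for (const char *marker : {"Ċ", "Ġ"}) {
        size_t pos = 0;
        while ((pos = s.find(marker, pos)) != std::string::npos) {
            s.replace(pos, 2, " ");
        }
    }
    return s;
}

Once built, the demo takes the flags registered above, e.g. ./demo_stablelm -o ../models/stablelm-2-1.6b-chat-q4_k.mllm -l 400 -t 4 (the binary name is illustrative; the vocab and merge paths fall back to their defaults).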
New file (69 lines): StableLM model configuration header.
#ifndef CONFIG_STABLELM_HPP
#define CONFIG_STABLELM_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

// Weight-name layout for StableLM checkpoints. The HFHUBROPE branch
// mirrors the Hugging Face Hub naming scheme.
class stablelmNameConfig : public TransformerNameConfig {
public:
    std::string blk_name;
    std::string token_embd_name;
    std::string post_norm_name;
    std::string lm_head_name;
    std::string _gate_proj_name;

    void init(RoPEType type = HFHUBROPE) {
        switch (type) {
        case HFHUBROPE: {
            blk_name = "model.layers.";
            _attn_base_name = "self_attn.";
            _ffn_base_name = "mlp.";
            _q_proj_name = "q_proj";
            _k_proj_name = "k_proj";
            _v_proj_name = "v_proj";
            _o_proj_name = "o_proj";
            _gate_proj_name = "gate_proj";
            _up_proj_name = "up_proj";
            _down_proj_name = "down_proj";
            _attn_norm_name = "input_layernorm";
            _ffn_norm_name = "post_attention_layernorm";
            token_embd_name = "model.embed_tokens";
            post_norm_name = "model.norm";
            lm_head_name = "lm_head";
            break;
        }
        default: {
            throw std::runtime_error("Unsupported StableLM RoPE type");
        }
        }
    }
};
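With this HFHUBROPE mapping, the fragments compose into Hugging Face-style weight keys; for example, layer 0's query projection resolves to blk_name + "0." + _attn_base_name + _q_proj_name, i.e. model.layers.0.self_attn.q_proj.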
// Hyperparameters for the StableLM 2 family; currently only the 1.6B
// preset is wired up.
class StableLMConfig {
public:
    int vocab_size{};
    int hidden_dim{};
    int head_size{};
    int ffn_hidden{};
    int block_num{};
    RoPEType RoPE_type;
    int cache_limit{};
    stablelmNameConfig names_config;

    explicit StableLMConfig(int token_limit, string billions = "1.6B", RoPEType type = HFHUBROPE, int vocab = 100352) {
        names_config.init(type);
        vocab_size = vocab;
        if (billions == "1.6B" || billions == "1.6b") {
            hidden_dim = 2048;
            head_size = 32;
            ffn_hidden = 5632;
            block_num = 24;
        } else {
            throw std::runtime_error("Unsupported model size");
        }
        RoPE_type = type;
        cache_limit = token_limit;
    }
};

#endif // CONFIG_STABLELM_HPP
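As a usage sketch, the 1.6B preset fixes the model dimensions below; the assertions simply restate the constructor logic above (the include path is assumed, since the file name is not shown in the diff):

#include <cassert>
#include "models/stablelm/configuration_stablelm.hpp" // assumed path

int main() {
    // token_limit caps the KV cache; "1.6B" selects the only preset.
    StableLMConfig config(400, "1.6B", HFHUBROPE);
    assert(config.vocab_size == 100352);
    assert(config.hidden_dim == 2048);
    assert(config.head_size == 32);   // attention heads
    assert(config.ffn_hidden == 5632);
    assert(config.block_num == 24);   // transformer blocks
    assert(config.cache_limit == 400);
    return 0;
}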