Merge pull request #36 from liang1232018/develop-i8-temp
Develop i8 temp
liang1232018 authored Aug 1, 2024
2 parents 5223137 + 6ebe067 commit 0673a65
Showing 6 changed files with 118 additions and 39 deletions.
46 changes: 23 additions & 23 deletions CMakeLists.txt
@@ -567,29 +567,29 @@ endif ()



if(QNN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)

# if (ARM)
# set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# else()
# set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# endif()
# set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")

# qnn executables
add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/Tokenizer.hpp
src/tokenizers/BPE/Bpe.cpp
src/tokenizers/BPE/Bpe.hpp
)
target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
if (ARM)
target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
endif ()
endif()
# if(QNN)
# add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
#
# # if (ARM)
# # set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# # else()
# # set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# # endif()
# # set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")
#
# # qnn executables
# add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
# src/tokenizers/Tokenizer.hpp
# src/tokenizers/BPE/Bpe.cpp
# src/tokenizers/BPE/Bpe.hpp
# )
# target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
# if (ARM)
# target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
# target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
# endif ()
# endif()

if (APK)
add_library(mllm_lib STATIC ${DIR_SRC_CPU} ${DIR_SRC_EXP} ${DIR_SRC} ${DIR_SRC_MEM_MANAGER}
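Note on the CMakeLists.txt hunk above: the commit replaces the active QNN demo block with a fully commented-out copy, so the qnn_opt_smoothquant executable and its MLLM_QNN link step are no longer built. The uncommented block is the removed version; the block prefixed with '#' is what the commit adds.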
16 changes: 12 additions & 4 deletions demo/qnn/main_qwen.cpp
@@ -55,7 +55,6 @@ void fullTensor(shared_ptr<Tensor> input_tensor, Net net, vector<int> shape, Dty

NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head_size, int cache_max, string name) {
auto *q = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".q_proj");

auto *k = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".k_proj");
auto *v = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".v_proj");

Expand All @@ -65,8 +64,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head

q = _RoPE({q}, HFHUBROPE, name + ".q_rope", 1000000, 32768);
k = _RoPE({k}, HFHUBROPE, name + ".k_rope", 1000000, 32768);
k = _KVCache({k}, cache_max, name + ".k_cache");
v = _KVCache({v}, cache_max, name + ".v_cache");
k = _KVCacheNPU({k}, cache_max, name + ".k_cache");
v = _KVCacheNPU({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
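Note on the hunk above: the two _KVCache lines are the removed versions of the two _KVCacheNPU lines that replace them, switching the k and v caches in Attention to the NPU variant of the cache op.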

@@ -97,6 +96,7 @@ void qwen_model(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int f

auto tmp = Attention(res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn");

return ;
i = *tmp+i;

res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");
@@ -124,7 +124,7 @@ int main(int argc, char **argv) {
cmdParser.parse_check(argc, argv);

const string cpu_model_path = "./models/Qwen1.5-1.8B-Chat_152_int8_biasint8_ns.mllm";
const string merge_file_path = "./vocab/merges-qwen.txt";
const string merge_file_path = "./vocab/merges_qwen.txt";

string vocab_path = cmdParser.get<string>("vocab");
int tokens_limit = cmdParser.get<int>("limits");
@@ -191,6 +191,11 @@ int main(int argc, char **argv) {
tokens_id[0] = 13;
}

for (int ti = 0; ti < tokens_id.size(); ti++) {
tokens_id[ti] = 9707;
std::cout << tokens_id[ti] << std::endl;
}

BPETokenizer::token2Tensor(&cpuNet, tokens_id, input);


Expand All @@ -203,6 +208,9 @@ int main(int argc, char **argv) {
cpuExe.run(&cpuNet, {input});
auto result = cpuExe.result();

result[0]->printData<float>();
exit(-1);

auto token_idx = postProcessing(result[0], input);
if (token_idx == 151645) { // "</s>"
break;
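Note on the remaining main_qwen.cpp hunks: the early return; added in qwen_model truncates the graph right after the first attention block, the new loop overwrites every prompt token with ID 9707 (apart from the printing, it amounts to std::fill(tokens_id.begin(), tokens_id.end(), 9707)), and the result[0]->printData<float>() plus exit(-1) pair stops the program after the first forward pass. These read as temporary scaffolding for debugging the int8 path. The merges file name also changes from merges-qwen.txt to merges_qwen.txt.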
2 changes: 1 addition & 1 deletion scripts/build_qnn_android.sh
@@ -13,6 +13,6 @@ cmake .. \
-DDEBUG=ON \
-DTEST=OFF \
-DQUANT=OFF \
-DSMOOTHQUANT=ON\
-DSMOOTHQUANT=OFF\

make -j4
79 changes: 69 additions & 10 deletions src/backends/cpu/CPULinearInt8.cpp
@@ -14,7 +14,13 @@ CPULinearInt8::CPULinearInt8(Backend *bn, string opName, int in_features, int ou
support_bias_ = bias;
thread_count = threadCount;
weight_.setBackend(bn);
originWeight_.setBackend(bn);
bias_.setBackend(bn);

weightScale_.setBackend(bn);
biasScale_.setBackend(bn);
inputActivatationScale_.setBackend(bn);
outputActivatationScale_.setBackend(bn);
}

ErrorCode CPULinearInt8::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
@@ -44,13 +50,34 @@ ErrorCode CPULinearInt8::reshape(vector<shared_ptr<Tensor>> inputs, vector<share

ErrorCode CPULinearInt8::load(AbstructLoader &loader) {
//std::cout << name() << " CPULinearInt8 load" << std::endl;
weight_.setName(name() + ".weight");
weight_.reshape(1, 1, out_features_, in_features_);
if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
std::cout << "load weight: " << loader.getDataType(weight_.name()) << std::endl;
weight_.setDtype(loader.getDataType(weight_.name()));
originWeight_.setName(name() + ".weight");
// origin weight is [in, out], while the linear weight is [out, in]
originWeight_.reshape(1, 1, in_features_, out_features_);
if (loader.getDataType(originWeight_.name()) != MLLM_TYPE_COUNT) {
originWeight_.setDtype(loader.getDataType(originWeight_.name()));
originWeight_.alloc();
loader.load(&originWeight_);


weight_.setName(name() + ".linear.weight");
weight_.reshape(1, 1, out_features_, in_features_);
weight_.setDtype(MLLM_TYPE_I8);
weight_.alloc();
loader.load(&weight_);

for (int i = 0; i < in_features_; ++i) {
for (int j = 0; j < out_features_; ++j) {
weight_.setDataAt<int8_t>(0, 0, j, i, originWeight_.dataAt<int8_t>(0,0, i,j));
}
}

originWeight_.free();

weightScale_.setName(name() + ".weight.scale");
weightScale_.reshape(1, 1, 1, 1);
weightScale_.setDtype(MLLM_TYPE_F32);
weightScale_.alloc();
loader.load(&weightScale_);

} else {
weight_.setDtype(MLLM_TYPE_F32);
weight_.alloc();
@@ -62,11 +89,32 @@ ErrorCode CPULinearInt8::load(AbstructLoader &loader) {
bias_.setDtype(loader.getDataType(bias_.name()));
bias_.alloc();
loader.load(&bias_);

biasScale_.setName(name() + ".bias.scale");
biasScale_.reshape(1, 1, 1, 1);
biasScale_.setDtype(MLLM_TYPE_F32);
biasScale_.alloc();
loader.load(&biasScale_);
} else {
bias_.setDtype(MLLM_TYPE_F32);
bias_.alloc();
}
}


inputActivatationScale_.setName(name() + ".input_scale");
inputActivatationScale_.reshape(1, 1, 1, 1);
inputActivatationScale_.setDtype(MLLM_TYPE_F32);
inputActivatationScale_.alloc();
loader.load(&inputActivatationScale_);

outputActivatationScale_.setName(name() + ".output_scale");
outputActivatationScale_.reshape(1, 1, 1, 1);
outputActivatationScale_.setDtype(MLLM_TYPE_F32);
outputActivatationScale_.alloc();
loader.load(&outputActivatationScale_);


return Op::load(loader);
}
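Note on the load() changes above: the checkpoint is now expected to store the int8 weight in [in_features, out_features] order, so load() transposes it into the [out_features, in_features] layout the kernel uses and then frees the temporary originWeight_. load() also reads four per-tensor f32 scales (.weight.scale, .bias.scale, .input_scale and .output_scale) that drive the requantization in mat_mul_fp32_i8 below.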

@@ -91,9 +139,19 @@ ErrorCode CPULinearInt8::free(vector<shared_ptr<Tensor>> inputs, vector<shared_p

ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count){
// todo: load scale from loader
const float scale1 = 1.0, scale2 = 1.0;
float scale1 = inputActivatationScale_.hostPtr<float>()[0] / 127.0;
scale1 = roundf(scale1 * 100000) / 100000;

float scale2 = weightScale_.hostPtr<float>()[0];

float scale3 = 0.0;
if(support_bias_)
scale3 = biasScale_.hostPtr<float>()[0];

float scale4 = outputActivatationScale_.hostPtr<float>()[0]/ 127.0;
scale4 = roundf(scale4 * 100000) / 100000;

assert(src1->dtype() == MLLM_TYPE_Q4_0);
assert(src1->dtype() == MLLM_TYPE_I8);
assert(src0_->dtype() == MLLM_TYPE_F32);
Tensor src0_i8(src0_->shape());
src0_i8.setBackend(src0_->backend());
@@ -125,7 +183,7 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds
Tensor *src0_cal = src0;
Tensor *src1_cal = src1;
const int64_t blck_0 = 16;
#pragma omp parallel for collapse(4) num_threads(thread_count)
// #pragma omp parallel for collapse(4) num_threads(thread_count)
for (int b = 0; b < src0->batch(); b++) {
for (int h = 0; h < src0->head(); h++) {
const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b;
Expand All @@ -144,8 +202,9 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds

vec_dot_i8_i8(K, dst->ptrAt<float>(b, h, m, n), src1_cal->hostPtr<int8_t>() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr<int8_t>() + src0_cal->offset(b, h, s_0, d_0), scale1, scale2);
if (support_bias) {
*dst->ptrAt<float>(b, h, m, n) += bias->dataAt<float>(0, 0, 0, n);
*dst->ptrAt<float>(b, h, m, n) += bias->dataAt<int8_t>(0, 0, 0, n) * scale3;
}
*dst->ptrAt<float>(b, h, m, n) = std::fmaxf(std::fminf(roundf(*dst->ptrAt<float>(b, h, m, n) / scale4), 127), -128) * scale4;
}
}
}
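Note: with the scales above, mat_mul_fp32_i8 behaves as a symmetric per-tensor int8 GEMM whose float result is snapped back onto the int8 output grid. Below is a minimal scalar sketch of the arithmetic for one output element, assuming vec_dot_i8_i8 accumulates the int8 dot product and scales it by scale1 * scale2; the kernel additionally rounds scale1 and scale4 to five decimal places, which is omitted here, and the function and parameter names are illustrative rather than part of the project's API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one output element of the int8 linear kernel above.
// q_x: quantized input row, q_w: quantized weight row (both int8),
// q_b: quantized bias entry (int8), K: in_features.
float int8_linear_element(const int8_t *q_x, const int8_t *q_w, int8_t q_b, int K,
                          float input_scale, float weight_scale,
                          float bias_scale, float output_scale) {
    const float scale1 = input_scale / 127.0f;  // activation step size
    const float scale2 = weight_scale;          // weight step size
    const float scale3 = bias_scale;            // bias step size
    const float scale4 = output_scale / 127.0f; // output step size

    int32_t acc = 0;
    for (int k = 0; k < K; ++k)
        acc += static_cast<int32_t>(q_x[k]) * static_cast<int32_t>(q_w[k]);

    float y = acc * scale1 * scale2 + static_cast<float>(q_b) * scale3;

    // Requantize onto the output grid and clamp to the int8 range,
    // mirroring the roundf/fminf/fmaxf line in the kernel.
    float q_y = std::round(y / scale4);
    q_y = std::min(std::max(q_y, -128.0f), 127.0f);
    return q_y * scale4;
}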
6 changes: 6 additions & 0 deletions src/backends/cpu/CPULinearInt8.hpp
@@ -30,8 +30,14 @@ class CPULinearInt8 final : public Op {
bool support_bias_;
int thread_count = 4;
Tensor weight_;
Tensor originWeight_;
Tensor bias_;

Tensor weightScale_;
Tensor biasScale_;
Tensor inputActivatationScale_;
Tensor outputActivatationScale_;

ErrorCode mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count = 4);
};

8 changes: 7 additions & 1 deletion src/backends/cpu/quantize/QuantizeQ8.cpp
@@ -278,14 +278,20 @@ void quantize_row_i8(const float *__restrict x, void *__restrict vy, int k, floa
const float d = scale;
const float id = d ? 1.0f / d : 0.0f;

const int32x4_t min_128 = vdupq_n_s32(-128);
const int32x4_t max127 = vdupq_n_s32( 127);

#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
float32x4_t srcv[8];
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i * 32 + 4 * j);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
int32x4_t vi = vcvtnq_s32_f32(v);

vi = vminq_s32(vi, max127);
vi = vmaxq_s32(vi, min_128);

y[i*32+ 4 * j + 0] = vgetq_lane_s32(vi, 0);
y[i*32+ 4 * j + 1] = vgetq_lane_s32(vi, 1);
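Note: the added vminq_s32/vmaxq_s32 pair saturates the rounded values to the int8 range before they are narrowed to int8 and stored. A scalar sketch of the per-value work in quantize_row_i8, assuming the default round-to-nearest mode (the helper name is illustrative):

#include <cmath>
#include <cstdint>

// Quantize one float: scale, round to nearest, then clamp to [-128, 127]
// (the clamp is what this hunk adds on the NEON path).
inline int8_t quantize_one_i8(float x, float scale) {
    const float id = scale != 0.0f ? 1.0f / scale : 0.0f;
    int v = static_cast<int>(std::nearbyint(x * id)); // like vcvtnq_s32_f32 under round-to-nearest
    if (v > 127) v = 127;   // vminq_s32(vi, max127)
    if (v < -128) v = -128; // vmaxq_s32(vi, min_128)
    return static_cast<int8_t>(v);
}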
