diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6251685..79c58724 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -567,29 +567,29 @@ endif ()
-if(QNN)
-    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
-
-    # if (ARM)
-    #     set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
-    # else()
-    #     set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
-    # endif()
-    # set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")
-
-    # qnn executables
-    add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
-            src/tokenizers/Tokenizer.cpp
-            src/tokenizers/Tokenizer.hpp
-            src/tokenizers/BPE/Bpe.cpp
-            src/tokenizers/BPE/Bpe.hpp
-    )
-    target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
-    if (ARM)
-        target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
-        target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
-    endif ()
-endif()
+# if(QNN)
+#     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
+#
+#     # if (ARM)
+#     #     set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
+#     # else()
+#     #     set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
+#     # endif()
+#     # set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")
+#
+#     # qnn executables
+#     add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
+#             src/tokenizers/Tokenizer.cpp
+#             src/tokenizers/Tokenizer.hpp
+#             src/tokenizers/BPE/Bpe.cpp
+#             src/tokenizers/BPE/Bpe.hpp
+#     )
+#     target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
+#     if (ARM)
+#         target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
+#         target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
+#     endif ()
+# endif()
 
 if (APK)
     add_library(mllm_lib STATIC ${DIR_SRC_CPU} ${DIR_SRC_EXP} ${DIR_SRC} ${DIR_SRC_MEM_MANAGER}
diff --git a/demo/qnn/main_qwen.cpp b/demo/qnn/main_qwen.cpp
index 85d3a0b8..e738de70 100644
--- a/demo/qnn/main_qwen.cpp
+++ b/demo/qnn/main_qwen.cpp
@@ -55,7 +55,6 @@ void fullTensor(shared_ptr input_tensor, Net net, vector shape, Dty
 NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head_size, int cache_max, string name) {
     auto *q = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".q_proj");
-
     auto *k = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".k_proj");
     auto *v = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".v_proj");
@@ -65,8 +64,8 @@
     q = _RoPE({q}, HFHUBROPE, name + ".q_rope", 1000000, 32768);
     k = _RoPE({k}, HFHUBROPE, name + ".k_rope", 1000000, 32768);
-    k = _KVCache({k}, cache_max, name + ".k_cache");
-    v = _KVCache({v}, cache_max, name + ".v_cache");
+    k = _KVCacheNPU({k}, cache_max, name + ".k_cache");
+    v = _KVCacheNPU({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = *qk / std::sqrt(hidden_size);
@@ -97,6 +96,7 @@ void qwen_model(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int f
         auto tmp = Attention(res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn");
+        return ;
         i = *tmp+i;
         res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");
@@ -124,7 +124,7 @@ int main(int argc, char **argv) {
     cmdParser.parse_check(argc, argv);
     const string cpu_model_path = "./models/Qwen1.5-1.8B-Chat_152_int8_biasint8_ns.mllm";
-    const string merge_file_path = "./vocab/merges-qwen.txt";
+    const string merge_file_path = "./vocab/merges_qwen.txt";
     string vocab_path = cmdParser.get("vocab");
     int tokens_limit = cmdParser.get("limits");
@@ -191,6 +191,11 @@ int main(int argc, char **argv) {
            tokens_id[0] = 13;
        }
+       for (int ti = 0; ti < tokens_id.size(); ti++) {
+           tokens_id[ti] = 9707;
+           std::cout << tokens_id[ti] << std::endl;
+       }
+
        BPETokenizer::token2Tensor(&cpuNet, tokens_id, input);
@@ -203,6 +208,9 @@ int main(int argc, char **argv) {
            cpuExe.run(&cpuNet, {input});
            auto result = cpuExe.result();
+           result[0]->printData();
+           exit(-1);
+
            auto token_idx = postProcessing(result[0], input);
            if (token_idx == 151645) { // ""
                break;
diff --git a/scripts/build_qnn_android.sh b/scripts/build_qnn_android.sh
index 55b21836..8b96f41f 100644
--- a/scripts/build_qnn_android.sh
+++ b/scripts/build_qnn_android.sh
@@ -13,6 +13,6 @@ cmake .. \
 -DDEBUG=ON \
 -DTEST=OFF \
 -DQUANT=OFF \
--DSMOOTHQUANT=ON\
+-DSMOOTHQUANT=OFF\
 
 make -j4
diff --git a/src/backends/cpu/CPULinearInt8.cpp b/src/backends/cpu/CPULinearInt8.cpp
index da166872..d726b5dd 100644
--- a/src/backends/cpu/CPULinearInt8.cpp
+++ b/src/backends/cpu/CPULinearInt8.cpp
@@ -14,7 +14,13 @@ CPULinearInt8::CPULinearInt8(Backend *bn, string opName, int in_features, int ou
     support_bias_ = bias;
     thread_count = threadCount;
     weight_.setBackend(bn);
+    originWeight_.setBackend(bn);
     bias_.setBackend(bn);
+
+    weightScale_.setBackend(bn);
+    biasScale_.setBackend(bn);
+    inputActivatationScale_.setBackend(bn);
+    outputActivatationScale_.setBackend(bn);
 }
 
 ErrorCode CPULinearInt8::reshape(vector> inputs, vector> outputs) {
@@ -44,13 +50,34 @@ ErrorCode CPULinearInt8::reshape(vector> inputs, vector(0, 0, j, i, originWeight_.dataAt(0,0, i,j));
+            }
+        }
+
+        originWeight_.free();
+
+        weightScale_.setName(name() + ".weight.scale");
+        weightScale_.reshape(1, 1, 1, 1);
+        weightScale_.setDtype(MLLM_TYPE_F32);
+        weightScale_.alloc();
+        loader.load(&weightScale_);
+
     } else {
         weight_.setDtype(MLLM_TYPE_F32);
         weight_.alloc();
@@ -62,11 +89,32 @@ ErrorCode CPULinearInt8::load(AbstructLoader &loader) {
             bias_.setDtype(loader.getDataType(bias_.name()));
             bias_.alloc();
             loader.load(&bias_);
+
+            biasScale_.setName(name() + ".bias.scale");
+            biasScale_.reshape(1, 1, 1, 1);
+            biasScale_.setDtype(MLLM_TYPE_F32);
+            biasScale_.alloc();
+            loader.load(&biasScale_);
         } else {
             bias_.setDtype(MLLM_TYPE_F32);
             bias_.alloc();
         }
     }
+
+
+    inputActivatationScale_.setName(name() + ".input_scale");
+    inputActivatationScale_.reshape(1, 1, 1, 1);
+    inputActivatationScale_.setDtype(MLLM_TYPE_F32);
+    inputActivatationScale_.alloc();
+    loader.load(&inputActivatationScale_);
+
+    outputActivatationScale_.setName(name() + ".output_scale");
+    outputActivatationScale_.reshape(1, 1, 1, 1);
+    outputActivatationScale_.setDtype(MLLM_TYPE_F32);
+    outputActivatationScale_.alloc();
+    loader.load(&outputActivatationScale_);
+
+
     return Op::load(loader);
 }
@@ -91,9 +139,19 @@ ErrorCode CPULinearInt8::free(vector> inputs, vector()[0] / 127.0;
+    scale1 = roundf(scale1 * 100000) / 100000;
+
+    float scale2 = weightScale_.hostPtr()[0];
+
+    float scale3 = 0.0;
+    if(support_bias_)
+        scale3 = biasScale_.hostPtr()[0];
+
+    float scale4 = outputActivatationScale_.hostPtr()[0]/ 127.0;
+    scale4 = roundf(scale4 * 100000) / 100000;
 
-    assert(src1->dtype() == MLLM_TYPE_Q4_0);
+    assert(src1->dtype() == MLLM_TYPE_I8);
     assert(src0_->dtype() == MLLM_TYPE_F32);
     Tensor src0_i8(src0_->shape());
     src0_i8.setBackend(src0_->backend());
@@ -125,7 +183,7 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds
     Tensor *src0_cal = src0;
     Tensor *src1_cal = src1;
     const int64_t blck_0 = 16;
-#pragma omp parallel for collapse(4) num_threads(thread_count)
+// #pragma omp parallel for collapse(4) num_threads(thread_count)
     for (int b = 0; b < src0->batch(); b++) {
         for (int h = 0; h < src0->head(); h++) {
             const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b;
@@ -144,8 +202,9 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds
                     vec_dot_i8_i8(K, dst->ptrAt(b, h, m, n), src1_cal->hostPtr() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr() + src0_cal->offset(b, h, s_0, d_0), scale1, scale2);
                     if (support_bias) {
-                        *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n);
+                        *dst->ptrAt(b, h, m, n) += bias->dataAt(0, 0, 0, n) * scale3;
                     }
+                    *dst->ptrAt(b, h, m, n) = std::fmaxf(std::fminf(roundf(*dst->ptrAt(b, h, m, n) / scale4), 127), -128) * scale4;
                 }
             }
         }
diff --git a/src/backends/cpu/CPULinearInt8.hpp b/src/backends/cpu/CPULinearInt8.hpp
index d1652153..91cb2c3c 100644
--- a/src/backends/cpu/CPULinearInt8.hpp
+++ b/src/backends/cpu/CPULinearInt8.hpp
@@ -30,8 +30,14 @@ class CPULinearInt8 final : public Op {
     bool support_bias_;
     int thread_count = 4;
     Tensor weight_;
+    Tensor originWeight_;
     Tensor bias_;
+    Tensor weightScale_;
+    Tensor biasScale_;
+    Tensor inputActivatationScale_;
+    Tensor outputActivatationScale_;
+
     ErrorCode mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count = 4);
 };
diff --git a/src/backends/cpu/quantize/QuantizeQ8.cpp b/src/backends/cpu/quantize/QuantizeQ8.cpp
index 7ae4055a..040f85a1 100644
--- a/src/backends/cpu/quantize/QuantizeQ8.cpp
+++ b/src/backends/cpu/quantize/QuantizeQ8.cpp
@@ -278,6 +278,9 @@ void quantize_row_i8(const float *__restrict x, void *__restrict vy, int k, floa
     const float d = scale;
     const float id = d ? 1.0f / d : 0.0f;
 
+    const int32x4_t min_128 = vdupq_n_s32(-128);
+    const int32x4_t max127 = vdupq_n_s32( 127);
+
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv[8];
@@ -285,7 +288,10 @@ void quantize_row_i8(const float *__restrict x, void *__restrict vy, int k, floa
         for (int j = 0; j < 8; j++) {
             const float32x4_t v = vmulq_n_f32(srcv[j], id);
-            const int32x4_t vi = vcvtnq_s32_f32(v);
+            int32x4_t vi = vcvtnq_s32_f32(v);
+
+            vi = vminq_s32(vi, max127);
+            vi = vmaxq_s32(vi, min_128);
 
             y[i*32+ 4 * j + 0] = vgetq_lane_s32(vi, 0);
             y[i*32+ 4 * j + 1] = vgetq_lane_s32(vi, 1);