Merge pull request #21 from liang1232018/develop-QNN-zh
Develop qnn zh
liang1232018 authored Apr 3, 2024
2 parents c8b04bc + 3cc4f66 commit c866437
Showing 28 changed files with 938 additions and 26 deletions.
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -367,6 +367,15 @@ add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smooth
if(QNN)
target_link_libraries(qnn_opt_smoothquant MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS})
endif()
add_executable(mat_test ${PROJECT_SOURCE_DIR}/demo/qnn/matmul_test.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
src/tokenizers/Tokenizer.cpp
src/tokenizers/Tokenizer.hpp
src/tokenizers/BPE/Bpe.cpp
src/tokenizers/BPE/Bpe.hpp
)
if(QNN)
target_link_libraries(mat_test MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS})
endif()

if(QNN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
5 changes: 4 additions & 1 deletion demo/qnn/TestNet.hpp
@@ -1,4 +1,5 @@
#include "Net.hpp"
#include "Types.hpp"
#include "express/Express.hpp"
NetTensor *AttentionTest(Context *c, NetTensor *x, int embedding_size, int hidden_size, int head_size, int cache_max, string name) {
// x = _Quantize({x}, true, (string)name + ".x.quantize");
@@ -42,7 +43,9 @@ void linearTest2048(Context *c, int vocab_size = 32000, int hidden_dim = 4096, i
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.decoder.embed_tokens");
_SubgraphBegin(c);
i = _Quantize({i}, true, "x.quantize");
_LinearINT8({i}, 2048, 2048, false, "model.decoder.layers.0.fc1");
i = _LinearINT8({i}, 2048, 2048, false, "model.decoder.layers.0.fc1");
_SubgraphBegin(c, MLLM_CPU);
i = _MatmulINT8({i, i}, false, true, "model.decoder.layers.0.fc2");
}
void linearTest11008(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200) {
auto *i = _Input(c);
81 changes: 81 additions & 0 deletions demo/qnn/matmul_test.cpp
@@ -0,0 +1,81 @@
#include <cstdint>
#include <iostream>
#include <valarray>
#include <csignal>
#include "MockLoader.hpp"
#include "Types.hpp"
#include "backends/QNN/QNNOptNet.hpp"
#include "cmdline.h"
#include "Net.hpp"
#include "Executor.hpp"
#include "express/Express.hpp"
#include "tokenizers/BPE/Bpe.hpp"
#include "backends/QNN/QNNNet.hpp"
#include "backends/QNN/QNNExecutor.hpp"
#include "TestNet.hpp"

using namespace mllm;



template <typename Dtype>
void fullTensor(shared_ptr<Tensor> input_tensor, Net net, vector<int> shape, Dtype value) {
input_tensor->setBackend(net.backends()[BackendType::MLLM_CPU].get());
input_tensor->setCtype(ChlType::BSHD);
input_tensor->setDtype(MLLM_TYPE_I8);
input_tensor->reshape(shape[0], shape[1], shape[2], shape[3]);
input_tensor->alloc();
input_tensor->fullData<Dtype>(value);
}

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "./vocab/vocab_opt_6.7b.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "./models/opt-1.3b-sq_nohead.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
// cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "num of threads", false, 1);
cmdParser.add<int>("head", 'h', "num of heads", false, 32);
cmdParser.add<int>("type", 't', "type of test", false, 1);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
// int thread_num = cmdParser.get<int>("thread");
int seqLength = cmdParser.get<int>("seq");
int head_num = cmdParser.get<int>("head");
int type = cmdParser.get<int>("type");

std::unique_ptr<Context> c_ptr(new Context());
auto *c = c_ptr.get();

auto *i = _Input(c);
i = _MatmulINT8({i, i}, false, true, "model.decoder.layers.0.fc2");

BackendConfig bn;
Net net(bn);
net.convert(c->sub_param_, BackendType::MLLM_CPU);

// ParamLoader param_loader(model_path);
MockLoader param_loader(model_path);
Executor ex(&param_loader);
ex.setup(&net);

shared_ptr<Tensor> input = std::make_shared<Tensor>();
fullTensor(input, net, {1, 1, 1, 32}, (uint8_t)2);
uint8_t *data = input->hostPtr<uint8_t>();
for (int i = 0; i < 16; i++) {
std::cout << (int)data[i] << " ";
}

ex.run(&net, {input});

auto result = ex.result();
result[0]->printData<float>();

ex.perf();


return 0;
}
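
A quick sanity check on what this test should print (my own arithmetic, not output captured from the commit): the input is a 1x1x1x32 tensor filled with the int8 value 2, and _MatmulINT8({i, i}, false, true, ...) multiplies the tensor by its own transpose, so each output element is the dot product of two length-32 vectors of 2s. Ignoring whatever quantization scaling the op applies internally, that is 32 * 2 * 2 = 128:

```cpp
// Expected raw accumulator value for the test above, assuming the reduction is
// over the last dimension (32) and no extra scaling is applied.
#include <cstdint>
#include <iostream>

int main() {
    int32_t acc = 0;
    for (int k = 0; k < 32; ++k) {
        acc += static_cast<int8_t>(2) * static_cast<int8_t>(2); // int8 inputs, 32-bit accumulator
    }
    std::cout << acc << std::endl; // prints 128
    return 0;
}
```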
45 changes: 26 additions & 19 deletions demo/qnn/qnn_opt_smoothquant.cpp
@@ -56,33 +56,38 @@ NetTensor *Attention(Context *c, NetTensor *x, int embedding_size, int hidden_si
// k = _KVCache({k}, cache_max, name + ".k_cache");
// v = _KVCache({v}, cache_max, name + ".v_cache");

// auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");
auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");

// _SubgraphBegin(c);
// --------------------
_SubgraphBegin(c, MLLM_CPU);
// --------------------

// auto s = _SplitInput({m}, true, name + ".qkv_split");
auto s = _SplitInput({m}, true, name + ".qkv_split");

// q = s[0];
// k = s[1];
// v = s[2];
q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");
q = s[0];
k = s[1];
v = s[2];
// q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
// k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
// v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");

auto *qk = _Matmul({q, k}, false, true, name + ".qk");
auto *qk = _MatmulINT8({q, k}, false, true, name + ".qk");
// qk = _Dequantize({qk}, false, (string) name + ".qk.dequantize");

// qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");

auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
// // _SubgraphBegin(c);
auto *o = _MatmulINT8({qk, v}, false, false, name + ".qkv");

// // --------------------
// _SubgraphBegin(c);
// // --------------------

o = _Quantize({o}, true, (string)name + ".out_proj.quantize");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _LinearINT8({o}, hidden_size * head_size, embedding_size, false, name + ".out_proj");
o = _Dequantize({o}, true, (string)name + ".out_proj.dequantize");
// o = _Quantize({o}, true, (string)name + ".out_proj.quantize");
// o = o->view(-1, 1, -1, hidden_size * head_size);
// o = _LinearINT8({o}, hidden_size * head_size, embedding_size, false, name + ".out_proj");
// o = _Dequantize({o}, true, (string)name + ".out_proj.dequantize");
return o;
}
NetTensor *FFN(Context *c, NetTensor *i, int hidden_dim, int ffn_hidden_dim, string name) {
@@ -137,7 +142,7 @@ int main(int argc, char **argv) {
// cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "num of threads", false, 1);
cmdParser.add<int>("head", 'h', "num of heads", false, 32);
cmdParser.add<int>("type", 't', "type of test", false, 1);
cmdParser.add<int>("type", 't', "type of test", false, 13);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
@@ -218,7 +223,7 @@ int main(int argc, char **argv) {

BackendConfig bn;
QNNOptNet net(bn, c);
net.convert(c->sub_param_, BackendType::MLLM_QNN);
net.convert(c, BackendType::MLLM_QNN);

// ParamLoader param_loader(model_path);
ParamLoader param_loader(model_path);
@@ -253,7 +258,9 @@ int main(int argc, char **argv) {
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1; step++) {
ex.run(&net, {input});
ex.run(c, &net, {input});
// ---------------------------------
// ex.run(&net, {input});
auto result = ex.result();
result[0]->printShape();
result[0]->printData<float>();
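
Taken together, the Attention changes above move the attention core off the QNN graph: q, k and v are merged into a single QNN output, a CPU subgraph begins, the merged tensor is split back apart, and both attention matmuls now run as int8 CPU ops. Below is a condensed sketch of that pattern; the helper signatures are assumed only from how they are called in this diff (it reuses the includes of qnn_opt_smoothquant.cpp and is not a drop-in function from the repository):

```cpp
// Illustrative view of the new QKV hand-off between the QNN and CPU subgraphs.
NetTensor *attentionCoreSketch(Context *c, NetTensor *q, NetTensor *k,
                               NetTensor *v, const string &name) {
    auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");          // last op on the QNN side
    _SubgraphBegin(c, MLLM_CPU);                                     // subsequent ops run on the CPU backend
    auto s = _SplitInput({m}, true, name + ".qkv_split");            // recover q, k, v on the CPU side
    auto *qk = _MatmulINT8({s[0], s[1]}, false, true, name + ".qk"); // Q x K^T in int8
    qk = _Causalmask({qk}, name + ".mask");
    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
    return _MatmulINT8({qk, s[2]}, false, false, name + ".qkv");     // scores x V
}
```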
1 change: 1 addition & 0 deletions include/OpDefined.hpp
@@ -17,6 +17,7 @@ enum OpType {
MATMULINT8,
SCALE,
ROPE,
POSITIOANL_EMBEDDING,
RMSNORM,
CAUSALMASK,
LINEAR,
6 changes: 6 additions & 0 deletions include/Types.hpp
@@ -19,6 +19,7 @@ using std::map;
typedef map<std::string, float> OpParam;

typedef enum {
MLLM_DEFAULT,
MLLM_CPU,
MLLM_OPENCL,
MLLM_QNN
@@ -193,6 +194,11 @@ typedef struct {
int8_t qs[QK8_0]; // quants
} block_q8_0;
#pragma pack()
#pragma pack(1)
typedef struct {
int8_t qs[QK8_0]; // quants
} block_q8_0_sq; // q8 block for smoothquant
#pragma pack()

// This is only used for intermediate quantization and dot products
#pragma pack(1)
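
Reading the Types.hpp hunk above: block_q8_0_sq mirrors block_q8_0 but stores only the QK8_0 raw int8 quants, with no per-block scale field. The "q8 block for smoothquant" comment suggests the scale is a static per-tensor (or per-channel) value kept outside the block, as SmoothQuant works with precomputed activation and weight scales; that reading is an assumption, not something stated in the diff. A hedged sketch of dequantization under that assumption:

```cpp
// Illustrative only: dequantizing scale-less smoothquant blocks, assuming the
// scale lives outside the block (one static value per tensor or per channel).
#include <cstdint>

constexpr int QK8_0 = 32; // block size assumed to follow the usual q8_0 convention

#pragma pack(1)
struct block_q8_0_sq {
    int8_t qs[QK8_0]; // quants only; no embedded scale
};
#pragma pack()

inline void dequantize_row_sq(const block_q8_0_sq *blocks, int nblocks,
                              float static_scale, float *out) {
    for (int b = 0; b < nblocks; ++b)
        for (int i = 0; i < QK8_0; ++i)
            out[b * QK8_0 + i] = blocks[b].qs[i] * static_scale;
}
```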
2 changes: 1 addition & 1 deletion src/Op.hpp
@@ -112,7 +112,7 @@ class Op {
type_ = type;
}

private:
protected:
Backend *backend_;
vector<Tensor *> inputs_;
vector<Tensor *> outputs_;
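
Loosening backend_, inputs_ and outputs_ from private to protected lets backend-specific op subclasses (such as the QNN ops added elsewhere in this PR) reach those members directly instead of going through accessors. A minimal, purely hypothetical subclass that only compiles against the protected variant:

```cpp
// Hypothetical subclass for illustration; not part of the commit.
#include <iostream>
#include "Op.hpp"

using namespace mllm; // assumes Op is declared in the mllm namespace

class InspectingOp : public Op {
public:
    using Op::Op; // reuse whatever constructors Op declares

    void dumpArity() const {
        // Direct access to the members changed above; with `private:` these
        // lines would not compile in a derived class.
        std::cout << inputs_.size() << " inputs, "
                  << outputs_.size() << " outputs" << std::endl;
    }
};
```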
2 changes: 1 addition & 1 deletion src/backends/QNN/QNNBackend.cpp
@@ -110,7 +110,7 @@ QNNBackend::QNNBackend(shared_ptr<MemoryManager> mm) :
return;
}
// TODO: make debug level configuable
log::setLogLevel(QnnLog_Level_t::QNN_LOG_LEVEL_INFO);
log::setLogLevel(QnnLog_Level_t::QNN_LOG_LEVEL_ERROR);

std::string backEndPath = "libQnnHtp.so";
std::string opPackagePaths = "libQnnLLaMAPackage_CPU.so:LLaMAPackageInterfaceProvider:CPU,libQnnLLaMAPackage_HTP.so:LLaMAPackageInterfaceProvider:HTP";