Merge pull request #21 from liang1232018/develop-QNN-zh
Develop qnn zh
liang1232018 authored Apr 3, 2024
2 parents c8b04bc + 3cc4f66 commit c866437
Showing 28 changed files with 938 additions and 26 deletions.
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -367,6 +367,15 @@ add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smooth
if(QNN)
target_link_libraries(qnn_opt_smoothquant MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS})
endif()
add_executable(mat_test ${PROJECT_SOURCE_DIR}/demo/qnn/matmul_test.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC} # ${DIR_SRC_QUANT}
src/tokenizers/Tokenizer.cpp
src/tokenizers/Tokenizer.hpp
src/tokenizers/BPE/Bpe.cpp
src/tokenizers/BPE/Bpe.hpp
)
if(QNN)
target_link_libraries(mat_test MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS})
endif()

if(QNN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
5 changes: 4 additions & 1 deletion demo/qnn/TestNet.hpp
@@ -1,4 +1,5 @@
#include "Net.hpp"
#include "Types.hpp"
#include "express/Express.hpp"
NetTensor *AttentionTest(Context *c, NetTensor *x, int embedding_size, int hidden_size, int head_size, int cache_max, string name) {
// x = _Quantize({x}, true, (string)name + ".x.quantize");
@@ -42,7 +43,9 @@ void linearTest2048(Context *c, int vocab_size = 32000, int hidden_dim = 4096, i
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.decoder.embed_tokens");
_SubgraphBegin(c);
i = _Quantize({i}, true, "x.quantize");
_LinearINT8({i}, 2048, 2048, false, "model.decoder.layers.0.fc1");
i = _LinearINT8({i}, 2048, 2048, false, "model.decoder.layers.0.fc1");
_SubgraphBegin(c, MLLM_CPU);
i = _MatmulINT8({i, i}, false, true, "model.decoder.layers.0.fc2");
}
void linearTest11008(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200) {
auto *i = _Input(c);
81 changes: 81 additions & 0 deletions demo/qnn/matmul_test.cpp
@@ -0,0 +1,81 @@
#include <cstdint>
#include <iostream>
#include <valarray>
#include <csignal>
#include "MockLoader.hpp"
#include "Types.hpp"
#include "backends/QNN/QNNOptNet.hpp"
#include "cmdline.h"
#include "Net.hpp"
#include "Executor.hpp"
#include "express/Express.hpp"
#include "tokenizers/BPE/Bpe.hpp"
#include "backends/QNN/QNNNet.hpp"
#include "backends/QNN/QNNExecutor.hpp"
#include "TestNet.hpp"

using namespace mllm;



template <typename Dtype>
void fullTensor(shared_ptr<Tensor> input_tensor, Net net, vector<int> shape, Dtype value) {
input_tensor->setBackend(net.backends()[BackendType::MLLM_CPU].get());
input_tensor->setCtype(ChlType::BSHD);
input_tensor->setDtype(MLLM_TYPE_I8);
input_tensor->reshape(shape[0], shape[1], shape[2], shape[3]);
input_tensor->alloc();
input_tensor->fullData<Dtype>(value);
}

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "./vocab/vocab_opt_6.7b.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "./models/opt-1.3b-sq_nohead.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
// cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "num of threads", false, 1);
cmdParser.add<int>("head", 'h', "num of heads", false, 32);
cmdParser.add<int>("type", 't', "type of test", false, 1);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
// int thread_num = cmdParser.get<int>("thread");
int seqLength = cmdParser.get<int>("seq");
int head_num = cmdParser.get<int>("head");
int type = cmdParser.get<int>("type");

std::unique_ptr<Context> c_ptr(new Context());
auto *c = c_ptr.get();

auto *i = _Input(c);
i = _MatmulINT8({i, i}, false, true, "model.decoder.layers.0.fc2");

BackendConfig bn;
Net net(bn);
net.convert(c->sub_param_, BackendType::MLLM_CPU);

// ParamLoader param_loader(model_path);
MockLoader param_loader(model_path);
Executor ex(&param_loader);
ex.setup(&net);

shared_ptr<Tensor> input = std::make_shared<Tensor>();
fullTensor(input, net, {1, 1, 1, 32}, (uint8_t)2);
uint8_t *data = input->hostPtr<uint8_t>();
for (int i = 0; i < 16; i++) {
std::cout << (int)data[i] << " ";
}

ex.run(&net, {input});

auto result = ex.result();
result[0]->printData<float>();

ex.perf();


return 0;
}
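
A quick sanity check on what this test should print (my own arithmetic, not output captured from the commit): the input is a 1x1x1x32 tensor filled with the int8 value 2, and _MatmulINT8({i, i}, false, true, ...) multiplies the tensor by its own transpose, so each output element is the dot product of two length-32 vectors of 2s. Ignoring whatever quantization scaling the op applies internally, that is 32 * 2 * 2 = 128:

```cpp
// Expected raw accumulator value for the test above, assuming the reduction is
// over the last dimension (32) and no extra scaling is applied.
#include <cstdint>
#include <iostream>

int main() {
    int32_t acc = 0;
    for (int k = 0; k < 32; ++k) {
        acc += static_cast<int8_t>(2) * static_cast<int8_t>(2); // int8 inputs, 32-bit accumulator
    }
    std::cout << acc << std::endl; // prints 128
    return 0;
}
```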
45 changes: 26 additions & 19 deletions demo/qnn/qnn_opt_smoothquant.cpp
@@ -56,33 +56,38 @@ NetTensor *Attention(Context *c, NetTensor *x, int embedding_size, int hidden_si
// k = _KVCache({k}, cache_max, name + ".k_cache");
// v = _KVCache({v}, cache_max, name + ".v_cache");

// auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");
auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");

// _SubgraphBegin(c);
// --------------------
_SubgraphBegin(c, MLLM_CPU);
// --------------------

// auto s = _SplitInput({m}, true, name + ".qkv_split");
auto s = _SplitInput({m}, true, name + ".qkv_split");

// q = s[0];
// k = s[1];
// v = s[2];
q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");
q = s[0];
k = s[1];
v = s[2];
// q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
// k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
// v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");

auto *qk = _Matmul({q, k}, false, true, name + ".qk");
auto *qk = _MatmulINT8({q, k}, false, true, name + ".qk");
// qk = _Dequantize({qk}, false, (string) name + ".qk.dequantize");

// qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");

auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
// // _SubgraphBegin(c);
auto *o = _MatmulINT8({qk, v}, false, false, name + ".qkv");

// // --------------------
// _SubgraphBegin(c);
// // --------------------

o = _Quantize({o}, true, (string)name + ".out_proj.quantize");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _LinearINT8({o}, hidden_size * head_size, embedding_size, false, name + ".out_proj");
o = _Dequantize({o}, true, (string)name + ".out_proj.dequantize");
// o = _Quantize({o}, true, (string)name + ".out_proj.quantize");
// o = o->view(-1, 1, -1, hidden_size * head_size);
// o = _LinearINT8({o}, hidden_size * head_size, embedding_size, false, name + ".out_proj");
// o = _Dequantize({o}, true, (string)name + ".out_proj.dequantize");
return o;
}
NetTensor *FFN(Context *c, NetTensor *i, int hidden_dim, int ffn_hidden_dim, string name) {
@@ -137,7 +142,7 @@ int main(int argc, char **argv) {
// cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("seq", 's', "num of threads", false, 1);
cmdParser.add<int>("head", 'h', "num of heads", false, 32);
cmdParser.add<int>("type", 't', "type of test", false, 1);
cmdParser.add<int>("type", 't', "type of test", false, 13);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
@@ -218,7 +223,7 @@ int main(int argc, char **argv) {

BackendConfig bn;
QNNOptNet net(bn, c);
net.convert(c->sub_param_, BackendType::MLLM_QNN);
net.convert(c, BackendType::MLLM_QNN);

// ParamLoader param_loader(model_path);
ParamLoader param_loader(model_path);
@@ -253,7 +258,9 @@ int main(int argc, char **argv) {
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 1; step++) {
ex.run(&net, {input});
ex.run(c, &net, {input});
// ---------------------------------
// ex.run(&net, {input});
auto result = ex.result();
result[0]->printShape();
result[0]->printData<float>();
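
Taken together, the Attention changes above move the attention core off the QNN graph: q, k and v are merged into a single QNN output, a CPU subgraph begins, the merged tensor is split back apart, and both attention matmuls now run as int8 CPU ops. Below is a condensed sketch of that pattern; the helper signatures are assumed only from how they are called in this diff (it reuses the includes of qnn_opt_smoothquant.cpp and is not a drop-in function from the repository):

```cpp
// Illustrative view of the new QKV hand-off between the QNN and CPU subgraphs.
NetTensor *attentionCoreSketch(Context *c, NetTensor *q, NetTensor *k,
                               NetTensor *v, const string &name) {
    auto *m = _MergeOutput({q, k, v}, name + ".qkv_merge");          // last op on the QNN side
    _SubgraphBegin(c, MLLM_CPU);                                     // subsequent ops run on the CPU backend
    auto s = _SplitInput({m}, true, name + ".qkv_split");            // recover q, k, v on the CPU side
    auto *qk = _MatmulINT8({s[0], s[1]}, false, true, name + ".qk"); // Q x K^T in int8
    qk = _Causalmask({qk}, name + ".mask");
    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
    return _MatmulINT8({qk, s[2]}, false, false, name + ".qkv");     // scores x V
}
```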
1 change: 1 addition & 0 deletions include/OpDefined.hpp
@@ -17,6 +17,7 @@ enum OpType {
MATMULINT8,
SCALE,
ROPE,
POSITIOANL_EMBEDDING,
RMSNORM,
CAUSALMASK,
LINEAR,
6 changes: 6 additions & 0 deletions include/Types.hpp
@@ -19,6 +19,7 @@ using std::map;
typedef map<std::string, float> OpParam;

typedef enum {
MLLM_DEFAULT,
MLLM_CPU,
MLLM_OPENCL,
MLLM_QNN
@@ -193,6 +194,11 @@ typedef struct {
int8_t qs[QK8_0]; // quants
} block_q8_0;
#pragma pack()
#pragma pack(1)
typedef struct {
int8_t qs[QK8_0]; // quants
} block_q8_0_sq; // q8 block for smoothquant
#pragma pack()

// This is only used for intermediate quantization and dot products
#pragma pack(1)
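
Reading the Types.hpp hunk above: block_q8_0_sq mirrors block_q8_0 but stores only the QK8_0 raw int8 quants, with no per-block scale field. The "q8 block for smoothquant" comment suggests the scale is a static per-tensor (or per-channel) value kept outside the block, as SmoothQuant works with precomputed activation and weight scales; that reading is an assumption, not something stated in the diff. A hedged sketch of dequantization under that assumption:

```cpp
// Illustrative only: dequantizing scale-less smoothquant blocks, assuming the
// scale lives outside the block (one static value per tensor or per channel).
#include <cstdint>

constexpr int QK8_0 = 32; // block size assumed to follow the usual q8_0 convention

#pragma pack(1)
struct block_q8_0_sq {
    int8_t qs[QK8_0]; // quants only; no embedded scale
};
#pragma pack()

inline void dequantize_row_sq(const block_q8_0_sq *blocks, int nblocks,
                              float static_scale, float *out) {
    for (int b = 0; b < nblocks; ++b)
        for (int i = 0; i < QK8_0; ++i)
            out[b * QK8_0 + i] = blocks[b].qs[i] * static_scale;
}
```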
2 changes: 1 addition & 1 deletion src/Op.hpp
@@ -112,7 +112,7 @@ class Op {
type_ = type;
}

private:
protected:
Backend *backend_;
vector<Tensor *> inputs_;
vector<Tensor *> outputs_;
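
Loosening backend_, inputs_ and outputs_ from private to protected lets backend-specific op subclasses (such as the QNN ops added elsewhere in this PR) reach those members directly instead of going through accessors. A minimal, purely hypothetical subclass that only compiles against the protected variant:

```cpp
// Hypothetical subclass for illustration; not part of the commit.
#include <iostream>
#include "Op.hpp"

using namespace mllm; // assumes Op is declared in the mllm namespace

class InspectingOp : public Op {
public:
    using Op::Op; // reuse whatever constructors Op declares

    void dumpArity() const {
        // Direct access to the members changed above; with `private:` these
        // lines would not compile in a derived class.
        std::cout << inputs_.size() << " inputs, "
                  << outputs_.size() << " outputs" << std::endl;
    }
};
```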
2 changes: 1 addition & 1 deletion src/backends/QNN/QNNBackend.cpp
@@ -110,7 +110,7 @@ QNNBackend::QNNBackend(shared_ptr<MemoryManager> mm) :
return;
}
// TODO: make debug level configuable
log::setLogLevel(QnnLog_Level_t::QNN_LOG_LEVEL_INFO);
log::setLogLevel(QnnLog_Level_t::QNN_LOG_LEVEL_ERROR);

std::string backEndPath = "libQnnHtp.so";
std::string opPackagePaths = "libQnnLLaMAPackage_CPU.so:LLaMAPackageInterfaceProvider:CPU,libQnnLLaMAPackage_HTP.so:LLaMAPackageInterfaceProvider:HTP";