Merge pull request #36 from liang1232018/develop-i8-temp
Develop i8 temp
liang1232018 authored Aug 1, 2024
2 parents 5223137 + 6ebe067 commit 0673a65
Showing 6 changed files with 118 additions and 39 deletions.
46 changes: 23 additions & 23 deletions CMakeLists.txt
@@ -567,29 +567,29 @@ endif ()



if(QNN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)

# if (ARM)
# set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# else()
# set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# endif()
# set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")

# qnn executables
add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/Tokenizer.hpp
src/tokenizers/BPE/Bpe.cpp
src/tokenizers/BPE/Bpe.hpp
)
target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
if (ARM)
target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
endif ()
endif()
# if(QNN)
# add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/QNN)
#
# # if (ARM)
# # set(CMAKE_CXX_FLAGS " -fPIC -Wall -pthread -march=armv8-a -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# # else()
# # set (CMAKE_CXX_FLAGS " -fPIC -Wall -pg -pthread -march=x86-64 -O3 -g -Wno-write-strings -fvisibility=hidden -flto")
# # endif()
# # set(CMAKE_LD_FLAGS "-shared -s -fPIC -pthread -fvisibility=hidden -flto")
#
# # qnn executables
# add_executable(qnn_opt_smoothquant ${PROJECT_SOURCE_DIR}/demo/qnn/qnn_opt_smoothquant.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
# src/tokenizers/Tokenizer.hpp
# src/tokenizers/BPE/Bpe.cpp
# src/tokenizers/BPE/Bpe.hpp
# )
# target_compile_definitions(qnn_opt_smoothquant PRIVATE USE_QNN)
# if (ARM)
# target_compile_options(qnn_opt_smoothquant PRIVATE -fopenmp)
# target_link_libraries(qnn_opt_smoothquant PUBLIC MLLM_CPU MLLM_QNN ${CMAKE_DL_LIBS} -fopenmp -static-openmp)
# endif ()
# endif()

if (APK)
add_library(mllm_lib STATIC ${DIR_SRC_CPU} ${DIR_SRC_EXP} ${DIR_SRC} ${DIR_SRC_MEM_MANAGER}
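Note on the CMakeLists.txt hunk above: the commit replaces the active QNN demo block with a fully commented-out copy, so the qnn_opt_smoothquant executable and its MLLM_QNN link step are no longer built. The uncommented block is the removed version; the block prefixed with '#' is what the commit adds.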
16 changes: 12 additions & 4 deletions demo/qnn/main_qwen.cpp
@@ -55,7 +55,6 @@ void fullTensor(shared_ptr<Tensor> input_tensor, Net net, vector<int> shape, Dty

NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head_size, int cache_max, string name) {
auto *q = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".q_proj");

auto *k = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".k_proj");
auto *v = _LinearINT8({x}, embedding_size, hidden_size * head_size, true, name + ".v_proj");

Expand All @@ -65,8 +64,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head

q = _RoPE({q}, HFHUBROPE, name + ".q_rope", 1000000, 32768);
k = _RoPE({k}, HFHUBROPE, name + ".k_rope", 1000000, 32768);
k = _KVCache({k}, cache_max, name + ".k_cache");
v = _KVCache({v}, cache_max, name + ".v_cache");
k = _KVCacheNPU({k}, cache_max, name + ".k_cache");
v = _KVCacheNPU({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
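Note on the hunk above: the two _KVCache lines are the removed versions of the two _KVCacheNPU lines that replace them, switching the k and v caches in Attention to the NPU variant of the cache op.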

@@ -97,6 +96,7 @@ void qwen_model(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int f

auto tmp = Attention(res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn");

return ;
i = *tmp+i;

res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");
@@ -124,7 +124,7 @@ int main(int argc, char **argv) {
cmdParser.parse_check(argc, argv);

const string cpu_model_path = "./models/Qwen1.5-1.8B-Chat_152_int8_biasint8_ns.mllm";
const string merge_file_path = "./vocab/merges-qwen.txt";
const string merge_file_path = "./vocab/merges_qwen.txt";

string vocab_path = cmdParser.get<string>("vocab");
int tokens_limit = cmdParser.get<int>("limits");
@@ -191,6 +191,11 @@ int main(int argc, char **argv) {
tokens_id[0] = 13;
}

for (int ti = 0; ti < tokens_id.size(); ti++) {
tokens_id[ti] = 9707;
std::cout << tokens_id[ti] << std::endl;
}

BPETokenizer::token2Tensor(&cpuNet, tokens_id, input);


Expand All @@ -203,6 +208,9 @@ int main(int argc, char **argv) {
cpuExe.run(&cpuNet, {input});
auto result = cpuExe.result();

result[0]->printData<float>();
exit(-1);

auto token_idx = postProcessing(result[0], input);
if (token_idx == 151645) { // "</s>"
break;
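Note on the remaining main_qwen.cpp hunks: the early return; added in qwen_model truncates the graph right after the first attention block, the new loop overwrites every prompt token with ID 9707 (apart from the printing, it amounts to std::fill(tokens_id.begin(), tokens_id.end(), 9707)), and the result[0]->printData<float>() plus exit(-1) pair stops the program after the first forward pass. These read as temporary scaffolding for debugging the int8 path. The merges file name also changes from merges-qwen.txt to merges_qwen.txt.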
2 changes: 1 addition & 1 deletion scripts/build_qnn_android.sh
@@ -13,6 +13,6 @@ cmake .. \
-DDEBUG=ON \
-DTEST=OFF \
-DQUANT=OFF \
-DSMOOTHQUANT=ON\
-DSMOOTHQUANT=OFF\

make -j4
79 changes: 69 additions & 10 deletions src/backends/cpu/CPULinearInt8.cpp
@@ -14,7 +14,13 @@ CPULinearInt8::CPULinearInt8(Backend *bn, string opName, int in_features, int ou
support_bias_ = bias;
thread_count = threadCount;
weight_.setBackend(bn);
originWeight_.setBackend(bn);
bias_.setBackend(bn);

weightScale_.setBackend(bn);
biasScale_.setBackend(bn);
inputActivatationScale_.setBackend(bn);
outputActivatationScale_.setBackend(bn);
}

ErrorCode CPULinearInt8::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
@@ -44,13 +50,34 @@ ErrorCode CPULinearInt8::reshape(vector<shared_ptr<Tensor>> inputs, vector<share

ErrorCode CPULinearInt8::load(AbstructLoader &loader) {
//std::cout << name() << " CPULinearInt8 load" << std::endl;
weight_.setName(name() + ".weight");
weight_.reshape(1, 1, out_features_, in_features_);
if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) {
std::cout << "load weight: " << loader.getDataType(weight_.name()) << std::endl;
weight_.setDtype(loader.getDataType(weight_.name()));
originWeight_.setName(name() + ".weight");
// origin weight is [in, out], while the linear weight is [out, in]
originWeight_.reshape(1, 1, in_features_, out_features_);
if (loader.getDataType(originWeight_.name()) != MLLM_TYPE_COUNT) {
originWeight_.setDtype(loader.getDataType(originWeight_.name()));
originWeight_.alloc();
loader.load(&originWeight_);


weight_.setName(name() + ".linear.weight");
weight_.reshape(1, 1, out_features_, in_features_);
weight_.setDtype(MLLM_TYPE_I8);
weight_.alloc();
loader.load(&weight_);

for (int i = 0; i < in_features_; ++i) {
for (int j = 0; j < out_features_; ++j) {
weight_.setDataAt<int8_t>(0, 0, j, i, originWeight_.dataAt<int8_t>(0,0, i,j));
}
}

originWeight_.free();

weightScale_.setName(name() + ".weight.scale");
weightScale_.reshape(1, 1, 1, 1);
weightScale_.setDtype(MLLM_TYPE_F32);
weightScale_.alloc();
loader.load(&weightScale_);

} else {
weight_.setDtype(MLLM_TYPE_F32);
weight_.alloc();
@@ -62,11 +89,32 @@ ErrorCode CPULinearInt8::load(AbstructLoader &loader) {
bias_.setDtype(loader.getDataType(bias_.name()));
bias_.alloc();
loader.load(&bias_);

biasScale_.setName(name() + ".bias.scale");
biasScale_.reshape(1, 1, 1, 1);
biasScale_.setDtype(MLLM_TYPE_F32);
biasScale_.alloc();
loader.load(&biasScale_);
} else {
bias_.setDtype(MLLM_TYPE_F32);
bias_.alloc();
}
}


inputActivatationScale_.setName(name() + ".input_scale");
inputActivatationScale_.reshape(1, 1, 1, 1);
inputActivatationScale_.setDtype(MLLM_TYPE_F32);
inputActivatationScale_.alloc();
loader.load(&inputActivatationScale_);

outputActivatationScale_.setName(name() + ".output_scale");
outputActivatationScale_.reshape(1, 1, 1, 1);
outputActivatationScale_.setDtype(MLLM_TYPE_F32);
outputActivatationScale_.alloc();
loader.load(&outputActivatationScale_);


return Op::load(loader);
}
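Note on the load() changes above: the checkpoint is now expected to store the int8 weight in [in_features, out_features] order, so load() transposes it into the [out_features, in_features] layout the kernel uses and then frees the temporary originWeight_. load() also reads four per-tensor f32 scales (.weight.scale, .bias.scale, .input_scale and .output_scale) that drive the requantization in mat_mul_fp32_i8 below.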

@@ -91,9 +139,19 @@ ErrorCode CPULinearInt8::free(vector<shared_ptr<Tensor>> inputs, vector<shared_p

ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias, int thread_count){
// todo: load scale from loader
const float scale1 = 1.0, scale2 = 1.0;
float scale1 = inputActivatationScale_.hostPtr<float>()[0] / 127.0;
scale1 = roundf(scale1 * 100000) / 100000;

float scale2 = weightScale_.hostPtr<float>()[0];

float scale3 = 0.0;
if(support_bias_)
scale3 = biasScale_.hostPtr<float>()[0];

float scale4 = outputActivatationScale_.hostPtr<float>()[0]/ 127.0;
scale4 = roundf(scale4 * 100000) / 100000;

assert(src1->dtype() == MLLM_TYPE_Q4_0);
assert(src1->dtype() == MLLM_TYPE_I8);
assert(src0_->dtype() == MLLM_TYPE_F32);
Tensor src0_i8(src0_->shape());
src0_i8.setBackend(src0_->backend());
@@ -125,7 +183,7 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds
Tensor *src0_cal = src0;
Tensor *src1_cal = src1;
const int64_t blck_0 = 16;
#pragma omp parallel for collapse(4) num_threads(thread_count)
// #pragma omp parallel for collapse(4) num_threads(thread_count)
for (int b = 0; b < src0->batch(); b++) {
for (int h = 0; h < src0->head(); h++) {
const int b_1 = (src1->batch() == 1 && src1->head() == 1) ? 0 : b;
Expand All @@ -144,8 +202,9 @@ ErrorCode CPULinearInt8::mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *ds

vec_dot_i8_i8(K, dst->ptrAt<float>(b, h, m, n), src1_cal->hostPtr<int8_t>() + src1_cal->offset(b_1, h_1, s_1, d_1), src0_cal->hostPtr<int8_t>() + src0_cal->offset(b, h, s_0, d_0), scale1, scale2);
if (support_bias) {
*dst->ptrAt<float>(b, h, m, n) += bias->dataAt<float>(0, 0, 0, n);
*dst->ptrAt<float>(b, h, m, n) += bias->dataAt<int8_t>(0, 0, 0, n) * scale3;
}
*dst->ptrAt<float>(b, h, m, n) = std::fmaxf(std::fminf(roundf(*dst->ptrAt<float>(b, h, m, n) / scale4), 127), -128) * scale4;
}
}
}
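Note: with the scales above, mat_mul_fp32_i8 behaves as a symmetric per-tensor int8 GEMM whose float result is snapped back onto the int8 output grid. Below is a minimal scalar sketch of the arithmetic for one output element, assuming vec_dot_i8_i8 accumulates the int8 dot product and scales it by scale1 * scale2; the kernel additionally rounds scale1 and scale4 to five decimal places, which is omitted here, and the function and parameter names are illustrative rather than part of the project's API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one output element of the int8 linear kernel above.
// q_x: quantized input row, q_w: quantized weight row (both int8),
// q_b: quantized bias entry (int8), K: in_features.
float int8_linear_element(const int8_t *q_x, const int8_t *q_w, int8_t q_b, int K,
                          float input_scale, float weight_scale,
                          float bias_scale, float output_scale) {
    const float scale1 = input_scale / 127.0f;  // activation step size
    const float scale2 = weight_scale;          // weight step size
    const float scale3 = bias_scale;            // bias step size
    const float scale4 = output_scale / 127.0f; // output step size

    int32_t acc = 0;
    for (int k = 0; k < K; ++k)
        acc += static_cast<int32_t>(q_x[k]) * static_cast<int32_t>(q_w[k]);

    float y = acc * scale1 * scale2 + static_cast<float>(q_b) * scale3;

    // Requantize onto the output grid and clamp to the int8 range,
    // mirroring the roundf/fminf/fmaxf line in the kernel.
    float q_y = std::round(y / scale4);
    q_y = std::min(std::max(q_y, -128.0f), 127.0f);
    return q_y * scale4;
}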
6 changes: 6 additions & 0 deletions src/backends/cpu/CPULinearInt8.hpp
@@ -30,8 +30,14 @@ class CPULinearInt8 final : public Op {
bool support_bias_;
int thread_count = 4;
Tensor weight_;
Tensor originWeight_;
Tensor bias_;

Tensor weightScale_;
Tensor biasScale_;
Tensor inputActivatationScale_;
Tensor outputActivatationScale_;

ErrorCode mat_mul_fp32_i8(Tensor *src0_, Tensor *src1, Tensor *dst, bool support_bias, Tensor *bias = nullptr, int thread_count = 4);
};

8 changes: 7 additions & 1 deletion src/backends/cpu/quantize/QuantizeQ8.cpp
@@ -278,14 +278,20 @@ void quantize_row_i8(const float *__restrict x, void *__restrict vy, int k, floa
const float d = scale;
const float id = d ? 1.0f / d : 0.0f;

const int32x4_t min_128 = vdupq_n_s32(-128);
const int32x4_t max127 = vdupq_n_s32( 127);

#if defined(__ARM_NEON)
for (int i = 0; i < nb; i++) {
float32x4_t srcv[8];
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i * 32 + 4 * j);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
int32x4_t vi = vcvtnq_s32_f32(v);

vi = vminq_s32(vi, max127);
vi = vmaxq_s32(vi, min_128);

y[i*32+ 4 * j + 0] = vgetq_lane_s32(vi, 0);
y[i*32+ 4 * j + 1] = vgetq_lane_s32(vi, 1);
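Note: the added vminq_s32/vmaxq_s32 pair saturates the rounded values to the int8 range before they are narrowed to int8 and stored. A scalar sketch of the per-value work in quantize_row_i8, assuming the default round-to-nearest mode (the helper name is illustrative):

#include <cmath>
#include <cstdint>

// Quantize one float: scale, round to nearest, then clamp to [-128, 127]
// (the clamp is what this hunk adds on the NEON path).
inline int8_t quantize_one_i8(float x, float scale) {
    const float id = scale != 0.0f ? 1.0f / scale : 0.0f;
    int v = static_cast<int>(std::nearbyint(x * id)); // like vcvtnq_s32_f32 under round-to-nearest
    if (v > 127) v = 127;   // vminq_s32(vi, max127)
    if (v < -128) v = -128; // vmaxq_s32(vi, min_128)
    return static_cast<int8_t>(v);
}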
