
Commit affe946

Merge pull request #104 from yirongjie/main
perf: add AArch64 GEMM/GEMV for q4_0.
2 parents 56a0603 + 6e941b5 commit affe946

File tree: 81 files changed, +4913 −1232 lines

CMakeLists.txt

Lines changed: 7 additions & 1 deletion
```diff
@@ -21,6 +21,11 @@ endif ()
 
 if (ARM)
     set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
+    add_compile_definitions(__ARM_FEATURE_DOTPROD)
+    # Check whether the compiler is GCC or Clang
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod")
+    endif()
 else ()
     set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
 endif ()
@@ -96,7 +101,8 @@ endif ()
 if (QUANT)
     include_directories(${PROJECT_SOURCE_DIR}/src/quantizer)
     file(GLOB_RECURSE MLLM_QUANT
-
+        ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.hpp
+        ${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.cpp
         ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.hpp
         ${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.cpp
     )
```
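Two notes on the ARM block: GCC and Clang already predefine __ARM_FEATURE_DOTPROD once -march=armv8.2-a+dotprod is in effect, so the explicit add_compile_definitions mainly guards configurations where the flag isn't applied; and binaries built with +dotprod will raise SIGILL on ARMv8.0 cores that lack the extension, so the setting assumes an ARMv8.2+ deployment target.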

examples/demo_imagebind_1mod.cpp

Lines changed: 35 additions & 31 deletions
```diff
@@ -13,53 +13,57 @@ int main(int argc, char **argv) {
     cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
     cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
+    cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
+    cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
     cmdParser.parse_check(argc, argv);
 
     string vocab_path = cmdParser.get<string>("vocab");
     string model_path = cmdParser.get<string>("model");
     string merges_path = cmdParser.get<string>("merges");
+    int loop_times = cmdParser.get<int>("loop_times");
+    string modality = cmdParser.get<string>("modality");
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto processor = ImagebindProcessor(vocab_path, merges_path);
-
     ImagebindConfig config("huge");
 
-    int loop_times = 10;
-
-    // auto input_tensors = processor.process(
-    //     {"a dog.", "A car", "A bird"},config.max_position_embeddings,
-    //     {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
-    //     {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});
-
     auto input_tensors = processor.process(
-        {"a dog."},config.max_position_embeddings,
+        {"a dog."}, config.max_position_embeddings,
         {"../assets/dog_image.jpg"}, config.img_hw,
         {"../assets/dog_audio.wav"});
-
-    std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
-    auto text_model = ImagebindTextModel(config);
-    text_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+
+    if (modality == "text" || modality == "all") {
+        std::cout << "Text| input_shape:[" << input_tensors.text_tensors.batch() << ", " << input_tensors.text_tensors.sequence() << ", " << input_tensors.text_tensors.head() << ", " << input_tensors.text_tensors.dimension() << "]" << std::endl;
+        auto text_model = ImagebindTextModel(config);
+        text_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
+        }
+        text_model.profiling();
+        text_model.free();
     }
-    text_model.profiling();
-    text_model.free();
 
-    std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
-    auto vision_model = ImagebindVisionModel(config);
-    vision_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = vision_model({input_tensors.img_tensors});
+    if (modality == "vision" || modality == "all") {
+        std::cout << "Vision| input_shape:[" << input_tensors.img_tensors.batch() << ", " << input_tensors.img_tensors.channel() << ", " << input_tensors.img_tensors.time() << ", " << input_tensors.img_tensors.height() << ", " << input_tensors.img_tensors.width() << "]" << std::endl;
+        auto vision_model = ImagebindVisionModel(config);
+        vision_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = vision_model({input_tensors.img_tensors});
+        }
+        vision_model.profiling();
+        vision_model.free();
     }
-    vision_model.profiling();
-    vision_model.free();
 
-    std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
-    auto audio_model = ImagebindAudioModel(config);
-    audio_model.load(model_path);
-    for (int step = 0; step < loop_times; step++) {
-        auto result = audio_model({input_tensors.audio_tensors});
+    if (modality == "audio" || modality == "all") {
+        std::cout << "Audio| input_shape:[" << input_tensors.audio_tensors.batch() << ", " << input_tensors.audio_tensors.sequence() << ", " << input_tensors.audio_tensors.head() << ", " << input_tensors.audio_tensors.dimension() << "]" << std::endl;
+        auto audio_model = ImagebindAudioModel(config);
+        audio_model.load(model_path);
+        for (int step = 0; step < loop_times; step++) {
+            auto result = audio_model({input_tensors.audio_tensors});
+        }
+        audio_model.profiling();
+        audio_model.free();
     }
-    audio_model.profiling();
-    audio_model.free();
+
+    return 0;
 }
```
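The demo now takes the loop count and target modality from the command line instead of hard-coding them, so each encoder can be loaded, timed, and freed in isolation, e.g. (hypothetical invocation) ./demo_imagebind_1mod -o vision -l 5. Together with the model.profiling() call added to demo_llama.cpp below, this reads as benchmarking plumbing for the new kernels.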

examples/demo_llama.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.profiling();
     }
 
     return 0;
```

examples/demo_qwen.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
-    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-0.5b-q4_k.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto tokenizer = QWenTokenizer(vocab_path, merge_path);
-    QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
+    QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
     auto model = QWenForCausalLM(config);
     model.load(model_path);
 
```

examples/demo_yi.cpp

Lines changed: 6 additions & 6 deletions
```diff
@@ -9,9 +9,9 @@
  *
  */
 #include "cmdline.h"
-#include "models/yi/configuration_yi.hpp"
-#include "models/yi/modeling_yi.hpp"
-#include "models/yi/tokenization_yi.hpp"
+#include "models/llama/configuration_llama.hpp"
+#include "models/llama/modeling_llama.hpp"
+#include "models/llama/tokenization_llama.hpp"
 #include "processor/PostProcess.hpp"
 
 using namespace mllm;
@@ -29,9 +29,9 @@ int main(int argc, char **argv) {
     int tokens_limit = cmdParser.get<int>("limits");
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
-    auto tokenizer = YiTokenizer(vocab_path);
-    YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
-    auto model = YiForCausalLM(config);
+    auto tokenizer = LLaMATokenizer(vocab_path, false);
+    LLaMAConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE, 64000);
+    auto model = LLaMAModel(config);
     model.load(model_path);
 
     vector<string> in_strs = {
```
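The dedicated Yi classes are dropped in favor of the LLaMA implementation: Yi-6B uses the LLaMA architecture and differs chiefly in its 64000-entry vocabulary, which is now passed explicitly to LLaMAConfig.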

examples/main_alpaca.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
     v = _KVCache( {v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = *qk/std::sqrt(hidden_size);
-    qk = _Causalmask( {qk}, name + ".mask");
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask( {qk}, name + ".mask");
+    qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
```
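This file and the remaining examples all make the same change: the standalone _Causalmask op is retired and masking is folded into _Softmax via a new boolean parameter — true at autoregressive call sites, false where attention is bidirectional (the CLIP vision branch and the ImageBind retrieval heads below). As a minimal sketch of what such a fused causal softmax computes per row (illustrative only, not mllm's _Softmax implementation):

```cpp
// Fused causal softmax over one row of an attention-score matrix, in place:
// entries j > row are treated as -inf (never touched, then zeroed) instead
// of being materialized by a separate mask op. Illustrative sketch only.
#include <algorithm>
#include <cmath>

void causal_softmax_row(float *scores, int row, int seq_len) {
    const int valid = row + 1;                  // causal: row i attends to 0..i
    float mx = scores[0];
    for (int j = 1; j < valid; ++j) mx = std::max(mx, scores[j]);
    float denom = 0.0f;
    for (int j = 0; j < valid; ++j) {
        scores[j] = std::exp(scores[j] - mx);   // max-subtracted for stability
        denom += scores[j];
    }
    for (int j = 0; j < valid; ++j) scores[j] /= denom;
    std::fill(scores + valid, scores + seq_len, 0.0f); // masked-out positions
}
```

Fusing the mask this way avoids building and adding a seq_len × seq_len mask tensor; the masked tail of each row is simply skipped.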

examples/main_clip.cpp

Lines changed: 4 additions & 2 deletions
```diff
@@ -45,9 +45,11 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
     if(name.find("text_model") != std::string::npos){
-        qk = _Causalmask( {qk}, name + ".mask");
+        // qk = _Causalmask( {qk}, name + ".mask");
+        qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
+    } else{
+        qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     }
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
```

examples/main_fuyu.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -102,8 +102,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     v = _KVCache({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = _Scale({qk}, 1.0F / std::sqrt(head_size), 0.0F, false, name + ".scale");
-    qk = _Causalmask({qk}, name + ".mask");
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask({qk}, name + ".mask");
+    qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".dense");
```

examples/main_imagebind.cpp

Lines changed: 5 additions & 4 deletions
```diff
@@ -118,9 +118,10 @@ NetTensor *Attention(Context *c,NetTensor *x, int embedding_size, int hidden_siz
     auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
     qk = *qk/std::sqrt(hidden_size);
     if(name.find("text") != std::string::npos){
-        qk = _Causalmask( {qk}, name + ".mask");
+        qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
+    } else{
+        qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
     }
-    qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
     auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
@@ -227,10 +228,10 @@ void ImageBind(Context* c) {
     a = a->transpose(BATCH, SEQUENCE);
 
     auto *j1 = _Matmul( {p, i}, false, true, "final.vision@text");
-    j1 = _Softmax( {j1}, DIMENSION, "final.vision@text.softmax");
+    j1 = _Softmax( {j1}, DIMENSION, false, "final.vision@text.softmax");
 
     auto *j2 = _Matmul( {p, a}, false, true, "final.vision@audio");
-    j2 = _Softmax( {j2}, DIMENSION, "final.vision@audio.softmax");
+    j2 = _Softmax( {j2}, DIMENSION, false, "final.vision@audio.softmax");
 
     i = _Cat( {j1, j2}, BATCH, "final.cat");
 }
```

examples/main_llama.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -50,8 +50,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
     v = _KVCache({v}, cache_max, name + ".v_cache");
     auto *qk = _Matmul({q, k}, false, true, name + ".qk");
     qk = *qk / std::sqrt(hidden_size);
-    qk = _Causalmask({qk}, name + ".mask");
-    qk = _Softmax({qk}, DIMENSION, name + ".softmax");
+    // qk = _Causalmask({qk}, name + ".mask");
+    qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
     auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
     o = o->view(-1, 1, -1, hidden_size * head_size);
     o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".wo");
```
