Merge pull request #104 from yirongjie/main

perf: add AArch64 GEMM/GEMV for q4_0.

yirongjie authored Jul 30, 2024
2 parents 56a0603 + 6e941b5 commit affe946
Showing 81 changed files with 4,913 additions and 1,232 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -21,6 +21,11 @@ endif ()

if (ARM)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin-arm)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# Check whether the compiler is GCC or Clang
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod")
endif()
else ()
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
endif ()
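The `__ARM_FEATURE_DOTPROD` definition and the `-march=armv8.2-a+dotprod` flag enable the Armv8.2 signed dot-product instructions that the new AArch64 GEMM/GEMV kernels build on. A minimal sketch of the intrinsic this exposes (illustrative only; `dot16_i8` is a hypothetical helper, not a function from GEMM_AArch64.cpp):

```cpp
// Sketch: sum of products of 16 int8 pairs using the dotprod extension.
// Requires -march=armv8.2-a+dotprod (or a target that defines __ARM_FEATURE_DOTPROD).
#include <arm_neon.h>
#include <cstdint>

#if defined(__ARM_FEATURE_DOTPROD)
static inline int32_t dot16_i8(const int8_t *a, const int8_t *b) {
    int8x16_t va = vld1q_s8(a);
    int8x16_t vb = vld1q_s8(b);
    // vdotq_s32 accumulates four partial sums, each over four int8 products.
    int32x4_t acc = vdotq_s32(vdupq_n_s32(0), va, vb);
    return vaddvq_s32(acc); // horizontal add of the four lanes
}
#endif
```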
@@ -96,7 +101,8 @@ endif ()
if (QUANT)
include_directories(${PROJECT_SOURCE_DIR}/src/quantizer)
file(GLOB_RECURSE MLLM_QUANT

${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.hpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/compute/GEMM_AArch64.cpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.hpp
${PROJECT_SOURCE_DIR}/src/backends/cpu/quantize/*.cpp
)
66 changes: 35 additions & 31 deletions examples/demo_imagebind_1mod.cpp
@@ -13,53 +13,57 @@ int main(int argc, char **argv) {
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/imagebind_huge-q4_k.mllm");
cmdParser.add<string>("merges", 'f', "specify mllm tokenizer merges.txt path", false, "../vocab/clip_merges.txt");
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.add<int>("loop_times", 'l', "number of inference loops", false, 10);
cmdParser.add<string>("modality", 'o', "inference modality (text/vision/audio/all)", false, "all");
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
string merges_path = cmdParser.get<string>("merges");
int loop_times = cmdParser.get<int>("loop_times");
string modality = cmdParser.get<string>("modality");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto processor = ImagebindProcessor(vocab_path, merges_path);

ImagebindConfig config("huge");

int loop_times = 10;

// auto input_tensors = processor.process(
// {"a dog.", "A car", "A bird"},config.max_position_embeddings,
// {"../assets/dog_image.jpg", "../assets/car_image.jpg", "../assets/bird_image.jpg"}, config.img_hw,
// {"../assets/dog_audio.wav", "../assets/car_audio.wav", "../assets/bird_audio.wav"});

auto input_tensors = processor.process(
{"a dog."},config.max_position_embeddings,
{"a dog."}, config.max_position_embeddings,
{"../assets/dog_image.jpg"}, config.img_hw,
{"../assets/dog_audio.wav"});

std::cout<<"Text| input_shape:["<<input_tensors.text_tensors.batch()<<", "<<input_tensors.text_tensors.sequence()<<", "<<input_tensors.text_tensors.head()<<", "<<input_tensors.text_tensors.dimension()<<"]"<<std::endl;
auto text_model = ImagebindTextModel(config);
text_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);

if (modality == "text" || modality == "all") {
std::cout << "Text| input_shape:[" << input_tensors.text_tensors.batch() << ", " << input_tensors.text_tensors.sequence() << ", " << input_tensors.text_tensors.head() << ", " << input_tensors.text_tensors.dimension() << "]" << std::endl;
auto text_model = ImagebindTextModel(config);
text_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = text_model({input_tensors.text_tensors}, input_tensors.in_len);
}
text_model.profiling();
text_model.free();
}
text_model.profiling();
text_model.free();

std::cout<<"Vision| input_shape:["<<input_tensors.img_tensors.batch()<<", "<<input_tensors.img_tensors.channel()<<", "<<input_tensors.img_tensors.time()<<", "<<input_tensors.img_tensors.height()<<", "<<input_tensors.img_tensors.width()<<"]"<<std::endl;
auto vision_model = ImagebindVisionModel(config);
vision_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = vision_model({input_tensors.img_tensors});
if (modality == "vision" || modality == "all") {
std::cout << "Vision| input_shape:[" << input_tensors.img_tensors.batch() << ", " << input_tensors.img_tensors.channel() << ", " << input_tensors.img_tensors.time() << ", " << input_tensors.img_tensors.height() << ", " << input_tensors.img_tensors.width() << "]" << std::endl;
auto vision_model = ImagebindVisionModel(config);
vision_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = vision_model({input_tensors.img_tensors});
}
vision_model.profiling();
vision_model.free();
}
vision_model.profiling();
vision_model.free();

std::cout<<"Audio| input_shape:["<<input_tensors.audio_tensors.batch()<<", "<<input_tensors.audio_tensors.sequence()<<", "<<input_tensors.audio_tensors.head()<<", "<<input_tensors.audio_tensors.dimension()<<"]"<<std::endl;
auto audio_model = ImagebindAudioModel(config);
audio_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = audio_model({input_tensors.audio_tensors});
if (modality == "audio" || modality == "all") {
std::cout << "Audio| input_shape:[" << input_tensors.audio_tensors.batch() << ", " << input_tensors.audio_tensors.sequence() << ", " << input_tensors.audio_tensors.head() << ", " << input_tensors.audio_tensors.dimension() << "]" << std::endl;
auto audio_model = ImagebindAudioModel(config);
audio_model.load(model_path);
for (int step = 0; step < loop_times; step++) {
auto result = audio_model({input_tensors.audio_tensors});
}
audio_model.profiling();
audio_model.free();
}
audio_model.profiling();
audio_model.free();

return 0;
}
1 change: 1 addition & 0 deletions examples/demo_llama.cpp
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
model.profiling();
}

return 0;
4 changes: 2 additions & 2 deletions examples/demo_qwen.cpp
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-0.5b-q4_k.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
model.load(model_path);

12 changes: 6 additions & 6 deletions examples/demo_yi.cpp
@@ -9,9 +9,9 @@
*
*/
#include "cmdline.h"
#include "models/yi/configuration_yi.hpp"
#include "models/yi/modeling_yi.hpp"
#include "models/yi/tokenization_yi.hpp"
#include "models/llama/configuration_llama.hpp"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;
@@ -29,9 +29,9 @@ int main(int argc, char **argv) {
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = YiTokenizer(vocab_path);
YiConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE);
auto model = YiForCausalLM(config);
auto tokenizer = LLaMATokenizer(vocab_path, false);
LLaMAConfig config(tokens_limit, "6B", RoPEType::HFHUBROPE, 64000);
auto model = LLaMAModel(config);
model.load(model_path);

vector<string> in_strs = {
4 changes: 2 additions & 2 deletions examples/main_alpaca.cpp
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
v = _KVCache( {v}, cache_max, name + ".v_cache");
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
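A pattern repeated across these example graphs: the standalone `_Causalmask` op is commented out and `_Softmax` gains a third boolean argument, which from the call sites appears to fold causal masking into the softmax itself. A standalone sketch of that fused behaviour for a single attention row (an interpretation of the new signature, not the mllm implementation):

```cpp
// Causal-masked softmax over one row of attention scores (sketch).
// Assumption: do_causal == true ignores key positions beyond query_pos,
// mirroring what _Softmax(..., DIMENSION, true, ...) appears to do.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_row(const std::vector<float> &scores, int query_pos, bool do_causal) {
    const int n = static_cast<int>(scores.size());
    const int limit = do_causal ? std::min(query_pos + 1, n) : n;
    std::vector<float> out(n, 0.0f); // masked positions keep probability 0
    float max_v = -INFINITY;
    for (int i = 0; i < limit; ++i) max_v = std::max(max_v, scores[i]);
    float sum = 0.0f;
    for (int i = 0; i < limit; ++i) {
        out[i] = std::exp(scores[i] - max_v);
        sum += out[i];
    }
    for (int i = 0; i < limit; ++i) out[i] /= sum;
    return out;
}
```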
6 changes: 4 additions & 2 deletions examples/main_clip.cpp
@@ -45,9 +45,11 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
if(name.find("text_model") != std::string::npos){
qk = _Causalmask( {qk}, name + ".mask");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
4 changes: 2 additions & 2 deletions examples/main_fuyu.cpp
@@ -102,8 +102,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = _Scale({qk}, 1.0F / std::sqrt(head_size), 0.0F, false, name + ".scale");
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".dense");
9 changes: 5 additions & 4 deletions examples/main_imagebind.cpp
@@ -118,9 +118,10 @@ NetTensor *Attention(Context *c,NetTensor *x, int embedding_size, int hidden_siz
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
if(name.find("text") != std::string::npos){
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
@@ -227,10 +228,10 @@ void ImageBind(Context* c) {
a = a->transpose(BATCH, SEQUENCE);

auto *j1 = _Matmul( {p, i}, false, true, "final.vision@text");
j1 = _Softmax( {j1}, DIMENSION, "final.vision@text.softmax");
j1 = _Softmax( {j1}, DIMENSION, false, "final.vision@text.softmax");

auto *j2 = _Matmul( {p, a}, false, true, "final.vision@audio");
j2 = _Softmax( {j2}, DIMENSION, "final.vision@audio.softmax");
j2 = _Softmax( {j2}, DIMENSION, false, "final.vision@audio.softmax");

i = _Cat( {j1, j2}, BATCH, "final.cat");
}
4 changes: 2 additions & 2 deletions examples/main_llama.cpp
@@ -50,8 +50,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".wo");
9 changes: 5 additions & 4 deletions examples/main_llava.cpp
@@ -72,8 +72,8 @@ NetTensor *Attention(NetTensor *x, int embedding_size, int hidden_size, int head
v = _KVCache({v}, cache_max, name + ".v_cache");
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = *qk / std::sqrt(hidden_size);
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax({qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
@@ -117,9 +117,10 @@ NetTensor *VisionAttention(NetTensor *x, int embedding_size, int hidden_size, in
auto *qk = _Matmul({q, k}, false, true, name + ".qk");
qk = _Scale({qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
if (name.find("text_model") != std::string::npos) {
qk = _Causalmask({qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
} else{
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
}
qk = _Softmax({qk}, DIMENSION, name + ".softmax");
auto *o = _Matmul({qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear({o}, hidden_size * head_size, embedding_size, true, name + ".out_proj");
4 changes: 2 additions & 2 deletions examples/main_tinyllama.cpp
@@ -51,8 +51,8 @@ NetTensor *Attention( NetTensor * x, int embedding_size, int hidden_size, int he
v = _KVCache( {v},head_size/mutil_key_value_head, cache_max, name + ".v_cache");
auto *qk = _Matmul( {q, k}, false, true, name + ".qk");
qk = *qk/std::sqrt(hidden_size);
qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, true, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedding_size, false, name + ".o_proj");
2 changes: 1 addition & 1 deletion examples/main_vit.cpp
@@ -1089,7 +1089,7 @@ NetTensor *Attention(NetTensor * x, int embedded_size, int hidden_size, int head
qk = *qk/std::sqrt(hidden_size);
// qk = _Scale( {qk}, 1.0F / std::sqrt(hidden_size), 0.0F, false, name + ".scale");
// qk = _Causalmask( {qk}, name + ".mask");
qk = _Softmax( {qk}, DIMENSION, name + ".softmax");
qk = _Softmax( {qk}, DIMENSION, false, name + ".softmax");
auto *o = _Matmul( {qk, v}, false, false, name + ".qkv");
o = o->view(-1, 1, -1, hidden_size * head_size);
o = _Linear( {o}, hidden_size * head_size, embedded_size, true, name + ".output.dense");
56 changes: 56 additions & 0 deletions include/Types.hpp
@@ -56,6 +56,10 @@ enum DataType {
MLLM_TYPE_I8,
MLLM_TYPE_I16,
MLLM_TYPE_I32,
MLLM_TYPE_Q4_0_4_4=19,
MLLM_TYPE_Q4_0_4_8=20,
MLLM_TYPE_Q4_0_8_8=21,
MLLM_TYPE_Q8_0_4_4,
MLLM_TYPE_COUNT,
};
enum ChlType {
@@ -147,6 +151,8 @@ enum RoPEType {
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// #define LLAMAFILE_SGEMM

#if defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 mllm_fp16_t;
#else
@@ -223,6 +229,39 @@ typedef struct {
#pragma pack()
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K / 16 * sizeof(int16_t), "wrong q8_K block size/padding");


#pragma pack(1)
typedef struct {
mllm_fp16_t d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
#pragma pack()
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(mllm_fp16_t) + QK4_0 * 2, "wrong q4_0x4 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
#pragma pack()
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(mllm_fp16_t) + QK4_0 * 4, "wrong q4_0x8 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
#pragma pack()
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(mllm_fp16_t) + QK8_0 * 4, "wrong q8_0x4 block size/padding");

#pragma pack(1)
typedef struct {
mllm_fp16_t d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
#pragma pack()
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(mllm_fp16_t) + QK8_0 * 8, "wrong q8_0x8 block size/padding");

//

static string DataTypeName(DataType dataType) {
@@ -251,6 +290,14 @@ static string DataTypeName(DataType dataType) {
return "Q4_1";
case MLLM_TYPE_Q8_1:
return "Q8_1";
case MLLM_TYPE_Q4_0_4_4:
return "Q4_0_4_4";
case MLLM_TYPE_Q4_0_4_8:
return "Q4_0_4_8";
case MLLM_TYPE_Q4_0_8_8:
return "Q4_0_8_8";
case MLLM_TYPE_Q8_0_4_4:
return "Q8_0_4_4";
case MLLM_TYPE_COUNT:
return "COUNT";
default:
@@ -281,6 +328,15 @@ static size_t DataTypeSize(DataType dtype, int count = 1) {
return (sizeof(block_q8_K)) * count / (QK_K);
case MLLM_TYPE_Q4_1:
case MLLM_TYPE_Q8_1:
return -1;
case MLLM_TYPE_Q4_0_4_4:
return (sizeof(block_q4_0x4)) * count / (QK4_0 * 4);
case MLLM_TYPE_Q4_0_4_8:
return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
case MLLM_TYPE_Q4_0_8_8:
return (sizeof(block_q4_0x8)) * count / (QK4_0 * 8);
case MLLM_TYPE_Q8_0_4_4:
return (sizeof(block_q8_0x4)) * count / (QK8_0 * 4);
case MLLM_TYPE_COUNT:
return 0;
default:
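The new Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8 types pack 4 or 8 ordinary q4_0 blocks into one interleaved super-block so the AArch64 GEMM/GEMV kernels can process several of them per load; the exact interleaving of `qs` is kernel-specific and not shown here. Per the struct sizes above, the storage cost per weight is unchanged from plain q4_0, which the following check confirms (standalone sketch, values taken from the definitions in this diff):

```cpp
// Storage check: block_q4_0x4 packs 4 q4_0 blocks (4 * 32 = 128 weights) into
// 4 fp16 deltas (8 bytes) + 64 nibble bytes = 72 bytes, i.e. 0.5625 bytes/weight,
// the same density as plain q4_0 (18 bytes per 32 weights).
#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t QK4_0 = 32;                         // weights per q4_0 block
    const std::size_t q4_0_bytes   = 2 + QK4_0 / 2;       // fp16 delta + 16 nibble bytes = 18
    const std::size_t q4_0x4_bytes = 4 * 2 + QK4_0 * 2;   // 4 deltas + 64 nibble bytes = 72
    std::printf("q4_0   : %.4f bytes/weight\n", double(q4_0_bytes) / QK4_0);
    std::printf("q4_0x4 : %.4f bytes/weight\n", double(q4_0x4_bytes) / (QK4_0 * 4));
    return 0;
}
```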