Commit
Merge branch 'main' of https://github.com/yirongjie/mllm
yirongjie committed Aug 3, 2024
2 parents 88de194 + 7158040 commit 3fb20f9
Showing 10 changed files with 8,146 additions and 67 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -481,6 +481,8 @@ endif ()
add_executable(demo_qwen ${PROJECT_SOURCE_DIR}/examples/demo_qwen.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
+ src/tokenizers/Unicode.cpp
+ src/tokenizers/UnicodeData.cpp
src/processor/PreProcess.cpp
)
if (ARM AND NOT APK)
Expand Down Expand Up @@ -546,6 +548,8 @@ if (APK)
src/tokenizers/Unigram/trie.hpp
src/tokenizers/BPE/Bpe.cpp
src/tokenizers/BPE/Bpe.hpp
+ src/tokenizers/Unicode.cpp
+ src/tokenizers/UnicodeData.cpp
src/processor/PreProcess.hpp
src/processor/FuyuPreProcess.hpp
src/processor/FuyuPreProcess.cpp
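Note (not part of the diff): the two sources added to both targets presumably provide the Unicode-aware regex splitting that the updated Qwen tokenizer below depends on. A minimal sketch of the assumed interface — the real declaration lives in src/tokenizers/Unicode.hpp; this signature is inferred from the call sites in tokenization_qwen.hpp, not copied from the header:

    #include <string>
    #include <vector>

    // Assumed interface: split `text` into pieces using regex patterns
    // that understand Unicode classes such as \p{L} (letter) and \p{N}
    // (number); presumably backed by the tables in UnicodeData.cpp.
    std::vector<std::string> unicode_regex_split(
        const std::string &text,
        const std::vector<std::string> &regex_exprs);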
5 changes: 3 additions & 2 deletions examples/demo_qwen.cpp
@@ -11,11 +11,12 @@
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
// std::iostream::sync_with_stdio(false);

cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
@@ -70,6 +71,6 @@ int main(int argc, char **argv) {
}
return true;
});
- printf("\n");
+ std::cout << "\n";
}
}
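For context: the printf → std::cout swap above sits at the end of the token-streaming callback the demo passes into generation (the `return true; });` closing the hunk is the tail of that lambda). A hypothetical sketch of the surrounding pattern — model, opt, and the postprocessing helpers are assumed names, not taken from the file:

    // Hypothetical sketch of the streaming loop; every identifier except
    // std::cout is an assumed name. The callback returns false to stop
    // generation and true to continue, matching the `return true; });`
    // visible in the hunk above.
    model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
        auto out_string = tokenizer.detokenize({out_token});               // assumed helper
        auto [not_end, output_string] = tokenizer.postprocess(out_string); // assumed helper
        if (!not_end) return false;                // stop on an end-of-turn token
        std::cout << output_string << std::flush;  // stream each piece as it arrives
        return true;
    });
    std::cout << "\n"; // the hunk replaces printf("\n") with this line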
39 changes: 21 additions & 18 deletions src/backends/cpu/compute/GEMM_AArch64.hpp
@@ -1,34 +1,37 @@
#ifndef MLLM_GEMM_AARCH64_HPP
#define MLLM_GEMM_AARCH64_HPP

#include "VecDot.hpp"
using namespace mllm;

// Quantization
- void quantize_q8_0_4x4(const float * __restrict x, void * __restrict y, int64_t k);
- void quantize_q8_0_4x8(const float * __restrict x, void * __restrict y, int64_t k);
+ void quantize_q8_0_4x4(const float *__restrict x, void *__restrict y, int64_t k);
+ void quantize_q8_0_4x8(const float *__restrict x, void *__restrict y, int64_t k);

- void quantize_mat_q8_0(const float * __restrict x, void * __restrict y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+ void quantize_mat_q8_0(const float *__restrict x, void *__restrict y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
- size_t quantize_q4_0_4x4(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
- size_t quantize_q4_0_4x8(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
- size_t quantize_q4_0_8x8(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+ size_t quantize_q4_0_4x4(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);
+ size_t quantize_q4_0_4x8(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);
+ size_t quantize_q4_0_8x8(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);

// GEMV
- void mllm_gemv_q4_0_4x4_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
- void mllm_gemv_q4_0_4x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
- void mllm_gemv_q4_0_8x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
+ void mllm_gemv_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ void mllm_gemv_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ void mllm_gemv_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ // void mllm_gemv_q4_0_4x4_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+ // void mllm_gemv_q4_0_4x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+ // void mllm_gemv_q4_0_8x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);

// GEMM
- void mllm_gemm_q4_0_4x4_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
- void mllm_gemm_q4_0_4x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
- void mllm_gemm_q4_0_8x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);

- void quantize_row_q4_0_4x4(const float * __restrict x, void * __restrict y, int k);
- void quantize_row_q4_0_4x4(const float * __restrict x, void * __restrict y, int k, int raw);
+ void mllm_gemm_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ void mllm_gemm_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ void mllm_gemm_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+ // void mllm_gemm_q4_0_4x4_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+ // void mllm_gemm_q4_0_4x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+ // void mllm_gemm_q4_0_8x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);

+ void quantize_row_q4_0_4x4(const float *__restrict x, void *__restrict y, int k);
+ void quantize_row_q4_0_4x4(const float *__restrict x, void *__restrict y, int k, int raw);

#endif // MLLM_GEMM_AARCH64_HPP
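Background, not part of the diff: the q8_0/q4_0 names above follow the ggml/llama.cpp block-quantization scheme, and the _4x4/_4x8/_8x8 suffixes denote SIMD-friendly row/column interleaving. As a reference point, a scalar sketch of plain (non-interleaved) q8_0 quantization of one 32-element block — illustrative only; the interleaved block layout actually consumed by quantize_q8_0_4x4 and friends is defined elsewhere in the backend and is an assumption here:

    #include <cmath>
    #include <cstdint>

    // Illustrative scalar q8_0: each 32-float block stores one scale
    // d = max|x| / 127 plus 32 int8 values round(x[i] / d). The real
    // 4x4/4x8 kernels additionally interleave several rows per block.
    void quantize_block_q8_0_ref(const float *x, float *d_out, int8_t *q_out) {
        float amax = 0.0f;
        for (int i = 0; i < 32; ++i) amax = std::fmax(amax, std::fabs(x[i]));
        const float d = amax / 127.0f;
        const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
        *d_out = d;
        for (int i = 0; i < 32; ++i) q_out[i] = (int8_t)std::lround(x[i] * id);
    }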
107 changes: 93 additions & 14 deletions src/models/qwen/tokenization_qwen.hpp
@@ -12,6 +12,7 @@

#include "tokenizers/BPE/Bpe.hpp"
#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unicode.hpp"
#include <algorithm>
#include <unordered_map>

@@ -45,10 +46,15 @@ static std::vector<int> __ord(std::string v) {
}

static const std::string PAT_STR = R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?:$|[^\S])|\s+)";
+ static const std::string SPLIT_PAT_STR = R"(<\|im_start\|>|<\|im_end\|>|<\|endoftext\|>)";
+ static const std::vector<std::string> FIXED_PAT_STRS = {
+     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };

class QWenTokenizer final {
public:
- explicit QWenTokenizer(const std::string &vocab_file, const std::string &merge_file) {
+ explicit QWenTokenizer(const std::string &vocab_file, const std::string &merge_file, bool split_special_tokens = false) :
+     split_special_tokens_(split_special_tokens) {
Module::initBackend(MLLM_CPU);
tokenizer = new BPETokenizer(vocab_file);

@@ -114,28 +120,95 @@ class QWenTokenizer final {
return elems;
}

+ std::vector<std::string> _splitWithDelimiters(const std::string &str, const std::vector<std::string> &delimiters) {
+     std::string s = str;
+     std::vector<std::string> result;
+     size_t pos = 0;
+     auto isDelimiter = [&](size_t currentPos) {
+         for (const auto &delimiter : delimiters) {
+             if (currentPos + delimiter.length() <= s.length() && s.substr(currentPos, delimiter.length()) == delimiter) {
+                 return true;
+             }
+         }
+         return false;
+     };
+
+     while (pos < s.length()) {
+         if (isDelimiter(pos)) {
+             if (pos != 0) {
+                 result.push_back(s.substr(0, pos));
+             }
+             size_t delimiterLength = delimiters.front().length();
+             for (const auto &delimiter : delimiters) {
+                 if (s.substr(pos, delimiter.length()) == delimiter) {
+                     delimiterLength = delimiter.length();
+                     result.push_back(delimiter);
+                     break;
+                 }
+             }
+             pos += delimiterLength;
+             s = s.substr(pos);
+             pos = 0;
+         } else {
+             ++pos;
+         }
+     }
+
+     if (!s.empty()) {
+         result.push_back(s);
+     }
+
+     return result;
+ }

Tensor tokenize(std::string &text, int str_i = 0) {
std::vector<token_id_t> ret;

- auto splited = stringSplit(text, ' ');
- if (text[0] == ' ') splited[0] = " " + splited[0];
- for (auto piece : splited) {
-     // look up table
-     std::string token;
-     for (auto b : UTF8(piece)) token += byte_encoder_[b];
+ if (split_special_tokens_) {
+     const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS);
+     for (auto &piece : word_collection) {
+         // look up table
+         // std::string token;
+         // for (auto b : UTF8(piece)) token += byte_encoder_[b];

-     // using bpe
-     std::vector<token_id_t> tmp;
-     tokenizer->tokenize(token, tmp, /*bos*/ false, /*byte fallback*/ true, "");
+         // using bpe
+         std::vector<token_id_t> tmp;
+         tokenizer->tokenize(piece, tmp, false, true, "");
+         ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+     }
+ } else {
+     auto parts = _splitWithDelimiters(text, special_tokens);
+     // for (auto p : parts) {
+     //     std::cout << "\"" << p << "\"" << std::endl;
+     // }
+     for (auto &p : parts) {
+         if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) {
+             std::string token;
+             for (auto b : UTF8(p)) token += byte_encoder_[b];

-     ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+             std::vector<token_id_t> tmp;
+             tokenizer->tokenize(token, tmp, false, special_tokens, true);
+             ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+         } else {
+             const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS);
+             for (auto &piece : word_collection) {
+                 // look up table
+                 // std::string token;
+                 // for (auto b : UTF8(piece)) token += byte_encoder_[b];

+                 // using bpe
+                 std::vector<token_id_t> tmp;
+                 tokenizer->tokenize(piece, tmp, false, true, "");
+                 assert(tmp.size() != 0);
+                 ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+             }
+         }
+     }
+ }
+ // FIXME if we need bos or not?
ret.insert(ret.begin(), bos_id_);

return Tokenizer::tokens2Input(ret);
}

// FIXME: std::string += std::string has performance issues when the string is large.
std::string _byte_decode_(const std::string &text) {
std::string ret;
auto _ = ORD(text);
@@ -168,11 +241,17 @@
}

public:
+ bool split_special_tokens_ = false;
BPETokenizer *tokenizer;
std::unordered_map<int, std::string> byte_encoder_;
std::unordered_map<std::string, int> byte_decoder_;
std::unordered_map<std::string, unsigned int> bpe_ranks_;
token_id_t eos_id_ = 151645, bos_id_ = 151643;
+ std::vector<std::string> special_tokens = {
+     "<|endoftext|>",
+     "<|im_start|>",
+     "<|im_end|>",
+ };
};

#undef UTF8
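A worked example of the new tokenization flow (the behavior follows directly from the code above; the vocab/merge paths are the demo's defaults). Note that FIXED_PAT_STRS spells out the case-insensitive contraction alternation ('[sS]|'[tT]|...) explicitly and uses \s+(?!\S) where PAT_STR had \s+(?:$|[^\S]) — presumably because the bundled unicode_regex_split engine does not support inline (?i:) flags; that reading is an inference, not stated in the diff:

    #include "models/qwen/tokenization_qwen.hpp"

    QWenTokenizer tokenizer("../vocab/qwen_vocab.mllm", "../vocab/qwen_merges.txt");
    std::string text = "<|im_start|>user\nHello<|im_end|>";
    // With split_special_tokens_ == false (the default), _splitWithDelimiters
    // cuts the prompt on the special-token strings first:
    //   "<|im_start|>", "user\nHello", "<|im_end|>"
    // Each special token is byte-encoded and looked up atomically, while
    // "user\nHello" goes through unicode_regex_split(FIXED_PAT_STRS) and BPE.
    Tensor input = tokenizer.tokenize(text);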
[The remaining six changed files in this commit did not load in this view.]
