diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 60c613d4..db7d5f41 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -44,6 +44,7 @@ macro(func_vlm_add_executable target)
         ${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
         ${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
         ${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
+        ${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
         ${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
         ${DIR_SRC_PROCESSOE}
         ${DIR_THIRDPARTY_AUDIO}
diff --git a/examples/demo_bert.cpp b/examples/demo_bert.cpp
index dd4397bd..b21281df 100644
--- a/examples/demo_bert.cpp
+++ b/examples/demo_bert.cpp
@@ -25,14 +25,12 @@ int main(int argc, char *argv[]) {
     BertTokenizer tokenizer(vocab_path, true);
     string text = "Help me set an alarm at 21:30";
-    auto [token_ids, type_ids, position_ids] = tokenizer.process(text);
-    // token_ids.printData();
-
+    auto inputs = tokenizer.tokenizes(text);
     auto config = BertConfig();
     auto model = BertModel(config);
     model.load(model_path);
-    auto res = model({token_ids, type_ids, position_ids})[0];
+    auto res = model({inputs[0], inputs[1], inputs[2]})[0];
     res.printData();
diff --git a/src/models/bert/modeling_bert.hpp b/src/models/bert/modeling_bert.hpp
index e7d6730f..adb34437 100644
--- a/src/models/bert/modeling_bert.hpp
+++ b/src/models/bert/modeling_bert.hpp
@@ -39,9 +39,7 @@ class BertLayer : public Module {
     BertLayer() = default;
     BertLayer(const BertConfig &config, const string &base_name) {
         // base_name: encoder.layer.n.
-        attention = MultiHeadAttention(config.hidden_size, config.num_attention_heads, config.num_attention_heads,
-                                       config.hidden_size / config.num_attention_heads, SPLIT_NONE, false, false, RoPEType::NONE, -1, -1, 0, false, true, config.names_config,
-                                       base_name + config.names_config._attn_base_name);
+        attention = MultiHeadAttention(config.hidden_size, config.num_attention_heads, config.num_attention_heads, config.hidden_size / config.num_attention_heads, SPLIT_NONE, false, false, RoPEType::NONE, -1, -1, 0, false, true, config.names_config, base_name + config.names_config._attn_base_name);

         feed_forward = FeedForward(config.hidden_size, config.intermediate_size, config.hidden_act, true, config.names_config, base_name);
@@ -55,15 +53,10 @@ class BertLayer : public Module {
     std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
         auto hidden_states = inputs[0];
-
         auto attn_out = attention({hidden_states, hidden_states, hidden_states})[0];
-
         hidden_states = attn_norm({hidden_states + attn_out});
-
         auto ff_out = feed_forward({hidden_states})[0];
-
         hidden_states = ff_norm({hidden_states + ff_out});
-
         return {hidden_states};
     }
diff --git a/src/models/bert/tokenization_bert.hpp b/src/models/bert/tokenization_bert.hpp
index d4d03809..650bf8ce 100644
--- a/src/models/bert/tokenization_bert.hpp
+++ b/src/models/bert/tokenization_bert.hpp
@@ -1,19 +1,14 @@
 #ifndef TOKENIZATION_BERT_HPP
 #define TOKENIZATION_BERT_HPP

-#include "tokenizers/BPE/Bpe.hpp"
 #include "tokenizers/Tokenizer.hpp"
-#include "tokenizers/Unicode.hpp"
 #include "tokenizers/WordPiece/WordPiece.hpp"
-#include
-#include // unicode
-#include
+#include

 using namespace mllm;
-
 class BertTokenizer final : public WordPieceTokenizer {
 public:
     explicit BertTokenizer(const std::string &vocab_file, bool add_special_tokens = true) :
@@ -22,17 +17,12 @@ class BertTokenizer final : public WordPieceTokenizer {
         _add_special_tokens = add_special_tokens;
         this->add_special_tokens({"[PAD]", "[CLS]", "[SEP]", "[MASK]"});
     }
-    std::tuple<Tensor, Tensor, Tensor> process(std::string text){
+    std::vector<Tensor> tokenizes(std::string &text) override {
         if (_add_special_tokens) {
             text = "[CLS] " + text + " [SEP]";
         }
         auto tokens_id = vector<token_id_t>();
         WordPieceTokenizer::tokenize(text, tokens_id, false);
-//        printf("token: ");
-//        for (auto &token_id : tokens_id) {
-//            printf("%d ", token_id);
-//        }
-        printf("\n");
         auto tokens_type = vector<token_id_t>(tokens_id.size(), 0);
         auto position_ids = vector<token_id_t>(tokens_id.size());
         for (size_t i = 0; i < tokens_id.size(); i++) {
@@ -41,8 +31,7 @@ class BertTokenizer final : public WordPieceTokenizer {
         return {
             tokens2Input(tokens_id, "input_tokens"),
             tokens2Input(tokens_type, "input_tokens_type"),
-            tokens2Input(position_ids, "input_position_ids")
-        };
+            tokens2Input(position_ids, "input_position_ids")};
     }

 private:
diff --git a/src/tokenizers/Tokenizer.hpp b/src/tokenizers/Tokenizer.hpp
index 3462a24b..10841f04 100644
--- a/src/tokenizers/Tokenizer.hpp
+++ b/src/tokenizers/Tokenizer.hpp
@@ -108,6 +108,9 @@ class Tokenizer {
         this->tokenize(text, tokens_id, bos_flag);
         return tokens2Input(tokens_id);
     }
+    virtual vector<Tensor> tokenizes(std::string &text) {
+        return {tokenize(text)};
+    }
     virtual std::string detokenize(const std::vector<token_id_t> &tokens);
     virtual std::pair<std::string, unsigned> detokenize(Tensor &result) {
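For context, the patch replaces the BERT-specific process() tuple with a virtual tokenizes() hook on Tokenizer, so the three BERT inputs (token ids, token type ids, position ids) come back as a vector<Tensor>. Below is a minimal usage sketch of that API; the include layout, the file names "vocab.mllm" and "bert.mllm", and the encode() helper are illustrative assumptions, not part of this patch.

// Hedged sketch: drive a tokenizer through the new virtual tokenizes() hook.
// Paths and the encode() helper are hypothetical.
#include "models/bert/configuration_bert.hpp"
#include "models/bert/modeling_bert.hpp"
#include "models/bert/tokenization_bert.hpp"
#include <string>
#include <vector>

using namespace mllm;

// Works for any Tokenizer subclass: BertTokenizer overrides tokenizes() to
// return {input_tokens, input_tokens_type, input_position_ids}, while the
// base class falls back to a single input tensor.
std::vector<Tensor> encode(Tokenizer &tokenizer, std::string text) {
    return tokenizer.tokenizes(text);
}

int main() {
    BertTokenizer tokenizer("vocab.mllm", /*add_special_tokens=*/true); // assumed vocab path
    auto inputs = encode(tokenizer, "Help me set an alarm at 21:30");

    auto model = BertModel(BertConfig());
    model.load("bert.mllm");                                  // assumed weight path
    auto res = model({inputs[0], inputs[1], inputs[2]})[0];   // encoder output tensor
    return 0;
}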