Skip to content

Commit

Permalink
Merge pull request #172 from yirongjie/main
Browse files Browse the repository at this point in the history
fix: BertTokenizer::tokenizes
  • Loading branch information
yirongjie authored Oct 30, 2024
2 parents a88c22f + 65f58c3 commit bd95142
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 26 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ macro(func_vlm_add_executable target)
${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
${DIR_SRC_PROCESSOE}
${DIR_THIRDPARTY_AUDIO}
Expand Down
6 changes: 2 additions & 4 deletions examples/demo_bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,12 @@ int main(int argc, char *argv[]) {

BertTokenizer tokenizer(vocab_path, true);
string text = "Help me set an alarm at 21:30";
auto [token_ids, type_ids, position_ids] = tokenizer.process(text);
// token_ids.printData<float>();

auto inputs = tokenizer.tokenizes(text);
auto config = BertConfig();
auto model = BertModel(config);
model.load(model_path);

auto res = model({token_ids, type_ids, position_ids})[0];
auto res = model({inputs[0], inputs[1], inputs[2]})[0];

res.printData<float>();

Expand Down
9 changes: 1 addition & 8 deletions src/models/bert/modeling_bert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@ class BertLayer : public Module {
BertLayer() = default;
BertLayer(const BertConfig &config, const string &base_name) {
// base_name: encoder.layer.n.
attention = MultiHeadAttention(config.hidden_size, config.num_attention_heads, config.num_attention_heads,
config.hidden_size / config.num_attention_heads, SPLIT_NONE, false, false, RoPEType::NONE, -1, -1, 0, false, true, config.names_config,
base_name + config.names_config._attn_base_name);
attention = MultiHeadAttention(config.hidden_size, config.num_attention_heads, config.num_attention_heads, config.hidden_size / config.num_attention_heads, SPLIT_NONE, false, false, RoPEType::NONE, -1, -1, 0, false, true, config.names_config, base_name + config.names_config._attn_base_name);

feed_forward = FeedForward(config.hidden_size, config.intermediate_size,
config.hidden_act, true, config.names_config, base_name);
Expand All @@ -55,15 +53,10 @@ class BertLayer : public Module {

// Forward pass of one BERT encoder layer:
// self-attention -> residual add + norm -> feed-forward -> residual add + norm.
// inputs[0] holds the hidden-state tensor; `args` is not used by this layer.
std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto hidden_states = inputs[0];

// Self-attention: query, key, and value all come from the same hidden states.
auto attn_out = attention({hidden_states, hidden_states, hidden_states})[0];

// Residual connection, then the attention-side LayerNorm (norm applied after the add).
hidden_states = attn_norm({hidden_states + attn_out});

auto ff_out = feed_forward({hidden_states})[0];

// Residual connection, then the feed-forward-side LayerNorm.
hidden_states = ff_norm({hidden_states + ff_out});

return {hidden_states};
}

Expand Down
17 changes: 3 additions & 14 deletions src/models/bert/tokenization_bert.hpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
#ifndef TOKENIZATION_BERT_HPP
#define TOKENIZATION_BERT_HPP

#include "tokenizers/BPE/Bpe.hpp"
#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unicode.hpp"
#include "tokenizers/WordPiece/WordPiece.hpp"
#include <algorithm>
#include <unordered_map>

// unicode
#include <codecvt>
#include <vector>

using namespace mllm;


class BertTokenizer final : public WordPieceTokenizer {
public:
explicit BertTokenizer(const std::string &vocab_file, bool add_special_tokens = true) :
Expand All @@ -22,17 +17,12 @@ class BertTokenizer final : public WordPieceTokenizer {
_add_special_tokens = add_special_tokens;
this->add_special_tokens({"[PAD]", "[CLS]", "[SEP]", "[MASK]"});
}
std::tuple<Tensor, Tensor, Tensor> process(std::string text){
std::vector<Tensor> tokenizes(std::string &text) override {
if (_add_special_tokens) {
text = "[CLS] " + text + " [SEP]";
}
auto tokens_id = vector<token_id_t>();
WordPieceTokenizer::tokenize(text, tokens_id, false);
// printf("token: ");
// for (auto &token_id : tokens_id) {
// printf("%d ", token_id);
// }
printf("\n");
auto tokens_type = vector<token_id_t>(tokens_id.size(), 0);
auto position_ids = vector<token_id_t>(tokens_id.size());
for (size_t i = 0; i < tokens_id.size(); i++) {
Expand All @@ -41,8 +31,7 @@ class BertTokenizer final : public WordPieceTokenizer {
return {
tokens2Input(tokens_id, "input_tokens"),
tokens2Input(tokens_type, "input_tokens_type"),
tokens2Input(position_ids, "input_position_ids")
};
tokens2Input(position_ids, "input_position_ids")};
}

private:
Expand Down
3 changes: 3 additions & 0 deletions src/tokenizers/Tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ class Tokenizer {
this->tokenize(text, tokens_id, bos_flag);
return tokens2Input(tokens_id);
}
// Tokenizes `text` into one or more model-input tensors.
// Default implementation wraps the single tensor produced by tokenize(text)
// in a vector; subclasses may override to emit additional tensors (e.g.
// BertTokenizer also returns token-type and position-id tensors).
// NOTE(review): `text` is taken by non-const reference — presumably so
// overrides can prepend/append special tokens in place; confirm callers
// expect their string to be mutated.
virtual vector<Tensor> tokenizes(std::string &text) {
return {tokenize(text)};
}
virtual std::string detokenize(const std::vector<token_id_t> &tokens);

virtual std::pair<std::string, unsigned> detokenize(Tensor &result) {
Expand Down

0 comments on commit bd95142

Please sign in to comment.