diff --git a/test/processor/OPTTokenizerTest.cpp b/test/processor/OPTTokenizerTest.cpp new file mode 100644 index 00000000..b4d3293a --- /dev/null +++ b/test/processor/OPTTokenizerTest.cpp @@ -0,0 +1,37 @@ +// +// Created by 咸的鱼 on 2024/2/23. +// +#include "gtest/gtest.h" +#include "TokenizorTest.hpp" +#include "tokenizers/BPE/Bpe.hpp" +TEST_F(TokenizerTest, OPTTokenizerTest) { + GTEST_SKIP(); + auto bpe = new mllm::BPETokenizer("./vocab_opt.mllm"); + std::unordered_map merge_rank; + auto merge_file = std::ifstream("./merges.txt"); + std::string line; + unsigned rank=0; + while (std::getline(merge_file, line)) { + if (line.empty()) { + continue; + } + if (line[0]=='#'){ + continue; + } + merge_rank[line]=rank; + rank++; + } + bpe->setMergeRank(merge_rank); + std::cout<getVocabSize()< tokens={}; + string text="Hello, world!"; + text = mllm::Tokenizer::replaceString(text,' ',"Ġ"); + bpe->setSpecialToken("",""); + bpe->tokenize(text,tokens,true); + + for (auto token:tokens){ + std::cout<