feat: Add OPT Tokenizer. #66

Merged
merged 3 commits into from
Mar 12, 2024
37 changes: 37 additions & 0 deletions test/processor/OPTTokenizerTest.cpp
@@ -0,0 +1,37 @@
//
// Created by 咸的鱼 on 2024/2/23.
//
#include "gtest/gtest.h"
#include "TokenizorTest.hpp"
#include "tokenizers/BPE/Bpe.hpp"
TEST_F(TokenizerTest, OPTTokenizerTest) {
GTEST_SKIP();
auto bpe = new mllm::BPETokenizer("./vocab_opt.mllm");
std::unordered_map<string,unsigned> merge_rank;
auto merge_file = std::ifstream("./merges.txt");
std::string line;
unsigned rank=0;
while (std::getline(merge_file, line)) {
if (line.empty()) {
continue;
}
if (line[0]=='#'){
continue;
}
merge_rank[line]=rank;
rank++;
}
bpe->setMergeRank(merge_rank);
std::cout<<bpe->getVocabSize()<<std::endl;
vector<mllm::token_id_t> tokens={};
string text="Hello, world!";
text = mllm::Tokenizer::replaceString(text,' ',"Ġ");
bpe->setSpecialToken("</s>","");
bpe->tokenize(text,tokens,true);

for (auto token:tokens){
std::cout<<token<< " ";
}


}
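For readers unfamiliar with the merge-rank table the test builds, the sketch below shows the same scheme in Python — standard GPT-2/OPT BPE semantics rather than mllm's implementation, with load_merge_ranks and bpe_encode as illustrative names. Each line of merges.txt is a space-separated token pair, and its line order is its priority.

def load_merge_ranks(path="merges.txt"):
    """Map each merge rule ("to ken") to its priority; earlier line = lower rank."""
    ranks = {}
    rank = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line or line.startswith("#"):  # skip blanks and the "#version" header
                continue
            ranks[line] = rank
            rank += 1
    return ranks

def bpe_encode(word, ranks):
    """Greedily apply the lowest-rank merge until no known pair remains."""
    parts = list(word)
    while len(parts) > 1:
        pairs = [f"{a} {b}" for a, b in zip(parts, parts[1:])]
        best = min(pairs, key=lambda p: ranks.get(p, float("inf")))
        if best not in ranks:
            break  # no applicable merge rule left
        a, b = best.split(" ")
        merged, i = [], 0
        while i < len(parts):
            if i < len(parts) - 1 and parts[i] == a and parts[i + 1] == b:
                merged.append(a + b)  # apply the merge
                i += 2
            else:
                merged.append(parts[i])
                i += 1
        parts = merged
    return parts

# Usage, mirroring the test: spaces become "Ġ" before encoding.
# print(bpe_encode("Ġworld", load_merge_ranks()))

This is also why the test replaces ' ' with "Ġ" first: OPT reuses the GPT-2 byte-level vocabulary, where the leading-space marker is part of the token text.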
8 changes: 7 additions & 1 deletion tools/convertor/vocab.py
@@ -2,6 +2,7 @@
import struct
from typing import Iterable, Tuple
import argparse
import os

MAGIC_NUM = 23333
parser = argparse.ArgumentParser()
@@ -89,7 +90,7 @@ def write_unigram(vocab_file, tokenizer_config):
args = parser.parse_args()
output_file = args.output_file
input_file = args.input_file

added_vocab = []
with open(output_file, "wb+") as vocab_file:
    vocab_file.write(struct.pack("<i", MAGIC_NUM))

@@ -98,6 +99,11 @@ def write_unigram(vocab_file, tokenizer_config):

        sentencepiece_tokenizer = SentencePieceProcessor(str(input_file))
        write_vocab(vocab_file, sentencepiece_tokenizer)
    elif args.type == "BPE" and os.path.basename(input_file) == "vocab.json":
        # A bare vocab.json has no surrounding tokenizer config, so synthesize
        # a minimal dict in the shape write_unigram already consumes.
        vocabs = json.load(open(input_file, "r"))
        config = {"vocab": vocabs, "type": "BPE"}
        write_unigram(vocab_file, config)

    else:
        tokenizer_config = json.load(open(input_file, "r"))