Skip to content

Commit

Permalink
Merge pull request #36 from UbiquitousLearning/develop-lx
Browse files Browse the repository at this point in the history
Develop lx
  • Loading branch information
lx200916 authored Dec 4, 2023
2 parents a29a2ef + 35dd912 commit d070733
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 10 deletions.
4 changes: 2 additions & 2 deletions src/tokenizers/Unigram/trie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ class TrieIterator {
node = next->second;
path.push_back(*iter);
if (node->is_leaf) {
iter++;
++iter;
return path;
}
}
iter++;
++iter;
}
return {};

Expand Down
15 changes: 12 additions & 3 deletions test/preprocess/UnigramTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@ TEST_F(TokenizerTest, test) {
tokenizer->setSpecialToken("|ENDOFTEXT|");
std::string text = "Hello world";
// normalization text
// replace all " " to "_"
std::replace(text.begin(), text.end(), ' ', '_');
// replace all " " to "▁"
std::string text_ = "";
for (auto &ch : text) {
if (ch == ' ') {
text_ += "▁";
}else {
text_ += ch;
}

}
// std::replace(text.begin(), text.end(), ' ', L'▁');
// prepend "_" to text
std::string new_text = "_" + std::string(text);
std::string new_text = "▁" + std::string(text_);

tokenizer->tokenize(new_text, ids, true);
for (auto id : ids) {
Expand Down
10 changes: 5 additions & 5 deletions test/preprocess/fuyu_process_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
text_prompt = "Generate a coco-style caption.\n"
text_prompt = "Hello world"
url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png"
image = Image.open(requests.get(url, stream=True).raw)
with open("bus.tmp", "wb") as f:
f.write(image.tobytes())
inputs = processor(text=text_prompt, images=image, return_tensors="pt")
# image = Image.open(requests.get(url, stream=True).raw)
# with open("bus.tmp", "wb") as f:
# f.write(image.tobytes())
inputs = processor(text=text_prompt, return_tensors="pt")
print(inputs)

0 comments on commit d070733

Please sign in to comment.