From 767778f1aec64cda8630b9b1083789f2a01da53e Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Tue, 12 Mar 2024 05:57:25 +0000 Subject: [PATCH] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- tests/test_tokenizer/test_tokenizer.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index 8563fe49..668bdd68 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -101,7 +101,7 @@ def tokenize(text, elif (tokenizer == 'bpe'): try: tokenizer = HGTokenizer.from_file('bpeTokenizer.json') - except : + except Exception: tokenizer = huggingface_tokenizer.Tokenizer( huggingface_tokenizer.models.BPE()) if (bpe_trainfile is None): diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index d1e01c15..44b4b58a 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -71,7 +71,7 @@ def test_TokenizerSpacy(): some, there are still 4 packs left, 25 each, how many are sold?"] ans = [ 'The', 'stationery', 'store', 'has', '600', 'exercise', - 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', + 'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still', '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold' ] tokenizer = get_tokenizer("pure_text", @@ -84,7 +84,7 @@ def test_TokenizerSpacy(): def test_TokenizerBPE(): items = ['The stationery store has $600$ exercise books, and after selling some,\ there are still $4$ packs left, $25$ each, how many are sold?'] - ans = ['h', '600', ' ', '^', '4', '^', ' ', '25', ' '] + ans = ['h', '600', ' ', '4', ' ', '25', ' '] data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),