From ad7df8b2edc9c44b7e8a5a00ab74882d94cd5519 Mon Sep 17 00:00:00 2001 From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:56:27 +0000 Subject: [PATCH] modified: EduNLP/SIF/tokenization/text/tokenization.py modified: tests/test_tokenizer/test_tokenizer.py --- EduNLP/SIF/tokenization/text/tokenization.py | 2 +- tests/test_tokenizer/test_tokenizer.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py index c1941720..954145ac 100644 --- a/EduNLP/SIF/tokenization/text/tokenization.py +++ b/EduNLP/SIF/tokenization/text/tokenization.py @@ -87,7 +87,7 @@ def tokenize(text, elif (tokenizer == 'spacy'): try: spacy_tokenizer = spacy.load(tok_model) - except LookupError: + except OSError: spacy.cli.download(tok_model) spacy_tokenizer = spacy.load(tok_model) diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py index b5a254cd..fd450a1c 100644 --- a/tests/test_tokenizer/test_tokenizer.py +++ b/tests/test_tokenizer/test_tokenizer.py @@ -78,10 +78,8 @@ def test_TokenizerBPE(): ', ', '$', '25', '$ ', 'e', 'a', 'c', 'h', ', ', 'h', 'ow', ' m', 'an', 'y', ' ', 'are', ' ', 's', 'o', 'l', 'd'] ] - tokenizer = get_tokenizer("pure_text", - text_params={"tokenizer": 'bpe', - "bpe_trainfile": "../../static/test_data/standard_luna_data.json", - "stopwords": set(",?")}) + tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"), + "bpe_trainfile": "../../../../static/test_data/standard_luna_data.json"}) tokens = tokenizer(items) ret = next(tokens) assert ret == ans