Merge pull request #158 from KINGNEWBLUSH/dev
[FEATURE] Update tokenizers
KenelmQLH authored Mar 14, 2024
2 parents 855e250 + e86a5e6 commit 7abc7d1
Showing 4 changed files with 126 additions and 14 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
@@ -24,4 +24,5 @@

[Heng Yu](https://github.com/GNEHUY)

[Tianyun Ji](https://github.com/KINGNEWBLUSH)
The starred contributors are the corresponding authors.
92 changes: 78 additions & 14 deletions EduNLP/SIF/tokenization/text/tokenization.py
@@ -2,7 +2,14 @@
# 2021/5/18 @ tongshiwei
import logging
import jieba
from nltk.tokenize import word_tokenize
import nltk
import spacy
import tokenizers as huggingface_tokenizer
from tokenizers.trainers import BpeTrainer
from .stopwords import DEFAULT_STOPWORDS
from tokenizers import Tokenizer as HGTokenizer


jieba.setLogLevel(logging.INFO)

@@ -15,7 +22,13 @@ def is_chinese(word):
return True


def tokenize(text, granularity="word", stopwords="default"):
def tokenize(text,
granularity="word",
stopwords="default",
tokenizer="jieba",
tok_model="en_core_web_sm",
bpe_json='bpe.tokenizer.json',
bpe_trainfile=None):
"""
    Tokenize an item by word or char, using the jieba, nltk, spacy, or bpe backend.
@@ -37,17 +50,68 @@ def tokenize(text, granularity="word", stopwords="default"):
"""
stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
stopwords = stopwords if stopwords is not None else {}
if granularity == "word":
return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
elif granularity == "char":
jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
        # Use jieba_tokens to handle sentences that mix Chinese and English.
split_tokens = []
for token in jieba_tokens:
if is_chinese(token):
split_tokens.extend(list(token))
else:
split_tokens.append(token)
return split_tokens

if (tokenizer == 'jieba'):
if granularity == "word":
return [
token for token in jieba.cut(text)
if token not in stopwords and token.strip()
]
elif granularity == "char":
jieba_tokens = [
token for token in jieba.cut(text)
if token not in stopwords and token.strip()
]
            # Use jieba_tokens to handle sentences that mix Chinese and English.
split_tokens = []
for token in jieba_tokens:
if is_chinese(token):
split_tokens.extend(list(token))
else:
split_tokens.append(token)
return split_tokens
else:
raise TypeError("Unknown granularity %s" % granularity)

elif (tokenizer == 'nltk'):
try:
return [
token for token in word_tokenize(text)
if token not in stopwords and token.strip()
]
except LookupError:
nltk.download('punkt')
return [
token for token in word_tokenize(text)
if token not in stopwords and token.strip()
]

elif (tokenizer == 'spacy'):
try:
spacy_tokenizer = spacy.load(tok_model)
except OSError:
spacy.cli.download(tok_model)
spacy_tokenizer = spacy.load(tok_model)
output = spacy_tokenizer(str(text))
return [
token.text for token in output
if token.text not in stopwords
]

elif (tokenizer == 'bpe'):
try:
tokenizer = HGTokenizer.from_file(bpe_json)
except Exception:
tokenizer = huggingface_tokenizer.Tokenizer(
huggingface_tokenizer.models.BPE())
if (bpe_trainfile is None):
                raise LookupError("bpe train file not found: %s" % bpe_trainfile)
trainer = BpeTrainer(
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=[bpe_trainfile], trainer=trainer)
tokenizer.save(bpe_json, pretty=True)
output = tokenizer.encode(text)
output = output.tokens
return output[0]
else:
raise TypeError("Unknown granularity %s" % granularity)
raise TypeError("Invalid Spliter: %s" % tokenizer)
3 changes: 3 additions & 0 deletions setup.py
@@ -61,6 +61,9 @@
'networkx',
'numpy>=1.17.0',
'jieba',
'nltk',
'spacy',
'tokenizers',
'js2py',
'EduData>=0.0.16',
'PyBaize>=0.0.3'
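The new nltk, spacy, and tokenizers requirements rely on model/data files fetched at runtime; an optional pre-fetch sketch (tokenize() above also downloads these automatically when they are missing):

import nltk
import spacy

nltk.download("punkt")                # tokenizer data used by nltk.tokenize.word_tokenize
spacy.cli.download("en_core_web_sm")  # default model referenced by the tok_model parameter
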
44 changes: 44 additions & 0 deletions tests/test_tokenizer/test_tokenizer.py
@@ -4,6 +4,7 @@
import pytest
from EduNLP.Tokenizer import get_tokenizer
from EduNLP.Pretrain import DisenQTokenizer
from EduNLP.utils import abs_current_dir, path_append


def test_tokenizer():
@@ -50,6 +51,49 @@ def test_CharTokenizer():
assert ret == ans


def test_TokenizerNLTK():
items = ["The stationery store has 600 exercise books, and after selling\
some, there are still 4 packs left, 25 each, how many are sold?"]
ans = [
'The', 'stationery', 'store', 'has', '600', 'exercise',
'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
'4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
]
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_TokenizerSpacy():
items = ["The stationery store has 600 exercise books, and after selling\
some, there are still 4 packs left, 25 each, how many are sold?"]
ans = [
'The', 'stationery', 'store', 'has', '600', 'exercise',
'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
'4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
]
tokenizer = get_tokenizer("pure_text",
text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_TokenizerBPE():
items = ['The stationery store has $600$ exercise books, and after selling some,\
there are still $4$ packs left, $25$ each, how many are sold?']
ans = ['h', '600', ' ', '4', ' ', '25', ' ']
data_path = path_append(abs_current_dir(__file__),
"../../static/test_data/standard_luna_data.json", to_str=True)
tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),
"bpe_trainfile": data_path})
tokens = tokenizer(items)
ret = next(tokens)
assert ret == ans


def test_SpaceTokenizer():
items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
tokenizer = get_tokenizer("space", stop_words=[])
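An end-to-end sketch mirroring the new tests (the sample sentence is hypothetical; the backend is selected through text_params exactly as in test_TokenizerNLTK above):

from EduNLP.Tokenizer import get_tokenizer

items = ["The stationery store has 600 exercise books, how many are sold?"]
tokenizer = get_tokenizer("pure_text",
                          text_params={"tokenizer": "nltk", "stopwords": set(",?")})
print(next(tokenizer(items)))  # word tokens with "," and "?" filtered out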
