Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Update tokenizers #158

Merged
merged 21 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
dbe8936
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 10, 2024
8fd96b2
Update tokenization.py
KINGNEWBLUSH Mar 10, 2024
025fa86
modified: AUTHORS.md
KINGNEWBLUSH Mar 11, 2024
aea99a2
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
970c1b9
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
a289a7a
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
ad7df8b
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
9423b31
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
5792e48
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
edc266f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
c526016
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 11, 2024
64c6cda
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
3a53b51
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
569bb9f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
4542258
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
721bc0a
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
1476f8a
modified: tests/test_tokenizer/test_tokenizer.py
KINGNEWBLUSH Mar 12, 2024
f02ccce
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
05172b4
modified: tests/test_tokenizer/test_tokenizer.py
KINGNEWBLUSH Mar 12, 2024
767778f
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
e86a5e6
modified: EduNLP/SIF/tokenization/text/tokenization.py
KINGNEWBLUSH Mar 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 77 additions & 14 deletions EduNLP/SIF/tokenization/text/tokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
# 2021/5/18 @ tongshiwei
import logging
import jieba
from nltk.tokenize import word_tokenize
import nltk
import spacy
import tokenizers as huggingface_tokenizer
from tokenizers.trainers import BpeTrainer
from .stopwords import DEFAULT_STOPWORDS

jieba.setLogLevel(logging.INFO)
Expand All @@ -15,7 +20,13 @@ def is_chinese(word):
return True


def tokenize(text,
             granularity="word",
             stopwords="default",
             tokenizer="jieba",
             tok_model="en_core_web_sm",
             bpe_json='bpe.tokenizer.json',
             bpe_trainfile=None):
    """
    Tokenize item text with a selectable backend tokenizer.

    Parameters
    ----------
    text : str
        The text to tokenize.
    granularity : str
        "word" or "char"; only honoured by the jieba backend.
    stopwords : set, str, or None
        Tokens to drop from the result. "default" selects
        DEFAULT_STOPWORDS; None disables filtering.
    tokenizer : str
        Backend to use: "jieba", "nltk", "spacy" or "bpe".
    tok_model : str
        spaCy pipeline name (spacy backend only).
    bpe_json : str
        Path of a serialized huggingface tokenizer (bpe backend only).
    bpe_trainfile : str or None
        Corpus file used to train a fresh BPE tokenizer when bpe_json
        cannot be loaded (bpe backend only).

    Returns
    -------
    list of str
        The filtered tokens.

    Raises
    ------
    TypeError
        If granularity or tokenizer is not a recognised value.
    OSError
        If the bpe backend has neither a saved model nor a train file.
    """
    stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
    stopwords = stopwords if stopwords is not None else {}

    if tokenizer == 'jieba':
        if granularity == "word":
            return [
                token for token in jieba.cut(text)
                if token not in stopwords and token.strip()
            ]
        elif granularity == "char":
            jieba_tokens = [
                token for token in jieba.cut(text)
                if token not in stopwords and token.strip()
            ]
            # Use jieba_tokens to handle sentences mixing Chinese and
            # English: Chinese words are split into single characters,
            # non-Chinese tokens are kept whole.
            split_tokens = []
            for token in jieba_tokens:
                if is_chinese(token):
                    split_tokens.extend(list(token))
                else:
                    split_tokens.append(token)
            return split_tokens
        else:
            raise TypeError("Unknown granularity %s" % granularity)

    elif tokenizer == 'nltk':
        try:
            return [
                token for token in word_tokenize(text)
                if token not in stopwords and token.strip()
            ]
        except LookupError:
            # nltk raises LookupError (not OSError) when the "punkt"
            # resource is missing; fetch it once and retry.
            nltk.download('punkt')
            return [
                token for token in word_tokenize(text)
                if token not in stopwords and token.strip()
            ]

    elif tokenizer == 'spacy':
        try:
            spacy_tokenizer = spacy.load(tok_model)
        except OSError:
            # Pipeline not installed locally; download it and retry.
            spacy.cli.download(tok_model)
            spacy_tokenizer = spacy.load(tok_model)
        return [
            token.text for token in spacy_tokenizer(text)
            if token.text not in stopwords and token.text.strip()
        ]

    elif tokenizer == 'bpe':
        # NOTE: kept in a local that does not shadow the `tokenizer`
        # parameter (which still names the selected backend).
        try:
            # Tokenizer.from_file is the documented way to restore a
            # serialized tokenizer; Tokenizer instances have no `load`.
            bpe_tokenizer = huggingface_tokenizer.Tokenizer.from_file(bpe_json)
        except Exception:
            # from_file failed (file missing/corrupt) -> train a new one.
            if bpe_trainfile is None:
                raise OSError("bpe model %r could not be loaded and no "
                              "train file was given" % bpe_json)
            bpe_tokenizer = huggingface_tokenizer.Tokenizer(
                huggingface_tokenizer.models.BPE())
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            bpe_tokenizer.train(files=[bpe_trainfile], trainer=trainer)
            bpe_tokenizer.save(bpe_json, pretty=True)
        # encode() returns an Encoding object; the string tokens live
        # in its `.tokens` attribute.
        return [
            token for token in bpe_tokenizer.encode(text).tokens
            if token not in stopwords
        ]
    else:
        raise TypeError("Invalid tokenizer: %s" % tokenizer)
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
'networkx',
'numpy>=1.17.0',
'jieba',
'nltk',
'spacy',
'tokenizers',
'js2py',
'EduData>=0.0.16',
'PyBaize>=0.0.3'
Expand Down
21 changes: 21 additions & 0 deletions tests/test_tokenizer/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,27 @@ def test_CharTokenizer():
assert ret == ans


def test_Tokenizer():
    """Check that the nltk and spacy backends tokenize a mixed
    text/math item identically, dropping the ','/'?' stop words."""
    # Implicit adjacent-literal concatenation keeps the input free of
    # the stray indentation whitespace a backslash continuation inside
    # the string would embed.
    items = [{
        "stem":
        "The stationery store has $600$ exercise books, and after "
        "selling some, there are still $4$ packs left, $25$ each, "
        "how many are sold?",
    }]
    ans = [
        'The', 'stationery', 'store', 'has', '$', '600', '$', 'exercise',
        'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
        '$', '4', '$', 'packs', 'left', '$', '25', '$', 'each', 'how', 'many',
        'are', 'sold'
    ]
    for tok in ['nltk', 'spacy']:
        tokenizer = get_tokenizer("char",
                                  stop_words=set(",?"),
                                  text_params={"tokenizer": tok})
        tokens = tokenizer(items, key=lambda x: x['stem'])
        ret = next(tokens)
        assert ret == ans


def test_SpaceTokenizer():
items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
tokenizer = get_tokenizer("space", stop_words=[])
Expand Down
Loading