-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathmecab_sp.py
executable file
·29 lines (22 loc) · 991 Bytes
/
mecab_sp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from typing import List
from tokenizer.base import BaseTokenizer
from tokenizer.mecab import MeCabTokenizer
from tokenizer.sentencepiece import SentencePieceTokenizer
class MeCabSentencePieceTokenizer(BaseTokenizer):
    """Tokenizer that runs MeCab morphological analysis first, then SentencePiece.

    The pipeline is: text -> MeCab tokens -> space-joined string -> SentencePiece
    pieces.  SentencePiece marks word-initial pieces with "▁"; this project
    additionally uses "▃" to mark positions where MeCab inserted a boundary
    (presumably an original-whitespace marker — confirm against MeCabTokenizer).
    `tokenize` cleans up the interaction between the two markers so that
    `detokenize` can restore spacing.
    """

    def __init__(self, mecab: MeCabTokenizer, sp: SentencePieceTokenizer):
        # Both sub-tokenizers are injected; this class owns neither.
        self.mecab = mecab
        self.sp = sp

    def tokenize(self, text: str) -> List[str]:
        """Tokenize `text` with MeCab, then SentencePiece, merging markers.

        Post-processing rules:
          * a lone "▁" piece immediately followed by a "▃" piece is dropped
            (the "▃" already carries the boundary information);
          * a fused "▁▃" piece is normalized to plain "▃".

        Returns the cleaned list of subword pieces.
        """
        pieces = self.sp.tokenize(" ".join(self.mecab.tokenize(text)))
        output: List[str] = []
        for i, piece in enumerate(pieces):
            # Skip a standalone "▁" that directly precedes a "▃" marker.
            if piece == "▁" and i + 1 < len(pieces) and pieces[i + 1] == "▃":
                continue
            # Collapse the fused marker into the boundary marker alone.
            output.append("▃" if piece == "▁▃" else piece)
        return output

    def detokenize(self, tokens: List[str]) -> str:
        """Reassemble `tokens` into text: strip SentencePiece "▁" markers and
        any literal spaces, then turn each "▃" boundary marker back into a
        single space."""
        joined = "".join(tokens)
        return joined.replace("▁", "").replace(" ", "").replace("▃", " ").strip()