diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 91307121e..098be97b1 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -8,6 +8,7 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
 Modules
 -------
 
+.. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: word_tokenize
diff --git a/pythainlp/corpus/blackboard-cls_v1.0.crfsuite b/pythainlp/corpus/blackboard-cls_v1.0.crfsuite
new file mode 100644
index 000000000..018b29985
Binary files /dev/null and b/pythainlp/corpus/blackboard-cls_v1.0.crfsuite differ
diff --git a/pythainlp/tag/blackboard.py b/pythainlp/tag/blackboard.py
new file mode 100644
index 000000000..4a038d3f0
--- /dev/null
+++ b/pythainlp/tag/blackboard.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+from typing import List, Tuple
+
+# defined strings for special characters
+CHAR_TO_ESCAPE = {" ": "_"}
+ESCAPE_TO_CHAR = dict((v, k) for k, v in CHAR_TO_ESCAPE.items())
+
+
+# map from Blackboard treebank POS tag to Universal POS tag
+# from Wannaphong Phatthiyaphaibun & Korakot Chaovavanich
+TO_UD = {
+    "": "",
+    "AJ": "ADJ",
+    "AV": "ADV",
+    "AX": "AUX",
+    "CC": "CCONJ",
+    "CL": "NOUN",
+    "FX": "NOUN",
+    "IJ": "INTJ",
+    "NG": "PART",
+    "NN": "NOUN",
+    "NU": "NUM",
+    "PA": "PART",
+    "PR": "PROPN",
+    "PS": "ADP",
+    "PU": "PUNCT",
+    "VV": "VERB",
+    "XX": "X",
+}
+
+
+def pre_process(words: List[str]) -> List[str]:
+    """
+    Replace signs and symbols with their defined escape strings.
+    This function is to be used as a preprocessing step,
+    before the actual POS tagging.
+    """
+    keys = CHAR_TO_ESCAPE.keys()
+    words = [CHAR_TO_ESCAPE[word] if word in keys else word for word in words]
+    return words
+
+
+def post_process(
+    word_tags: List[Tuple[str, str]], to_ud: bool = False
+) -> List[Tuple[str, str]]:
+    """
+    Convert escape strings back to their corresponding signs and symbols.
+    This function is to be used as a post-processing step,
+    after the POS tagging.
+    """
+    keys = ESCAPE_TO_CHAR.keys()
+
+    if not to_ud:
+        word_tags = [
+            (ESCAPE_TO_CHAR[word], tag) if word in keys else (word, tag)
+            for word, tag in word_tags
+        ]
+    else:
+        word_tags = [
+            (ESCAPE_TO_CHAR[word], TO_UD[tag])
+            if word in keys
+            else (word, TO_UD[tag])
+            for word, tag in word_tags
+        ]
+    return word_tags
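A minimal sketch of how the escape helpers and the TO_UD map compose; the tagger output in the middle is illustrative, not taken from a real model run::

    from pythainlp.tag import blackboard

    words = ["ผม", " ", "รัก", "คุณ"]
    escaped = blackboard.pre_process(words)
    # the bare space becomes "_", so the tagger never sees whitespace tokens
    # suppose the tagger then returned these Blackboard-treebank tags:
    word_tags = [("ผม", "PR"), ("_", "PU"), ("รัก", "VV"), ("คุณ", "PR")]
    blackboard.post_process(word_tags, to_ud=True)
    # [('ผม', 'PROPN'), (' ', 'PUNCT'), ('รัก', 'VERB'), ('คุณ', 'PROPN')]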
+ """ + keys = ESCAPE_TO_CHAR.keys() + + if not to_ud: + word_tags = [ + (ESCAPE_TO_CHAR[word], tag) if word in keys else (word, tag) + for word, tag in word_tags + ] + else: + word_tags = [ + (ESCAPE_TO_CHAR[word], TO_UD[tag]) + if word in keys + else (word, TO_UD[tag]) + for word, tag in word_tags + ] + return word_tags diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py index 192837533..2b6d3b198 100644 --- a/pythainlp/tag/perceptron.py +++ b/pythainlp/tag/perceptron.py @@ -5,8 +5,8 @@ import os from typing import List, Tuple -from pythainlp.corpus import corpus_path -from pythainlp.tag import PerceptronTagger, orchid +from pythainlp.corpus import corpus_path, get_corpus_path +from pythainlp.tag import PerceptronTagger, blackboard, orchid _ORCHID_FILENAME = "pos_orchid_perceptron.json" _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME) @@ -14,9 +14,11 @@ _PUD_FILENAME = "pos_ud_perceptron-v0.2.json" _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME) +_BLACKBOARD_NAME = "blackboard_pt_tagger" _ORCHID_TAGGER = None _PUD_TAGGER = None +_BLACKBOARD_TAGGER = None def _orchid_tagger(): @@ -33,6 +35,14 @@ def _pud_tagger(): return _PUD_TAGGER +def _blackboard_tagger(): + global _BLACKBOARD_TAGGER + if not _BLACKBOARD_TAGGER: + path = get_corpus_path(_BLACKBOARD_NAME) + _LST20_TAGGER = PerceptronTagger(path=path) + return _LST20_TAGGER + + def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: """ :param list words: a list of tokenized words @@ -52,6 +62,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]: words = orchid.pre_process(words) word_tags = _orchid_tagger().tag(words) word_tags = orchid.post_process(word_tags, to_ud) + elif corpus == "blackboard" or corpus == "blackboard_ud": + words = blackboard.pre_process(words) + word_tags = _blackboard_tagger().tag(words) + word_tags = blackboard.post_process(word_tags, to_ud) else: # default, use "pud" as a corpus tagger = _pud_tagger() word_tags = tagger.tag(words) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index eab1fd487..c8b91b3c5 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -22,6 +22,9 @@ def pos_tag( `_ corpus, \ text from Thai academic articles (default) * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags + * *blackboard* - `blackboard treebank `_ + * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tag \ + from `Universal Dependencies ` * *pud* - `Parallel Universal Dependencies (PUD)\ `_ \ treebanks, natively use Universal POS tags @@ -87,7 +90,7 @@ def pos_tag( if not words: return [] - _support_corpus = ["orchid", "orchid_ud", "pud"] + _support_corpus = ["blackboard", "blackboard_ud", "orchid", "orchid_ud", "pud"] if engine == "perceptron" and corpus in _support_corpus: from pythainlp.tag.perceptron import tag as tag_ @@ -128,6 +131,9 @@ def pos_tag_sents( `_ corpus, \ text from Thai academic articles (default) * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags + * *blackboard* - `blackboard treebank `_ + * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tag \ + from `Universal Dependencies ` * *pud* - `Parallel Universal Dependencies (PUD)\ `_ \ treebanks, natively use Universal POS tags diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py index e05feea8d..33f535f6c 100644 --- a/pythainlp/tag/unigram.py +++ b/pythainlp/tag/unigram.py @@ -7,7 +7,7 @@ from typing import List, Tuple from pythainlp.corpus import corpus_path, 
diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
index e05feea8d..33f535f6c 100644
--- a/pythainlp/tag/unigram.py
+++ b/pythainlp/tag/unigram.py
@@ -7,7 +7,7 @@
 from typing import List, Tuple
 
 from pythainlp.corpus import corpus_path, get_corpus_path
-from pythainlp.tag import orchid
+from pythainlp.tag import blackboard, orchid
 
 _ORCHID_FILENAME = "pos_orchid_unigram.json"
 _ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)
@@ -15,8 +15,11 @@
 _PUD_FILENAME = "pos_ud_unigram-v0.2.json"
 _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
 
+_BLACKBOARD_NAME = "blackboard_unigram_tagger"
+
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
+_BLACKBOARD_TAGGER = None
 
 
 def _orchid_tagger():
@@ -34,6 +37,14 @@ def _pud_tagger():
         _PUD_TAGGER = json.load(fh)
     return _PUD_TAGGER
 
+def _blackboard_tagger():
+    global _BLACKBOARD_TAGGER
+    if not _BLACKBOARD_TAGGER:
+        path = get_corpus_path(_BLACKBOARD_NAME)
+        with open(path, encoding="utf-8-sig") as fh:
+            _BLACKBOARD_TAGGER = json.load(fh)
+    return _BLACKBOARD_TAGGER
+
 def _find_tag(
     words: List[str], dictdata: dict, default_tag: str = ""
@@ -64,6 +75,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         words = orchid.pre_process(words)
         word_tags = _find_tag(words, _orchid_tagger())
         word_tags = orchid.post_process(word_tags, to_ud)
+    elif corpus == "blackboard" or corpus == "blackboard_ud":
+        words = blackboard.pre_process(words)
+        word_tags = _find_tag(words, _blackboard_tagger())
+        word_tags = blackboard.post_process(word_tags, to_ud)
     else:  # default, use "pud" as a corpus
         word_tags = _find_tag(words, _pud_tagger())
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 8a1bdd45a..4ff7420d4 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -7,6 +7,7 @@
     "THAI2FIT_TOKENIZER",
     "Tokenizer",
     "Trie",
+    "clause_tokenize",
     "sent_tokenize",
     "subword_tokenize",
     "word_tokenize",
@@ -27,6 +28,7 @@
 from pythainlp.tokenize.core import (
     Tokenizer,
+    clause_tokenize,
     sent_tokenize,
     subword_tokenize,
     word_tokenize,
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 8ae97fa2a..e27a6a601 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -21,6 +21,28 @@
 from pythainlp.util.trie import Trie, dict_trie
 
 
+def clause_tokenize(doc: List[str]) -> List[List[str]]:
+    """
+    Clause tokenizer (clause segmentation).
+    Tokenizes a list of words into a list of clauses (lists of words),
+    split by a CRF model trained on the Blackboard Treebank.
+
+    :param list doc: a list of words to segment into clauses
+    :return: a list of clauses
+    :rtype: list[list[str]]
+    :Example:
+
+    Clause tokenizer::
+
+        from pythainlp.tokenize import clause_tokenize
+
+        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
+        # [['ฉัน', 'นอน'],
+        # ['และ', 'คุณ', 'เล่น', 'มือถือ'],
+        # ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
+    """
+    from pythainlp.tokenize.crfcls import segment
+
+    return segment(doc)
+
+
 def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
     """
     Word detokenizer.
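clause_tokenize expects an already-tokenized word list, so a typical call chains it after word_tokenize; a short sketch (the clause boundaries shown are illustrative)::

    from pythainlp.tokenize import clause_tokenize, word_tokenize

    words = word_tokenize("ฉันนอนและคุณเล่นมือถือ")
    clauses = clause_tokenize(words)
    # a list of clauses, each a list of words, e.g.
    # [['ฉัน', 'นอน'], ['และ', 'คุณ', 'เล่น', 'มือถือ']]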
diff --git a/pythainlp/tokenize/crfcls.py b/pythainlp/tokenize/crfcls.py
new file mode 100644
index 000000000..5852d72b2
--- /dev/null
+++ b/pythainlp/tokenize/crfcls.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Clause segmenter
+"""
+from typing import List
+
+import pycrfsuite
+from pythainlp.tag import pos_tag
+from pythainlp.corpus import path_pythainlp_corpus
+
+
+def _doc2features(doc, i):
+    # features from current word
+    curr_word = doc[i][0]
+    curr_pos = doc[i][1]
+    features = {
+        "word.curr_word": curr_word,
+        "word.curr_isspace": curr_word.isspace(),
+        "word.curr_isdigit": curr_word.isdigit(),
+        "word.curr_postag": curr_pos,
+    }
+
+    # features from previous word
+    if i > 0:
+        prev_word = doc[i - 1][0]
+        prev_pos = doc[i - 1][1]
+        features["word.prev_word"] = prev_word
+        features["word.prev_isspace"] = prev_word.isspace()
+        features["word.prev_isdigit"] = prev_word.isdigit()
+        features["word.prev_postag"] = prev_pos
+    else:
+        features["BOS"] = True  # Beginning of Sequence
+
+    # features from next word
+    if i < len(doc) - 1:
+        next_word = doc[i + 1][0]
+        next_pos = doc[i + 1][1]
+        features["word.next_word"] = next_word
+        features["word.next_isspace"] = next_word.isspace()
+        features["word.next_isdigit"] = next_word.isdigit()
+        features["word.next_postag"] = next_pos
+    else:
+        features["EOS"] = True  # End of Sequence
+
+    return features
+
+
+def _extract_features(doc):
+    return [_doc2features(doc, i) for i in range(len(doc))]
+
+
+_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
+tagger = pycrfsuite.Tagger()
+tagger.open(path_pythainlp_corpus(_CORPUS_NAME))
+
+
+def segment(doc: List[str]) -> List[List[str]]:
+    # POS-tag the words, then turn (word, tag) pairs into CRF features
+    word_tags = pos_tag(doc, corpus="blackboard")
+    features = _extract_features(word_tags)
+    word_markers = list(zip(doc, tagger.tag(features)))
+
+    clauses = []
+    temp = []
+    last_index = len(doc) - 1
+    for i, word_marker in enumerate(word_markers):
+        word, marker = word_marker
+        # close the clause at an end-of-clause marker or at the last word
+        if marker == "E_CLS" or i == last_index:
+            temp.append(word)
+            clauses.append(temp)
+            temp = []
+        else:
+            temp.append(word)
+
+    return clauses
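The .crfsuite model above ships prebuilt. As a rough sketch only, a clause-marker model like this could be trained with python-crfsuite along these lines (training_data, the parameter values, and any labels besides E_CLS are assumptions, not part of this changeset)::

    import pycrfsuite

    trainer = pycrfsuite.Trainer(verbose=False)
    # each sequence pairs per-token feature dicts (as produced by
    # _extract_features) with one clause marker per token, e.g. "E_CLS"
    # on clause-final tokens; training_data is a hypothetical iterable
    for xseq, yseq in training_data:
        trainer.append(xseq, yseq)
    trainer.set_params({"c1": 1.0, "c2": 1e-3, "max_iterations": 50})
    trainer.train("blackboard-cls_v1.0.crfsuite")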
engine="perceptron", corpus="orchid") ) @@ -83,6 +92,12 @@ def test_pos_tag(self): self.assertIsNotNone( pos_tag(tokens, engine="perceptron", corpus="pud") ) + self.assertIsNotNone( + pos_tag(tokens, engine="perceptron", corpus="blackboard") + ) + self.assertIsNotNone( + pos_tag(tokens, engine="perceptron", corpus="blackboard_ud") + ) self.assertIsNotNone( pos_tag(tokens, engine="tltk") ) @@ -96,6 +111,10 @@ def test_pos_tag(self): [("แมว", "NCMN"), ("วิ่ง", "VACT")], ], ) + with self.assertRaises(ValueError): + self.assertIsNotNone( + tltk.pos_tag(tokens, corpus="blackboard") + ) # ### pythainlp.tag.PerceptronTagger diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index d9edfb427..fd65445de 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -23,6 +23,7 @@ oskut, word_detokenize, ) +from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize from pythainlp.util import dict_trie @@ -204,6 +205,10 @@ def test_Tokenizer(self): with self.assertRaises(NotImplementedError): Tokenizer(engine="catcut") + def test_clause_tokenize(self): + self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"])) + self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list) + def test_sent_tokenize(self): self.assertEqual(sent_tokenize(None), []) self.assertEqual(sent_tokenize(""), [])