Merge pull request #732 from PyThaiNLP/add-blackboard-cls

Add blackboard cls

wannaphong authored Oct 18, 2022
2 parents 3a65056 + 16406a4 commit 904439b
Showing 11 changed files with 227 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/api/tokenize.rst
@@ -8,6 +8,7 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
Modules
-------

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
Binary file added pythainlp/corpus/blackboard-cls_v1.0.crfsuite
Binary file not shown.
65 changes: 65 additions & 0 deletions pythainlp/tag/blackboard.py
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple

# defined strings for special characters
CHAR_TO_ESCAPE = {" ": "_"}
ESCAPE_TO_CHAR = dict((v, k) for k, v in CHAR_TO_ESCAPE.items())


# map from Blackboard treebank POS tag to Universal POS tag
# from Wannaphong Phatthiyaphaibun & Korakot Chaovavanich
TO_UD = {
    "": "",
    "AJ": "ADJ",
    "AV": "ADV",
    "AX": "AUX",
    "CC": "CCONJ",
    "CL": "NOUN",
    "FX": "NOUN",
    "IJ": "INTJ",
    "NG": "PART",
    "NN": "NOUN",
    "NU": "NUM",
    "PA": "PART",
    "PR": "PROPN",
    "PS": "ADP",
    "PU": "PUNCT",
    "VV": "VERB",
    "XX": "X",
}


def pre_process(words: List[str]) -> List[str]:
    """
    Replace signs and symbols with their defined escape strings.
    This function is to be used as a preprocessing step,
    before the actual POS tagging.
    """
    keys = CHAR_TO_ESCAPE.keys()
    words = [CHAR_TO_ESCAPE[word] if word in keys else word for word in words]
    return words


def post_process(
    word_tags: List[Tuple[str, str]], to_ud: bool = False
) -> List[Tuple[str, str]]:
    """
    Convert defined escape strings back to their corresponding signs and symbols.
    This function is to be used as a post-processing step,
    after the POS tagging.
    """
    keys = ESCAPE_TO_CHAR.keys()

    if not to_ud:
        word_tags = [
            (ESCAPE_TO_CHAR[word], tag) if word in keys else (word, tag)
            for word, tag in word_tags
        ]
    else:
        word_tags = [
            (ESCAPE_TO_CHAR[word], TO_UD[tag])
            if word in keys
            else (word, TO_UD[tag])
            for word, tag in word_tags
        ]
    return word_tags
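
Taken together, the two helpers round-trip the escape map; a minimal sketch (the tag pairs fed to post_process are illustrative inputs, not output from a real tagger):

    from pythainlp.tag import blackboard

    # A space token is escaped to "_" before tagging.
    blackboard.pre_process(["ผม", " ", "กิน"])
    # ['ผม', '_', 'กิน']

    # It is restored afterwards; to_ud=True also maps tags through TO_UD.
    blackboard.post_process([("ผม", "PR"), ("_", "PU"), ("กิน", "VV")], to_ud=True)
    # [('ผม', 'PROPN'), (' ', 'PUNCT'), ('กิน', 'VERB')]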
18 changes: 16 additions & 2 deletions pythainlp/tag/perceptron.py
@@ -5,18 +5,20 @@
import os
from typing import List, Tuple

-from pythainlp.corpus import corpus_path
-from pythainlp.tag import PerceptronTagger, orchid
+from pythainlp.corpus import corpus_path, get_corpus_path
+from pythainlp.tag import PerceptronTagger, blackboard, orchid

_ORCHID_FILENAME = "pos_orchid_perceptron.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_BLACKBOARD_NAME = "blackboard_pt_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None


def _orchid_tagger():
@@ -33,6 +35,14 @@ def _pud_tagger():
    return _PUD_TAGGER


def _blackboard_tagger():
    global _BLACKBOARD_TAGGER
    if not _BLACKBOARD_TAGGER:
        # load and cache the model so it is only read once
        path = get_corpus_path(_BLACKBOARD_NAME)
        _BLACKBOARD_TAGGER = PerceptronTagger(path=path)
    return _BLACKBOARD_TAGGER


def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
    """
    :param list words: a list of tokenized words
@@ -52,6 +62,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
        words = orchid.pre_process(words)
        word_tags = _orchid_tagger().tag(words)
        word_tags = orchid.post_process(word_tags, to_ud)
    elif corpus == "blackboard" or corpus == "blackboard_ud":
        words = blackboard.pre_process(words)
        word_tags = _blackboard_tagger().tag(words)
        word_tags = blackboard.post_process(word_tags, to_ud)
    else:  # default, use "pud" as a corpus
        tagger = _pud_tagger()
        word_tags = tagger.tag(words)
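A quick sketch of exercising the new corpus options through this backend directly (the module path and call shape follow the diff; actual tags depend on the downloaded model):

    from pythainlp.tag.perceptron import tag

    tag(["ฉัน", "นอน"], corpus="blackboard")     # native Blackboard tagset
    tag(["ฉัน", "นอน"], corpus="blackboard_ud")  # same tokens, mapped to Universal POS
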
8 changes: 7 additions & 1 deletion pythainlp/tag/pos_tag.py
@@ -22,6 +22,9 @@ def pos_tag(
<https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
text from Thai academic articles (default)
* *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
* *blackboard* - `Blackboard Treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
* *blackboard_ud* - Blackboard text, with tags mapped to Universal POS tags \
  from `Universal Dependencies <https://universaldependencies.org/>`_
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
@@ -87,7 +90,7 @@ def pos_tag(
    if not words:
        return []

-    _support_corpus = ["orchid", "orchid_ud", "pud"]
+    _support_corpus = ["blackboard", "blackboard_ud", "orchid", "orchid_ud", "pud"]

    if engine == "perceptron" and corpus in _support_corpus:
        from pythainlp.tag.perceptron import tag as tag_
@@ -128,6 +131,9 @@ def pos_tag_sents(
<https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
text from Thai academic articles (default)
* *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
* *blackboard* - `Blackboard Treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
* *blackboard_ud* - Blackboard text, with tags mapped to Universal POS tags \
  from `Universal Dependencies <https://universaldependencies.org/>`_
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
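With the corpus whitelist extended, the user-facing call is, for example (a usage sketch; the exact tags returned depend on the model files):

    from pythainlp.tag import pos_tag

    words = ["ฉัน", "นอน"]
    pos_tag(words, engine="perceptron", corpus="blackboard")   # Blackboard tagset
    pos_tag(words, engine="unigram", corpus="blackboard_ud")   # mapped to Universal POS
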
17 changes: 16 additions & 1 deletion pythainlp/tag/unigram.py
@@ -7,16 +7,19 @@
from typing import List, Tuple

from pythainlp.corpus import corpus_path, get_corpus_path
-from pythainlp.tag import orchid
+from pythainlp.tag import blackboard, orchid

_ORCHID_FILENAME = "pos_orchid_unigram.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

_PUD_FILENAME = "pos_ud_unigram-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_BLACKBOARD_NAME = "blackboard_unigram_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None


def _orchid_tagger():
@@ -34,6 +37,14 @@ def _pud_tagger():
        _PUD_TAGGER = json.load(fh)
    return _PUD_TAGGER

def _blackboard_tagger():
    global _BLACKBOARD_TAGGER
    if not _BLACKBOARD_TAGGER:
        path = get_corpus_path(_BLACKBOARD_NAME)
        with open(path, encoding="utf-8-sig") as fh:
            _BLACKBOARD_TAGGER = json.load(fh)
    return _BLACKBOARD_TAGGER


def _find_tag(
    words: List[str], dictdata: dict, default_tag: str = ""
@@ -64,6 +75,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
        words = orchid.pre_process(words)
        word_tags = _find_tag(words, _orchid_tagger())
        word_tags = orchid.post_process(word_tags, to_ud)
    elif corpus == "blackboard" or corpus == "blackboard_ud":
        words = blackboard.pre_process(words)
        word_tags = _find_tag(words, _blackboard_tagger())
        word_tags = blackboard.post_process(word_tags, to_ud)
    else:  # default, use "pud" as a corpus
        word_tags = _find_tag(words, _pud_tagger())
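The unigram tagger is a plain word-to-most-frequent-tag dictionary loaded from JSON; a minimal sketch of the lookup that _find_tag performs, assuming it falls back to default_tag for unseen words (the dictionary entries here are invented):

    tagger = {"ฉัน": "PR", "นอน": "VV"}  # shape of the loaded JSON
    [(w, tagger.get(w, "")) for w in ["ฉัน", "นอน", "กขค"]]
    # [('ฉัน', 'PR'), ('นอน', 'VV'), ('กขค', '')]
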
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -7,6 +7,7 @@
"THAI2FIT_TOKENIZER",
"Tokenizer",
"Trie",
"clause_tokenize",
"sent_tokenize",
"subword_tokenize",
"word_tokenize",
@@ -27,6 +28,7 @@

from pythainlp.tokenize.core import (
    Tokenizer,
    clause_tokenize,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
22 changes: 22 additions & 0 deletions pythainlp/tokenize/core.py
@@ -21,6 +21,28 @@
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Clause tokenizer (clause segmentation).
    Tokenizes a list of words into a list of clauses,
    split by a CRF model trained on the Blackboard Treebank.

    :param list doc: a list of words to segment into clauses
    :return: a list of clauses
    :rtype: list[list[str]]
    :Example:

    Clause tokenizer::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
        # [['ฉัน', 'นอน'],
        #  ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        #  ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
    """
    from pythainlp.tokenize.crfcls import segment

    return segment(doc)


def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
    """
    Word detokenizer.
74 changes: 74 additions & 0 deletions pythainlp/tokenize/crfcls.py
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
Clause segmenter
"""
from typing import List

import pycrfsuite
from pythainlp.tag import pos_tag
from pythainlp.corpus import path_pythainlp_corpus


def _doc2features(doc, i):
    # features from current word
    curr_word = doc[i][0]
    curr_pos = doc[i][1]
    features = {
        "word.curr_word": curr_word,
        "word.curr_isspace": curr_word.isspace(),
        "word.curr_isdigit": curr_word.isdigit(),
        "word.curr_postag": curr_pos,
    }

    # features from previous word
    if i > 0:
        prev_word = doc[i - 1][0]
        prev_pos = doc[i - 1][1]
        features["word.prev_word"] = prev_word
        features["word.prev_isspace"] = prev_word.isspace()
        features["word.prev_isdigit"] = prev_word.isdigit()
        features["word.prev_postag"] = prev_pos
    else:
        features["BOS"] = True  # Beginning of Sequence

    # features from next word
    if i < len(doc) - 1:
        next_word = doc[i + 1][0]
        next_pos = doc[i + 1][1]
        features["word.next_word"] = next_word
        features["word.next_isspace"] = next_word.isspace()
        features["word.next_isdigit"] = next_word.isdigit()
        features["word.next_postag"] = next_pos
    else:
        features["EOS"] = True  # End of Sequence

    return features


def _extract_features(doc):
    return [_doc2features(doc, i) for i in range(len(doc))]


_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
    word_tags = pos_tag(doc, corpus="blackboard")
    features = _extract_features(word_tags)
    word_markers = list(zip(doc, tagger.tag(features)))

    clauses = []
    temp = []
    len_doc = len(doc) - 1
    for i, word_marker in enumerate(word_markers):
        word, marker = word_marker
        # close the current clause at an E_CLS (end-of-clause) marker
        # or at the last word of the document
        if marker == "E_CLS" or i == len_doc:
            temp.append(word)
            clauses.append(temp)
            temp = []
        else:
            temp.append(word)

    return clauses
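
For a concrete view of what the CRF sees, the feature dict for the middle token of a three-token input looks like this (the POS tags are invented for the example):

    doc = [("ฉัน", "PR"), ("นอน", "VV"), ("แล้ว", "AV")]
    _doc2features(doc, 1)
    # {'word.curr_word': 'นอน', 'word.curr_isspace': False,
    #  'word.curr_isdigit': False, 'word.curr_postag': 'VV',
    #  'word.prev_word': 'ฉัน', 'word.prev_isspace': False,
    #  'word.prev_isdigit': False, 'word.prev_postag': 'PR',
    #  'word.next_word': 'แล้ว', 'word.next_isspace': False,
    #  'word.next_isdigit': False, 'word.next_postag': 'AV'}

Note that tagger.open loads the CRF model at module import time; since clause_tokenize imports this module lazily, that cost is paid on first use.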
19 changes: 19 additions & 0 deletions tests/test_tag.py
@@ -48,6 +48,8 @@ def test_pos_tag(self):
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])
        self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
        self.assertEqual(unigram.tag([], corpus="blackboard"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="unigram", corpus="orchid")
        )
@@ -56,6 +58,11 @@
        )
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="blackboard"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="blackboard"))
        self.assertIsNotNone(
            pos_tag([""], engine="unigram", corpus="blackboard_ud")
        )
        self.assertEqual(
            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -74,6 +81,8 @@ def test_pos_tag(self):
        self.assertEqual(perceptron.tag([], corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
        self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid")
        )
@@ -83,6 +92,12 @@
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="pud")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="blackboard")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="tltk")
        )
@@ -96,6 +111,10 @@ def test_pos_tag(self):
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
)
with self.assertRaises(ValueError):
self.assertIsNotNone(
tltk.pos_tag(tokens, corpus="blackboard")
)

        # ### pythainlp.tag.PerceptronTagger

5 changes: 5 additions & 0 deletions tests/test_tokenize.py
@@ -23,6 +23,7 @@
    oskut,
    word_detokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie


@@ -204,6 +205,10 @@ def test_Tokenizer(self):
        with self.assertRaises(NotImplementedError):
            Tokenizer(engine="catcut")

    def test_clause_tokenize(self):
        self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"]))
        self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list)

    def test_sent_tokenize(self):
        self.assertEqual(sent_tokenize(None), [])
        self.assertEqual(sent_tokenize(""), [])