Merge pull request #732 from PyThaiNLP/add-blackboard-cls

Add blackboard cls

wannaphong authored Oct 18, 2022
2 parents 3a65056 + 16406a4 commit 904439b
Showing 11 changed files with 227 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/api/tokenize.rst
@@ -8,6 +8,7 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
Modules
-------

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
Binary file added pythainlp/corpus/blackboard-cls_v1.0.crfsuite
Binary file not shown.
65 changes: 65 additions & 0 deletions pythainlp/tag/blackboard.py
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
from typing import List, Tuple

# defined strings for special characters
CHAR_TO_ESCAPE = {" ": "_"}
ESCAPE_TO_CHAR = dict((v, k) for k, v in CHAR_TO_ESCAPE.items())


# map from Blackboard treebank POS tag to Universal POS tag
# from Wannaphong Phatthiyaphaibun & Korakot Chaovavanich
TO_UD = {
    "": "",
    "AJ": "ADJ",
    "AV": "ADV",
    "AX": "AUX",
    "CC": "CCONJ",
    "CL": "NOUN",
    "FX": "NOUN",
    "IJ": "INTJ",
    "NG": "PART",
    "NN": "NOUN",
    "NU": "NUM",
    "PA": "PART",
    "PR": "PROPN",
    "PS": "ADP",
    "PU": "PUNCT",
    "VV": "VERB",
    "XX": "X",
}


def pre_process(words: List[str]) -> List[str]:
    """
    Replace signs and symbols with their defined escape strings.
    This function is to be used as a preprocessing step,
    before the actual POS tagging.
    """
    keys = CHAR_TO_ESCAPE.keys()
    words = [CHAR_TO_ESCAPE[word] if word in keys else word for word in words]
    return words


def post_process(
    word_tags: List[Tuple[str, str]], to_ud: bool = False
) -> List[Tuple[str, str]]:
    """
    Convert defined escape strings back to their corresponding signs and symbols.
    This function is to be used as a post-processing step,
    after the POS tagging.
    """
    keys = ESCAPE_TO_CHAR.keys()

    if not to_ud:
        word_tags = [
            (ESCAPE_TO_CHAR[word], tag) if word in keys else (word, tag)
            for word, tag in word_tags
        ]
    else:
        word_tags = [
            (ESCAPE_TO_CHAR[word], TO_UD[tag])
            if word in keys
            else (word, TO_UD[tag])
            for word, tag in word_tags
        ]
    return word_tags
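
Taken together, the two helpers round-trip the escape map; a minimal sketch (the tag pairs fed to post_process are illustrative inputs, not output from a real tagger):

    from pythainlp.tag import blackboard

    # A space token is escaped to "_" before tagging.
    blackboard.pre_process(["ผม", " ", "กิน"])
    # ['ผม', '_', 'กิน']

    # It is restored afterwards; to_ud=True also maps tags through TO_UD.
    blackboard.post_process([("ผม", "PR"), ("_", "PU"), ("กิน", "VV")], to_ud=True)
    # [('ผม', 'PROPN'), (' ', 'PUNCT'), ('กิน', 'VERB')]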
18 changes: 16 additions & 2 deletions pythainlp/tag/perceptron.py
@@ -5,18 +5,20 @@
import os
from typing import List, Tuple

-from pythainlp.corpus import corpus_path
-from pythainlp.tag import PerceptronTagger, orchid
+from pythainlp.corpus import corpus_path, get_corpus_path
+from pythainlp.tag import PerceptronTagger, blackboard, orchid

_ORCHID_FILENAME = "pos_orchid_perceptron.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_BLACKBOARD_NAME = "blackboard_pt_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None


def _orchid_tagger():
@@ -33,6 +35,14 @@ def _pud_tagger():
    return _PUD_TAGGER


def _blackboard_tagger():
    global _BLACKBOARD_TAGGER
    if not _BLACKBOARD_TAGGER:
        # load and cache the model so it is only read once
        path = get_corpus_path(_BLACKBOARD_NAME)
        _BLACKBOARD_TAGGER = PerceptronTagger(path=path)
    return _BLACKBOARD_TAGGER


def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
    """
    :param list words: a list of tokenized words
@@ -52,6 +62,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
        words = orchid.pre_process(words)
        word_tags = _orchid_tagger().tag(words)
        word_tags = orchid.post_process(word_tags, to_ud)
    elif corpus == "blackboard" or corpus == "blackboard_ud":
        words = blackboard.pre_process(words)
        word_tags = _blackboard_tagger().tag(words)
        word_tags = blackboard.post_process(word_tags, to_ud)
    else:  # default, use "pud" as a corpus
        tagger = _pud_tagger()
        word_tags = tagger.tag(words)
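A quick sketch of exercising the new corpus options through this backend directly (the module path and call shape follow the diff; actual tags depend on the downloaded model):

    from pythainlp.tag.perceptron import tag

    tag(["ฉัน", "นอน"], corpus="blackboard")     # native Blackboard tagset
    tag(["ฉัน", "นอน"], corpus="blackboard_ud")  # same tokens, mapped to Universal POS
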
8 changes: 7 additions & 1 deletion pythainlp/tag/pos_tag.py
@@ -22,6 +22,9 @@ def pos_tag(
<https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
text from Thai academic articles (default)
* *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
* *blackboard* - `Blackboard Treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
* *blackboard_ud* - Blackboard text, with tags mapped to Universal POS tags \
  from `Universal Dependencies <https://universaldependencies.org/>`_
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
@@ -87,7 +90,7 @@ def pos_tag(
    if not words:
        return []

-    _support_corpus = ["orchid", "orchid_ud", "pud"]
+    _support_corpus = ["blackboard", "blackboard_ud", "orchid", "orchid_ud", "pud"]

    if engine == "perceptron" and corpus in _support_corpus:
        from pythainlp.tag.perceptron import tag as tag_
@@ -128,6 +131,9 @@ def pos_tag_sents(
<https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
text from Thai academic articles (default)
* *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
* *blackboard* - `Blackboard Treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
* *blackboard_ud* - Blackboard text, with tags mapped to Universal POS tags \
  from `Universal Dependencies <https://universaldependencies.org/>`_
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
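With the corpus whitelist extended, the user-facing call is, for example (a usage sketch; the exact tags returned depend on the model files):

    from pythainlp.tag import pos_tag

    words = ["ฉัน", "นอน"]
    pos_tag(words, engine="perceptron", corpus="blackboard")   # Blackboard tagset
    pos_tag(words, engine="unigram", corpus="blackboard_ud")   # mapped to Universal POS
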
17 changes: 16 additions & 1 deletion pythainlp/tag/unigram.py
@@ -7,16 +7,19 @@
from typing import List, Tuple

from pythainlp.corpus import corpus_path, get_corpus_path
-from pythainlp.tag import orchid
+from pythainlp.tag import blackboard, orchid

_ORCHID_FILENAME = "pos_orchid_unigram.json"
_ORCHID_PATH = os.path.join(corpus_path(), _ORCHID_FILENAME)

_PUD_FILENAME = "pos_ud_unigram-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_BLACKBOARD_NAME = "blackboard_unigram_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None


def _orchid_tagger():
@@ -34,6 +37,14 @@ def _pud_tagger():
        _PUD_TAGGER = json.load(fh)
    return _PUD_TAGGER

def _blackboard_tagger():
    global _BLACKBOARD_TAGGER
    if not _BLACKBOARD_TAGGER:
        path = get_corpus_path(_BLACKBOARD_NAME)
        with open(path, encoding="utf-8-sig") as fh:
            _BLACKBOARD_TAGGER = json.load(fh)
    return _BLACKBOARD_TAGGER


def _find_tag(
    words: List[str], dictdata: dict, default_tag: str = ""
@@ -64,6 +75,10 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
        words = orchid.pre_process(words)
        word_tags = _find_tag(words, _orchid_tagger())
        word_tags = orchid.post_process(word_tags, to_ud)
    elif corpus == "blackboard" or corpus == "blackboard_ud":
        words = blackboard.pre_process(words)
        word_tags = _find_tag(words, _blackboard_tagger())
        word_tags = blackboard.post_process(word_tags, to_ud)
    else:  # default, use "pud" as a corpus
        word_tags = _find_tag(words, _pud_tagger())
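The unigram tagger is a plain word-to-most-frequent-tag dictionary loaded from JSON; a minimal sketch of the lookup that _find_tag performs, assuming it falls back to default_tag for unseen words (the dictionary entries here are invented):

    tagger = {"ฉัน": "PR", "นอน": "VV"}  # shape of the loaded JSON
    [(w, tagger.get(w, "")) for w in ["ฉัน", "นอน", "กขค"]]
    # [('ฉัน', 'PR'), ('นอน', 'VV'), ('กขค', '')]
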
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -7,6 +7,7 @@
"THAI2FIT_TOKENIZER",
"Tokenizer",
"Trie",
"clause_tokenize",
"sent_tokenize",
"subword_tokenize",
"word_tokenize",
@@ -27,6 +28,7 @@

from pythainlp.tokenize.core import (
    Tokenizer,
    clause_tokenize,
    sent_tokenize,
    subword_tokenize,
    word_tokenize,
22 changes: 22 additions & 0 deletions pythainlp/tokenize/core.py
@@ -21,6 +21,28 @@
from pythainlp.util.trie import Trie, dict_trie


def clause_tokenize(doc: List[str]) -> List[List[str]]:
    """
    Clause tokenizer (clause segmentation).
    Tokenizes a list of words into a list of clauses,
    split by a CRF model trained on the Blackboard Treebank.

    :param list doc: a list of words to segment into clauses
    :return: a list of clauses
    :rtype: list[list[str]]
    :Example:

    Clause tokenizer::

        from pythainlp.tokenize import clause_tokenize

        clause_tokenize(["ฉัน","นอน","และ","คุณ","เล่น","มือถือ","ส่วน","น้อง","เขียน","โปรแกรม"])
        # [['ฉัน', 'นอน'],
        #  ['และ', 'คุณ', 'เล่น', 'มือถือ'],
        #  ['ส่วน', 'น้อง', 'เขียน', 'โปรแกรม']]
    """
    from pythainlp.tokenize.crfcls import segment

    return segment(doc)


def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
    """
    Word detokenizer.
74 changes: 74 additions & 0 deletions pythainlp/tokenize/crfcls.py
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
Clause segmenter
"""
from typing import List

import pycrfsuite
from pythainlp.tag import pos_tag
from pythainlp.corpus import path_pythainlp_corpus


def _doc2features(doc, i):
    # features from current word
    curr_word = doc[i][0]
    curr_pos = doc[i][1]
    features = {
        "word.curr_word": curr_word,
        "word.curr_isspace": curr_word.isspace(),
        "word.curr_isdigit": curr_word.isdigit(),
        "word.curr_postag": curr_pos,
    }

    # features from previous word
    if i > 0:
        prev_word = doc[i - 1][0]
        prev_pos = doc[i - 1][1]
        features["word.prev_word"] = prev_word
        features["word.prev_isspace"] = prev_word.isspace()
        features["word.prev_isdigit"] = prev_word.isdigit()
        features["word.prev_postag"] = prev_pos
    else:
        features["BOS"] = True  # Beginning of Sequence

    # features from next word
    if i < len(doc) - 1:
        next_word = doc[i + 1][0]
        next_pos = doc[i + 1][1]
        features["word.next_word"] = next_word
        features["word.next_isspace"] = next_word.isspace()
        features["word.next_isdigit"] = next_word.isdigit()
        features["word.next_postag"] = next_pos
    else:
        features["EOS"] = True  # End of Sequence

    return features


def _extract_features(doc):
    return [_doc2features(doc, i) for i in range(len(doc))]


_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus(_CORPUS_NAME))


def segment(doc: List[str]) -> List[List[str]]:
    word_tags = pos_tag(doc, corpus="blackboard")
    features = _extract_features(word_tags)
    word_markers = list(zip(doc, tagger.tag(features)))

    clauses = []
    temp = []
    len_doc = len(doc) - 1
    for i, word_marker in enumerate(word_markers):
        word, marker = word_marker
        # close the current clause at an E_CLS (end-of-clause) marker
        # or at the last word of the document
        if marker == "E_CLS" or i == len_doc:
            temp.append(word)
            clauses.append(temp)
            temp = []
        else:
            temp.append(word)

    return clauses
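
For a concrete view of what the CRF sees, the feature dict for the middle token of a three-token input looks like this (the POS tags are invented for the example):

    doc = [("ฉัน", "PR"), ("นอน", "VV"), ("แล้ว", "AV")]
    _doc2features(doc, 1)
    # {'word.curr_word': 'นอน', 'word.curr_isspace': False,
    #  'word.curr_isdigit': False, 'word.curr_postag': 'VV',
    #  'word.prev_word': 'ฉัน', 'word.prev_isspace': False,
    #  'word.prev_isdigit': False, 'word.prev_postag': 'PR',
    #  'word.next_word': 'แล้ว', 'word.next_isspace': False,
    #  'word.next_isdigit': False, 'word.next_postag': 'AV'}

Note that tagger.open loads the CRF model at module import time; since clause_tokenize imports this module lazily, that cost is paid on first use.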
19 changes: 19 additions & 0 deletions tests/test_tag.py
@@ -48,6 +48,8 @@ def test_pos_tag(self):
        self.assertEqual(unigram.tag([], corpus="pud"), [])
        self.assertEqual(unigram.tag(None, corpus="orchid"), [])
        self.assertEqual(unigram.tag([], corpus="orchid"), [])
        self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
        self.assertEqual(unigram.tag([], corpus="blackboard"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="unigram", corpus="orchid")
        )
@@ -56,6 +58,11 @@
        )
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="blackboard"))
        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="blackboard"))
        self.assertIsNotNone(
            pos_tag([""], engine="unigram", corpus="blackboard_ud")
        )
        self.assertEqual(
            pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
            [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -74,6 +81,8 @@ def test_pos_tag(self):
        self.assertEqual(perceptron.tag([], corpus="orchid_ud"), [])
        self.assertEqual(perceptron.tag(None, corpus="pud"), [])
        self.assertEqual(perceptron.tag([], corpus="pud"), [])
        self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
        self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="orchid")
        )
@@ -83,6 +92,12 @@
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="pud")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="blackboard")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
        )
        self.assertIsNotNone(
            pos_tag(tokens, engine="tltk")
        )
@@ -96,6 +111,10 @@ def test_pos_tag(self):
[("แมว", "NCMN"), ("วิ่ง", "VACT")],
],
)
with self.assertRaises(ValueError):
self.assertIsNotNone(
tltk.pos_tag(tokens, corpus="blackboard")
)

        # ### pythainlp.tag.PerceptronTagger

5 changes: 5 additions & 0 deletions tests/test_tokenize.py
@@ -23,6 +23,7 @@
    oskut,
    word_detokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie


@@ -204,6 +205,10 @@ def test_Tokenizer(self):
        with self.assertRaises(NotImplementedError):
            Tokenizer(engine="catcut")

    def test_clause_tokenize(self):
        self.assertIsNotNone(sent_clause_tokenize(["ฉัน", "ทดสอบ"]))
        self.assertIsInstance(sent_clause_tokenize(["ฉัน", "ทดสอบ"]), list)

    def test_sent_tokenize(self):
        self.assertEqual(sent_tokenize(None), [])
        self.assertEqual(sent_tokenize(""), [])