-
Notifications
You must be signed in to change notification settings - Fork 274
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #732 from PyThaiNLP/add-blackboard-cls
Add blackboard cls
- Loading branch information
Showing
11 changed files
with
227 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# -*- coding: utf-8 -*- | ||
from typing import List, Tuple | ||
|
||
# Strings that stand in for special characters (token -> escape string).
CHAR_TO_ESCAPE = {" ": "_"}
# Inverse mapping (escape string -> original token).
ESCAPE_TO_CHAR = {escape: char for char, escape in CHAR_TO_ESCAPE.items()}


# map from Blackboard treebank POS tag to Universal POS tag
# from Wannaphong Phatthiyaphaibun & Korakot Chaovavanich
TO_UD = {
    "": "",
    "AJ": "ADJ",
    "AV": "ADV",
    "AX": "AUX",
    "CC": "CCONJ",
    "CL": "NOUN",
    "FX": "NOUN",
    "IJ": "INTJ",
    "NG": "PART",
    "NN": "NOUN",
    "NU": "NUM",
    "PA": "PART",
    "PR": "PROPN",
    "PS": "ADP",
    "PU": "PUNCT",
    "VV": "VERB",
    "XX": "X",
}
|
||
|
||
def pre_process(words: List[str]) -> List[str]:
    """
    Replace special-character tokens with their defined escape strings.

    This function is to be used as a preprocessing step,
    before the actual POS tagging.

    Only a token that is exactly equal to a key of ``CHAR_TO_ESCAPE`` is
    replaced; every other token is passed through unchanged.

    :param words: list of tokens
    :return: a new list of tokens, same length and order as *words*
    """
    # dict.get does the membership test and the lookup in one step.
    return [CHAR_TO_ESCAPE.get(word, word) for word in words]
|
||
|
||
def post_process(
    word_tags: List[Tuple[str, str]], to_ud: bool = False
) -> List[Tuple[str, str]]:
    """
    Convert defined escape strings back to their original tokens.

    This function is to be used as a post-processing step,
    after the POS tagging.

    :param word_tags: list of (word, tag) pairs
    :param to_ud: if True, also map every tag through ``TO_UD``
    :return: a new list of (word, tag) pairs, same length and order
    :raises KeyError: if *to_ud* is True and a tag is not a key of ``TO_UD``
    """
    if to_ud:
        # TO_UD is subscripted on purpose: an unknown tag raises KeyError
        # rather than being passed through silently.
        return [
            (ESCAPE_TO_CHAR.get(word, word), TO_UD[tag])
            for word, tag in word_tags
        ]
    return [(ESCAPE_TO_CHAR.get(word, word), tag) for word, tag in word_tags]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Clause segmenter | ||
""" | ||
from typing import List | ||
|
||
import pycrfsuite | ||
from pythainlp.tag import pos_tag | ||
from pythainlp.corpus import path_pythainlp_corpus | ||
|
||
|
||
def _doc2features(doc, i): | ||
# features from current word | ||
curr_word = doc[i][0] | ||
curr_pos = doc[i][1] | ||
features = { | ||
"word.curr_word": curr_word, | ||
"word.curr_isspace": curr_word.isspace(), | ||
"word.curr_isdigit": curr_word.isdigit(), | ||
"word.curr_postag": curr_pos, | ||
} | ||
|
||
# features from previous word | ||
if i > 0: | ||
prev_word = doc[i - 1][0] | ||
prev_pos = doc[i - 1][1] | ||
features["word.prev_word"] = prev_word | ||
features["word.prev_isspace"] = prev_word.isspace() | ||
features["word.prev_isdigit"] = prev_word.isdigit() | ||
features["word.prev_postag"] = prev_pos | ||
else: | ||
features["BOS"] = True # Beginning of Sequence | ||
|
||
# features from next word | ||
if i < len(doc) - 1: | ||
next_word = doc[i + 1][0] | ||
next_pos = doc[i + 1][1] | ||
features["word.next_word"] = next_word | ||
features["word.next_isspace"] = next_word.isspace() | ||
features["word.next_isdigit"] = next_word.isdigit() | ||
features["word.next_postag"] = next_pos | ||
else: | ||
features["EOS"] = True # End of Sequence | ||
|
||
return features | ||
|
||
|
||
def _extract_features(doc):
    """Return one feature dict per token of *doc*, in token order."""
    return [_doc2features(doc, position) for position, _ in enumerate(doc)]
|
||
|
||
# The CRF model file is opened once, when this module is imported;
# segment() reuses this module-level tagger on every call.
_CORPUS_NAME = "blackboard-cls_v1.0.crfsuite"
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus(_CORPUS_NAME))
|
||
|
||
def segment(doc: List[str]) -> List[List[str]]:
    """
    Split a list of words into clauses.

    The words are POS-tagged with the "blackboard" corpus, turned into
    per-token features, and labelled by the module-level CRF tagger.
    A clause is closed after every word labelled ``"E_CLS"`` and after
    the last word of *doc*.

    :param doc: list of words
    :return: list of clauses, each clause being a list of words
    """
    tagged_words = pos_tag(doc, corpus="blackboard")
    markers = tagger.tag(_extract_features(tagged_words))
    final_index = len(doc) - 1

    clauses = []
    current = []
    for index, (word, marker) in enumerate(zip(doc, markers)):
        current.append(word)
        if marker == "E_CLS" or index == final_index:
            clauses.append(current)
            current = []

    return clauses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters