diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 47b43c219..8ba41a241 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -13,21 +13,31 @@ class ThaiTextAugmenter: - def __init__(self,) -> None: - from transformers import (AutoTokenizer, - AutoModelForMaskedLM, - pipeline,) + def __init__(self) -> None: + from transformers import ( + AutoTokenizer, + AutoModelForMaskedLM, + pipeline, + ) + self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME) - self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_MODEL_NAME) - self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm) + self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained( + _MODEL_NAME + ) + self.model = pipeline( + "fill-mask", + tokenizer=self.tokenizer, + model=self.model_for_masked_lm, + ) self.processor = ThaiTextProcessor() - def generate(self, - sample_text: str, - word_rank: int, - max_length: int = 3, - sample: bool = False - ) -> str: + def generate( + self, + sample_text: str, + word_rank: int, + max_length: int = 3, + sample: bool = False, + ) -> str: sample_txt = sample_text final_text = "" @@ -45,11 +55,9 @@ def generate(self, return gen_txt - def augment(self, - text: str, - num_augs: int = 3, - sample: bool = False - ) -> List[str]: + def augment( + self, text: str, num_augs: int = 3, sample: bool = False + ) -> List[str]: """ Text augmentation from PhayaThaiBERT @@ -84,11 +92,13 @@ def augment(self, if num_augs <= MAX_NUM_AUGS: for rank in range(num_augs): gen_text = self.generate(text, rank, sample=sample) - processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) + processed_text = re.sub( + "<_>", " ", self.processor.preprocess(gen_text) + ) augment_list.append(processed_text) + else: + raise ValueError( + f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}" + ) - return augment_list - - raise ValueError( - f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}" - ) + return augment_list diff --git a/pythainlp/augment/lm/wangchanberta.py b/pythainlp/augment/lm/wangchanberta.py index 595378186..1299654da 100644 --- a/pythainlp/augment/lm/wangchanberta.py +++ b/pythainlp/augment/lm/wangchanberta.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + from typing import List + from transformers import ( CamembertTokenizer, pipeline, @@ -51,9 +53,9 @@ def generate(self, sentence: str, num_replace_tokens: int = 3): def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]: """ - Text Augment from wangchanberta + Text augmentation from WangchanBERTa - :param str sentence: thai sentence + :param str sentence: Thai sentence :param int num_replace_tokens: number replace tokens :return: list of text augment @@ -64,7 +66,7 @@ def augment(self, sentence: str, num_replace_tokens: int = 3) -> List[str]: from pythainlp.augment.lm import Thai2transformersAug - aug=Thai2transformersAug() + aug = Thai2transformersAug() aug.augment("ช้างมีทั้งหมด 50 ตัว บน") # output: ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้', diff --git a/pythainlp/augment/wordnet.py b/pythainlp/augment/wordnet.py index 99b0d2c3a..0665bd6c7 100644 --- a/pythainlp/augment/wordnet.py +++ b/pythainlp/augment/wordnet.py @@ -12,6 +12,7 @@ from collections import OrderedDict import itertools from typing import List + from nltk.corpus import 
wordnet as wn from pythainlp.corpus import wordnet from pythainlp.tokenize import word_tokenize diff --git a/pythainlp/khavee/__init__.py b/pythainlp/khavee/__init__.py index 4efbf2f62..9afefe8c3 100644 --- a/pythainlp/khavee/__init__.py +++ b/pythainlp/khavee/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + __all__ = ["KhaveeVerifier"] from pythainlp.khavee.core import KhaveeVerifier diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 468d26a82..0bedff0de 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -1,10 +1,14 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa: C901 + from typing import List, Union + from pythainlp.tokenize import subword_tokenize -from pythainlp.util import sound_syllable -from pythainlp.util import remove_tonemark +from pythainlp.util import remove_tonemark, sound_syllable + + class KhaveeVerifier: def __init__(self): """ @@ -26,166 +30,179 @@ def check_sara(self, word: str) -> str: kv = KhaveeVerifier() - print(kv.check_sara('เริง')) + print(kv.check_sara("เริง")) # output: 'เออ' """ sara = [] countoa = 0 + # In case of การันย์ - if '์' in word[-1]: + if "์" in word[-1]: word = word[:-2] + # In case of สระเดี่ยว for i in word: - if i in ('ะ', 'ั'): - sara.append('อะ') - elif i == 'ิ': - sara.append('อิ') - elif i == 'ุ': - sara.append('อุ') - elif i == 'ึ': - sara.append('อึ') - elif i == 'ี': - sara.append('อี') - elif i == 'ู': - sara.append('อู') - elif i == 'ื': - sara.append('อือ') - elif i == 'เ': - sara.append('เอ') - elif i == 'แ': - sara.append('แอ') - elif i == 'า': - sara.append('อา') - elif i == 'โ': - sara.append('โอ') - elif i == 'ำ': - sara.append('อำ') - elif i == 'อ': + if i in ("ะ", "ั"): + sara.append("อะ") + elif i == "ิ": + sara.append("อิ") + elif i == "ุ": + sara.append("อุ") + elif i == "ึ": + sara.append("อึ") + elif i == "ี": + sara.append("อี") + elif i == "ู": + sara.append("อู") + elif i == "ื": + sara.append("อือ") + elif i == "เ": + sara.append("เอ") + elif i == "แ": + sara.append("แอ") + elif i == "า": + sara.append("อา") + elif i == "โ": + sara.append("โอ") + elif i == "ำ": + sara.append("อำ") + elif i == "อ": countoa += 1 - sara.append('ออ') - elif i == 'ั' and 'ว' in word: - sara.append('อัว') - elif i in ('ไ', 'ใ'): - sara.append('ไอ') - elif i == '็': - sara.append('ออ') - elif 'รร' in word: - if self.check_marttra(word) == 'กม': - sara.append('อำ') + sara.append("ออ") + elif i == "ั" and "ว" in word: + sara.append("อัว") + elif i in ("ไ", "ใ"): + sara.append("ไอ") + elif i == "็": + sara.append("ออ") + elif "รร" in word: + if self.check_marttra(word) == "กม": + sara.append("อำ") else: - sara.append('อะ') + sara.append("อะ") + # In case of ออ - if countoa == 1 and 'อ' in word[-1] and 'เ' not in word: - sara.remove('ออ') + if countoa == 1 and "อ" in word[-1] and "เ" not in word: + sara.remove("ออ") + # In case of เอ เอ countA = 0 for i in sara: - if i == 'เอ': + if i == "เอ": countA = countA + 1 if countA > 1: - sara.remove('เอ') - sara.remove('เอ') - sara.append('แ') + sara.remove("เอ") + sara.remove("เอ") + sara.append("แ") + # In case of สระประสม - if 'เอ' in sara and 'อะ' in sara: - sara.remove('เอ') - sara.remove('อะ') - sara.append('เอะ') - elif 'แอ' in sara and 'อะ' in sara: - sara.remove('แอ') - sara.remove('อะ') - sara.append('แอะ') - if 'เอะ' in sara and 'ออ' in sara: - 
sara.remove('เอะ') - sara.remove('ออ') - sara.append('เออะ') - elif 'เอ' in sara and 'อิ' in sara: - sara.remove('เอ') - sara.remove('อิ') - sara.append('เออ') - elif 'เอ' in sara and 'ออ' in sara and 'อ' in word[-1]: - sara.remove('เอ') - sara.remove('ออ') - sara.append('เออ') - elif 'โอ' in sara and 'อะ' in sara: - sara.remove('โอ') - sara.remove('อะ') - sara.append('โอะ') - elif 'เอ' in sara and 'อี' in sara: - sara.remove('เอ') - sara.remove('อี') - sara.append('เอีย') - elif 'เอ' in sara and 'อือ' in sara: - sara.remove('เอ') - sara.remove('อือ') - sara.append('อัว') - elif 'เอ' in sara and 'อา' in sara: - sara.remove('เอ') - sara.remove('อา') - sara.append('เอา') - elif 'เ' in word and 'า' in word and 'ะ' in word: + if "เอ" in sara and "อะ" in sara: + sara.remove("เอ") + sara.remove("อะ") + sara.append("เอะ") + elif "แอ" in sara and "อะ" in sara: + sara.remove("แอ") + sara.remove("อะ") + sara.append("แอะ") + + if "เอะ" in sara and "ออ" in sara: + sara.remove("เอะ") + sara.remove("ออ") + sara.append("เออะ") + elif "เอ" in sara and "อิ" in sara: + sara.remove("เอ") + sara.remove("อิ") + sara.append("เออ") + elif "เอ" in sara and "ออ" in sara and "อ" in word[-1]: + sara.remove("เอ") + sara.remove("ออ") + sara.append("เออ") + elif "โอ" in sara and "อะ" in sara: + sara.remove("โอ") + sara.remove("อะ") + sara.append("โอะ") + elif "เอ" in sara and "อี" in sara: + sara.remove("เอ") + sara.remove("อี") + sara.append("เอีย") + elif "เอ" in sara and "อือ" in sara: + sara.remove("เอ") + sara.remove("อือ") + sara.append("อัว") + elif "เอ" in sara and "อา" in sara: + sara.remove("เอ") + sara.remove("อา") + sara.append("เอา") + elif "เ" in word and "า" in word and "ะ" in word: sara = [] - sara.append('เอาะ') - if 'อือ' in sara and 'เออ' in sara: - sara.remove('เออ') - sara.remove('อือ') - sara.append('เอือ') - elif 'ออ' in sara and len(sara) > 1: - sara.remove('ออ') - elif 'ว' in word and len(sara) == 0: - sara.append('อัว') - if 'ั' in word and self.check_marttra(word) == 'กา': + sara.append("เอาะ") + + if "อือ" in sara and "เออ" in sara: + sara.remove("เออ") + sara.remove("อือ") + sara.append("เอือ") + elif "ออ" in sara and len(sara) > 1: + sara.remove("ออ") + elif "ว" in word and len(sara) == 0: + sara.append("อัว") + + if "ั" in word and self.check_marttra(word) == "กา": sara = [] - sara.append('ไอ') + sara.append("ไอ") + # In case of อ - if word == 'เออะ': + if word == "เออะ": sara = [] - sara.append('เออะ') - elif word == 'เออ': + sara.append("เออะ") + elif word == "เออ": sara = [] - sara.append('เออ') - elif word == 'เอ': + sara.append("เออ") + elif word == "เอ": sara = [] - sara.append('เอ') - elif word == 'เอะ': + sara.append("เอ") + elif word == "เอะ": sara = [] - sara.append('เอะ') - elif word == 'เอา': + sara.append("เอะ") + elif word == "เอา": sara = [] - sara.append('เอา') - elif word == 'เอาะ': + sara.append("เอา") + elif word == "เอาะ": sara = [] - sara.append('เอาะ') - if 'ฤา' in word or 'ฦา' in word: + sara.append("เอาะ") + + if "ฤา" in word or "ฦา" in word: sara = [] - sara.append('อือ') - elif 'ฤ' in word or 'ฦ' in word: + sara.append("อือ") + elif "ฤ" in word or "ฦ" in word: sara = [] - sara.append('อึ') + sara.append("อึ") + # In case of กน if not sara and len(word) == 2: - if word[-1] != 'ร': - sara.append('โอะ') + if word[-1] != "ร": + sara.append("โอะ") else: - sara.append('ออ') + sara.append("ออ") elif not sara and len(word) == 3: - sara.append('ออ') + sara.append("ออ") # In case of บ่ - if 'บ่' == word: + if word == "บ่": sara = [] - sara.append('ออ') - if 'ํ' in word: + 
sara.append("ออ") + + if "ํ" in word: sara = [] - sara.append('อำ') - if 'เ' in word and 'ื' in word and 'อ' in word: + sara.append("อำ") + + if "เ" in word and "ื" in word and "อ" in word: sara = [] - sara.append('เอือ') + sara.append("เอือ") + if not sara: - return 'Can\'t find Sara in this word' - else: - return sara[0] + return "Can't find Sara in this word" + return sara[0] def check_marttra(self, word: str) -> str: """ @@ -205,44 +222,64 @@ def check_marttra(self, word: str) -> str: print(kv.check_marttra('สาว')) # output: 'เกอว' """ - if word[-1] == 'ร' and word[-2] in ['ต', 'ท']: + if word[-1] == "ร" and word[-2] in ["ต", "ท"]: word = word[:-1] word = self.handle_karun_sound_silence(word) word = remove_tonemark(word) - if 'ำ' in word or ('ํ' in word and 'า' in word) or 'ไ' in word or 'ใ' in word: - return 'กา' + if ( + "ำ" in word + or ("ํ" in word and "า" in word) + or "ไ" in word + or "ใ" in word + ): + return "กา" elif ( - word[-1] in ['า', 'ะ', 'ิ', 'ี', 'ุ', 'ู', 'อ'] or - ('ี' in word and 'ย' in word[-1]) or - ('ื' in word and 'อ' in word[-1]) + word[-1] in ["า", "ะ", "ิ", "ี", "ุ", "ู", "อ"] + or ("ี" in word and "ย" in word[-1]) + or ("ื" in word and "อ" in word[-1]) ): - return 'กา' - elif word[-1] in ['ง']: - return 'กง' - elif word[-1] in ['ม']: - return 'กม' - elif word[-1] in ['ย']: - if 'ั' in word: - return 'กา' + return "กา" + elif word[-1] in ["ง"]: + return "กง" + elif word[-1] in ["ม"]: + return "กม" + elif word[-1] in ["ย"]: + if "ั" in word: + return "กา" else: - return 'เกย' - elif word[-1] in ['ว']: - return 'เกอว' - elif word[-1] in ['ก', 'ข', 'ค', 'ฆ']: - return 'กก' + return "เกย" + elif word[-1] in ["ว"]: + return "เกอว" + elif word[-1] in ["ก", "ข", "ค", "ฆ"]: + return "กก" elif word[-1] in [ - 'จ', 'ช', 'ซ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ด', 'ต', 'ถ', 'ท', 'ธ', 'ศ', 'ษ', 'ส' + "จ", + "ช", + "ซ", + "ฎ", + "ฏ", + "ฐ", + "ฑ", + "ฒ", + "ด", + "ต", + "ถ", + "ท", + "ธ", + "ศ", + "ษ", + "ส", ]: - return 'กด' - elif word[-1] in ['ญ', ', ณ', 'น', 'ร', 'ล', 'ฬ']: - return 'กน' - elif word[-1] in ['บ', 'ป', 'พ', 'ฟ', 'ภ']: - return 'กบ' + return "กด" + elif word[-1] in ["ญ", ", ณ", "น", "ร", "ล", "ฬ"]: + return "กน" + elif word[-1] in ["บ", "ป", "พ", "ฟ", "ภ"]: + return "กบ" else: - if '็' in word: - return 'กา' + if "็" in word: + return "กา" else: - return 'Cant find Marttra in this word' + return "Cant find Marttra in this word" def is_sumpus(self, word1: str, word2: str) -> bool: """ @@ -270,39 +307,46 @@ def is_sumpus(self, word1: str, word2: str) -> bool: marttra2 = self.check_marttra(word2) sara1 = self.check_sara(word1) sara2 = self.check_sara(word2) - if sara1 == 'อะ' and marttra1 == 'เกย': - sara1 = 'ไอ' - marttra1 = 'กา' - elif sara2 == 'อะ' and marttra2 == 'เกย': - sara2 = 'ไอ' - marttra2 = 'กา' - if sara1 == 'อำ' and marttra1 == 'กม': - sara1 = 'อำ' - marttra1 = 'กา' - elif sara2 == 'อำ' and marttra2 == 'กม': - sara2 = 'อำ' - marttra2 = 'กา' + if sara1 == "อะ" and marttra1 == "เกย": + sara1 = "ไอ" + marttra1 = "กา" + elif sara2 == "อะ" and marttra2 == "เกย": + sara2 = "ไอ" + marttra2 = "กา" + if sara1 == "อำ" and marttra1 == "กม": + sara1 = "อำ" + marttra1 = "กา" + elif sara2 == "อำ" and marttra2 == "กม": + sara2 = "อำ" + marttra2 = "กา" return bool(marttra1 == marttra2 and sara1 == sara2) def check_karu_lahu(self, text): if ( - ( - self.check_marttra(text) != 'กา' or - ( - self.check_marttra(text) == 'กา' and - self.check_sara(text) in [ - 'อา', 'อี', 'อือ', 'อู', 'เอ', - 'แอ', 'โอ', 'ออ', 'เออ', 'เอีย', - 'เอือ', 'อัว' - ] - ) or - self.check_sara(text) 
in ['อำ', 'ไอ', 'เอา'] - ) and - text not in ['บ่', 'ณ', 'ธ', 'ก็'] - ): - return 'karu' + self.check_marttra(text) != "กา" + or ( + self.check_marttra(text) == "กา" + and self.check_sara(text) + in [ + "อา", + "อี", + "อือ", + "อู", + "เอ", + "แอ", + "โอ", + "ออ", + "เออ", + "เอีย", + "เอือ", + "อัว", + ] + ) + or self.check_sara(text) in ["อำ", "ไอ", "เอา"] + ) and text not in ["บ่", "ณ", "ธ", "ก็"]: + return "karu" else: - return 'lahu' + return "lahu" def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: """ @@ -346,19 +390,24 @@ def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: list_sumpus_sent3 = [] list_sumpus_sent4 = [] for i, sent in enumerate(text.split()): - sub_sent = subword_tokenize(sent, engine='dict') + sub_sent = subword_tokenize(sent, engine="dict") if len(sub_sent) > 10: error.append( - 'In sentence ' + - str(i + 2) + - ', there are more than 10 words. ' + - str(sub_sent) + "In sentence " + + str(i + 2) + + ", there are more than 10 words. " + + str(sub_sent) ) if (i + 1) % 4 == 1: list_sumpus_sent1.append(sub_sent[-1]) elif (i + 1) % 4 == 2: list_sumpus_sent2h.append( - [sub_sent[1], sub_sent[2], sub_sent[3], sub_sent[4]] + [ + sub_sent[1], + sub_sent[2], + sub_sent[3], + sub_sent[4], + ] ) list_sumpus_sent2l.append(sub_sent[-1]) elif (i + 1) % 4 == 3: @@ -366,49 +415,78 @@ def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: elif (i + 1) % 4 == 0: list_sumpus_sent4.append(sub_sent[-1]) if ( - len(list_sumpus_sent1) != len(list_sumpus_sent2h) or - len(list_sumpus_sent2h) != len(list_sumpus_sent2l) or - len(list_sumpus_sent2l) != len(list_sumpus_sent3) or - len(list_sumpus_sent3) != len(list_sumpus_sent4) or - len(list_sumpus_sent4) != len(list_sumpus_sent1) + len(list_sumpus_sent1) != len(list_sumpus_sent2h) + or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) + or len(list_sumpus_sent2l) != len(list_sumpus_sent3) + or len(list_sumpus_sent3) != len(list_sumpus_sent4) + or len(list_sumpus_sent4) != len(list_sumpus_sent1) ): - return 'The poem does not have 4 complete sentences.' + return "The poem does not have 4 complete sentences." 
else: for i in range(len(list_sumpus_sent1)): countwrong = 0 for j in list_sumpus_sent2h[i]: - if self.is_sumpus(list_sumpus_sent1[i], j) is False: + if ( + self.is_sumpus(list_sumpus_sent1[i], j) + is False + ): countwrong += 1 if countwrong > 3: error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent1[i], list_sumpus_sent2h[i])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent1[i], + list_sumpus_sent2h[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if ( + self.is_sumpus( + list_sumpus_sent2l[i], list_sumpus_sent3[i] ) - if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent3[i]) is False: + is False + ): error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent2l[i], list_sumpus_sent3[i])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent3[i], + ) + ) + + " in paragraph " + + str(i + 1) ) if i > 0: - if self.is_sumpus( - list_sumpus_sent2l[i], list_sumpus_sent4[i - 1] - ) is False: + if ( + self.is_sumpus( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + is False + ): error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent2l[i], list_sumpus_sent4[i - 1])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + ) + + " in paragraph " + + str(i + 1) ) if not error: - return 'The poem is correct according to the principle.' + return ( + "The poem is correct according to the principle." + ) else: return error except: - return 'Something went wrong. Make sure you enter it in the correct form of klon 8.' + return "Something went wrong. Make sure you enter it in the correct form of klon 8." elif k_type == 4: try: error = [] @@ -418,13 +496,13 @@ def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: list_sumpus_sent3 = [] list_sumpus_sent4 = [] for i, sent in enumerate(text.split()): - sub_sent = subword_tokenize(sent, engine='dict') + sub_sent = subword_tokenize(sent, engine="dict") if len(sub_sent) > 5: error.append( - 'In sentence ' + - str(i + 2) + - ', there are more than 4 words. ' + - str(sub_sent) + "In sentence " + + str(i + 2) + + ", there are more than 4 words. " + + str(sub_sent) ) if (i + 1) % 4 == 1: list_sumpus_sent1.append(sub_sent[-1]) @@ -436,57 +514,84 @@ def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]: elif (i + 1) % 4 == 0: list_sumpus_sent4.append(sub_sent[-1]) if ( - len(list_sumpus_sent1) != len(list_sumpus_sent2h) or - len(list_sumpus_sent2h) != len(list_sumpus_sent2l) or - len(list_sumpus_sent2l) != len(list_sumpus_sent3) or - len(list_sumpus_sent3) != len(list_sumpus_sent4) or - len(list_sumpus_sent4) != len(list_sumpus_sent1) + len(list_sumpus_sent1) != len(list_sumpus_sent2h) + or len(list_sumpus_sent2h) != len(list_sumpus_sent2l) + or len(list_sumpus_sent2l) != len(list_sumpus_sent3) + or len(list_sumpus_sent3) != len(list_sumpus_sent4) + or len(list_sumpus_sent4) != len(list_sumpus_sent1) ): - return 'The poem does not have 4 complete sentences.' + return "The poem does not have 4 complete sentences." 
else: for i in range(len(list_sumpus_sent1)): countwrong = 0 for j in list_sumpus_sent2h[i]: - if self.is_sumpus(list_sumpus_sent1[i], j) is False: + if ( + self.is_sumpus(list_sumpus_sent1[i], j) + is False + ): countwrong += 1 if countwrong > 1: error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent1[i], list_sumpus_sent2h[i])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent1[i], + list_sumpus_sent2h[i], + ) + ) + + " in paragraph " + + str(i + 1) + ) + if ( + self.is_sumpus( + list_sumpus_sent2l[i], list_sumpus_sent3[i] ) - if self.is_sumpus(list_sumpus_sent2l[i], list_sumpus_sent3[i]) is False: + is False + ): error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent2l[i], list_sumpus_sent3[i])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent3[i], + ) + ) + + " in paragraph " + + str(i + 1) ) if i > 0: - if self.is_sumpus( - list_sumpus_sent2l[i], list_sumpus_sent4[i - 1] - ) is False: + if ( + self.is_sumpus( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + is False + ): error.append( - 'Can\'t find rhyme between paragraphs ' + - str((list_sumpus_sent2l[i], list_sumpus_sent4[i - 1])) + - ' in paragraph ' + - str(i + 1) + "Can't find rhyme between paragraphs " + + str( + ( + list_sumpus_sent2l[i], + list_sumpus_sent4[i - 1], + ) + ) + + " in paragraph " + + str(i + 1) ) if not error: - return 'The poem is correct according to the principle.' + return ( + "The poem is correct according to the principle." + ) else: return error except: - return 'Something went wrong. Make sure you enter it in the correct form.' + return "Something went wrong. Make sure you enter it in the correct form." else: - return 'Something went wrong. Make sure you enter it in the correct form.' + return "Something went wrong. Make sure you enter it in the correct form." 
def check_aek_too( - self, - text: Union[List[str], str], - dead_syllable_as_aek: bool = False + self, text: Union[List[str], str], dead_syllable_as_aek: bool = False ) -> Union[List[bool], List[str], bool, str]: """ Checker of Thai tonal words @@ -515,15 +620,15 @@ def check_aek_too( return [self.check_aek_too(t, dead_syllable_as_aek) for t in text] if not isinstance(text, str): - raise TypeError('text must be str or iterable list[str]') + raise TypeError("text must be str or iterable list[str]") word_characters = [*text] - if '่' in word_characters and not '้' in word_characters: - return 'aek' - elif '้' in word_characters and not '่' in word_characters: - return 'too' - if dead_syllable_as_aek and sound_syllable(text) == 'dead': - return 'aek' + if "่" in word_characters and not "้" in word_characters: + return "aek" + elif "้" in word_characters and not "่" in word_characters: + return "too" + if dead_syllable_as_aek and sound_syllable(text) == "dead": + return "aek" else: return False @@ -536,12 +641,12 @@ def handle_karun_sound_silence(self, word: str) -> str: :return: Thai word with silent words stripped :rtype: str """ - sound_silenced = word.endswith('์') + sound_silenced = word.endswith("์") if not sound_silenced: return word thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" - locate_silenced = word.rfind('์') - 1 - can_silence_two = word[locate_silenced-2] in thai_consonants + locate_silenced = word.rfind("์") - 1 + can_silence_two = word[locate_silenced - 2] in thai_consonants cut_off = 2 if can_silence_two else 1 - word = word[:locate_silenced + 1 - cut_off] + word = word[: locate_silenced + 1 - cut_off] return word diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py index 6fd1a7a3e..e1fd1e9b4 100644 --- a/pythainlp/khavee/example.py +++ b/pythainlp/khavee/example.py @@ -1,34 +1,40 @@ # -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + import core + kv = core.KhaveeVerifier() # การเช็คสระ -print('เออ', kv.check_sara('เมอ')) +print("เออ", kv.check_sara("เมอ")) # 'เออ' # การเช็คมาตราตัวสะกด -print('เทอว', kv.check_marttra('เทอว')) +print("เทอว", kv.check_marttra("เทอว")) # 'เกอว' # การตรวจสอบคำสำผัสที่ถูกต้อง -print('สรร อัน', kv.is_sumpus('สรร', 'อัน')) +print("สรร อัน", kv.is_sumpus("สรร", "อัน")) # True # การตรวจสอบคำสำผัสที่ผิด -print('เพื่อน ล้วน', kv.is_sumpus('เพื่อน', 'ล้วน')) +print("เพื่อน ล้วน", kv.is_sumpus("เพื่อน", "ล้วน")) # False # การตรวจสอบคำ ครุ ลหุ -print('สรร', kv.check_karu_lahu('สรร')) -#karu +print("สรร", kv.check_karu_lahu("สรร")) +# karu # การตรวจสอบคำ ครุ ลหุ -print('ชิชะ', kv.check_karu_lahu('ชิชะ')) +print("ชิชะ", kv.check_karu_lahu("ชิชะ")) # lahu # การตรวจสอบกลอน 8 ที่ถูกฉันทลักษณ์ -print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล +print( + kv.check_klon( + """ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล เรื่องฟิสิกส์คณิตศาสตร์เอิร์นอดทน เล่นเกมเก่งลำดับต้นของโรงเรียน ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน เหมือนจะเล่นแต่เขายังพากเพียร ในการเรียนการเล่นบ้างคละกันไป @@ -37,11 +43,16 @@ นริศราอีฟเก่งกว่าใครเพื่อน คอยช่วยเตือนเรื่องงานคอยสั่งสอน อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมนี้เก่งกาจไม่กังขา -เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''', k_type=8)) +เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน""", + k_type=8, + ) +) # -> The poem is correct according to the principle. 
# การตรวจสอบกลอน 8 ที่ผิดฉันทลักษณ์ -print(kv.check_klon('''ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล +print( + kv.check_klon( + """ณรงค์วุฒิผู้เปี่ยมวุฒิสมสง่า มากวิชาหาความรู้ไปสู่ผล เรื่องฟิสิกส์คณิตศาสตร์เอิร์นอดทน เล่นเกมเก่งลำดับต้นของโรงเรียน ต่อมาหยกธนัชพรชอบนอนหลับ แต่ผลลัพธ์คือฉลาดเรื่องอ่านเขียน เหมือนจะเล่นแต่เขายังพากเพียร ในการเรียนการเล่นบ้างคละกันไป @@ -50,22 +61,39 @@ นริศราอีฟเก่งกว่าใครเพื่อน คอยช่วยเตือนเรื่องงานคอยสั่งสอน อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมเอ่อเก่งกาจไม่กังขา -เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''', k_type=8)) +เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน""", + k_type=8, + ) +) # -> ["Cant find rhyme between paragraphs ('สอน', 'ไป') in paragraph 4", "Cant find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ']) in paragraph 5"] # การตรวจสอบกลอน 4 ที่ถูกฉันทลักษณ์ -print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร''', k_type=4)) +print( + kv.check_klon( + """ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร""", + k_type=4, + ) +) # -> The poem is correct according to the principle. # การตรวจสอบกลอน 4 ที่ผิดฉันทลักษณ์ -print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''', k_type=4)) +print( + kv.check_klon( + """ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร""", + k_type=4, + ) +) # -> ["Cant find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"] # การเช็คคำเอกโท -print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง')) +print( + kv.check_aek_too("เอง"), kv.check_aek_too("เอ่ง"), kv.check_aek_too("เอ้ง") +) # -> False, aek, too -print(kv.check_aek_too(['เอง', 'เอ่ง', 'เอ้ง'])) # ใช้ List ได้เหมือนกัน +print(kv.check_aek_too(["เอง", "เอ่ง", "เอ้ง"])) # ใช้ List ได้เหมือนกัน # -> [False, 'aek', 'too'] -print(kv.check_aek_too(['ห๊ะ', 'เอ่ง', 'เอ้ง'], dead_syllable_as_aek=True)) # ใช้ List ได้เหมือนกัน และสามารถตั้งค่า ให้นับคำที่เสียงตายเป็นเอกได้ ตามการเช็คคฉันทลักษณ์กลอน +print( + kv.check_aek_too(["ห๊ะ", "เอ่ง", "เอ้ง"], dead_syllable_as_aek=True) +) # ใช้ List ได้เหมือนกัน และสามารถตั้งค่า ให้นับคำที่เสียงตายเป็นเอกได้ ตามการเช็คคฉันทลักษณ์กลอน # -> ['aek', 'aek', 'too'] diff --git a/pythainlp/parse/__init__.py b/pythainlp/parse/__init__.py index 9be9834a9..5cd6c0988 100644 --- a/pythainlp/parse/__init__.py +++ b/pythainlp/parse/__init__.py @@ -1,8 +1,10 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + """ PyThaiNLP Parse """ __all__ = ["dependency_parsing"] + from pythainlp.parse.core import dependency_parsing diff --git a/pythainlp/parse/core.py b/pythainlp/parse/core.py index 1dd5d8bd2..1dcb9e4da 100644 --- a/pythainlp/parse/core.py +++ b/pythainlp/parse/core.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + from typing import List, Union @@ -9,7 +10,7 @@ def dependency_parsing( - text: str, model: str = None, tag: str = "str", engine: str = "esupar" + text: str, model: str = "", tag: str = "str", engine: str = "esupar" ) -> Union[List[List[str]], str]: """ Dependency Parsing @@ -92,6 +93,7 @@ def 
dependency_parsing( # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No """ global _tagger, _tagger_name + if _tagger_name != engine: if engine == "esupar": from pythainlp.parse.esupar_engine import Parse @@ -111,5 +113,7 @@ def dependency_parsing( _tagger = Parse(model=model) else: raise NotImplementedError("The engine doesn't support.") + _tagger_name = engine + return _tagger(text, tag=tag) diff --git a/pythainlp/parse/spacy_thai_engine.py b/pythainlp/parse/spacy_thai_engine.py index dabf4caeb..6d0eef7ab 100644 --- a/pythainlp/parse/spacy_thai_engine.py +++ b/pythainlp/parse/spacy_thai_engine.py @@ -3,8 +3,10 @@ spacy_thai: Tokenizer, POS tagger, and dependency parser for the Thai language using Universal Dependencies. GitHub: https://github.com/KoichiYasuoka/spacy-thai + """ from typing import List, Union + import spacy_thai diff --git a/pythainlp/parse/transformers_ud.py b/pythainlp/parse/transformers_ud.py index 2adfe2849..9d1ffca5b 100644 --- a/pythainlp/parse/transformers_ud.py +++ b/pythainlp/parse/transformers_ud.py @@ -12,14 +12,15 @@ """ import os from typing import List, Union + import numpy import torch import ufal.chu_liu_edmonds from transformers import ( - AutoTokenizer, + AutoConfig, AutoModelForQuestionAnswering, AutoModelForTokenClassification, - AutoConfig, + AutoTokenizer, TokenClassificationPipeline, ) from transformers.utils import cached_file @@ -35,8 +36,9 @@ def __init__( self.model = AutoModelForQuestionAnswering.from_pretrained(model) x = AutoModelForTokenClassification.from_pretrained if os.path.isdir(model): - d, t = x(os.path.join(model, "deprel")), x( - os.path.join(model, "tagger") + d, t = ( + x(os.path.join(model, "deprel")), + x(os.path.join(model, "tagger")), ) else: c = AutoConfig.from_pretrained( @@ -61,11 +63,13 @@ def __call__( (t["start"], t["end"], t["entity_group"]) for t in self.deprel(text) ] - z, n = { - t["start"]: t["entity"].split("|") for t in self.tagger(text) - }, len(w) - r, m = [text[s:e] for s, e, p in w], numpy.full( - (n + 1, n + 1), numpy.nan + z, n = ( + {t["start"]: t["entity"].split("|") for t in self.tagger(text)}, + len(w), + ) + r, m = ( + [text[s:e] for s, e, p in w], + numpy.full((n + 1, n + 1), numpy.nan), ) v, c = self.tokenizer(r, add_special_tokens=False)["input_ids"], [] for i, t in enumerate(v): diff --git a/pythainlp/parse/ud_goeswith.py b/pythainlp/parse/ud_goeswith.py index eb9b27810..fc258d41d 100644 --- a/pythainlp/parse/ud_goeswith.py +++ b/pythainlp/parse/ud_goeswith.py @@ -11,10 +11,11 @@ GitHub: https://github.com/KoichiYasuoka """ from typing import List, Union -from transformers import AutoTokenizer, AutoModelForTokenClassification + import numpy as np import torch import ufal.chu_liu_edmonds +from transformers import AutoModelForTokenClassification, AutoTokenizer class Parse: @@ -27,21 +28,20 @@ def __init__( self.model = AutoModelForTokenClassification.from_pretrained(model) def __call__( - self, - text: str, tag: str = "str" + self, text: str, tag: str = "str" ) -> Union[List[List[str]], str]: w = self.tokenizer(text, return_offsets_mapping=True) v = w["input_ids"] x = [ - v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1:] + [j] + v[0:i] + [self.tokenizer.mask_token_id] + v[i + 1 :] + [j] for i, j in enumerate(v[1:-1], 1) ] with torch.no_grad(): - e = self.model(input_ids=torch.tensor(x) - ).logits.numpy()[:, 1:-2, :] + e = self.model(input_ids=torch.tensor(x)).logits.numpy()[ + :, 1:-2, : + ] r = [ - 1 if i == 0 else -1 - if j.endswith("|root") else 0 + 1 if i == 0 else -1 if j.endswith("|root") else 0 for i, 
j in sorted(self.model.config.id2label.items())
         ]
         e += np.where(np.add.outer(np.identity(e.shape[0]), r) == 0, 0, np.nan)
@@ -60,8 +60,10 @@ def __call__(
         h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
         if [0 for i in h if i == 0] != [0]:
             m[:, 0] += np.where(
-                m[:, 0] == np.nanmax(
-                    m[[i for i, j in enumerate(h) if j == 0], 0]), 0, np.nan
+                m[:, 0]
+                == np.nanmax(m[[i for i, j in enumerate(h) if j == 0], 0]),
+                0,
+                np.nan,
             )
             m[[i for i, j in enumerate(h) if j == 0]] += [
                 0 if i == 0 or j == 0 else np.nan for i, j in enumerate(h)
@@ -84,21 +86,30 @@ def __call__(
                         str(h[i]),
                         q[-1],
                         "_",
-                        "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"
+                        "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No",
                     ]
                 )
             return _tag_data
         else:
             for i, (s, e) in enumerate(v, 1):
                 q = self.model.config.id2label[p[i, h[i]]].split("|")
-                u += "\t".join([str(i),
-                                text[s:e],
-                                "_",
-                                q[0],
-                                "_",
-                                "|".join(q[1:-1]),
-                                str(h[i]),
-                                q[-1],
-                                "_",
-                                "_" if i < len(v) and e < v[i][0] else "SpaceAfter=No"]) + "\n"
+                u += (
+                    "\t".join(
+                        [
+                            str(i),
+                            text[s:e],
+                            "_",
+                            q[0],
+                            "_",
+                            "|".join(q[1:-1]),
+                            str(h[i]),
+                            q[-1],
+                            "_",
+                            "_"
+                            if i < len(v) and e < v[i][0]
+                            else "SpaceAfter=No",
+                        ]
+                    )
+                    + "\n"
+                )
         return u + "\n"
diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index bf0f847bf..537de7706 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -1,6 +1,11 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
+
+"""
+PhayaThaiBERT
+"""
+
 __all__ = [
     "NamedEntityTagger",
     "PartOfSpeechTagger",
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 579d24ddc..c4498799a 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, Collection, List, Tuple, Union
 import random
 import re
@@ -18,19 +19,24 @@
 class ThaiTextProcessor:
     def __init__(self):
-        self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = \
-            "<unk> <rep> <wrep> <url> </s>".split()
+        (
+            self._TK_UNK,
+            self._TK_REP,
+            self._TK_WREP,
+            self._TK_URL,
+            self._TK_END,
+        ) = "<unk> <rep> <wrep> <url> </s>".split()
         self.SPACE_SPECIAL_TOKEN = "<_>"

     def replace_url(self, text: str) -> str:
         """
-        Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965)
-        :param str text: text to replace url
-        :return: text where urls are replaced
-        :rtype: str
-        :Example:
-            >>> replace_url("go to https://github.com")
-            go to <url>
+        Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965)
+        :param str text: text to replace url
+        :return: text where urls are replaced
+        :rtype: str
+        :Example:
+            >>> replace_url("go to https://github.com")
+            go to <url>
         """
         URL_PATTERN = \
             r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
         return re.sub(URL_PATTERN, self._TK_URL, text)

     def rm_brackets(text: str) -> str:
         """
-        Remove all empty brackets and artifacts within brackets from `text`.
+ :param str text: text to remove useless brackets + :return: text where all useless brackets are removed + :rtype: str + :Example: + >>> rm_brackets("hey() whats[;] up{*&} man(hey)") + hey whats up man(hey) """ # remove empty brackets new_line = re.sub(r"\(\)", "", text) @@ -55,49 +61,61 @@ def rm_brackets(text: str) -> str: new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) # artifiacts after ( - new_line = re.sub(r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) - new_line = re.sub(r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) - new_line = re.sub(r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) + new_line = re.sub( + r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) + new_line = re.sub( + r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line + ) # artifacts before ) - new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line) - new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line) - new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line + ) + new_line = re.sub( + r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line + ) return new_line def replace_newlines(text: str) -> str: """ - Replace newlines in `text` with spaces. - :param str text: text to replace all newlines with spaces - :return: text where all newlines are replaced with spaces - :rtype: str - :Example: - >>> rm_useless_spaces("hey whats\n\nup") - hey whats up + Replace newlines in `text` with spaces. + :param str text: text to replace all newlines with spaces + :return: text where all newlines are replaced with spaces + :rtype: str + :Example: + >>> rm_useless_spaces("hey whats\n\nup") + hey whats up """ return re.sub(r"[\n]", " ", text.strip()) def rm_useless_spaces(text: str) -> str: """ - Remove multiple spaces in `text`. (code from `fastai`) - :param str text: text to replace useless spaces - :return: text where all spaces are reduced to one - :rtype: str - :Example: - >>> rm_useless_spaces("oh no") - oh no + Remove multiple spaces in `text`. 
(code from `fastai`)
+        :param str text: text to replace useless spaces
+        :return: text where all spaces are reduced to one
+        :rtype: str
+        :Example:
+            >>> rm_useless_spaces("oh no")
+            oh no
         """
         return re.sub(" {2,}", " ", text)

     def replace_spaces(text: str, space_token: str = "<_>") -> str:
         """
-        Replace spaces with _
-        :param str text: text to replace spaces
-        :return: text where all spaces replaced with _
-        :rtype: str
-        :Example:
-            >>> replace_spaces("oh no")
-            oh_no
+        Replace spaces with _
+        :param str text: text to replace spaces
+        :return: text where all spaces replaced with _
+        :rtype: str
+        :Example:
+            >>> replace_spaces("oh no")
+            oh_no
         """
         return re.sub(" ", space_token, text)
@@ -144,6 +162,7 @@ def replace_wrep_post(self, toks: Collection[str]) -> Collection[str]:
             else:
                 res.append(previous_word)
             previous_word = current_word
+
         return res[1:]

     def remove_space(toks: Collection[str]) -> Collection[str]:
@@ -162,6 +181,7 @@ def remove_space(toks: Collection[str]) -> Collection[str]:
             t = t.strip()
             if t:
                 res.append(t)
+
         return res

     # combine them together
@@ -181,48 +201,58 @@ def preprocess(
         for rule in pre_rules:
             text = rule(text)
         toks = tok_func(text)
+
         return "".join(toks)


 class ThaiTextAugmenter:
     def __init__(self) -> None:
-        from transformers import (AutoTokenizer,
-                                  AutoModelForMaskedLM,
-                                  pipeline,)
+        from transformers import (
+            AutoModelForMaskedLM,
+            AutoTokenizer,
+            pipeline,
+        )
+
         self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
-        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
-        self.model = pipeline("fill-mask",
-                              tokenizer=self.tokenizer,
-                              model=self.model_for_masked_lm,
-                              )
+        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(
+            _model_name
+        )
+        self.model = pipeline(
+            "fill-mask",
+            tokenizer=self.tokenizer,
+            model=self.model_for_masked_lm,
+        )
         self.processor = ThaiTextProcessor()

-    def generate(self,
-                 sample_text: str,
-                 word_rank: int,
-                 max_length: int = 3,
-                 sample: bool = False,
-                 ) -> str:
+    def generate(
+        self,
+        sample_text: str,
+        word_rank: int,
+        max_length: int = 3,
+        sample: bool = False,
+    ) -> str:
         sample_txt = sample_text
         final_text = ""
         for j in range(max_length):
             input = self.processor.preprocess(sample_txt)
             if sample:
                 random_word_idx = random.randint(0, 4)
-                output = self.model(input)[random_word_idx]['sequence']
+                output = self.model(input)[random_word_idx]["sequence"]
             else:
-                output = self.model(input)[word_rank]['sequence']
+                output = self.model(input)[word_rank]["sequence"]
             sample_txt = output + "<mask>"
             final_text = sample_txt
         gen_txt = re.sub("<mask>", "", final_text)
+
         return gen_txt

-    def augment(self,
-                text: str,
-                num_augs: int = 3,
-                sample: bool = False,
-                ) -> List[str]:
+    def augment(
+        self,
+        text: str,
+        num_augs: int = 3,
+        sample: bool = False,
+    ) -> List[str]:
         """
         Text Augment from phayathaibert

@@ -248,80 +278,96 @@ def augment(self,
             'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
             'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
         """
+        MAX_NUM_AUGS = 5
         augment_list = []
-        if num_augs <= 5:
+
+        if num_augs <= MAX_NUM_AUGS:
             for rank in range(num_augs):
-                gen_text = self.generate(text,
-                                         rank,
-                                         sample=sample,
-                                         )
-                processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
+                gen_text = self.generate(
+                    text,
+                    rank,
+                    sample=sample,
+                )
+                processed_text = re.sub(
+                    "<_>", " ", self.processor.preprocess(gen_text)
+                )
                 augment_list.append(processed_text)
+        else:
+            raise ValueError(
+                f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}"
+            )
-        return augment_list
+
return augment_list class PartOfSpeechTagger: def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None: # Load model directly - from transformers import (AutoTokenizer, - AutoModelForTokenClassification, - ) + from transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) - def get_tag(self, - sentence: str, - strategy: str = 'simple' - ) -> List[List[Tuple[str, str]]]: + def get_tag( + self, sentence: str, strategy: str = "simple" + ) -> List[List[Tuple[str, str]]]: """ - Marks sentences with part-of-speech (POS) tags. + Marks sentences with part-of-speech (POS) tags. - :param str sentence: a list of lists of tokenized words - :return: a list of lists of tuples (word, POS tag) - :rtype: list[list[tuple[str, str]]] + :param str sentence: a list of lists of tokenized words + :return: a list of lists of tuples (word, POS tag) + :rtype: list[list[tuple[str, str]]] - :Example: + :Example: - Labels POS for given sentence:: + Labels POS for given sentence:: - from pythainlp.phayathaibert.core import PartOfSpeechTagger + from pythainlp.phayathaibert.core import PartOfSpeechTagger - tagger = PartOfSpeechTagger() - tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า") - # output: - # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] + tagger = PartOfSpeechTagger() + tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า") + # output: + # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] """ from transformers import TokenClassificationPipeline - pipeline = TokenClassificationPipeline(model=self.model, - tokenizer=self.tokenizer, - aggregation_strategy=strategy, - ) + + pipeline = TokenClassificationPipeline( + model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy=strategy, + ) outputs = pipeline(sentence) - word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] + word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]] + return word_tags class NamedEntityTagger: def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: - from transformers import (AutoTokenizer, - AutoModelForTokenClassification, - ) + from transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) - def get_ner(self, - text: str, - tag: bool = False, - pos: bool = False, - strategy: str = "simple", - ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + def get_ner( + self, + text: str, + tag: bool = False, + pos: bool = False, + strategy: str = "simple", + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. 
:param str text: text in Thai to be tagged
         :param bool pos: output with part-of-speech tags.\
-            (phayathaibert is supported in PartOfSpeechTagger)
+            (PhayaThaiBERT is supported in PartOfSpeechTagger)
         :return: a list of tuples associated with tokenized words, NER tags,
                  POS tags (if the parameter `pos` is specified as `True`),
                  and output HTML-like tags (if the parameter `tag` is
@@ -343,41 +389,56 @@ def get_ner(self,
 จากประเทศไทย'
         """
         from transformers import TokenClassificationPipeline
+
         if pos:
-            warnings.warn("This model doesn't support output \
-                postag and It doesn't output the postag.")
+            warnings.warn(
+                "This model doesn't support output \
+                postag and It doesn't output the postag."
+            )
+
         sample_output = []
         tag_text_list = []
         current_pos = 0
-        pipeline = TokenClassificationPipeline(model=self.model,
-                                               tokenizer=self.tokenizer,
-                                               aggregation_strategy=strategy,
-                                               )
+        pipeline = TokenClassificationPipeline(
+            model=self.model,
+            tokenizer=self.tokenizer,
+            aggregation_strategy=strategy,
+        )
         outputs = pipeline(text)
+
         for token in outputs:
-            ner_tag = token['entity_group']
-            begin_pos, end_pos = token['start'], token['end']
+            ner_tag = token["entity_group"]
+            begin_pos, end_pos = token["start"], token["end"]
             if current_pos == 0:
-                text_tag = text[:begin_pos] + f"<{ner_tag}>" \
-                    + text[begin_pos:end_pos] + f"</{ner_tag}>"
+                text_tag = (
+                    text[:begin_pos]
+                    + f"<{ner_tag}>"
+                    + text[begin_pos:end_pos]
+                    + f"</{ner_tag}>"
+                )
             else:
-                text_tag = text[current_pos:begin_pos] + f"<{ner_tag}>" \
-                    + text[begin_pos:end_pos] + f"</{ner_tag}>"
+                text_tag = (
+                    text[current_pos:begin_pos]
+                    + f"<{ner_tag}>"
+                    + text[begin_pos:end_pos]
+                    + f"</{ner_tag}>"
+                )
             tag_text_list.append(text_tag)
-            sample_output.append((token['word'], token['entity_group']))
+            sample_output.append((token["word"], token["entity_group"]))
             current_pos = end_pos
+
         if tag:
             return str("".join(tag_text_list))
-        else:
-            return sample_output
+
+        return sample_output


 def segment(sentence: str) -> List[str]:
     """
-    Subword tokenize of phayathaibert, \
-    sentencepiece from wangchanberta model with Vocabulary Expansion.
+    Subword tokenize of PhayaThaiBERT, \
+    sentencepiece from WangchanBERTa model with vocabulary expansion.

-    :param str text: text to be tokenized
+    :param str sentence: text to be tokenized
     :return: list of subwords
     :rtype: list[str]
     """
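
A minimal usage sketch of the helpers touched by this patch, for reviewers who want to exercise the reformatted paths end to end. It assumes PyThaiNLP is installed from this branch together with `transformers`; the PhayaThaiBERT weights download on the first pipeline call, and the outputs in the comments are illustrative, not verified.

    # sketch.py -- reviewer aid, not part of the patch
    from pythainlp.khavee import KhaveeVerifier
    from pythainlp.phayathaibert.core import ThaiTextProcessor, segment

    kv = KhaveeVerifier()

    # handle_karun_sound_silence() strips the silent karan cluster so that
    # check_marttra()/check_sara() see only the pronounced final consonant.
    print(kv.handle_karun_sound_silence("จันทร์"))  # expected: 'จัน'

    # ThaiTextProcessor.preprocess() applies the default cleaning rules
    # (URL replacement, space collapsing, <_> space tokens) before text
    # reaches the fill-mask pipeline used by ThaiTextAugmenter.
    proc = ThaiTextProcessor()
    print(proc.preprocess("ไปที่ https://github.com นะ"))

    # segment() subword-tokenizes with the PhayaThaiBERT sentencepiece
    # vocabulary (triggers a model download on first call).
    print(segment("ฉันรักภาษาไทย"))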