Skip to content

Commit

Permalink
Merge pull request #817 from PyThaiNLP/add-spell_words
Browse files Browse the repository at this point in the history
Add pythainlp.util.spell_words
  • Loading branch information
wannaphong authored Jul 14, 2023
2 parents b5292f0 + 4109d68 commit c2451ad
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 7 deletions.
16 changes: 9 additions & 7 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,29 @@ Modules
.. autofunction:: bahttext
.. autofunction:: convert_years
.. autofunction:: collate
.. autofunction:: count_thai_chars
.. autofunction:: countthai
.. autofunction:: dict_trie
.. autofunction:: digit_to_text
.. autofunction:: display_thai_char
.. autofunction:: emoji_to_thai
.. autofunction:: eng_to_thai
.. autofunction:: find_keyword
.. autofunction:: countthai
.. autofunction:: count_thai_chars
.. autofunction:: ipa_to_rtgs
.. autofunction:: is_native_thai
.. autofunction:: isthai
.. autofunction:: isthaichar
.. autofunction:: maiyamok
.. autofunction:: nectec_to_ipa
.. autofunction:: normalize
.. autofunction:: now_reign_year
.. autofunction:: num_to_thaiword
.. autofunction:: maiyamok
.. autofunction:: rank
.. autofunction:: reign_year_to_ad
.. autofunction:: remove_dangling
.. autofunction:: remove_dup_spaces
.. autofunction:: remove_repeat_vowels
.. autofunction:: remove_tone_ipa
.. autofunction:: remove_tonemark
.. autofunction:: remove_zw
.. autofunction:: reorder_vowels
Expand All @@ -40,20 +43,19 @@ Modules
.. autofunction:: text_to_arabic_digit
.. autofunction:: text_to_num
.. autofunction:: text_to_thai_digit
.. autofunction:: thai_digit_to_arabic_digit
.. autofunction:: thai_strftime
.. autofunction:: thai_strptime
.. autofunction:: thai_to_eng
.. autofunction:: thai_word_tone_detector
.. autofunction:: thai_digit_to_arabic_digit
.. autofunction:: thaiword_to_date
.. autofunction:: thaiword_to_num
.. autofunction:: thaiword_to_time
.. autofunction:: time_to_thaiword
.. autofunction:: tis620_to_utf8
.. autofunction:: tone_detector
.. autofunction:: words_to_num
.. autofunction:: nectec_to_ipa
.. autofunction:: ipa_to_rtgs
.. autofunction:: remove_tone_ipa
.. autofunction:: pythainlp.util.spell_words.spell_syllable
.. autofunction:: pythainlp.util.spell_words.spell_word
.. autoclass:: Trie
:members:
2 changes: 2 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
"ipa_to_rtgs",
"remove_tone_ipa",
"tis620_to_utf8",
"spell_words",
]

from pythainlp.util.collate import collate
Expand Down Expand Up @@ -123,3 +124,4 @@
)
from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa
from pythainlp.util.encoding import tis620_to_utf8
import pythainlp.util.spell_words as spell_words
121 changes: 121 additions & 0 deletions pythainlp/util/spell_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
from pythainlp import (
thai_letters,
thai_consonants,
thai_lead_vowels,
thai_follow_vowels,
thai_above_vowels,
thai_below_vowels,
thai_tonemarks
)
from pythainlp.tokenize import Tokenizer
from pythainlp.tokenize import subword_tokenize


# Vowel templates.  "-" (hyphen, used in _r1) and "–" (en dash, used in _r2)
# mark the slot of the leading letter; ":" marks the slot where a tone mark
# may be written.
_r1 = ["เ-ย", "เ-ะ", "แ-ะ", "โ-ะ", "เ-าะ", "เ-อะ", "เ-อ", "เ-า"]
# The third template used to be "เ–ือะ" (no ":"), which produced a rule3
# pattern with a single group and made _clean() raise re.error
# ("invalid group reference") on syllables of that shape.
_r2 = ["–ั:วะ", "เ–ี:ยะ", "เ–ื:อะ", "–ั:ว", "เ–ี:ย", "เ–ื:อ", "–ื:อ"]

# Tone mark character -> spoken name.
# NOTE(review): assumes thai_tonemarks lists the marks in the order
# ek, tho, tri, chattawa -- confirm against pythainlp.thai_tonemarks.
tonemarks = {
    mark: "ไม้" + name
    for mark, name in zip(list(thai_tonemarks), ["เอก", "โท", "ตรี", "จัตวา"])
}

# Regex fragments substituted into the templates above.
_LETTER_GROUP = f"([{thai_letters}])"
_TONE_GROUP = f"([{thai_tonemarks}])"
_OPTIONAL_TONE_GROUP = f"([{thai_tonemarks}]?)"

# rule1: _r1 vowels, tone mark optional.  The original pattern contained the
# literal text "(thai_tonemarks)?" instead of the tone-mark character class,
# so it could never match a syllable that carries a tone mark.
rule1 = [t.replace("-", _LETTER_GROUP + _OPTIONAL_TONE_GROUP) for t in _r1]
# rule2: _r2 vowels written without a tone mark.
rule2 = [t.replace("–", _LETTER_GROUP).replace(":", "") for t in _r2]
# rule3: _r2 vowels written with a tone mark (captured as group 2).
rule3 = [t.replace("–", _LETTER_GROUP).replace(":", _TONE_GROUP) for t in _r2]


def _vowel_form(template):
    """Return the spoken form of a vowel template, e.g. "เ-า" -> "เอา"."""
    return template.replace("-", "อ").replace("–", "อ").replace(":", "")


# Compound vowels only; each spoken form maps to itself.
dict_vowel_ex = {_vowel_form(t): _vowel_form(t) for t in _r1 + _r2}

# Every vowel the tokenizer may emit -> its spoken form.
dict_vowel = dict(dict_vowel_ex)
for _ch in thai_lead_vowels:
    dict_vowel[_ch] = _ch + "อ"
for _group in (thai_follow_vowels, thai_above_vowels, thai_below_vowels):
    for _ch in _group:
        dict_vowel[_ch] = "อ" + _ch

# Tokenizer whose dictionary holds only the vowel forms and the consonants.
_cut = Tokenizer(list(dict_vowel.keys()) + list(thai_consonants), engine="mm")


def _clean(w):
    """
    Rewrite a syllable so that its compound vowel becomes one dictionary token.

    The vowel characters matched by a rule are removed from ``w`` and the
    spoken vowel form (e.g. "เอือ") is appended, followed by the tone mark
    if one was captured.  Text that follows the matched vowel is kept.
    Rule families are tried in the order rule3, rule2, rule1 and only the
    first family with at least one matching rule is applied.

    :param str w: a syllable
    :return: the rewritten syllable, or ``w`` unchanged if no rule matches
    :rtype: str
    """
    families = ((rule3, _r2), (rule2, _r2), (rule1, _r1))
    for patterns, templates in families:
        if not any(re.match(p, w) for p in patterns):
            continue
        # As in the original code, every rule of the chosen family is tested
        # in turn against the progressively rewritten string.
        for pattern, template in zip(patterns, templates):
            found = re.match(pattern, w)
            if found is None:
                continue
            # rule2 patterns have no tone group.
            tone = found.group(2) if found.re.groups > 1 else ""
            # Substituting group 1 keeps the leading letter and whatever
            # follows the vowel; the old rule3 branch dropped that trailing
            # text (the final consonant of the syllable).
            w = re.sub(pattern, r"\1", w) + _vowel_form(template) + tone
        break
    return w


def spell_syllable(s: str) -> List[str]:
    """
    Spell syllable by Thai word distribution form.

    The syllable is normalised with ``_clean`` and split with the
    module-level tokenizer.  The result lists the spoken form of every
    consonant token, then every vowel token, then every tone mark token,
    and ends with the syllable itself.

    :param str s: Thai syllable only
    :return: List of spell syllable
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util.spell_words import spell_syllable

        print(spell_syllable("แมว"))
        # output: ['มอ', 'วอ', 'แอ', 'แมว']
    """
    tokens = _cut.word_tokenize(_clean(s))
    # Build the consonant set once instead of once per token.  A set (not the
    # raw string) is required so that multi-character tokens are never
    # treated as substrings of thai_consonants.
    consonants = set(thai_consonants)
    spelled = [token + "อ" for token in tokens if token in consonants]
    spelled.extend(dict_vowel[token] for token in tokens if token in dict_vowel)
    spelled.extend(tonemarks[token] for token in tokens if token in tonemarks)
    spelled.append(s)
    return spelled


def spell_word(w: str) -> List[str]:
    """
    Spell word by Thai word distribution form.

    The word is split into syllables with ``subword_tokenize`` (engine
    "ssg") and each syllable is spelled with ``spell_syllable``.  When the
    word has more than one syllable, the whole word is appended at the end.

    :param str w: Thai word only
    :return: List of spell word
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util.spell_words import spell_word

        print(spell_word("คนดี"))
        # output: ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']
    """
    syllables = subword_tokenize(w, engine="ssg")
    spelled = [
        part
        for syllable in syllables
        for part in spell_syllable(syllable)
    ]
    # A single-syllable word already ends with itself (added by
    # spell_syllable), so only multi-syllable words get the extra entry.
    if len(syllables) > 1:
        spelled.append(w)
    return spelled
7 changes: 7 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
remove_tone_ipa,
tis620_to_utf8,
)
from pythainlp.util.spell_words import spell_word


class TestUtilPackage(unittest.TestCase):
Expand Down Expand Up @@ -844,3 +845,9 @@ def test_remove_tone_ipa(self):

def test_tis620_to_utf8(self):
    """tis620_to_utf8 should recover Thai text from TIS-620 mojibake."""
    # TIS-620 bytes of the expected word, read as one Latin-1 character per
    # byte.  Written with escapes because the raw characters are easily
    # corrupted: byte pairs such as D0 B7, C7 A7 and D8 B5 are valid UTF-8
    # sequences and get collapsed into single unrelated characters.
    mojibake = (
        "\xa1\xc3\xd0\xb7\xc3\xc7\xa7\xcd\xd8\xb5\xca\xd2\xcb\xa1\xc3\xc3\xc1"
    )
    self.assertEqual(tis620_to_utf8(mojibake), "กระทรวงอุตสาหกรรม")

def test_spell_word(self):
    """spell_word should list consonants, vowels, tone marks, then the word."""
    cases = [
        ("เสือ", ['สอ', 'เอือ', 'เสือ']),
        ("เสื้อ", ['สอ', 'เอือ', 'ไม้โท', 'เสื้อ']),
        ("คน", ['คอ', 'นอ', 'คน']),
        ("คนดี", ['คอ', 'นอ', 'คน', 'ดอ', 'อี', 'ดี', 'คนดี']),
    ]
    for word, expected in cases:
        self.assertEqual(spell_word(word), expected)

0 comments on commit c2451ad

Please sign in to comment.