-
Notifications
You must be signed in to change notification settings - Fork 274
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1e94d32
commit dd2ddaa
Showing
5 changed files
with
244 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.. currentmodule:: pythainlp.morpheme | ||
|
||
pythainlp.morpheme | ||
================== | ||
|
||
The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language. | ||
|
||
.. autofunction:: nighit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pythainlp.transliterate import pronunciate\n", | ||
"from pythainlp import thai_consonants" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'พุด-ทะ'" | ||
] | ||
}, | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"pronunciate(\"พุทธ\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'บู-ชา'" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"pronunciate(\"บูชา\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'อะ-นุก'" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"pronunciate(\"อนุค\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def nighit(w1,w2): # read: https://www.trueplookpanya.com/learning/detail/1180\n", | ||
" if not str(w1).endswith('ํ') and len(w1)!=2:\n", | ||
" raise NotImplementedError(f\"The function doesn't support {w1}.\")\n", | ||
" list_w1 = list(w1)\n", | ||
" list_w2 = list(w2)\n", | ||
" newword = list()\n", | ||
" newword.append(list_w1[0])\n", | ||
" newword.append(\"ั\")\n", | ||
" consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]\n", | ||
" if consonant_start in [\"ก\",\"ช\",\"ค\",\"ข\",\"ง\"]:\n", | ||
" newword.append(\"ง\")\n", | ||
" elif consonant_start in [\"จ\",\"ฉ\",\"ช\",\"ฌ\"]:\n", | ||
" newword.append(\"ญ\")\n", | ||
" elif consonant_start in [\"ฎ\",\"ฐ\",\"ฑ\",\"ณ\"]:\n", | ||
" newword.append(\"ณ\")\n", | ||
" elif consonant_start in [\"ด\",\"ถ\",\"ท\",\"ธ\",\"น\"]:\n", | ||
" newword.append(\"น\")\n", | ||
" elif consonant_start in [\"ป\",\"ผ\",\"พ\",\"ภ\"]:\n", | ||
" newword.append(\"ม\")\n", | ||
" elif consonant_start in [\"ย\",\"ร\",\"ล\",\"ฬ\",\"ว\",\"ศ\",\"ษ\",\"ส\",\"ห\"]:\n", | ||
" newword.append(\"ง\")\n", | ||
" else:\n", | ||
" raise NotImplementedError(f\"The function doesn't support {w1} and {w2}.\")\n", | ||
" newword.extend(list_w2)\n", | ||
" return ''.join(newword)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"assert nighit(\"สํ\",\"คีต\")==\"สังคีต\"\n", | ||
"assert nighit(\"สํ\",\"จร\")==\"สัญจร\"\n", | ||
"assert nighit(\"สํ\",\"ฐาน\")==\"สัณฐาน\"\n", | ||
"assert nighit(\"สํ\",\"นิษฐาน\")==\"สันนิษฐาน\"\n", | ||
"assert nighit(\"สํ\",\"ปทา\")==\"สัมปทา\"\n", | ||
"assert nighit(\"สํ\",\"โยค\")==\"สังโยค\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.8.13 ('base')", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.13" | ||
}, | ||
"orig_nbformat": 4, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
""" | ||
PyThaiNLP morpheme | ||
""" | ||
from pythainlp.morpheme.word_formation import nighit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from pythainlp.transliterate import pronunciate | ||
from pythainlp import thai_consonants | ||
|
||
|
||
# Consonant classes used by the nighit rule, paired with the nasal consonant
# that replaces the nighit when the second word starts with a member of the
# class.  The Thai-only letters "ฎ" and "ด" are kept alongside the Pali
# classes because the original implementation accepted them.
_NIGHIT_NASAL_BY_GROUP = (
    ("กขคฆง", "ง"),
    ("จฉชฌญ", "ญ"),
    ("ฎฏฐฑฒณ", "ณ"),
    ("ดตถทธน", "น"),
    ("ปผพภม", "ม"),
    ("ยรลฬวศษสห", "ง"),
)

# Flattened lookup: first consonant of the second word -> replacement nasal.
_NIGHIT_NASAL = {
    consonant: nasal
    for group, nasal in _NIGHIT_NASAL_BY_GROUP
    for consonant in group
}


def nighit(w1: str, w2: str) -> str:
    """
    Join a nighit-bearing prefix with a second word into one Thai word.

    Nighit (นิคหิต or ํ ) is the niggahita sign. This function uses a simple
    rule to form a new Thai word from two words of Pali origin: the nighit
    of the first word is replaced by mai han-akat ( ั ) followed by a nasal
    consonant chosen from the class of the first Thai consonant found in
    the second word.

    Read more: https://www.trueplookpanya.com/learning/detail/1180

    :param str w1: A two-character Thai word: one consonant followed by a
        nighit, e.g. ``"สํ"``.
    :param str w2: A Thai word.
    :return: The combined Thai word.
    :rtype: str
    :raises NotImplementedError: If ``w1`` is not exactly two characters
        ending with a nighit, or if ``w2`` has no Thai consonant or its
        first Thai consonant is not covered by the rule.

    :Example:
    ::

        from pythainlp.morpheme import nighit

        assert nighit("สํ","คีต")=="สังคีต"
        assert nighit("สํ","จร")=="สัญจร"
        assert nighit("สํ","ฐาน")=="สัณฐาน"
        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
        assert nighit("สํ","ปทา")=="สัมปทา"
        assert nighit("สํ","โยค")=="สังโยค"
    """
    # Both conditions must hold: only the first character of w1 is reused
    # below, so anything other than "<consonant> + nighit" is unsupported.
    if not str(w1).endswith('ํ') or len(w1) != 2:
        raise NotImplementedError(f"The function doesn't support {w1}.")

    # The first Thai consonant of w2 decides the nasal; leading vowels such
    # as "โ" in "โยค" are skipped.  None means w2 has no Thai consonant.
    first_consonant = next(
        (char for char in w2 if char in thai_consonants), None
    )
    nasal = _NIGHIT_NASAL.get(first_consonant)
    if nasal is None:
        raise NotImplementedError(
            f"The function doesn't support {w1} and {w2}."
        )

    # <first char of w1> + mai han-akat + nasal + the unchanged second word.
    return "".join([w1[0], "ั", nasal, w2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# -*- coding: utf-8 -*- | ||
# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import unittest | ||
from pythainlp.morpheme import nighit | ||
|
||
|
||
class TestMorphemePackage(unittest.TestCase):
    """Unit tests for the functions exported by ``pythainlp.morpheme``."""

    def test_nighit(self):
        """Check ``nighit`` against known (prefix, word) -> result cases."""
        # Each entry: ((w1, w2), expected combined word).
        expectations = (
            (("สํ", "คีต"), "สังคีต"),
            (("สํ", "จร"), "สัญจร"),
            (("สํ", "ฐาน"), "สัณฐาน"),
            (("สํ", "นิษฐาน"), "สันนิษฐาน"),
            (("สํ", "ปทา"), "สัมปทา"),
            (("สํ", "โยค"), "สังโยค"),
        )
        for (prefix, word), combined in expectations:
            self.assertEqual(nighit(prefix, word), combined)