From dd2ddaa80377d5b8c56d9929320e1b9115ff8bc2 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 1 Jan 2024 14:58:43 +0700 Subject: [PATCH] Add pythainlp.morpheme.nighit --- docs/api/morpheme.rst | 8 ++ notebooks/create_words.ipynb | 155 +++++++++++++++++++++++++++ pythainlp/morpheme/__init__.py | 8 ++ pythainlp/morpheme/word_formation.py | 57 ++++++++++ tests/test_morpheme.py | 16 +++ 5 files changed, 244 insertions(+) create mode 100644 docs/api/morpheme.rst create mode 100644 notebooks/create_words.ipynb create mode 100644 pythainlp/morpheme/__init__.py create mode 100644 pythainlp/morpheme/word_formation.py create mode 100644 tests/test_morpheme.py diff --git a/docs/api/morpheme.rst b/docs/api/morpheme.rst new file mode 100644 index 000000000..0f8a311d5 --- /dev/null +++ b/docs/api/morpheme.rst @@ -0,0 +1,8 @@ +.. currentmodule:: pythainlp.morpheme + +pythainlp.morpheme +================== + +The `pythainlp.morpheme` module collects functions for morpheme analysis, word formation, and more for the Thai language. + +.. 
autofunction:: nighit \ No newline at end of file diff --git a/notebooks/create_words.ipynb b/notebooks/create_words.ipynb new file mode 100644 index 000000000..d8d3ced83 --- /dev/null +++ b/notebooks/create_words.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pythainlp.transliterate import pronunciate\n", + "from pythainlp import thai_consonants" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'พุด-ทะ'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"พุทธ\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'บู-ชา'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"บูชา\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'อะ-นุก'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pronunciate(\"อนุค\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def nighit(w1,w2): # read: https://www.trueplookpanya.com/learning/detail/1180\n", + " if not str(w1).endswith('ํ') and len(w1)!=2:\n", + " raise NotImplementedError(f\"The function doesn't support {w1}.\")\n", + " list_w1 = list(w1)\n", + " list_w2 = list(w2)\n", + " newword = list()\n", + " newword.append(list_w1[0])\n", + " newword.append(\"ั\")\n", + " consonant_start = [i for i in list_w2 if i in set(thai_consonants)][0]\n", + " if consonant_start in [\"ก\",\"ช\",\"ค\",\"ข\",\"ง\"]:\n", + " newword.append(\"ง\")\n", + " elif consonant_start in [\"จ\",\"ฉ\",\"ช\",\"ฌ\"]:\n", + " newword.append(\"ญ\")\n", 
+ " elif consonant_start in [\"ฎ\",\"ฐ\",\"ฑ\",\"ณ\"]:\n", + " newword.append(\"ณ\")\n", + " elif consonant_start in [\"ด\",\"ถ\",\"ท\",\"ธ\",\"น\"]:\n", + " newword.append(\"น\")\n", + " elif consonant_start in [\"ป\",\"ผ\",\"พ\",\"ภ\"]:\n", + " newword.append(\"ม\")\n", + " elif consonant_start in [\"ย\",\"ร\",\"ล\",\"ฬ\",\"ว\",\"ศ\",\"ษ\",\"ส\",\"ห\"]:\n", + " newword.append(\"ง\")\n", + " else:\n", + " raise NotImplementedError(f\"The function doesn't support {w1} and {w2}.\")\n", + " newword.extend(list_w2)\n", + " return ''.join(newword)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "assert nighit(\"สํ\",\"คีต\")==\"สังคีต\"\n", + "assert nighit(\"สํ\",\"จร\")==\"สัญจร\"\n", + "assert nighit(\"สํ\",\"ฐาน\")==\"สัณฐาน\"\n", + "assert nighit(\"สํ\",\"นิษฐาน\")==\"สันนิษฐาน\"\n", + "assert nighit(\"สํ\",\"ปทา\")==\"สัมปทา\"\n", + "assert nighit(\"สํ\",\"โยค\")==\"สังโยค\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a1d6ff38954a1cdba4cf61ffa51e42f4658fc35985cd256cd89123cae8466a39" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pythainlp/morpheme/__init__.py b/pythainlp/morpheme/__init__.py new file mode 100644 index 000000000..cfd74284d --- /dev/null +++ b/pythainlp/morpheme/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: Copyright 2016-2024 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + +""" +PyThaiNLP morpheme +""" 
# Thai consonant letters ก (U+0E01) .. ฮ (U+0E2E); the two code points in that
# range that are vowel letters (ฤ U+0E24, ฦ U+0E26) are excluded.
# NOTE(review): meant to match ``pythainlp.thai_consonants`` -- confirm.
_THAI_CONSONANTS = frozenset(
    chr(code) for code in range(0x0E01, 0x0E2F)
) - {"\u0e24", "\u0e26"}

# Written as escapes because bare combining marks are easy to corrupt or
# confuse with look-alikes (e.g. SARA AM, U+0E33) in an editor.
_NIGHIT_MARK = "\u0e4d"  # THAI CHARACTER NIKHAHIT
_MAI_HAN_AKAT = "\u0e31"  # THAI CHARACTER MAI HAN-AKAT

# Maps the first consonant of the second word to the nasal consonant that
# replaces the nighit of the first word.
_NIGHIT_NASAL_BY_INITIAL = {
    **dict.fromkeys("กขคฆง", "ง"),  # velar group
    **dict.fromkeys("จฉชฌญ", "ญ"),  # palatal group
    **dict.fromkeys("ฎฏฐฑฒณ", "ณ"),  # retroflex group
    **dict.fromkeys("ดตถทธน", "น"),  # dental group
    **dict.fromkeys("ปผพภม", "ม"),  # labial group
    **dict.fromkeys("ยรลฬวศษสห", "ง"),  # consonants outside the five groups
}


def nighit(w1: str, w2: str) -> str:
    """
    Join a nighit-final prefix with a second word to form a new Thai word.

    Nighit (นิคหิต, the sign ``\u0e4d``) is the Thai niggahita. When a
    prefix of Pali origin that ends with a nighit is joined to another word,
    the nighit is replaced by the nasal consonant belonging to the same
    group as the first consonant of the second word. This function applies
    that simple rule: the result is the first character of ``w1``, followed
    by mai han-akat, the selected nasal consonant, and ``w2`` unchanged.

    Read more: https://www.trueplookpanya.com/learning/detail/1180

    :param str w1: A two-character Thai prefix: one character followed by
        a nighit (for example ``"สํ"``).
    :param str w2: The Thai word to attach. Its first Thai consonant
        (leading vowels are skipped) selects the nasal consonant.
    :return: The combined Thai word.
    :rtype: str
    :raises NotImplementedError: If ``w1`` is not exactly one character
        followed by a nighit, or if ``w2`` has no Thai consonant or its
        first Thai consonant is not covered by the rule table.

    :Example:
    ::

        from pythainlp.morpheme import nighit

        assert nighit("สํ","คีต")=="สังคีต"
        assert nighit("สํ","จร")=="สัญจร"
        assert nighit("สํ","ฐาน")=="สัณฐาน"
        assert nighit("สํ","นิษฐาน")=="สันนิษฐาน"
        assert nighit("สํ","ปทา")=="สัมปทา"
        assert nighit("สํ","โยค")=="สังโยค"
    """
    # Both conditions are required; reject the input if either one fails.
    if len(w1) != 2 or not w1.endswith(_NIGHIT_MARK):
        raise NotImplementedError(f"The function doesn't support {w1}.")

    # First Thai consonant of w2, skipping any leading vowel such as "โ".
    initial = next((ch for ch in w2 if ch in _THAI_CONSONANTS), None)

    # ``initial`` is None when w2 has no Thai consonant; the lookup then
    # misses and is reported the same way as an unsupported consonant.
    nasal = _NIGHIT_NASAL_BY_INITIAL.get(initial)
    if nasal is None:
        raise NotImplementedError(
            f"The function doesn't support {w1} and {w2}."
        )

    return f"{w1[0]}{_MAI_HAN_AKAT}{nasal}{w2}"