Skip to content

Commit

Permalink
Add pythainlp.corpus.thai_wsd_dict
Browse files Browse the repository at this point in the history
  • Loading branch information
wannaphong committed Jul 12, 2023
1 parent 87106b5 commit 1bec0f2
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 16 deletions.
1 change: 1 addition & 0 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Modules
.. autofunction:: thai_dict
.. autofunction:: thai_stopwords
.. autofunction:: thai_words
.. autofunction:: thai_wsd_dict
.. autofunction:: thai_orst_words
.. autofunction:: thai_syllables
.. autofunction:: thai_negations
Expand Down
87 changes: 82 additions & 5 deletions notebooks/test_wsd.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "2ef43b65-5df9-42e3-a712-0e60ca64ea16",
"metadata": {
"tags": []
Expand All @@ -24,8 +24,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Setting ds_accelerator to cuda (auto detect)\n",
"[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086)]\n"
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.0974416732788086), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.09319090843200684)]\n"
]
}
],
Expand All @@ -35,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"id": "6aafefdf-4658-4e35-b69f-7d2b54e34d33",
"metadata": {
"tags": []
Expand All @@ -45,7 +44,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232), ('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584)]\n"
"[('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน', 0.1005704402923584), ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ', 0.12473666667938232)]\n"
]
}
],
Expand All @@ -70,6 +69,84 @@
"source": [
"print(get_sense(\"เว็บนี้ต้องการคุกกี้ในการทำงาน\",\"คน\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "32fa3fe9-0e1a-4176-b8f3-18d666eb3162",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from pythainlp.corpus import get_corpus_path, thai_wsd_dict"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0f88ff4c-06db-4cba-8086-4bb2160bead0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"_w=thai_wsd_dict()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "83642893-d9a6-4271-a1b7-5e57638a74d4",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['word', 'meaning'])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_w.keys()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bb67c468-ce65-4581-adc6-832d70cfabab",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"('เดิน', ['ยกเท้าก้าวไป', 'เคลื่อนไปด้วยกำลังต่าง ๆ'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_w[\"word\"][0],_w[\"meaning\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27fbe522-019f-4157-a9a8-50ae62b50727",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"thai_stopwords",
"thai_syllables",
"thai_words",
"thai_wsd_dict",
"thai_orst_words",
"path_pythainlp_corpus",
"get_path_folder_corpus",
Expand Down Expand Up @@ -114,4 +115,5 @@ def corpus_db_path() -> str:
thai_words,
thai_orst_words,
thai_dict,
thai_wsd_dict
)
29 changes: 29 additions & 0 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"thai_syllables",
"thai_words",
"thai_dict",
"thai_wsd_dict",
]

from typing import FrozenSet, List, Union
Expand Down Expand Up @@ -62,6 +63,7 @@
_THAI_ORST_WORDS = set()

_THAI_DICT = {}
_THAI_WSD_DICT = {}


def countries() -> FrozenSet[str]:
Expand Down Expand Up @@ -260,6 +262,7 @@ def thai_male_names() -> FrozenSet[str]:

return _THAI_MALE_NAMES


def thai_dict() -> dict:
"""
Return Thai dictionary with definition from wiktionary.
Expand All @@ -280,3 +283,29 @@ def thai_dict() -> dict:
_THAI_DICT["meaning"].append(row["meaning"])

return _THAI_DICT


def thai_wsd_dict() -> dict:
    """
    Return Thai Word Sense Disambiguation dictionary, derived from
    :func:`thai_dict` (definitions from wiktionary).

    Only words that have more than one distinct definition are kept,
    since a word with a single sense needs no disambiguation.
    \n(See: `thai_dict\
    <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_)

    The result is built once and cached at module level; later calls
    return the same dict object, so callers should not mutate it.

    :return: dict with two parallel lists: ``"word"`` (the Thai words)
             and ``"meaning"`` (for each word, a list of its distinct
             definitions in first-seen order)
    :rtype: dict
    """
    # Function-scope import keeps this block self-contained.
    import ast

    global _THAI_WSD_DICT
    if not _THAI_WSD_DICT:
        _thai_wsd = thai_dict()
        words = []
        meanings = []
        for word, raw_meaning in zip(_thai_wsd["word"], _thai_wsd["meaning"]):
            # Each "meaning" cell is the text of a Python dict literal whose
            # values are collections of definitions (presumably keyed by
            # part of speech -- confirm against the corpus file).
            # ast.literal_eval parses literals only, so a malformed or
            # malicious corpus row cannot execute code the way eval() could.
            parsed = ast.literal_eval(raw_meaning)
            # dict.fromkeys removes duplicates while keeping first-seen
            # order, so the output is stable between runs (a plain set()
            # would reorder strings under hash randomisation).
            senses = list(
                dict.fromkeys(
                    sense for group in parsed.values() for sense in group
                )
            )
            if len(senses) > 1:
                words.append(word)
                meanings.append(senses)
        # Publish the cache only after the whole corpus was processed, so a
        # failure part-way through never leaves a partially filled cache.
        _THAI_WSD_DICT = {"word": words, "meaning": meanings}

    return _THAI_WSD_DICT
16 changes: 5 additions & 11 deletions pythainlp/wsd/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,13 @@
from pythainlp.corpus import thai_words
from pythainlp.tokenize import Tokenizer
from pythainlp.util.trie import Trie, dict_trie
from pythainlp.corpus import get_corpus_path, thai_dict
from pythainlp.corpus import get_corpus_path, thai_wsd_dict

_thai_wsd = thai_dict()
_wsd_dict = thai_wsd_dict()
_mean_all = {}
for i,j in zip(_thai_wsd["word"],_thai_wsd["meaning"]):
_all_value = list(eval(j).values())
_use = []
for k in _all_value:
_use.extend(k)
_use=list(set(_use))
if len(_use)>1:
_mean_all[i]=_use
_all_word=set(list(_mean_all.keys()))
for i,j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
_mean_all[i]=j
_all_word = set(list(_mean_all.keys()))
_TRIE = Trie(list(_all_word))
_word_cut = Tokenizer(custom_dict=_TRIE)

Expand Down

0 comments on commit 1bec0f2

Please sign in to comment.