Skip to content

Commit

Permalink
Merge pull request #923 from PyThaiNLP/add-thaig2p_v2
Browse files Browse the repository at this point in the history
Add thaig2p_v2
  • Loading branch information
wannaphong authored Aug 16, 2024
2 parents fedcd90 + 6aaac9c commit 17c3382
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 1 deletion.
3 changes: 2 additions & 1 deletion docs/api/transliterate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ This section includes multiple transliteration engines designed to suit various

- **icu**: Utilizes the ICU transliteration system for phonetic conversion.
- **ipa**: Provides International Phonetic Alphabet (IPA) representation of Thai text.
- **thaig2p**: Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
- **thaig2p**: (default) Transliterates Thai text into the Grapheme-to-Phoneme (G2P) representation.
- **thaig2p_v2**: Transliterates Thai text into International Phonetic Alphabet (IPA) using the Thai Grapheme-to-Phoneme (G2P) v2.0 model. The model is available at https://huggingface.co/pythainlp/thaig2p-v2.0
- **tltk**: Utilizes the TLTK transliteration system for a specific approach to transliteration.
- **iso_11940**: Focuses on the ISO 11940 transliteration standard.

Expand Down
4 changes: 4 additions & 0 deletions pythainlp/transliterate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def transliterate(
`TLTK <https://pypi.org/project/tltk/>`_.,
* *iso_11940* - Thai text into Latin characters with ISO 11940.
* *tltk_ipa* - tltk, output is International Phonetic Alphabet (IPA)
* *thaig2p_v2* - Thai Grapheme-to-Phoneme,
output is IPA. https://huggingface.co/pythainlp/thaig2p-v2.0
:Example:
::
Expand Down Expand Up @@ -159,6 +161,8 @@ def transliterate(
from pythainlp.transliterate.tltk import tltk_ipa as transliterate
elif engine == "iso_11940":
from pythainlp.transliterate.iso_11940 import transliterate
elif engine == "thaig2p_v2":
from pythainlp.transliterate.thaig2p_v2 import transliterate
else: # use default engine: "thaig2p"
from pythainlp.transliterate.thaig2p import transliterate

Expand Down
33 changes: 33 additions & 0 deletions pythainlp/transliterate/thaig2p_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Thai Grapheme-to-Phoneme (Thai G2P)
huggingface: https://huggingface.co/pythainlp/thaig2p-v2.0
"""

# Use a pipeline as a high-level helper
from transformers import pipeline


class ThaiG2P:
    """
    Thai grapheme-to-phoneme converter.

    Wraps a ``text2text-generation`` pipeline loaded with the
    ``pythainlp/thaig2p-v2.0`` model and exposes it through :meth:`g2p`,
    which yields a Latin-script (International Phonetic Alphabet)
    transliteration of Thai words.
    """

    def __init__(self, device: str = "cpu"):
        """
        Build the underlying pipeline.

        :param str device: device identifier forwarded unchanged to
            ``pipeline`` (defaults to ``"cpu"``)
        """
        self.pipe = pipeline(
            "text2text-generation",
            model="pythainlp/thaig2p-v2.0",
            device=device,
        )

    def g2p(self, text: str) -> str:
        """
        Transliterate ``text`` with the loaded pipeline.

        :param str text: input text handed to the pipeline
        :return: the ``"generated_text"`` field of the first pipeline result
        :rtype: str
        """
        outputs = self.pipe(text)
        # Only the first candidate produced by the pipeline is used.
        best = outputs[0]
        return best["generated_text"]


# Lazily created, module-wide ThaiG2P instance (the model is expensive to load).
_THAI_G2P = None
# Device the cached instance was created for; ``None`` until the first call.
_THAI_G2P_DEVICE = None


def transliterate(text: str, device: str = "cpu") -> str:
    """
    Transliterate Thai text with the thaig2p-v2.0 model.

    The model wrapper is created on first use and then reused. If a later
    call asks for a different ``device`` than the cached instance was built
    with, the wrapper is rebuilt for that device, so the ``device`` argument
    is never silently ignored.

    :param str text: Thai text to transliterate
    :param str device: device identifier forwarded to :class:`ThaiG2P`
        (defaults to ``"cpu"``)
    :return: the transliteration produced by :meth:`ThaiG2P.g2p`
    :rtype: str
    """
    global _THAI_G2P, _THAI_G2P_DEVICE
    # Rebuild on a device change: previously the first caller's device was
    # locked in for the lifetime of the process.
    if _THAI_G2P is None or _THAI_G2P_DEVICE != device:
        _THAI_G2P = ThaiG2P(device=device)
        _THAI_G2P_DEVICE = device
    return _THAI_G2P.g2p(text)
2 changes: 2 additions & 0 deletions tests/test_transliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def test_transliterate(self):
self.assertEqual(transliterate("คน", engine="ipa"), "kʰon")
self.assertIsNotNone(transliterate("คน", engine="thaig2p"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
self.assertIsNotNone(transliterate("คน", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("แมว", engine="thaig2p_v2"))
self.assertIsNotNone(transliterate("คน", engine="tltk_g2p"))
self.assertIsNotNone(transliterate("แมว", engine="tltk_g2p"))
self.assertIsNotNone(transliterate("คน", engine="tltk_ipa"))
Expand Down

0 comments on commit 17c3382

Please sign in to comment.