Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat:standardize_lang_tag #267

Merged
merged 8 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ovos_plugin_manager/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from ovos_plugin_manager.templates.segmentation import Segmenter



def find_segmentation_plugins() -> dict:
"""
Find all installed plugins
Expand Down
14 changes: 7 additions & 7 deletions ovos_plugin_manager/templates/coreference.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import word_tokenize

Expand Down Expand Up @@ -64,10 +65,10 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def contains_corefs(self, text, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang, macro=True)
if lang.startswith("en"):
indicators = self.COREFERENCE_INDICATORS_EN
elif lang.startswith("pt"):
Expand Down Expand Up @@ -120,7 +121,7 @@ def extract_replacements(original, solved):
return bucket

def add_context(self, word, solved, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
if lang not in self.contexts:
self.contexts[lang] = {}
if word not in self.contexts[lang]:
Expand All @@ -130,7 +131,7 @@ def add_context(self, word, solved, lang=None):
self.contexts[lang][word].append(solved)

def extract_context(self, text=None, solved=None, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
text = text or self._prev_sentence
solved = solved or self._prev_solved
replaced = self.extract_replacements(text, solved)
Expand All @@ -139,7 +140,7 @@ def extract_context(self, text=None, solved=None, lang=None):
return replaced

def replace_coreferences(self, text, lang=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
solved = self.solve_corefs(text, lang=lang)
self._prev_sentence = text
self._prev_solved = solved
Expand All @@ -148,7 +149,7 @@ def replace_coreferences(self, text, lang=None, set_context=False):
return solved

def replace_coreferences_with_context(self, text, lang=None, context=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
lang_context = self.contexts.get(lang) or {}
default_context = {k: v[0] for k, v in lang_context.items() if v}

Expand All @@ -168,7 +169,6 @@ def replace_coreferences_with_context(self, text, lang=None, context=None, set_c
return solved

def solve_corefs(self, text, lang=None):
lang = lang or self.lang
return text


Expand Down
5 changes: 3 additions & 2 deletions ovos_plugin_manager/templates/hotwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
from ovos_config import Configuration
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand All @@ -30,7 +31,7 @@ class HotWordEngine:
lang (str): language code (BCP-47)
"""

def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
def __init__(self, key_phrase="hey mycroft", config=None, lang="en-US"):
self.key_phrase = str(key_phrase).lower()
mycroft_config = Configuration()
if config is None:
Expand All @@ -49,7 +50,7 @@ def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
self.expected_duration = self.num_phonemes * phoneme_duration

self.listener_config = mycroft_config.get("listener") or {}
self.lang = str(self.config.get("lang", lang)).lower()
self.lang = standardize_lang_tag(self.config.get("lang", lang))

@classproperty
def runtime_requirements(self):
Expand Down
16 changes: 9 additions & 7 deletions ovos_plugin_manager/templates/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from ovos_config.config import Configuration
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from typing import Optional, Dict, Union, List, Set

Expand All @@ -16,8 +17,10 @@ def __init__(self, config: Optional[Dict[str, Union[str, int]]] = None):
Can contain "lang" for default language, "hint_lang" for a hint language, and "boost" for language boost score.
"""
self.config = config or {}
self.default_language = self.config.get("lang", "en-us")
self.hint_language = self.config.get("hint_lang") or self.config.get('user') or self.default_language
self.default_language = standardize_lang_tag(self.config.get("lang", "en-US"))
self.hint_language = standardize_lang_tag(self.config.get("hint_lang") or
self.config.get('user') or
self.default_language)
self.boost = self.config.get("boost")

@classproperty
Expand Down Expand Up @@ -46,7 +49,7 @@ def detect(self, text: str) -> str:
text (str): The text to detect the language of.

Returns:
str: The detected language code (e.g., 'en-us').
str: The detected language code (e.g., 'en-US').
"""

@abc.abstractmethod
Expand Down Expand Up @@ -85,11 +88,10 @@ def __init__(self, config: Optional[Dict[str, str]] = None):
"""
self.config = config or {}
# translate from, unless specified/detected otherwise
self.default_language = self.config.get("lang") or "en-us"
self.default_language = standardize_lang_tag(self.config.get("lang") or "en-US")
# translate to
self.internal_language = (Configuration().get('language') or
dict()).get("internal") or \
self.default_language
self.internal_language = standardize_lang_tag(Configuration().get('language', {}).get("internal") or \
self.default_language)

@classproperty
def runtime_requirements(self) -> RuntimeRequirements:
Expand Down
5 changes: 3 additions & 2 deletions ovos_plugin_manager/templates/postag.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand Down Expand Up @@ -48,10 +49,10 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def postag(self, spans, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
# this should be implemented by plugins!
if lang.startswith("pt"):
return _dummy_postag_pt(spans)
Expand Down
6 changes: 3 additions & 3 deletions ovos_plugin_manager/templates/segmentation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils import flatten_list
from ovos_utils import classproperty, flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import sentence_tokenize

Expand Down Expand Up @@ -58,7 +58,7 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

@staticmethod
def __extract(text, markers):
Expand Down
5 changes: 5 additions & 0 deletions ovos_plugin_manager/templates/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from json_database import JsonStorageXDG
from ovos_utils.log import LOG, log_deprecation
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.xdg_utils import xdg_cache_home

from ovos_plugin_manager.templates.language import LanguageTranslator, LanguageDetector
Expand All @@ -26,6 +27,8 @@ def func_wrapper(*args, **kwargs):
return func(*args, **kwargs)

lang = kwargs.get("lang")
if lang:
lang = standardize_lang_tag(lang)
# check if translation can be skipped
if any([lang is None,
lang == solver.default_lang,
Expand Down Expand Up @@ -91,6 +94,8 @@ def func_wrapper(*args, **kwargs):
lang = solver.detect_language(v)
LOG.debug(f"detected 'lang': {lang} in argument '{idx}' for func: {func}")

if lang:
lang = standardize_lang_tag(lang)
kwargs["lang"] = lang
return func(*args, **kwargs)

Expand Down
17 changes: 7 additions & 10 deletions ovos_plugin_manager/templates/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ovos_utils import classproperty
from ovos_utils.log import deprecated
from ovos_utils.process_utils import RuntimeRequirements

from ovos_utils.lang import standardize_lang_tag
from ovos_plugin_manager.utils.config import get_plugin_config


Expand Down Expand Up @@ -78,14 +78,14 @@ def recognizer(self, val):

@property
def lang(self):
return self._lang or \
return standardize_lang_tag(self._lang or \
self.config.get("lang") or \
Configuration().get("lang", "en-us")
Configuration().get("lang", "en-US"))

@lang.setter
def lang(self, val):
# backwards compat
self._lang = val
self._lang = standardize_lang_tag(val)

@property
@deprecated("self.keys has been deprecated! "
Expand Down Expand Up @@ -114,10 +114,7 @@ def credential(self, val):
"implement config handling directly instead", "1.0.0")
def init_language(config_core):
lang = config_core.get("lang", "en-US")
langs = lang.split("-")
if len(langs) == 2:
return langs[0].lower() + "-" + langs[1].upper()
return lang
return standardize_lang_tag(lang, macro=True)

@abstractmethod
def execute(self, audio, language: Optional[str] = None) -> str:
Expand Down Expand Up @@ -180,7 +177,7 @@ class StreamThread(Thread, metaclass=ABCMeta):

def __init__(self, queue, language):
super().__init__()
self.language = language
self.language = standardize_lang_tag(language)
self.queue = queue
self.text = None

Expand Down Expand Up @@ -219,7 +216,7 @@ def stream_start(self, language=None):
self.stream_stop()
self.queue = Queue()
self.stream = self.create_streaming_thread()
self.stream.language = language or self.lang
self.stream.language = standardize_lang_tag(language or self.lang)
self.transcript_ready.clear()
self.stream.start()

Expand Down
5 changes: 2 additions & 3 deletions ovos_plugin_manager/templates/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import span_indexed_word_tokenize, word_tokenize

Expand Down Expand Up @@ -49,14 +50,12 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-US")

def span_tokenize(self, text, lang=None):
lang = lang or self.lang
return span_indexed_word_tokenize(text)

def tokenize(self, text, lang=None):
lang = lang or self.lang
return word_tokenize(text)

@staticmethod
Expand Down
7 changes: 4 additions & 3 deletions ovos_plugin_manager/templates/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ovos_utils.fakebus import FakeBus
from ovos_utils.file_utils import get_cache_directory
from ovos_utils.lang.visimes import VISIMES
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG, deprecated, log_deprecation
from ovos_utils.metrics import Stopwatch
from ovos_utils.process_utils import RuntimeRequirements
Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(self, plugin_id: str, lang: str, voice: str, synth_kwargs: dict = N
synth_kwargs (dict, optional): Additional keyword arguments for the synthesizer.
"""
self.plugin_id = plugin_id
self.lang = lang
self.lang = standardize_lang_tag(lang)
self.voice = voice
self.synth_kwargs = synth_kwargs or {}

Expand Down Expand Up @@ -593,7 +594,7 @@ def _get_ctxt(self, kwargs=None) -> TTSContext:

LOG.debug(f"TTS kwargs: {kwargs}")
return TTSContext(plugin_id=self.plugin_id,
lang=kwargs.get("lang") or Configuration().get("lang", "en-us"),
lang=kwargs.get("lang") or Configuration().get("lang", "en-US"),
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved
voice=kwargs.get("voice", "default"),
synth_kwargs=kwargs)

Expand Down Expand Up @@ -933,7 +934,7 @@ def lang(self):
if message:
sess = SessionManager.get(message)
return sess.lang
return self.config.get("lang") or 'en-us'
return standardize_lang_tag(self.config.get("lang") or 'en-US')

@lang.setter
@deprecated("language is defined per request in get_tts, self.lang is not used",
Expand Down
9 changes: 5 additions & 4 deletions ovos_plugin_manager/thirdparty/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import Optional, List, Dict

from ovos_utils import flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG
from quebra_frases import sentence_tokenize

Expand All @@ -53,7 +54,7 @@ def __init__(self, config=None,
self.enable_cache = enable_cache
self.config = config or {}
self.supported_langs = self.config.get("supported_langs") or []
self.default_lang = internal_lang or self.config.get("lang", "en")
self.default_lang = standardize_lang_tag(internal_lang or self.config.get("lang", "en"), macro=True)
if self.default_lang not in self.supported_langs:
self.supported_langs.insert(0, self.default_lang)
self._translator = translator or OVOSLangTranslationFactory.create() if self.enable_tx else None
Expand Down Expand Up @@ -123,9 +124,9 @@ def translate(self, text: str,
:param source_lang: Source language code.
:return: Translated text.
"""
source_lang = source_lang or self.detect_language(text)
target_lang = target_lang or self.default_lang
if source_lang.split("-")[0] == target_lang.split("-")[0]:
source_lang = standardize_lang_tag(source_lang or self.detect_language(text), macro=True)
target_lang = standardize_lang_tag(target_lang or self.default_lang, macro=True)
if source_lang == target_lang:
return text # skip translation
return self.translator.translate(text,
target=target_lang,
Expand Down
5 changes: 0 additions & 5 deletions ovos_plugin_manager/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,3 @@ def create(config=None):
f'\nAvailable modules: {modules}')
raise
return tts


if __name__ == "__main__":
lang = "en-us"
print(find_tts_plugins())
23 changes: 4 additions & 19 deletions ovos_plugin_manager/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from typing import Optional

import pkg_resources
from ovos_utils.log import LOG, log_deprecation
from ovos_utils.log import LOG, log_deprecation, deprecated


class PluginTypes(str, Enum):
Expand Down Expand Up @@ -173,25 +173,10 @@ def load_plugin(plug_name: str, plug_type: Optional[PluginTypes] = None):
LOG.warning(f'Could not find the plugin {plug_type}.{plug_name}')
return None


@deprecated("normalize_lang has been deprecated! update to 'from ovos_utils.lang import standardize_lang_tag'", "1.0.0")
def normalize_lang(lang):
# TODO consider moving to LF or ovos_utils
# special handling, the parse sometimes messes this up
# eg, uk-ua gets normalized to uk-gb
# this also makes lookup easier as we
# often get duplicate entries with both variants
if "-" in lang:
pieces = lang.split("-")
if len(pieces) == 2 and pieces[0] == pieces[1]:
lang = pieces[0]

try:
from langcodes import standardize_tag as _normalize_lang
lang = _normalize_lang(lang, macro=True)
except ValueError:
# this lang code is apparently not valid ?
pass
return lang
from ovos_utils.lang import standardize_lang_tag
return standardize_lang_tag(lang)
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved


class ReadWriteStream:
Expand Down
Loading
Loading