Validate languages through icu::Locale instead of the ISO language list.

* src/unicode/Unicode.cc: is_valid_language() no longer scans
  icu::Locale::getISOLanguages() with strcmp; it now returns true when
  icu::Locale(language).getISO3Language() is a non-empty string.
* bindings/python/test/test.py: add test_deprecated_lang, which constructs
  a "conservative" Tokenizer with lang="tl" and expects no exception.

NOTE(review): the leading whitespace of hunk lines was reconstructed from a
whitespace-collapsed copy of this patch; hunk line counts match the @@ headers,
but confirm indentation against the target files before applying.

diff --git a/bindings/python/test/test.py b/bindings/python/test/test.py
index f8cc587b..1b0ba188 100644
--- a/bindings/python/test/test.py
+++ b/bindings/python/test/test.py
@@ -56,6 +56,10 @@ def test_invalid_lang():
         pyonmttok.Tokenizer("conservative", lang="xxx")
 
 
+def test_deprecated_lang():
+    pyonmttok.Tokenizer("conservative", lang="tl")
+
+
 def test_invalid_sentencepiece_model():
     with pytest.raises(ValueError):
         pyonmttok.Tokenizer("none", sp_model_path="xxx")
diff --git a/src/unicode/Unicode.cc b/src/unicode/Unicode.cc
index cd83d565..cfda328f 100644
--- a/src/unicode/Unicode.cc
+++ b/src/unicode/Unicode.cc
@@ -235,14 +235,7 @@ namespace onmt
 
     bool is_valid_language(const char* language)
     {
-      for (const char* const* available_languages = icu::Locale::getISOLanguages();
-           *available_languages;
-           ++available_languages)
-      {
-        if (strcmp(*available_languages, language) == 0)
-          return true;
-      }
-      return false;
+      return icu::Locale(language).getISO3Language()[0] != '\0';
     }
 
     // The functions below are made backward compatible with the Kangxi and Kanbun script names