From 87016a216b4274d932c03ad721d6ede61823a5e9 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 17:26:52 +0900 Subject: [PATCH 01/66] implent a model config manager class --- src/langcheck/metrics/__init__.py | 13 ++ src/langcheck/metrics/_model_management.py | 111 ++++++++++++++++++ src/langcheck/metrics/modelconfig.ini | 10 ++ .../zh/reference_based_text_quality.py | 14 +-- 4 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 src/langcheck/metrics/_model_management.py create mode 100644 src/langcheck/metrics/modelconfig.ini diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 4c23260d..94262a88 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,3 +1,4 @@ +from langcheck.metrics._model_management import ModelConfig from langcheck.metrics import en, ja, zh from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) @@ -13,6 +14,13 @@ is_json_array, is_json_object, matches_regex, validation_fn) +_model_manager = ModelConfig() +reset_model_config = _model_manager.reset +set_model_for_metric = _model_manager.set_model_for_metric +list_metric_model = _model_manager.list_metric_model +load_config_from_file = _model_manager.load_config_from_file +save_config_to_disk = _model_manager.save_config_to_disk + __all__ = [ 'en', 'ja', @@ -39,4 +47,9 @@ 'semantic_similarity', 'sentiment', 'toxicity', + 'set_model_for_metric', + 'list_metric_model', + 'load_config_from_file', + 'save_config_to_disk', + 'reset_model_config' ] diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py new file mode 100644 index 00000000..b8486535 --- /dev/null +++ b/src/langcheck/metrics/_model_management.py @@ -0,0 +1,111 @@ +import os +import configparser +import collections +from pathlib import Path + + +class ModelConfig: + """ + A class to manage different models for multiple languages in the + 
langcheck. + This class allows setting and retrieving different model names. + (like sentiment_model, semantic_similarity_model, etc.) for each language. + It also supports loading model configurations from a file. + """ + + def __init__(self): + """ + Initializes the ModelConfig with empty model dictionaries for each + language. + """ + self.__init__config() + + def __init__config(self): + cwd = os.path.dirname(__file__) + cfg = configparser.ConfigParser() + # Initial DEFAULT config from modelconfig.ini + cfg.read(os.path.join(Path(cwd), 'modelconfig.ini')) + self.model_config = collections.defaultdict(dict) + for lang in cfg.sections(): + for metric_type in cfg[lang]: + self.model_config[lang][metric_type] = cfg.get(section=lang, + option=metric_type) # type: ignore[reportGeneralIssue] # NOQA:E501 + + def reset(self): + ''' reset all model used in langcheck to default''' + self.__init__config() + + def list_metric_model(self, language: str, metric_type: str): + """ + return the model used in current metric for a given language. + + Args: + language: The language for which to get the model. + metric_type: The metric name. + + Returns: + str: The name of the specified model. + + Raises: + KeyError: If the specified language or model type is not found. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + return self.model_config[language][metric_type] + else: + raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 + else: + raise KeyError(f"Language '{language}' not supported.") + + def set_model_for_metric(self, language: str, + metric_type: str, model_name: str): + """ + Sets a specific model used in metric_type for a given language. + + Args: + language: The language for which to set the model. + metric_type: The type of the model (e.g., 'sentiment_model'). + model_name: The name of the model. + + Raises: + KeyError: If the specified language is not supported. 
+ """ + if language in self.model_config: + if metric_type in self.model_config[language]: + self.model_config[language][metric_type] = model_name + else: + raise KeyError(f"Metrics '{metric_type}' not used in metric.") + else: + raise KeyError(f"Language '{language}' not supported.") + + def load_config_from_file(self, file_path: str): + """ + Loads model configurations from a specified configuration file. + + The configuration file should have sections for each language with + key-value pairs for each metrics and model_name. + + Args: + file_path: The path to the configuration file containing model + configurations. + """ + config = configparser.ConfigParser() + config.read(file_path) + + for lanuage_section in config.sections(): + if lanuage_section in self.model_config: + for metric_type, model_name in config[lanuage_section].items(): + if metric_type in self.model_config[lanuage_section]: + self.model_config[lanuage_section][metric_type] = model_name # NOQA:E501 + + def save_config_to_disk(self, output_path: str): + """ + Save Model Configuration to output path. + Args: + output_path: The path to save the configuration file + """ + cfg = configparser.ConfigParser() + cfg.read_dict(self.model_config) + + with open(output_path, 'w') as f: + cfg.write(f) diff --git a/src/langcheck/metrics/modelconfig.ini b/src/langcheck/metrics/modelconfig.ini new file mode 100644 index 00000000..baec4017 --- /dev/null +++ b/src/langcheck/metrics/modelconfig.ini @@ -0,0 +1,10 @@ +[zh] +# According to the C-MTEB Benchmark +# (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) +# the 3 models of different sizes provided BAAI are the best on the +# embedding task +# Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 +# Using this model, it is hard to find two sentence where cos_sim < 0.25. 
+semantic_similarity = BAAI/bge-base-zh-v1.5 +sentiment = IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment +toxicity = alibaba-pai/pai-bert-base-zh-llm-risk-detection \ No newline at end of file diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 7821ed7c..1ac03bec 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -90,14 +90,12 @@ def semantic_similarity( openai_args) metric_value.language = 'zh' return metric_value - - # According to the C-MTEB Benchmark - # (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) - # the 3 models of different sizes provided BAAI are the best on the - # embedding task - # Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 - # Using this model, it is hard to find two sentence where cos_sim < 0.25. - model = SentenceTransformer('BAAI/bge-base-zh-v1.5') + # lazy import + from langcheck.metrics import _model_manager + print(_model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) + model = SentenceTransformer(_model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( From 2bc3b75d1c2db6409156d00dd7713f2f2b1cbe57 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:02:45 +0900 Subject: [PATCH 02/66] add test case for model management --- src/langcheck/metrics/_model_management.py | 7 ++-- tests/metrics/test_model_management.py | 38 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tests/metrics/test_model_management.py diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index b8486535..9f6fa7f6 100644 --- a/src/langcheck/metrics/_model_management.py +++ 
b/src/langcheck/metrics/_model_management.py @@ -37,21 +37,18 @@ def reset(self): def list_metric_model(self, language: str, metric_type: str): """ - return the model used in current metric for a given language. + list the model used in current metric for a given language. Args: language: The language for which to get the model. metric_type: The metric name. - Returns: - str: The name of the specified model. - Raises: KeyError: If the specified language or model type is not found. """ if language in self.model_config: if metric_type in self.model_config[language]: - return self.model_config[language][metric_type] + print(self.model_config[language][metric_type]) else: raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 else: diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py new file mode 100644 index 00000000..42444d8e --- /dev/null +++ b/tests/metrics/test_model_management.py @@ -0,0 +1,38 @@ +from unittest.mock import mock_open, patch +from langcheck.metrics._model_management import ModelConfig + + +def test_initialization_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + assert config.model_config['zh']['semantic_similarity'] == 'test_model' # NOQA:E501 + except AssertionError as err: + raise err + + +def test_list_metric_model_with_mock_file(capsys): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + config.list_metric_model(language='zh', + metric_type='semantic_similarity') + captured = capsys.readouterr() # type: ignore + assert 'test_model' in captured.out + except AssertionError as err: + raise err + + +def test_set_model_for_metric_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + 
with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + config.set_model_for_metric(model_name='another_test_model', + language='zh', + metric_type='semantic_similarity') + assert config.model_config['zh']['semantic_similarity'] == 'another_test_model' # NOQA:E501 + except AssertionError as err: + raise err \ No newline at end of file From 34e49fca8a398f48fa3a06ae98966c909c66dbb7 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:09:00 +0900 Subject: [PATCH 03/66] apply format suggestion --- src/langcheck/metrics/__init__.py | 2 +- src/langcheck/metrics/_model_management.py | 18 +++++++++++------- .../metrics/zh/reference_based_text_quality.py | 10 ++++++---- tests/metrics/test_model_management.py | 9 ++++++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 94262a88..56df78e6 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,5 +1,5 @@ -from langcheck.metrics._model_management import ModelConfig from langcheck.metrics import en, ja, zh +from langcheck.metrics._model_management import ModelConfig from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) from langcheck.metrics.en.reference_free_text_quality import ( diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index 9f6fa7f6..b083a979 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,6 +1,6 @@ import os -import configparser import collections +import configparser from pathlib import Path @@ -28,8 +28,9 @@ def __init__config(self): self.model_config = collections.defaultdict(dict) for lang in cfg.sections(): for metric_type in cfg[lang]: - self.model_config[lang][metric_type] = cfg.get(section=lang, - option=metric_type) # type: ignore[reportGeneralIssue] # NOQA:E501 + 
self.model_config[lang][metric_type] = cfg.get( + section=lang, option=metric_type + ) # type: ignore[reportGeneralIssue] # NOQA:E501 def reset(self): ''' reset all model used in langcheck to default''' @@ -50,12 +51,14 @@ def list_metric_model(self, language: str, metric_type: str): if metric_type in self.model_config[language]: print(self.model_config[language][metric_type]) else: - raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 + raise KeyError( + f"Model type '{metric_type}' not found for language '{language}'." + ) # NOQA:E501 else: raise KeyError(f"Language '{language}' not supported.") - def set_model_for_metric(self, language: str, - metric_type: str, model_name: str): + def set_model_for_metric(self, language: str, metric_type: str, + model_name: str): """ Sets a specific model used in metric_type for a given language. @@ -93,7 +96,8 @@ def load_config_from_file(self, file_path: str): if lanuage_section in self.model_config: for metric_type, model_name in config[lanuage_section].items(): if metric_type in self.model_config[lanuage_section]: - self.model_config[lanuage_section][metric_type] = model_name # NOQA:E501 + self.model_config[lanuage_section][ + metric_type] = model_name # NOQA:E501 def save_config_to_disk(self, output_path: str): """ diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 1ac03bec..9f9062ae 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,10 +92,12 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - print(_model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) - model = SentenceTransformer(_model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) # NOQA: E501 + print( + 
_model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) + model = SentenceTransformer( + _model_manager.list_metric_model( + language='zh', metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py index 42444d8e..1bc29b7a 100644 --- a/tests/metrics/test_model_management.py +++ b/tests/metrics/test_model_management.py @@ -1,4 +1,5 @@ from unittest.mock import mock_open, patch + from langcheck.metrics._model_management import ModelConfig @@ -7,7 +8,8 @@ def test_initialization_with_mock_file(): mock_file_content = "[zh]\nsemantic_similarity=test_model\n" with patch('builtins.open', mock_open(read_data=mock_file_content)): config = ModelConfig() - assert config.model_config['zh']['semantic_similarity'] == 'test_model' # NOQA:E501 + assert config.model_config['zh'][ + 'semantic_similarity'] == 'test_model' # NOQA:E501 except AssertionError as err: raise err @@ -33,6 +35,7 @@ def test_set_model_for_metric_with_mock_file(): config.set_model_for_metric(model_name='another_test_model', language='zh', metric_type='semantic_similarity') - assert config.model_config['zh']['semantic_similarity'] == 'another_test_model' # NOQA:E501 + assert config.model_config['zh'][ + 'semantic_similarity'] == 'another_test_model' # NOQA:E501 except AssertionError as err: - raise err \ No newline at end of file + raise err From 083c6129a616340cbc3f2f6221bc5a2b4182b191 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:10:07 +0900 Subject: [PATCH 04/66] apply format suggestion --- src/langcheck/metrics/_model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index b083a979..e9eab35b 100644 --- 
a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,6 +1,6 @@ -import os import collections import configparser +import os from pathlib import Path @@ -52,8 +52,8 @@ def list_metric_model(self, language: str, metric_type: str): print(self.model_config[language][metric_type]) else: raise KeyError( - f"Model type '{metric_type}' not found for language '{language}'." - ) # NOQA:E501 + f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 + ) else: raise KeyError(f"Language '{language}' not supported.") From 2cdf43c1192bfe999c9e472796e1dce5b7acc3e9 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:42:32 +0900 Subject: [PATCH 05/66] pydoc update & fix test case --- src/langcheck/metrics/__init__.py | 4 +++- src/langcheck/metrics/_model_management.py | 21 +++++++++++++++++++ .../zh/reference_based_text_quality.py | 5 +---- tests/metrics/test_model_management.py | 12 +++++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 56df78e6..2c73aa48 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -18,6 +18,7 @@ reset_model_config = _model_manager.reset set_model_for_metric = _model_manager.set_model_for_metric list_metric_model = _model_manager.list_metric_model +get_metric_model = _model_manager.get_metric_model load_config_from_file = _model_manager.load_config_from_file save_config_to_disk = _model_manager.save_config_to_disk @@ -51,5 +52,6 @@ 'list_metric_model', 'load_config_from_file', 'save_config_to_disk', - 'reset_model_config' + 'reset_model_config', + 'get_metric_model' ] diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index e9eab35b..cb939193 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -36,6 +36,27 @@ def reset(self): ''' reset all 
model used in langcheck to default''' self.__init__config() + def get_metric_model(self, language: str, metric_type: str): + """ + list the model used in current metric for a given language. + + Args: + language: The language for which to get the model. + metric_type: The metric name. + + Raises: + KeyError: If the specified language or model type is not found. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + return self.model_config[language][metric_type] + else: + raise KeyError( + f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 + ) + else: + raise KeyError(f"Language '{language}' not supported.") + def list_metric_model(self, language: str, metric_type: str): """ list the model used in current metric for a given language. diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 9f9062ae..11b3a20b 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,11 +92,8 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - print( - _model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) model = SentenceTransformer( - _model_manager.list_metric_model( + _model_manager.get_metric_model( language='zh', metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py index 1bc29b7a..62a7f025 100644 --- a/tests/metrics/test_model_management.py +++ b/tests/metrics/test_model_management.py @@ -14,6 +14,18 @@ def test_initialization_with_mock_file(): raise err +def test_get_metric_model_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with 
patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + model_name = config.get_metric_model( + language='zh', metric_type='semantic_similarity') # NOQA:E501 + assert model_name == 'test_model' + except AssertionError as err: + raise err + + def test_list_metric_model_with_mock_file(capsys): try: mock_file_content = "[zh]\nsemantic_similarity=test_model\n" From 99fe02e74862362379792a1c9a8b971424d06267 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 17:26:52 +0900 Subject: [PATCH 06/66] implement a model config manager class Changes to be committed: modified: src/langcheck/metrics/__init__.py new file: src/langcheck/metrics/_model_management.py new file: src/langcheck/metrics/modelconfig.ini modified: src/langcheck/metrics/zh/reference_based_text_quality.py Changes to be committed: modified: src/langcheck/metrics/__init__.py new file: src/langcheck/metrics/_model_management.py new file: src/langcheck/metrics/modelconfig.ini modified: src/langcheck/metrics/zh/reference_based_text_quality.py --- src/langcheck/metrics/__init__.py | 13 ++ src/langcheck/metrics/_model_management.py | 111 ++++++++++++++++++ src/langcheck/metrics/modelconfig.ini | 10 ++ .../zh/reference_based_text_quality.py | 14 +-- 4 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 src/langcheck/metrics/_model_management.py create mode 100644 src/langcheck/metrics/modelconfig.ini diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 4c23260d..94262a88 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,3 +1,4 @@ +from langcheck.metrics._model_management import ModelConfig from langcheck.metrics import en, ja, zh from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) @@ -13,6 +14,13 @@ is_json_array, is_json_object, matches_regex, validation_fn) +_model_manager = ModelConfig() +reset_model_config = 
_model_manager.reset +set_model_for_metric = _model_manager.set_model_for_metric +list_metric_model = _model_manager.list_metric_model +load_config_from_file = _model_manager.load_config_from_file +save_config_to_disk = _model_manager.save_config_to_disk + __all__ = [ 'en', 'ja', @@ -39,4 +47,9 @@ 'semantic_similarity', 'sentiment', 'toxicity', + 'set_model_for_metric', + 'list_metric_model', + 'load_config_from_file', + 'save_config_to_disk', + 'reset_model_config' ] diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py new file mode 100644 index 00000000..b8486535 --- /dev/null +++ b/src/langcheck/metrics/_model_management.py @@ -0,0 +1,111 @@ +import os +import configparser +import collections +from pathlib import Path + + +class ModelConfig: + """ + A class to manage different models for multiple languages in the + langcheck. + This class allows setting and retrieving different model names. + (like sentiment_model, semantic_similarity_model, etc.) for each language. + It also supports loading model configurations from a file. + """ + + def __init__(self): + """ + Initializes the ModelConfig with empty model dictionaries for each + language. + """ + self.__init__config() + + def __init__config(self): + cwd = os.path.dirname(__file__) + cfg = configparser.ConfigParser() + # Initial DEFAULT config from modelconfig.ini + cfg.read(os.path.join(Path(cwd), 'modelconfig.ini')) + self.model_config = collections.defaultdict(dict) + for lang in cfg.sections(): + for metric_type in cfg[lang]: + self.model_config[lang][metric_type] = cfg.get(section=lang, + option=metric_type) # type: ignore[reportGeneralIssue] # NOQA:E501 + + def reset(self): + ''' reset all model used in langcheck to default''' + self.__init__config() + + def list_metric_model(self, language: str, metric_type: str): + """ + return the model used in current metric for a given language. + + Args: + language: The language for which to get the model. 
+ metric_type: The metric name. + + Returns: + str: The name of the specified model. + + Raises: + KeyError: If the specified language or model type is not found. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + return self.model_config[language][metric_type] + else: + raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 + else: + raise KeyError(f"Language '{language}' not supported.") + + def set_model_for_metric(self, language: str, + metric_type: str, model_name: str): + """ + Sets a specific model used in metric_type for a given language. + + Args: + language: The language for which to set the model. + metric_type: The type of the model (e.g., 'sentiment_model'). + model_name: The name of the model. + + Raises: + KeyError: If the specified language is not supported. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + self.model_config[language][metric_type] = model_name + else: + raise KeyError(f"Metrics '{metric_type}' not used in metric.") + else: + raise KeyError(f"Language '{language}' not supported.") + + def load_config_from_file(self, file_path: str): + """ + Loads model configurations from a specified configuration file. + + The configuration file should have sections for each language with + key-value pairs for each metrics and model_name. + + Args: + file_path: The path to the configuration file containing model + configurations. + """ + config = configparser.ConfigParser() + config.read(file_path) + + for lanuage_section in config.sections(): + if lanuage_section in self.model_config: + for metric_type, model_name in config[lanuage_section].items(): + if metric_type in self.model_config[lanuage_section]: + self.model_config[lanuage_section][metric_type] = model_name # NOQA:E501 + + def save_config_to_disk(self, output_path: str): + """ + Save Model Configuration to output path. 
+ Args: + output_path: The path to save the configuration file + """ + cfg = configparser.ConfigParser() + cfg.read_dict(self.model_config) + + with open(output_path, 'w') as f: + cfg.write(f) diff --git a/src/langcheck/metrics/modelconfig.ini b/src/langcheck/metrics/modelconfig.ini new file mode 100644 index 00000000..baec4017 --- /dev/null +++ b/src/langcheck/metrics/modelconfig.ini @@ -0,0 +1,10 @@ +[zh] +# According to the C-MTEB Benchmark +# (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) +# the 3 models of different sizes provided BAAI are the best on the +# embedding task +# Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 +# Using this model, it is hard to find two sentence where cos_sim < 0.25. +semantic_similarity = BAAI/bge-base-zh-v1.5 +sentiment = IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment +toxicity = alibaba-pai/pai-bert-base-zh-llm-risk-detection \ No newline at end of file diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 7821ed7c..1ac03bec 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -90,14 +90,12 @@ def semantic_similarity( openai_args) metric_value.language = 'zh' return metric_value - - # According to the C-MTEB Benchmark - # (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) - # the 3 models of different sizes provided BAAI are the best on the - # embedding task - # Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 - # Using this model, it is hard to find two sentence where cos_sim < 0.25. 
- model = SentenceTransformer('BAAI/bge-base-zh-v1.5') + # lazy import + from langcheck.metrics import _model_manager + print(_model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) + model = SentenceTransformer(_model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( From 2843c884748e69b49e1ea5d92eb16986124a6b5d Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:02:45 +0900 Subject: [PATCH 07/66] add test case for model management --- src/langcheck/metrics/_model_management.py | 7 ++-- tests/metrics/test_model_management.py | 38 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 tests/metrics/test_model_management.py diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index b8486535..9f6fa7f6 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -37,21 +37,18 @@ def reset(self): def list_metric_model(self, language: str, metric_type: str): """ - return the model used in current metric for a given language. + list the model used in current metric for a given language. Args: language: The language for which to get the model. metric_type: The metric name. - Returns: - str: The name of the specified model. - Raises: KeyError: If the specified language or model type is not found. 
""" if language in self.model_config: if metric_type in self.model_config[language]: - return self.model_config[language][metric_type] + print(self.model_config[language][metric_type]) else: raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 else: diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py new file mode 100644 index 00000000..42444d8e --- /dev/null +++ b/tests/metrics/test_model_management.py @@ -0,0 +1,38 @@ +from unittest.mock import mock_open, patch +from langcheck.metrics._model_management import ModelConfig + + +def test_initialization_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + assert config.model_config['zh']['semantic_similarity'] == 'test_model' # NOQA:E501 + except AssertionError as err: + raise err + + +def test_list_metric_model_with_mock_file(capsys): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + config.list_metric_model(language='zh', + metric_type='semantic_similarity') + captured = capsys.readouterr() # type: ignore + assert 'test_model' in captured.out + except AssertionError as err: + raise err + + +def test_set_model_for_metric_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + config.set_model_for_metric(model_name='another_test_model', + language='zh', + metric_type='semantic_similarity') + assert config.model_config['zh']['semantic_similarity'] == 'another_test_model' # NOQA:E501 + except AssertionError as err: + raise err \ No newline at end of file From 49983cd5d7c287a6e229e5079f649d71357f867d Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 
18:09:00 +0900 Subject: [PATCH 08/66] apply format suggestion --- src/langcheck/metrics/__init__.py | 2 +- src/langcheck/metrics/_model_management.py | 18 +++++++++++------- .../metrics/zh/reference_based_text_quality.py | 10 ++++++---- tests/metrics/test_model_management.py | 9 ++++++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 94262a88..56df78e6 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,5 +1,5 @@ -from langcheck.metrics._model_management import ModelConfig from langcheck.metrics import en, ja, zh +from langcheck.metrics._model_management import ModelConfig from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) from langcheck.metrics.en.reference_free_text_quality import ( diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index 9f6fa7f6..b083a979 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,6 +1,6 @@ import os -import configparser import collections +import configparser from pathlib import Path @@ -28,8 +28,9 @@ def __init__config(self): self.model_config = collections.defaultdict(dict) for lang in cfg.sections(): for metric_type in cfg[lang]: - self.model_config[lang][metric_type] = cfg.get(section=lang, - option=metric_type) # type: ignore[reportGeneralIssue] # NOQA:E501 + self.model_config[lang][metric_type] = cfg.get( + section=lang, option=metric_type + ) # type: ignore[reportGeneralIssue] # NOQA:E501 def reset(self): ''' reset all model used in langcheck to default''' @@ -50,12 +51,14 @@ def list_metric_model(self, language: str, metric_type: str): if metric_type in self.model_config[language]: print(self.model_config[language][metric_type]) else: - raise KeyError(f"Model type '{metric_type}' not found for language '{language}'.") # NOQA:E501 
+ raise KeyError( + f"Model type '{metric_type}' not found for language '{language}'." + ) # NOQA:E501 else: raise KeyError(f"Language '{language}' not supported.") - def set_model_for_metric(self, language: str, - metric_type: str, model_name: str): + def set_model_for_metric(self, language: str, metric_type: str, + model_name: str): """ Sets a specific model used in metric_type for a given language. @@ -93,7 +96,8 @@ def load_config_from_file(self, file_path: str): if lanuage_section in self.model_config: for metric_type, model_name in config[lanuage_section].items(): if metric_type in self.model_config[lanuage_section]: - self.model_config[lanuage_section][metric_type] = model_name # NOQA:E501 + self.model_config[lanuage_section][ + metric_type] = model_name # NOQA:E501 def save_config_to_disk(self, output_path: str): """ diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 1ac03bec..9f9062ae 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,10 +92,12 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - print(_model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) - model = SentenceTransformer(_model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) # NOQA: E501 + print( + _model_manager.list_metric_model(language='zh', + metric_type='semantic_similarity')) + model = SentenceTransformer( + _model_manager.list_metric_model( + language='zh', metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py index 42444d8e..1bc29b7a 100644 --- 
a/tests/metrics/test_model_management.py +++ b/tests/metrics/test_model_management.py @@ -1,4 +1,5 @@ from unittest.mock import mock_open, patch + from langcheck.metrics._model_management import ModelConfig @@ -7,7 +8,8 @@ def test_initialization_with_mock_file(): mock_file_content = "[zh]\nsemantic_similarity=test_model\n" with patch('builtins.open', mock_open(read_data=mock_file_content)): config = ModelConfig() - assert config.model_config['zh']['semantic_similarity'] == 'test_model' # NOQA:E501 + assert config.model_config['zh'][ + 'semantic_similarity'] == 'test_model' # NOQA:E501 except AssertionError as err: raise err @@ -33,6 +35,7 @@ def test_set_model_for_metric_with_mock_file(): config.set_model_for_metric(model_name='another_test_model', language='zh', metric_type='semantic_similarity') - assert config.model_config['zh']['semantic_similarity'] == 'another_test_model' # NOQA:E501 + assert config.model_config['zh'][ + 'semantic_similarity'] == 'another_test_model' # NOQA:E501 except AssertionError as err: - raise err \ No newline at end of file + raise err From 63aa6e68d535224b205ecf4dc6c33f5cde534a96 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:42:32 +0900 Subject: [PATCH 09/66] pydoc update & fix test case --- src/langcheck/metrics/__init__.py | 4 +++- src/langcheck/metrics/_model_management.py | 21 +++++++++++++++++++ .../zh/reference_based_text_quality.py | 5 +---- tests/metrics/test_model_management.py | 12 +++++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 56df78e6..2c73aa48 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -18,6 +18,7 @@ reset_model_config = _model_manager.reset set_model_for_metric = _model_manager.set_model_for_metric list_metric_model = _model_manager.list_metric_model +get_metric_model = _model_manager.get_metric_model load_config_from_file = 
_model_manager.load_config_from_file save_config_to_disk = _model_manager.save_config_to_disk @@ -51,5 +52,6 @@ 'list_metric_model', 'load_config_from_file', 'save_config_to_disk', - 'reset_model_config' + 'reset_model_config', + 'get_metric_model' ] diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index e9eab35b..cb939193 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -36,6 +36,27 @@ def reset(self): ''' reset all model used in langcheck to default''' self.__init__config() + def get_metric_model(self, language: str, metric_type: str): + """ + list the model used in current metric for a given language. + + Args: + language: The language for which to get the model. + metric_type: The metric name. + + Raises: + KeyError: If the specified language or model type is not found. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + return self.model_config[language][metric_type] + else: + raise KeyError( + f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 + ) + else: + raise KeyError(f"Language '{language}' not supported.") + def list_metric_model(self, language: str, metric_type: str): """ list the model used in current metric for a given language. 
diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 9f9062ae..11b3a20b 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,11 +92,8 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - print( - _model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) model = SentenceTransformer( - _model_manager.list_metric_model( + _model_manager.get_metric_model( language='zh', metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py index 1bc29b7a..62a7f025 100644 --- a/tests/metrics/test_model_management.py +++ b/tests/metrics/test_model_management.py @@ -14,6 +14,18 @@ def test_initialization_with_mock_file(): raise err +def test_get_metric_model_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + model_name = config.get_metric_model( + language='zh', metric_type='semantic_similarity') # NOQA:E501 + assert model_name == 'test_model' + except AssertionError as err: + raise err + + def test_list_metric_model_with_mock_file(capsys): try: mock_file_content = "[zh]\nsemantic_similarity=test_model\n" From 66ef0c695c9a84004c567c7990652f6d0b09733c Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:10:07 +0900 Subject: [PATCH 10/66] apply format suggestion --- src/langcheck/metrics/_model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index b083a979..e9eab35b 100644 --- 
a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,6 +1,6 @@ -import os import collections import configparser +import os from pathlib import Path @@ -52,8 +52,8 @@ def list_metric_model(self, language: str, metric_type: str): print(self.model_config[language][metric_type]) else: raise KeyError( - f"Model type '{metric_type}' not found for language '{language}'." - ) # NOQA:E501 + f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 + ) else: raise KeyError(f"Language '{language}' not supported.") From b89ed30345fed18104d059c8067d5231fd0be149 Mon Sep 17 00:00:00 2001 From: vela Date: Sun, 10 Dec 2023 18:42:32 +0900 Subject: [PATCH 11/66] pydoc update & fix test case --- src/langcheck/metrics/__init__.py | 4 +++- src/langcheck/metrics/_model_management.py | 21 +++++++++++++++++++ .../zh/reference_based_text_quality.py | 5 +---- tests/metrics/test_model_management.py | 12 +++++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 56df78e6..2c73aa48 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -18,6 +18,7 @@ reset_model_config = _model_manager.reset set_model_for_metric = _model_manager.set_model_for_metric list_metric_model = _model_manager.list_metric_model +get_metric_model = _model_manager.get_metric_model load_config_from_file = _model_manager.load_config_from_file save_config_to_disk = _model_manager.save_config_to_disk @@ -51,5 +52,6 @@ 'list_metric_model', 'load_config_from_file', 'save_config_to_disk', - 'reset_model_config' + 'reset_model_config', + 'get_metric_model' ] diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index e9eab35b..cb047825 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -36,6 +36,27 @@ def reset(self): ''' reset all 
model used in langcheck to default''' self.__init__config() + def get_metric_model(self, language: str, metric_type: str): + """ + return the model used in current metric for a given language. + + Args: + language: The language for which to get the model. + metric_type: The metric name. + + Raises: + KeyError: If the specified language or model type is not found. + """ + if language in self.model_config: + if metric_type in self.model_config[language]: + return self.model_config[language][metric_type] + else: + raise KeyError( + f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 + ) + else: + raise KeyError(f"Language '{language}' not supported.") + def list_metric_model(self, language: str, metric_type: str): """ list the model used in current metric for a given language. diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 9f9062ae..11b3a20b 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,11 +92,8 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - print( - _model_manager.list_metric_model(language='zh', - metric_type='semantic_similarity')) model = SentenceTransformer( - _model_manager.list_metric_model( + _model_manager.get_metric_model( language='zh', metric_type='semantic_similarity')) # NOQA: E501 generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py index 1bc29b7a..62a7f025 100644 --- a/tests/metrics/test_model_management.py +++ b/tests/metrics/test_model_management.py @@ -14,6 +14,18 @@ def test_initialization_with_mock_file(): raise err +def test_get_metric_model_with_mock_file(): + try: + mock_file_content = "[zh]\nsemantic_similarity=test_model\n" + 
with patch('builtins.open', mock_open(read_data=mock_file_content)): + config = ModelConfig() + model_name = config.get_metric_model( + language='zh', metric_type='semantic_similarity') # NOQA:E501 + assert model_name == 'test_model' + except AssertionError as err: + raise err + + def test_list_metric_model_with_mock_file(capsys): try: mock_file_content = "[zh]\nsemantic_similarity=test_model\n" From 2506ececf233c967c49a570d2d17ac3be5decae0 Mon Sep 17 00:00:00 2001 From: vela Date: Sat, 23 Dec 2023 23:59:00 +0900 Subject: [PATCH 12/66] add model loader --- src/langcheck/metrics/_model_loader.py | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/langcheck/metrics/_model_loader.py diff --git a/src/langcheck/metrics/_model_loader.py b/src/langcheck/metrics/_model_loader.py new file mode 100644 index 00000000..37e7d06e --- /dev/null +++ b/src/langcheck/metrics/_model_loader.py @@ -0,0 +1,42 @@ +from typing import Tuple, Optional +from transformers.pipelines import pipeline +from transformers.models.auto.tokenization_auto import AutoTokenizer +from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification # NOQA:E501 +from sentence_transformers import SentenceTransformer + + +def load_sentence_transformers(model_name: str) -> SentenceTransformer: + """ + return a sentence-transformer model. + + Args: + model_name: The model name of a sentence-transformers model + """ + return SentenceTransformer(model_name) + + +def load_auto_model_for_text_classification(model_name: str, + tokenizer_name: Optional[str])\ + -> Tuple[AutoTokenizer, + AutoModelForSequenceClassification]: + """ + return a Huggingface text-classification pipeline. + + Args: + model_name: The name of a sequenceclassification model on huggingface hub. # NOQA:E501 + tokenizer_name: the name of a tokenizer on huggingface hub. 
+ """ + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + model = AutoModelForSequenceClassification.from_pretrained(model_name) + return tokenizer, model + + +def load_pipeline_for_text_classification(model_name: str, **kwargs): + """ + return a Huggingface text-classification pipeline. + + Args: + model_name: A huggingface model model for text classification. + """ + top_k = kwargs.pop('top_k', None) + return pipeline('text-classification', model=model_name, top_k=top_k) From 57ea21784dbc95434306614aab08cc3a554881e2 Mon Sep 17 00:00:00 2001 From: vela Date: Tue, 26 Dec 2023 01:07:00 +0900 Subject: [PATCH 13/66] re-implent a model manager class --- src/langcheck/metrics/__init__.py | 16 +- src/langcheck/metrics/_model_loader.py | 9 +- src/langcheck/metrics/_model_management.py | 205 ++++++++++-------- .../metrics/config/metric_config.ini | 22 ++ src/langcheck/metrics/modelconfig.ini | 10 - .../zh/reference_based_text_quality.py | 6 +- .../metrics/zh/reference_free_text_quality.py | 13 +- .../metrics/zh/source_based_text_quality.py | 4 +- 8 files changed, 158 insertions(+), 127 deletions(-) create mode 100644 src/langcheck/metrics/config/metric_config.ini delete mode 100644 src/langcheck/metrics/modelconfig.ini diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 2c73aa48..0479c2ce 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,5 +1,5 @@ from langcheck.metrics import en, ja, zh -from langcheck.metrics._model_management import ModelConfig +from langcheck.metrics._model_management import ModelManager from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) from langcheck.metrics.en.reference_free_text_quality import ( @@ -14,13 +14,7 @@ is_json_array, is_json_object, matches_regex, validation_fn) -_model_manager = ModelConfig() -reset_model_config = _model_manager.reset -set_model_for_metric = 
_model_manager.set_model_for_metric -list_metric_model = _model_manager.list_metric_model -get_metric_model = _model_manager.get_metric_model -load_config_from_file = _model_manager.load_config_from_file -save_config_to_disk = _model_manager.save_config_to_disk +_model_manager = ModelManager() __all__ = [ 'en', @@ -48,10 +42,4 @@ 'semantic_similarity', 'sentiment', 'toxicity', - 'set_model_for_metric', - 'list_metric_model', - 'load_config_from_file', - 'save_config_to_disk', - 'reset_model_config', - 'get_metric_model' ] diff --git a/src/langcheck/metrics/_model_loader.py b/src/langcheck/metrics/_model_loader.py index 37e7d06e..8b5299a1 100644 --- a/src/langcheck/metrics/_model_loader.py +++ b/src/langcheck/metrics/_model_loader.py @@ -1,4 +1,5 @@ from typing import Tuple, Optional +from click import Option from transformers.pipelines import pipeline from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification # NOQA:E501 @@ -16,7 +17,8 @@ def load_sentence_transformers(model_name: str) -> SentenceTransformer: def load_auto_model_for_text_classification(model_name: str, - tokenizer_name: Optional[str])\ + tokenizer_name: Optional[str], + revision: Optional[str])\ -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: """ @@ -25,9 +27,10 @@ def load_auto_model_for_text_classification(model_name: str, Args: model_name: The name of a sequenceclassification model on huggingface hub. # NOQA:E501 tokenizer_name: the name of a tokenizer on huggingface hub. 
+ revisoin: the shorted sha1 string of a model """ - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - model = AutoModelForSequenceClassification.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) + model = AutoModelForSequenceClassification.from_pretrained(model_name, revision=revision) # NOQA: E501 return tokenizer, model diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index cb047825..b6551bec 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,10 +1,24 @@ -import collections -import configparser +from copy import deepcopy import os -from pathlib import Path - +import requests -class ModelConfig: +from pathlib import Path +from pprint import pprint +from typing import Optional +from functools import lru_cache +from configobj import ConfigObj +from ._model_loader import (load_auto_model_for_text_classification, + load_sentence_transformers) +import pandas as pd + +# TODO: Use a ENUM class to parse these +VALID_METRIC_NAME = ['factual_consistency', 'toxicity', + 'sentiment', 'semantic_similarity' + ] +VALID_LANGUAGE = ['zh'] + + +class ModelManager: """ A class to manage different models for multiple languages in the langcheck. @@ -18,116 +32,119 @@ def __init__(self): Initializes the ModelConfig with empty model dictionaries for each language. 
""" + self.config = None self.__init__config() + self.validate_config() def __init__config(self): cwd = os.path.dirname(__file__) - cfg = configparser.ConfigParser() - # Initial DEFAULT config from modelconfig.ini - cfg.read(os.path.join(Path(cwd), 'modelconfig.ini')) - self.model_config = collections.defaultdict(dict) - for lang in cfg.sections(): - for metric_type in cfg[lang]: - self.model_config[lang][metric_type] = cfg.get( - section=lang, option=metric_type - ) # type: ignore[reportGeneralIssue] # NOQA:E501 + self.config = ConfigObj(os.path.join(Path(cwd), + 'config', + 'metric_config.ini')) # NOQA:E501 - def reset(self): - ''' reset all model used in langcheck to default''' - self.__init__config() - - def get_metric_model(self, language: str, metric_type: str): + @lru_cache + def fetch_model(self, language: str, metric_type: str): """ return the model used in current metric for a given language. Args: language: The language for which to get the model. metric_type: The metric name. - - Raises: - KeyError: If the specified language or model type is not found. 
""" - if language in self.model_config: - if metric_type in self.model_config[language]: - return self.model_config[language][metric_type] + if language in self.config: # type: ignore + if metric_type in self.config[language]: # type: ignore + # deep copy the confguration + # any action on config would not distrub self.config + config = deepcopy(self.config[language][metric_type]) # type: ignore[reportGeneralTypeIssues] # NOQA:E501 + # get model name, model loader type + model_name, loader_type = config['model_name'], config['loader'] # type: ignore[reportGeneralTypeIssues] # NOQA:E501 + # check if model version fixed + revision = config.pop("revision", None) + if loader_type == 'sentence-transformers': + if revision is not None: + print('Info: Sentence-Transformers do not support model version fixed yet') # NOQA: E501 + model = load_sentence_transformers(model_name=model_name) + return model + elif loader_type == 'huggingface': + tokenizer_name = config.pop('tokenizer_name', None) + return load_auto_model_for_text_classification(model_name=model_name, # NOQA:E501 + tokenizer_name=tokenizer_name, # NOQA:E501 + revision=revision # NOQA:E501 + ) + else: + raise KeyError(f'Loader {loader_type} not supported yet.') else: - raise KeyError( - f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 - ) + raise KeyError(f'Metric {metric_type} not supported yet.') else: - raise KeyError(f"Language '{language}' not supported.") + raise KeyError(f'language {language} not supported yet') - def list_metric_model(self, language: str, metric_type: str): - """ - list the model used in current metric for a given language. + def list_current_model_in_use(self, language='all', metric='all'): + """ list model in use. Args: - language: The language for which to get the model. - metric_type: The metric name. - - Raises: - KeyError: If the specified language or model type is not found. + language: The abbrevation name of language. 
+ metric: The evaluation metric name. """ - if language in self.model_config: - if metric_type in self.model_config[language]: - print(self.model_config[language][metric_type]) - else: - raise KeyError( - f"Model type '{metric_type}' not found for language '{language}'." # NOQA:E501 - ) + df = pd.DataFrame.from_records( + [ + (lang, metric_name, key, value) + for lang, lang_model_settings in self.config.items() + for metric_name, model_settings in lang_model_settings.items() + for key, value in model_settings.items() + ], + columns=['language', 'metric_name', 'attribute', 'value']) + + # the code below would generate a dataframe: + # |index| language | metric_name | loader | model_name | revision | + # |.....|..........|.............|........|............|..........| + df_pivot = df.pivot_table(index=['language', 'metric_name'], + columns="attribute", values="value", + aggfunc='first').reset_index().\ + drop(columns=["attribute"]).reset_index() + df_pivot.columns = ['language', 'metric_name', 'loader', 'model_name', 'revision'] # NOQA:E501 + + if language == 'all' and metric == 'all': + pprint(df_pivot) else: - raise KeyError(f"Language '{language}' not supported.") + if language != "all": + df_pivot = df_pivot.loc[df_pivot.language == language] + if metric != 'all': + df_pivot = df_pivot.loc[df_pivot.metric_name == metric] + pprint(df_pivot) - def set_model_for_metric(self, language: str, metric_type: str, - model_name: str): - """ - Sets a specific model used in metric_type for a given language. + def validate_config(self, language='all', metric='all'): + """validate configuration. Args: - language: The language for which to set the model. - metric_type: The type of the model (e.g., 'sentiment_model'). - model_name: The name of the model. - - Raises: - KeyError: If the specified language is not supported. + language (str, optional):the name of the language. Defaults to 'all'. + metric (str, optional): the name of evaluation metric. Defaults to 'all'. 
""" - if language in self.model_config: - if metric_type in self.model_config[language]: - self.model_config[language][metric_type] = model_name + def check_model_availability(model_name, revision): + if revision is None: + url = f"https://huggingface.co/api/models/{model_name}" else: - raise KeyError(f"Metrics '{metric_type}' not used in metric.") - else: - raise KeyError(f"Language '{language}' not supported.") - - def load_config_from_file(self, file_path: str): - """ - Loads model configurations from a specified configuration file. - - The configuration file should have sections for each language with - key-value pairs for each metrics and model_name. - - Args: - file_path: The path to the configuration file containing model - configurations. - """ - config = configparser.ConfigParser() - config.read(file_path) - - for lanuage_section in config.sections(): - if lanuage_section in self.model_config: - for metric_type, model_name in config[lanuage_section].items(): - if metric_type in self.model_config[lanuage_section]: - self.model_config[lanuage_section][ - metric_type] = model_name # NOQA:E501 - - def save_config_to_disk(self, output_path: str): - """ - Save Model Configuration to output path. 
- Args: - output_path: The path to save the configuration file - """ - cfg = configparser.ConfigParser() - cfg.read_dict(self.model_config) - - with open(output_path, 'w') as f: - cfg.write(f) + url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" + response = requests.get(url) + return response.status_code == 200 + + config = deepcopy(self.config) + for lang, lang_setting in config.items(): + if language == 'all' or lang == language: + for metric_name, model_setting in lang_setting.items(): + if metric == 'all' or metric_name == metric: + # if model name not set + if 'model_name' not in model_setting: + raise KeyError(f'{lang} metrics {metric_name} need a model, but found None!') # NOQA:E501 + if 'loader' not in model_setting: + raise KeyError(f'Metrics {metric_name} need a loader, but found None!') # NOQA:E501 + # check if the model and revision is available on huggingface Hub # NOQA:E501 + loader_type = model_setting.pop('loader') + if loader_type == 'huggingface': + model_name = model_setting.pop('model_name') + revision = model_setting.pop('revision', None) + if not check_model_availability(model_name, revision): # NOQA:E501 + raise ValueError(f"""Cannot find {model_name} with # NOQA:E501 + {revision} and Huggingface Hub""") + # may also need other validate method for other loader + # not found yet + print('Configuration Validation Passed') diff --git a/src/langcheck/metrics/config/metric_config.ini b/src/langcheck/metrics/config/metric_config.ini new file mode 100644 index 00000000..4d8ee985 --- /dev/null +++ b/src/langcheck/metrics/config/metric_config.ini @@ -0,0 +1,22 @@ +[zh] +[[semantic_similarity]] + # According to the C-MTEB Benchmark + # (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) + # the 3 models of different sizes provided BAAI are the best on the + # embedding task + # Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 + # Using this model, it is hard to find two sentence where cos_sim < 0.25. 
+ model_name = BAAI/bge-base-zh-v1.5 + revision = f03589c + loader = sentence-transformers +[[sentiment]] + model_name = IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment + loader = huggingface +[[toxicity]] + model_name = alibaba-pai/pai-bert-base-zh-llm-risk-detection + loader = huggingface + revision = 0a61c79744cb0173216f015ffecc1ea81c4e0229 +[[factual_consistency]] + model_name = Helsinki-NLP/opus-mt-zh-en + loader = huggingface + revision = cf109095479db38d6df799875e34039d4938aaa6 diff --git a/src/langcheck/metrics/modelconfig.ini b/src/langcheck/metrics/modelconfig.ini deleted file mode 100644 index baec4017..00000000 --- a/src/langcheck/metrics/modelconfig.ini +++ /dev/null @@ -1,10 +0,0 @@ -[zh] -# According to the C-MTEB Benchmark -# (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) -# the 3 models of different sizes provided BAAI are the best on the -# embedding task -# Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 -# Using this model, it is hard to find two sentence where cos_sim < 0.25. 
-semantic_similarity = BAAI/bge-base-zh-v1.5 -sentiment = IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment -toxicity = alibaba-pai/pai-bert-base-zh-llm-risk-detection \ No newline at end of file diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 11b3a20b..15d0d080 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,9 +92,9 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics import _model_manager - model = SentenceTransformer( - _model_manager.get_metric_model( - language='zh', metric_type='semantic_similarity')) # NOQA: E501 + model = _model_manager.fetch_model(language='zh', + metric_type="semantic_similarity") + generated_embeddings = model.encode(generated_outputs) reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index bab490a6..68fbec15 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -92,7 +92,11 @@ def sentiment( _sentiment_pipeline = pipeline( 'sentiment-analysis', model=_sentiment_model_path ) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 - # {0:"Negative", 1:'Positive'} + # # {0:"Negative", 1:'Positive'} + from langcheck.metrics import _model_manager + tokenizer, model = _model_manager.fetch_model(lanaguage='zh', metric_type='sentiment') + _sentiment_pipeline = pipeline( + 'sentiment-analysis', model=model, tokenizer=tokenizer) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 _model_id2label = _sentiment_pipeline.model.config.id2label _predict_result = _sentiment_pipeline( generated_outputs @@ -210,8 +214,13 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: global 
_toxicity_model_path # this pipeline output predict probability for each text on each label. # the output format is List[List[Dict(str)]] + from langcheck.metrics import _model_manager + tokenizer, model = _model_manager.fetch_model(language='zh', + metric_type="toxicity") + _toxicity_pipeline = pipeline('text-classification', - model=_toxicity_model_path, + model=model, + tokenizer=tokenizer, top_k=5) # {'Normal': 0, 'Pulp': 1, 'Sex': 2, 'Other Risk': 3, 'Adult': 4} diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 239f583b..5e18ec3d 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -86,8 +86,10 @@ def factual_consistency( global _factual_consistency_translation_pipeline if _factual_consistency_translation_pipeline is None: + from langcheck.metrics import _model_manager + tokenizer, model = _model_manager.fetch_model(language='zh', metric_type='factual') _factual_consistency_translation_pipeline = pipeline( - 'translation', model=_factual_consistency_translation_model_path) + 'translation', model=model, tokenizer=tokenizer) # Translate the sources and generated outputs to English. 
# Currently, the type checks are not working for the pipeline, since From 3bf196c9301aaa0a4680073eeaf39555f930149a Mon Sep 17 00:00:00 2001 From: vela Date: Tue, 26 Dec 2023 02:12:23 +0900 Subject: [PATCH 14/66] add update_metrics_for_model method --- src/langcheck/metrics/_model_management.py | 60 +++++++++++++++++++--- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index b6551bec..82e027c8 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -16,7 +16,7 @@ 'sentiment', 'semantic_similarity' ] VALID_LANGUAGE = ['zh'] - +VALID_LOADER = ['huggingface', 'sentence-transformers'] class ModelManager: """ @@ -43,7 +43,7 @@ def __init__config(self): 'metric_config.ini')) # NOQA:E501 @lru_cache - def fetch_model(self, language: str, metric_type: str): + def fetch_model(self, language: str, metric: str): """ return the model used in current metric for a given language. 
@@ -54,7 +54,7 @@ def fetch_model(self, language: str, metric_type: str): if language in self.config: # type: ignore if metric_type in self.config[language]: # type: ignore # deep copy the confguration - # any action on config would not distrub self.config + # any action on config would not distrub self.config config = deepcopy(self.config[language][metric_type]) # type: ignore[reportGeneralTypeIssues] # NOQA:E501 # get model name, model loader type model_name, loader_type = config['model_name'], config['loader'] # type: ignore[reportGeneralTypeIssues] # NOQA:E501 @@ -67,10 +67,12 @@ def fetch_model(self, language: str, metric_type: str): return model elif loader_type == 'huggingface': tokenizer_name = config.pop('tokenizer_name', None) - return load_auto_model_for_text_classification(model_name=model_name, # NOQA:E501 - tokenizer_name=tokenizer_name, # NOQA:E501 - revision=revision # NOQA:E501 - ) + tokenizer, model = load_auto_model_for_text_classification(model_name=model_name, # NOQA:E501 + tokenizer_name=tokenizer_name, # NOQA:E501 + revision=revision # NOQA:E501 + ) + print(model.config) + return tokenizer, model else: raise KeyError(f'Loader {loader_type} not supported yet.') else: @@ -145,6 +147,50 @@ def check_model_availability(model_name, revision): if not check_model_availability(model_name, revision): # NOQA:E501 raise ValueError(f"""Cannot find {model_name} with # NOQA:E501 {revision} and Huggingface Hub""") + elif loader_type not in VALID_LOADER: + raise ValueError(f'loader type should in {VALID_LOADER}') # NOQA: E501 # may also need other validate method for other loader # not found yet print('Configuration Validation Passed') + + def set_model_for_metric(self, language: str, metric: str, + model_name: str, loader: Optional[str], + **kwargs): + """set model for specified metric in specified language + + Args: + language (str): the name of the lanuage + metric (str): the name of the evaluation metrics, + loader(str): the loader of the model, optional 
+ model_name(str): the name of the model + tokenizer_name(str): optional, the name of the tokenizer + revision(str): a version string of the model + """ + config_copy = deepcopy(self.config) + try: + if language not in VALID_LANGUAGE: + raise ValueError('Language {language} not supported yet') + + if metric not in self.config[language]: + raise ValueError('Language {language} not supported {metric} yet') + + config = self.config[language][metric] + config['loader'] = loader + config['model_name'] = model_name + # if tokenizer_name is different with model + tokenizer_name = kwargs.pop('tokenizer_name', None) + if tokenizer_name: + config['tokenizer_name'] = tokenizer_name + # if model's revision is pinned + revision = kwargs.pop('revision', None) + if revision: + config['revision'] = revision + # validate the change + if self.validate_config(language=language, metric=metric): + # clear the LRU cache to make the config change + # reflected imediately + self.fetch_model.cache_clear() + except (ValueError, KeyError) as err: + # trace back the configuration + self.config = config_copy + raise err From bb70f644098f48391e98a81763be367fa7508c1b Mon Sep 17 00:00:00 2001 From: vela Date: Tue, 26 Dec 2023 03:00:41 +0900 Subject: [PATCH 15/66] apply format suggestion --- src/langcheck/metrics/_model_loader.py | 11 +- src/langcheck/metrics/_model_management.py | 138 +++++++++++------- .../zh/reference_based_text_quality.py | 4 +- .../metrics/zh/reference_free_text_quality.py | 4 +- .../metrics/zh/source_based_text_quality.py | 4 +- 5 files changed, 96 insertions(+), 65 deletions(-) diff --git a/src/langcheck/metrics/_model_loader.py b/src/langcheck/metrics/_model_loader.py index 8b5299a1..8b031cdf 100644 --- a/src/langcheck/metrics/_model_loader.py +++ b/src/langcheck/metrics/_model_loader.py @@ -1,9 +1,10 @@ -from typing import Tuple, Optional -from click import Option -from transformers.pipelines import pipeline -from transformers.models.auto.tokenization_auto import 
AutoTokenizer -from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification # NOQA:E501 +from typing import Optional, Tuple + from sentence_transformers import SentenceTransformer +from transformers.models.auto.modeling_auto import \ + AutoModelForSequenceClassification +from transformers.models.auto.tokenization_auto import AutoTokenizer +from transformers.pipelines import pipeline def load_sentence_transformers(model_name: str) -> SentenceTransformer: diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index 82e027c8..2c12c108 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -1,23 +1,29 @@ -from copy import deepcopy import os -import requests - +from copy import deepcopy +from functools import lru_cache from pathlib import Path from pprint import pprint -from typing import Optional -from functools import lru_cache +from typing import Optional, Tuple, Union + +import pandas as pd +import requests from configobj import ConfigObj +from sentence_transformers import SentenceTransformer +from transformers.models.auto.modeling_auto import \ + AutoModelForSequenceClassification +from transformers.models.auto.tokenization_auto import AutoTokenizer + from ._model_loader import (load_auto_model_for_text_classification, load_sentence_transformers) -import pandas as pd # TODO: Use a ENUM class to parse these -VALID_METRIC_NAME = ['factual_consistency', 'toxicity', - 'sentiment', 'semantic_similarity' - ] +VALID_METRIC_NAME = [ + 'factual_consistency', 'toxicity', 'sentiment', 'semantic_similarity' +] VALID_LANGUAGE = ['zh'] VALID_LOADER = ['huggingface', 'sentence-transformers'] + class ModelManager: """ A class to manage different models for multiple languages in the @@ -38,12 +44,13 @@ def __init__(self): def __init__config(self): cwd = os.path.dirname(__file__) - self.config = ConfigObj(os.path.join(Path(cwd), - 'config', - 
'metric_config.ini')) # NOQA:E501 + self.config = ConfigObj( + os.path.join(Path(cwd), 'config', 'metric_config.ini')) # NOQA:E501 @lru_cache - def fetch_model(self, language: str, metric: str): + def fetch_model(self, language: str, metric: str)\ + -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], + SentenceTransformer]: """ return the model used in current metric for a given language. @@ -52,31 +59,36 @@ def fetch_model(self, language: str, metric: str): metric_type: The metric name. """ if language in self.config: # type: ignore - if metric_type in self.config[language]: # type: ignore + if metric in self.config[language]: # type: ignore # deep copy the confguration # any action on config would not distrub self.config - config = deepcopy(self.config[language][metric_type]) # type: ignore[reportGeneralTypeIssues] # NOQA:E501 + config = deepcopy( + self.config[language][metric] # type: ignore[reportGeneralTypeIssues] # NOQA:E501 + ) # get model name, model loader type - model_name, loader_type = config['model_name'], config['loader'] # type: ignore[reportGeneralTypeIssues] # NOQA:E501 + model_name, loader_type = config['model_name'], config[ + 'loader'] # type: ignore[reportGeneralTypeIssues] # NOQA:E501 # check if model version fixed revision = config.pop("revision", None) if loader_type == 'sentence-transformers': if revision is not None: - print('Info: Sentence-Transformers do not support model version fixed yet') # NOQA: E501 + print( + 'Info: Sentence-Transformers do not support model version fixed yet' # NOQA: E501 + ) model = load_sentence_transformers(model_name=model_name) return model elif loader_type == 'huggingface': tokenizer_name = config.pop('tokenizer_name', None) - tokenizer, model = load_auto_model_for_text_classification(model_name=model_name, # NOQA:E501 - tokenizer_name=tokenizer_name, # NOQA:E501 - revision=revision # NOQA:E501 - ) - print(model.config) + tokenizer, model = load_auto_model_for_text_classification( + 
model_name=model_name, # NOQA:E501 + tokenizer_name=tokenizer_name, # NOQA:E501 + revision=revision # NOQA:E501 + ) return tokenizer, model else: raise KeyError(f'Loader {loader_type} not supported yet.') else: - raise KeyError(f'Metric {metric_type} not supported yet.') + raise KeyError(f'Metric {metric} not supported yet.') else: raise KeyError(f'language {language} not supported yet') @@ -90,8 +102,10 @@ def list_current_model_in_use(self, language='all', metric='all'): df = pd.DataFrame.from_records( [ (lang, metric_name, key, value) - for lang, lang_model_settings in self.config.items() - for metric_name, model_settings in lang_model_settings.items() + for lang, lang_model_settings in + self.config.items() # type: ignore # NOQA:E501 + for metric_name, model_settings in + lang_model_settings.items() # type: ignore # NOQA:E501 for key, value in model_settings.items() ], columns=['language', 'metric_name', 'attribute', 'value']) @@ -99,11 +113,15 @@ def list_current_model_in_use(self, language='all', metric='all'): # the code below would generate a dataframe: # |index| language | metric_name | loader | model_name | revision | # |.....|..........|.............|........|............|..........| - df_pivot = df.pivot_table(index=['language', 'metric_name'], - columns="attribute", values="value", - aggfunc='first').reset_index().\ - drop(columns=["attribute"]).reset_index() - df_pivot.columns = ['language', 'metric_name', 'loader', 'model_name', 'revision'] # NOQA:E501 + df_pivot = df.pivot_table( + index=['language', 'metric_name'], + columns="attribute", + values="value", + aggfunc='first').reset_index().drop( + columns=["attribute"]).reset_index() # NOQA:E501 + df_pivot.columns = [ + 'language', 'metric_name', 'loader', 'model_name', 'revision' + ] # NOQA:E501 if language == 'all' and metric == 'all': pprint(df_pivot) @@ -118,63 +136,75 @@ def validate_config(self, language='all', metric='all'): """validate configuration. 
Args: - language (str, optional):the name of the language. Defaults to 'all'. - metric (str, optional): the name of evaluation metric. Defaults to 'all'. + language (str, optional):the name of the language. Defaults to 'all'. # NOQA:E501 + metric (str, optional): the name of evaluation metric. Defaults to 'all'. # NOQA:E501 """ + def check_model_availability(model_name, revision): if revision is None: url = f"https://huggingface.co/api/models/{model_name}" else: - url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" + url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" # NOQA:E501 response = requests.get(url) return response.status_code == 200 config = deepcopy(self.config) - for lang, lang_setting in config.items(): + for lang, lang_setting in config.items(): # type: ignore # NOQA:E501 if language == 'all' or lang == language: - for metric_name, model_setting in lang_setting.items(): + for metric_name, model_setting in lang_setting.items( # type: ignore # NOQA:E501 + ): if metric == 'all' or metric_name == metric: # if model name not set if 'model_name' not in model_setting: - raise KeyError(f'{lang} metrics {metric_name} need a model, but found None!') # NOQA:E501 + raise KeyError( + f'{lang} metrics {metric_name} need a model, but found None!' # NOQA:E501 + ) if 'loader' not in model_setting: - raise KeyError(f'Metrics {metric_name} need a loader, but found None!') # NOQA:E501 + raise KeyError( + f'Metrics {metric_name} need a loader, but found None!' 
# NOQA:E501 + ) # check if the model and revision is available on huggingface Hub # NOQA:E501 loader_type = model_setting.pop('loader') if loader_type == 'huggingface': model_name = model_setting.pop('model_name') revision = model_setting.pop('revision', None) - if not check_model_availability(model_name, revision): # NOQA:E501 - raise ValueError(f"""Cannot find {model_name} with # NOQA:E501 - {revision} and Huggingface Hub""") + if not check_model_availability( + model_name, revision): # NOQA:E501 + raise ValueError( + f"""Cannot find {model_name} with # NOQA:E501 + {revision} and Huggingface Hub""" + ) elif loader_type not in VALID_LOADER: - raise ValueError(f'loader type should in {VALID_LOADER}') # NOQA: E501 + raise ValueError( + f'loader type should in {VALID_LOADER}' + ) # NOQA: E501 # may also need other validate method for other loader # not found yet print('Configuration Validation Passed') - def set_model_for_metric(self, language: str, metric: str, - model_name: str, loader: Optional[str], - **kwargs): + def set_model_for_metric(self, language: str, metric: str, model_name: str, + loader: Optional[str], **kwargs): """set model for specified metric in specified language Args: - language (str): the name of the lanuage + language (str): the name of the lanuage, metric (str): the name of the evaluation metrics, - loader(str): the loader of the model, optional - model_name(str): the name of the model - tokenizer_name(str): optional, the name of the tokenizer - revision(str): a version string of the model + loader(str): the loader of the model, optional, + model_name(str): the name of the model, + tokenizer_name(str): optional, the name of the tokenizer, + revision(str): a version string of the model. 
""" config_copy = deepcopy(self.config) try: if language not in VALID_LANGUAGE: raise ValueError('Language {language} not supported yet') - - if metric not in self.config[language]: - raise ValueError('Language {language} not supported {metric} yet') - - config = self.config[language][metric] + + if metric not in self.config[language]: # type: ignore # NOQA:E501 + raise ValueError( + 'Language {language} not supported {metric} yet' + ) # NOQA:E501 + + config = self.config[language][metric] # type: ignore # NOQA:E501 config['loader'] = loader config['model_name'] = model_name # if tokenizer_name is different with model diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 15d0d080..72d3b09e 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -95,8 +95,8 @@ def semantic_similarity( model = _model_manager.fetch_model(language='zh', metric_type="semantic_similarity") - generated_embeddings = model.encode(generated_outputs) - reference_embeddings = model.encode(reference_outputs) + generated_embeddings = model.encode(generated_outputs) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 + reference_embeddings = model.encode(reference_outputs) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 cosine_scores = util.pairwise_cos_sim( generated_embeddings, # type: ignore[reportGeneralTypeIssues] reference_embeddings # type: ignore[reportGeneralTypeIssues] diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index 68fbec15..984d7c4b 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -94,7 +94,7 @@ def sentiment( ) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 # # {0:"Negative", 1:'Positive'} from langcheck.metrics import _model_manager - tokenizer, 
model = _model_manager.fetch_model(lanaguage='zh', metric_type='sentiment') + tokenizer, model = _model_manager.fetch_model(lanaguage='zh', metric='sentiment') # NOQA: E501 _sentiment_pipeline = pipeline( 'sentiment-analysis', model=model, tokenizer=tokenizer) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 _model_id2label = _sentiment_pipeline.model.config.id2label @@ -220,7 +220,7 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: _toxicity_pipeline = pipeline('text-classification', model=model, - tokenizer=tokenizer, + tokenizer=tokenizer, # type: ignore[reportOptionalIterable] # NOQA: E501 top_k=5) # {'Normal': 0, 'Pulp': 1, 'Sex': 2, 'Other Risk': 3, 'Adult': 4} diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 5e18ec3d..718a19e4 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -87,9 +87,9 @@ def factual_consistency( global _factual_consistency_translation_pipeline if _factual_consistency_translation_pipeline is None: from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(language='zh', metric_type='factual') + tokenizer, model = _model_manager.fetch_model(language='zh', metric_type='factual') # NOQA: E501 _factual_consistency_translation_pipeline = pipeline( - 'translation', model=model, tokenizer=tokenizer) + 'translation', model=model, tokenizer=tokenizer) # type: ignore # Translate the sources and generated outputs to English. 
def load_sentence_transformers(model_name: str) -> SentenceTransformer:
    '''
    Return a Hugging Face sentence-transformer model.

    Args:
        model_name: The name of a sentence-transformer model
    '''
    return SentenceTransformer(model_name)


def load_auto_model_for_text_classification(
    model_name: str, tokenizer_name: Optional[str], revision: Optional[str]
) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]:
    '''
    Return a Hugging Face sequence-classification model and its tokenizer.

    Args:
        model_name: The name of a sequence-classification model on Hugging Face
        tokenizer_name: The name of a tokenizer on Hugging Face, or None to
            use the model's own tokenizer
        revision: The shortened sha1 string of a model, or None for the latest

    Returns:
        A (tokenizer, model) tuple.
    '''
    # Bug fix: callers pop 'tokenizer_name' from the config with a default of
    # None, and AutoTokenizer.from_pretrained(None) raises. Fall back to the
    # model's own tokenizer when no explicit tokenizer name is given.
    if tokenizer_name is None:
        tokenizer_name = model_name
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                              revision=revision)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, revision=revision)
    return tokenizer, model


def load_pipeline_for_text_classification(model_name: str, **kwargs):
    '''
    Return a Hugging Face text-classification pipeline.

    Args:
        model_name: The name of a text-classification model on Hugging Face
        top_k: (Optional, keyword-only) number of labels to return per input

    Returns:
        A transformers text-classification pipeline.
    '''
    top_k = kwargs.pop('top_k', None)
    return pipeline('text-classification', model=model_name, top_k=top_k)
class ModelManager:
    '''
    A class to manage different models for multiple languages in LangCheck.
    This class allows setting and retrieving different model names (like
    sentiment_model, semantic_similarity_model, etc.) for each language.
    It also supports loading model configurations from a file.
    '''

    def __init__(self):
        '''
        Initializes the ModelManager from the default configuration file
        bundled with the package, then validates the configuration.
        '''
        self.__init__config()
        self.validate_config()

    def __init__config(self):
        # Load the default per-language model settings from
        # config/metric_config.ini, which lives next to this module.
        cwd = os.path.dirname(__file__)
        self.config = ConfigObj(
            os.path.join(Path(cwd), 'config', 'metric_config.ini'))

    # NOTE(review): lru_cache on an instance method keeps `self` alive for
    # the cache's lifetime; acceptable here because ModelManager is used as
    # a module-level singleton — confirm if that ever changes.
    @lru_cache
    def fetch_model(self, language: str, metric: str)\
            -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification],
                     SentenceTransformer]:
        '''
        Return the model used for the given metric and language.

        For the 'huggingface' loader a (tokenizer, model) tuple is returned;
        for 'sentence-transformers' a SentenceTransformer is returned.

        Args:
            language: The language for which to get the model
            metric: The metric name

        Raises:
            KeyError: If the language, metric or loader is not supported.
        '''
        # Guard clauses keep the happy path flat; error order (language
        # first, then metric) matches the original nested checks.
        if language not in self.config:
            raise KeyError(f'language {language} not supported yet')
        if metric not in self.config[language]:
            raise KeyError(f'Metric {metric} not supported yet.')
        # Deep copy the configuration so that changes to `config` would
        # not affect the original `self.config`.
        config = deepcopy(self.config[language][metric])
        # Get model name, model loader type
        model_name, loader_type = config['model_name'], config['loader']
        # Check if the model version is pinned
        revision = config.pop("revision", None)
        if loader_type == 'sentence-transformers':
            if revision is not None:
                print(
                    'Info: Sentence-Transformers do not support fixed model versions yet'  # NOQA:E501
                )
            model = load_sentence_transformers(model_name=model_name)
            return model
        elif loader_type == 'huggingface':
            tokenizer_name = config.pop('tokenizer_name', None)
            tokenizer, model = load_auto_model_for_text_classification(
                model_name=model_name,
                tokenizer_name=tokenizer_name,
                revision=revision)
            return tokenizer, model
        else:
            raise KeyError(f'Loader {loader_type} not supported yet.')

    def list_current_model_in_use(self, language='all', metric='all'):
        '''
        List the models currently in use, printed as one row per
        (language, metric) pair.

        Args:
            language: The abbreviation name of language, or 'all'
            metric: The evaluation metric name, or 'all'
        '''
        df = pd.DataFrame.from_records(
            [(lang, metric_name, key, value)
             for lang, lang_model_settings in self.config.items()
             for metric_name, model_settings in lang_model_settings.items()
             for key, value in model_settings.items()],
            columns=['language', 'metric_name', 'attribute', 'value'])

        # The pivot generates a dataframe shaped like:
        # | language | metric_name | loader | model_name | revision | ... |
        # Bug fix: the original chained .drop(columns=["attribute"]) — but
        # "attribute" is the column-axis *name*, not a column, so that call
        # raised KeyError; the original also force-assigned exactly five
        # column names, which broke whenever a section had a
        # 'tokenizer_name' or lacked a 'revision'.
        df_pivot = df.pivot_table(index=['language', 'metric_name'],
                                  columns='attribute',
                                  values='value',
                                  aggfunc='first').reset_index()
        df_pivot.columns.name = None

        # NOTE(review): only the 'all'/'all' branch of this method is visible
        # in the patch series; the specific-language/metric branches are
        # reconstructed here as simple row filters — confirm against the
        # original implementation.
        if language != 'all':
            df_pivot = df_pivot[df_pivot.language == language]
        if metric != 'all':
            df_pivot = df_pivot[df_pivot.metric_name == metric]
        pprint(df_pivot)

    def validate_config(self, language='all', metric='all'):
        '''
        Validate the configuration (model/loader presence and, for Hugging
        Face models, availability on the Hub).

        Args:
            language: The name of the language. Defaults to 'all'.
            metric: The name of the metric. Defaults to 'all'.

        Returns:
            True when every selected section passes validation.

        Raises:
            KeyError: If 'model_name' or 'loader' is missing from a section.
            ValueError: If a model cannot be found on the Hugging Face Hub,
                or the loader type is unknown.
        '''

        def check_model_availability(model_name, revision):
            # The Hub API answers 200 iff the model (and, when given, the
            # pinned revision) exists.
            if revision is None:
                url = f"https://huggingface.co/api/models/{model_name}"
            else:
                url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}"  # NOQA:E501
            response = requests.get(url)
            return response.status_code == 200

        # Work on a deep copy so the pop() calls below never mutate
        # self.config.
        config = deepcopy(self.config)
        for lang, lang_setting in config.items():
            if language == 'all' or lang == language:
                for metric_name, model_setting in lang_setting.items():
                    if metric == 'all' or metric_name == metric:
                        # If model name not set
                        if 'model_name' not in model_setting:
                            raise KeyError(
                                f'{lang} metrics {metric_name} need a model, but found None!'  # NOQA:E501
                            )
                        if 'loader' not in model_setting:
                            raise KeyError(
                                f'Metrics {metric_name} need a loader, but found None!'  # NOQA:E501
                            )
                        # Check if the model and revision are available on
                        # the Hugging Face Hub
                        loader_type = model_setting.pop('loader')
                        if loader_type == 'huggingface':
                            model_name = model_setting.pop('model_name')
                            revision = model_setting.pop('revision', None)
                            if not check_model_availability(
                                    model_name, revision):
                                raise ValueError(
                                    f'Cannot find {model_name} with revision {revision} on the Hugging Face Hub'  # NOQA:E501
                                )
                        elif loader_type not in VALID_LOADER:
                            raise ValueError(
                                f'loader type should in {VALID_LOADER}')
                        # TODO: May also need other validations for other
                        # loaders — none found yet
        print('Configuration Validation Passed')
        # Bug fix: return True so callers can gate on successful validation.
        # The original implicitly returned None, which made the
        # `if self.validate_config(...)` branch in set_model_for_metric dead
        # code — the lru_cache was never cleared after a config change.
        return True

    def set_model_for_metric(self, language: str, metric: str, model_name: str,
                             loader: Optional[str], **kwargs):
        '''
        Set model for specified metric in specified language.

        Args:
            language: The name of the language
            metric: The name of the evaluation metrics
            model_name: The name of the model
            loader: The loader of the model
            tokenizer_name: (Optional) the name of the tokenizer
            revision: (Optional) a version string of the model

        Raises:
            ValueError: If the language/metric is not supported or the new
                settings fail validation; the previous configuration is
                restored before the error propagates.
            KeyError: Propagated from validation; configuration is restored.
        '''
        # Keep a snapshot so any failure below can roll back atomically.
        config_copy = deepcopy(self.config)
        try:
            if language not in VALID_LANGUAGE:
                # Bug fix: these two messages were plain strings missing the
                # f-prefix, so the literal text '{language}' was raised.
                raise ValueError(f'Language {language} not supported yet')

            if metric not in self.config[language]:
                raise ValueError(
                    f'Language {language} not supported {metric} yet')

            config = self.config[language][metric]
            config['loader'] = loader
            config['model_name'] = model_name
            # If tokenizer_name is different from model_name
            tokenizer_name = kwargs.pop('tokenizer_name', None)
            if tokenizer_name:
                config['tokenizer_name'] = tokenizer_name
            # If model's revision is pinned
            revision = kwargs.pop('revision', None)
            if revision:
                config['revision'] = revision
            # Validate the change
            if self.validate_config(language=language, metric=metric):
                # Clear the LRU cache to make the config change reflected
                # immediately
                self.fetch_model.cache_clear()
        except (ValueError, KeyError) as err:
            # Roll back to the previous configuration
            self.config = config_copy
            raise err
language: str, metric: str)\ - -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], - SentenceTransformer]: + def fetch_model( + self, language: str, metric: str + ) -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], + SentenceTransformer]: ''' Return the model used for the given metric and language. From 2aacbf263e778c864a7c0d747d081d96f952f564 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 27 Dec 2023 05:13:32 +0000 Subject: [PATCH 23/66] fix ref based and source based format --- src/langcheck/metrics/zh/reference_based_text_quality.py | 6 ++++-- src/langcheck/metrics/zh/source_based_text_quality.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 72d3b09e..a8fd4d8a 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -95,8 +95,10 @@ def semantic_similarity( model = _model_manager.fetch_model(language='zh', metric_type="semantic_similarity") - generated_embeddings = model.encode(generated_outputs) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 - reference_embeddings = model.encode(reference_outputs) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 + # For type checking + assert isinstance(model, SentenceTransformer) + generated_embeddings = model.encode(generated_outputs) + reference_embeddings = model.encode(reference_outputs) cosine_scores = util.pairwise_cos_sim( generated_embeddings, # type: ignore[reportGeneralTypeIssues] reference_embeddings # type: ignore[reportGeneralTypeIssues] diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 718a19e4..17766cc6 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -87,7 +87,8 @@ def 
factual_consistency( global _factual_consistency_translation_pipeline if _factual_consistency_translation_pipeline is None: from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(language='zh', metric_type='factual') # NOQA: E501 + tokenizer, model = _model_manager.fetch_model(language='zh', + metric_type='factual') _factual_consistency_translation_pipeline = pipeline( 'translation', model=model, tokenizer=tokenizer) # type: ignore From 6d0a5306c247312dbdf6febe1c855e1a789d2656 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 27 Dec 2023 05:17:15 +0000 Subject: [PATCH 24/66] fix format in ref free --- .../metrics/zh/reference_free_text_quality.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index 984d7c4b..c5832b85 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -90,13 +90,17 @@ def sentiment( global _sentiment_model_path _sentiment_pipeline = pipeline( - 'sentiment-analysis', model=_sentiment_model_path - ) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 - # # {0:"Negative", 1:'Positive'} + 'sentiment-analysis', + model=_sentiment_model_path) # type: ignore[reportGeneralTypeIssues] + # {0:"Negative", 1:'Positive'} from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(lanaguage='zh', metric='sentiment') # NOQA: E501 + tokenizer, model = _model_manager.fetch_model(lanaguage='zh', + metric='sentiment') _sentiment_pipeline = pipeline( - 'sentiment-analysis', model=model, tokenizer=tokenizer) # type: ignore[reportGeneralTypeIssues] # NOQA: E501 + 'sentiment-analysis', + model=model, # type: ignore[reportGeneralTypeIssues] + tokenizer=tokenizer # type: ignore[reportGeneralTypeIssues] + ) _model_id2label = 
_sentiment_pipeline.model.config.id2label _predict_result = _sentiment_pipeline( generated_outputs @@ -218,10 +222,11 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: tokenizer, model = _model_manager.fetch_model(language='zh', metric_type="toxicity") - _toxicity_pipeline = pipeline('text-classification', - model=model, - tokenizer=tokenizer, # type: ignore[reportOptionalIterable] # NOQA: E501 - top_k=5) + _toxicity_pipeline = pipeline( + 'text-classification', + model=model, # type: ignore[reportOptionalIterable] + tokenizer=tokenizer, # type: ignore[reportOptionalIterable] + top_k=5) # {'Normal': 0, 'Pulp': 1, 'Sex': 2, 'Other Risk': 3, 'Adult': 4} _model_id2label = _toxicity_pipeline.model.config.id2label From b2805d6e81374a1927c61088e3173b30ae34e63d Mon Sep 17 00:00:00 2001 From: vela Date: Thu, 28 Dec 2023 01:37:22 +0900 Subject: [PATCH 25/66] add dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f7f2c1a0..adc4b31b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,9 @@ dependencies = [ 'tokenizers >= 0.13.2; python_version >= "3.11"', # See https://github.com/citadel-ai/langcheck/pull/45 'torch >= 2', 'transformers >= 4.6', - "unidic-lite >= 1.0.1" # For tokenizer of metrics.ja.toxicity() + "unidic-lite >= 1.0.1", # For tokenizer of metrics.ja.toxicity() + "tabulate >= 0.9.0", # For model manager paint table + "configobj >= 5.0.8" # For model manager manage config file ] requires-python = ">=3.8" From bc443a6cdd6458bb9d120b31010303f5cbdf03d8 Mon Sep 17 00:00:00 2001 From: vela Date: Thu, 28 Dec 2023 01:38:27 +0900 Subject: [PATCH 26/66] fix model manager plot table problem --- src/langcheck/metrics/_model_management.py | 81 +++++++++++----------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/_model_management.py index 1fab5a0c..eeb50d51 100644 --- 
a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/_model_management.py @@ -2,7 +2,7 @@ from copy import deepcopy from functools import lru_cache from pathlib import Path -from pprint import pprint +from tabulate import tabulate from typing import Optional, Tuple, Union import pandas as pd @@ -85,7 +85,7 @@ def fetch_model( else: raise KeyError(f'Metric {metric} not supported yet.') else: - raise KeyError(f'language {language} not supported yet') + raise KeyError(f'Language {language} not supported yet') def list_current_model_in_use(self, language='all', metric='all'): ''' @@ -101,27 +101,28 @@ def list_current_model_in_use(self, language='all', metric='all'): for metric_name, model_settings in lang_model_settings.items() for key, value in model_settings.items()], columns=['language', 'metric_name', 'attribute', 'value']) - # The code below would generate a dataframe: # |index| language | metric_name | loader | model_name | revision | # |.....|..........|.............|........|............|..........| df_pivot = df.pivot_table(index=['language', 'metric_name'], columns="attribute", values="value", - aggfunc='first').reset_index().drop( - columns=["attribute"]).reset_index() + aggfunc='first').reset_index().rename_axis( + None, axis=1) df_pivot.columns = [ 'language', 'metric_name', 'loader', 'model_name', 'revision' ] if language == 'all' and metric == 'all': - pprint(df_pivot) + print(tabulate(df_pivot, headers=df_pivot.columns, + tablefmt="github")) else: if language != "all": df_pivot = df_pivot.loc[df_pivot.language == language] if metric != 'all': df_pivot = df_pivot.loc[df_pivot.metric_name == metric] - pprint(df_pivot) + print(tabulate(df_pivot, headers=df_pivot.columns, + tablefmt="github")) def validate_config(self, language='all', metric='all'): ''' @@ -142,34 +143,36 @@ def check_model_availability(model_name, revision): config = deepcopy(self.config) for lang, lang_setting in config.items(): - if language == 'all' or lang == 
language: - for metric_name, model_setting in lang_setting.items(): - if metric == 'all' or metric_name == metric: - # If model name not set - if 'model_name' not in model_setting: - raise KeyError( - f'{lang} metrics {metric_name} need a model, but found None!' # NOQA:E501 - ) - if 'loader' not in model_setting: - raise KeyError( - f'Metrics {metric_name} need a loader, but found None!' # NOQA:E501 - ) - # Check if the model and revision is available on - # Hugging Face Hub - loader_type = model_setting.pop('loader') - if loader_type == 'huggingface': - model_name = model_setting.pop('model_name') - revision = model_setting.pop('revision', None) - if not check_model_availability( - model_name, revision): - raise ValueError( - f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 - ) - elif loader_type not in VALID_LOADER: - raise ValueError( - f'loader type should in {VALID_LOADER}') - # TODO: May also need other validations for other loader - # not found yet + if language != 'all' and lang != language: + continue + for metric_name, model_setting in lang_setting.items(): + if metric != 'all' and metric_name != metric: + continue + # If model name not set + if 'model_name' not in model_setting: + raise KeyError( + f'{lang} metrics {metric_name} need a model, but found None!' # NOQA:E501 + ) + if 'loader' not in model_setting: + raise KeyError( + f'Metrics {metric_name} need a loader, but found None!' 
# NOQA:E501 + ) + # Check if the model and revision is available on + # Hugging Face Hub + loader_type = model_setting.pop('loader') + if loader_type == 'huggingface': + model_name = model_setting.pop('model_name') + revision = model_setting.pop('revision', None) + if not check_model_availability( + model_name, revision): + raise ValueError( + f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 + ) + elif loader_type not in VALID_LOADER: + raise ValueError( + f'loader type should in {VALID_LOADER}') + # TODO: May also need other validations for other loader + # not found yet print('Configuration Validation Passed') def set_model_for_metric(self, language: str, metric: str, model_name: str, @@ -182,16 +185,16 @@ def set_model_for_metric(self, language: str, metric: str, model_name: str, metric: The name of the evaluation metrics model_name: The name of the model loader: The loader of the model - tokenizer_name: (Optional) the name of the tokenizer - revision: (Optional) a version string of the model + tokenizer_name: (Optional) The name of the tokenizer + revision: (Optional) A version string of the model ''' config_copy = deepcopy(self.config) try: if language not in VALID_LANGUAGE: - raise ValueError('Language {language} not supported yet') + raise KeyError('Language {language} not supported yet') if metric not in self.config[language]: - raise ValueError( + raise KeyError( 'Language {language} not supported {metric} yet') config = self.config[language][metric] From 4433da1afc441ec6d9df8c254223358ec2620deb Mon Sep 17 00:00:00 2001 From: vela Date: Fri, 29 Dec 2023 04:11:37 +0900 Subject: [PATCH 27/66] fix typo mistakes --- src/langcheck/metrics/_model_loader.py | 2 ++ src/langcheck/metrics/zh/reference_free_text_quality.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/langcheck/metrics/_model_loader.py b/src/langcheck/metrics/_model_loader.py index 8dcff6ed..3eb74d59 100644 --- 
a/src/langcheck/metrics/_model_loader.py +++ b/src/langcheck/metrics/_model_loader.py @@ -28,6 +28,8 @@ def load_auto_model_for_text_classification( tokenizer_name: The name of a tokenizer on Hugging Face revision: The shortened sha1 string of a model ''' + if tokenizer_name is None: + tokenizer_name = model_name tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) model = AutoModelForSequenceClassification.from_pretrained( model_name, revision=revision) # NOQA: E501 diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index c5832b85..caf5e968 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -94,7 +94,7 @@ def sentiment( model=_sentiment_model_path) # type: ignore[reportGeneralTypeIssues] # {0:"Negative", 1:'Positive'} from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(lanaguage='zh', + tokenizer, model = _model_manager.fetch_model(language='zh', metric='sentiment') _sentiment_pipeline = pipeline( 'sentiment-analysis', @@ -220,7 +220,7 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: # the output format is List[List[Dict(str)]] from langcheck.metrics import _model_manager tokenizer, model = _model_manager.fetch_model(language='zh', - metric_type="toxicity") + metric="toxicity") _toxicity_pipeline = pipeline( 'text-classification', From c540f12fe2f27f18b759fd70557dba48fdb988f7 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 5 Feb 2024 17:54:01 +0900 Subject: [PATCH 28/66] move and update model manager --- src/langcheck/metrics/_model_loader.py | 47 ---- .../metrics/config/metric_config.ini | 22 -- .../metrics/model_manager/__init__.py | 13 + .../metrics/model_manager/_model_loader.py | 66 +++++ .../{ => model_manager}/_model_management.py | 229 +++++++++--------- 5 files changed, 198 insertions(+), 179 deletions(-) delete mode 
100644 src/langcheck/metrics/_model_loader.py delete mode 100644 src/langcheck/metrics/config/metric_config.ini create mode 100644 src/langcheck/metrics/model_manager/__init__.py create mode 100644 src/langcheck/metrics/model_manager/_model_loader.py rename src/langcheck/metrics/{ => model_manager}/_model_management.py (58%) diff --git a/src/langcheck/metrics/_model_loader.py b/src/langcheck/metrics/_model_loader.py deleted file mode 100644 index 3eb74d59..00000000 --- a/src/langcheck/metrics/_model_loader.py +++ /dev/null @@ -1,47 +0,0 @@ -from typing import Optional, Tuple - -from sentence_transformers import SentenceTransformer -from transformers.models.auto.modeling_auto import \ - AutoModelForSequenceClassification -from transformers.models.auto.tokenization_auto import AutoTokenizer -from transformers.pipelines import pipeline - - -def load_sentence_transformers(model_name: str) -> SentenceTransformer: - ''' - Return a Hugging Face sentence-transformer model. - - Args: - model_name: The name of a sentence-transformer model - ''' - return SentenceTransformer(model_name) - - -def load_auto_model_for_text_classification( - model_name: str, tokenizer_name: Optional[str], revision: Optional[str] -) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: - ''' - Return a Hugging Face sequence-classification model. - - Args: - model_name: The name of a sequence-classification model on Hugging Face - tokenizer_name: The name of a tokenizer on Hugging Face - revision: The shortened sha1 string of a model - ''' - if tokenizer_name is None: - tokenizer_name = model_name - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) - model = AutoModelForSequenceClassification.from_pretrained( - model_name, revision=revision) # NOQA: E501 - return tokenizer, model - - -def load_pipeline_for_text_classification(model_name: str, **kwargs): - ''' - Return a Hugging Face text-classification pipeline. 
- - Args: - model_name: The name of a text-classification pipeline on Hugging Face - ''' - top_k = kwargs.pop('top_k', None) - return pipeline('text-classification', model=model_name, top_k=top_k) diff --git a/src/langcheck/metrics/config/metric_config.ini b/src/langcheck/metrics/config/metric_config.ini deleted file mode 100644 index 4d8ee985..00000000 --- a/src/langcheck/metrics/config/metric_config.ini +++ /dev/null @@ -1,22 +0,0 @@ -[zh] -[[semantic_similarity]] - # According to the C-MTEB Benchmark - # (https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB) - # the 3 models of different sizes provided BAAI are the best on the - # embedding task - # Ref: https://huggingface.co/BAAI/bge-base-zh-v1.5 - # Using this model, it is hard to find two sentence where cos_sim < 0.25. - model_name = BAAI/bge-base-zh-v1.5 - revision = f03589c - loader = sentence-transformers -[[sentiment]] - model_name = IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment - loader = huggingface -[[toxicity]] - model_name = alibaba-pai/pai-bert-base-zh-llm-risk-detection - loader = huggingface - revision = 0a61c79744cb0173216f015ffecc1ea81c4e0229 -[[factual_consistency]] - model_name = Helsinki-NLP/opus-mt-zh-en - loader = huggingface - revision = cf109095479db38d6df799875e34039d4938aaa6 diff --git a/src/langcheck/metrics/model_manager/__init__.py b/src/langcheck/metrics/model_manager/__init__.py new file mode 100644 index 00000000..d489012d --- /dev/null +++ b/src/langcheck/metrics/model_manager/__init__.py @@ -0,0 +1,13 @@ +from ._model_management import ModelManager +from ._model_loader import (load_sentence_transformers, + load_auto_model_for_seq2seq, + load_auto_model_for_text_classification) + +manager = ModelManager() + +__all__ = [ + "manager", + "load_sentence_transformers", + "load_auto_model_for_seq2seq", + "load_auto_model_for_text_classification" +] diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py new file 
mode 100644 index 00000000..a6c97aa1 --- /dev/null +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -0,0 +1,66 @@ +from typing import Optional, Tuple + +from sentence_transformers import SentenceTransformer +from transformers.models.auto.modeling_auto import (AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification) # NOQA:E501 + +from transformers.models.auto.tokenization_auto import AutoTokenizer +from transformers.pipelines import pipeline + + +def load_sentence_transformers(model_name: str, + tokenizer_name: Optional[str] = None, + revision: Optional[str] = None) -> SentenceTransformer: # NOQA:E501 + ''' + Return a sequence embeddiing model parsed by sentence-transformer library. + + Args: + model_name: The name of a sentence-transformer model + ''' + if revision is not None: + print("Version Pined not supported in Sentence-Transformers yet.") + + if tokenizer_name is not None: + print("Tokenizer customize not supported in Sentence-Transformers yet.") + + return SentenceTransformer(model_name) + + +def load_auto_model_for_text_classification( + model_name: str, tokenizer_name: Optional[str] = None, + revision: Optional[str] = None +) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: + ''' + Return a sequence classification model on huggingface hub. 
+ + Args: + model_name: The name of a sequence-classification model on Hugging Face + tokenizer_name: The name of a tokenizer on Hugging Face + revision: The shortened sha1 string of a model + ''' + if tokenizer_name is None: + tokenizer_name = model_name + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) + model = AutoModelForSequenceClassification.from_pretrained( + model_name, revision=revision) # NOQA: E501 + return tokenizer, model + + +def load_auto_model_for_seq2seq( + model_name: str, tokenizer_name: Optional[str] = None, + revision: Optional[str] = None +) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: + ''' + Return a sequence to sequence model availble on huggingface hub. + + Args: + model_name: The name of a sequence-classification model on Hugging Face + tokenizer_name: The name of a tokenizer on Hugging Face + revision: The shortened sha1 string of a model + ''' + if tokenizer_name is None: + tokenizer_name = model_name + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_name, revision=revision) # NOQA: E501 + return tokenizer, model diff --git a/src/langcheck/metrics/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py similarity index 58% rename from src/langcheck/metrics/_model_management.py rename to src/langcheck/metrics/model_manager/_model_management.py index eeb50d51..e6c2aa50 100644 --- a/src/langcheck/metrics/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -5,23 +5,42 @@ from tabulate import tabulate from typing import Optional, Tuple, Union -import pandas as pd import requests -from configobj import ConfigObj +import pandas as pd +from omegaconf import OmegaConf + from sentence_transformers import SentenceTransformer -from transformers.models.auto.modeling_auto import \ - AutoModelForSequenceClassification +from transformers.models.auto.modeling_auto 
import (AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification) # NOQA:E501 from transformers.models.auto.tokenization_auto import AutoTokenizer -from ._model_loader import (load_auto_model_for_text_classification, - load_sentence_transformers) - -# TODO: Use a ENUM class to parse these -VALID_METRIC_NAME = [ - 'factual_consistency', 'toxicity', 'sentiment', 'semantic_similarity' -] +from ._model_loader import (load_sentence_transformers, + load_auto_model_for_text_classification, + load_auto_model_for_seq2seq) + +LOADER_MAP = { + "load_sentence_transformers": load_sentence_transformers, + "load_auto_model_for_text_classification": load_auto_model_for_text_classification, # NOQA:E501 + "load_auto_model_for_seq2seq": load_auto_model_for_seq2seq +} +VALID_LOADER_FUNCTION = LOADER_MAP.keys() # NOQA:E501 +VALID_METRICS = ['semantic_similarity', 'sentiment', + 'toxicity', 'factual_consistency'] + +VALID_METRIC_ATTRIBUTE = ['model_revision', 'model_revision', + 'loader', 'tokenizer_name'] VALID_LANGUAGE = ['zh'] -VALID_LOADER = ['huggingface', 'sentence-transformers'] + + +def check_model_availability(model_name: str, + revision: Optional[str]): + # TODO: add local cached model availability check for offline environment + if revision is None: + url = f"https://huggingface.co/api/models/{model_name}" + else: + url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" # NOQA:E501 + response = requests.get(url, timeout=(1.0, 1.0)) + return response.status_code == 200 class ModelManager: @@ -37,18 +56,29 @@ def __init__(self): Initializes the ModelConfig with empty model dictionaries for each language. 
''' - self.__init__config() - self.validate_config() - - def __init__config(self): + self.config = OmegaConf.create() cwd = os.path.dirname(__file__) - self.config = ConfigObj( - os.path.join(Path(cwd), 'config', 'metric_config.ini')) + default_config_file_path = os.path.join(cwd, "config", + "metric_config.yaml") + self.__load_config(default_config_file_path) + + def __load_config(self, path: str): + conf = OmegaConf.load(path) + + for lang, lang_conf in conf.items(): + for metric_name, metric_conf in lang_conf.items(): + # check model availbility, if key not in conf + # omega conf will return None in default + self.__set_model_for_metric( + language=lang, metric=metric_name, + **metric_conf) + print('Configuration Load Successed!') @lru_cache def fetch_model( self, language: str, metric: str ) -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], + Tuple[AutoTokenizer, AutoModelForSeq2SeqLM], SentenceTransformer]: ''' Return the model used for the given metric and language. @@ -63,68 +93,16 @@ def fetch_model( # not affect the original `self.config`. 
config = deepcopy(self.config[language][metric]) # Get model name, model loader type - model_name, loader_type = config['model_name'], config['loader'] - # Check if the model version is fixed - revision = config.pop("revision", None) - if loader_type == 'sentence-transformers': - if revision is not None: - print( - 'Info: Sentence-Transformers do not support fixed model versions yet' # NOQA:E501 - ) - model = load_sentence_transformers(model_name=model_name) - return model - elif loader_type == 'huggingface': - tokenizer_name = config.pop('tokenizer_name', None) - tokenizer, model = load_auto_model_for_text_classification( - model_name=model_name, - tokenizer_name=tokenizer_name, - revision=revision) - return tokenizer, model - else: - raise KeyError(f'Loader {loader_type} not supported yet.') + loader_func = config.pop('loader_func') + loader = LOADER_MAP[loader_func] + return loader(**config) else: raise KeyError(f'Metric {metric} not supported yet.') else: raise KeyError(f'Language {language} not supported yet') - def list_current_model_in_use(self, language='all', metric='all'): - ''' - List the models currently in use. 
- - Args: - language: The abbrevation name of language - metric: The evaluation metric name - ''' - df = pd.DataFrame.from_records( - [(lang, metric_name, key, value) - for lang, lang_model_settings in self.config.items() - for metric_name, model_settings in lang_model_settings.items() - for key, value in model_settings.items()], - columns=['language', 'metric_name', 'attribute', 'value']) - # The code below would generate a dataframe: - # |index| language | metric_name | loader | model_name | revision | - # |.....|..........|.............|........|............|..........| - df_pivot = df.pivot_table(index=['language', 'metric_name'], - columns="attribute", - values="value", - aggfunc='first').reset_index().rename_axis( - None, axis=1) - df_pivot.columns = [ - 'language', 'metric_name', 'loader', 'model_name', 'revision' - ] - - if language == 'all' and metric == 'all': - print(tabulate(df_pivot, headers=df_pivot.columns, - tablefmt="github")) - else: - if language != "all": - df_pivot = df_pivot.loc[df_pivot.language == language] - if metric != 'all': - df_pivot = df_pivot.loc[df_pivot.metric_name == metric] - print(tabulate(df_pivot, headers=df_pivot.columns, - tablefmt="github")) - - def validate_config(self, language='all', metric='all'): + @staticmethod + def validate_config(config, language='all', metric='all'): ''' Validate configuration. @@ -132,16 +110,7 @@ def validate_config(self, language='all', metric='all'): language: The name of the language. Defaults to 'all'. metric: The name of the metric. Defaults to 'all'. 
''' - - def check_model_availability(model_name, revision): - if revision is None: - url = f"https://huggingface.co/api/models/{model_name}" - else: - url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" # NOQA:E501 - response = requests.get(url) - return response.status_code == 200 - - config = deepcopy(self.config) + config = deepcopy(config) for lang, lang_setting in config.items(): if language != 'all' and lang != language: continue @@ -153,30 +122,25 @@ def check_model_availability(model_name, revision): raise KeyError( f'{lang} metrics {metric_name} need a model, but found None!' # NOQA:E501 ) - if 'loader' not in model_setting: + if 'loader_func' not in model_setting: raise KeyError( f'Metrics {metric_name} need a loader, but found None!' # NOQA:E501 ) # Check if the model and revision is available on # Hugging Face Hub - loader_type = model_setting.pop('loader') - if loader_type == 'huggingface': - model_name = model_setting.pop('model_name') - revision = model_setting.pop('revision', None) - if not check_model_availability( - model_name, revision): - raise ValueError( - f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 - ) - elif loader_type not in VALID_LOADER: + model_name = model_setting.pop('model_name') + revision = model_setting.pop('revision', None) + loader_func = model_setting.pop('loader_func', None) + if loader_func not in VALID_LOADER_FUNCTION: + raise ValueError( + f'loader type should in {VALID_LOADER_FUNCTION}') + if not check_model_availability(model_name, revision): raise ValueError( - f'loader type should in {VALID_LOADER}') - # TODO: May also need other validations for other loader - # not found yet - print('Configuration Validation Passed') + f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 + ) - def set_model_for_metric(self, language: str, metric: str, model_name: str, - loader: Optional[str], **kwargs): + def __set_model_for_metric(self, language: str, 
metric: str, + model_name: str, loader_func: str, **kwargs): ''' Set model for specified metric in specified language. @@ -193,23 +157,31 @@ def set_model_for_metric(self, language: str, metric: str, model_name: str, if language not in VALID_LANGUAGE: raise KeyError('Language {language} not supported yet') - if metric not in self.config[language]: + if metric not in VALID_METRICS: raise KeyError( 'Language {language} not supported {metric} yet') - config = self.config[language][metric] - config['loader'] = loader - config['model_name'] = model_name + # initialize configuration structure if it is empty. + if self.config.get(language) is None: + self.config[language] = {} + if self.config.get(language).get(metric) is None: + self.config[language][metric] = {} + + detail_config = self.config[language][metric] + # set metric attribute + detail_config['loader_func'] = loader_func + detail_config['model_name'] = model_name # If tokenizer_name is different from model_name tokenizer_name = kwargs.pop('tokenizer_name', None) if tokenizer_name: - config['tokenizer_name'] = tokenizer_name + detail_config['tokenizer_name'] = tokenizer_name # If model's revision is pinned - revision = kwargs.pop('revision', None) + revision = kwargs.pop('model_revision', None) if revision: - config['revision'] = revision + detail_config['revision'] = revision # Validate the change - if self.validate_config(language=language, metric=metric): + if ModelManager.validate_config(self.config, + language=language, metric=metric): # Clear the LRU cache to make the config change reflected # immediately self.fetch_model.cache_clear() @@ -217,3 +189,40 @@ def set_model_for_metric(self, language: str, metric: str, model_name: str, # Trace back the configuration self.config = config_copy raise err + + def list_current_model_in_use(self, language='all', metric='all'): + ''' + List the models currently in use. 
+ + Args: + language: The abbrevation name of language + metric: The evaluation metric name + ''' + df = pd.DataFrame.from_records( + [(lang, metric_name, key, value) + for lang, lang_model_settings in self.config.items() + for metric_name, model_settings in lang_model_settings.items() + for key, value in model_settings.items()], + columns=['language', 'metric_name', 'attribute', 'value']) + # The code below would generate a dataframe: + # |index| language | metric_name | loader | model_name | revision | + # |.....|..........|.............|........|............|..........| + df_pivot = df.pivot_table(index=['language', 'metric_name'], + columns="attribute", + values="value", + aggfunc='first').reset_index().rename_axis( + None, axis=1) + df_pivot.columns = [ + 'language', 'metric_name', 'loader', 'model_name', 'revision' + ] + + if language == 'all' and metric == 'all': + print(tabulate(df_pivot, headers=df_pivot.columns, + tablefmt="github")) + else: + if language != "all": + df_pivot = df_pivot.loc[df_pivot.language == language] + if metric != 'all': + df_pivot = df_pivot.loc[df_pivot.metric_name == metric] + print(tabulate(df_pivot, headers=df_pivot.columns, + tablefmt="github")) \ No newline at end of file From ea7564baf5d03a018f58f9231db9ac1d18f52c30 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 5 Feb 2024 17:55:06 +0900 Subject: [PATCH 29/66] use yaml file for read ease --- src/langcheck/metrics/__init__.py | 5 ++--- .../model_manager/config/metric_config.yaml | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 src/langcheck/metrics/model_manager/config/metric_config.yaml diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 0479c2ce..fe767faf 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,5 +1,5 @@ from langcheck.metrics import en, ja, zh -from langcheck.metrics._model_management import ModelManager +from langcheck.metrics import 
model_manager from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) from langcheck.metrics.en.reference_free_text_quality import ( @@ -14,8 +14,6 @@ is_json_array, is_json_object, matches_regex, validation_fn) -_model_manager = ModelManager() - __all__ = [ 'en', 'ja', @@ -42,4 +40,5 @@ 'semantic_similarity', 'sentiment', 'toxicity', + 'model_manager' ] diff --git a/src/langcheck/metrics/model_manager/config/metric_config.yaml b/src/langcheck/metrics/model_manager/config/metric_config.yaml new file mode 100644 index 00000000..be8d605a --- /dev/null +++ b/src/langcheck/metrics/model_manager/config/metric_config.yaml @@ -0,0 +1,19 @@ +zh: + semantic_similarity: + model_name: BAAI/bge-base-zh-v1.5 + model_revision: f03589c + loader_func: load_sentence_transformers + + sentiment: + model_name: IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment + loader_func: load_auto_model_for_text_classification + + toxicity: + model_name: alibaba-pai/pai-bert-base-zh-llm-risk-detection + model_revision: 0a61c79744cb0173216f015ffecc1ea81c4e0229 + loader_func: load_auto_model_for_text_classification + + factual_consistency: + model_name: Helsinki-NLP/opus-mt-zh-en + model_revision: cf109095479db38d6df799875e34039d4938aaa6 + loader_func: load_auto_model_for_seq2seq \ No newline at end of file From a1120dbbeada1d7df406003a80a5ffc3c2f62e04 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 5 Feb 2024 17:55:49 +0900 Subject: [PATCH 30/66] apply manager changes to zh metric --- .../metrics/zh/reference_based_text_quality.py | 6 +++--- .../metrics/zh/reference_free_text_quality.py | 11 +++++------ src/langcheck/metrics/zh/source_based_text_quality.py | 6 +++--- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index a8fd4d8a..338c4723 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ 
b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -91,9 +91,9 @@ def semantic_similarity( metric_value.language = 'zh' return metric_value # lazy import - from langcheck.metrics import _model_manager - model = _model_manager.fetch_model(language='zh', - metric_type="semantic_similarity") + from langcheck.metrics.model_manager import manager + model = manager.fetch_model(language='zh', + metric="semantic_similarity") # For type checking assert isinstance(model, SentenceTransformer) diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index caf5e968..606dae68 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -93,9 +93,8 @@ def sentiment( 'sentiment-analysis', model=_sentiment_model_path) # type: ignore[reportGeneralTypeIssues] # {0:"Negative", 1:'Positive'} - from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(language='zh', - metric='sentiment') + from langcheck.metrics.model_manager import manager + tokenizer, model = manager.fetch_model(language='zh', metric='sentiment') _sentiment_pipeline = pipeline( 'sentiment-analysis', model=model, # type: ignore[reportGeneralTypeIssues] @@ -218,9 +217,9 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: global _toxicity_model_path # this pipeline output predict probability for each text on each label. 
# the output format is List[List[Dict(str)]] - from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(language='zh', - metric="toxicity") + from langcheck.metrics.model_manager import manager + tokenizer, model = manager.fetch_model(language='zh', + metric="toxicity") _toxicity_pipeline = pipeline( 'text-classification', diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 17766cc6..a643a799 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -86,9 +86,9 @@ def factual_consistency( global _factual_consistency_translation_pipeline if _factual_consistency_translation_pipeline is None: - from langcheck.metrics import _model_manager - tokenizer, model = _model_manager.fetch_model(language='zh', - metric_type='factual') + from langcheck.metrics.model_manager import manager + tokenizer, model = manager.fetch_model(language='zh', + metric='factual_consistency') # NOQA:E501 _factual_consistency_translation_pipeline = pipeline( 'translation', model=model, tokenizer=tokenizer) # type: ignore From b1718d62a3016302c1492492af2b44f0a6a13f32 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 5 Feb 2024 17:56:52 +0900 Subject: [PATCH 31/66] update environment settings and delete test case --- pyproject.toml | 2 +- tests/metrics/test_model_management.py | 53 -------------------------- 2 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 tests/metrics/test_model_management.py diff --git a/pyproject.toml b/pyproject.toml index adc4b31b..18de76c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ 'transformers >= 4.6', "unidic-lite >= 1.0.1", # For tokenizer of metrics.ja.toxicity() "tabulate >= 0.9.0", # For model manager paint table - "configobj >= 5.0.8" # For model manager manage config file + "omegaconf >= 2.3.0" # For model manager paint table ] 
requires-python = ">=3.8" diff --git a/tests/metrics/test_model_management.py b/tests/metrics/test_model_management.py deleted file mode 100644 index 62a7f025..00000000 --- a/tests/metrics/test_model_management.py +++ /dev/null @@ -1,53 +0,0 @@ -from unittest.mock import mock_open, patch - -from langcheck.metrics._model_management import ModelConfig - - -def test_initialization_with_mock_file(): - try: - mock_file_content = "[zh]\nsemantic_similarity=test_model\n" - with patch('builtins.open', mock_open(read_data=mock_file_content)): - config = ModelConfig() - assert config.model_config['zh'][ - 'semantic_similarity'] == 'test_model' # NOQA:E501 - except AssertionError as err: - raise err - - -def test_get_metric_model_with_mock_file(): - try: - mock_file_content = "[zh]\nsemantic_similarity=test_model\n" - with patch('builtins.open', mock_open(read_data=mock_file_content)): - config = ModelConfig() - model_name = config.get_metric_model( - language='zh', metric_type='semantic_similarity') # NOQA:E501 - assert model_name == 'test_model' - except AssertionError as err: - raise err - - -def test_list_metric_model_with_mock_file(capsys): - try: - mock_file_content = "[zh]\nsemantic_similarity=test_model\n" - with patch('builtins.open', mock_open(read_data=mock_file_content)): - config = ModelConfig() - config.list_metric_model(language='zh', - metric_type='semantic_similarity') - captured = capsys.readouterr() # type: ignore - assert 'test_model' in captured.out - except AssertionError as err: - raise err - - -def test_set_model_for_metric_with_mock_file(): - try: - mock_file_content = "[zh]\nsemantic_similarity=test_model\n" - with patch('builtins.open', mock_open(read_data=mock_file_content)): - config = ModelConfig() - config.set_model_for_metric(model_name='another_test_model', - language='zh', - metric_type='semantic_similarity') - assert config.model_config['zh'][ - 'semantic_similarity'] == 'another_test_model' # NOQA:E501 - except AssertionError as err: - 
raise err From da11f46d9c5345ab24bf2d952d784c6a871daa63 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 5 Feb 2024 18:43:40 +0900 Subject: [PATCH 32/66] apply format check suggestions --- src/langcheck/metrics/__init__.py | 1 - .../metrics/model_manager/__init__.py | 10 ++- .../metrics/model_manager/_model_loader.py | 18 +++--- .../model_manager/_model_management.py | 62 ++++++++++--------- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index fe767faf..4a5afec9 100644 --- a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -1,5 +1,4 @@ from langcheck.metrics import en, ja, zh -from langcheck.metrics import model_manager from langcheck.metrics.en.reference_based_text_quality import ( rouge1, rouge2, rougeL, semantic_similarity) from langcheck.metrics.en.reference_free_text_quality import ( diff --git a/src/langcheck/metrics/model_manager/__init__.py b/src/langcheck/metrics/model_manager/__init__.py index d489012d..49082c76 100644 --- a/src/langcheck/metrics/model_manager/__init__.py +++ b/src/langcheck/metrics/model_manager/__init__.py @@ -1,13 +1,11 @@ +from ._model_loader import (load_auto_model_for_seq2seq, + load_auto_model_for_text_classification, + load_sentence_transformers) from ._model_management import ModelManager -from ._model_loader import (load_sentence_transformers, - load_auto_model_for_seq2seq, - load_auto_model_for_text_classification) manager = ModelManager() __all__ = [ - "manager", - "load_sentence_transformers", - "load_auto_model_for_seq2seq", + "manager", "load_sentence_transformers", "load_auto_model_for_seq2seq", "load_auto_model_for_text_classification" ] diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index a6c97aa1..3e558a49 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ 
-1,16 +1,16 @@ from typing import Optional, Tuple from sentence_transformers import SentenceTransformer -from transformers.models.auto.modeling_auto import (AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification) # NOQA:E501 - +from transformers.models.auto.modeling_auto import ( # NOQA:E501 + AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.pipelines import pipeline -def load_sentence_transformers(model_name: str, - tokenizer_name: Optional[str] = None, - revision: Optional[str] = None) -> SentenceTransformer: # NOQA:E501 +def load_sentence_transformers( + model_name: str, + tokenizer_name: Optional[str] = None, + revision: Optional[str] = None) -> SentenceTransformer: # NOQA:E501 ''' Return a sequence embeddiing model parsed by sentence-transformer library. @@ -27,7 +27,8 @@ def load_sentence_transformers(model_name: str, def load_auto_model_for_text_classification( - model_name: str, tokenizer_name: Optional[str] = None, + model_name: str, + tokenizer_name: Optional[str] = None, revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: ''' @@ -47,7 +48,8 @@ def load_auto_model_for_text_classification( def load_auto_model_for_seq2seq( - model_name: str, tokenizer_name: Optional[str] = None, + model_name: str, + tokenizer_name: Optional[str] = None, revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: ''' diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index e6c2aa50..d64a7bb3 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -2,38 +2,41 @@ from copy import deepcopy from functools import lru_cache from pathlib import Path -from tabulate import tabulate from typing import Optional, Tuple, Union -import requests import pandas 
as pd +import requests from omegaconf import OmegaConf - from sentence_transformers import SentenceTransformer -from transformers.models.auto.modeling_auto import (AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification) # NOQA:E501 +from tabulate import tabulate +from transformers.models.auto.modeling_auto import ( # NOQA:E501 + AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) from transformers.models.auto.tokenization_auto import AutoTokenizer -from ._model_loader import (load_sentence_transformers, +from ._model_loader import (load_auto_model_for_seq2seq, load_auto_model_for_text_classification, - load_auto_model_for_seq2seq) + load_sentence_transformers) LOADER_MAP = { - "load_sentence_transformers": load_sentence_transformers, - "load_auto_model_for_text_classification": load_auto_model_for_text_classification, # NOQA:E501 - "load_auto_model_for_seq2seq": load_auto_model_for_seq2seq + "load_sentence_transformers": + load_sentence_transformers, + "load_auto_model_for_text_classification": + load_auto_model_for_text_classification, # NOQA:E501 + "load_auto_model_for_seq2seq": + load_auto_model_for_seq2seq } VALID_LOADER_FUNCTION = LOADER_MAP.keys() # NOQA:E501 -VALID_METRICS = ['semantic_similarity', 'sentiment', - 'toxicity', 'factual_consistency'] +VALID_METRICS = [ + 'semantic_similarity', 'sentiment', 'toxicity', 'factual_consistency' +] -VALID_METRIC_ATTRIBUTE = ['model_revision', 'model_revision', - 'loader', 'tokenizer_name'] +VALID_METRIC_ATTRIBUTE = [ + 'model_revision', 'model_revision', 'loader', 'tokenizer_name' +] VALID_LANGUAGE = ['zh'] -def check_model_availability(model_name: str, - revision: Optional[str]): +def check_model_availability(model_name: str, revision: Optional[str]): # TODO: add local cached model availability check for offline environment if revision is None: url = f"https://huggingface.co/api/models/{model_name}" @@ -69,17 +72,16 @@ def __load_config(self, path: str): for metric_name, metric_conf in lang_conf.items(): 
# check model availbility, if key not in conf # omega conf will return None in default - self.__set_model_for_metric( - language=lang, metric=metric_name, - **metric_conf) + self.__set_model_for_metric(language=lang, # type: ignore # NOQA:E501 + metric=metric_name, + **metric_conf) print('Configuration Load Successed!') @lru_cache def fetch_model( self, language: str, metric: str - ) -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], - Tuple[AutoTokenizer, AutoModelForSeq2SeqLM], - SentenceTransformer]: + ) -> Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], Tuple[ + AutoTokenizer, AutoModelForSeq2SeqLM], SentenceTransformer]: ''' Return the model used for the given metric and language. @@ -158,8 +160,7 @@ def __set_model_for_metric(self, language: str, metric: str, raise KeyError('Language {language} not supported yet') if metric not in VALID_METRICS: - raise KeyError( - 'Language {language} not supported {metric} yet') + raise KeyError('Language {language} not supported {metric} yet') # initialize configuration structure if it is empty. 
if self.config.get(language) is None: @@ -181,7 +182,8 @@ def __set_model_for_metric(self, language: str, metric: str, detail_config['revision'] = revision # Validate the change if ModelManager.validate_config(self.config, - language=language, metric=metric): + language=language, + metric=metric): # Clear the LRU cache to make the config change reflected # immediately self.fetch_model.cache_clear() @@ -211,18 +213,18 @@ def list_current_model_in_use(self, language='all', metric='all'): columns="attribute", values="value", aggfunc='first').reset_index().rename_axis( - None, axis=1) + None, axis=1) df_pivot.columns = [ 'language', 'metric_name', 'loader', 'model_name', 'revision' ] if language == 'all' and metric == 'all': - print(tabulate(df_pivot, headers=df_pivot.columns, - tablefmt="github")) + print( + tabulate(df_pivot, headers=df_pivot.columns, tablefmt="github")) # type: ignore # NOQA:E501 else: if language != "all": df_pivot = df_pivot.loc[df_pivot.language == language] if metric != 'all': df_pivot = df_pivot.loc[df_pivot.metric_name == metric] - print(tabulate(df_pivot, headers=df_pivot.columns, - tablefmt="github")) \ No newline at end of file + print( + tabulate(df_pivot, headers=df_pivot.columns, tablefmt="github")) # type: ignore # NOQA:E501 From a83d6a381afaa5e7cbae041a3cb1ebf6ee95eab5 Mon Sep 17 00:00:00 2001 From: vela Date: Tue, 6 Feb 2024 00:36:47 +0900 Subject: [PATCH 33/66] add test case for model loader. 
--- .../metrics/model_manager/_model_loader.py | 3 +- .../model_manager/_model_management.py | 3 +- tests/metrics/model_manager/__init__.py | 0 .../model_manager/test_model_loader.py | 89 +++++++++++++++++++ .../model_manager/test_model_manager.py | 0 5 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 tests/metrics/model_manager/__init__.py create mode 100644 tests/metrics/model_manager/test_model_loader.py create mode 100644 tests/metrics/model_manager/test_model_manager.py diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index 3e558a49..fa6b9cb5 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -23,7 +23,8 @@ def load_sentence_transformers( if tokenizer_name is not None: print("Tokenizer customize not supported in Sentence-Transformers yet.") - return SentenceTransformer(model_name) + model = SentenceTransformer(model_name) + return model def load_auto_model_for_text_classification( diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index d64a7bb3..b841ce67 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -227,4 +227,5 @@ def list_current_model_in_use(self, language='all', metric='all'): if metric != 'all': df_pivot = df_pivot.loc[df_pivot.metric_name == metric] print( - tabulate(df_pivot, headers=df_pivot.columns, tablefmt="github")) # type: ignore # NOQA:E501 + tabulate(df_pivot, headers=df_pivot.columns, # type: ignore # NOQA:E501 + tablefmt="github")) diff --git a/tests/metrics/model_manager/__init__.py b/tests/metrics/model_manager/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/metrics/model_manager/test_model_loader.py b/tests/metrics/model_manager/test_model_loader.py new file mode 100644 index 
00000000..95057610 --- /dev/null +++ b/tests/metrics/model_manager/test_model_loader.py @@ -0,0 +1,89 @@ +import pytest +from unittest.mock import patch, MagicMock +from sentence_transformers import SentenceTransformer +from transformers.models.auto.tokenization_auto import AutoTokenizer +from transformers.models.auto.modeling_auto \ + import (AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) +from langcheck.metrics.model_manager._model_loader \ + import (load_auto_model_for_seq2seq, + load_auto_model_for_text_classification, + load_sentence_transformers) + +# Mock objects for AutoTokenizer and AutoModelForSeq2SeqLM +MockTokenizer = MagicMock(spec=AutoTokenizer) +MockSeq2SeqModel = MagicMock(spec=AutoModelForSeq2SeqLM) +MockSentenceTransModel = MagicMock(spec=SentenceTransformer) +MockSeqClassifcationModel = MagicMock(spec=AutoModelForSequenceClassification) + + +@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ + ("t5-small", None, "main"), + ("t5-small", "t5-base", "main") +]) +def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): + with patch('transformers.AutoTokenizer.from_pretrained', + return_value=MockTokenizer) as mock_tokenizer, \ + patch('transformers.AutoModelForSeq2SeqLM.from_pretrained', + return_value=MockSeq2SeqModel) as mock_model: + tokenizer, model = load_auto_model_for_seq2seq(model_name, + tokenizer_name, revision) + + # Check if the tokenizer was loaded correctly + if tokenizer_name is None: + tokenizer_name = model_name + mock_tokenizer.assert_called_once_with(tokenizer_name, + revision=revision) + + # Check if the model was loaded correctly + mock_model.assert_called_once_with(model_name, + revision=revision) + + # Assert that the returned objects are instances of the mocked objects + assert tokenizer == MockTokenizer, \ + "The returned tokenizer is not the expected mock object" + assert model == MockSeq2SeqModel, \ + "The returned model is not the expected mock object" + + 
+@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ + ("bert-base-uncased", None, "main"), + ("bert-base-uncased", "bert-large-uncased", "main") +]) +def test_load_auto_model_for_text_classification(model_name, tokenizer_name, revision): # NOQA:E501 + with patch('transformers.AutoTokenizer.from_pretrained', + return_value=MockTokenizer) as mock_tokenizer, \ + patch('transformers.AutoModelForSequenceClassification.from_pretrained', + return_value=MockSeqClassifcationModel) as mock_model: + tokenizer, model = load_auto_model_for_text_classification(model_name, + tokenizer_name, revision) # NOQA:E501 + + # Check if the tokenizer was loaded correctly + if tokenizer_name is None: + tokenizer_name = model_name + mock_tokenizer.assert_called_once_with(tokenizer_name, + revision=revision) + + # Check if the model was loaded correctly + mock_model.assert_called_once_with(model_name, + revision=revision) + + # Assert that the returned objects are instances of the mocked objects + assert tokenizer == MockTokenizer, \ + "The returned tokenizer is not the expected mock object" + assert model == MockSeqClassifcationModel, \ + "The returned model is not the expected mock object" + + +@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ + ("all-MiniLM-L6-v2", None, "main"), + ("all-MiniLM-L6-v2", "all-mpnet-base-v2", "main") +]) +def test_load_sentence_transformers(model_name, tokenizer_name, revision): + with patch.object(SentenceTransformer, '__init__', return_value=None) as mock_init: + model = load_sentence_transformers(model_name, tokenizer_name, revision) + # Check if the model was loaded correctly + mock_init.assert_called_once_with(model_name) + + # Assert that the returned objects are instances of the mocked objects + assert isinstance(model, SentenceTransformer), \ + "The returned model is not the expected mock object" \ No newline at end of file diff --git a/tests/metrics/model_manager/test_model_manager.py 
b/tests/metrics/model_manager/test_model_manager.py new file mode 100644 index 00000000..e69de29b From ac59b6bd864e85218c084b2ea86f4e35fba16597 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 7 Feb 2024 08:20:57 +0000 Subject: [PATCH 34/66] add package data to pyproject.toml --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 18de76c3..50008d76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,3 +82,9 @@ ignore = [ markers = [ "optional: marks tests as optional", ] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +langcheck.metrics.model_manager.config = ["*.yaml"] \ No newline at end of file From e4091b3e742f0b72345807b9f7bdaacd2d83dc9b Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 7 Feb 2024 08:25:53 +0000 Subject: [PATCH 35/66] package-data fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 50008d76..73c07341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,4 +87,4 @@ markers = [ where = ["src"] [tool.setuptools.package-data] -langcheck.metrics.model_manager.config = ["*.yaml"] \ No newline at end of file +langcheck = ["*.yaml"] \ No newline at end of file From 37c388406b404d07b8372d7fd63fb79068b1930e Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 7 Feb 2024 08:33:58 +0000 Subject: [PATCH 36/66] try again --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 73c07341..8a177e2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,4 +87,4 @@ markers = [ where = ["src"] [tool.setuptools.package-data] -langcheck = ["*.yaml"] \ No newline at end of file +langcheck = ["metrics/model_manager/config/*.yaml"] \ No newline at end of file From bd3ab62d949d95639029818c043b8711e22d3875 Mon Sep 17 00:00:00 2001 From: vela Date: Thu, 8 Feb 2024 01:01:57 +0900 Subject: [PATCH 37/66] remove 
global value in metric --- .../metrics/zh/reference_free_text_quality.py | 9 --------- .../metrics/zh/source_based_text_quality.py | 15 +++++---------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index 606dae68..add9beac 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -19,10 +19,6 @@ sentiment as en_sentiment from langcheck.metrics.metric_value import MetricValue -_sentiment_model_path = 'IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment' # NOQA: E501 - -_toxicity_model_path = "alibaba-pai/pai-bert-base-zh-llm-risk-detection" - def sentiment( generated_outputs: List[str] | str, @@ -87,11 +83,6 @@ def sentiment( metric_value.language = 'zh' return metric_value - global _sentiment_model_path - - _sentiment_pipeline = pipeline( - 'sentiment-analysis', - model=_sentiment_model_path) # type: ignore[reportGeneralTypeIssues] # {0:"Negative", 1:'Positive'} from langcheck.metrics.model_manager import manager tokenizer, model = manager.fetch_model(language='zh', metric='sentiment') diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index a643a799..5c8224b1 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -11,9 +11,6 @@ factual_consistency as en_factual_consistency from langcheck.metrics.metric_value import MetricValue -_factual_consistency_translation_model_path = 'Helsinki-NLP/opus-mt-zh-en' -_factual_consistency_translation_pipeline: Pipeline | None = None - def factual_consistency( generated_outputs: List[str] | str, @@ -84,13 +81,11 @@ def factual_consistency( metric_value.language = 'zh' return metric_value - global _factual_consistency_translation_pipeline - if _factual_consistency_translation_pipeline is 
None: - from langcheck.metrics.model_manager import manager - tokenizer, model = manager.fetch_model(language='zh', - metric='factual_consistency') # NOQA:E501 - _factual_consistency_translation_pipeline = pipeline( - 'translation', model=model, tokenizer=tokenizer) # type: ignore + from langcheck.metrics.model_manager import manager + tokenizer, model = manager.fetch_model(language='zh', + metric='factual_consistency') # NOQA:E501 + _factual_consistency_translation_pipeline = pipeline( + 'translation', model=model, tokenizer=tokenizer) # type: ignore # Translate the sources and generated outputs to English. # Currently, the type checks are not working for the pipeline, since From d8f20bce664b0c76c61de616ac1a4b5753406b03 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 01:38:13 +0000 Subject: [PATCH 38/66] clean load_sentence_transformers comments --- .../metrics/model_manager/_model_loader.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index fa6b9cb5..e869bb82 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -10,18 +10,27 @@ def load_sentence_transformers( model_name: str, tokenizer_name: Optional[str] = None, - revision: Optional[str] = None) -> SentenceTransformer: # NOQA:E501 + revision: Optional[str] = None) -> SentenceTransformer: ''' - Return a sequence embeddiing model parsed by sentence-transformer library. + Loads a SentenceTransformer model. + + This function currently does not support specifying a tokenizer or a + revision. If these arguments are provided, a warning message will be + printed. Args: - model_name: The name of a sentence-transformer model + model_name: The name of the SentenceTransformer model to load. + tokenizer_name: The name of the tokenizer to use. Currently not + supported. 
+ revision: The model revision to load. Currently not supported. + + Returns: + model: The loaded SentenceTransformer model. ''' if revision is not None: - print("Version Pined not supported in Sentence-Transformers yet.") - + print("Warning: Specifying a revision is not currently supported.") if tokenizer_name is not None: - print("Tokenizer customize not supported in Sentence-Transformers yet.") + print("Warning: Customizing the tokenizer is not currently supported.") model = SentenceTransformer(model_name) return model From c46645bedd91b840b9e2d28a062ae6ae32216a5a Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 01:49:01 +0000 Subject: [PATCH 39/66] clean load_auto_model_for_text_classification docstring --- .../metrics/model_manager/_model_loader.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index e869bb82..ce69aff4 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -42,19 +42,24 @@ def load_auto_model_for_text_classification( revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: ''' - Return a sequence classification model on huggingface hub. + Loads a sequence classification model and its tokenizer. Args: - model_name: The name of a sequence-classification model on Hugging Face - tokenizer_name: The name of a tokenizer on Hugging Face - revision: The shortened sha1 string of a model + model_name: The name of the sequence-classification model to load. + tokenizer_name: The name of the tokenizer to load. If None, the + tokenizer associated with the model will be loaded. + revision: The model revision to load. + + Returns: + tokenizer: The loaded tokenizer. + model: The loaded sequence classification model. 
''' if tokenizer_name is None: tokenizer_name = model_name tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) model = AutoModelForSequenceClassification.from_pretrained( - model_name, revision=revision) # NOQA: E501 - return tokenizer, model + model_name, revision=revision) + return tokenizer, model # type: ignore def load_auto_model_for_seq2seq( From ff9038150bdeff52cb7400541a299fc6ea7bd37e Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 01:52:16 +0000 Subject: [PATCH 40/66] clean load_auto_model_for_seq2seq docstring --- .../metrics/model_manager/_model_loader.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index ce69aff4..c0dc2b31 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -66,18 +66,23 @@ def load_auto_model_for_seq2seq( model_name: str, tokenizer_name: Optional[str] = None, revision: Optional[str] = None -) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: +) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]: ''' - Return a sequence to sequence model availble on huggingface hub. + Loads a sequence-to-sequence model and its tokenizer. Args: - model_name: The name of a sequence-classification model on Hugging Face - tokenizer_name: The name of a tokenizer on Hugging Face - revision: The shortened sha1 string of a model + model_name: The name of the sequence-classification model to load. + tokenizer_name: The name of the tokenizer to load. If None, the + tokenizer associated with the model will be loaded. + revision: The model revision to load. + + Returns: + tokenizer: The loaded tokenizer. + model: The loaded sequence-to-sequence model. 
''' if tokenizer_name is None: tokenizer_name = model_name tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) model = AutoModelForSeq2SeqLM.from_pretrained( model_name, revision=revision) # NOQA: E501 - return tokenizer, model + return tokenizer, model # type: ignore From e35b66ed6e7274b42a88f18e105278a371c79455 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 01:52:57 +0000 Subject: [PATCH 41/66] clean up model loader imports --- src/langcheck/metrics/model_manager/_model_loader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index c0dc2b31..3882400a 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -1,10 +1,9 @@ from typing import Optional, Tuple from sentence_transformers import SentenceTransformer -from transformers.models.auto.modeling_auto import ( # NOQA:E501 +from transformers.models.auto.modeling_auto import ( AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) from transformers.models.auto.tokenization_auto import AutoTokenizer -from transformers.pipelines import pipeline def load_sentence_transformers( From a959633e9c64d76a42c77dab9a880598d06757a5 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 02:44:02 +0000 Subject: [PATCH 42/66] clean up __load_config --- .../model_manager/_model_management.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index b841ce67..3f08c8da 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -65,17 +65,23 @@ def __init__(self): "metric_config.yaml") self.__load_config(default_config_file_path) - def 
__load_config(self, path: str): + def __load_config(self, path: str) -> None: + ''' + Loads the model configuration from a file. + + Args: + path: The path to the configuration file. + ''' conf = OmegaConf.load(path) for lang, lang_conf in conf.items(): for metric_name, metric_conf in lang_conf.items(): # check model availbility, if key not in conf # omega conf will return None in default - self.__set_model_for_metric(language=lang, # type: ignore # NOQA:E501 + self.__set_model_for_metric(language=lang, metric=metric_name, **metric_conf) - print('Configuration Load Successed!') + print('Configuration Load Succeeded!') @lru_cache def fetch_model( @@ -220,12 +226,17 @@ def list_current_model_in_use(self, language='all', metric='all'): if language == 'all' and metric == 'all': print( - tabulate(df_pivot, headers=df_pivot.columns, tablefmt="github")) # type: ignore # NOQA:E501 + tabulate( + df_pivot, # type: ignore + headers=df_pivot.columns, # type: ignore + tablefmt="github")) else: if language != "all": df_pivot = df_pivot.loc[df_pivot.language == language] if metric != 'all': df_pivot = df_pivot.loc[df_pivot.metric_name == metric] print( - tabulate(df_pivot, headers=df_pivot.columns, # type: ignore # NOQA:E501 - tablefmt="github")) + tabulate( + df_pivot, # type: ignore + headers=df_pivot.columns, # type: ignore + tablefmt="github")) From 3240129ffc5e45dd24ac27a3d9e8373fe1c7bc2b Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 03:11:08 +0000 Subject: [PATCH 43/66] add comment for fetch_model --- .../metrics/model_manager/_model_management.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 3f08c8da..bc2d3cd5 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -89,20 +89,27 @@ def fetch_model( ) -> 
Union[Tuple[AutoTokenizer, AutoModelForSequenceClassification], Tuple[
             AutoTokenizer, AutoModelForSeq2SeqLM], SentenceTransformer]:
         '''
-        Return the model used for the given metric and language.
+        Return the model (and if applicable, the tokenizer) used for the given
+        metric and language.
 
         Args:
             language: The language for which to get the model
             metric_type: The metric name
+
+        Returns:
+            A (tokenizer, model) tuple, or just the model depending on the
+            loader function.
         '''
         if language in self.config:
             if metric in self.config[language]:
                 # Deep copy the confguration so that changes to `config` would
                 # not affect the original `self.config`.
                 config = deepcopy(self.config[language][metric])
-                # Get model name, model loader type
+                # Get model loader function
                 loader_func = config.pop('loader_func')
                 loader = LOADER_MAP[loader_func]
+                # Call the loader function with the model_name, tokenizer_name
+                # (optional), and revision (optional) as arguments
                 return loader(**config)
             else:
                 raise KeyError(f'Metric {metric} not supported yet.')

From 78b296edf141c9954aaa0e757d738515839dc0c3 Mon Sep 17 00:00:00 2001
From: Yosuke Higashi
Date: Fri, 9 Feb 2024 03:16:14 +0000
Subject: [PATCH 44/66] clean __set_model_for_metric

---
 .../model_manager/_model_management.py        | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py
index bc2d3cd5..12237a52 100644
--- a/src/langcheck/metrics/model_manager/_model_management.py
+++ b/src/langcheck/metrics/model_manager/_model_management.py
@@ -155,15 +155,16 @@ def validate_config(config, language='all', metric='all'):
         )
 
     def __set_model_for_metric(self, language: str, metric: str,
-                               model_name: str, loader_func: str, **kwargs):
+                               model_name: str, loader_func: str,
+                               **kwargs) -> None:
         '''
         Set model for specified metric in specified language.
Args: language: The name of the language - metric: The name of the evaluation metrics + metric: The name of the evaluation metric model_name: The name of the model - loader: The loader of the model + loader_func: The loader function of the model tokenizer_name: (Optional) The name of the tokenizer revision: (Optional) A version string of the model ''' @@ -173,18 +174,22 @@ def __set_model_for_metric(self, language: str, metric: str, raise KeyError('Language {language} not supported yet') if metric not in VALID_METRICS: - raise KeyError('Language {language} not supported {metric} yet') + raise KeyError( + f'Metric {metric} not supported for language {language} yet' + ) - # initialize configuration structure if it is empty. + # Initialize the configuration for the language and metric if it + # doesn't exist if self.config.get(language) is None: self.config[language] = {} if self.config.get(language).get(metric) is None: self.config[language][metric] = {} detail_config = self.config[language][metric] - # set metric attribute + # Set the loader function and model name detail_config['loader_func'] = loader_func detail_config['model_name'] = model_name + # If tokenizer_name is different from model_name tokenizer_name = kwargs.pop('tokenizer_name', None) if tokenizer_name: @@ -193,6 +198,7 @@ def __set_model_for_metric(self, language: str, metric: str, revision = kwargs.pop('model_revision', None) if revision: detail_config['revision'] = revision + # Validate the change if ModelManager.validate_config(self.config, language=language, @@ -201,7 +207,7 @@ def __set_model_for_metric(self, language: str, metric: str, # immediately self.fetch_model.cache_clear() except (ValueError, KeyError) as err: - # Trace back the configuration + # If an error occurred, restore the original configuration self.config = config_copy raise err From 8b247fd46c9e75c1bb65a3b21c700b98aae670c5 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 03:17:56 +0000 Subject: [PATCH 45/66] 
minor cleanup --- .../metrics/model_manager/_model_management.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 12237a52..3e8dde36 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -9,7 +9,7 @@ from omegaconf import OmegaConf from sentence_transformers import SentenceTransformer from tabulate import tabulate -from transformers.models.auto.modeling_auto import ( # NOQA:E501 +from transformers.models.auto.modeling_auto import ( AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) from transformers.models.auto.tokenization_auto import AutoTokenizer @@ -21,11 +21,11 @@ "load_sentence_transformers": load_sentence_transformers, "load_auto_model_for_text_classification": - load_auto_model_for_text_classification, # NOQA:E501 + load_auto_model_for_text_classification, "load_auto_model_for_seq2seq": load_auto_model_for_seq2seq } -VALID_LOADER_FUNCTION = LOADER_MAP.keys() # NOQA:E501 +VALID_LOADER_FUNCTION = LOADER_MAP.keys() VALID_METRICS = [ 'semantic_similarity', 'sentiment', 'toxicity', 'factual_consistency' ] @@ -36,7 +36,7 @@ VALID_LANGUAGE = ['zh'] -def check_model_availability(model_name: str, revision: Optional[str]): +def check_model_availability(model_name: str, revision: Optional[str]) -> bool: # TODO: add local cached model availability check for offline environment if revision is None: url = f"https://huggingface.co/api/models/{model_name}" @@ -211,7 +211,7 @@ def __set_model_for_metric(self, language: str, metric: str, self.config = config_copy raise err - def list_current_model_in_use(self, language='all', metric='all'): + def list_current_model_in_use(self, language='all', metric='all') -> None: ''' List the models currently in use. 
From 42ccc6649ee89420006b0f4e4cb0ddaec6a767b8 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 03:21:53 +0000 Subject: [PATCH 46/66] clean validate_config --- .../metrics/model_manager/_model_management.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 3e8dde36..86a5bfb6 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -117,11 +117,12 @@ def fetch_model( raise KeyError(f'Language {language} not supported yet') @staticmethod - def validate_config(config, language='all', metric='all'): + def validate_config(config, language='all', metric='all') -> None: ''' Validate configuration. Args: + config: The configuration dictionary to validate. language: The name of the language. Defaults to 'all'. metric: The name of the metric. Defaults to 'all'. ''' @@ -132,7 +133,8 @@ def validate_config(config, language='all', metric='all'): for metric_name, model_setting in lang_setting.items(): if metric != 'all' and metric_name != metric: continue - # If model name not set + + # Check that the model name and loader function are set if 'model_name' not in model_setting: raise KeyError( f'{lang} metrics {metric_name} need a model, but found None!' # NOQA:E501 @@ -141,14 +143,15 @@ def validate_config(config, language='all', metric='all'): raise KeyError( f'Metrics {metric_name} need a loader, but found None!' 
# NOQA:E501 ) - # Check if the model and revision is available on - # Hugging Face Hub - model_name = model_setting.pop('model_name') - revision = model_setting.pop('revision', None) loader_func = model_setting.pop('loader_func', None) if loader_func not in VALID_LOADER_FUNCTION: raise ValueError( f'loader type should in {VALID_LOADER_FUNCTION}') + + # Check that the model and revision are available on the Hugging + # Face Hub + model_name = model_setting.pop('model_name') + revision = model_setting.pop('revision', None) if not check_model_availability(model_name, revision): raise ValueError( f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 From f4bf665c9a3fc1c5d41090908c93cd1ae8e78c86 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Fri, 9 Feb 2024 03:22:23 +0000 Subject: [PATCH 47/66] remove unused import --- src/langcheck/metrics/model_manager/_model_management.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 86a5bfb6..876a0d21 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -1,7 +1,6 @@ import os from copy import deepcopy from functools import lru_cache -from pathlib import Path from typing import Optional, Tuple, Union import pandas as pd From e7d42e9493257a490c6bad95e0736ce32780d211 Mon Sep 17 00:00:00 2001 From: vela Date: Fri, 16 Feb 2024 20:50:17 +0900 Subject: [PATCH 48/66] add test case for model manager class --- .../model_manager/test_model_manager.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/metrics/model_manager/test_model_manager.py b/tests/metrics/model_manager/test_model_manager.py index e69de29b..dca0c9b2 100644 --- a/tests/metrics/model_manager/test_model_manager.py +++ b/tests/metrics/model_manager/test_model_manager.py @@ -0,0 +1,68 @@ +from unittest.mock import 
MagicMock, patch + +import pytest +import requests +from omegaconf import OmegaConf + +from langcheck.metrics.model_manager._model_management import ( + ModelManager, check_model_availability) + + +@pytest.fixture +def temp_config_path(tmp_path): + config = """ + zh: + toxicity: + model_name: "Alnusjaponica/toxicity-score-multi-classification" + tokenizer_name: "line-corporation/line-distilbert-base-japanese" + loader_func: "load_auto_model_for_text_classification" + """ + config_path = tmp_path / "metric_config.yaml" + config_path.write_text(config) + return str(config_path) + + +@pytest.fixture +def mock_model_manager(temp_config_path): + with patch("os.path.join", return_value=temp_config_path), \ + patch('langcheck.metrics.model_manager._model_management.check_model_availability', # NOQA:E501 + return_value=True): + model_manager = ModelManager() + return model_manager + + +@pytest.mark.parametrize( + "model_name,revision, status_code", + [("bert-base-uncased", "", "200"), ("bert-base-uncased", None, "200"), + ("bert-base-uncased", "main", "200"), + ("bert-base-uncased", "a265f77", "200"), + ("bert-base-uncased", "a265f773a47193eed794233aa2a0f0bb6d3eaa63", "200"), + pytest.param( + "bert-base-uncased", "a265f78", "404", marks=pytest.mark.xfail), + pytest.param("", "0e9f4", "404", marks=pytest.mark.xfail), + pytest.param("terb-base-uncased", "", "404", marks=pytest.mark.xfail)], +) +@patch("requests.get") +def test_check_model_availability(mock_get, model_name, revision, status_code): + mock_get.return_value.status_code = status_code + available = check_model_availability(model_name, revision) + assert available is (status_code == requests.codes.OK) + + +def test_model_manager_initiation(mock_model_manager): + mock_config = mock_model_manager.config + assert "toxicity" in mock_config["zh"] + assert mock_config["zh"]["toxicity"]["model_name"] ==\ + "Alnusjaponica/toxicity-score-multi-classification" + assert mock_config["zh"]["toxicity"]["tokenizer_name"] == \ + 
"line-corporation/line-distilbert-base-japanese" + assert mock_config["zh"]["toxicity"]["loader_func"] == \ + "load_auto_model_for_text_classification" + + +def test_model_manager_fetch_model(mock_model_manager): + with patch.dict( + 'langcheck.metrics.model_manager._model_management.LOADER_MAP', + {'load_auto_model_for_text_classification': MagicMock()}): + model = mock_model_manager.fetch_model(language='zh', metric='toxicity') + assert model is not None From 108682671769bb64bea74b4e9223eee4c9f24b56 Mon Sep 17 00:00:00 2001 From: vela Date: Fri, 16 Feb 2024 20:51:45 +0900 Subject: [PATCH 49/66] remove global value in metrics --- src/langcheck/metrics/zh/reference_based_text_quality.py | 3 +-- src/langcheck/metrics/zh/reference_free_text_quality.py | 6 +----- src/langcheck/metrics/zh/source_based_text_quality.py | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_based_text_quality.py b/src/langcheck/metrics/zh/reference_based_text_quality.py index 338c4723..01ee7e8d 100644 --- a/src/langcheck/metrics/zh/reference_based_text_quality.py +++ b/src/langcheck/metrics/zh/reference_based_text_quality.py @@ -92,8 +92,7 @@ def semantic_similarity( return metric_value # lazy import from langcheck.metrics.model_manager import manager - model = manager.fetch_model(language='zh', - metric="semantic_similarity") + model = manager.fetch_model(language='zh', metric="semantic_similarity") # For type checking assert isinstance(model, SentenceTransformer) diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index add9beac..a01a1622 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -205,19 +205,15 @@ def _toxicity_local(generated_outputs: List[str]) -> List[float]: Returns: A list of scores ''' - global _toxicity_model_path # this pipeline output predict probability for each 
text on each label. # the output format is List[List[Dict(str)]] from langcheck.metrics.model_manager import manager - tokenizer, model = manager.fetch_model(language='zh', - metric="toxicity") - + tokenizer, model = manager.fetch_model(language='zh', metric="toxicity") _toxicity_pipeline = pipeline( 'text-classification', model=model, # type: ignore[reportOptionalIterable] tokenizer=tokenizer, # type: ignore[reportOptionalIterable] top_k=5) - # {'Normal': 0, 'Pulp': 1, 'Sex': 2, 'Other Risk': 3, 'Adult': 4} _model_id2label = _toxicity_pipeline.model.config.id2label _predict_results = _toxicity_pipeline( diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 5c8224b1..ccebbf0c 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -82,8 +82,8 @@ def factual_consistency( return metric_value from langcheck.metrics.model_manager import manager - tokenizer, model = manager.fetch_model(language='zh', - metric='factual_consistency') # NOQA:E501 + tokenizer, model = manager.fetch_model( + language='zh', metric='factual_consistency') # NOQA:E501 _factual_consistency_translation_pipeline = pipeline( 'translation', model=model, tokenizer=tokenizer) # type: ignore From 89d9aaa86a4efefc66fb6ddd125bfd93f8aab3b8 Mon Sep 17 00:00:00 2001 From: vela Date: Fri, 16 Feb 2024 20:52:14 +0900 Subject: [PATCH 50/66] apply format suggestions --- src/langcheck/metrics/__init__.py | 1 - .../metrics/model_manager/__init__.py | 10 +-- .../metrics/model_manager/_model_loader.py | 12 ++-- .../model_manager/_model_management.py | 15 +++-- .../model_manager/config/metric_config.yaml | 6 ++ .../model_manager/test_model_loader.py | 66 ++++++++----------- 6 files changed, 53 insertions(+), 57 deletions(-) diff --git a/src/langcheck/metrics/__init__.py b/src/langcheck/metrics/__init__.py index 4a5afec9..4c23260d 100644 --- 
a/src/langcheck/metrics/__init__.py +++ b/src/langcheck/metrics/__init__.py @@ -39,5 +39,4 @@ 'semantic_similarity', 'sentiment', 'toxicity', - 'model_manager' ] diff --git a/src/langcheck/metrics/model_manager/__init__.py b/src/langcheck/metrics/model_manager/__init__.py index 49082c76..1e580725 100644 --- a/src/langcheck/metrics/model_manager/__init__.py +++ b/src/langcheck/metrics/model_manager/__init__.py @@ -1,11 +1,5 @@ -from ._model_loader import (load_auto_model_for_seq2seq, - load_auto_model_for_text_classification, - load_sentence_transformers) from ._model_management import ModelManager manager = ModelManager() - -__all__ = [ - "manager", "load_sentence_transformers", "load_auto_model_for_seq2seq", - "load_auto_model_for_text_classification" -] +list_current_model_in_use = manager.list_current_model_in_use +__all__ = ["manager", "list_current_model_in_use"] diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index 3882400a..b5e90e77 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -55,9 +55,11 @@ def load_auto_model_for_text_classification( ''' if tokenizer_name is None: tokenizer_name = model_name - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, + trust_remote_code=True, + revision=revision) model = AutoModelForSequenceClassification.from_pretrained( - model_name, revision=revision) + model_name, revision=revision, trust_remote_code=True) return tokenizer, model # type: ignore @@ -81,7 +83,9 @@ def load_auto_model_for_seq2seq( ''' if tokenizer_name is None: tokenizer_name = model_name - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, revision=revision) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, + revision=revision, + trust_remote_code=True) model = AutoModelForSeq2SeqLM.from_pretrained( - 
model_name, revision=revision) # NOQA: E501 + model_name, revision=revision, trust_remote_code=True) # NOQA: E501 return tokenizer, model # type: ignore diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 876a0d21..1d2c4697 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -37,7 +37,7 @@ def check_model_availability(model_name: str, revision: Optional[str]) -> bool: # TODO: add local cached model availability check for offline environment - if revision is None: + if revision is None or revision == "": url = f"https://huggingface.co/api/models/{model_name}" else: url = f"https://huggingface.co/api/models/{model_name}/revision/{revision}" # NOQA:E501 @@ -77,6 +77,7 @@ def __load_config(self, path: str) -> None: for metric_name, metric_conf in lang_conf.items(): # check model availbility, if key not in conf # omega conf will return None in default + assert isinstance(lang, str) self.__set_model_for_metric(language=lang, metric=metric_name, **metric_conf) @@ -202,12 +203,12 @@ def __set_model_for_metric(self, language: str, metric: str, detail_config['revision'] = revision # Validate the change - if ModelManager.validate_config(self.config, - language=language, - metric=metric): - # Clear the LRU cache to make the config change reflected - # immediately - self.fetch_model.cache_clear() + ModelManager.validate_config(self.config, + language=language, + metric=metric) + # Clear the LRU cache to make the config change reflected + # immediately + self.fetch_model.cache_clear() except (ValueError, KeyError) as err: # If an error occurred, restore the original configuration self.config = config_copy diff --git a/src/langcheck/metrics/model_manager/config/metric_config.yaml b/src/langcheck/metrics/model_manager/config/metric_config.yaml index be8d605a..00dac306 100644 --- 
a/src/langcheck/metrics/model_manager/config/metric_config.yaml +++ b/src/langcheck/metrics/model_manager/config/metric_config.yaml @@ -1,3 +1,9 @@ +# LANG: +# METRIC_NAME: +# model_name: str +# model_revisoin: str +# tokenizer_name:str +# loader_func: str zh: semantic_similarity: model_name: BAAI/bge-base-zh-v1.5 diff --git a/tests/metrics/model_manager/test_model_loader.py b/tests/metrics/model_manager/test_model_loader.py index 95057610..0ae3499c 100644 --- a/tests/metrics/model_manager/test_model_loader.py +++ b/tests/metrics/model_manager/test_model_loader.py @@ -1,13 +1,14 @@ +from unittest.mock import MagicMock, patch + import pytest -from unittest.mock import patch, MagicMock from sentence_transformers import SentenceTransformer +from transformers.models.auto.modeling_auto import ( + AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) from transformers.models.auto.tokenization_auto import AutoTokenizer -from transformers.models.auto.modeling_auto \ - import (AutoModelForSeq2SeqLM, AutoModelForSequenceClassification) -from langcheck.metrics.model_manager._model_loader \ - import (load_auto_model_for_seq2seq, - load_auto_model_for_text_classification, - load_sentence_transformers) + +from langcheck.metrics.model_manager._model_loader import ( + load_auto_model_for_seq2seq, load_auto_model_for_text_classification, + load_sentence_transformers) # Mock objects for AutoTokenizer and AutoModelForSeq2SeqLM MockTokenizer = MagicMock(spec=AutoTokenizer) @@ -16,10 +17,9 @@ MockSeqClassifcationModel = MagicMock(spec=AutoModelForSequenceClassification) -@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ - ("t5-small", None, "main"), - ("t5-small", "t5-base", "main") -]) +@pytest.mark.parametrize("model_name,tokenizer_name,revision", + [("t5-small", None, "main"), + ("t5-small", "t5-base", "main")]) def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): with patch('transformers.AutoTokenizer.from_pretrained', 
return_value=MockTokenizer) as mock_tokenizer, \ @@ -31,13 +31,9 @@ def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): # Check if the tokenizer was loaded correctly if tokenizer_name is None: tokenizer_name = model_name - mock_tokenizer.assert_called_once_with(tokenizer_name, - revision=revision) - - # Check if the model was loaded correctly - mock_model.assert_called_once_with(model_name, - revision=revision) + mock_model.assert_called_once() + mock_tokenizer.assert_called_once() # Assert that the returned objects are instances of the mocked objects assert tokenizer == MockTokenizer, \ "The returned tokenizer is not the expected mock object" @@ -45,28 +41,24 @@ def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): "The returned model is not the expected mock object" -@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ - ("bert-base-uncased", None, "main"), - ("bert-base-uncased", "bert-large-uncased", "main") -]) -def test_load_auto_model_for_text_classification(model_name, tokenizer_name, revision): # NOQA:E501 +@pytest.mark.parametrize("model_name,tokenizer_name,revision", + [("bert-base-uncased", None, "main"), + ("bert-base-uncased", "bert-large-uncased", "main")]) +def test_load_auto_model_for_text_classification(model_name, tokenizer_name, + revision): # NOQA:E501 with patch('transformers.AutoTokenizer.from_pretrained', return_value=MockTokenizer) as mock_tokenizer, \ - patch('transformers.AutoModelForSequenceClassification.from_pretrained', + patch('transformers.AutoModelForSequenceClassification.from_pretrained', # NOQA:E501 return_value=MockSeqClassifcationModel) as mock_model: - tokenizer, model = load_auto_model_for_text_classification(model_name, - tokenizer_name, revision) # NOQA:E501 + tokenizer, model = load_auto_model_for_text_classification( + model_name, tokenizer_name, revision) # NOQA:E501 # Check if the tokenizer was loaded correctly if tokenizer_name is None: tokenizer_name = 
model_name - mock_tokenizer.assert_called_once_with(tokenizer_name, - revision=revision) - - # Check if the model was loaded correctly - mock_model.assert_called_once_with(model_name, - revision=revision) + mock_model.assert_called_once() + mock_tokenizer.assert_called_once() # Assert that the returned objects are instances of the mocked objects assert tokenizer == MockTokenizer, \ "The returned tokenizer is not the expected mock object" @@ -74,16 +66,16 @@ def test_load_auto_model_for_text_classification(model_name, tokenizer_name, rev "The returned model is not the expected mock object" -@pytest.mark.parametrize("model_name,tokenizer_name,revision", [ - ("all-MiniLM-L6-v2", None, "main"), - ("all-MiniLM-L6-v2", "all-mpnet-base-v2", "main") -]) +@pytest.mark.parametrize("model_name,tokenizer_name,revision", + [("all-MiniLM-L6-v2", None, "main"), + ("all-MiniLM-L6-v2", "all-mpnet-base-v2", "main")]) def test_load_sentence_transformers(model_name, tokenizer_name, revision): - with patch.object(SentenceTransformer, '__init__', return_value=None) as mock_init: + with patch.object(SentenceTransformer, '__init__', + return_value=None) as mock_init: model = load_sentence_transformers(model_name, tokenizer_name, revision) # Check if the model was loaded correctly mock_init.assert_called_once_with(model_name) # Assert that the returned objects are instances of the mocked objects assert isinstance(model, SentenceTransformer), \ - "The returned model is not the expected mock object" \ No newline at end of file + "The returned model is not the expected mock object" From a09dbd3bce805aaf3021d2fafc51a0d3c0fdef39 Mon Sep 17 00:00:00 2001 From: vela Date: Fri, 16 Feb 2024 21:01:30 +0900 Subject: [PATCH 51/66] make jp char and zh char show formally in test, not unicode --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 8a177e2f..84c08f39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,7 @@ ignore = [ markers = [ 
"optional: marks tests as optional", ] +disable_test_id_escaping_and_forfeit_all_rights_to_community_support = true [tool.setuptools.packages.find] where = ["src"] From daeb706e0807c1b91eaed29e124d788d25aa6572 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 19 Feb 2024 16:58:25 +0900 Subject: [PATCH 52/66] fix import error in en detoxify raised by pyright --- src/langcheck/metrics/en/_detoxify.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/langcheck/metrics/en/_detoxify.py b/src/langcheck/metrics/en/_detoxify.py index ed4efdad..a3c1c03d 100644 --- a/src/langcheck/metrics/en/_detoxify.py +++ b/src/langcheck/metrics/en/_detoxify.py @@ -1,7 +1,9 @@ from typing import List, Tuple import torch -from transformers import BertForSequenceClassification, BertTokenizer +from transformers.models.bert.modeling_bert import \ + BertForSequenceClassification +from transformers.models.bert.tokenization_bert import BertTokenizer def load_checkpoint( From 64f7e95480ac8e610fde7737a1890fca29876f62 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 19 Feb 2024 16:58:43 +0900 Subject: [PATCH 53/66] apply format check suggestion --- tests/metrics/model_manager/test_model_manager.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/metrics/model_manager/test_model_manager.py b/tests/metrics/model_manager/test_model_manager.py index dca0c9b2..8975af60 100644 --- a/tests/metrics/model_manager/test_model_manager.py +++ b/tests/metrics/model_manager/test_model_manager.py @@ -61,8 +61,9 @@ def test_model_manager_initiation(mock_model_manager): def test_model_manager_fetch_model(mock_model_manager): - with patch.dict( - 'langcheck.metrics.model_manager._model_management.LOADER_MAP', - {'load_auto_model_for_text_classification': MagicMock()}): + with \ + patch.dict( + 'langcheck.metrics.model_manager._model_management.LOADER_MAP', + {'load_auto_model_for_text_classification': MagicMock()}): model = mock_model_manager.fetch_model(language='zh', 
metric='toxicity') assert model is not None From 61556233d4d6d6e9d8a9b40933d32f66bbfe544c Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 19 Feb 2024 18:06:51 +0900 Subject: [PATCH 54/66] apply format check suggestions and remove useless import --- src/langcheck/metrics/en/_detoxify.py | 3 +-- src/langcheck/metrics/model_manager/__init__.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/langcheck/metrics/en/_detoxify.py b/src/langcheck/metrics/en/_detoxify.py index a3c1c03d..0699d870 100644 --- a/src/langcheck/metrics/en/_detoxify.py +++ b/src/langcheck/metrics/en/_detoxify.py @@ -1,8 +1,7 @@ from typing import List, Tuple import torch -from transformers.models.bert.modeling_bert import \ - BertForSequenceClassification +from transformers.models.bert.modeling_bert import BertForSequenceClassification from transformers.models.bert.tokenization_bert import BertTokenizer diff --git a/src/langcheck/metrics/model_manager/__init__.py b/src/langcheck/metrics/model_manager/__init__.py index 1e580725..24e9ef91 100644 --- a/src/langcheck/metrics/model_manager/__init__.py +++ b/src/langcheck/metrics/model_manager/__init__.py @@ -1,5 +1,3 @@ from ._model_management import ModelManager manager = ModelManager() -list_current_model_in_use = manager.list_current_model_in_use -__all__ = ["manager", "list_current_model_in_use"] From 7f45a8c08f3f134bc536f7245ed6dfc740834f9e Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Tue, 20 Feb 2024 01:57:03 +0000 Subject: [PATCH 55/66] remove unused imports --- src/langcheck/metrics/zh/reference_free_text_quality.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/langcheck/metrics/zh/reference_free_text_quality.py b/src/langcheck/metrics/zh/reference_free_text_quality.py index a01a1622..ada82bcc 100644 --- a/src/langcheck/metrics/zh/reference_free_text_quality.py +++ b/src/langcheck/metrics/zh/reference_free_text_quality.py @@ -1,20 +1,13 @@ from __future__ import annotations -import 
pickle -from math import e -from pathlib import PosixPath from typing import Dict, List, Optional import hanlp -import regex as re -import torch from openai import OpenAI from transformers.pipelines import pipeline -from langcheck._handle_logs import _handle_logging_level from langcheck.metrics._validation import validate_parameters_reference_free -from langcheck.metrics.en.reference_free_text_quality import (_fluency_openai, - _toxicity_openai) +from langcheck.metrics.en.reference_free_text_quality import _toxicity_openai from langcheck.metrics.en.reference_free_text_quality import \ sentiment as en_sentiment from langcheck.metrics.metric_value import MetricValue From 0f15528bc90d0626499f76edb0e4041cceccdcf7 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Tue, 20 Feb 2024 01:58:22 +0000 Subject: [PATCH 56/66] remove unused import --- src/langcheck/metrics/zh/source_based_text_quality.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index ccebbf0c..843d17ed 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -4,7 +4,6 @@ from openai import OpenAI from transformers.pipelines import pipeline -from transformers.pipelines.base import Pipeline from langcheck.metrics._validation import validate_parameters_source_based from langcheck.metrics.en.source_based_text_quality import \ From 5371f864c254fd9f82eb64961171cd9d3bf4a846 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Tue, 20 Feb 2024 02:25:50 +0000 Subject: [PATCH 57/66] cleanup and docstrings --- .../model_manager/config/metric_config.yaml | 2 +- .../model_manager/test_model_loader.py | 4 +-- .../model_manager/test_model_manager.py | 30 ++++++++++++++++--- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/langcheck/metrics/model_manager/config/metric_config.yaml 
b/src/langcheck/metrics/model_manager/config/metric_config.yaml index 00dac306..8d80d72f 100644 --- a/src/langcheck/metrics/model_manager/config/metric_config.yaml +++ b/src/langcheck/metrics/model_manager/config/metric_config.yaml @@ -9,7 +9,7 @@ zh: model_name: BAAI/bge-base-zh-v1.5 model_revision: f03589c loader_func: load_sentence_transformers - + sentiment: model_name: IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment loader_func: load_auto_model_for_text_classification diff --git a/tests/metrics/model_manager/test_model_loader.py b/tests/metrics/model_manager/test_model_loader.py index 0ae3499c..2baadb6f 100644 --- a/tests/metrics/model_manager/test_model_loader.py +++ b/tests/metrics/model_manager/test_model_loader.py @@ -45,13 +45,13 @@ def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): [("bert-base-uncased", None, "main"), ("bert-base-uncased", "bert-large-uncased", "main")]) def test_load_auto_model_for_text_classification(model_name, tokenizer_name, - revision): # NOQA:E501 + revision): with patch('transformers.AutoTokenizer.from_pretrained', return_value=MockTokenizer) as mock_tokenizer, \ patch('transformers.AutoModelForSequenceClassification.from_pretrained', # NOQA:E501 return_value=MockSeqClassifcationModel) as mock_model: tokenizer, model = load_auto_model_for_text_classification( - model_name, tokenizer_name, revision) # NOQA:E501 + model_name, tokenizer_name, revision) # Check if the tokenizer was loaded correctly if tokenizer_name is None: diff --git a/tests/metrics/model_manager/test_model_manager.py b/tests/metrics/model_manager/test_model_manager.py index 8975af60..7b6f7e09 100644 --- a/tests/metrics/model_manager/test_model_manager.py +++ b/tests/metrics/model_manager/test_model_manager.py @@ -9,14 +9,23 @@ @pytest.fixture -def temp_config_path(tmp_path): - config = """ +def temp_config_path(tmp_path) -> str: + ''' + Fixture that creates a temporary configuration file for testing. 
+ + Args: + tmp_path: A unique temporary directory path provided by pytest. + + Returns: + The path to the temporary configuration file. + ''' + config = ''' zh: toxicity: model_name: "Alnusjaponica/toxicity-score-multi-classification" tokenizer_name: "line-corporation/line-distilbert-base-japanese" loader_func: "load_auto_model_for_text_classification" - """ + ''' config_path = tmp_path / "metric_config.yaml" config_path.write_text(config) return str(config_path) @@ -24,6 +33,19 @@ def temp_config_path(tmp_path): @pytest.fixture def mock_model_manager(temp_config_path): + ''' + Fixture that creates a mock ModelManager for testing. + + The ModelManager is patched to use the temporary configuration file + created by the temp_config_path fixture, and to always return True + when checking model availability. + + Args: + temp_config_path: The path to the temporary configuration file. + + Returns: + The mock ModelManager. + ''' with patch("os.path.join", return_value=temp_config_path), \ patch('langcheck.metrics.model_manager._model_management.check_model_availability', # NOQA:E501 return_value=True): @@ -52,7 +74,7 @@ def test_check_model_availability(mock_get, model_name, revision, status_code): def test_model_manager_initiation(mock_model_manager): mock_config = mock_model_manager.config assert "toxicity" in mock_config["zh"] - assert mock_config["zh"]["toxicity"]["model_name"] ==\ + assert mock_config["zh"]["toxicity"]["model_name"] == \ "Alnusjaponica/toxicity-score-multi-classification" assert mock_config["zh"]["toxicity"]["tokenizer_name"] == \ "line-corporation/line-distilbert-base-japanese" From 57b864f91410b314510b928e691111da88ed6539 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 26 Feb 2024 15:41:58 +0900 Subject: [PATCH 58/66] add tokenizer_revision for fine grained control --- .../metrics/model_manager/_model_loader.py | 41 +++++++++-------- .../model_manager/_model_management.py | 44 ++++++++++++------- .../model_manager/config/metric_config.yaml | 5 ++- 
3 files changed, 53 insertions(+), 37 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index b5e90e77..4c53e34e 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -8,8 +8,9 @@ def load_sentence_transformers( model_name: str, + model_revision: Optional[str] = None, tokenizer_name: Optional[str] = None, - revision: Optional[str] = None) -> SentenceTransformer: + tokenizer_revision: Optional[str] = None) -> SentenceTransformer: ''' Loads a SentenceTransformer model. @@ -21,12 +22,14 @@ model_name: The name of the SentenceTransformer model to load. tokenizer_name: The name of the tokenizer to use. Currently not supported. - revision: The model revision to load. Currently not supported. + model_revision: The model revision to load. Currently not supported. + tokenizer_revision: The tokenizer revision to load. Currently not + supported. Returns: model: The loaded SentenceTransformer model. ''' - if revision is not None: + if model_revision is not None or tokenizer_revision is not None: print("Warning: Specifying a revision is not currently supported.") if tokenizer_name is not None: print("Warning: Customizing the tokenizer is not currently supported.") @@ -36,9 +39,10 @@ def load_auto_model_for_text_classification( - model_name: str, - tokenizer_name: Optional[str] = None, - revision: Optional[str] = None + model_name: str, + model_revision: Optional[str] = None, + tokenizer_name: Optional[str] = None, + tokenizer_revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: ''' Loads a sequence classification model and its tokenizer. @@ -47,7 +51,8 @@ model_name: The name of the sequence-classification model to load. tokenizer_name: The name of the tokenizer to load. 
If None, the tokenizer associated with the model will be loaded. - revision: The model revision to load. + model_revision: The model revision to load. + tokenizer_revision: the tokenizer revision to load. Returns: tokenizer: The loaded tokenizer. @@ -56,17 +61,17 @@ def load_auto_model_for_text_classification( if tokenizer_name is None: tokenizer_name = model_name tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - trust_remote_code=True, - revision=revision) + revision=tokenizer_revision) model = AutoModelForSequenceClassification.from_pretrained( - model_name, revision=revision, trust_remote_code=True) + model_name, revision=model_revision) return tokenizer, model # type: ignore def load_auto_model_for_seq2seq( - model_name: str, - tokenizer_name: Optional[str] = None, - revision: Optional[str] = None + model_name: str, + model_revision: Optional[str] = None, + tokenizer_name: Optional[str] = None, + tokenizer_revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]: ''' Loads a sequence-to-sequence model and its tokenizer. @@ -75,7 +80,8 @@ def load_auto_model_for_seq2seq( model_name: The name of the sequence-classification model to load. tokenizer_name: The name of the tokenizer to load. If None, the tokenizer associated with the model will be loaded. - revision: The model revision to load. + model_revision: The model revision to load. + tokenizer_revision: the tokenizer revision to load Returns: tokenizer: The loaded tokenizer. 
@@ -84,8 +90,7 @@ def load_auto_model_for_seq2seq( if tokenizer_name is None: tokenizer_name = model_name tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, - revision=revision, - trust_remote_code=True) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_name, revision=revision, trust_remote_code=True) # NOQA: E501 + revision=tokenizer_revision) + model = AutoModelForSeq2SeqLM.from_pretrained(model_name, + revision=model_revision) return tokenizer, model # type: ignore diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 1d2c4697..812787a1 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -28,10 +28,6 @@ VALID_METRICS = [ 'semantic_similarity', 'sentiment', 'toxicity', 'factual_consistency' ] - -VALID_METRIC_ATTRIBUTE = [ - 'model_revision', 'model_revision', 'loader', 'tokenizer_name' -] VALID_LANGUAGE = ['zh'] @@ -143,19 +139,27 @@ def validate_config(config, language='all', metric='all') -> None: raise KeyError( f'Metrics {metric_name} need a loader, but found None!' # NOQA:E501 ) - loader_func = model_setting.pop('loader_func', None) + loader_func = model_setting.get('loader_func') if loader_func not in VALID_LOADER_FUNCTION: raise ValueError( f'loader type should in {VALID_LOADER_FUNCTION}') - # Check that the model and revision are available on the Hugging - # Face Hub - model_name = model_setting.pop('model_name') - revision = model_setting.pop('revision', None) - if not check_model_availability(model_name, revision): + # Check model availability with revision if specified. 
+ model_name = model_setting.get('model_name') + model_revision = model_setting.get('model_revision') + if not check_model_availability(model_name, model_revision): raise ValueError( - f'Cannot find {model_name} with {revision} and Huggingface Hub' # NOQA:E501 + f'Cannot find {model_name} with {model_revision} at Huggingface Hub' # NOQA:E501 ) + # Check tokenizer availability with revision if specified. + tokenizer_name = model_setting.get('tokenizer_name') + if tokenizer_name is not None and tokenizer_name != model_name: + tokenizer_revision = model_setting.get('tokenizer_revision') + if not check_model_availability(tokenizer_name, + tokenizer_revision): + raise ValueError( + f'Cannot find {tokenizer_name} with {tokenizer_revision} at Huggingface Hub' # NOQA:E501 + ) def __set_model_for_metric(self, language: str, metric: str, model_name: str, loader_func: str, @@ -169,7 +173,10 @@ def __set_model_for_metric(self, language: str, metric: str, model_name: The name of the model loader_func: The loader function of the model tokenizer_name: (Optional) The name of the tokenizer - revision: (Optional) A version string of the model + model_revision: (Optional) A version string of the model, if not + specified, load latest model in default. + model_revision: (Optional) A version string of the tokenizer, same + with model's revision in default. 
''' config_copy = deepcopy(self.config) try: @@ -194,14 +201,17 @@ def __set_model_for_metric(self, language: str, metric: str, detail_config['model_name'] = model_name # If tokenizer_name is different from model_name - tokenizer_name = kwargs.pop('tokenizer_name', None) + tokenizer_name = kwargs.get('tokenizer_name') if tokenizer_name: detail_config['tokenizer_name'] = tokenizer_name # If model's revision is pinned - revision = kwargs.pop('model_revision', None) - if revision: - detail_config['revision'] = revision - + model_revision = kwargs.get('model_revision') + if model_revision: + detail_config['model_revision'] = model_revision + # If model's revision is pinned + tokenzier_revision = kwargs.get('tokenizer_revision') + if tokenzier_revision: + detail_config['tokenizer_revision'] = tokenzier_revision # Validate the change ModelManager.validate_config(self.config, language=language, diff --git a/src/langcheck/metrics/model_manager/config/metric_config.yaml b/src/langcheck/metrics/model_manager/config/metric_config.yaml index 8d80d72f..e2aa6f00 100644 --- a/src/langcheck/metrics/model_manager/config/metric_config.yaml +++ b/src/langcheck/metrics/model_manager/config/metric_config.yaml @@ -1,8 +1,9 @@ # LANG: # METRIC_NAME: # model_name: str -# model_revisoin: str -# tokenizer_name:str +# model_revision: str +# tokenizer_name: str +# tokenizer_revision: str # loader_func: str zh: semantic_similarity: From 160a0d3a7f15584697416fbabb24c530071b3b23 Mon Sep 17 00:00:00 2001 From: vela Date: Mon, 26 Feb 2024 15:42:55 +0900 Subject: [PATCH 59/66] apply tokenizer_revision update to test case --- .../model_manager/test_model_loader.py | 16 ++++----- .../model_manager/test_model_manager.py | 34 +++++++++++++++---- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/tests/metrics/model_manager/test_model_loader.py b/tests/metrics/model_manager/test_model_loader.py index 2baadb6f..1bc4fea0 100644 --- a/tests/metrics/model_manager/test_model_loader.py +++ 
b/tests/metrics/model_manager/test_model_loader.py @@ -25,10 +25,9 @@ def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): return_value=MockTokenizer) as mock_tokenizer, \ patch('transformers.AutoModelForSeq2SeqLM.from_pretrained', return_value=MockSeq2SeqModel) as mock_model: - tokenizer, model = load_auto_model_for_seq2seq(model_name, - tokenizer_name, revision) - - # Check if the tokenizer was loaded correctly + tokenizer, model = load_auto_model_for_seq2seq( + model_name=model_name, tokenizer_name=tokenizer_name, + model_revision=revision, tokenizer_revision=revision) if tokenizer_name is None: tokenizer_name = model_name @@ -51,9 +50,8 @@ def test_load_auto_model_for_text_classification(model_name, tokenizer_name, patch('transformers.AutoModelForSequenceClassification.from_pretrained', # NOQA:E501 return_value=MockSeqClassifcationModel) as mock_model: tokenizer, model = load_auto_model_for_text_classification( - model_name, tokenizer_name, revision) - - # Check if the tokenizer was loaded correctly + model_name=model_name, tokenizer_name=tokenizer_name, + model_revision=revision, tokenizer_revision=revision) if tokenizer_name is None: tokenizer_name = model_name @@ -72,7 +70,9 @@ def test_load_auto_model_for_text_classification(model_name, tokenizer_name, def test_load_sentence_transformers(model_name, tokenizer_name, revision): with patch.object(SentenceTransformer, '__init__', return_value=None) as mock_init: - model = load_sentence_transformers(model_name, tokenizer_name, revision) + model = load_sentence_transformers( + model_name=model_name, tokenizer_name=tokenizer_name, + model_revision=revision, tokenizer_revision=revision) # Check if the model was loaded correctly mock_init.assert_called_once_with(model_name) diff --git a/tests/metrics/model_manager/test_model_manager.py b/tests/metrics/model_manager/test_model_manager.py index 7b6f7e09..719278f5 100644 --- a/tests/metrics/model_manager/test_model_manager.py +++ 
b/tests/metrics/model_manager/test_model_manager.py @@ -4,6 +4,7 @@ import requests from omegaconf import OmegaConf +from langcheck.metrics.model_manager import _model_management from langcheck.metrics.model_manager._model_management import ( ModelManager, check_model_availability) @@ -21,10 +22,16 @@ def temp_config_path(tmp_path) -> str: ''' config = ''' zh: - toxicity: - model_name: "Alnusjaponica/toxicity-score-multi-classification" - tokenizer_name: "line-corporation/line-distilbert-base-japanese" - loader_func: "load_auto_model_for_text_classification" + toxicity: + model_name: alibaba-pai/pai-bert-base-zh-llm-risk-detection + loader_func: load_auto_model_for_text_classification + ja: + toxicity: + model_name: Alnusjaponica/toxicity-score-multi-classification + model_revision: bc7a465029744889c8252ee858ab04ab9efdb0e7 + tokenizer_name: line-corporation/line-distilbert-base-japanese + tokenizer_revision: 93bd4811608eecb95ffaaba957646efd9a909cc8 + loader_func: load_auto_model_for_text_classification ''' config_path = tmp_path / "metric_config.yaml" config_path.write_text(config) @@ -48,7 +55,8 @@ def mock_model_manager(temp_config_path): ''' with patch("os.path.join", return_value=temp_config_path), \ patch('langcheck.metrics.model_manager._model_management.check_model_availability', # NOQA:E501 - return_value=True): + return_value=True), \ + patch.object(_model_management, 'VALID_LANGUAGE', ['ja', 'zh']): model_manager = ModelManager() return model_manager @@ -75,10 +83,20 @@ def test_model_manager_initiation(mock_model_manager): mock_config = mock_model_manager.config assert "toxicity" in mock_config["zh"] assert mock_config["zh"]["toxicity"]["model_name"] == \ + "alibaba-pai/pai-bert-base-zh-llm-risk-detection" + assert mock_config["zh"]["toxicity"]["loader_func"] == \ + "load_auto_model_for_text_classification" + + assert "toxicity" in mock_config["ja"] + assert mock_config["ja"]["toxicity"]["model_name"] ==\ 
"Alnusjaponica/toxicity-score-multi-classification" - assert mock_config["zh"]["toxicity"]["tokenizer_name"] == \ + assert mock_config["ja"]["toxicity"]["model_revision"] ==\ + "bc7a465029744889c8252ee858ab04ab9efdb0e7" + assert mock_config["ja"]["toxicity"]["tokenizer_name"] ==\ "line-corporation/line-distilbert-base-japanese" - assert mock_config["zh"]["toxicity"]["loader_func"] == \ + assert mock_config["ja"]["toxicity"]["tokenizer_revision"] ==\ + "93bd4811608eecb95ffaaba957646efd9a909cc8" + assert mock_config["ja"]["toxicity"]["loader_func"] ==\ "load_auto_model_for_text_classification" @@ -89,3 +107,5 @@ def test_model_manager_fetch_model(mock_model_manager): {'load_auto_model_for_text_classification': MagicMock()}): model = mock_model_manager.fetch_model(language='zh', metric='toxicity') assert model is not None + model = mock_model_manager.fetch_model(language='ja', metric='toxicity') + assert model is not None From 10593fcb9a248d85a1f6a31185fc11ebe4f3448f Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 09:47:09 +0000 Subject: [PATCH 60/66] clean up docstring and comments --- .../metrics/model_manager/_model_management.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index 812787a1..feaa188c 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -173,10 +173,10 @@ def __set_model_for_metric(self, language: str, metric: str, model_name: The name of the model loader_func: The loader function of the model tokenizer_name: (Optional) The name of the tokenizer - model_revision: (Optional) A version string of the model, if not - specified, load latest model in default. - model_revision: (Optional) A version string of the tokenizer, same - with model's revision in default. 
+ model_revision: (Optional) A version string of the model. If not + specified, load the latest model by default. + tokenizer_revision: (Optional) A version string of the tokenizer. If + not specified, load the latest tokenizer by default. ''' config_copy = deepcopy(self.config) try: @@ -208,7 +208,7 @@ def __set_model_for_metric(self, language: str, metric: str, model_revision = kwargs.get('model_revision') if model_revision: detail_config['model_revision'] = model_revision - # If model's revision is pinned + # If tokenizer's revision is pinned tokenzier_revision = kwargs.get('tokenizer_revision') if tokenzier_revision: detail_config['tokenizer_revision'] = tokenzier_revision From 7c770b0d175dca8ed252329328467f128db46f1d Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 09:47:44 +0000 Subject: [PATCH 61/66] fix typo --- src/langcheck/metrics/model_manager/_model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_management.py b/src/langcheck/metrics/model_manager/_model_management.py index feaa188c..fd686bf0 100644 --- a/src/langcheck/metrics/model_manager/_model_management.py +++ b/src/langcheck/metrics/model_manager/_model_management.py @@ -209,9 +209,9 @@ def __set_model_for_metric(self, language: str, metric: str, if model_revision: detail_config['model_revision'] = model_revision # If tokenizer's revision is pinned - tokenzier_revision = kwargs.get('tokenizer_revision') - if tokenzier_revision: - detail_config['tokenizer_revision'] = tokenzier_revision + tokenizer_revision = kwargs.get('tokenizer_revision') + if tokenizer_revision: + detail_config['tokenizer_revision'] = tokenizer_revision # Validate the change ModelManager.validate_config(self.config, language=language, From 2ce534fd78a999126e26cfe371033696c58ea655 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 09:49:56 +0000 Subject: [PATCH 62/66] specify which fields are optional in config 
--- .../metrics/model_manager/config/metric_config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/langcheck/metrics/model_manager/config/metric_config.yaml b/src/langcheck/metrics/model_manager/config/metric_config.yaml index e2aa6f00..470b1843 100644 --- a/src/langcheck/metrics/model_manager/config/metric_config.yaml +++ b/src/langcheck/metrics/model_manager/config/metric_config.yaml @@ -1,9 +1,9 @@ # LANG: # METRIC_NAME: # model_name: str -# model_revision: str -# tokenizer_name: str -# tokenizer_revision: str +# model_revision: str (optional) +# tokenizer_name: str (optional) +# tokenizer_revision: str (optional) # loader_func: str zh: semantic_similarity: From 86f55fe9062b5ef4de5ae33064124bfe5e257ee5 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 09:54:06 +0000 Subject: [PATCH 63/66] removed unnecessary noqa --- src/langcheck/metrics/zh/source_based_text_quality.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/langcheck/metrics/zh/source_based_text_quality.py b/src/langcheck/metrics/zh/source_based_text_quality.py index 843d17ed..d2cdfd43 100644 --- a/src/langcheck/metrics/zh/source_based_text_quality.py +++ b/src/langcheck/metrics/zh/source_based_text_quality.py @@ -81,8 +81,8 @@ def factual_consistency( return metric_value from langcheck.metrics.model_manager import manager - tokenizer, model = manager.fetch_model( - language='zh', metric='factual_consistency') # NOQA:E501 + tokenizer, model = manager.fetch_model(language='zh', + metric='factual_consistency') _factual_consistency_translation_pipeline = pipeline( 'translation', model=model, tokenizer=tokenizer) # type: ignore @@ -93,14 +93,13 @@ def factual_consistency( cast(str, d['translation_text']) # type: ignore[reportGeneralTypeIssues] for d in _factual_consistency_translation_pipeline( - sources) # type: ignore[reportOptionalIterable] # NOQA: E501 + sources) # type: ignore[reportOptionalIterable] ] 
en_generated_outputs = [ cast(str, d['translation_text']) # type: ignore[reportGeneralTypeIssues] for d in _factual_consistency_translation_pipeline( - generated_outputs - ) # type: ignore[reportOptionalIterable] # NOQA: E501 + generated_outputs) # type: ignore[reportOptionalIterable] ] # Compute the factual consistency scores in English. factual_consistency_scores = en_factual_consistency( From f5401590b1269929a2c56c1f578f3a3a30437f2d Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 09:58:52 +0000 Subject: [PATCH 64/66] fix yapf format --- .../metrics/model_manager/_model_loader.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/langcheck/metrics/model_manager/_model_loader.py b/src/langcheck/metrics/model_manager/_model_loader.py index 4c53e34e..f147de3b 100644 --- a/src/langcheck/metrics/model_manager/_model_loader.py +++ b/src/langcheck/metrics/model_manager/_model_loader.py @@ -39,10 +39,10 @@ def load_sentence_transformers( def load_auto_model_for_text_classification( - model_name: str, - model_revision: Optional[str] = None, - tokenizer_name: Optional[str] = None, - tokenizer_revision: Optional[str] = None + model_name: str, + model_revision: Optional[str] = None, + tokenizer_name: Optional[str] = None, + tokenizer_revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]: ''' Loads a sequence classification model and its tokenizer. @@ -68,10 +68,10 @@ def load_auto_model_for_text_classification( def load_auto_model_for_seq2seq( - model_name: str, - model_revision: Optional[str] = None, - tokenizer_name: Optional[str] = None, - tokenizer_revision: Optional[str] = None + model_name: str, + model_revision: Optional[str] = None, + tokenizer_name: Optional[str] = None, + tokenizer_revision: Optional[str] = None ) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]: ''' Loads a sequence-to-sequence model and its tokenizer. 
From fdd353ef27a55459b5bb691bfbf0610ed9906348 Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 10:04:40 +0000 Subject: [PATCH 65/66] fix yapf format --- .../model_manager/test_model_loader.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/metrics/model_manager/test_model_loader.py b/tests/metrics/model_manager/test_model_loader.py index 1bc4fea0..467f7f51 100644 --- a/tests/metrics/model_manager/test_model_loader.py +++ b/tests/metrics/model_manager/test_model_loader.py @@ -26,8 +26,10 @@ def test_load_auto_model_for_seq2seq(model_name, tokenizer_name, revision): patch('transformers.AutoModelForSeq2SeqLM.from_pretrained', return_value=MockSeq2SeqModel) as mock_model: tokenizer, model = load_auto_model_for_seq2seq( - model_name=model_name, tokenizer_name=tokenizer_name, - model_revision=revision, tokenizer_revision=revision) + model_name=model_name, + tokenizer_name=tokenizer_name, + model_revision=revision, + tokenizer_revision=revision) if tokenizer_name is None: tokenizer_name = model_name @@ -50,8 +52,10 @@ def test_load_auto_model_for_text_classification(model_name, tokenizer_name, patch('transformers.AutoModelForSequenceClassification.from_pretrained', # NOQA:E501 return_value=MockSeqClassifcationModel) as mock_model: tokenizer, model = load_auto_model_for_text_classification( - model_name=model_name, tokenizer_name=tokenizer_name, - model_revision=revision, tokenizer_revision=revision) + model_name=model_name, + tokenizer_name=tokenizer_name, + model_revision=revision, + tokenizer_revision=revision) if tokenizer_name is None: tokenizer_name = model_name @@ -70,9 +74,10 @@ def test_load_auto_model_for_text_classification(model_name, tokenizer_name, def test_load_sentence_transformers(model_name, tokenizer_name, revision): with patch.object(SentenceTransformer, '__init__', return_value=None) as mock_init: - model = load_sentence_transformers( - model_name=model_name, 
tokenizer_name=tokenizer_name, - model_revision=revision, tokenizer_revision=revision) + model = load_sentence_transformers(model_name=model_name, + tokenizer_name=tokenizer_name, + model_revision=revision, + tokenizer_revision=revision) # Check if the model was loaded correctly mock_init.assert_called_once_with(model_name) From b1d5d768b92835e5d63a1e74c9acbf82c15d0bcb Mon Sep 17 00:00:00 2001 From: Yosuke Higashi Date: Wed, 28 Feb 2024 10:38:11 +0000 Subject: [PATCH 66/66] maximize disk space --- .github/workflows/pytest.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f275206f..cc0060e0 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -31,6 +31,19 @@ jobs: pip install --upgrade pip pip install .[dev] + # Remove unneeded system libraries to maximize disk space + # https://github.com/easimon/maximize-build-space/blob/master/action.yml + # https://github.com/actions/virtual-environments/issues/2840#issuecomment-790492173 + - name: Maximize disk space + run: | + echo "Available disk space (before):" + df -h + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/lib/android + echo "Available disk space (after):" + df -h + # Run integration tests - name: Test run: |