moving translation wrapper to a file, adding tests
ischender committed Jan 17, 2024
1 parent 0dca8ba commit 4eb0676
Showing 5 changed files with 112 additions and 67 deletions.
2 changes: 2 additions & 0 deletions src/langcheck/metrics/de/__init__.py
@@ -1,4 +1,5 @@
 from langcheck.metrics.de._tokenizers import DeTokenizer
+from langcheck.metrics.de._translation import Translate
 from langcheck.metrics.de.reference_based_text_quality import (
     rouge1, rouge2, rougeL, semantic_similarity)
 from langcheck.metrics.de.reference_free_text_quality import (
@@ -21,4 +22,5 @@
     'sentiment',
     'toxicity',
     'DeTokenizer',
+    'Translate',
 ]
64 changes: 64 additions & 0 deletions src/langcheck/metrics/de/_translation.py
@@ -0,0 +1,64 @@
+from math import floor
+
+from nltk.tokenize import sent_tokenize
+from transformers.pipelines import pipeline
+
+
+class Translate:
+    '''Translation class based on HuggingFace's translation pipeline.'''
+
+    def __init__(self, model_name: str) -> None:
+        '''
+        Initialize the Translation class with given parameters.
+        Args:
+            model_name: The name of the model to use for translation
+        '''
+        self._translation_pipeline = pipeline("translation",
+                                              model=model_name,
+                                              tokenizer=model_name)
+        self._max_length = self._translation_pipeline.model.config.max_length
+
+    def _translate(self, texts: str) -> str:
+        '''Translate the texts using the translation pipeline.
+        It splits the texts into blocks and translates each block separately,
+        avoiding problems with long texts.
+        Args:
+            texts: The texts to translate
+        Returns:
+            The translated texts
+        '''
+        tokenization = self._translation_pipeline.tokenizer(
+            texts, return_tensors="pt")  # type: ignore
+        if tokenization.input_ids.shape[1] > (self._max_length / 2):
+            blocks = floor(
+                tokenization.input_ids.shape[1] / self._max_length) + 3
+            sentences = sent_tokenize(texts)
+            # Split sentences into a number of blocks, e.g., 2 blocks = 2 groups
+            len_block = floor(len(sentences) / blocks) + 1
+            sentences_list = []
+            for i in range(blocks):
+                sentences_list.append(sentences[i * len_block:(i + 1) *
+                                                len_block])
+            texts_ = [" ".join(sent) for sent in sentences_list]
+        else:
+            texts_ = [texts]
+        texts_en = []
+        for text in texts_:
+            text_en = [
+                str(d['translation_text'])  # type: ignore
+                for d in self._translation_pipeline(text)  # type: ignore
+            ]
+            texts_en.append(" ".join(text_en))
+        text_en_final = " ".join(texts_en)
+        return text_en_final
+
+    def __call__(self, text: str) -> str:
+        '''Translate the text using the translation pipeline.
+        Args:
+            text: The text to translate
+        Returns:
+            The translated text
+        '''
+        return self._translate(text)
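
For reference, a minimal usage sketch of the new wrapper (not part of this commit). It assumes the opus-mt checkpoint can be fetched from the HuggingFace Hub and that NLTK's punkt tokenizer data is available, since sent_tokenize relies on it on the long-text path:

import nltk

from langcheck.metrics.de import Translate

nltk.download('punkt')  # one-time download; sent_tokenize needs this corpus

translate = Translate('Helsinki-NLP/opus-mt-de-en')
print(translate('Tokio ist die Hauptstadt von Japan.'))
# Expected, per the tests added below: 'Tokyo is the capital of Japan.'

To see the splitting heuristic in numbers: for a 1,200-token input and max_length = 512, blocks = floor(1200 / 512) + 3 = 5, so the sentence list is divided into 5 roughly equal groups, each translated separately and re-joined with spaces.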
17 changes: 3 additions & 14 deletions src/langcheck/metrics/de/reference_free_text_quality.py
@@ -7,12 +7,11 @@
 from transformers.models.auto.modeling_auto import \
     AutoModelForSequenceClassification
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.pipelines import pipeline
-from transformers.pipelines.base import Pipeline
 
 from langcheck._handle_logs import _handle_logging_level
 from langcheck.metrics._detoxify import Detoxify
 from langcheck.metrics._validation import validate_parameters_reference_free
+from langcheck.metrics.de._translation import Translate
 from langcheck.metrics.de.reference_based_text_quality import \
     semantic_similarity
 from langcheck.metrics.en.reference_free_text_quality import _toxicity_openai
@@ -32,7 +31,6 @@
 _sentiment_model = None
 
 _translation_model_path = 'Helsinki-NLP/opus-mt-de-en'
-_translation_pipeline: Pipeline | None = None
 
 _toxicity_model = None
 
@@ -148,22 +146,13 @@ def fluency(
     Parrot fluency model to calculate the fluency scores, from the English
     counterpart.
     '''
-    global _translation_pipeline
-    if _translation_pipeline is None:
-        _translation_pipeline = pipeline('translation',
-                                         model=_translation_model_path)
+    translation = Translate(_translation_model_path)
 
     if isinstance(generated_outputs, str):
         generated_outputs = [generated_outputs]
 
     # Translate to English
-    generated_outputs_en = [
-        cast(str,
-             d['translation_text'])  # type: ignore[reportGeneralTypeIssues]
-        for d in _translation_pipeline(
-            generated_outputs
-        )  # type: ignore[reportOptionalIterable]  # NOQA: E501
-    ]
+    generated_outputs_en = [translation(out) for out in generated_outputs]
 
     _metric_value = en_fluency(generated_outputs_en, prompts, model_type,
                                openai_client, openai_args)
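With the wrapper in place, fluency translates each German output through Translate before scoring it with the English Parrot model. A hedged call sketch, assuming fluency is re-exported from langcheck.metrics.de the same way sentiment and toxicity are:

from langcheck.metrics.de import fluency  # assumed re-export

result = fluency(['Tokio ist die Hauptstadt von Japan.'])
print(result)  # a MetricValue holding one fluency score per output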
58 changes: 5 additions & 53 deletions src/langcheck/metrics/de/source_based_text_quality.py
@@ -1,64 +1,23 @@
 from __future__ import annotations
 
-from math import floor
-from typing import Dict, List, Optional, cast
+from typing import Dict, List, Optional
 
-from nltk.tokenize import sent_tokenize
 from openai import OpenAI
-from transformers.pipelines import pipeline
-from transformers.pipelines.base import Pipeline
 
 from langcheck.metrics._validation import (
     validate_parameters_context_relevance, validate_parameters_source_based)
+from langcheck.metrics.de._translation import Translate
 from langcheck.metrics.en._openai import OpenAIBasedEvaluator
 from langcheck.metrics.en.source_based_text_quality import \
     factual_consistency as en_factual_consistency
 from langcheck.metrics.metric_value import MetricValue
 from langcheck.utils.progess_bar import tqdm_wrapper
 
 _factual_consistency_translation_model_path = 'Helsinki-NLP/opus-mt-de-en'
-_factual_consistency_translation_pipeline: Pipeline | None = None
 
 LANG = 'de'
 
 
-def _translate(texts: str, _translation_pipeline: Pipeline) -> str:
-    '''Translate the texts using the translation pipeline.
-    It splits the texts into blocks and translates each block separately,
-    avoiding problems with long texts.
-    Args:
-        texts: The texts to translate
-        _translation_pipeline: The translation pipeline
-    Returns:
-        The translated texts
-    '''
-    tokenization = _translation_pipeline.tokenizer(
-        texts, return_tensors="pt")  # type: ignore
-    if tokenization.input_ids.shape[1] > (
-            _translation_pipeline.model.config.max_length / 2):
-        max_length = _translation_pipeline.model.config.max_length
-        blocks = floor(tokenization.input_ids.shape[1] / max_length) + 3
-        sentences = sent_tokenize(texts)
-        # Split sentences into a number of blocks, e.g., 2 blocks = 2 groups
-        len_block = floor(len(sentences) / blocks) + 1
-        # print(len_block, len(sentences), blocks)
-        sentences_list = []
-        for i in range(blocks):
-            sentences_list.append(sentences[i * len_block:(i + 1) * len_block])
-        texts_ = [" ".join(sent) for sent in sentences_list]
-    else:
-        texts_ = [texts]
-    texts_en = []
-    for text in texts_:
-        text_en = [
-            str(d['translation_text'])  # type: ignore
-            for d in _translation_pipeline(text)  # type: ignore
-        ]
-        texts_en.append(" ".join(text_en))
-    text_en_final = " ".join(texts_en)
-    return text_en_final
-
-
 def factual_consistency(
     generated_outputs: List[str] | str,
     sources: List[str] | str,
@@ -128,21 +87,14 @@ def factual_consistency(
     metric_value.language = LANG
     return metric_value
 
-    global _factual_consistency_translation_pipeline
-    if _factual_consistency_translation_pipeline is None:
-        _factual_consistency_translation_pipeline = pipeline(
-            'translation', model=_factual_consistency_translation_model_path)
+    translation = Translate(_factual_consistency_translation_model_path)
 
     # Translate the sources and generated outputs to English.
     # Currently, the type checks are not working for the pipeline, since
     # too diverse types can be returned.
-    en_source = [
-        _translate(source, _factual_consistency_translation_pipeline)
-        for source in sources
-    ]
+    en_source = [translation(source) for source in sources]
     en_generated_outputs = [
-        _translate(gen_out, _factual_consistency_translation_pipeline)
-        for gen_out in generated_outputs
+        translation(gen_out) for gen_out in generated_outputs
     ]
     # Compute the factual consistency scores in English.
     factual_consistency_scores = en_factual_consistency(
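factual_consistency follows the same pattern: sources and generated outputs are both routed through the shared Translate wrapper, then scored by the English metric. A sketch under the same re-export assumption, with hypothetical example strings:

from langcheck.metrics.de import factual_consistency  # assumed re-export

result = factual_consistency(
    generated_outputs=['Tokio ist die Hauptstadt von Japan.'],
    sources=['Tokio ist die Hauptstadt und bevölkerungsreichste Stadt Japans.'])
print(result)  # scores are computed on the English translations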
38 changes: 38 additions & 0 deletions tests/metrics/de/test_translation.py
@@ -0,0 +1,38 @@
+import pytest
+
+from langcheck.metrics.de import Translate
+
+
+@pytest.mark.parametrize(
+    'de_text,en_text',
+    [
+        ([
+            'Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.',  # noqa: E501
+            'I have no personal opinions, emotions or consciousness.'
+        ]),
+        ([
+            'Mein Freund. Willkommen in den Karpaten.',
+            'My friend, welcome to the Carpathians.'
+        ]),
+        ([
+            'Tokio ist die Hauptstadt von Japan.',
+            'Tokyo is the capital of Japan.'
+        ]),
+    ])
+def test_translate_de_en(de_text: str, en_text: str) -> None:
+    translation = Translate('Helsinki-NLP/opus-mt-de-en')
+    assert translation(de_text) == en_text
+
+
+@pytest.mark.parametrize('en_text,de_text', [
+    ('I have no personal opinions, emotions or consciousness.',
+     'Ich habe keine persönlichen Meinungen, Emotionen oder Bewusstsein.'),
+    ('My Friend. Welcome to the Carpathians. I am anxiously expecting you.',
+     'Willkommen bei den Karpaten, ich erwarte Sie.'),
+    ('Tokyo is the capital of Japan.', 'Tokio ist die Hauptstadt Japans.'),
+])
+def test_translate_en_de(en_text: str, de_text: str) -> None:
+    translation = Translate('Helsinki-NLP/opus-mt-en-de')
+    assert translation(en_text) == de_text
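
The new tests can be run with pytest tests/metrics/de/test_translation.py. Note that they fetch the opus-mt checkpoints on first run and assert exact model output, so they may be sensitive to model and tokenizer revisions.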
