diff --git a/README.md b/README.md
index 35e29818..edfb2f61 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,8 @@ Simple, Pythonic building blocks to evaluate LLM applications.
[Examples](#examples) •
[Quickstart](https://langcheck.readthedocs.io/en/latest/quickstart.html) •
[Docs](https://langcheck.readthedocs.io/en/latest/index.html) •
-[日本語](README_ja.md)
+[日本語](README_ja.md) •
+[Deutsch](README_de.md)
@@ -56,9 +57,9 @@ LangCheck includes several types of metrics to evaluate LLM applications. Some e
| Type of Metric | Examples | Languages |
| ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- | ------------- |
-| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`<br>`sentiment(generated_outputs)`<br>`ai_disclaimer_similarity(generated_outputs)` | EN, JA |
-| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`<br>`rouge2(generated_outputs, reference_outputs)` | EN, JA |
-| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA |
+| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`<br>`sentiment(generated_outputs)`<br>`ai_disclaimer_similarity(generated_outputs)` | EN, JA, DE |
+| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`<br>`rouge2(generated_outputs, reference_outputs)` | EN, JA, DE |
+| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA, DE |
| [Text Structure Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#text-structure-metrics) | `is_float(generated_outputs, min=0, max=None)`<br>`is_json_object(generated_outputs)` | All Languages |
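+
+For example, the new German metrics follow the same call pattern as their English counterparts (a minimal sketch; the example strings are illustrative):
+
+```python
+from langcheck.metrics.de import factual_consistency
+
+generated_outputs = ['Berlin ist die Hauptstadt von Frankreich.']
+sources = ['Berlin ist die Hauptstadt von Deutschland.']
+
+factual_consistency(generated_outputs, sources)
+```
+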
### Visualize Metrics
diff --git a/README_ja.md b/README_ja.md
index a9f9af02..8a1cc19f 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -14,7 +14,9 @@ LLMアプリケーションの評価のためのシンプルなPythonライブ
[利用例](#利用例) •
[クイックスタート](https://langcheck.readthedocs.io/en/latest/quickstart.html) •
[ドキュメント](https://langcheck.readthedocs.io/en/latest/index.html) •
-[English](README.md)
+[English](README.md) •
+[Deutsch](README_de.md)
+
diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb
new file mode 100644
index 00000000..ada91947
--- /dev/null
+++ b/benchmarking/factual_consistency_de.ipynb
@@ -0,0 +1,214 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Notebook to compute the correlation between the `factual_consistency` metric outputs and human annotated consistency scores on benchmark datasets\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Note\n",
+ "\n",
+ "The consistency scores texts have been translated to German using https://huggingface.co/Helsinki-NLP/opus-mt-en-de\n"
+ ]
+ },
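+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cell is a minimal sketch of how such a translation can be produced with that model via Hugging Face `transformers`. It is an illustration only, not the exact script used to build the `-de` benchmark files.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: translate English benchmark texts to German with the MarianMT\n",
+ "# model named in the note above. Illustrative only.\n",
+ "from transformers import MarianMTModel, MarianTokenizer\n",
+ "\n",
+ "mt_model_name = 'Helsinki-NLP/opus-mt-en-de'\n",
+ "mt_tokenizer = MarianTokenizer.from_pretrained(mt_model_name)\n",
+ "mt_model = MarianMTModel.from_pretrained(mt_model_name)\n",
+ "\n",
+ "\n",
+ "def translate_en_to_de(texts):\n",
+ "    '''Translate a batch of English texts to German.'''\n",
+ "    batch = mt_tokenizer(texts, return_tensors='pt', padding=True,\n",
+ "                         truncation=True)\n",
+ "    outputs = mt_model.generate(**batch)\n",
+ "    return mt_tokenizer.batch_decode(outputs, skip_special_tokens=True)\n",
+ "\n",
+ "\n",
+ "print(translate_en_to_de(['The cat sat on the mat.']))"
+ ]
+ },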
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the benchmark datasets\n",
+ "import json\n",
+ "\n",
+ "# These files were copied from the UniEval repo\n",
+ "# (https://github.com/maszhongming/UniEval/tree/main/reproduce/data/fact), which\n",
+ "# is a modified version of the dataset from https://github.com/W4ngatang/qags, then translated to German\n",
+ "\n",
+ "qags_xsum_path = 'data/qags_xsum-de.json'\n",
+ "qags_cnndm_path = 'data/qags_cnndm-de.json'\n",
+ "\n",
+ "with open(qags_xsum_path) as f:\n",
+ " qags_xsum_data = json.loads(f.read())\n",
+ "with open(qags_cnndm_path) as f:\n",
+ " qags_cnndm_data = json.loads(f.read())\n",
+ "\n",
+ "print(f'QAGS-XSUM has {len(qags_xsum_data)} data points')\n",
+ "print(f'QAGS-CNN has {len(qags_cnndm_data)} data points')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract the generated outputs, sources, and human annotated scores\n",
+ "qags_xsum_generated_outputs = [item['system_output'] for item in qags_xsum_data]\n",
+ "qags_xsum_sources = [item['source'] for item in qags_xsum_data]\n",
+ "qags_xsum_scores = [item['scores']['consistency'] for item in qags_xsum_data]\n",
+ "\n",
+ "qags_cnndm_generated_outputs = [\n",
+ " item['system_output'] for item in qags_cnndm_data\n",
+ "]\n",
+ "qags_cnndm_sources = [item['source'] for item in qags_cnndm_data]\n",
+ "qags_cnndm_scores = [item['scores']['consistency'] for item in qags_cnndm_data]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import spearmanr, pearsonr, kendalltau\n",
+ "\n",
+ "\n",
+ "def compute_correlation_values(result, annotated_scores):\n",
+ " '''Function to compute and output the correlation values between the metric\n",
+ " score and the human annotation scores.'''\n",
+ " # Ignore any data points where the evaluator returned `None`. This may happen\n",
+ " # if, for example, the prompt triggers Azure OpenAI's content filter.\n",
+ " result_df = result.to_df()\n",
+ " indices = list(result_df[result_df['metric_value'].notna()].index)\n",
+ " valid_metric_values = [result.metric_values[i] for i in indices]\n",
+ " valid_annotated_scores = [annotated_scores[i] for i in indices]\n",
+ "\n",
+ " pearson_corr = pearsonr(valid_metric_values, valid_annotated_scores)[0]\n",
+ " spearman_corr = spearmanr(valid_metric_values, valid_annotated_scores)[0]\n",
+ " kendalltau_corr = kendalltau(valid_metric_values, valid_annotated_scores)[0]\n",
+ "\n",
+ " print(f'Pearson correlation = {pearson_corr}')\n",
+ " print(f'Spearman correlation = {spearman_corr}')\n",
+ " print(f'Kendall-Tau correlation = {kendalltau_corr}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute the factual consistency scores on QAGS-XSUM using the local (UniEval)\n",
+ "# model option and measure various correlations with the human annotated scores\n",
+ "from langcheck.metrics.de import factual_consistency\n",
+ "\n",
+ "result = factual_consistency(qags_xsum_generated_outputs, qags_xsum_sources)\n",
+ "compute_correlation_values(result, qags_xsum_scores)\n",
+ "\n",
+ "# RUN-DATE: 2024-1-17\n",
+ "# Resulting correlation values:\n",
+ "# Pearson correlation = 0.40358016311552586\n",
+ "# Spearman correlation = 0.37558373934197853\n",
+ "# Kendall-Tau correlation = 0.3097142857142857"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute the factual consistency scores on QAGS-XSUM using the OpenAI\n",
+ "# (gpt-3.5-turbo) model option and measure various correlations with the human\n",
+ "# annotated scores\n",
+ "from langcheck.metrics.de import factual_consistency\n",
+ "import os\n",
+ "\n",
+ "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n",
+ "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n",
+ "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n",
+ "result = factual_consistency(qags_xsum_generated_outputs,\n",
+ " qags_xsum_sources,\n",
+ " model_type='azure_openai',\n",
+ " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n",
+ "\n",
+ "compute_correlation_values(result, qags_xsum_scores)\n",
+ "\n",
+ "# RUN-DATE: 2024-1-17\n",
+ "# OpenAI deployment details:\n",
+ "# - Model name: gpt-35-turbo\n",
+ "# - Model version: 0613\n",
+ "# Resulting correlation values:\n",
+ "# Computed on 230 examples\n",
+ "# Pearson correlation = 0.1632062194597614\n",
+ "# Spearman correlation = 0.15952417117218096\n",
+ "# Kendall-Tau correlation = 0.15103303151237832"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute the factual consistency scores on QAGS-CNN using the local (UniEval)\n",
+ "# model option and measure various correlations with the human annotated scores\n",
+ "from langcheck.metrics.de import factual_consistency\n",
+ "\n",
+ "result = factual_consistency(qags_cnndm_generated_outputs, qags_cnndm_sources)\n",
+ "compute_correlation_values(result, qags_cnndm_scores)\n",
+ "\n",
+ "# RUN-DATE: 2024-1-18\n",
+ "# Resulting correlation values:\n",
+ "# Pearson correlation = 0.5126921817479836\n",
+ "# Spearman correlation = 0.4940799552395499\n",
+ "# Kendall-Tau correlation = 0.3910688466232861"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n",
+ "# (gpt-3.5-turbo) model option and measure various correlations with the human\n",
+ "# annotated scores\n",
+ "from langcheck.metrics.de import factual_consistency\n",
+ "import os\n",
+ "\n",
+ "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n",
+ "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n",
+ "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n",
+ "result = factual_consistency(qags_cnndm_generated_outputs,\n",
+ " qags_cnndm_sources,\n",
+ " model_type='azure_openai',\n",
+ " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n",
+ "compute_correlation_values(result, qags_cnndm_scores)\n",
+ "\n",
+ "# RUN-DATE: 2024-1-18\n",
+ "# Resulting correlation values:\n",
+ "# Pearson correlation = 0.2562263899971331\n",
+ "# Spearman correlation = 0.21022360246996274\n",
+ "# Kendall-Tau correlation = 0.19670459803185497"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}