From 7a7d6be94eb1f812ab04c39ac61814b48a35627e Mon Sep 17 00:00:00 2001 From: ischender Date: Fri, 26 Jan 2024 17:07:08 +0100 Subject: [PATCH 1/5] factual consistency notebook for German --- benchmarking/factual_consistency_de.ipynb | 235 ++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 benchmarking/factual_consistency_de.ipynb diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb new file mode 100644 index 00000000..e8a673d1 --- /dev/null +++ b/benchmarking/factual_consistency_de.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Notebook to compute the correlation between the `factual_consistency` metric outputs and human annotated consistency scores on benchmark datasets\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Note\n", + "\n", + "The consistency scores texts have been translated to German using https://huggingface.co/Helsinki-NLP/opus-mt-en-de\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the benchmark datasets\n", + "import json\n", + "\n", + "# These files were copied from the UniEval repo\n", + "# (https://github.com/maszhongming/UniEval/tree/main/reproduce/data/fact), which\n", + "# is a modified version of the dataset from https://github.com/W4ngatang/qags, then translated to German\n", + "\n", + "qags_xsum_path = 'data/qags_xsum-de.json'\n", + "qags_cnndm_path = 'data/qags_cnndm-de.json'\n", + "\n", + "with open(qags_xsum_path) as f:\n", + " qags_xsum_data = json.loads(f.read())\n", + "with open(qags_cnndm_path) as f:\n", + " qags_cnndm_data = json.loads(f.read())\n", + "\n", + "print(f'QAGS-XSUM has {len(qags_xsum_data)} data points')\n", + "print(f'QAGS-CNN has {len(qags_cnndm_data)} data points')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract the generated outputs, sources, and human annotated scores\n", + "qags_xsum_generated_outputs = [item['system_output'] for item in qags_xsum_data]\n", + "qags_xsum_sources = [item['source'] for item in qags_xsum_data]\n", + "qags_xsum_scores = [item['scores']['consistency'] for item in qags_xsum_data]\n", + "\n", + "qags_cnndm_generated_outputs = [\n", + " item['system_output'] for item in qags_cnndm_data\n", + "]\n", + "qags_cnndm_sources = [item['source'] for item in qags_cnndm_data]\n", + "qags_cnndm_scores = [item['scores']['consistency'] for item in qags_cnndm_data]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr, pearsonr, kendalltau\n", + "\n", + "\n", + "def compute_correlation_values(result, annotated_scores):\n", + " '''Function to compute and output the correlation values between the metric\n", + " score and the human annotation scores.'''\n", + " # Ignore any data points where the evaluator returned `None`. 
This may happen\n", + " # if, for example, the prompt triggers Azure OpenAI's content filter.\n", + " result_df = result.to_df()\n", + " indices = list(result_df[result_df['metric_value'].notna()].index)\n", + " valid_metric_values = [result.metric_values[i] for i in indices]\n", + " valid_annotated_scores = [annotated_scores[i] for i in indices]\n", + "\n", + " pearson_corr = pearsonr(valid_metric_values, valid_annotated_scores)[0]\n", + " spearman_corr = spearmanr(valid_metric_values, valid_annotated_scores)[0]\n", + " kendalltau_corr = kendalltau(valid_metric_values, valid_annotated_scores)[0]\n", + "\n", + " print(f'Pearson correlation = {pearson_corr}')\n", + " print(f'Spearman correlation = {spearman_corr}')\n", + " print(f'Kendall-Tau correlation = {kendalltau_corr}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-XSUM using the local (UniEval)\n", + "# model option and measure various correlations with the human annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "\n", + "result = factual_consistency(qags_xsum_generated_outputs, qags_xsum_sources)\n", + "compute_correlation_values(result, qags_xsum_scores)\n", + "\n", + "# RUN-DATE: 2024-1-17\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.40358016311552586\n", + "# Spearman correlation = 0.37558373934197853\n", + "# Kendall-Tau correlation = 0.3097142857142857\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-XSUM using the OpenAI\n", + "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", + "# annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "import os\n", + "\n", + "result = factual_consistency(qags_xsum_generated_outputs,\n", + " qags_xsum_sources,\n", + " model_type='azure_openai',\n", + " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", + "\n", + "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "\n", + "# RUN-DATE: 2024-1-17\n", + "# OpenAI deployment details:\n", + "# - Model name: gpt-35-turbo\n", + "# - Model version: 0613\n", + "# Resulting correlation values:\n", + "# Computed on 230 examples\n", + "# Pearson correlation = 0.1632062194597614\n", + "# Spearman correlation = 0.15952417117218096\n", + "# Kendall-Tau correlation = 0.15103303151237832\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-CNN using the local (UniEval)\n", + "# model option and measure various correlations with the human annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "\n", + "result = factual_consistency(qags_cnndm_generated_outputs, qags_cnndm_sources)\n", + "compute_correlation_values(result, qags_cnndm_scores)\n", + "\n", + "# RUN-DATE: 2024-1-18\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.5126921817479836\n", + "# Spearman correlation = 0.4940799552395499\n", + "# Kendall-Tau correlation = 0.3910688466232861" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = factual_consistency(\n", + " qags_cnndm_generated_outputs[:50],\n", + " qags_cnndm_sources[:50],\n", + " model_type='openai',\n", + ")\n", + "\n", + 
"compute_correlation_values(result, qags_cnndm_scores[:50])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", + "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", + "# annotated scores\n", + "from langcheck.metrics import factual_consistency\n", + "import os\n", + "\n", + "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", + "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n", + "result = factual_consistency(qags_cnndm_generated_outputs,\n", + " qags_cnndm_sources,\n", + " model_type='azure_openai',\n", + " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", + "compute_correlation_values(result, qags_cnndm_scores)\n", + "\n", + "# RUN-DATE: 2024-1-18\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.2562263899971331\n", + "# Spearman correlation = 0.21022360246996274\n", + "# Kendall-Tau correlation = 0.19670459803185497\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 32f4e562c117d34c3d5db19c869fc75183b34ba5 Mon Sep 17 00:00:00 2001 From: ischender Date: Fri, 26 Jan 2024 17:07:30 +0100 Subject: [PATCH 2/5] adding German (Deutsch) to the READMEs --- README.md | 9 +++++---- README_ja.md | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 35e29818..edfb2f61 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Simple, Pythonic building blocks to evaluate LLM applications. [Examples](#examples) • [Quickstart](https://langcheck.readthedocs.io/en/latest/quickstart.html) • [Docs](https://langcheck.readthedocs.io/en/latest/index.html) • -[日本語](README_ja.md) +[日本語](README_ja.md) • +[Deutsch](README_de.md) @@ -56,9 +57,9 @@ LangCheck includes several types of metrics to evaluate LLM applications. Some e | Type of Metric | Examples | Languages | | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- | ------------- | -| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`
`sentiment(generated_outputs)`
`ai_disclaimer_similarity(generated_outputs)` | EN, JA | -| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`
`rouge2(generated_outputs, reference_outputs)` | EN, JA | -| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA | +| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`
`sentiment(generated_outputs)`
`ai_disclaimer_similarity(generated_outputs)` | EN, JA, DE | +| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`
`rouge2(generated_outputs, reference_outputs)` | EN, JA, DE | +| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA, DE | | [Text Structure Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#text-structure-metrics) | `is_float(generated_outputs, min=0, max=None)`
`is_json_object(generated_outputs)` | All Languages | ### Visualize Metrics diff --git a/README_ja.md b/README_ja.md index a9f9af02..8a1cc19f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -14,7 +14,9 @@ LLMアプリケーションの評価のためのシンプルなPythonライブ [利用例](#利用例) • [クイックスタート](https://langcheck.readthedocs.io/en/latest/quickstart.html) • [ドキュメント](https://langcheck.readthedocs.io/en/latest/index.html) • -[English](README.md) +[English](README.md) • +[Deutsch](README_de.md) + From 883b85fe98ababebe41cdc95e41dc93e19508c0c Mon Sep 17 00:00:00 2001 From: Alessandro Pedori Date: Tue, 30 Jan 2024 09:55:52 +0100 Subject: [PATCH 3/5] Update benchmarking/factual_consistency_de.ipynb Co-authored-by: Yosuke Higashi <107823399+yosukehigashi@users.noreply.github.com> --- benchmarking/factual_consistency_de.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index e8a673d1..0af5fb6d 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -124,7 +124,7 @@ " model_type='azure_openai',\n", " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", "\n", - "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "compute_correlation_values(result, qags_xsum_scores)\n", "\n", "# RUN-DATE: 2024-1-17\n", "# OpenAI deployment details:\n", From 5f50504061d1716addbd1afe97b73a87d08a8c6c Mon Sep 17 00:00:00 2001 From: Alessandro Pedori Date: Tue, 30 Jan 2024 09:56:59 +0100 Subject: [PATCH 4/5] Update benchmarking/factual_consistency_de.ipynb Co-authored-by: Yosuke Higashi <107823399+yosukehigashi@users.noreply.github.com> --- benchmarking/factual_consistency_de.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index 0af5fb6d..d9842d2d 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -190,7 +190,7 @@ "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", "# annotated scores\n", - "from langcheck.metrics import factual_consistency\n", + "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", From 759a357076a43cc97b5c6a49c02e61a288637166 Mon Sep 17 00:00:00 2001 From: ischender Date: Tue, 30 Jan 2024 09:59:07 +0100 Subject: [PATCH 5/5] corrections to notebook --- benchmarking/factual_consistency_de.ipynb | 31 ++++------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index e8a673d1..341b5f73 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -119,12 +119,15 @@ "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", + "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", + "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n", "result = factual_consistency(qags_xsum_generated_outputs,\n", " qags_xsum_sources,\n", " model_type='azure_openai',\n", " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", "\n", - "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "compute_correlation_values(result, qags_xsum_scores)\n", "\n", "# RUN-DATE: 
2024-1-17\n", "# OpenAI deployment details:\n", @@ -157,30 +160,6 @@ "# Kendall-Tau correlation = 0.3910688466232861" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = factual_consistency(\n", - " qags_cnndm_generated_outputs[:50],\n", - " qags_cnndm_sources[:50],\n", - " model_type='openai',\n", - ")\n", - "\n", - "compute_correlation_values(result, qags_cnndm_scores[:50])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result" - ] - }, { "cell_type": "code", "execution_count": null, @@ -190,7 +169,7 @@ "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", "# annotated scores\n", - "from langcheck.metrics import factual_consistency\n", + "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n",
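
---

A note on the data-preparation step referenced (but not included) in this patch series: the notebook's note says the benchmark texts were translated to German with https://huggingface.co/Helsinki-NLP/opus-mt-en-de, and the notebook then reads the resulting `data/qags_xsum-de.json` / `data/qags_cnndm-de.json` files. The sketch below shows one way that translation step might look. It is illustrative only, assuming the standard Hugging Face `transformers` translation pipeline and the same JSON layout the notebook consumes (`source`, `system_output`, `scores`); the English input path and the `translate_item` helper are hypothetical, not part of the patch.

```python
# Illustrative sketch only -- not part of the patch series above.
# Assumes: English QAGS-style JSON input, Helsinki-NLP/opus-mt-en-de (MarianMT),
# and the field names used by the notebook. Paths and helpers are hypothetical.
import json

from transformers import pipeline

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")


def translate_text(text: str) -> str:
    # MarianMT has a limited input length, so translate roughly sentence by
    # sentence. This naive split is enough for a sketch; a real script would
    # likely use a proper sentence splitter.
    sentences = [s for s in text.split(". ") if s]
    translated = [
        translator(s, max_length=512)[0]["translation_text"] for s in sentences
    ]
    return " ".join(translated)


def translate_item(item: dict) -> dict:
    # Translate only the texts; the human-annotated consistency scores are
    # carried over unchanged.
    return {
        **item,
        "source": translate_text(item["source"]),
        "system_output": translate_text(item["system_output"]),
    }


# English original from the UniEval repo (hypothetical local path)
with open("data/qags_xsum.json") as f:
    english_data = json.load(f)

german_data = [translate_item(item) for item in english_data]

# File name expected by the notebook
with open("data/qags_xsum-de.json", "w") as f:
    json.dump(german_data, f, ensure_ascii=False)
```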