From 7a7d6be94eb1f812ab04c39ac61814b48a35627e Mon Sep 17 00:00:00 2001 From: ischender Date: Fri, 26 Jan 2024 17:07:08 +0100 Subject: [PATCH 1/5] factual consistency notebook for German --- benchmarking/factual_consistency_de.ipynb | 235 ++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 benchmarking/factual_consistency_de.ipynb diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb new file mode 100644 index 00000000..e8a673d1 --- /dev/null +++ b/benchmarking/factual_consistency_de.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Notebook to compute the correlation between the `factual_consistency` metric outputs and human annotated consistency scores on benchmark datasets\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Note\n", + "\n", + "The consistency scores texts have been translated to German using https://huggingface.co/Helsinki-NLP/opus-mt-en-de\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the benchmark datasets\n", + "import json\n", + "\n", + "# These files were copied from the UniEval repo\n", + "# (https://github.com/maszhongming/UniEval/tree/main/reproduce/data/fact), which\n", + "# is a modified version of the dataset from https://github.com/W4ngatang/qags, then translated to German\n", + "\n", + "qags_xsum_path = 'data/qags_xsum-de.json'\n", + "qags_cnndm_path = 'data/qags_cnndm-de.json'\n", + "\n", + "with open(qags_xsum_path) as f:\n", + " qags_xsum_data = json.loads(f.read())\n", + "with open(qags_cnndm_path) as f:\n", + " qags_cnndm_data = json.loads(f.read())\n", + "\n", + "print(f'QAGS-XSUM has {len(qags_xsum_data)} data points')\n", + "print(f'QAGS-CNN has {len(qags_cnndm_data)} data points')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract the generated outputs, sources, and human annotated scores\n", + "qags_xsum_generated_outputs = [item['system_output'] for item in qags_xsum_data]\n", + "qags_xsum_sources = [item['source'] for item in qags_xsum_data]\n", + "qags_xsum_scores = [item['scores']['consistency'] for item in qags_xsum_data]\n", + "\n", + "qags_cnndm_generated_outputs = [\n", + " item['system_output'] for item in qags_cnndm_data\n", + "]\n", + "qags_cnndm_sources = [item['source'] for item in qags_cnndm_data]\n", + "qags_cnndm_scores = [item['scores']['consistency'] for item in qags_cnndm_data]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import spearmanr, pearsonr, kendalltau\n", + "\n", + "\n", + "def compute_correlation_values(result, annotated_scores):\n", + " '''Function to compute and output the correlation values between the metric\n", + " score and the human annotation scores.'''\n", + " # Ignore any data points where the evaluator returned `None`. 
This may happen\n", + " # if, for example, the prompt triggers Azure OpenAI's content filter.\n", + " result_df = result.to_df()\n", + " indices = list(result_df[result_df['metric_value'].notna()].index)\n", + " valid_metric_values = [result.metric_values[i] for i in indices]\n", + " valid_annotated_scores = [annotated_scores[i] for i in indices]\n", + "\n", + " pearson_corr = pearsonr(valid_metric_values, valid_annotated_scores)[0]\n", + " spearman_corr = spearmanr(valid_metric_values, valid_annotated_scores)[0]\n", + " kendalltau_corr = kendalltau(valid_metric_values, valid_annotated_scores)[0]\n", + "\n", + " print(f'Pearson correlation = {pearson_corr}')\n", + " print(f'Spearman correlation = {spearman_corr}')\n", + " print(f'Kendall-Tau correlation = {kendalltau_corr}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-XSUM using the local (UniEval)\n", + "# model option and measure various correlations with the human annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "\n", + "result = factual_consistency(qags_xsum_generated_outputs, qags_xsum_sources)\n", + "compute_correlation_values(result, qags_xsum_scores)\n", + "\n", + "# RUN-DATE: 2024-1-17\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.40358016311552586\n", + "# Spearman correlation = 0.37558373934197853\n", + "# Kendall-Tau correlation = 0.3097142857142857\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-XSUM using the OpenAI\n", + "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", + "# annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "import os\n", + "\n", + "result = factual_consistency(qags_xsum_generated_outputs,\n", + " qags_xsum_sources,\n", + " model_type='azure_openai',\n", + " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", + "\n", + "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "\n", + "# RUN-DATE: 2024-1-17\n", + "# OpenAI deployment details:\n", + "# - Model name: gpt-35-turbo\n", + "# - Model version: 0613\n", + "# Resulting correlation values:\n", + "# Computed on 230 examples\n", + "# Pearson correlation = 0.1632062194597614\n", + "# Spearman correlation = 0.15952417117218096\n", + "# Kendall-Tau correlation = 0.15103303151237832\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-CNN using the local (UniEval)\n", + "# model option and measure various correlations with the human annotated scores\n", + "from langcheck.metrics.de import factual_consistency\n", + "\n", + "result = factual_consistency(qags_cnndm_generated_outputs, qags_cnndm_sources)\n", + "compute_correlation_values(result, qags_cnndm_scores)\n", + "\n", + "# RUN-DATE: 2024-1-18\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.5126921817479836\n", + "# Spearman correlation = 0.4940799552395499\n", + "# Kendall-Tau correlation = 0.3910688466232861" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = factual_consistency(\n", + " qags_cnndm_generated_outputs[:50],\n", + " qags_cnndm_sources[:50],\n", + " model_type='openai',\n", + ")\n", + "\n", + 
"compute_correlation_values(result, qags_cnndm_scores[:50])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", + "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", + "# annotated scores\n", + "from langcheck.metrics import factual_consistency\n", + "import os\n", + "\n", + "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", + "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n", + "result = factual_consistency(qags_cnndm_generated_outputs,\n", + " qags_cnndm_sources,\n", + " model_type='azure_openai',\n", + " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", + "compute_correlation_values(result, qags_cnndm_scores)\n", + "\n", + "# RUN-DATE: 2024-1-18\n", + "# Resulting correlation values:\n", + "# Pearson correlation = 0.2562263899971331\n", + "# Spearman correlation = 0.21022360246996274\n", + "# Kendall-Tau correlation = 0.19670459803185497\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 32f4e562c117d34c3d5db19c869fc75183b34ba5 Mon Sep 17 00:00:00 2001 From: ischender Date: Fri, 26 Jan 2024 17:07:30 +0100 Subject: [PATCH 2/5] adding German (Deutsch) to the READMEs --- README.md | 9 +++++---- README_ja.md | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 35e29818..edfb2f61 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ Simple, Pythonic building blocks to evaluate LLM applications. [Examples](#examples) • [Quickstart](https://langcheck.readthedocs.io/en/latest/quickstart.html) • [Docs](https://langcheck.readthedocs.io/en/latest/index.html) • -[日本語](README_ja.md) +[日本語](README_ja.md) • +[Deutsch](README_de.md) @@ -56,9 +57,9 @@ LangCheck includes several types of metrics to evaluate LLM applications. Some e | Type of Metric | Examples | Languages | | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- | ------------- | -| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`
`sentiment(generated_outputs)`
`ai_disclaimer_similarity(generated_outputs)` | EN, JA | -| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`
`rouge2(generated_outputs, reference_outputs)` | EN, JA | -| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA | +| [Reference-Free Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-free-text-quality-metrics) | `toxicity(generated_outputs)`
`sentiment(generated_outputs)`
`ai_disclaimer_similarity(generated_outputs)` | EN, JA, DE | +| [Reference-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#reference-based-text-quality-metrics) | `semantic_similarity(generated_outputs, reference_outputs)`
`rouge2(generated_outputs, reference_outputs)` | EN, JA, DE | +| [Source-Based Text Quality Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#source-based-text-quality-metrics) | `factual_consistency(generated_outputs, sources)` | EN, JA, DE | | [Text Structure Metrics](https://langcheck.readthedocs.io/en/latest/metrics.html#text-structure-metrics) | `is_float(generated_outputs, min=0, max=None)`
`is_json_object(generated_outputs)` | All Languages | ### Visualize Metrics diff --git a/README_ja.md b/README_ja.md index a9f9af02..8a1cc19f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -14,7 +14,9 @@ LLMアプリケーションの評価のためのシンプルなPythonライブ [利用例](#利用例) • [クイックスタート](https://langcheck.readthedocs.io/en/latest/quickstart.html) • [ドキュメント](https://langcheck.readthedocs.io/en/latest/index.html) • -[English](README.md) +[English](README.md) • +[Deutsch](README_de.md) + From 883b85fe98ababebe41cdc95e41dc93e19508c0c Mon Sep 17 00:00:00 2001 From: Alessandro Pedori Date: Tue, 30 Jan 2024 09:55:52 +0100 Subject: [PATCH 3/5] Update benchmarking/factual_consistency_de.ipynb Co-authored-by: Yosuke Higashi <107823399+yosukehigashi@users.noreply.github.com> --- benchmarking/factual_consistency_de.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index e8a673d1..0af5fb6d 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -124,7 +124,7 @@ " model_type='azure_openai',\n", " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", "\n", - "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "compute_correlation_values(result, qags_xsum_scores)\n", "\n", "# RUN-DATE: 2024-1-17\n", "# OpenAI deployment details:\n", From 5f50504061d1716addbd1afe97b73a87d08a8c6c Mon Sep 17 00:00:00 2001 From: Alessandro Pedori Date: Tue, 30 Jan 2024 09:56:59 +0100 Subject: [PATCH 4/5] Update benchmarking/factual_consistency_de.ipynb Co-authored-by: Yosuke Higashi <107823399+yosukehigashi@users.noreply.github.com> --- benchmarking/factual_consistency_de.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index 0af5fb6d..d9842d2d 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -190,7 +190,7 @@ "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", "# annotated scores\n", - "from langcheck.metrics import factual_consistency\n", + "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", From 759a357076a43cc97b5c6a49c02e61a288637166 Mon Sep 17 00:00:00 2001 From: ischender Date: Tue, 30 Jan 2024 09:59:07 +0100 Subject: [PATCH 5/5] corrections to notebook --- benchmarking/factual_consistency_de.ipynb | 31 ++++------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/benchmarking/factual_consistency_de.ipynb b/benchmarking/factual_consistency_de.ipynb index e8a673d1..341b5f73 100644 --- a/benchmarking/factual_consistency_de.ipynb +++ b/benchmarking/factual_consistency_de.ipynb @@ -119,12 +119,15 @@ "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", + "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n", + "os.environ[\"OPENAI_API_VERSION\"] = 'YOUR_OPENAI_API_VERSION'\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = 'YOUR_AZURE_OPENAI_ENDPOINT'\n", "result = factual_consistency(qags_xsum_generated_outputs,\n", " qags_xsum_sources,\n", " model_type='azure_openai',\n", " openai_args={'model': 'YOUR_DEPLOYMENT_NAME'})\n", "\n", - "compute_correlation_values(result, qags_xsum_scores[:50])\n", + "compute_correlation_values(result, qags_xsum_scores)\n", "\n", "# RUN-DATE: 
2024-1-17\n", "# OpenAI deployment details:\n", @@ -157,30 +160,6 @@ "# Kendall-Tau correlation = 0.3910688466232861" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result = factual_consistency(\n", - " qags_cnndm_generated_outputs[:50],\n", - " qags_cnndm_sources[:50],\n", - " model_type='openai',\n", - ")\n", - "\n", - "compute_correlation_values(result, qags_cnndm_scores[:50])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "result" - ] - }, { "cell_type": "code", "execution_count": null, @@ -190,7 +169,7 @@ "# Compute the factual consistency scores on QAGS-CNN using the OpenAI\n", "# (gpt-3.5-turbo) model option and measure various correlations with the human\n", "# annotated scores\n", - "from langcheck.metrics import factual_consistency\n", + "from langcheck.metrics.de import factual_consistency\n", "import os\n", "\n", "os.environ[\"AZURE_OPENAI_KEY\"] = 'YOUR_AZURE_OPENAI_KEY'\n",
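
---

A note on the data-preparation step referenced (but not included) in this patch series: the notebook's note says the benchmark texts were translated to German with https://huggingface.co/Helsinki-NLP/opus-mt-en-de, and the notebook then reads the resulting `data/qags_xsum-de.json` / `data/qags_cnndm-de.json` files. The sketch below shows one way that translation step might look. It is illustrative only, assuming the standard Hugging Face `transformers` translation pipeline and the same JSON layout the notebook consumes (`source`, `system_output`, `scores`); the English input path and the `translate_item` helper are hypothetical, not part of the patch.

```python
# Illustrative sketch only -- not part of the patch series above.
# Assumes: English QAGS-style JSON input, Helsinki-NLP/opus-mt-en-de (MarianMT),
# and the field names used by the notebook. Paths and helpers are hypothetical.
import json

from transformers import pipeline

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")


def translate_text(text: str) -> str:
    # MarianMT has a limited input length, so translate roughly sentence by
    # sentence. This naive split is enough for a sketch; a real script would
    # likely use a proper sentence splitter.
    sentences = [s for s in text.split(". ") if s]
    translated = [
        translator(s, max_length=512)[0]["translation_text"] for s in sentences
    ]
    return " ".join(translated)


def translate_item(item: dict) -> dict:
    # Translate only the texts; the human-annotated consistency scores are
    # carried over unchanged.
    return {
        **item,
        "source": translate_text(item["source"]),
        "system_output": translate_text(item["system_output"]),
    }


# English original from the UniEval repo (hypothetical local path)
with open("data/qags_xsum.json") as f:
    english_data = json.load(f)

german_data = [translate_item(item) for item in english_data]

# File name expected by the notebook
with open("data/qags_xsum-de.json", "w") as f:
    json.dump(german_data, f, ensure_ascii=False)
```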