From 45c9bcbd1eed87ce5405ffff8ba6c626d70f9bce Mon Sep 17 00:00:00 2001 From: ikka Date: Tue, 10 Dec 2024 21:07:46 +0530 Subject: [PATCH] docs: demo related fixes (#1750) Co-authored-by: jjmachan --- docs/_static/edited_chain_runs.json | 4 +- docs/howtos/customizations/index.md | 2 +- .../metrics/train_your_own_metric.ipynb | 364 ------------------ ...own_metric.md => train_your_own_metric.md} | 35 +- mkdocs.yml | 2 +- src/ragas/optimizers/genetic.py | 6 +- 6 files changed, 31 insertions(+), 382 deletions(-) delete mode 100644 docs/howtos/customizations/metrics/train_your_own_metric.ipynb rename docs/howtos/customizations/metrics/{_train_your_own_metric.md => train_your_own_metric.md} (70%) diff --git a/docs/_static/edited_chain_runs.json b/docs/_static/edited_chain_runs.json index 78c5925fa..fd0eb9e58 100644 --- a/docs/_static/edited_chain_runs.json +++ b/docs/_static/edited_chain_runs.json @@ -353,7 +353,7 @@ "response": "General Relativity is a big idea from Albert Einstein in 1915 that changed how we think about gravity. Instead of just pulling things down, gravity is like a big invisible blanket that bends around heavy stuff like stars and planets. This helps us understand cool things like black holes, ripples in space called gravitational waves, and how the universe is getting smaller. Scientists have tested it a lot, like seeing how light bends around big things in space and how planets move just right. It also helps us learn about the Big Bang and how the universe is put together!", "reference": "General Relativity, formulated by Albert Einstein in 1915, revolutionized our understanding of gravity by describing it as the curvature of spacetime caused by mass and energy. This theory has significant implications in modern physics, providing the framework for understanding phenomena such as black holes, gravitational waves, and the expansion of the universe. It has been confirmed by numerous experiments and observations, including the bending of light around massive objects and the precise orbit of planets. General Relativity also plays a crucial role in the field of cosmology, influencing our understanding of the Big Bang and the overall structure of the universe." }, - "metric_output": 1, + "metric_output": 0, "prompts": { "single_turn_aspect_critic_prompt": { "prompt_input": { @@ -374,7 +374,7 @@ } } }, - "is_accepted": false + "is_accepted": true }, { "metric_input": { diff --git a/docs/howtos/customizations/index.md b/docs/howtos/customizations/index.md index 5e71b59cc..65f1d18c1 100644 --- a/docs/howtos/customizations/index.md +++ b/docs/howtos/customizations/index.md @@ -12,7 +12,7 @@ How to customize various aspects of Ragas to suit your needs. 
- [Write your own metrics](./metrics/_write_your_own_metric.md) - [Adapt metrics to target language](./metrics/_metrics_language_adaptation.md) - [Trace evaluations with Observability tools](metrics/tracing.md) -- [Train and align metric](./metrics/_train_your_own_metric.md) +- [Train and align metric](./metrics/train_your_own_metric.md) ## Testset Generation diff --git a/docs/howtos/customizations/metrics/train_your_own_metric.ipynb b/docs/howtos/customizations/metrics/train_your_own_metric.ipynb deleted file mode 100644 index 8eeebdd83..000000000 --- a/docs/howtos/customizations/metrics/train_your_own_metric.ipynb +++ /dev/null @@ -1,364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ff2c00f7", - "metadata": { - "notebookRunGroups": { - "groupValue": "2" - } - }, - "source": [ - "## Train your own metric\n", - "\n", - "LLM as judge metric often makes mistakes and lack alignment with human evaluators. This makes them risky to use as their results cannot be trusted fully. Now, you can fix this using ragas. This simple tutorial notebook showcasing how to train and align any LLM as judge metric using ragas. One can use this to train any LLM based metric in ragas. \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "53d79c6c-5316-41ce-84d1-61a7a3d4a320", - "metadata": {}, - "source": [ - "\n", - "### Import required modules" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b0f9dd63-91c7-4ba4-a21f-7dbb54ce2414", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.20) or chardet (5.2.0)/charset_normalizer (None) doesn't match a supported version!\n", - " warnings.warn(\"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported \"\n", - "/opt/homebrew/Caskroom/miniforge/base/envs/ragas/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import os\n", - "from datasets import load_dataset\n", - "from ragas import evaluate, EvaluationDataset\n", - "from ragas.metrics import AspectCritic\n" - ] - }, - { - "cell_type": "markdown", - "id": "d1d03eb0", - "metadata": {}, - "source": [ - "Now, sign up for a free account at [app.ragas](https://app.ragas.io) and get your API key.\n", - "Navigate to App tokens -> Create new token. Copy the key and paste it in the below code. Store it safely." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d83b4505-68f4-4a07-8f79-73da8d4ef3f7", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['RAGAS_APP_TOKEN'] = 'your_app_token'" - ] - }, - { - "cell_type": "markdown", - "id": "3d82272d-eb13-483c-ba06-db4c0a73f3fd", - "metadata": {}, - "source": [ - "### Setup the models used for evaluation and training\n", - "You may choose any LLM model for training and evaluation. 
Here's [how to do it](../customize_models.md)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2461866d-e8c6-4cbf-b1f2-20079069c1ff", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.llms import LangchainLLMWrapper\n", - "from ragas.embeddings import LangchainEmbeddingsWrapper\n", - "from langchain_openai import ChatOpenAI\n", - "from langchain_openai import OpenAIEmbeddings\n", - "llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o-mini\"))\n", - "embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())" - ] - }, - { - "cell_type": "markdown", - "id": "6ebeb666", - "metadata": {}, - "source": [ - "### Load sample evaluation dataset\n", - "Here, we are loading the sample dataset for evaluation. You can replace it with your own dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a9dcab3f-325a-4b55-a662-759db6a2a9f1", - "metadata": {}, - "outputs": [], - "source": [ - "dataset = load_dataset(\"explodinggradients/ELI5\",split=\"test\")\n", - "eval_dataset = EvaluationDataset.from_hf_dataset(dataset)" - ] - }, - { - "cell_type": "markdown", - "id": "64a83029", - "metadata": {}, - "source": [ - "### Setup the Metric\n", - "You may use any LLM based metric. For simplicity, I am using aspect critic metric and setting it up so that it can compare the response with the reference." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "43df046e-07de-4967-a232-54578fb3f880", - "metadata": {}, - "outputs": [], - "source": [ - "critic = AspectCritic(name=\"answer_correctness\",definition=\"Given the user_input, reference and response. Is the response correct compared with the reference\",llm=llm)\n" - ] - }, - { - "cell_type": "markdown", - "id": "27caa5a5", - "metadata": {}, - "source": [ - "### Evaluate and Upload the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6964abe-0360-4249-946a-5c388a166758", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 100%|██████████████████████████████████████████████████████████| 5/5 [00:01<00:00, 3.49it/s]\n" - ] - } - ], - "source": [ - "results = evaluate(eval_dataset,metrics=[critic])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "ea52de01-b639-4011-8def-6321b54c2bf3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/a6baf6ff-027f-4097-89e3-e11c70b8cf61\n" - ] - }, - { - "data": { - "text/plain": [ - "'https://app.ragas.io/dashboard/alignment/evaluation/a6baf6ff-027f-4097-89e3-e11c70b8cf61'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results.upload()" - ] - }, - { - "cell_type": "markdown", - "id": "99767b85-08cd-45d7-b81b-5ab2c6aeffd3", - "metadata": {}, - "source": [ - "### Review and annotate some results\n", - "You may now view and annotate the evaluation results in app.ragas. These annotations will be used to train the metric. Please make sure to annotate at least 15-20 examples for good results." - ] - }, - { - "cell_type": "markdown", - "id": "328a2463-08f6-4639-b63f-2e27966afd12", - "metadata": {}, - "source": [ - "### Train the metric\n", - "Download the annotated samples from app.ragas using `Download annotated json` button. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "5fb47d6f-4514-4dc3-89ca-0be074f58397", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.config import InstructionConfig,DemonstrationConfig\n", - "demo_config = DemonstrationConfig(embedding = embeddings)\n", - "inst_config = InstructionConfig(llm=llm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2df934d-5701-4c64-972f-2455d3d915d7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Feedback Mutation Step 2/4: 19%|███████▍ | 28/146 [00:03<00:10, 11.23it/s]No samples found for the feedback generation.\n", - "No feedbacks found for the prompt single_turn_aspect_critic_prompt. Returning the original prompt.\n", - "Feedback Mutation Step 2/4: 23%|████████▊ | 33/146 [00:06<00:26, 4.33it/s]No samples found for the feedback generation.\n", - "No feedbacks found for the prompt single_turn_aspect_critic_prompt. Returning the original prompt.\n", - "Feedback Mutation Step 2/4: 24%|█████████▎ | 35/146 [00:06<00:22, 4.97it/s]Error in LangChainTracer.on_chain_end callback: TracerException('No indexed run ID 2046bdfe-27cc-4ce2-b999-3e8fc674969c.')\n", - "Fitness Evaluation Step 4/4: 100%|█████████████████████████████████████| 146/146 [00:24<00:00, 6.03it/s]\n", - "Few-shot examples [single_turn_aspect_critic_prompt]: 100%|██████████████| 18/18 [00:09<00:00, 1.82it/s]\n" - ] - } - ], - "source": [ - "critic.train(path=\"edited_chain_runs.json\",demonstration_config=demo_config,instruction_config=inst_config)" - ] - }, - { - "cell_type": "markdown", - "id": "7ea9e387-e413-42c2-a188-bce106a7d526", - "metadata": {}, - "source": [ - "### Inspect\n", - "Now, let's do some analysis on the trained metric.\n", - "\n", - "First, let's take a look at new instructions that was obtained for the metric after training." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "74574d75-dc1d-4319-99db-ea7ac284b098", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Evaluate the provided user responses against the reference information for accuracy and completeness. Assign a verdict of 1 if the response is accurate and aligns well with the reference, or 0 if it contains inaccuracies or misrepresentations.'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "critic.get_prompts()['single_turn_aspect_critic_prompt'].instruction" - ] - }, - { - "cell_type": "markdown", - "id": "3d710de9-9cb2-4517-a816-11d52e41c35e", - "metadata": {}, - "source": [ - "#### Re-evaluate\n", - "Let's evaluate again and see if the metric has improved for any un-annotated examples." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6beed95-63d6-4df5-afe4-09c4a1c5f1e4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating: 100%|████████████████████████████████████████████████████████| 50/50 [00:28<00:00, 1.78it/s]\n" - ] - } - ], - "source": [ - "results = evaluate(eval_dataset,metrics=[critic])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8d125209-fe42-4299-b878-fefc0d837247", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation results uploaded! 
View at https://app.ragas.io/dashboard/alignment/evaluation/687e7cdf-ff31-4c15-9780-c179207c929c\n" - ] - }, - { - "data": { - "text/plain": [ - "'https://app.ragas.io/dashboard/alignment/evaluation/687e7cdf-ff31-4c15-9780-c179207c929c'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results.upload()" - ] - }, - { - "cell_type": "markdown", - "id": "8fc991be", - "metadata": {}, - "source": [ - "Here in my case, the metric has improved significantly. You can see the difference in the scores. To show the difference, let's compares the scores and changed reasoning for one specific example before and after training." - ] - }, - { - "cell_type": "markdown", - "id": "e09b041f", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ragas", - "language": "python", - "name": "ragas" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.20" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/howtos/customizations/metrics/_train_your_own_metric.md b/docs/howtos/customizations/metrics/train_your_own_metric.md similarity index 70% rename from docs/howtos/customizations/metrics/_train_your_own_metric.md rename to docs/howtos/customizations/metrics/train_your_own_metric.md index 27957bf43..6bdc16608 100644 --- a/docs/howtos/customizations/metrics/_train_your_own_metric.md +++ b/docs/howtos/customizations/metrics/train_your_own_metric.md @@ -1,5 +1,7 @@ # Train and Align your own Metric +[Open notebook in colab](https://colab.research.google.com/drive/16RIHEAJ0Ded3RuPoMq5498vBuhvPIruv?usp=sharing) + LLM as judge metric often makes mistakes and lack alignment with human evaluators. This makes them risky to use as their results cannot be trusted fully. Now, you can fix this using ragas. This simple tutorial notebook showcasing how to train and align any LLM as judge metric using ragas. One can use this to train any LLM based metric in ragas. @@ -36,20 +38,29 @@ Here, we are loading the sample dataset for evaluation. You can replace it with ```python dataset = load_dataset("explodinggradients/ELI5",split="test") eval_dataset = EvaluationDataset.from_hf_dataset(dataset) -print(dataset[10]) +print(eval_dataset[10].to_string()) ``` ``` -{ - "user_input": "What is the Theory of Cosmic Inflation and how does it explain the early universe?", - "reference": "The Theory of Cosmic Inflation proposes that the universe underwent an exponential expansion in its earliest moments, just after the Big Bang. This rapid expansion helps to explain several observed phenomena in cosmology, such as the uniformity of the cosmic microwave background radiation and the large-scale structure of the universe. Inflation theory suggests that tiny quantum fluctuations were stretched to macroscopic scales, seeding the formation of galaxies and other cosmic structures. It also addresses the horizon and flatness problems, providing a more comprehensive understanding of the universe's initial conditions.", - "response": "The Theory of Cosmic Inflation is like saying the universe blew up like a giant balloon really fast right after it was born. This helps us understand why the universe looks the same everywhere we look and how galaxies and stars started to form. 
It also helps answer some big questions about why the universe is so flat and even."
-}
+user_input:
+    What is the Theory of Glacial Cycles and how does it explain the occurrence of ice ages?
+
+response:
+    The Theory of Glacial Cycles, or Milankovitch Cycles, is like a big clock for Earth's ice ages. It says that the way Earth moves around the sun changes over a long time, like how it tilts and wobbles. These changes make the sunlight hit Earth differently, which can make it colder or warmer. Over thousands of years, this can make big ice sheets grow or melt. It's like a chain reaction where these changes also affect things like greenhouse gases and how shiny Earth is, making the ice ages come and go.
+
+reference:
+    The Theory of Glacial Cycles, also known as the Milankovitch Cycles, explains the occurrence of ice ages through variations in Earth's orbit and axial tilt. These cycles include changes in eccentricity, axial tilt, and precession, which affect the distribution and intensity of sunlight received by Earth. Over tens of thousands of years, these variations lead to significant climate changes, including the advance and retreat of ice sheets. The theory suggests that these orbital changes trigger feedback mechanisms, such as changes in greenhouse gas concentrations and albedo, amplifying the climatic effects and leading to glacial and interglacial periods.
 ```
 
-The dataset contains user input, reference and response. The metric will evaluate the response based on the reference. The response here is in ELI5 format, which is a simple way of explaining complex topics. This is a good example to align the metric with human evaluators as in this situation the human evaluator will consider the response as correct if the ELI5 response is accurate and complete compared to the reference and incorrect if the response contains any factual inaccuracy.
+The dataset contains user input, reference and response. Our goal is to evaluate the response based on the reference. The response here is in ELI5 format, which is a simple way of explaining complex topics.
+
+In this particular application, we need to align our evaluation metric so that it judges the correctness of the response against the reference.
+
+- By default, an LLM judge may regard the response as incorrect simply because it is not written in the same style as the reference, even when the response is factually accurate, as in this example.
+- At the same time, we need it to identify instances where the response makes factual errors or misrepresents the reference.
+
 
 ### Setup the Metric
 You may use any LLM based metric. For simplicity, I am using the aspect critic metric and setting it up so that it can compare the response with the reference.
@@ -74,11 +85,11 @@ results = evaluate(eval_dataset,metrics=[critic])
 
 ## Review and Annotate
 
-Now you have the evaluation results. Now it's time to review the evaluations and give feedback to the metric. This feedback will be used to train the metric. For this you can use [app.ragas](https://app.ragas.io) or any other annotation tool like prodigy, label studio etc.
+Now that you have the evaluation results, it's time to review the evaluations and give feedback to the metric; this feedback will be used to train the metric. For this you can use [app.ragas.io](https://app.ragas.io) or any other annotation tool like Prodigy, Label Studio, etc.
 
-If you're using app.ragas,
+If you're using app.ragas.io,
 
-- Go to [app.ragas](https://app.ragas.io) and login.
+- Go to [app.ragas.io](https://app.ragas.io) and log in.
- Then go to [App tokens](https://app.ragas.io/dashboard/settings/app-tokens) and create a new app token. - Set the token in the environment variable @@ -98,7 +109,7 @@ Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/ev ### Review and annotate some results -You may now view and annotate the evaluation results in app.ragas. These annotations will be used to train the metric. Please make sure to annotate at least 15-20 examples for good results. +You may now view and annotate the evaluation results in app.ragas.io. These annotations will be used to train the metric. Please make sure to annotate at least 15-20 examples for good results.
 
 ![Annotating](../../../_static/imgs/annotation.png){width="600"}
 
 Here is a sample annotation for the above example. You can [download](../../../_static/edited_chain_runs.json) and use it.
 
 ## Training and Alignment
 ### Train the metric
-Download the annotated samples from app.ragas using `Download annotated json` button.
+Download the annotated samples from app.ragas.io using the `Download annotated json` button.
 Instruction and demonstration configurations are required; they tell Ragas how to optimize the instruction and the few-shot demonstrations, respectively. You can customize these configurations as per your requirements.
 
 ```python
diff --git a/mkdocs.yml b/mkdocs.yml
index 88c919b1d..22f35c55e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -82,7 +82,7 @@ nav:
           - Adapt Metrics to Languages: howtos/customizations/metrics/_metrics_language_adaptation.md
           - Write your own Metrics: howtos/customizations/metrics/_write_your_own_metric.md
           - Write your own Metrics - (advanced): howtos/customizations/metrics/_write_your_own_metric_advanced.md
-          - Train and Align Metrics: howtos/customizations/metrics/_train_your_own_metric.md
+          - Train and Align Metrics: howtos/customizations/metrics/train_your_own_metric.md
   - Testset Generation:
           - Non-English Testset Generation: howtos/customizations/testgenerator/_language_adaptation.md
           - Persona Generation: howtos/customizations/testgenerator/_persona_generator.md
diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py
index 65ddc7f3c..eb0327b27 100644
--- a/src/ragas/optimizers/genetic.py
+++ b/src/ragas/optimizers/genetic.py
@@ -106,7 +106,7 @@ class FeedbackMutationPrompt(
 ):
     name: str = "feedback_mutation"
     instruction: str = (
-        "You're an expert reviewer. Given an instruction and a set of (input containing (user_input, response, reference, etc), output, expected_output) examples, give maximum 3 feedbacks on how the instruction can be improved to correct the mistakes in incorrect outputs and reach expected output."
+        "You're an expert reviewer. You are given an instruction and a set of (input containing (user_input, response, reference, etc.), output, expected_output) examples. After analyzing the examples, give at most 3 concrete suggestions on how the instruction can be modified so that the model arrives at the expected output. "
        "Do not provide the feedback to add examples with the instruction."
     )
     input_model = FeedbackMutationInput
@@ -160,7 +160,7 @@ def optimize(
 
         population_size = config.get("population_size", 3)
         num_demonstrations = config.get("num_demonstrations", 3)
-        sample_size = config.get("sample_size", 10)
+        sample_size = config.get("sample_size", 12)
 
         # new group for optimization
         optimization_generation_rm, optimization_generation_grp = new_group(
@@ -518,6 +518,8 @@ def dict_to_str(dict: t.Dict[str, t.Any]) -> str:
                         exclude_none=True
                     ),
                     expected_output=dataset[idx]["prompts"][prompt_name][
+                        "edited_output"
+                    ] or dataset[idx]["prompts"][prompt_name][
                         "prompt_output"
                     ],
                 )
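For readers of the `genetic.py` hunk above: the `expected_output` change makes the optimizer prefer the annotator's `edited_output` and fall back to the model's original `prompt_output` via Python's `or`. A minimal sketch of that fallback pattern, using a hypothetical annotated-run dict (the real code indexes `dataset[idx]["prompts"][prompt_name]` the same way):

```python
# Hypothetical annotated run in the shape of edited_chain_runs.json;
# only the keys used by the fallback are shown.
run = {
    "prompts": {
        "single_turn_aspect_critic_prompt": {
            "prompt_output": {"reason": "original judge reasoning", "verdict": 1},
            "edited_output": {"reason": "human-corrected reasoning", "verdict": 0},
        }
    }
}

prompt = run["prompts"]["single_turn_aspect_critic_prompt"]

# Prefer the human-edited output when the annotator supplied one; `or`
# falls back whenever edited_output is None (or otherwise falsy).
expected_output = prompt["edited_output"] or prompt["prompt_output"]
print(expected_output["verdict"])  # prints 0, the human-corrected verdict
```

Note that plain indexing raises `KeyError` if `edited_output` is absent, so this pattern assumes the exported JSON always carries the key (possibly as `null`); `prompt.get("edited_output")` would be the defensive variant.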
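The how-to ends by re-evaluating with the trained metric, but the notebook's final "compare before and after" cell was left empty. A sketch of one way to do that comparison, assuming `results_before` and `results_after` are the objects returned by the two `evaluate(eval_dataset, metrics=[critic])` calls (`to_pandas()` is the standard Ragas result export; the score column is named after the metric, here `answer_correctness`):

```python
import pandas as pd

# results_before / results_after come from evaluate() runs made before
# and after critic.train(...); to_pandas() yields one row per sample.
df_before = results_before.to_pandas()
df_after = results_after.to_pandas()

comparison = pd.DataFrame(
    {
        "user_input": df_before["user_input"],
        "before": df_before["answer_correctness"],
        "after": df_after["answer_correctness"],
    }
)

# Samples where the aligned judge changed its verdict are the ones worth
# re-reading against the human annotations.
print(comparison[comparison["before"] != comparison["after"]])
```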