From 9444617f598de1a53b11877e3f500d5e72625233 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Thu, 20 Jul 2023 17:11:41 +0530
Subject: [PATCH] docs: notebook guide for custom llm (#68)

---
 docs/guides/llms.ipynb | 202 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 docs/guides/llms.ipynb

diff --git a/docs/guides/llms.ipynb b/docs/guides/llms.ipynb
new file mode 100644
index 000000000..fcca27d0e
--- /dev/null
+++ b/docs/guides/llms.ipynb
@@ -0,0 +1,202 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0174eb96",
+ "metadata": {},
+ "source": [
+ "# Bring your own LLMs\n",
+ "\n",
+ "Ragas uses langchain under the hood for connecting to LLMs for the metrics that require them. This means you can swap out the default LLM we use (`gpt-3.5-turbo-16k`) for any of the 100s of APIs supported out of the box by langchain.\n",
+ "\n",
+ "- [Completion LLMs Supported](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.llms)\n",
+ "- [Chat-based LLMs Supported](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.chat_models)\n",
+ "\n",
+ "This guide will show you how to use another LLM or LLM API for evaluation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55f0f9b9",
+ "metadata": {},
+ "source": [
+ "## Evaluating with GPT4\n",
+ "\n",
+ "Ragas uses gpt-3.5 by default, but using gpt-4 for evaluation can improve the results, so let's use it for the `Faithfulness` metric.\n",
+ "\n",
+ "To start off, we initialise the gpt-4 `chat_model` from langchain."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a6d96660",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# make sure you have your OpenAI API key ready\n",
+ "import os\n",
+ "\n",
+ "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6906a4d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.chat_models import ChatOpenAI\n",
+ "\n",
+ "gpt4 = ChatOpenAI(model_name=\"gpt-4\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1fdb48b",
+ "metadata": {},
+ "source": [
+ "Now initialise `Faithfulness` with `gpt4`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "307321ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from ragas.metrics import Faithfulness\n",
+ "\n",
+ "faithfulness_gpt4 = Faithfulness(\n",
+ "    name=\"faithfulness_gpt4\", llm=gpt4, batch_size=3\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1930dd49",
+ "metadata": {},
+ "source": [
+ "That's it!\n",
+ "\n",
+ "Now let's run the evaluations using the example from [quickstart](../quickstart.ipynb)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "62c0eadb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c55f09ffe1094e6190c255c09c0eb141",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/1 [00:00