From ea9f11910ecde61afa1930087df1b12a087b4b64 Mon Sep 17 00:00:00 2001
From: Gustavo Cid Ornelas <gustavocidornelas@gmail.com>
Date: Wed, 25 Sep 2024 10:16:46 -0300
Subject: [PATCH 1/2] chore: show how to log context in RAG notebook example

---
 examples/tracing/rag/rag_tracing.ipynb | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/examples/tracing/rag/rag_tracing.ipynb b/examples/tracing/rag/rag_tracing.ipynb
index febf6710..a6bf01b2 100644
--- a/examples/tracing/rag/rag_tracing.ipynb
+++ b/examples/tracing/rag/rag_tracing.ipynb
@@ -19,7 +19,6 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import openai\n",
     "\n",
     "# OpenAI env variables\n",
     "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY_HERE\"\n",
@@ -58,13 +57,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import random\n",
-    "import time\n",
+    "from typing import List\n",
     "\n",
     "import numpy as np\n",
     "from openai import OpenAI\n",
-    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "\n",
     "from openlayer.lib import trace, trace_openai"
    ]
@@ -93,13 +91,13 @@
     "\n",
     "        Answers to a user query with the LLM.\n",
     "        \"\"\"\n",
-    "        context = self.retrieve_context(user_query)\n",
+    "        context = self.retrieve_contexts(user_query)\n",
     "        prompt = self.inject_prompt(user_query, context)\n",
     "        answer = self.generate_answer_with_gpt(prompt)\n",
     "        return answer\n",
     "\n",
     "    @trace()\n",
-    "    def retrieve_context(self, query: str) -> str:\n",
+    "    def retrieve_contexts(self, query: str) -> List[str]:\n",
     "        \"\"\"Context retriever.\n",
     "\n",
     "        Given the query, returns the most similar context (using TFIDF).\n",
@@ -107,17 +105,21 @@
     "        query_vector = self.vectorizer.transform([query])\n",
     "        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()\n",
     "        most_relevant_idx = np.argmax(cosine_similarities)\n",
-    "        return self.context_sections[most_relevant_idx]\n",
+    "        contexts = [self.context_sections[most_relevant_idx]]\n",
+    "        return contexts\n",
     "\n",
-    "    @trace()\n",
-    "    def inject_prompt(self, query: str, context: str):\n",
+    "    # Set `context_kwarg` to the name of the argument that carries the retrieved context\n",
+    "    # to unlock RAG metrics that evaluate the performance of the context retriever. The\n",
+    "    # value of that argument should be a list of strings.\n",
+    "    @trace(context_kwarg=\"contexts\")\n",
+    "    def inject_prompt(self, query: str, contexts: List[str]) -> List[dict]:\n",
     "        \"\"\"Combines the query with the context and returns\n",
     "        the prompt (formatted to conform with OpenAI models).\"\"\"\n",
     "        return [\n",
     "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
     "            {\n",
     "                \"role\": \"user\",\n",
-    "                \"content\": f\"Answer the user query using only the following context: {context}. \\nUser query: {query}\",\n",
+    "                \"content\": f\"Answer the user query using only the following context: {contexts[0]}. \\nUser query: {query}\",\n",
     "            },\n",
     "        ]\n",
     "\n",
@@ -172,7 +174,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f960a36f-3438-4c81-8cdb-ca078aa509cd",
+   "id": "a45d5562",
    "metadata": {},
    "outputs": [],
    "source": []

From c431a9de350edbf936af62d0e672f99beecb0165 Mon Sep 17 00:00:00 2001
From: Gustavo Cid Ornelas <gustavocidornelas@gmail.com>
Date: Wed, 25 Sep 2024 10:38:37 -0300
Subject: [PATCH 2/2] fix: make sure that context logging works in development
 mode

---
 src/openlayer/lib/core/base_model.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/openlayer/lib/core/base_model.py b/src/openlayer/lib/core/base_model.py
index a105e0bf..306526ff 100644
--- a/src/openlayer/lib/core/base_model.py
+++ b/src/openlayer/lib/core/base_model.py
@@ -42,9 +42,7 @@ class OpenlayerModel(abc.ABC):
     def run_from_cli(self) -> None:
         """Run the model from the command line."""
         parser = argparse.ArgumentParser(description="Run data through a model.")
-        parser.add_argument(
-            "--dataset-path", type=str, required=True, help="Path to the dataset"
-        )
+        parser.add_argument("--dataset-path", type=str, required=True, help="Path to the dataset")
         parser.add_argument(
             "--output-dir",
             type=str,
@@ -87,9 +85,7 @@ def run_batch_from_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
             # Filter row_dict to only include keys that are valid parameters
             # for the 'run' method
             row_dict = row.to_dict()
-            filtered_kwargs = {
-                k: v for k, v in row_dict.items() if k in run_signature.parameters
-            }
+            filtered_kwargs = {k: v for k, v in row_dict.items() if k in run_signature.parameters}
 
             # Call the run method with filtered kwargs
             output = self.run(**filtered_kwargs)
@@ -111,6 +107,8 @@ def run_batch_from_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
                     df.at[index, "cost"] = processed_trace["cost"]
                 if "tokens" in processed_trace:
                     df.at[index, "tokens"] = processed_trace["tokens"]
+                if "context" in processed_trace:
+                    df.at[index, "context"] = processed_trace["context"]
 
         config = {
             "outputColumnName": "output",
@@ -126,6 +124,8 @@ def run_batch_from_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
             config["costColumnName"] = "cost"
         if "tokens" in df.columns:
             config["numOfTokenColumnName"] = "tokens"
+        if "context" in df.columns:
+            config["contextColumnName"] = "context"
 
         return df, config