python 3.12, add retries for evals
morganmcg1 committed Jan 1, 2025
1 parent b0893be commit a3a1fdd
Showing 6 changed files with 85 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .replit
@@ -1,5 +1,5 @@
entrypoint = "main.py"
modules = ["python-3.11"]
modules = ["python-3.12"]

[nix]
channel = "stable-24_05"
23 changes: 21 additions & 2 deletions README.md
@@ -67,7 +67,7 @@ Japanese

## Installation

The project is built with Python version `3.11` and utilizes `uv` for dependency management. Follow the steps below to install the necessary dependencies:
The project is built with Python version `3.12` and utilizes `uv` for dependency management. Follow the steps below to install the necessary dependencies:

```bash
bash build.sh
@@ -154,6 +154,18 @@ set +o allexport
```

**Launch the wandbot app**
You can use either `uvicorn` or `gunicorn` to launch N workers so that eval requests are served in parallel. Note that Weave evaluations also limit the number of parallel calls made, set via the `WEAVE_PARALLELISM` env variable, which is set further down in `eval.py` using the `n_weave_parallelism` flag.

`uvicorn`
```bash
uvicorn wandbot.api.app:app \
--host="0.0.0.0" --port=8000 \
--workers 8 \
--timeout-keep-alive 75 \
--loop uvloop \
--http httptools \
--log-level debug
```

Launch wandbot with 8 workers for faster evaluation. The `WANDBOT_EVALUATION` env var triggers the full wandbot app initialization.
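
The `WEAVE_PARALLELISM` limit mentioned above is set inside `eval.py` from the `n_weave_parallelism` flag. The snippet below is a minimal sketch of how such a flag can be mapped to the env variable; it is an illustration, not the repository's actual wiring, and the default value is made up.

```python
# Sketch: translate an --n_weave_parallelism CLI flag into the WEAVE_PARALLELISM
# env var that Weave reads to cap concurrent evaluation calls.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--n_weave_parallelism", type=int, default=8)  # illustrative default
args = parser.parse_args()

# Must be set before the Weave evaluation starts so the limit takes effect.
os.environ["WEAVE_PARALLELISM"] = str(args.n_weave_parallelism)
```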

@@ -176,9 +188,16 @@ curl -X POST \
-d '{"question": "How do I log a W&B artifact?"}'
```
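
The same query can be sent from Python with `httpx` (a sketch, assuming the app is running locally on port 8000 and exposes the `/chat/query` endpoint used by `eval.py`):

```python
# Python equivalent of the curl request above, using httpx.
import httpx

response = httpx.post(
    "http://0.0.0.0:8000/chat/query",
    json={"question": "How do I log a W&B artifact?"},
    timeout=900.0,  # generous timeout; wandbot responses can take a while
)
response.raise_for_status()
print(response.json())
```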

**Debugging**
For debugging purposes during evaluation, you can run a single instance of the app using this command:
```bash
uvicorn wandbot.api.app:app --host="0.0.0.0" --port=8000 \
--workers 1 --timeout-keep-alive 75 --loop uvloop --http httptools --log-level debug
```

**Run the evaluation**

Launch W&B Weave evaluation in the root `wandbot` directory. Ensure that your virtual environment is active. By default, a sample will be evaluated 3 times in order to account for both the stochasticity of wandbot and our LLM judge. For debugging, pass the `--debug` flag to only evaluate on a small number of samples
Launch W&B Weave evaluation in the root `wandbot` directory. Ensure that your virtual environment is active. By default, a sample will be evaluated 3 times in order to account for both the stochasticity of wandbot and our LLM judge. For debugging, pass the `--debug` flag to only evaluate on a small number of samples. To adjust the number of parallel evaluation calls Weave makes, use the `--n_weave_parallelism` flag when calling `eval.py`.

```bash
source wandbot_venv/bin/activate
25 changes: 13 additions & 12 deletions build.sh
@@ -3,10 +3,11 @@ set -x # Enable command echo
set -e # Exit on error

# Debug disk usage
# du -sh .
# du -ah . | sort -rh | head -n 20
# current_dir_usage=$(du -sm . | awk '{print $1}')
# echo "Current directory usage: ${current_dir_usage}M"
du -sh .
top_usage=$(du -ah . | sort -rh | head -n 20)
current_dir_usage=$(du -sm . | awk '{print $1}')
echo -e "Current directory usage: ${current_dir_usage}M"
echo -e "Top files/dirs usage: ${top_usage}\n"

# Find libstdc++ to use
for dir in /nix/store/*-gcc-*/lib64 /nix/store/*-stdenv-*/lib /nix/store/*-libstdc++*/lib; do
@@ -20,7 +21,7 @@ done

# Create virtualenv & set up
rm -rf .venv
python3.11 -m venv wandbot_venv --clear
python3.12 -m venv wandbot_venv --clear
export VIRTUAL_ENV=wandbot_venv
export PATH="$VIRTUAL_ENV/bin:$PATH"
export PYTHONPATH="$(pwd)/src:$PYTHONPATH"
@@ -56,10 +57,10 @@ ls -la $LIBSTDCXX_DIR/libstdc++.so* || true
ldd $VIRTUAL_ENV/lib/python*/site-packages/pandas/_libs/*.so || true

# Debug disk usage
# du -sh .
# du -ah . | sort -rh | head -n 20
# current_disk_usage=$(du -sm . | awk '{print $1}')
# echo "Current disk usage: ${current_disk_usage}M"
# increment=$((current_disk_usage - initial_disk_usage))
# echo "Disk usage increment: ${increment}M"

du -sh .
top_usage=$(du -ah . | sort -rh | head -n 20)
current_disk_usage=$(du -sm . | awk '{print $1}')
echo -e "Current directory usage: ${current_dir_usage}M"
echo -e "Top files/dirs usage: ${top_usage}\n"
increment=$((current_disk_usage - initial_disk_usage))
echo -e "Disk usage increment: ${increment}M\n"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ repository = "https://github.com/wandb/wandbot"
packages = [{include = "wandbot", from = "src"}]

[tool.poetry.dependencies]
python = ">=3.11,<3.13"
python = ">=3.12,<3.14"

[build-system]
requires = ["poetry-core>=1.0.0"]
4 changes: 2 additions & 2 deletions src/wandbot/evaluation/eval/correctness.py
@@ -1,5 +1,5 @@
import asyncio
from typing import Any, Optional, Sequence
from typing import Any, Optional, List

import regex as re
from llama_index.core.evaluation import CorrectnessEvaluator, EvaluationResult
@@ -88,7 +88,7 @@ async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
contexts: Optional[List[str]] = [],
reference: Optional[str] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
60 changes: 47 additions & 13 deletions src/wandbot/evaluation/weave_eval/eval.py
@@ -4,9 +4,11 @@
import weave
import asyncio
import re
import logging
from weave import Evaluation
from weave import Model
from llama_index.llms.openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

from wandbot.utils import get_logger
from wandbot.evaluation.eval.correctness import (
@@ -33,18 +35,46 @@
eval_template=CORRECTNESS_EVAL_TEMPLATE,
)

# @weave.op
# async def get_answer(question: str, application: str = "api-eval", language: str = "en") -> str:
# url = "http://0.0.0.0:8000/chat/query"
# payload = {"question": question, "application": application, "language": language}
# try:
# async with httpx.AsyncClient(timeout=900.0) as client:
# response = await client.post(url, json=payload)
# response.raise_for_status()
# return json.dumps(response.json())
# except Exception as e:
# logger.error(f"Error getting answer: {str(e)}")
# return json.dumps({})

from tenacity import wait_random, after_log

@weave.op
async def get_answer(question: str, application: str = "api-eval", language: str = "en") -> str:
url = "http://0.0.0.0:8000/chat/query"
payload = {"question": question, "application": application, "language": language}
try:
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=1, min=4, max=600) + wait_random(0, 2),
retry=retry_if_exception_type((httpx.HTTPError, httpx.RequestError)),
before_sleep=lambda retry_state: logger.warning(
f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
),
after=after_log(logger, logging.ERROR)
)
async def _make_request():
async with httpx.AsyncClient(timeout=900.0) as client:
response = await client.post(url, json=payload)
response.raise_for_status()
return json.dumps(response.json())
response = await client.post(
"http://0.0.0.0:8000/chat/query",
json={"question": question, "application": application, "language": language}
)
response.raise_for_status()
return response.json()
try:
result = await _make_request()
return json.dumps(result)
except Exception as e:
logger.error(f"Error getting answer: {str(e)}")
return json.dumps({})
logger.error(f"All retry attempts failed, returing an empty dict. Error: {str(e)}")
return json.dumps({})


def parse_text_to_json(text):
@@ -69,7 +99,7 @@ async def get_eval_record(question: str, language: str = "en") -> dict:
return {
"system_prompt": "",
"generated_answer": "",
"retrieved_contexts_individual": [],
"retrieved_contexts": [],
"model": "",
"total_tokens": 0,
"prompt_tokens": 0,
@@ -80,7 +110,7 @@ return {
return {
"system_prompt": response.get("system_prompt", ""),
"generated_answer": response.get("answer", ""),
"retrieved_contexts_individual": parse_text_to_json(
"retrieved_contexts": parse_text_to_json(
response.get("source_documents", "")
),
"model": response.get("model", ""),
@@ -105,13 +135,16 @@ async def get_answer_correctness(
question: str, ground_truth: str, notes: str, model_output: dict
) -> dict:
if config.debug:
print("model_output keys:", model_output.keys())
print("\nmodel_output content:", model_output)
if model_output is not None:
logger.info(f"In get_answer_correctness, model_output keys:\n{model_output.keys()}")
else:
logger.error("model_output is None")
contexts = [c["content"] for c in model_output.get("retrieved_contexts", [])] if model_output.get("retrieved_contexts") else []
result = await correctness_evaluator.aevaluate(
query=question,
response=model_output["generated_answer"],
reference=ground_truth,
contexts=model_output["retrieved_contexts"],
contexts=contexts,
reference_notes=notes,
)
return {"answer_correctness": result.dict()["passing"]}
@@ -157,6 +190,7 @@ def main():
{
"evaluation_strategy_name": config.experiment_name,
"n_samples": len(question_rows),
"n_trials": config.n_trials,
"language": config.lang,
"is_debug": config.debug,
"eval_judge_model": config.eval_judge_model,
