From a3a1fdded8758e65dd3ac125e0eebfb72a405420 Mon Sep 17 00:00:00 2001
From: Morgan McGuire
Date: Wed, 1 Jan 2025 17:14:05 -0500
Subject: [PATCH] python 3.12, add retries for evals

---
 .replit                                    |  2 +-
 README.md                                  | 23 ++++++++-
 build.sh                                   | 25 ++++-----
 pyproject.toml                             |  2 +-
 src/wandbot/evaluation/eval/correctness.py |  4 +-
 src/wandbot/evaluation/weave_eval/eval.py  | 60 +++++++++++++++++-----
 6 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/.replit b/.replit
index 1cb0e33..253e191 100644
--- a/.replit
+++ b/.replit
@@ -1,5 +1,5 @@
 entrypoint = "main.py"
-modules = ["python-3.11"]
+modules = ["python-3.12"]
 
 [nix]
 channel = "stable-24_05"
diff --git a/README.md b/README.md
index bc3d0bc..616cf70 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ Japanese
 
 ## Installation
 
-The project is built with Python version `3.11` and utilizes `uv` for dependency management. Follow the steps below to install the necessary dependencies:
+The project is built with Python version `3.12` and utilizes `uv` for dependency management. Follow the steps below to install the necessary dependencies:
 ```bash
 bash build.sh
 ```
@@ -154,6 +154,18 @@ set +o allexport
 ```
 
 **Launch the wandbot app**
+You can use either `uvicorn` or `gunicorn` to launch N workers so that eval requests can be served in parallel. Note that Weave Evaluations also limit the number of parallel calls made, set via the `WEAVE_PARALLELISM` env variable; this is set further down in `eval.py` using the `n_weave_parallelism` flag.
+
+`uvicorn`
+```bash
+uvicorn wandbot.api.app:app \
+  --host="0.0.0.0" --port=8000 \
+  --workers 8 \
+  --timeout-keep-alive 75 \
+  --loop uvloop \
+  --http httptools \
+  --log-level debug
+```
 
 Launch wandbot with 8 workers for faster evaluation. The `WANDBOT_EVALUATION` env var triggers the full wandbot app initialization.
 
@@ -176,9 +188,16 @@ curl -X POST \
   -d '{"question": "How do I log a W&B artifact?"}'
 ```
 
+**Debugging**
+For debugging purposes during evaluation, you can run a single instance of the app using this command:
+```bash
+uvicorn wandbot.api.app:app --host="0.0.0.0" --port=8000 \
+  --workers 1 --timeout-keep-alive 75 --loop uvloop --http httptools --log-level debug
+```
+
 **Run the evaluation**
 
-Launch W&B Weave evaluation in the root `wandbot` directory. Ensure that you're virtual envionment is active. By default, a sample will be evaluated 3 times in order to account for both the stochasticity of wandbot and our LLM judge. For debugging, pass the `--debug` flag to only evaluate on a small number of samples
+Launch W&B Weave evaluation in the root `wandbot` directory. Ensure that your virtual environment is active. By default, each sample is evaluated 3 times to account for both the stochasticity of wandbot and of our LLM judge. For debugging, pass the `--debug` flag to only evaluate a small number of samples. To adjust the number of parallel evaluation calls Weave makes, use the `--n_weave_parallelism` flag when calling `eval.py`.
 
 ```
 source wandbot_venv/bin/activate
diff --git a/build.sh b/build.sh
index 3fb2b66..21dd4b7 100644
--- a/build.sh
+++ b/build.sh
@@ -3,10 +3,11 @@ set -x # Enable command echo
 set -e # Exit on error
 
 # Debug disk usage
-# du -sh .
-# du -ah . | sort -rh | head -n 20
-# current_dir_usage=$(du -sm . | awk '{print $1}')
-# echo "Current directory usage: ${current_dir_usage}M"
+du -sh .
+top_usage=$(du -ah . | sort -rh | head -n 20)
+current_dir_usage=$(du -sm . | awk '{print $1}')
+echo -e "Current directory usage: ${current_dir_usage}M"
+echo -e "Top files/dirs usage: ${top_usage}\n"
 
 # Find libstdc++ to use
 for dir in /nix/store/*-gcc-*/lib64 /nix/store/*-stdenv-*/lib /nix/store/*-libstdc++*/lib; do
@@ -20,7 +21,7 @@ done
 
 # Create virtualenv & set up
 rm -rf .venv
-python3.11 -m venv wandbot_venv --clear
+python3.12 -m venv wandbot_venv --clear
 export VIRTUAL_ENV=wandbot_venv
 export PATH="$VIRTUAL_ENV/bin:$PATH"
 export PYTHONPATH="$(pwd)/src:$PYTHONPATH"
@@ -56,10 +57,10 @@ ls -la $LIBSTDCXX_DIR/libstdc++.so* || true
 ldd $VIRTUAL_ENV/lib/python*/site-packages/pandas/_libs/*.so || true
 
 # Debug disk usage
-# du -sh .
-# du -ah . | sort -rh | head -n 20
-# current_disk_usage=$(du -sm . | awk '{print $1}')
-# echo "Current disk usage: ${current_disk_usage}M"
-# increment=$((current_disk_usage - initial_disk_usage))
-# echo "Disk usage increment: ${increment}M"
-
+du -sh .
+top_usage=$(du -ah . | sort -rh | head -n 20)
+current_disk_usage=$(du -sm . | awk '{print $1}')
+echo -e "Current disk usage: ${current_disk_usage}M"
+echo -e "Top files/dirs usage: ${top_usage}\n"
+increment=$((current_disk_usage - initial_disk_usage))
+echo -e "Disk usage increment: ${increment}M\n"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index bb0cfb7..e69d36d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ repository = "https://github.com/wandb/wandbot"
 packages = [{include = "wandbot", from = "src"}]
 
 [tool.poetry.dependencies]
-python = ">=3.11,<3.13"
+python = ">=3.12,<3.14"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/src/wandbot/evaluation/eval/correctness.py b/src/wandbot/evaluation/eval/correctness.py
index 627f676..af268fa 100644
--- a/src/wandbot/evaluation/eval/correctness.py
+++ b/src/wandbot/evaluation/eval/correctness.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, Optional, Sequence
+from typing import Any, Optional, List
 
 import regex as re
 from llama_index.core.evaluation import CorrectnessEvaluator, EvaluationResult
@@ -88,7 +88,7 @@ async def aevaluate(
         self,
         query: Optional[str] = None,
         response: Optional[str] = None,
-        contexts: Optional[Sequence[str]] = None,
+        contexts: Optional[List[str]] = [],
         reference: Optional[str] = None,
         sleep_time_in_seconds: int = 0,
         **kwargs: Any,
diff --git a/src/wandbot/evaluation/weave_eval/eval.py b/src/wandbot/evaluation/weave_eval/eval.py
index 5a3a332..debc8a5 100644
--- a/src/wandbot/evaluation/weave_eval/eval.py
+++ b/src/wandbot/evaluation/weave_eval/eval.py
@@ -4,9 +4,11 @@
 import weave
 import asyncio
 import re
+import logging
 from weave import Evaluation
 from weave import Model
 from llama_index.llms.openai import OpenAI
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
 
 from wandbot.utils import get_logger
 from wandbot.evaluation.eval.correctness import (
@@ -33,18 +35,46 @@
     eval_template=CORRECTNESS_EVAL_TEMPLATE,
 )
 
+# @weave.op
+# async def get_answer(question: str, application: str = "api-eval", language: str = "en") -> str:
+#     url = "http://0.0.0.0:8000/chat/query"
+#     payload = {"question": question, "application": application, "language": language}
+#     try:
+#         async with httpx.AsyncClient(timeout=900.0) as client:
+#             response = await client.post(url, json=payload)
+#             response.raise_for_status()
+#             return json.dumps(response.json())
+#     except Exception as e:
+#         logger.error(f"Error getting answer: {str(e)}")
+#         return json.dumps({})
+
+from tenacity import wait_random, after_log
+
 @weave.op
 async def get_answer(question: str, application: str = "api-eval", language: str = "en") -> str:
-    url = "http://0.0.0.0:8000/chat/query"
-    payload = {"question": question, "application": application, "language": language}
-    try:
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=600) + wait_random(0, 2),
+        retry=retry_if_exception_type((httpx.HTTPError, httpx.RequestError)),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+        after=after_log(logger, logging.ERROR)
+    )
+    async def _make_request():
         async with httpx.AsyncClient(timeout=900.0) as client:
-            response = await client.post(url, json=payload)
-            response.raise_for_status()
-            return json.dumps(response.json())
+            response = await client.post(
+                "http://0.0.0.0:8000/chat/query",
+                json={"question": question, "application": application, "language": language}
+            )
+            response.raise_for_status()
+            return response.json()
+    try:
+        result = await _make_request()
+        return json.dumps(result)
     except Exception as e:
-        logger.error(f"Error getting answer: {str(e)}")
-        return json.dumps({})
+        logger.error(f"All retry attempts failed, returning an empty dict. Error: {str(e)}")
+        return json.dumps({})
 
 
 def parse_text_to_json(text):
@@ -69,7 +99,7 @@ async def get_eval_record(question: str, language: str = "en") -> dict:
         return {
             "system_prompt": "",
             "generated_answer": "",
-            "retrieved_contexts_individual": [],
+            "retrieved_contexts": [],
             "model": "",
             "total_tokens": 0,
             "prompt_tokens": 0,
@@ -80,7 +110,7 @@
     return {
         "system_prompt": response.get("system_prompt", ""),
        "generated_answer": response.get("answer", ""),
-        "retrieved_contexts_individual": parse_text_to_json(
+        "retrieved_contexts": parse_text_to_json(
             response.get("source_documents", "")
         ),
         "model": response.get("model", ""),
@@ -105,13 +135,16 @@ async def get_answer_correctness(
     question: str, ground_truth: str, notes: str, model_output: dict
 ) -> dict:
     if config.debug:
-        print("model_output keys:", model_output.keys())
-        print("\nmodel_output content:", model_output)
+        if model_output is not None:
+            logger.info(f"In get_answer_correctness, model_output keys:\n{model_output.keys()}")
+        else:
+            logger.error("model_output is None")
+    contexts = [c["content"] for c in model_output.get("retrieved_contexts", [])] if model_output.get("retrieved_contexts") else []
     result = await correctness_evaluator.aevaluate(
         query=question,
         response=model_output["generated_answer"],
         reference=ground_truth,
-        contexts=model_output["retrieved_contexts"],
+        contexts=contexts,
         reference_notes=notes,
     )
     return {"answer_correctness": result.dict()["passing"]}
@@ -157,6 +190,7 @@ def main():
         {
             "evaluation_strategy_name": config.experiment_name,
             "n_samples": len(question_rows),
+            "n_trials": config.n_trials,
             "language": config.lang,
             "is_debug": config.debug,
             "eval_judge_model": config.eval_judge_model,