From d4dba2e77d9bdfdb71e62ca0b7e4223a505701ef Mon Sep 17 00:00:00 2001 From: Roger Yang <80478925+RogerHYang@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:12:48 -0700 Subject: [PATCH 1/3] ci: fix cron for python 3.9 (#4769) --- .github/workflows/python-cron.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-cron.yml b/.github/workflows/python-cron.yml index 359f21b538..7d0225e653 100644 --- a/.github/workflows/python-cron.yml +++ b/.github/workflows/python-cron.yml @@ -2,7 +2,7 @@ name: Python Cron on: schedule: - - cron: "*/15 15-23 * * *" + - cron: "*/15 15-23 * * 1-5" workflow_dispatch: jobs: @@ -22,13 +22,11 @@ jobs: - 5432:5432 steps: - uses: actions/checkout@v4 - with: - ref: auth - uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: 3.9 - run: pip install tox-uv==1.11.3 - - run: tox run-parallel --parallel-no-spinner -e py39-test-integration_tests + - run: tox run-parallel --parallel-no-spinner -e test-integration_tests - uses: slackapi/slack-github-action@v1 if: failure() env: From 8795c52776deed8b3f275a49f5b4c6676f98db5b Mon Sep 17 00:00:00 2001 From: Dustin Ngo Date: Wed, 25 Sep 2024 22:50:19 -0700 Subject: [PATCH 2/3] EXPERIMENTAL: async entrypoints for experiments --- .../src/phoenix/evals/executors.py | 11 + src/phoenix/experiments/functions.py | 421 +++++++++++++++++- tests/datasets/test_experiments.py | 39 +- 3 files changed, 443 insertions(+), 28 deletions(-) diff --git a/packages/phoenix-evals/src/phoenix/evals/executors.py b/packages/phoenix-evals/src/phoenix/evals/executors.py index 625b4e7e6f..5c0213e812 100644 --- a/packages/phoenix-evals/src/phoenix/evals/executors.py +++ b/packages/phoenix-evals/src/phoenix/evals/executors.py @@ -382,7 +382,18 @@ def get_executor_on_sync_context( max_retries: int = 10, exit_on_error: bool = True, fallback_return_value: Union[Unset, Any] = _unset, + use_async: bool = False, ) -> Executor: + if use_async is True: + return AsyncExecutor( + async_fn, + concurrency=concurrency, + tqdm_bar_format=tqdm_bar_format, + max_retries=max_retries, + exit_on_error=exit_on_error, + fallback_return_value=fallback_return_value, + ) + if threading.current_thread() is not threading.main_thread(): # run evals synchronously if not in the main thread diff --git a/src/phoenix/experiments/functions.py b/src/phoenix/experiments/functions.py index 34ee3d26e9..4cf5ba8bc8 100644 --- a/src/phoenix/experiments/functions.py +++ b/src/phoenix/experiments/functions.py @@ -42,7 +42,7 @@ from typing_extensions import TypeAlias from phoenix.config import get_base_url -from phoenix.evals.executors import get_executor_on_sync_context +from phoenix.evals.executors import AsyncExecutor, get_executor_on_sync_context from phoenix.evals.models.rate_limiters import RateLimiter from phoenix.evals.utils import get_tqdm_progress_bar_formatter from phoenix.experiments.evaluators import create_evaluator @@ -94,6 +94,260 @@ def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]: RateLimitErrors: TypeAlias = Union[Type[BaseException], Sequence[Type[BaseException]]] +async def async_run_experiment( + dataset: Dataset, + task: ExperimentTask, + evaluators: Optional[Evaluators] = None, + *, + experiment_name: Optional[str] = None, + experiment_description: Optional[str] = None, + experiment_metadata: Optional[Mapping[str, Any]] = None, + rate_limit_errors: Optional[RateLimitErrors] = None, + dry_run: Union[bool, int] = False, + print_summary: bool = True, + concurrency: int = 3, +) -> 
RanExperiment: + """ + Runs an experiment using a given set of dataset of examples. + + An experiment is a user-defined task that runs on each example in a dataset. The results from + each experiment can be evaluated using any number of evaluators to measure the behavior of the + task. The experiment and evaluation results are stored in the Phoenix database for comparison + and analysis. + + A `task` is either a synchronous or asynchronous function that returns a JSON serializable + output. If the `task` is a function of one argument then that argument will be bound to the + `input` field of the dataset example. Alternatively, the `task` can be a function of any + combination of specific argument names that will be bound to special values: + + - `input`: The input field of the dataset example + - `expected`: The expected or reference output of the dataset example + - `reference`: An alias for `expected` + - `metadata`: Metadata associated with the dataset example + - `example`: The dataset `Example` object with all associated fields + + An `evaluator` is either a synchronous or asynchronous function that returns either a boolean + or numeric "score". If the `evaluator` is a function of one argument then that argument will be + bound to the `output` of the task. Alternatively, the `evaluator` can be a function of any + combination of specific argument names that will be bound to special values: + + - `input`: The input field of the dataset example + - `output`: The output of the task + - `expected`: The expected or reference output of the dataset example + - `reference`: An alias for `expected` + - `metadata`: Metadata associated with the dataset example + + Phoenix also provides pre-built evaluators in the `phoenix.experiments.evaluators` module. + + Args: + dataset (Dataset): The dataset on which to run the experiment. + task (ExperimentTask): The task to run on each example in the dataset. + evaluators (Optional[Evaluators]): A single evaluator or sequence of evaluators used to + evaluate the results of the experiment. Defaults to None. + experiment_name (Optional[str]): The name of the experiment. Defaults to None. + experiment_description (Optional[str]): A description of the experiment. Defaults to None. + experiment_metadata (Optional[Mapping[str, Any]]): Metadata to associate with the + experiment. Defaults to None. + rate_limit_errors (Optional[BaseException | Sequence[BaseException]]): An exception or + sequence of exceptions to adaptively throttle on. Defaults to None. + dry_run (bool | int): Run the experiment in dry-run mode. When set, experiment results will + not be recorded in Phoenix. If True, the experiment will run on a random dataset + example. If an integer, the experiment will run on a random sample of the dataset + examples of the given size. Defaults to False. + print_summary (bool): Whether to print a summary of the experiment and evaluation results. + Defaults to True. + concurrency (int): Specifies the concurrency for task execution. In order to enable + concurrent task execution, the task callable must be a coroutine function. + Defaults to 3. + + Returns: + RanExperiment: The results of the experiment and evaluation. Additional evaluations can be + added to the experiment using the `evaluate_experiment` function. 
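+
+    Example:
+        A minimal sketch of driving this coroutine from an async context such as
+        a notebook cell or `asyncio.run`. The dataset name and task below are
+        illustrative and assume a running Phoenix instance with that dataset
+        already uploaded::
+
+            import phoenix as px
+            from phoenix.experiments.functions import async_run_experiment
+
+            async def my_task(input):
+                # Replace with a real task; the return value must be JSON
+                # serializable. Synchronous tasks are also accepted.
+                return {"echo": input}
+
+            dataset = px.Client().get_dataset(name="my-dataset")
+            ran_experiment = await async_run_experiment(
+                dataset=dataset,
+                task=my_task,
+                experiment_name="async-demo",
+                concurrency=3,
+            )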
+ """ + task_signature = inspect.signature(task) + _validate_task_signature(task_signature) + + if not dataset.examples: + raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}") + # Add this to the params once supported in the UI + repetitions = 1 + assert repetitions > 0, "Must run the experiment at least once." + evaluators_by_name = _evaluators_by_name(evaluators) + + sync_client, async_client = _phoenix_clients() + + payload = { + "version_id": dataset.version_id, + "name": experiment_name, + "description": experiment_description, + "metadata": experiment_metadata, + "repetitions": repetitions, + } + if not dry_run: + experiment_response = await async_client.post( + f"/v1/datasets/{dataset.id}/experiments", + json=payload, + ) + experiment_response.raise_for_status() + exp_json = experiment_response.json()["data"] + project_name = exp_json["project_name"] + experiment = Experiment( + dataset_id=dataset.id, + dataset_version_id=dataset.version_id, + repetitions=repetitions, + id=exp_json["id"], + project_name=project_name, + ) + else: + experiment = Experiment( + dataset_id=dataset.id, + dataset_version_id=dataset.version_id, + repetitions=repetitions, + id=DRY_RUN, + project_name="", + ) + + tracer, resource = _get_tracer(experiment.project_name) + root_span_name = f"Task: {get_func_name(task)}" + root_span_kind = CHAIN + + print("๐Ÿงช Experiment started.") + if dry_run: + examples = { + (ex := dataset[i]).id: ex + for i in pd.Series(range(len(dataset))) + .sample(min(len(dataset), int(dry_run)), random_state=42) + .sort_values() + } + id_selection = "\n".join(examples) + print(f"๐ŸŒต๏ธ This is a dry-run for these example IDs:\n{id_selection}") + dataset = replace(dataset, examples=examples) + else: + dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id) + experiment_compare_url = get_experiment_url( + dataset_id=dataset.id, + experiment_id=experiment.id, + ) + print(f"๐Ÿ“บ View dataset experiments: {dataset_experiments_url}") + print(f"๐Ÿ”— View this experiment: {experiment_compare_url}") + + async def async_run_experiment(test_case: TestCase) -> ExperimentRun: + example, repetition_number = test_case.example, test_case.repetition_number + output = None + error: Optional[BaseException] = None + status = Status(StatusCode.OK) + with ExitStack() as stack: + span: Span = stack.enter_context( + tracer.start_as_current_span(root_span_name, context=Context()) + ) + stack.enter_context(capture_spans(resource)) + try: + # Do not use keyword arguments, which can fail at runtime + # even when function obeys protocol, because keyword arguments + # are implementation details. 
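+                # Bind the example to the task's parameters positionally, then
+                # await the result when the task returns a coroutine; synchronous
+                # tasks run inline on the event loop.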
+ bound_task_args = _bind_task_signature(task_signature, example) + _output = task(*bound_task_args.args, **bound_task_args.kwargs) + if isinstance(_output, Awaitable): + output = await _output + else: + output = _output + except BaseException as exc: + span.record_exception(exc) + status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}") + error = exc + _print_experiment_error( + exc, + example_id=example.id, + repetition_number=repetition_number, + kind="task", + ) + output = jsonify(output) + span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False)) + span.set_attribute(INPUT_MIME_TYPE, JSON.value) + if output is not None: + if isinstance(output, str): + span.set_attribute(OUTPUT_VALUE, output) + else: + span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False)) + span.set_attribute(OUTPUT_MIME_TYPE, JSON.value) + span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind) + span.set_status(status) + + assert isinstance( + output, (dict, list, str, int, float, bool, type(None)) + ), "Output must be JSON serializable" + exp_run = ExperimentRun( + start_time=_decode_unix_nano(cast(int, span.start_time)), + end_time=_decode_unix_nano(cast(int, span.end_time)), + experiment_id=experiment.id, + dataset_example_id=example.id, + repetition_number=repetition_number, + output=output, + error=repr(error) if error else None, + trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call] + ) + if not dry_run: + # Below is a workaround to avoid timeout errors sometimes + # encountered when the task is a synchronous function that + # blocks for too long. + resp = await async_client.post( + f"/v1/experiments/{experiment.id}/runs", + json=jsonify(exp_run), + ) + resp.raise_for_status() + exp_run = replace(exp_run, id=resp.json()["data"]["id"]) + return exp_run + + _errors: Tuple[Type[BaseException], ...] 
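+    # Normalize rate_limit_errors into a tuple of exception types and wrap the
+    # run coroutine with one adaptive RateLimiter per type; the wrapped coroutine
+    # is then dispatched concurrently by an AsyncExecutor.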
+ if not isinstance(rate_limit_errors, Sequence): + _errors = (rate_limit_errors,) if rate_limit_errors is not None else () + else: + _errors = tuple(filter(None, rate_limit_errors)) + rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors] + + rate_limited_async_run_experiment = functools.reduce( + lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment + ) + + executor = AsyncExecutor( + rate_limited_async_run_experiment, + concurrency=concurrency, + tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"), + max_retries=0, + exit_on_error=False, + fallback_return_value=None, + ) + + test_cases = [ + TestCase(example=deepcopy(ex), repetition_number=rep) + for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1)) + ] + task_runs, _execution_details = await executor.execute(test_cases) + print("โœ… Task runs completed.") + params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions) + task_summary = TaskSummary.from_task_runs(params, task_runs) + ran_experiment: RanExperiment = object.__new__(RanExperiment) + ran_experiment.__init__( # type: ignore[misc] + params=params, + dataset=dataset, + runs={r.id: r for r in task_runs if r is not None}, + task_summary=task_summary, + **_asdict(experiment), + ) + if evaluators_by_name: + return await async_evaluate_experiment( + ran_experiment, + evaluators=evaluators_by_name, + dry_run=dry_run, + print_summary=print_summary, + rate_limit_errors=rate_limit_errors, + concurrency=concurrency, + ) + if print_summary: + print(ran_experiment) + return ran_experiment + + def run_experiment( dataset: Dataset, task: ExperimentTask, @@ -423,6 +677,171 @@ async def async_run_experiment(test_case: TestCase) -> ExperimentRun: return ran_experiment +async def async_evaluate_experiment( + experiment: Experiment, + evaluators: Evaluators, + *, + dry_run: Union[bool, int] = False, + print_summary: bool = True, + rate_limit_errors: Optional[RateLimitErrors] = None, + concurrency: int = 3, +) -> RanExperiment: + if not dry_run and _is_dry_run(experiment): + dry_run = True + evaluators_by_name = _evaluators_by_name(evaluators) + if not evaluators_by_name: + raise ValueError("Must specify at least one Evaluator") + sync_client, async_client = _phoenix_clients() + dataset_id = experiment.dataset_id + dataset_version_id = experiment.dataset_version_id + if isinstance(experiment, RanExperiment): + ran_experiment: RanExperiment = experiment + else: + dataset = Dataset.from_dict( + ( + await async_client.get( + f"/v1/datasets/{dataset_id}/examples", + params={"version_id": str(dataset_version_id)}, + ) + ).json()["data"] + ) + if not dataset.examples: + raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}") + experiment_runs = { + exp_run["id"]: ExperimentRun.from_dict(exp_run) + for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"] + } + if not experiment_runs: + raise ValueError("Experiment has not been run") + params = ExperimentParameters(n_examples=len(dataset.examples)) + task_summary = TaskSummary.from_task_runs(params, experiment_runs.values()) + ran_experiment = object.__new__(RanExperiment) + ran_experiment.__init__( # type: ignore[misc] + dataset=dataset, + params=params, + runs=experiment_runs, + task_summary=task_summary, + **_asdict(experiment), + ) + print("๐Ÿง  Evaluation started.") + examples = ran_experiment.dataset.examples + if dry_run: + if not _is_dry_run(ran_experiment): + 
dataset = ran_experiment.dataset + examples = { + (ex := dataset[i]).id: ex + for i in pd.Series(range(len(dataset))) + .sample(min(len(dataset), int(dry_run)), random_state=42) + .sort_values() + } + dataset = replace(ran_experiment.dataset, examples=examples) + ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset) + id_selection = "\n".join(examples) + print(f"๐ŸŒต๏ธ This is a dry-run for these example IDs:\n{id_selection}") + # not all dataset examples have associated experiment runs, so we need to pair them up + example_run_pairs = [] + examples = ran_experiment.dataset.examples + for exp_run in ran_experiment.runs.values(): + example = examples.get(exp_run.dataset_example_id) + if example: + example_run_pairs.append((deepcopy(example), exp_run)) + evaluation_input = [ + (example, run, evaluator) + for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values()) + ] + + tracer, resource = _get_tracer(None if dry_run else "evaluators") + root_span_kind = EVALUATOR + + async def async_evaluate_run( + obj: Tuple[Example, ExperimentRun, Evaluator], + ) -> ExperimentEvaluationRun: + example, experiment_run, evaluator = obj + result: Optional[EvaluationResult] = None + error: Optional[BaseException] = None + status = Status(StatusCode.OK) + root_span_name = f"Evaluation: {evaluator.name}" + with ExitStack() as stack: + span: Span = stack.enter_context( + tracer.start_as_current_span(root_span_name, context=Context()) + ) + stack.enter_context(capture_spans(resource)) + try: + result = await evaluator.async_evaluate( + output=deepcopy(experiment_run.output), + expected=example.output, + reference=example.output, + input=example.input, + metadata=example.metadata, + ) + except BaseException as exc: + span.record_exception(exc) + status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}") + error = exc + _print_experiment_error( + exc, + example_id=example.id, + repetition_number=experiment_run.repetition_number, + kind="evaluator", + ) + if result: + span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True))) + span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind) + span.set_status(status) + + eval_run = ExperimentEvaluationRun( + experiment_run_id=experiment_run.id, + start_time=_decode_unix_nano(cast(int, span.start_time)), + end_time=_decode_unix_nano(cast(int, span.end_time)), + name=evaluator.name, + annotator_kind=evaluator.kind, + error=repr(error) if error else None, + result=result, + trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call] + ) + if not dry_run: + # Below is a workaround to avoid timeout errors sometimes + # encountered when the evaluator is a synchronous function + # that blocks for too long. + resp = await async_client.post("/v1/experiment_evaluations", json=jsonify(eval_run)) + resp.raise_for_status() + eval_run = replace(eval_run, id=resp.json()["data"]["id"]) + return eval_run + + _errors: Tuple[Type[BaseException], ...] 
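+    # Same pattern as the task runs above: wrap the evaluator coroutine with
+    # per-error-type rate limiters, then evaluate every (example, run, evaluator)
+    # triple concurrently via the AsyncExecutor.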
+ if not isinstance(rate_limit_errors, Sequence): + _errors = (rate_limit_errors,) if rate_limit_errors is not None else () + else: + _errors = tuple(filter(None, rate_limit_errors)) + rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors] + + rate_limited_async_evaluate_run = functools.reduce( + lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run + ) + + executor = AsyncExecutor( + rate_limited_async_evaluate_run, + concurrency=concurrency, + tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"), + max_retries=0, + exit_on_error=False, + fallback_return_value=None, + ) + + eval_runs, _execution_details = await executor.execute(evaluation_input) + eval_summary = EvaluationSummary.from_eval_runs( + EvaluationParameters( + eval_names=frozenset(evaluators_by_name), + exp_params=ran_experiment.params, + ), + *eval_runs, + ) + ran_experiment = ran_experiment.add(eval_summary, *eval_runs) + if print_summary: + print(ran_experiment) + return ran_experiment + + def evaluate_experiment( experiment: Experiment, evaluators: Evaluators, diff --git a/tests/datasets/test_experiments.py b/tests/datasets/test_experiments.py index 5c3fa3e74e..85db9b5e6d 100644 --- a/tests/datasets/test_experiments.py +++ b/tests/datasets/test_experiments.py @@ -1,6 +1,5 @@ import asyncio import json -import platform from datetime import datetime, timezone from typing import Any, Dict from unittest.mock import patch @@ -11,13 +10,13 @@ from strawberry.relay import GlobalID from phoenix.db import models -from phoenix.experiments import evaluate_experiment, run_experiment from phoenix.experiments.evaluators import ( ConcisenessEvaluator, ContainsKeyword, HelpfulnessEvaluator, create_evaluator, ) +from phoenix.experiments.functions import async_evaluate_experiment, async_run_experiment from phoenix.experiments.types import ( AnnotatorKind, Dataset, @@ -30,7 +29,6 @@ from phoenix.server.types import DbSessionFactory -@pytest.mark.skipif(platform.system() in ("Windows", "Darwin"), reason="Flaky on CI") @patch("opentelemetry.sdk.trace.export.SimpleSpanProcessor.on_end") async def test_run_experiment( _, @@ -39,9 +37,6 @@ async def test_run_experiment( simple_dataset: Any, dialect: str, ) -> None: - if dialect == "postgresql": - pytest.xfail("This test fails on PostgreSQL") - example_input = {"input": "fancy input 1"} example_output = {"output": "fancy output 1"} example_metadata = {"metadata": "fancy metadata 1"} @@ -73,18 +68,18 @@ def experiment_task(_) -> Dict[str, str]: lambda output: ContainsKeyword(keyword="doesn't matter").evaluate( output=json.dumps(output) ), - # lambda output: output == task_output, + lambda output: output == task_output, lambda output: output is not task_output, lambda input: input == example_input, lambda input: input is not example_input, - # lambda expected: expected == example_output, - # lambda expected: expected is not example_output, - # lambda metadata: metadata == example_metadata, - # lambda metadata: metadata is not example_metadata, - # lambda reference, expected: expected == reference, - # lambda reference, expected: expected is reference, + lambda expected: expected == example_output, + lambda expected: expected is not example_output, + lambda metadata: metadata == example_metadata, + lambda metadata: metadata is not example_metadata, + lambda reference, expected: expected == reference, + lambda reference, expected: expected is reference, ] - experiment = run_experiment( + experiment = await 
async_run_experiment( dataset=test_dataset, task=experiment_task, experiment_name="test", @@ -94,7 +89,6 @@ def experiment_task(_) -> Dict[str, str]: print_summary=False, ) - await asyncio.sleep(5) experiment_id = from_global_id_with_expected_type( GlobalID.from_id(experiment.id), "Experiment" ) @@ -103,10 +97,9 @@ def experiment_task(_) -> Dict[str, str]: assert len(experiment.runs) == 1, "Experiment has 1 example" runs = [run for run in experiment.runs.values()] assert runs[0].output == {"doesn't matter": "this is the output"} - # assert len(experiment.eval_runs) == len(evaluators) # this assertion is flaky + assert len(experiment.eval_runs) == len(evaluators) -@pytest.mark.skipif(platform.system() in ("Windows", "Darwin"), reason="Flaky on CI") @patch("opentelemetry.sdk.trace.export.SimpleSpanProcessor.on_end") async def test_run_experiment_with_llm_eval( _, @@ -115,9 +108,6 @@ async def test_run_experiment_with_llm_eval( simple_dataset: Any, dialect: str, ) -> None: - if dialect == "postgresql": - pytest.xfail("This test fails on PostgreSQL") - test_dataset = Dataset( id=str(GlobalID("Dataset", "0")), version_id=str(GlobalID("DatasetVersion", "0")), @@ -158,7 +148,7 @@ def experiment_task(input, example, metadata) -> None: assert isinstance(example, Example) return "doesn't matter, this is the output" - experiment = run_experiment( + experiment = await async_run_experiment( dataset=test_dataset, task=experiment_task, experiment_name="test", @@ -169,7 +159,6 @@ def experiment_task(input, example, metadata) -> None: HelpfulnessEvaluator(model=PostitiveFakeLLMModel()), ], ) - await asyncio.sleep(5) experiment_id = from_global_id_with_expected_type( GlobalID.from_id(experiment.id), "Experiment" ) @@ -182,7 +171,6 @@ def experiment_task(input, example, metadata) -> None: # assert experiment.eval_runs[1].result.score == 1.0 -@pytest.mark.skipif(platform.system() in ("Windows", "Darwin"), reason="Flaky on CI") @patch("opentelemetry.sdk.trace.export.SimpleSpanProcessor.on_end") async def test_run_evaluation( _, @@ -191,9 +179,6 @@ async def test_run_evaluation( simple_dataset_with_one_experiment_run: Any, dialect: str, ) -> None: - if dialect == "postgresql": - pytest.xfail("This test fails on PostgreSQL") - experiment = Experiment( id=str(GlobalID("Experiment", "0")), dataset_id=str(GlobalID("Dataset", "0")), @@ -202,7 +187,7 @@ async def test_run_evaluation( project_name="test", ) with patch("phoenix.experiments.functions._phoenix_clients", return_value=httpx_clients): - evaluate_experiment(experiment, evaluators=[lambda _: _]) + await async_evaluate_experiment(experiment, evaluators=[lambda _: _]) await asyncio.sleep(1) # Wait for the evaluations to be inserted async with db() as session: evaluations = list(await session.scalars(select(models.ExperimentRunAnnotation))) From a619bfafae4fb0d1a6508672f929308c17b7b12e Mon Sep 17 00:00:00 2001 From: Dustin Ngo Date: Thu, 26 Sep 2024 11:39:39 -0700 Subject: [PATCH 3/3] =?UTF-8?q?Ruff=20=F0=9F=90=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/datasets/test_experiments.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/datasets/test_experiments.py b/tests/datasets/test_experiments.py index 85db9b5e6d..59cdb7540f 100644 --- a/tests/datasets/test_experiments.py +++ b/tests/datasets/test_experiments.py @@ -5,7 +5,6 @@ from unittest.mock import patch import httpx -import pytest from sqlalchemy import select from strawberry.relay import GlobalID