biocypher · mengerj · Dec 10, 2024 · Dec 10, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,5 @@ serve.sh
 .blast/*
 .api_results/*
 *.coverage
+scaling_test.py
+myvenv/
diff --git a/benchmark/conftest.py b/benchmark/conftest.py
@@ -8,6 +8,7 @@
 
 from biochatter.llm_connect import (
     AnthropicConversation,
+    Conversation,
     GptConversation,
     XinferenceConversation,
 )
@@ -17,20 +18,24 @@
 from .load_dataset import get_benchmark_dataset
 
 # how often should each benchmark be run?
-N_ITERATIONS = 1
+N_ITERATIONS = 3
 
 # which dataset should be used for benchmarking?
 BENCHMARK_DATASET = get_benchmark_dataset()
 
 # which models should be benchmarked?
 OPENAI_MODEL_NAMES = [
-    # "gpt-3.5-turbo-0125",
+    "gpt-3.5-turbo-0125",
     # "gpt-4-0613",
+    # "gpt-4-1106-preview",
     # "gpt-4-0125-preview",
     # "gpt-4-turbo-2024-04-09",
     # "gpt-4o-2024-05-13",
-    "gpt-4o-2024-08-06",
+    # "gpt-4o-2024-08-06",
+    # "gpt-4o-2024-11-20",
     # "gpt-4o-mini-2024-07-18",
+    # "o1-preview-2024-09-12",
+    # "o1-mini-2024-09-12",
 ]
 
 ANTHROPIC_MODEL_NAMES = [
@@ -288,8 +293,9 @@ def client():
 
 @pytest.fixture(scope="session", autouse=True)
 def register_model(client):
-    """Register custom (non-builtin) models with the Xinference server. Should only
-    happen once per session.
+    """Register custom (non-builtin) models with the Xinference server.
+
+    Should only happen once per session.
     """
     if client is None:
         return  # ignore if server is not running
@@ -302,16 +308,11 @@ def register_model(client):
             model = fd.read()
         client.register_model(model_type="LLM", model=model, persist=False)
 
-    # if "custom-llama-3-instruct-70b" not in registered_models:
-    #     with open("benchmark/models/custom-llama-3-instruct-70b.json") as fd:
-    #         model = fd.read()
-    #     client.register_model(model_type="LLM", model=model, persist=False)
-
 
 def pytest_collection_modifyitems(items):
-    """Pytest hook function to modify the collected test items.
-    Called once after collection has been performed.
+    """Modify the collected test items (Pytest hook).
 
+    Called once after collection has been performed.
     Used here to order items by their `callspec.id` (which starts with the
     model name and configuration) to ensure running all tests for one model
     before moving to the next model.
@@ -329,9 +330,9 @@ def model_name(request):
     return request.param
 
 
-@pytest.fixture()
+@pytest.fixture
 def multiple_testing(request):
-    def run_multiple_times(test_func, *args, **kwargs):
+    def run_multiple_times(test_func, *args, **kwargs) -> tuple[str, int, int]:
         scores = []
         for _ in range(N_ITERATIONS):
             score, max = test_func(*args, **kwargs)
@@ -344,23 +345,22 @@ def run_multiple_times(test_func, *args, **kwargs):
 
 def calculate_bool_vector_score(vector: list[bool]) -> tuple[int, int]:
     score = sum(vector)
-    max = len(vector)
-    return (score, max)
+    max_score = len(vector)
+    return (score, max_score)
 
 
-@pytest.fixture()
-def conversation(request, model_name, client):
-    """Decides whether to run the test or skip due to the test having been run
-    before. If not skipped, will create a conversation object for interfacing
-    with the model.
+@pytest.fixture
+def conversation(request, model_name, client) -> Conversation:
+    """Return conversation object.
+
+    Could skip due to the test having been run before (but not sure how to
+    implement yet). If not skipped, will create a conversation object for
+    interfacing with the model.
     """
     test_name = request.node.originalname.replace("test_", "")
-    subtask = "?"  # TODO can we get the subtask here?
+    subtask = "?"  # TODO: can we get the subtask here?
     if benchmark_already_executed(model_name, test_name, subtask):
         pass
-        # pytest.skip(
-        #     f"benchmark {test_name}: {subtask} with {model_name} already executed"
-        # )
 
     if model_name in OPENAI_MODEL_NAMES:
         conversation = GptConversation(
@@ -428,7 +428,6 @@ def conversation(request, model_name, client):
             quantization=_model_quantization,
         )
 
-        # return conversation
         conversation = XinferenceConversation(
             base_url=BENCHMARK_URL,
             model_name=_model_name,
@@ -439,14 +438,14 @@ def conversation(request, model_name, client):
     return conversation
 
 
-@pytest.fixture()
+@pytest.fixture
 def prompt_engine(request, model_name, conversation):
-    """Generates a constructor for the prompt engine for the current model name."""
+    """Generate a constructor for the prompt engine for current model name."""
 
-    def conversation_factory():
+    def conversation_factory() -> Conversation:
         return conversation
 
-    def setup_prompt_engine(kg_schema_dict):
+    def setup_prompt_engine(kg_schema_dict) -> BioCypherPromptEngine:
         return BioCypherPromptEngine(
             schema_config_or_info_dict=kg_schema_dict,
             model_name=model_name,
@@ -456,8 +455,8 @@ def setup_prompt_engine(kg_schema_dict):
     return setup_prompt_engine
 
 
-@pytest.fixture()
-def evaluation_conversation():
+@pytest.fixture
+def evaluation_conversation() -> Conversation:
     conversation = GptConversation(
         model_name="gpt-3.5-turbo",
         prompts={},
@@ -478,7 +477,9 @@ def pytest_addoption(parser):
 
 @pytest.fixture(autouse=True, scope="session")
 def delete_results_csv_file_content(request):
-    """If --run-all is set, the former benchmark data are deleted and all
+    """Delete the content of the result files.
+
+    If --run-all is set, the former benchmark data are deleted and all
     benchmarks are executed again.
     """
     if request.config.getoption("--run-all"):
@@ -524,7 +525,8 @@ def result_files():
 
 
 def pytest_generate_tests(metafunc):
-    """Pytest hook function to generate test cases.
+    """Generate test cases (Pytest hook).
+
     Called once for each test case in the benchmark test collection.
     If fixture is part of test declaration, the test is parametrized.
     """
@@ -559,7 +561,7 @@ def pytest_generate_tests(metafunc):
         )
 
 
-@pytest.fixture()
+@pytest.fixture
 def kg_schemas():
     data = BENCHMARK_DATASET
     return data["kg_schemas"]