Include tools information in evaluation output directory names
AlexCuadron committed Feb 26, 2025
1 parent 8a5dc59 commit 8ffe33e
Showing 6 changed files with 82 additions and 3 deletions.
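
In effect, make_metadata (evaluation/utils/shared.py, below) now appends a _tools_ segment to every run's output directory name. With the default CodeActAgent flags, a path that previously looked like (model name illustrative, not from this commit)

    evaluation/evaluation_outputs/AiderBench/CodeActAgent/claude-3-5-sonnet_maxiter_30

becomes

    evaluation/evaluation_outputs/AiderBench/CodeActAgent/claude-3-5-sonnet_maxiter_30_tools_bash+finish+str_replace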
10 changes: 10 additions & 0 deletions evaluation/benchmarks/aider_bench/run_infer.py
@@ -295,13 +295,23 @@ def process_instance(
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Create details dictionary with agent configuration
+    agent_details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": False,
+            "codeact_enable_llm_editor": False,
+        }
+    }
+
     metadata = make_metadata(
         llm_config,
         'AiderBench',
         args.agent_cls,
         args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
+        details=agent_details,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

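Note: all three flags are False here, so get_tools_string (added in evaluation/utils/shared.py, below) resolves the toolset to bash+finish+str_replace. The same block recurs verbatim in the three polyglot scripts that follow.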
10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/run_infer.py
@@ -504,13 +504,23 @@ def add_arguments(parser):
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Create details dictionary with agent configuration
+    agent_details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": False,
+            "codeact_enable_llm_editor": False,
+        }
+    }
+
     metadata = make_metadata(
         llm_config,
         'PolyglotBenchmark',
         args.agent_cls,
         args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
+        details=agent_details,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/test_all_languages.py
@@ -44,6 +44,15 @@ def test_language(language, model, agent):
print(f"Could not find LLM config: {model}")
return False

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

# Create metadata
metadata = make_metadata(
llm_config,
Expand All @@ -52,6 +61,7 @@ def test_language(language, model, agent):
30, # max_iterations
f"test_{language}",
f"evaluation/evaluation_outputs/test_{language}",
details=agent_details,
)

# Process the instance
10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/test_run.py
@@ -50,6 +50,15 @@ def main():
print(f"Could not find LLM config: {args.model}")
return

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

# Create metadata
metadata = make_metadata(
llm_config,
Expand All @@ -58,6 +67,7 @@ def main():
30, # max_iterations
"test",
"evaluation/evaluation_outputs/test",
details=agent_details,
)

# Process the instance
9 changes: 8 additions & 1 deletion evaluation/benchmarks/swe_bench/run_infer.py
@@ -531,7 +531,14 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

-    details = {}
+    # Create details dictionary with agent configuration
+    details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": RUN_WITH_BROWSING,
+            "codeact_enable_llm_editor": False,
+        }
+    }
     _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

     dataset_descrption = (
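Unlike the other benchmarks, SWE-Bench threads its existing RUN_WITH_BROWSING flag into the details dictionary, so browsing-enabled runs get the suffix _tools_bash+finish+str_replace+web_read+browser under the mapping in get_tools_string (below).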
36 changes: 34 additions & 2 deletions evaluation/utils/shared.py
@@ -158,6 +158,35 @@ def cleanup():
     process.join()


+def get_tools_string(agent_class: str, details: dict[str, Any] | None = None) -> str:
+    """Generate a string representation of the tools used by the agent.
+
+    Args:
+        agent_class: The agent class name.
+        details: Additional details that might contain tool configuration.
+
+    Returns:
+        A string representation of the tools used, e.g., "bash+finish+str_replace".
+    """
+    # Default tools for CodeActAgent
+    if agent_class == "CodeActAgent":
+        tools = ["bash", "finish", "str_replace"]
+
+        # Check if additional tools are enabled
+        if details and "agent_config" in details:
+            agent_config = details.get("agent_config", {})
+            if agent_config.get("codeact_enable_browsing", False):
+                tools.extend(["web_read", "browser"])
+            if agent_config.get("codeact_enable_jupyter", False):
+                tools.append("ipython")
+            if agent_config.get("codeact_enable_llm_editor", False):
+                # Replace str_replace with llm_editor wherever it sits in the list
+                tools[tools.index("str_replace")] = "llm_editor"
+
+        return "+".join(tools)
+
+    # For other agents, return a default string
+    return "default_tools"
+
 def make_metadata(
     llm_config: LLMConfig,
     dataset_name: str,
@@ -172,12 +201,15 @@ def make_metadata(
     model_name = llm_config.model.split('/')[-1]
     model_path = model_name.replace(':', '_').replace('@', '-')
     eval_note = f'_N_{eval_note}' if eval_note else ''

+    # Get tools string
+    tools_string = get_tools_string(agent_class, details)
+
     eval_output_path = os.path.join(
         eval_output_dir,
         dataset_name,
         agent_class,
-        f'{model_path}_maxiter_{max_iterations}{eval_note}',
+        f'{model_path}_maxiter_{max_iterations}_tools_{tools_string}{eval_note}',
     )

     pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
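A minimal sketch of how the new helper behaves, assuming the str_replace-aware replacement above; the example values are illustrative, not part of the commit:

    details = {
        "agent_config": {
            "codeact_enable_jupyter": True,
            "codeact_enable_browsing": False,
            "codeact_enable_llm_editor": True,
        }
    }
    # jupyter appends ipython; llm_editor swaps out str_replace
    assert get_tools_string("CodeActAgent", details) == "bash+finish+llm_editor+ipython"
    # no details at all falls back to the three defaults
    assert get_tools_string("CodeActAgent", None) == "bash+finish+str_replace"
    # non-CodeAct agents get a generic marker
    assert get_tools_string("SomeOtherAgent", None) == "default_tools"

A run with default flags therefore lands in a directory such as .../AiderBench/CodeActAgent/&lt;model&gt;_maxiter_30_tools_bash+finish+str_replace.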
