Include tools information in evaluation output directory names
AlexCuadron committed Feb 26, 2025
1 parent 8a5dc59 commit 8ffe33e
Showing 6 changed files with 82 additions and 3 deletions.
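
In effect, make_metadata (evaluation/utils/shared.py, below) now appends a _tools_ segment to every run's output directory name. With the default CodeActAgent flags, a path that previously looked like (model name illustrative, not from this commit)

    evaluation/evaluation_outputs/AiderBench/CodeActAgent/claude-3-5-sonnet_maxiter_30

becomes

    evaluation/evaluation_outputs/AiderBench/CodeActAgent/claude-3-5-sonnet_maxiter_30_tools_bash+finish+str_replace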
10 changes: 10 additions & 0 deletions evaluation/benchmarks/aider_bench/run_infer.py
@@ -295,13 +295,23 @@ def process_instance(
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Create details dictionary with agent configuration
+    agent_details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": False,
+            "codeact_enable_llm_editor": False,
+        }
+    }
+
     metadata = make_metadata(
         llm_config,
         'AiderBench',
         args.agent_cls,
         args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
+        details=agent_details,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

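Note: all three flags are False here, so get_tools_string (added in evaluation/utils/shared.py, below) resolves the toolset to bash+finish+str_replace. The same block recurs verbatim in the three polyglot scripts that follow.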
10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/run_infer.py
@@ -504,13 +504,23 @@ def add_arguments(parser):
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

+    # Create details dictionary with agent configuration
+    agent_details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": False,
+            "codeact_enable_llm_editor": False,
+        }
+    }
+
     metadata = make_metadata(
         llm_config,
         'PolyglotBenchmark',
         args.agent_cls,
         args.max_iterations,
         args.eval_note,
         args.eval_output_dir,
+        details=agent_details,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/test_all_languages.py
@@ -44,6 +44,15 @@ def test_language(language, model, agent):
print(f"Could not find LLM config: {model}")
return False

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

# Create metadata
metadata = make_metadata(
llm_config,
Expand All @@ -52,6 +61,7 @@ def test_language(language, model, agent):
30, # max_iterations
f"test_{language}",
f"evaluation/evaluation_outputs/test_{language}",
details=agent_details,
)

# Process the instance
10 changes: 10 additions & 0 deletions evaluation/benchmarks/polyglot_benchmark/test_run.py
@@ -50,6 +50,15 @@ def main():
print(f"Could not find LLM config: {args.model}")
return

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

# Create metadata
metadata = make_metadata(
llm_config,
Expand All @@ -58,6 +67,7 @@ def main():
30, # max_iterations
"test",
"evaluation/evaluation_outputs/test",
details=agent_details,
)

# Process the instance
9 changes: 8 additions & 1 deletion evaluation/benchmarks/swe_bench/run_infer.py
@@ -531,7 +531,14 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     if llm_config is None:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

-    details = {}
+    # Create details dictionary with agent configuration
+    details = {
+        "agent_config": {
+            "codeact_enable_jupyter": False,
+            "codeact_enable_browsing": RUN_WITH_BROWSING,
+            "codeact_enable_llm_editor": False,
+        }
+    }
     _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

     dataset_descrption = (
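Unlike the other benchmarks, SWE-Bench threads its existing RUN_WITH_BROWSING flag into the details dictionary, so browsing-enabled runs get the suffix _tools_bash+finish+str_replace+web_read+browser under the mapping in get_tools_string (below).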
36 changes: 34 additions & 2 deletions evaluation/utils/shared.py
@@ -158,6 +158,35 @@ def cleanup():
     process.join()


+def get_tools_string(agent_class: str, details: dict[str, Any] | None = None) -> str:
+    """Generate a string representation of the tools used by the agent.
+
+    Args:
+        agent_class: The agent class name.
+        details: Additional details that might contain tool configuration.
+
+    Returns:
+        A string representation of the tools used, e.g., "bash+finish+str_replace".
+    """
+    # Default tools for CodeActAgent
+    if agent_class == "CodeActAgent":
+        tools = ["bash", "finish", "str_replace"]
+
+        # Check if additional tools are enabled
+        if details and "agent_config" in details:
+            agent_config = details.get("agent_config", {})
+            if agent_config.get("codeact_enable_browsing", False):
+                tools.extend(["web_read", "browser"])
+            if agent_config.get("codeact_enable_jupyter", False):
+                tools.append("ipython")
+            if agent_config.get("codeact_enable_llm_editor", False):
+                # Replace str_replace with llm_editor wherever it sits in the list
+                tools[tools.index("str_replace")] = "llm_editor"
+
+        return "+".join(tools)
+
+    # For other agents, return a default string
+    return "default_tools"
+
 def make_metadata(
     llm_config: LLMConfig,
     dataset_name: str,
@@ -172,12 +201,15 @@ def make_metadata(
     model_name = llm_config.model.split('/')[-1]
     model_path = model_name.replace(':', '_').replace('@', '-')
     eval_note = f'_N_{eval_note}' if eval_note else ''

+    # Get tools string
+    tools_string = get_tools_string(agent_class, details)
+
     eval_output_path = os.path.join(
         eval_output_dir,
         dataset_name,
         agent_class,
-        f'{model_path}_maxiter_{max_iterations}{eval_note}',
+        f'{model_path}_maxiter_{max_iterations}_tools_{tools_string}{eval_note}',
     )

     pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
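A minimal sketch of how the new helper behaves, assuming the str_replace-aware replacement above; the example values are illustrative, not part of the commit:

    details = {
        "agent_config": {
            "codeact_enable_jupyter": True,
            "codeact_enable_browsing": False,
            "codeact_enable_llm_editor": True,
        }
    }
    # jupyter appends ipython; llm_editor swaps out str_replace
    assert get_tools_string("CodeActAgent", details) == "bash+finish+llm_editor+ipython"
    # no details at all falls back to the three defaults
    assert get_tools_string("CodeActAgent", None) == "bash+finish+str_replace"
    # non-CodeAct agents get a generic marker
    assert get_tools_string("SomeOtherAgent", None) == "default_tools"

A run with default flags therefore lands in a directory such as .../AiderBench/CodeActAgent/&lt;model&gt;_maxiter_30_tools_bash+finish+str_replace.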
