diff --git a/evaluation/README.md b/evaluation/README.md index cfaf1ba36c4d..5f6963d93baf 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -60,6 +60,7 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across [s - AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/) - Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/) - DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/) +- APPS: [`evaluation/benchmarks/apps`](./benchmarks/apps/) ### Web Browsing @@ -76,6 +77,10 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across [s - Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA) - ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning) - ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench) +- MATH: [`evaluation/benchmarks/math`](./benchmarks/math/) +- HotpotQA: [`evaluation/benchmarks/hotpotqa`](./benchmarks/hotpotqa/) +- WikiTableQuestion: [`evaluation/benchmarks/wiki_table_question`](./benchmarks/wiki_table_question/) +- AlfWorld: [`evaluation/benchmarks/alfworld`](./benchmarks/alfworld/) ### Real World diff --git a/evaluation/benchmarks/alfworld/__init__.py b/evaluation/benchmarks/alfworld/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/apps/README.md b/evaluation/benchmarks/apps/README.md new file mode 100644 index 000000000000..339fdc377395 --- /dev/null +++ b/evaluation/benchmarks/apps/README.md @@ -0,0 +1,45 @@ +# APPS Benchmark Evaluation + +This folder contains evaluation harness for evaluating agents on the [APPS benchmark](https://huggingface.co/datasets/codeparrot/apps). + +APPS is a benchmark for code generation that consists of 10,000 problems, which range from introductory programming problems to competition-level problems. The benchmark contains natural language descriptions of problems, canonical solutions, and test cases. + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. + +## Start the evaluation + +```bash +./evaluation/benchmarks/apps/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] +``` + +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It could also be a release tag like `0.9.0`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`. +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire test set. +- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). +- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. 
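
The IDs passed via `eval_ids` are the zero-based row indices that `prepare_apps_dataset()` in `run_infer.py` assigns as `instance_id`. To check which APPS problems a given set of IDs maps to, you can inspect the raw dataset the harness loads. The sketch below mirrors the `load_dataset` call in `run_infer.py`, but the column names are assumptions taken from the Hugging Face dataset card, so verify them against the split you actually load.

```python
# Peek at the raw APPS test split that run_infer.py loads.
# Column names here are assumptions from the dataset card
# (run_infer.py renames 'question' -> 'problem' and 'test' -> 'test_cases');
# double-check they exist before relying on specific eval_ids.
from datasets import load_dataset

ds = load_dataset('codeparrot/apps', split='test')
print(ds.column_names)
print(ds[0]['question'][:300])  # instance_id 0 corresponds to the first row
```
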
+ +Following is the basic command to start the evaluation: + +```bash +# Run benchmark without evaluation +./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" eval +``` + +## Summarize Results + +```bash +poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Full example: + +```bash +poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/APPS/CodeActAgent/gpt-4o-2024-05-13@20240620_maxiter_30_N_v1.9/output.jsonl +``` \ No newline at end of file diff --git a/evaluation/benchmarks/apps/__init__.py b/evaluation/benchmarks/apps/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/apps/run_infer.py b/evaluation/benchmarks/apps/run_infer.py new file mode 100644 index 000000000000..54ac0398bacc --- /dev/null +++ b/evaluation/benchmarks/apps/run_infer.py @@ -0,0 +1,319 @@ +import asyncio +import copy +import os +import tempfile +from typing import Any + +import pandas as pd +from datasets import load_dataset + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure any environment variables +SKIP_NUM = os.environ.get('SKIP_NUM') +SKIP_NUM = ( + int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None +) + + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.11-bookworm' + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'docker'), + max_iterations=metadata.max_iterations, + sandbox=sandbox_config, + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # copy 'draft_editor' config if exists + config_copy = copy.deepcopy(config) + load_from_toml(config_copy) + if 'draft_editor' in config_copy.llms: + config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor') + + return config + + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, +): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. 
+ """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Set up workspace + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Create problem file + with tempfile.TemporaryDirectory() as tmpdir: + file_path = os.path.join(tmpdir, 'problem.txt') + with open(file_path, 'w') as f: + f.write(instance.problem) + runtime.copy_to( + file_path, + '/workspace', + ) + + # Create test cases file + file_path = os.path.join(tmpdir, 'test_cases.py') + with open(file_path, 'w') as f: + f.write(instance.test_cases) + runtime.copy_to( + file_path, + '/workspace', + ) + + logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called after the agent has run. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Check if solution.py exists + action = CmdRunAction(command='ls -la /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + # Run test cases + action = CmdRunAction(command='python3 /workspace/test_cases.py') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + exit_code = 1 + if isinstance(obs, CmdOutputObservation): + exit_code = obs.exit_code + + logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n") + + runtime.close() + + return { + 'test_output': obs.content, + 'exit_code': exit_code, + } + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = f"""You are given a programming problem to solve. The problem description is in the file 'problem.txt'. + +Please read the problem carefully and implement a solution in Python. Save your solution in a file named 'solution.py'. + +After implementing your solution, you can test it by running 'python3 test_cases.py'. This will execute your solution against a set of test cases. + +IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP. 
+""" + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + exit_code = return_val['exit_code'] + test_output = return_val['test_output'] + + test_result = { + 'exit_code': exit_code, + 'test_output': test_output, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +def prepare_apps_dataset(): + """Prepare the APPS dataset for evaluation.""" + # Load the APPS dataset + dataset = load_dataset('codeparrot/apps', split='test') + + # Convert to pandas DataFrame + df = dataset.to_pandas() + + # Add instance_id column + df['instance_id'] = df.index + + # Rename columns to match expected format + df = df.rename(columns={ + 'question': 'problem', + 'test': 'test_cases', + }) + + return df + + +if __name__ == '__main__': + args = parse_arguments() + + # Prepare the APPS dataset + apps_dataset = prepare_apps_dataset() + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + + metadata = make_metadata( + llm_config, + 'APPS', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + details=agent_details, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + instances = prepare_dataset( + apps_dataset, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + skip_num=SKIP_NUM, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) \ No newline at end of file diff --git a/evaluation/benchmarks/apps/scripts/run_infer.sh b/evaluation/benchmarks/apps/scripts/run_infer.sh new file mode 100755 index 
000000000000..c053c6e3fbae --- /dev/null +++ b/evaluation/benchmarks/apps/scripts/run_infer.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + +MODEL_CONFIG=$1 +COMMIT_HASH=$2 +AGENT=$3 +EVAL_LIMIT=$4 +NUM_WORKERS=$5 +EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark + +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +get_openhands_version + +echo "AGENT: $AGENT" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +EVAL_NOTE=$OPENHANDS_VERSION + +COMMAND="export PYTHONPATH=evaluation/benchmarks/apps:\$PYTHONPATH && poetry run python evaluation/benchmarks/apps/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 30 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $EVAL_NOTE" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then + echo "EVAL_IDS: $EVAL_IDS" + COMMAND="$COMMAND --eval-ids $EVAL_IDS" +fi + +# Run the command +eval $COMMAND + +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/APPS/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/APPS/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi + +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/apps/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/apps/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi +fi \ No newline at end of file diff --git a/evaluation/benchmarks/apps/scripts/summarize_results.py b/evaluation/benchmarks/apps/scripts/summarize_results.py new file mode 100755 index 000000000000..a661ac078d87 --- /dev/null +++ b/evaluation/benchmarks/apps/scripts/summarize_results.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +from collections import defaultdict + + +def load_jsonl(file_path): + """Load a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + + +def summarize_results(output_file): + """Summarize the results of the APPS benchmark.""" + print(f"Summarizing results from {output_file}") + + # Load the results + results = load_jsonl(output_file) + + # Count the number of instances that passed and failed + passed = [] + failed = [] + + for result in results: + instance_id = result['instance_id'] + test_result = result.get('test_result', {}) + exit_code = test_result.get('exit_code', 1) + + if exit_code == 0: + passed.append(instance_id) + else: + failed.append(instance_id) + + # Print the summary + print(f"\nTotal instances: {len(results)}") + print(f"Passed: {len(passed)} ({len(passed) / len(results) * 100:.2f}%)") + print(f"Failed: {len(failed)} ({len(failed) / len(results) * 100:.2f}%)") + + # Print the list of passed and failed instances + print("\nPassed instances:") + for instance_id in passed: + print(f" - {instance_id}") + + print("\nFailed instances:") + for instance_id in failed: + print(f" - {instance_id}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize APPS benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/hotpotqa/README.md b/evaluation/benchmarks/hotpotqa/README.md new file mode 100644 index 000000000000..3aea9b507293 --- /dev/null +++ b/evaluation/benchmarks/hotpotqa/README.md @@ -0,0 +1,45 @@ +# HotpotQA Benchmark Evaluation + +This folder contains evaluation harness for evaluating agents on the [HotpotQA benchmark](http://curtis.ml.cmu.edu/datasets/hotpot/). + +HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong supervision for supporting facts to enable more explainable question answering systems. + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. + +## Start the evaluation + +```bash +./evaluation/benchmarks/hotpotqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] +``` + +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It could also be a release tag like `0.9.0`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`. +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire test set. +- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). 
+- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. + +Following is the basic command to start the evaluation: + +```bash +# Run benchmark without evaluation +./evaluation/benchmarks/hotpotqa/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/hotpotqa/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" eval +``` + +## Summarize Results + +```bash +poetry run python ./evaluation/benchmarks/hotpotqa/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Full example: + +```bash +poetry run python ./evaluation/benchmarks/hotpotqa/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/HotpotQA/CodeActAgent/gpt-4o-2024-05-13@20240620_maxiter_30_N_v1.9/output.jsonl +``` \ No newline at end of file diff --git a/evaluation/benchmarks/hotpotqa/__init__.py b/evaluation/benchmarks/hotpotqa/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/hotpotqa/run_infer.py b/evaluation/benchmarks/hotpotqa/run_infer.py new file mode 100644 index 000000000000..b0ac799e1979 --- /dev/null +++ b/evaluation/benchmarks/hotpotqa/run_infer.py @@ -0,0 +1,354 @@ +import asyncio +import copy +import json +import os +import tempfile +from typing import Any + +import pandas as pd +import requests + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure any environment variables +SKIP_NUM = os.environ.get('SKIP_NUM') +SKIP_NUM = ( + int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None +) + + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.11-bookworm' + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'docker'), + max_iterations=metadata.max_iterations, + sandbox=sandbox_config, + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # copy 'draft_editor' config if exists + config_copy = copy.deepcopy(config) + load_from_toml(config_copy) + if 'draft_editor' in config_copy.llms: + config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor') + + return config + + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, +): + 
"""Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Set up workspace + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Create question file + with tempfile.TemporaryDirectory() as tmpdir: + file_path = os.path.join(tmpdir, 'question.txt') + with open(file_path, 'w') as f: + f.write(instance.question) + runtime.copy_to( + file_path, + '/workspace', + ) + + # Create context files + for i, context in enumerate(instance.context): + file_path = os.path.join(tmpdir, f'context_{i}.txt') + with open(file_path, 'w') as f: + f.write(context) + runtime.copy_to( + file_path, + '/workspace', + ) + + logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called after the agent has run. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Check if answer.txt exists + action = CmdRunAction(command='ls -la /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + # Get the answer content + answer_content = "" + if "answer.txt" in obs.content: + action = CmdRunAction(command='cat /workspace/answer.txt') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + answer_content = obs.content + + logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n") + + runtime.close() + + # For HotpotQA, we need to evaluate the answer against the ground truth + # Here we just return the answer content for evaluation + return { + 'answer': answer_content, + 'correct_answer': instance.answer, + } + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = f"""You are given a question and some context documents to help you answer it. The question is in the file 'question.txt'. + +The context documents are in files named 'context_0.txt', 'context_1.txt', etc. You should read all the context files to gather information needed to answer the question. + +Please write your answer in a file named 'answer.txt'. Your answer should be concise and directly address the question. 
+ +IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP. +""" + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + answer = return_val['answer'] + correct_answer = return_val['correct_answer'] + + # Simple evaluation - check if the answer matches the correct answer + # In a real implementation, you would need a more sophisticated evaluation + is_correct = answer.strip().lower() == correct_answer.strip().lower() + + test_result = { + 'answer': answer, + 'correct_answer': correct_answer, + 'is_correct': is_correct, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +def prepare_hotpotqa_dataset(): + """Prepare the HotpotQA dataset for evaluation.""" + # In a real implementation, you would download and process the HotpotQA dataset + # For now, we'll create a simple mock dataset + data = { + 'instance_id': list(range(10)), + 'question': [ + "What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?", + "Were Scott Derrickson and Ed Wood of the same nationality?", + "What is the name of the professional wrestler who had a role in the film The Princess Bride?", + "Which magazine was started first Arthur's Magazine or First for Women?", + "What city was the birthplace of the actor who played Humpty Dumpty in the 2010 adaptation of Alice in Wonderland?", + "What is the difference in years between the release of The Innocents and The Others?", + "What is the name of the actor who played the character Wolverine in the X-Men film series?", + "Which country is the birthplace of the actor who played James Bond in the film Skyfall?", + "What is the name of the director who directed the film Inception?", + "Which film won more Academy Awards, The Lord of the Rings: The Return of the King or Titanic?" 
+ ], + 'context': [ + ["Shirley Temple was an American actress, singer, dancer, businesswoman, and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938.", "Shirley Temple Black (April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, businesswoman, and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938. As an adult, she was named United States ambassador to Ghana and to Czechoslovakia, and also served as Chief of Protocol of the United States."], + ["Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. He lives in Los Angeles, California. He is best known for directing horror films such as Sinister, The Exorcism of Emily Rose, and Deliver Us From Evil, as well as the 2016 Marvel Cinematic Universe installment, Doctor Strange.", "Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director."], + ["André René Roussimoff (May 19, 1946 – January 27, 1993), best known as André the Giant, was a French professional wrestler and actor.", "The Princess Bride is a 1987 American fantasy comedy film directed and co-produced by Rob Reiner, starring Cary Elwes, Robin Wright, Mandy Patinkin, Chris Sarandon, Wallace Shawn, André the Giant, and Christopher Guest."], + ["Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.", "First for Women is a woman's magazine published by Bauer Media Group in the USA. The magazine was started in 1989."], + ["Sir Sydney Smirke RA (20 October 1798 – 8 December 1877) was a British architect who was born in London, England, the younger brother of Sir Robert Smirke, also an architect. Their father, also Robert Smirke, was a well-known painter.", "Alice in Wonderland is a 2010 American dark fantasy adventure film directed by Tim Burton from a screenplay written by Linda Woolverton."], + ["The Innocents is a 1961 British supernatural gothic horror film directed and produced by Jack Clayton, and starring Deborah Kerr, Michael Redgrave, and Megs Jenkins.", "The Others (Spanish: Los Otros) is a 2001 English-language Spanish gothic supernatural psychological horror film written, directed, and scored by Alejandro Amenábar."], + ["Hugh Michael Jackman (born 12 October 1968) is an Australian actor, singer, and producer.", "Wolverine is a fictional character appearing in American comic books published by Marvel Comics, mostly in association with the X-Men."], + ["Daniel Wroughton Craig (born 2 March 1968) is an English actor.", "Skyfall is a 2012 spy film and the twenty-third in the James Bond series produced by Eon Productions."], + ["Christopher Edward Nolan CBE (born 30 July 1970) is a British-American film director, producer, and screenwriter.", "Inception is a 2010 science fiction action film written and directed by Christopher Nolan, who also produced the film with his wife, Emma Thomas."], + ["The Lord of the Rings: The Return of the King is a 2003 epic fantasy adventure film directed by Peter Jackson, based on the third volume of J. R. R. 
Tolkien's The Lord of the Rings.", "Titanic is a 1997 American epic romance and disaster film directed, written, co-produced, and co-edited by James Cameron."] + ], + 'answer': [ + "United States ambassador", + "Yes", + "André the Giant", + "Arthur's Magazine", + "London", + "40 years", + "Hugh Jackman", + "England", + "Christopher Nolan", + "The Lord of the Rings: The Return of the King" + ] + } + + return pd.DataFrame(data) + + +if __name__ == '__main__': + args = parse_arguments() + + # Prepare the HotpotQA dataset + hotpotqa_dataset = prepare_hotpotqa_dataset() + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + + metadata = make_metadata( + llm_config, + 'HotpotQA', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + details=agent_details, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + instances = prepare_dataset( + hotpotqa_dataset, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + skip_num=SKIP_NUM, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) \ No newline at end of file diff --git a/evaluation/benchmarks/hotpotqa/scripts/run_infer.sh b/evaluation/benchmarks/hotpotqa/scripts/run_infer.sh new file mode 100755 index 000000000000..434f2f35dd45 --- /dev/null +++ b/evaluation/benchmarks/hotpotqa/scripts/run_infer.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + +MODEL_CONFIG=$1 +COMMIT_HASH=$2 +AGENT=$3 +EVAL_LIMIT=$4 +NUM_WORKERS=$5 +EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark + +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +get_openhands_version + +echo "AGENT: $AGENT" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +EVAL_NOTE=$OPENHANDS_VERSION + +COMMAND="export PYTHONPATH=evaluation/benchmarks/hotpotqa:\$PYTHONPATH && poetry run python evaluation/benchmarks/hotpotqa/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 30 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $EVAL_NOTE" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Only pass eval-ids if it's not "eval" (which 
is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then + echo "EVAL_IDS: $EVAL_IDS" + COMMAND="$COMMAND --eval-ids $EVAL_IDS" +fi + +# Run the command +eval $COMMAND + +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/HotpotQA/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/HotpotQA/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi + +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/hotpotqa/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/hotpotqa/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." 
+ fi +fi \ No newline at end of file diff --git a/evaluation/benchmarks/hotpotqa/scripts/summarize_results.py b/evaluation/benchmarks/hotpotqa/scripts/summarize_results.py new file mode 100755 index 000000000000..9c7bba2fbbfc --- /dev/null +++ b/evaluation/benchmarks/hotpotqa/scripts/summarize_results.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +from collections import defaultdict + + +def load_jsonl(file_path): + """Load a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + + +def summarize_results(output_file): + """Summarize the results of the HotpotQA benchmark.""" + print(f"Summarizing results from {output_file}") + + # Load the results + results = load_jsonl(output_file) + + # Count the number of instances that passed and failed + correct = [] + incorrect = [] + + for result in results: + instance_id = result['instance_id'] + test_result = result.get('test_result', {}) + is_correct = test_result.get('is_correct', False) + + if is_correct: + correct.append(instance_id) + else: + incorrect.append(instance_id) + + # Print the summary + print(f"\nTotal instances: {len(results)}") + print(f"Correct: {len(correct)} ({len(correct) / len(results) * 100:.2f}%)") + print(f"Incorrect: {len(incorrect)} ({len(incorrect) / len(results) * 100:.2f}%)") + + # Print the list of correct and incorrect instances + print("\nCorrect instances:") + for instance_id in correct: + print(f" - {instance_id}") + + print("\nIncorrect instances:") + for instance_id in incorrect: + print(f" - {instance_id}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize HotpotQA benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/math/README.md b/evaluation/benchmarks/math/README.md new file mode 100644 index 000000000000..46589f56621c --- /dev/null +++ b/evaluation/benchmarks/math/README.md @@ -0,0 +1,45 @@ +# MATH Benchmark Evaluation + +This folder contains evaluation harness for evaluating agents on the [MATH benchmark](https://github.com/hendrycks/math). + +MATH is a dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations. + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. + +## Start the evaluation + +```bash +./evaluation/benchmarks/math/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation] +``` + +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`. +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It could also be a release tag like `0.9.0`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`. +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire test set. +- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`. +- `eval_ids`, e.g. 
`"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated). +- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes. + +Following is the basic command to start the evaluation: + +```bash +# Run benchmark without evaluation +./evaluation/benchmarks/math/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" + +# Run benchmark with automatic evaluation +./evaluation/benchmarks/math/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" eval +``` + +## Summarize Results + +```bash +poetry run python ./evaluation/benchmarks/math/scripts/summarize_results.py [path_to_output_jsonl_file] +``` + +Full example: + +```bash +poetry run python ./evaluation/benchmarks/math/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/MATH/CodeActAgent/gpt-4o-2024-05-13@20240620_maxiter_30_N_v1.9/output.jsonl +``` \ No newline at end of file diff --git a/evaluation/benchmarks/math/__init__.py b/evaluation/benchmarks/math/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/benchmarks/math/run_infer.py b/evaluation/benchmarks/math/run_infer.py new file mode 100644 index 000000000000..18089778329a --- /dev/null +++ b/evaluation/benchmarks/math/run_infer.py @@ -0,0 +1,336 @@ +import asyncio +import copy +import os +import tempfile +from typing import Any + +import pandas as pd +from datasets import load_dataset + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + compatibility_for_eval_history_pairs, + get_default_sandbox_config_for_eval, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + get_llm_config_arg, + load_from_toml, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +# Configure any environment variables +SKIP_NUM = os.environ.get('SKIP_NUM') +SKIP_NUM = ( + int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None +) + + +def get_config( + instance: pd.Series, + metadata: EvalMetadata, +) -> AppConfig: + sandbox_config = get_default_sandbox_config_for_eval() + sandbox_config.base_container_image = 'python:3.11-bookworm' + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'docker'), + max_iterations=metadata.max_iterations, + sandbox=sandbox_config, + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + # Update llm_config to enable completions logging + llm_config = update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + str(instance.instance_id) + ) + config.set_llm_config(llm_config) + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False + + # copy 'draft_editor' config if exists + config_copy = copy.deepcopy(config) + load_from_toml(config_copy) + if 'draft_editor' in config_copy.llms: + config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor') + + return config + + +def initialize_runtime( + 
runtime: Runtime, + instance: pd.Series, +): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Set up workspace + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Create problem file + with tempfile.TemporaryDirectory() as tmpdir: + file_path = os.path.join(tmpdir, 'problem.txt') + with open(file_path, 'w') as f: + f.write(instance.problem) + runtime.copy_to( + file_path, + '/workspace', + ) + + logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called after the agent has run. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n") + obs: CmdOutputObservation + + # Check if solution.txt exists + action = CmdRunAction(command='ls -la /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + # Get the solution content + solution_content = "" + if "solution.txt" in obs.content: + action = CmdRunAction(command='cat /workspace/solution.txt') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + solution_content = obs.content + + logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n") + + runtime.close() + + # For MATH problems, we need to manually evaluate the solution + # Here we just return the solution content for manual evaluation + return { + 'solution': solution_content, + 'correct_answer': instance.answer, + } + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + config = get_config(instance, metadata) + + # Setup the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir) + else: + logger.info( + f'\nStarting evaluation for instance {str(instance.instance_id)}.\n' + ) + + # ============================================= + # build instruction + # ============================================= + + # Prepare instruction + logger.info(instance) + instruction = f"""You are given a mathematics problem to solve. The problem is in the file 'problem.txt'. + +Please read the problem carefully and solve it step by step. Write your solution in a file named 'solution.txt'. + +Your solution should include: +1. A clear understanding of the problem +2. Step-by-step working +3. The final answer + +IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP. 
+""" + + # ============================================= + # create sandbox and run the agent + # ============================================= + + runtime: Runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + + initialize_runtime(runtime, instance=instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + ) + ) + if state is None: + raise ValueError('State should not be None.') + + # ============================================= + # result evaluation + # ============================================= + + return_val = complete_runtime(runtime, instance) + solution = return_val['solution'] + correct_answer = return_val['correct_answer'] + + # Simple evaluation - check if the correct answer appears in the solution + # In a real implementation, you would need a more sophisticated evaluation + is_correct = correct_answer in solution + + test_result = { + 'solution': solution, + 'correct_answer': correct_answer, + 'is_correct': is_correct, + } + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + metrics = state.metrics.get() if state.metrics else None + + # Save the output + output = EvalOutput( + instance_id=str(instance.instance_id), + instance=instance.to_dict(), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +def prepare_math_dataset(): + """Prepare the MATH dataset for evaluation.""" + # In a real implementation, you would load the MATH dataset + # For now, we'll create a simple mock dataset + data = { + 'instance_id': list(range(10)), + 'problem': [ + "Find the value of x in the equation 2x + 3 = 7.", + "Solve for y: 3y - 5 = 10.", + "Calculate the area of a circle with radius 5 cm.", + "Find the derivative of f(x) = x^2 + 3x + 2.", + "Solve the system of equations: 2x + y = 5, x - y = 1.", + "Find the indefinite integral of g(x) = 2x + 3.", + "Calculate the limit of (x^2 - 1)/(x - 1) as x approaches 1.", + "Find the value of sin(30°) + cos(60°).", + "Solve the quadratic equation x^2 - 5x + 6 = 0.", + "Find the sum of the first 10 terms of the arithmetic sequence with a_1 = 3 and d = 2." 
+ ], + 'answer': [ + "x = 2", + "y = 5", + "78.54 cm²", + "f'(x) = 2x + 3", + "x = 2, y = 1", + "∫(2x + 3)dx = x² + 3x + C", + "2", + "1", + "x = 2, x = 3", + "75" + ], + 'level': ['Algebra'] * 10, + 'type': ['Equation'] * 5 + ['Calculus'] * 3 + ['Equation'] * 2 + } + + return pd.DataFrame(data) + + +if __name__ == '__main__': + args = parse_arguments() + + # Prepare the MATH dataset + math_dataset = prepare_math_dataset() + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results + llm_config.modify_params = False + + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + # Create details dictionary with agent configuration + agent_details = { + "agent_config": { + "codeact_enable_jupyter": False, + "codeact_enable_browsing": False, + "codeact_enable_llm_editor": False, + } + } + + metadata = make_metadata( + llm_config, + 'MATH', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + details=agent_details, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + + # Parse dataset IDs if provided + eval_ids = None + if args.eval_ids: + eval_ids = str(args.eval_ids).split(',') + logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n') + + instances = prepare_dataset( + math_dataset, + output_file, + args.eval_n_limit, + eval_ids=eval_ids, + skip_num=SKIP_NUM, + ) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) \ No newline at end of file diff --git a/evaluation/benchmarks/math/scripts/run_infer.sh b/evaluation/benchmarks/math/scripts/run_infer.sh new file mode 100755 index 000000000000..b157b2cd0df6 --- /dev/null +++ b/evaluation/benchmarks/math/scripts/run_infer.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + +MODEL_CONFIG=$1 +COMMIT_HASH=$2 +AGENT=$3 +EVAL_LIMIT=$4 +NUM_WORKERS=$5 +EVAL_IDS=$6 +RUN_EVALUATION=$7 # New parameter to run evaluation after benchmark + +# Special case: if the 7th parameter is "eval", set RUN_EVALUATION to "eval" +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "Evaluation mode enabled" +fi + +# Special case: if any parameter is "eval", set RUN_EVALUATION to "eval" +for param in "$@"; do + if [ "$param" = "eval" ]; then + RUN_EVALUATION="eval" + echo "Evaluation mode enabled" + break + fi +done + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +get_openhands_version + +echo "AGENT: $AGENT" +echo "OPENHANDS_VERSION: $OPENHANDS_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +EVAL_NOTE=$OPENHANDS_VERSION + +COMMAND="export PYTHONPATH=evaluation/benchmarks/math:\$PYTHONPATH && poetry run python evaluation/benchmarks/math/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 30 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $EVAL_NOTE" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Only pass eval-ids if it's not "eval" (which is a special parameter for evaluation mode) +if [ -n "$EVAL_IDS" ] && [ "$EVAL_IDS" != "eval" ]; then + echo "EVAL_IDS: $EVAL_IDS" + COMMAND="$COMMAND --eval-ids $EVAL_IDS" 
+fi + +# Run the command +eval $COMMAND + +# Get the output directory - first try the default location +OUTPUT_DIR=$(find evaluation/evaluation_outputs -path "*/MATH/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) + +# If not found, try to find it anywhere under evaluation_outputs +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_DIR=$(find . -path "*/evaluation_outputs/*" -path "*/MATH/$AGENT/*" -type d -name "*$EVAL_NOTE*" 2>/dev/null | sort -r | head -n 1) +fi + +# If still not found, try to find any output.jsonl file +if [ -z "$OUTPUT_DIR" ]; then + OUTPUT_FILE=$(find . -name "output.jsonl" 2>/dev/null | sort -r | head -n 1) + if [ -n "$OUTPUT_FILE" ]; then + OUTPUT_DIR=$(dirname "$OUTPUT_FILE") + fi +else + OUTPUT_FILE="$OUTPUT_DIR/output.jsonl" +fi + +# Print the output directory and file for debugging +echo "" +echo "Output directory: $OUTPUT_DIR" +echo "Output file: $OUTPUT_FILE" + +# Run evaluation if requested +if [ "$RUN_EVALUATION" = "eval" ]; then + echo "" + echo "======================================" + echo "Running evaluation on results..." + echo "======================================" + echo "" + + if [ -f "$OUTPUT_FILE" ]; then + echo "Evaluating results in: $OUTPUT_FILE" + poetry run python evaluation/benchmarks/math/scripts/summarize_results.py "$OUTPUT_FILE" + + # Save the evaluation results + EVAL_RESULTS_FILE="$OUTPUT_DIR/evaluation_results.txt" + echo "Saving evaluation results to: $EVAL_RESULTS_FILE" + poetry run python evaluation/benchmarks/math/scripts/summarize_results.py "$OUTPUT_FILE" > "$EVAL_RESULTS_FILE" + + echo "" + echo "Evaluation complete. Results saved to: $EVAL_RESULTS_FILE" + else + echo "Error: Output file not found: $OUTPUT_FILE" + echo "Cannot run evaluation." + fi +fi \ No newline at end of file diff --git a/evaluation/benchmarks/math/scripts/summarize_results.py b/evaluation/benchmarks/math/scripts/summarize_results.py new file mode 100755 index 000000000000..0880de840254 --- /dev/null +++ b/evaluation/benchmarks/math/scripts/summarize_results.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +from collections import defaultdict + + +def load_jsonl(file_path): + """Load a jsonl file.""" + data = [] + with open(file_path, 'r') as f: + for line in f: + data.append(json.loads(line)) + return data + + +def summarize_results(output_file): + """Summarize the results of the MATH benchmark.""" + print(f"Summarizing results from {output_file}") + + # Load the results + results = load_jsonl(output_file) + + # Count the number of instances that passed and failed + correct = [] + incorrect = [] + + for result in results: + instance_id = result['instance_id'] + test_result = result.get('test_result', {}) + is_correct = test_result.get('is_correct', False) + + if is_correct: + correct.append(instance_id) + else: + incorrect.append(instance_id) + + # Print the summary + print(f"\nTotal instances: {len(results)}") + print(f"Correct: {len(correct)} ({len(correct) / len(results) * 100:.2f}%)") + print(f"Incorrect: {len(incorrect)} ({len(incorrect) / len(results) * 100:.2f}%)") + + # Print the list of correct and incorrect instances + print("\nCorrect instances:") + for instance_id in correct: + print(f" - {instance_id}") + + print("\nIncorrect instances:") + for instance_id in incorrect: + print(f" - {instance_id}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Summarize MATH benchmark results") + parser.add_argument("output_file", help="Path to the output.jsonl file") + 
args = parser.parse_args() + + summarize_results(args.output_file) \ No newline at end of file diff --git a/evaluation/benchmarks/wiki_table_question/__init__.py b/evaluation/benchmarks/wiki_table_question/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1
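
Both `hotpotqa/run_infer.py` and `math/run_infer.py` currently grade answers with a deliberately simple check (case-insensitive exact match and substring containment, respectively), and their inline comments note that a more sophisticated evaluation is needed. Below is a minimal sketch of the kind of normalized exact-match / token-F1 scoring used by standard HotpotQA and SQuAD evaluation scripts; it is one possible replacement for the naive comparison, not the official scorer, and the helper names are hypothetical.

```python
# Sketch of a normalized answer scorer for the HotpotQA harness.
# Not the official HotpotQA metric: only the standard normalization steps
# (lowercase, strip punctuation and articles, collapse whitespace) are applied.
import re
import string
from collections import Counter


def normalize_answer(s: str) -> str:
    """Lowercase, drop punctuation and articles, and collapse whitespace."""
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())


def exact_match(prediction: str, ground_truth: str) -> bool:
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def token_f1(prediction: str, ground_truth: str) -> float:
    pred = normalize_answer(prediction).split()
    gold = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred) & Counter(gold)).values())
    if overlap == 0 or not pred or not gold:
        return 0.0
    precision = overlap / len(pred)
    recall = overlap / len(gold)
    return 2 * precision * recall / (precision + recall)


if __name__ == '__main__':
    # The strict comparison in run_infer.py would reject trailing punctuation;
    # normalized exact match accepts it, and token F1 gives partial credit.
    print(exact_match('Christopher Nolan.', 'christopher nolan'))                  # True
    print(token_f1('It was directed by Christopher Nolan', 'Christopher Nolan'))   # 0.5
```

If adopted, `is_correct` in `process_instance` could be computed from `exact_match(answer, correct_answer)`, with the F1 value reported alongside it in `test_result` and aggregated by `summarize_results.py`.
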