diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
index 25492073928e..cce795e954bf 100644
--- a/evaluation/benchmarks/EDA/run_infer.py
+++ b/evaluation/benchmarks/EDA/run_infer.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index acdf60fe4850..693718357a59 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.agent_bench.helper import (
+from evaluation.benchmarks.agent_bench.helper import (
     FAKE_RESPONSES,
     INST_SUFFIXES,
     compare_results,
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index c6e5bbb9db6f..f7796c7696de 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.aider_bench.helper import (
+from evaluation.benchmarks.aider_bench.helper import (
     FAKE_RESPONSES,
     INST_SUFFIXES,
     INSTRUCTIONS_ADDENDUM,
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index 68bbf892d54b..f5cdd44471a8 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.biocoder.utils import BiocoderData
+from evaluation.benchmarks.biocoder.utils import BiocoderData
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
index 7cfd2dbac7ad..6d8dcbd89b3c 100644
--- a/evaluation/benchmarks/discoverybench/run_infer.py
+++ b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -5,10 +5,10 @@
 import git
 import pandas as pd
 
-from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import (
+from evaluation.benchmarks.discoverybench.eval_utils.eval_w_subhypo_gen import (
     run_eval_gold_vs_gen_NL_hypo_workflow,
 )
-from evaluation.discoverybench.eval_utils.response_parser import (
+from evaluation.benchmarks.discoverybench.eval_utils.response_parser import (
     extract_gen_hypo_from_logs,
 )
 from evaluation.utils.shared import (
diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
index 1fa0c00e6d6a..fb6d4b3db050 100644
--- a/evaluation/benchmarks/gaia/run_infer.py
+++ b/evaluation/benchmarks/gaia/run_infer.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.gaia.scorer import question_scorer
+from evaluation.benchmarks.gaia.scorer import question_scorer
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
index aa932a388f88..6f5b6c9d4388 100644
--- a/evaluation/benchmarks/gorilla/run_infer.py
+++ b/evaluation/benchmarks/gorilla/run_infer.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import requests
 
-from evaluation.gorilla.utils import encode_question, get_data_for_hub
+from evaluation.benchmarks.gorilla.utils import encode_question, get_data_for_hub
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index 7f6985fc2aae..4414e1c4625f 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -6,10 +6,10 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.mint.datatypes import TaskState
-from evaluation.mint.env import SimplifiedEnv
-from evaluation.mint.prompts import ToolPromptTemplate
-from evaluation.mint.tasks import Task
+from evaluation.benchmarks.mint.datatypes import TaskState
+from evaluation.benchmarks.mint.env import SimplifiedEnv
+from evaluation.benchmarks.mint.prompts import ToolPromptTemplate
+from evaluation.benchmarks.mint.tasks import Task
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/mint/tasks/__init__.py b/evaluation/benchmarks/mint/tasks/__init__.py
index 4f6ac721aca9..96c628f85462 100644
--- a/evaluation/benchmarks/mint/tasks/__init__.py
+++ b/evaluation/benchmarks/mint/tasks/__init__.py
@@ -1,6 +1,6 @@
-from evaluation.mint.tasks.base import Task
-from evaluation.mint.tasks.codegen import HumanEvalTask, MBPPTask
-from evaluation.mint.tasks.reasoning import (
+from evaluation.benchmarks.mint.tasks.base import Task
+from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask
+from evaluation.benchmarks.mint.tasks.reasoning import (
     MultipleChoiceTask,
     ReasoningTask,
     TheoremqaTask,
diff --git a/evaluation/benchmarks/mint/tasks/codegen.py b/evaluation/benchmarks/mint/tasks/codegen.py
index 8a80594ce4b7..cbd127ac0eac 100644
--- a/evaluation/benchmarks/mint/tasks/codegen.py
+++ b/evaluation/benchmarks/mint/tasks/codegen.py
@@ -2,7 +2,7 @@
 
 from utils import check_correctness
 
-from evaluation.mint.tasks.base import Task
+from evaluation.benchmarks.mint.tasks.base import Task
 
 LOGGER = logging.getLogger('MINT')
 
diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
index d40f984fca9c..95f65245f22f 100644
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -12,7 +12,7 @@
 from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
 from swebench.harness.utils import load_swebench_dataset
 
-from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 9cb9dd77f498..3ffc08d29bfb 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -9,7 +9,7 @@
 from datasets import load_dataset
 
 import openhands.agenthub
-from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT
+from evaluation.benchmarks.swe_bench.prompt import CODEACT_SWE_PROMPT
 from evaluation.utils.shared import (
     EvalException,
     EvalMetadata,
diff --git a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
index 20fb1b94c0b6..52e2ea4cb141 100644
--- a/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
+++ b/evaluation/benchmarks/swe_bench/scripts/docker/push_docker_instance_images.py
@@ -32,7 +32,7 @@
 from openhands.core.logger import openhands_logger as logger
 
 logger.setLevel('ERROR')
-from evaluation.swe_bench.run_infer import get_instance_docker_image  # noqa
+from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image  # noqa
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
index 17a375ee3b79..8e9fc407d93b 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from tqdm import tqdm
 
-from evaluation.swe_bench.eval_infer import process_git_patch
+from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch
 from openhands.events.serialization import event_from_dict
 
 tqdm.pandas()
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
index 5006d3dde357..f333012f489a 100644
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py
@@ -3,7 +3,7 @@
 
 import pandas as pd
 
-from evaluation.swe_bench.eval_infer import process_git_patch
+from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch
 
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index a7c5242d2f48..c99f15a89ae9 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from evaluation.toolqa.utils import encode_question, eval_answer, get_data
+from evaluation.benchmarks.toolqa.utils import encode_question, eval_answer, get_data
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,