
Commit

Fix imports to match new directory structure under evaluation/benchmarks/
openhands-agent committed Nov 23, 2024
1 parent 4c62196 commit a759dd8
Showing 16 changed files with 22 additions and 22 deletions.
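
Every hunk below makes the same mechanical change: import paths of the form evaluation.<benchmark>.<module> become evaluation.benchmarks.<benchmark>.<module>. As a rough sketch of how such a bulk rewrite could be scripted (this is an illustration, not necessarily how the commit was produced; the helper name rewrite_imports is made up, and the benchmark names are the ones that appear in the changed files):

import re
from pathlib import Path

# Benchmark packages that moved under evaluation/benchmarks/ in this commit.
BENCHMARKS = [
    'EDA', 'agent_bench', 'aider_bench', 'biocoder', 'discoverybench',
    'gaia', 'gorilla', 'mint', 'swe_bench', 'toolqa',
]

# Matches "evaluation.<benchmark>." in import statements, e.g.
# "from evaluation.EDA.game import ..." -> "from evaluation.benchmarks.EDA.game import ..."
PATTERN = re.compile(
    r'\bevaluation\.(' + '|'.join(re.escape(b) for b in BENCHMARKS) + r')\.'
)

def rewrite_imports(root: str = 'evaluation') -> None:
    for path in Path(root).rglob('*.py'):
        text = path.read_text()
        new_text = PATTERN.sub(r'evaluation.benchmarks.\1.', text)
        if new_text != text:
            path.write_text(new_text)
            print(f'updated {path}')

if __name__ == '__main__':
    rewrite_imports()

The rewrite only touches the old single-level prefixes, so it is safe to run repeatedly: paths that already start with evaluation.benchmarks. are left alone.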
evaluation/benchmarks/EDA/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/agent_bench/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.agent_bench.helper import (
+from evaluation.benchmarks.agent_bench.helper import (
     FAKE_RESPONSES,
     INST_SUFFIXES,
     compare_results,
evaluation/benchmarks/aider_bench/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.aider_bench.helper import (
+from evaluation.benchmarks.aider_bench.helper import (
     FAKE_RESPONSES,
     INST_SUFFIXES,
     INSTRUCTIONS_ADDENDUM,
evaluation/benchmarks/biocoder/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.biocoder.utils import BiocoderData
+from evaluation.benchmarks.biocoder.utils import BiocoderData
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/discoverybench/run_infer.py: 4 changes (2 additions, 2 deletions)
@@ -5,10 +5,10 @@
 import git
 import pandas as pd
 
-from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import (
+from evaluation.benchmarks.discoverybench.eval_utils.eval_w_subhypo_gen import (
     run_eval_gold_vs_gen_NL_hypo_workflow,
 )
-from evaluation.discoverybench.eval_utils.response_parser import (
+from evaluation.benchmarks.discoverybench.eval_utils.response_parser import (
     extract_gen_hypo_from_logs,
 )
 from evaluation.utils.shared import (
evaluation/benchmarks/gaia/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.gaia.scorer import question_scorer
+from evaluation.benchmarks.gaia.scorer import question_scorer
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/gorilla/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@
 import pandas as pd
 import requests
 
-from evaluation.gorilla.utils import encode_question, get_data_for_hub
+from evaluation.benchmarks.gorilla.utils import encode_question, get_data_for_hub
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/mint/run_infer.py: 8 changes (4 additions, 4 deletions)
@@ -6,10 +6,10 @@
 import pandas as pd
 from datasets import load_dataset
 
-from evaluation.mint.datatypes import TaskState
-from evaluation.mint.env import SimplifiedEnv
-from evaluation.mint.prompts import ToolPromptTemplate
-from evaluation.mint.tasks import Task
+from evaluation.benchmarks.mint.datatypes import TaskState
+from evaluation.benchmarks.mint.env import SimplifiedEnv
+from evaluation.benchmarks.mint.prompts import ToolPromptTemplate
+from evaluation.benchmarks.mint.tasks import Task
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/mint/tasks/__init__.py: 6 changes (3 additions, 3 deletions)
@@ -1,6 +1,6 @@
-from evaluation.mint.tasks.base import Task
-from evaluation.mint.tasks.codegen import HumanEvalTask, MBPPTask
-from evaluation.mint.tasks.reasoning import (
+from evaluation.benchmarks.mint.tasks.base import Task
+from evaluation.benchmarks.mint.tasks.codegen import HumanEvalTask, MBPPTask
+from evaluation.benchmarks.mint.tasks.reasoning import (
     MultipleChoiceTask,
     ReasoningTask,
     TheoremqaTask,
evaluation/benchmarks/mint/tasks/codegen.py: 2 changes (1 addition, 1 deletion)
@@ -2,7 +2,7 @@
 
 from utils import check_correctness
 
-from evaluation.mint.tasks.base import Task
+from evaluation.benchmarks.mint.tasks.base import Task
 
 LOGGER = logging.getLogger('MINT')
 
evaluation/benchmarks/swe_bench/eval_infer.py: 2 changes (1 addition, 1 deletion)
@@ -12,7 +12,7 @@
 from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
 from swebench.harness.utils import load_swebench_dataset
 
-from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
evaluation/benchmarks/swe_bench/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -9,7 +9,7 @@
 from datasets import load_dataset
 
 import openhands.agenthub
-from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT
+from evaluation.benchmarks.swe_bench.prompt import CODEACT_SWE_PROMPT
 from evaluation.utils.shared import (
     EvalException,
     EvalMetadata,
@@ -32,7 +32,7 @@
 from openhands.core.logger import openhands_logger as logger
 
 logger.setLevel('ERROR')
-from evaluation.swe_bench.run_infer import get_instance_docker_image  # noqa
+from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image  # noqa
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench_Lite')
@@ -8,7 +8,7 @@
 import pandas as pd
 from tqdm import tqdm
 
-from evaluation.swe_bench.eval_infer import process_git_patch
+from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch
 from openhands.events.serialization import event_from_dict
 
 tqdm.pandas()
@@ -3,7 +3,7 @@
 
 import pandas as pd
 
-from evaluation.swe_bench.eval_infer import process_git_patch
+from evaluation.benchmarks.swe_bench.eval_infer import process_git_patch
 
 parser = argparse.ArgumentParser()
 parser.add_argument('oh_output_file', type=str)
evaluation/benchmarks/toolqa/run_infer.py: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from evaluation.toolqa.utils import encode_question, eval_answer, get_data
+from evaluation.benchmarks.toolqa.utils import encode_question, eval_answer, get_data
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
