Skip to content

Commit

Permalink
Merge branch 'integrate-resolver' into fix-mypy-unreachable-code
Browse files Browse the repository at this point in the history
  • Loading branch information
neubig authored Nov 13, 2024
2 parents 1353fa6 + 99c86b7 commit cd7c136
Show file tree
Hide file tree
Showing 172 changed files with 3,059 additions and 3,106 deletions.
18 changes: 9 additions & 9 deletions evaluation/agent_bench/scripts/summarise_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,33 @@
def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    """Split the instances in a JSONL results file into passed and failed.

    Each non-blank line is parsed as one JSON object. An instance counts as
    passed only when ``data["test_result"]["result"]`` is present and truthy.
    A missing ``test_result``, a ``test_result`` that is not a mapping, or a
    missing ``result`` key all count as failed.

    Args:
        res_file_path: Path to the JSONL file, one JSON object per line.

    Returns:
        A ``(passed, failed)`` tuple of instance-id lists, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``instance_id`` field.
    """
    passed = []
    failed = []
    with open(res_file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            data = json.loads(line)
            instance_id = data["instance_id"]
            # A null or non-mapping test_result cannot hold a "result" key;
            # treat it like a missing one instead of raising TypeError.
            test_result = data.get("test_result")
            resolved = False
            if isinstance(test_result, dict) and "result" in test_result:
                resolved = test_result["result"]
            if resolved:
                passed.append(instance_id)
            else:
                failed.append(instance_id)
    return passed, failed


if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the path to the
    # JSONL output file, and prints a pass/fail summary for it.
    if len(sys.argv) != 2:
        print(
            "Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>"
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    total_tests = len(passed_tests) + len(failed_tests)
    # An empty results file would otherwise raise ZeroDivisionError here.
    succ_rate = len(passed_tests) / total_tests if total_tests else 0.0
    print(
        f"\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}"
    )
    print("PASSED TESTS:")
    print(passed_tests)
    print("FAILED TESTS:")
    print(failed_tests)
46 changes: 23 additions & 23 deletions evaluation/aider_bench/scripts/summarize_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
passed = []
failed = []
for _, row in df.iterrows():
instance_id = row['instance_id']
instance_id = row["instance_id"]
resolved = False
if 'test_result' in row and 'exit_code' in row['test_result']:
resolved = row['test_result']['exit_code'] == 0
if "test_result" in row and "exit_code" in row["test_result"]:
resolved = row["test_result"]["exit_code"] == 0
if resolved:
passed.append(instance_id)
else:
Expand All @@ -21,38 +21,38 @@ def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:

def visualize_results(df: pd.DataFrame):
    """Print summary statistics for a results frame and return the pass rate.

    Reads three columns of ``df``: ``metrics`` (the ``accumulated_cost`` entry
    of each value), ``test_result`` (the ``exit_code`` entry of each value) and
    ``history`` (the length of each value, minus one, is reported as the
    number of actions).

    Prints the pass count, descriptive statistics for actions and costs, and
    bin counts for both.

    Args:
        df: One row per evaluated instance, with the columns listed above.

    Returns:
        The percentage of rows whose exit code is 0, rounded to two decimals.
    """
    df1 = pd.DataFrame()
    df1["cost"] = df["metrics"].apply(pd.Series)["accumulated_cost"]
    # Exit codes other than 0 and 1 map to NaN and are not counted as "Pass".
    df1["result"] = (
        df["test_result"].apply(pd.Series)["exit_code"].map({0: "Pass", 1: "Fail"})
    )
    # Reuse df's index: column assignment aligns on index labels, so a Series
    # with a fresh RangeIndex would become all-NaN for a non-default index.
    df1["actions"] = pd.Series(
        [len(a) - 1 for a in df["history"]], index=df.index
    )

    passed = np.sum(df1["result"] == "Pass")
    total = df.shape[0]
    resolve_rate = round((passed / total) * 100, 2)

    print("Number of passed tests:", f"{passed}/{total} {resolve_rate:.2f}%")
    print("\nDescriptive statistics for number of actions:")
    print(df1["actions"].describe())
    print("\nDescriptive statistics for costs:")
    print(df1["cost"].describe())

    # Bin counts for actions
    action_bins = pd.cut(df1["actions"], bins=range(0, 32, 2))
    print("\nAction bin counts:")
    print(action_bins.value_counts().sort_index())

    # Bin counts for costs
    cost_bins = pd.cut(df1["cost"], bins=10)
    print("\nCost bin counts:")
    print(cost_bins.value_counts().sort_index())

    return resolve_rate


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Summarize AiderBench results')
parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Summarize AiderBench results")
parser.add_argument("input_filepath", type=str, help="Path to the JSONL file")
args = parser.parse_args()

# Create DataFrame from JSONL file
Expand All @@ -62,9 +62,9 @@ def visualize_results(df: pd.DataFrame):
resolve_rate = visualize_results(df)

print(
f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
f"\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%"
)
print('PASSED TESTS:')
print("PASSED TESTS:")
print(passed_tests)
print('FAILED TESTS:')
print("FAILED TESTS:")
print(failed_tests)
22 changes: 11 additions & 11 deletions evaluation/biocoder/scripts/setup/copy_changed_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,35 @@ def get_changed_code(target_filepath, line_start, include_signature=False):
selected_lines = []
offset = 1 if include_signature else 0

with open('/testing_files/first_line_after_removed.txt', 'r') as f:
with open("/testing_files/first_line_after_removed.txt", "r") as f:
first_line_after_removed = f.read()
if first_line_after_removed is None:
print('First line after removed is None')
print("First line after removed is None")

with open(target_filepath, 'r') as f:
lines = f.read().split('\n')
with open(target_filepath, "r") as f:
lines = f.read().split("\n")
for i in range(line_start - offset, len(lines)):
if lines[i].strip() == first_line_after_removed.strip():
break
selected_lines.append(lines[i])
text = '\n'.join(selected_lines)
text = "\n".join(selected_lines)
return text


def copy_changed_code(
    target_filepath, generated_code_filepath, line_start, include_signature=False
):
    """Write the changed code found in one file out to another file.

    The text is obtained from ``get_changed_code`` and written verbatim,
    overwriting ``generated_code_filepath`` if it already exists.

    Args:
        target_filepath: File to read the changed code from.
        generated_code_filepath: File the extracted text is written to.
        line_start: Passed through to ``get_changed_code``.
        include_signature: Passed through to ``get_changed_code``.
    """
    changed_code = get_changed_code(target_filepath, line_start, include_signature)
    with open(generated_code_filepath, "w") as f:
        f.write(changed_code)


if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--target_filepath', type=str, required=True)
parser.add_argument('--generated_code_filepath', type=str, required=True)
parser.add_argument('--line_start', type=int, required=True)
parser.add_argument('--include_signature', action='store_true')
parser.add_argument("--target_filepath", type=str, required=True)
parser.add_argument("--generated_code_filepath", type=str, required=True)
parser.add_argument("--line_start", type=int, required=True)
parser.add_argument("--include_signature", action="store_true")
args = parser.parse_args()
copy_changed_code(
args.target_filepath,
Expand Down
30 changes: 15 additions & 15 deletions evaluation/biocoder/scripts/setup/remove_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,24 +19,24 @@ def get_likely_indent_size(array_of_tabs) -> int:
def get_target_filepath(self):
    """Build the path of the target file inside the mounted workspace.

    Joins ``self.workspace_mount_path``, the second ``/``-separated component
    of ``self.biocoder_instance.repository``, and
    ``self.biocoder_instance.filePath``.

    Returns:
        The joined path as a string.

    Raises:
        IndexError: If ``repository`` contains no ``/``.
    """
    # The repository component must be passed exactly once; passing it twice
    # would nest the repository directory inside itself.
    target_filepath = os.path.join(
        self.workspace_mount_path,
        self.biocoder_instance.repository.split("/")[1],
        self.biocoder_instance.filePath,
    )
    return target_filepath


def remove_code(target_filepath: str, line_start: int, line_end: int, language: str):
comment_prefix = {'python': '#', 'java': '//'}
comment_prefix = {"python": "#", "java": "//"}

with open(target_filepath, 'r') as f:
lines = f.read().split('\n')
with open(target_filepath, "r") as f:
lines = f.read().split("\n")
# print("="*10+"ORIGINAL"+"="*10)
# print("\n".join(lines))
signature_line = lines[line_start - 1]

# get the number of tabs
def get_indent_size(s: str):
return len(re.match(r'\s*', s).group())
return len(re.match(r"\s*", s).group())

indent_sizes = list(map(get_indent_size, lines))
indent_size = get_likely_indent_size(indent_sizes)
Expand All @@ -46,7 +46,7 @@ def get_indent_size(s: str):
+ [
f"{' '*comment_indent_size+comment_prefix[language.lower()]}TODO: replace with your code here"
]
+ ([''] * 2)
+ ([""] * 2)
+ lines[line_end:]
)
first_line_after_removed_index = line_start
Expand All @@ -56,19 +56,19 @@ def get_indent_size(s: str):
first_line_after_removed_index += 1

first_line_after_removed = lines[first_line_after_removed_index]
print('FIRST LINE AFTER REMOVED: ', first_line_after_removed)
with open('/testing_files/first_line_after_removed.txt', 'w') as f:
print("FIRST LINE AFTER REMOVED: ", first_line_after_removed)
with open("/testing_files/first_line_after_removed.txt", "w") as f:
f.write(first_line_after_removed)

with open(target_filepath, 'w') as f:
f.write('\n'.join(lines))
with open(target_filepath, "w") as f:
f.write("\n".join(lines))


if __name__ == "__main__":
    # Command-line entry point: all four options are required and are passed
    # straight through to remove_code. Each option is registered exactly once,
    # because argparse rejects a repeated option string as a conflict.
    parser = argparse.ArgumentParser()
    parser.add_argument("--target_filepath", type=str, required=True)
    parser.add_argument("--line_start", type=int, required=True)
    parser.add_argument("--line_end", type=int, required=True)
    parser.add_argument("--language", type=str, required=True)
    args = parser.parse_args()
    remove_code(args.target_filepath, args.line_start, args.line_end, args.language)
Loading

0 comments on commit cd7c136

Please sign in to comment.