# run_benchmark.py
help_info = """
Usage:
$ python run_benchmark.py \\
    --algorithms=ALGO1[,ALGO2[,ALGO3[,...]]] \\
    --challenges=SUBTASK1[,SUBTASK2[,SUBTASK3[,...]]] \\
    --output_filename=OUTPUT_FILENAME \\
    [--output_dir=OUTPUT_DIR] (defaults to ./output/benchmark_results) \\
    [--judge_model_size=JUDGE_MODEL_SIZE] (70B/8B, defaults to 8B) \\
    [--examinee_model_size=EXAMINEE_MODEL_SIZE] (70B/8B, defaults to 8B) \\
    [-h | --help] \\
    [...] (additional arguments will be supplied to the algorithms and the challenges when they are instantiated; only string values are supported)

Examples:
$ python run_benchmark.py \\
    --algorithms=LifelongRLHF,LifelongDPO,OPO \\
    --challenges=Follow,Predict,Coevolve \\
    --output_filename=3x3_benchmark \\
    --judge_model_size=8B \\
    --examinee_model_size=8B

$ python run_benchmark.py \\
    --algorithms=Dummy \\
    --challenges=Dummy,Coevolve \\
    --output_filename=dummy_debugging_run \\
    --judge_model_size=70B \\
    --examinee_model_size=70B

Note that all names are case-sensitive. Dummies are for debugging purposes only.
"""
import pdb
import traceback
import argparse
import os
import sys
import time
import json
from typing import List, Dict, Any, Type
from multiprocessing import freeze_support
from src.download_models import download_all_models
from benchmark.framework import JudgeBase, ExamineeBase
def run_benchmark(
    ExamineeClass: Type[ExamineeBase], JudgeClass: Type[JudgeBase], **kwargs
) -> Dict[str, Any]:
    """
    Run a single benchmarking test with a single examinee and a single judge, and return the results.

    :param ExamineeClass: Required. The examinee class object representing the algorithm to be evaluated. Can be any subclass of ExamineeBase, including user-implemented ones. Note that this is the class object itself, not an instance of the class.
    :type ExamineeClass: Type[ExamineeBase]

    :param JudgeClass: Required. The judge class object representing the challenge on which the examinee is evaluated. Can be any subclass of JudgeBase, including user-implemented ones. Note that this is the class object itself, not an instance of the class.
    :type JudgeClass: Type[JudgeBase]

    :param kwargs: Optional. Additional arguments to be passed to the examinee and the judge. Pass the same str-typed arguments as you would on the command line.
    :type kwargs: Dict[str, str]

    :return: A dictionary containing the results of the benchmarking test. The dictionary is in the exact same format as the results of command-line benchmarking.
    :rtype: Dict[str, Any]

    Example:
        .. code-block:: python

            from progressgym import run_benchmark, CoevolveJudge, LifelongDPOExaminee  # if using PyPI package
            results = run_benchmark(LifelongDPOExaminee, CoevolveJudge)
    """
    print(f"Running {ExamineeClass} on {JudgeClass}...")

    examinee = ExamineeClass(**kwargs)
    judge = JudgeClass(**kwargs)

    start_time = time.time()
    result: Dict[str, Any] = judge.test(examinee)
    end_time = time.time()
    result["duration_seconds"] = end_time - start_time

    print(f"Benchmarking complete. Duration: {result['duration_seconds']} seconds")
    return result
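
# Illustrative programmatic use. The two class arguments are taken from the docstring example
# above; the extra keyword argument is only an example of the str-typed arguments that are
# forwarded to both the judge and the examinee (here, the judge_model_size option listed in
# help_info):
#     from progressgym import run_benchmark, CoevolveJudge, LifelongDPOExaminee
#     results = run_benchmark(LifelongDPOExaminee, CoevolveJudge, judge_model_size="8B")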
if __name__ == "__main__":
    freeze_support()

    try:
        parser = argparse.ArgumentParser(add_help=False)
        parser.add_argument("-h", "--help", action="store_true")
        args_help, _ = parser.parse_known_args()
        if hasattr(args_help, "help") and args_help.help:
            print(help_info)
            sys.exit(0)

        parser.add_argument("--algorithms", type=str, required=True)
        parser.add_argument("--challenges", type=str, required=True)
        parser.add_argument("--output_filename", type=str, required=True)
        parser.add_argument(
            "--output_dir",
            type=str,
            default="./output/benchmark_results",
            required=False,
        )
        args, unknownargs = parser.parse_known_args()

        kwargs: Dict[str, str] = {}
        for s in unknownargs:
            k, v = s.split("=")
            kwargs[k.strip().strip("-")] = v.strip()

        print(
            f"Captured additional arguments: {kwargs}. They will be passed to `__init__()` and `reset()` of both the judges and the examinees, as str-typed arguments."
        )
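        # Illustrative example (hypothetical flag name): passing `--extra_flag=foo` on the
        # command line yields kwargs == {"extra_flag": "foo"}.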
        download_70B_models = "70" in kwargs.get(
            "examinee_model_size", ""
        ) or "70" in kwargs.get("judge_model_size", "")
        download_all_models(download_70B=download_70B_models)

        algorithms: List[str] = args.algorithms.split(",")
        challenges: List[str] = args.challenges.split(",")
        output_dir: str = args.output_dir

        examinees: Dict[str, ExamineeBase] = {}
        judges: Dict[str, JudgeBase] = {}

        # Dynamically importing all algorithms
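        # (e.g. `--algorithms=LifelongDPO` is expected to resolve to a class named
        # `LifelongDPOExaminee` exported by the `algorithms` package)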
        for algorithm in algorithms:
            lib = "algorithms"
            try:
                exec(f"from {lib} import {algorithm}Examinee")
            except ImportError:
                print(
                    f"Error: Class {algorithm}Examinee not found in {lib}. Did you forget to implement it?"
                )
                sys.exit(1)

            # Instantiating the algorithm
            kwargs_str = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
            examinee: ExamineeBase = eval(f"{algorithm}Examinee({kwargs_str})")
            examinees[algorithm] = examinee

        # Dynamically importing all challenges
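        # (e.g. `--challenges=Coevolve` is expected to resolve to a class named
        # `CoevolveJudge` exported by the `benchmark` package)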
        for challenge in challenges:
            lib = "benchmark"
            try:
                exec(f"from {lib} import {challenge}Judge")
            except ImportError:
                print(
                    f"Error: Class {challenge}Judge not found in {lib}. Does this challenge exist?"
                )
                sys.exit(1)

            # Instantiating the challenge
            kwargs_str = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
            judge: JudgeBase = eval(f"{challenge}Judge({kwargs_str})")
            judges[challenge] = judge

        eval_results: Dict[str, Dict[str, Dict[str, Any]]] = {}

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
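        # Everything after the first "." in --output_filename is dropped and ".json" is appended;
        # e.g. `--output_filename=3x3_benchmark` with the default --output_dir writes to
        # ./output/benchmark_results/3x3_benchmark.json.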
        path = os.path.join(output_dir, f'{args.output_filename.split(".")[0]}.json')

        # Running all algorithms on all challenges
        for algorithm in algorithms:
            for challenge in challenges:
                print(f"Running {algorithm} on {challenge}...")
                examinee = examinees[algorithm]
                judge = judges[challenge]

                examinee.reset(**kwargs)
                judge.reset(**kwargs)

                start_time = time.time()
                result: Dict[str, Any] = judge.test(examinee)
                end_time = time.time()
                result["duration_seconds"] = end_time - start_time

                if algorithm not in eval_results:
                    eval_results[algorithm] = {}
                eval_results[algorithm][challenge] = result
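                # Results are saved incrementally after every (algorithm, challenge) pair,
                # and once more after the full sweep, so partial results survive interruptions.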
                with open(path, "w") as f:
                    json.dump(eval_results, f)

        with open(path, "w") as f:
            json.dump(eval_results, f)
        print(
            f"""Evaluation completed. Evaluation results saved to {path}. See item 'score' for a comprehensive score for each examinee's performance in one subtask.
However, note that when submitting to the leaderboard, the 'score' field will be ignored, and the eventual score will be calculated from scratch."""
        )

    except:
        print("Exception occurred. Entering debugger.")
        extype, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)