3-eval.py
# PART 3 – an evaluation script to test the accuracy of the fine-tuned model vs the base model.
import os
import json
from textwrap import dedent
from together import AsyncTogether
from dataclasses import dataclass
from autoblocks.testing.run import run_test_suite
from autoblocks.testing.models import BaseTestCase
from autoblocks.testing.run import grid_search_ctx
from autoblocks.testing.util import md5
from autoblocks.testing.models import BaseTestEvaluator
from autoblocks.testing.models import Evaluation
from autoblocks.testing.models import Threshold
async_together_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))
base_model = "meta-llama/Llama-3-8b-chat-hf"
top_oss_model = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"
finetuned_model = "YOUR_FINETUNED_MODEL"
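# NOTE: replace the placeholder above with the name of your fine-tuned model on Together
# (for example, the output model name returned by your fine-tuning job in the previous step).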
evaluator_model = "meta-llama/Llama-3-70b-chat-hf"
eval_dataset = "EvalDataset-100.json"
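# Each record in the eval dataset is assumed to be a JSON object with "instruction" and
# "output" keys (that is how it is parsed into a TestCase below), e.g.:
# {"instruction": "...", "output": "..."}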
@dataclass
class TestCase(BaseTestCase):
    instruction: str
    expected_output: str

    def hash(self) -> str:
        """
        This hash serves as a unique identifier for a test case throughout its lifetime.
        """
        return md5(self.instruction)

with open(eval_dataset, "r", encoding="utf-8") as eval_file:
    """
    Initialize the eval dataset as test cases.
    """
    eval_data = json.load(eval_file)
    unique_test_cases = {}
    for test_case in eval_data:
        tc = TestCase(instruction=test_case["instruction"], expected_output=test_case["output"])
        unique_test_cases[tc.hash()] = tc
    test_cases = list(unique_test_cases.values())
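# Note: keying by the md5 hash of the instruction above means duplicate instructions in the
# eval dataset collapse into a single test case.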

async def test_fn(test_case: TestCase) -> str:
    """
    Using the current model from the grid search context, generate a completion for the test case.
    """
    ctx = grid_search_ctx()
    model = ctx.get("model")
    completion = await async_together_client.chat.completions.create(
        messages=[
            {"role": "user", "content": test_case.instruction},
        ],
        model=model,
        max_tokens=1500,
    )
    return completion.choices[0].message.content

class Accuracy(BaseTestEvaluator):
    """
    Use the Autoblocks accuracy evaluator to compare the ground truth to the model's output.
    """
    id = "accuracy"
    max_concurrency = 3

    async def evaluate_test_case(self, test_case: TestCase, output: str) -> Evaluation:
        resp = await async_together_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You will be given a ground truth answer and a model answer. Please output ACCURATE if the model answer matches the ground truth answer or INACCURATE otherwise. Please only return ACCURATE or INACCURATE. It is very important for my job that you do this.",
                },
                {
                    "role": "user",
                    "content": dedent(f"""
                        <GroundTruthAnswer>
                        {test_case.expected_output}
                        </GroundTruthAnswer>
                        <ModelAnswer>
                        {output}
                        </ModelAnswer>
                    """).strip(),
                },
            ],
            model=evaluator_model,
        )
        score = 1 if resp.choices[0].message.content == "ACCURATE" else 0
        return Evaluation(score=score, threshold=Threshold(gte=1))
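# With a 0/1 score and Threshold(gte=1), a test case passes only when the judge responds
# exactly "ACCURATE".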
# Run the test suite using the Autoblocks SDK
# npx autoblocks testing exec -- python3 3-eval.py
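# (Assumes TOGETHER_API_KEY is set; the Autoblocks CLI typically also expects an Autoblocks
# API key to be present in the environment.)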
run_test_suite(
    id="autoblocks-together-ai",
    test_cases=test_cases,
    evaluators=[Accuracy()],
    fn=test_fn,
    grid_search_params=dict(
        model=[
            base_model,
            top_oss_model,
            evaluator_model,
            finetuned_model,
        ],
    ),
    max_test_case_concurrency=4,
)
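# The grid search above runs every test case once per model in the list, with grid_search_ctx()
# supplying the current model to test_fn, so the fine-tuned model's accuracy can be compared
# against the base and reference models in the Autoblocks results.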