examples_http_requests.py
import configparser

import pandas as pd

from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from evals import _evaluation_standard_default, _evaluation_standard_same_meaning
from http_provider import AnswerAIProvide
from utils import DeepEvalModelInterface
# Load the AnswerAI endpoint settings from llmsetting.conf.
config = configparser.ConfigParser()
config.read('llmsetting.conf')
base_url = config.get('answerAI', 'base_url')
token = config.get('answerAI', 'token')
assert base_url, "Please provide a valid base_url in llmsetting.conf"
assert token, "Please provide a valid token in llmsetting.conf"
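# A minimal llmsetting.conf is assumed to look like the following (section and
# key names are taken from the config.get calls above; the values are
# placeholders, not real credentials):
#
#   [answerAI]
#   base_url = https://your-gateway.example.com/v1
#   token = Bearer <your-token>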
MODEL_NAME = "gpt-4o-mini"
# Build the custom HTTP-backed model provider.
custom_model = AnswerAIProvide(
    model=MODEL_NAME,
    base_url=base_url,
    headers={
        "Authorization": token,
    },
    # System prompt: the final `reason` must be written in Traditional Chinese
    # (Taiwan) and must not contain the `"` character.
    system="最後給出的`reason`請用台灣繁體中文進行輸出,且不得包含`\"`字元",
)

# Wrap the provider so DeepEval can use it like any other evaluation model.
model = DeepEvalModelInterface(model=custom_model, model_name=MODEL_NAME)
def get_correctness_metric_score(test_case):
    """Score one test case with a GEval correctness metric and return (score, reason)."""
    correctness_metric = GEval(
        name="Correctness",
        model=model,
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
        ],
        evaluation_steps=_evaluation_standard_default,
        async_mode=False,
        strict_mode=False,  # strict mode would force the score to 0 unless the case passes perfectly
    )
    correctness_metric.measure(test_case)
    return correctness_metric.score, correctness_metric.reason
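# Example of scoring a single case directly (the values below are purely
# illustrative):
#   case = LLMTestCase(input="2+2=?", actual_output="4", expected_output="4")
#   score, reason = get_correctness_metric_score(case)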
# Load the dataset of error cases.
df = pd.read_csv('error_case.csv')
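# error_case.csv is assumed to contain at least the columns used below:
# `question` (the input), `note` (the expected output), and `answer`
# (the actual output).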
# Prepare one LLMTestCase per dataset row.
def test_source(question_content: str, retrieval_context: list, expected_output: str, actual_output: str) -> LLMTestCase:
    return LLMTestCase(
        input=question_content,
        retrieval_context=retrieval_context,
        expected_output=expected_output,
        actual_output=actual_output,
    )
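# Score every row: each call returns a (score, reason) tuple, which pd.Series
# expands into the new `v1_score` and `v1_reason` columns.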
df[['v1_score', 'v1_reason']] = df.apply(
    lambda x: pd.Series(get_correctness_metric_score(
        test_source(x['question'], [], x['note'], x['answer']))),
    axis=1,
)
df.to_csv('error_case_with_score.csv', index=False)
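# When run as a script (e.g. `python examples_http_requests.py`), this writes
# error_case_with_score.csv containing the original columns plus `v1_score`
# and `v1_reason`.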