import json
import os
- from tonic_validate import ValidateScorer, Benchmark, LLMResponse, ValidateApi
+ from tonic_validate import ValidateApi
from tonic_validate.metrics import AnswerSimilarityMetric, RetrievalPrecisionMetric, AugmentationPrecisionMetric, AnswerConsistencyMetric
+ from llama_index.evaluation import TonicValidateEvaluator
import requests

from dotenv import load_dotenv
@@ -24,35 +25,37 @@ def get_llm_response(prompt):
    result = response['result']
    return result['content'], result['context']

- def test_llama_index():
+ def get_q_and_a():
    # Load qa_pairs.json
    qa_pairs = json.load(open('./tests/qa_pairs.json'))
-     benchmark = Benchmark(
-         questions=[x['question'] for x in qa_pairs],
-         answers=[x['answer'] for x in qa_pairs]
-     )
+     return ([x['question'] for x in qa_pairs], [x['answer'] for x in qa_pairs])

-     # Save the responses into an array for scoring
-     responses = []
-     for item in benchmark:
-         llm_answer, llm_context_list = get_llm_response(item.question)
-         llm_response = LLMResponse(
-             llm_answer=llm_answer,
-             llm_context_list=llm_context_list,
-             benchmark_item=item
-         )
-         responses.append(llm_response)
-
-     # Score run
+ def get_responses(questions):
+     llm_answers = []
+     context_lists = []
+     for item in questions:
+         llm_answer, llm_context_list = get_llm_response(item)
+         llm_answers.append(llm_answer)
+         context_lists.append(llm_context_list)
+     return (llm_answers, context_lists)
+
+ def score_run(questions, context_lists, reference_answers, llm_answers):
    metrics = [
        AnswerSimilarityMetric(),
        RetrievalPrecisionMetric(),
        AugmentationPrecisionMetric(),
        AnswerConsistencyMetric()
    ]
-     scorer = ValidateScorer(metrics)
-     run = scorer.score_run(responses)
+     scorer = TonicValidateEvaluator(metrics, model_evaluator="gpt-4-1106-preview")
+     run = scorer.evaluate_run(
+         questions, context_lists, reference_answers, llm_answers
+     )
+     return run, metrics

+ def test_llama_index():
+     questions, reference_answers = get_q_and_a()
+     llm_answers, context_lists = get_responses(questions)
+     run, metrics = score_run(questions, context_lists, reference_answers, llm_answers)
    # Upload results to web ui
    validate_api = ValidateApi()
    # Get project id from env
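Note: get_q_and_a() reads ./tests/qa_pairs.json and pulls the 'question' and 'answer' keys from each entry, so that file is presumably a JSON array shaped roughly like the sketch below (placeholder values, not actual entries from the repo):

# Assumed shape of ./tests/qa_pairs.json (illustrative only)
# [
#   {"question": "...", "answer": "..."},
#   {"question": "...", "answer": "..."}
# ]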