object_creation_for_models_evaluation.py
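"""Generates structured-object outputs from the language models being evaluated.

For each evaluation configuration (model, few-shot example count, chain-of-thought on/off), this script builds a
system prompt, runs the model over the chosen data split (validation or test), and saves the extracted objects to
one JSON file per configuration for later scoring.
"""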
import json
import os
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Literal
import numpy as np
from openai import OpenAI
from ai_querying.ai_querying_util_funcs import create_query_prompt_for_model_evaluation, \
extract_obj_from_passage_with_retry
from ai_querying.system_prompts import model_evaluation_with_cot_system_prompt_prefix, \
model_evaluation_without_cot_system_prompt_prefix
from data_processing.data_loading import load_scenarios, load_data_split
from data_processing.data_mngmt_defs import DataSplitRecord, EvaluationModelOutputRecord, schemas_path, \
fewshot_examples_path, validation_set_path, evaluation_models_output_path, test_set_path, validation_eval_set_name, \
test_eval_set_name
from ai_querying.ai_querying_defs import openai_api_key_env, deepinfra_api_key_env, OpenAiClientBundle, ModelProvider
from utils_and_defs.logging_setup import create_logger
logger = create_logger(__name__)
# Credit: some parts of this script (including the core idea of first generating objects with the models under
# evaluation and only scoring them later) were inspired by a prior script, object_creation_for_evaluation_gpt.py
# (by Kaveen Hiniduma). However, this script is a more general-purpose and feature-rich version.
@dataclass
class ModelEvaluationConfig:
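    """One evaluation run's settings: the API client and provider to use, the model identifier, how many few-shot
    examples to embed in the system prompt, and whether chain-of-thought reasoning is enabled."""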
client: OpenAI
provider: ModelProvider
model_spec: str
fewshot_count: int
is_cot_enabled: bool
    def label(self) -> str:
        # Produce a filesystem-safe name: slashes and dots in the model spec would otherwise break file paths
        sanitized_model_spec = self.model_spec.replace("/", "_").replace(".", "_")
        return f"Model__{sanitized_model_spec}__Fewshot__{self.fewshot_count}__CoT__{self.is_cot_enabled}"
def output_path(self) -> Path:
return evaluation_models_output_path / f"{self.label()}.json"
def create_sys_prompt_for_model_evaluation(
eval_config: ModelEvaluationConfig, chosen_fewshot_examples: list[DataSplitRecord], scenario_domains: list[str],
scenario_text_passage_descriptions: list[str], schemas: list[dict[str, Any]]) -> str:
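    """Builds the system prompt for this config, appending the requested number of few-shot request/response example
    pairs (with the example JSON answers fenced, and a CoT-analysis placeholder shown, when CoT is enabled)."""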
cot = eval_config.is_cot_enabled
system_prompt = model_evaluation_with_cot_system_prompt_prefix if cot \
else model_evaluation_without_cot_system_prompt_prefix
if eval_config.fewshot_count > 0:
system_prompt += '\n\nPartial examples of good responses (omitting CoT analysis for brevity):' \
if cot else '\n\nExamples of good responses:'
for example_idx_in_prompt, fewshot_example in enumerate(chosen_fewshot_examples):
system_prompt += f"\n\n--------------------\n## Request {example_idx_in_prompt + 1}\n\n"
system_prompt += create_query_prompt_for_model_evaluation(
scenario_domains[fewshot_example.scenario_id],
scenario_text_passage_descriptions[fewshot_example.scenario_id], schemas[fewshot_example.scenario_id],
fewshot_example.text_passage)
system_prompt += f"\n\n## Response {example_idx_in_prompt + 1}\n\n"
system_prompt += "... (CoT analysis) ...\n\n```json\n" if cot else ""
system_prompt += json.dumps(fewshot_example.object, indent=2)
system_prompt += "\n```" if cot else ""
return system_prompt
def generate_outputs_for_evaluation(
eval_config: ModelEvaluationConfig, scenario_domains: list[str], scenario_text_passage_descriptions: list[str],
schemas: list[dict[str, Any]], fewshot_examples: list[DataSplitRecord],
evaluation_src_set: list[DataSplitRecord], src_set_nm: Literal["validation", "test"]
) -> list[EvaluationModelOutputRecord]:
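    """Runs the configured model over every record in the evaluation source set, returning one output record per
    source record (the extracted object, any extraction analysis text, and the number of retries used)."""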
eval_model_outputs: list[EvaluationModelOutputRecord] = []
    # Even when this config's few-shot count equals the full size of the few-shot split, this randomizes the order
    # in which the chosen few-shot examples appear, while the stored indices still let each output record identify
    # which examples were used and in what order
chosen_fewshot_indices = np.random.choice(len(fewshot_examples), eval_config.fewshot_count, replace=False).tolist()
chosen_fewshot_examples = [fewshot_examples[idx] for idx in chosen_fewshot_indices]
system_prompt = create_sys_prompt_for_model_evaluation(eval_config, chosen_fewshot_examples, scenario_domains,
scenario_text_passage_descriptions, schemas)
logger.debug(f"System prompt for model evaluation config {eval_config.label()}:\n{system_prompt}")
logger.debug(f"chosen fewshot example indices: {chosen_fewshot_indices}")
bundled_client = OpenAiClientBundle(eval_config.client, system_prompt, 8192, 0.0, eval_config.model_spec,
is_response_forced_json=not eval_config.is_cot_enabled)
for src_record_idx, src_record in enumerate(evaluation_src_set):
scenario_id = src_record.scenario_id
extracted_obj, extraction_analysis_output, _, num_retries_used = extract_obj_from_passage_with_retry(
eval_config.provider, eval_config.model_spec, src_record.text_passage, scenario_domains[scenario_id],
scenario_text_passage_descriptions[scenario_id], schemas[scenario_id],
f"{src_record_idx}'th record in the {src_set_nm} set (scenario id={scenario_id}, domain={scenario_domains[scenario_id]}, name={src_record.scenario_name})",
f"{src_set_nm}-{src_record_idx}-{eval_config.label()}", openai_client_bundle=bundled_client,
is_cot_enabled=eval_config.is_cot_enabled
)
eval_model_outputs.append(EvaluationModelOutputRecord(
scenario_id, src_record.scenario_name, src_set_nm == validation_eval_set_name, src_record_idx, chosen_fewshot_indices,
extracted_obj, extraction_analysis_output, num_retries_used
))
if (src_record_idx + 1) % 25 == 0:
logger.info(
f"Completed {src_record_idx + 1} records in the {src_set_nm} set for model evaluation config {eval_config.label()}")
return eval_model_outputs
def main():
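    """Loads scenarios and data splits, builds the evaluation configs, and writes one output JSON file per config."""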
openai_client = OpenAI(api_key=os.environ[openai_api_key_env])
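    # DeepInfra exposes an OpenAI-compatible endpoint, so the same OpenAI client class can serve the Llama models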
llama_provider_client = OpenAI(api_key=os.environ[deepinfra_api_key_env],
base_url="https://api.deepinfra.com/v1/openai")
scenario_domains, scenario_text_passage_descriptions, schemas = load_scenarios(schemas_path)
fewshot_examples = load_data_split(fewshot_examples_path, schemas)
validation_set = load_data_split(validation_set_path, schemas)
test_set = load_data_split(test_set_path, schemas)
evaluation_models_output_path.mkdir(exist_ok=True)
gpt_4o_model_spec = "gpt-4o-2024-11-20"
gpt_4o_mini_model_spec = "gpt-4o-mini-2024-07-18"
llama_3_3_70b_model_spec = "meta-llama/Llama-3.3-70B-Instruct"
llama_3_1_405b_model_spec = "meta-llama/Meta-Llama-3.1-405B-Instruct"
evaluation_configs: list[ModelEvaluationConfig] = \
[ModelEvaluationConfig(openai_client, ModelProvider.OPENAI, gpt_4o_model_spec, 50, cot_choice)
for cot_choice in [False, True]]
# ([
# ModelEvaluationConfig(llama_provider_client, ModelProvider.DEEPINFRA, model_choice, fewshot_count,
# cot_choice)
# for fewshot_count in [0, 5, 10, 50] for model_choice in
# [llama_3_3_70b_model_spec, llama_3_1_405b_model_spec] for cot_choice in [False, True]
# ] + [
# ModelEvaluationConfig(openai_client, ModelProvider.OPENAI, model_choice, fewshot_count, cot_choice)
# for fewshot_count in [0, 5, 10, 50] for model_choice in [gpt_4o_mini_model_spec, gpt_4o_model_spec] for
# cot_choice in [False, True]
# ]
# )
    should_use_validation_set = False
    source_set_for_evaluation = validation_set if should_use_validation_set else test_set
    src_set_nm: Literal["validation", "test"] = validation_eval_set_name if should_use_validation_set \
        else test_eval_set_name
for eval_config in evaluation_configs:
logger.info(f"starting evaluation for model config {eval_config.label()}")
eval_outputs = generate_outputs_for_evaluation(
eval_config, scenario_domains, scenario_text_passage_descriptions, schemas, fewshot_examples,
source_set_for_evaluation, src_set_nm)
with open(eval_config.output_path(), "w") as output_file:
json.dump(list(map(asdict, eval_outputs)), output_file, indent=2)
if __name__ == "__main__":
main()