-
Notifications
You must be signed in to change notification settings - Fork 85
/
Copy pathcategory.py
66 lines (57 loc) · 3.8 KB
/
category.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Tag structure
# - category_tag
#     - criteria_v0.1
#         - specificity
#         - ...
#     - math_v0.1
#         - math
#     - if_v0.1
#         - if
#         - score
import ast
import re
class Category:
    """Base class for prompt-category classifiers.

    Concrete categories (e.g. ``CategoryHardPrompt``) subclass this and
    override ``post_process``. Use ``create_category`` to obtain an instance
    from its tag name.
    """

    def __init__(self):
        pass

    @staticmethod
    def create_category(name):
        """Return a new classifier instance for the category tag ``name``.

        Args:
            name: Category tag, e.g. ``"criteria_v0.1"``.

        Returns:
            An instance of the matching ``Category`` subclass.

        Raises:
            ValueError: If ``name`` is not a known category tag. ``ValueError``
                derives from ``Exception``, so callers that catch ``Exception``
                keep working.
        """
        if name == "criteria_v0.1":
            return CategoryHardPrompt()
        raise ValueError(f"Category name is incorrect: {name}")

    def post_process(self, judgment=None):
        """Hook for turning a raw judgment into tags; a no-op in the base class.

        ``judgment`` is optional and ignored here. It is accepted so the base
        signature is compatible with subclass overrides that take a judgment.
        """
        pass
class CategoryHardPrompt(Category):
    """Classifier for the ``criteria_v0.1`` tag.

    Builds a conversation asking a model which of seven numbered criteria a
    prompt satisfies, then parses the model's answer (a bracketed list of
    criterion numbers) into a ``{tag_name: bool}`` mapping.
    """

    def __init__(self):
        super().__init__()
        self.name_tag = "criteria_v0.1"
        # Matches one bracketed, comma-separated list of digits 1-7, such as
        # "[1, 3, 7]". Whitespace around the commas is optional so that the
        # equally valid "[1,3,7]" is recognised instead of being dropped.
        self.pattern = re.compile(r"(\[[1234567](?:\s*,\s*[1234567])*\])")
        # Sent verbatim as the system message (see pre_process); the wording
        # is intentionally left untouched.
        self.sys_prompt = (
            "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\n"
            "For the input prompt, please analyze it based on the following 7 criteria.\n"
            "1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n"
            "2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n"
            "3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n"
            "4. Problem-Solving Skills: Does the prompt directly involves the AI to demonstrate active problem-solving skills, such systemically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n"
            "5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n"
            "6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n"
            "7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\n"
            "You must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."
        )
        # Criterion number (as listed in sys_prompt) -> output tag name.
        self.tags = {
            1: "specificity",
            2: "domain_knowledge",
            3: "complexity",
            4: "problem_solving",
            5: "creativity",
            6: "technical_accuracy",
            7: "real_world",
        }

    def get_score(self, judgment):
        """Extract the list of criterion numbers from a model judgment.

        Args:
            judgment: Text returned by the model.

        Returns:
            The parsed list of ints when the judgment contains exactly one
            distinct criteria list (repeats of the same list are allowed).
            Otherwise a one-element sentinel list: ``['No Match']`` when no
            list is found, ``['Multiple Match']`` when different lists are
            found, or ``['Syntax Error']`` when a match cannot be parsed.
        """
        matches = self.pattern.findall(judgment)
        if not matches:
            return ['No Match']

        # Compare parsed values rather than raw text so that "[1,2]" and
        # "[1, 2]" count as the same answer.
        distinct = set()
        for candidate in matches:
            try:
                distinct.add(tuple(ast.literal_eval(candidate)))
            except (SyntaxError, ValueError):
                print(candidate)
                return ['Syntax Error']

        if len(distinct) > 1:
            return ['Multiple Match']
        return list(distinct.pop())

    def pre_process(self, prompt):
        """Build the two-message conversation used to classify ``prompt``.

        Returns:
            A list with the system message (``sys_prompt``) followed by a
            user message whose content is ``prompt``.
        """
        return [
            {"role": "system", "content": self.sys_prompt},
            {"role": "user", "content": prompt},
        ]

    def post_process(self, judgment):
        """Convert a model judgment into a ``{tag_name: bool}`` mapping.

        Each tag is True when its criterion number appears in the parsed
        list. Sentinel results from ``get_score`` contain no numbers, so they
        yield False for every tag.
        """
        criteria = self.get_score(judgment=judgment)
        return {name: number in criteria for number, name in self.tags.items()}