Dataset_Generation.py

# Package pre-requisites: openai, pdf2image, tqdm
# (pdf2image needs the poppler utilities installed, and pdflatex must be available on PATH)
from openai import OpenAI
import random
import subprocess
import os, shutil, stat, errno
from tqdm import tqdm
from pdf2image import convert_from_path

random.seed(0)

topics = ["Education", "Finance", "Health", "Technology", "Science", "Business", "Politics", "History", "Art",
          "Culture", "Sports", "Entertainment", "Food", "Travel", "Fashion", "Music", "Movies", "Books",
          "TV Shows", "Video Games", "Social Media", "Internet", "Space", "Environment", "Climate Change",
          "Sustainability", "Human Rights", "Equality", "Diversity", "Inclusion", "Mental Health",
          "Physical Health", "Nutrition", "Fitness", "Wellness", "Parenting", "Relationships",
          "Self-Improvement", "Productivity", "Motivation", "Inspiration", "Creativity", "Innovation",
          "Entrepreneurship", "Leadership", "Management", "Marketing", "Sales", "Customer Service",
          "Human Resources"]
random.shuffle(topics)

# Maximum number of sentences per "Details" paragraph, indexed by (number of keypoints - 1).
length_constraint = [5, 4, 3]
num_topics = 25
topics = topics[:num_topics]
slides_per_topic = 5

def generate_text(topic):
    """Generate a one-slide outline on `topic` via the OpenAI API; returns (raw response, number of keypoints)."""
    client = OpenAI()
    n_keypoints = random.randint(1, 3)
    prompt = f"Your job is to generate a brief PPT presentation outline on {topic}. Your generation should contain exactly one slide, with {n_keypoints} keypoints. \
Each key point should be concise, followed by a paragraph demonstrating the corresponding details, which should contain at most {length_constraint[n_keypoints - 1]} sentences. \
Each key point, together with the demonstrative paragraph, should span one line. \
There should be exactly one empty line in your output separating different keypoints. \
Your generation should follow exactly this format: \nSlide title: [your title]\n"
    for i in range(n_keypoints):
        prompt += f"Keypoint {i+1}: [your keypoint]\nDetails: [your paragraph]\n\n"
    # print(prompt)
    stream = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    # Accumulate the streamed chunks into a single response string.
    response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content
    return response, n_keypoints
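
# For reference, a well-formed response that parse_response() below can consume would look roughly like the
# made-up example sketched here (illustrative only, not actual model output):
#
#   Slide title: The Basics of Finance
#
#   Keypoint 1: Budgeting
#   Details: A budget tracks income and expenses. It helps prioritise spending.
#
#   Keypoint 2: Saving
#   Details: Regular saving builds a financial cushion. Even small amounts add up over time.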

def parse_response(response, n_keypoints):
    """Parse the model output into (title, keypoints, details, success); success is False on any format violation."""
    success = True
    title, keypoints, details = "", [], []
    lines = response.splitlines()
    if lines and lines[0].startswith("Slide title:"):
        title = lines[0].split(":", 1)[1].strip()
    else:
        success = False
    for line in lines[1:]:
        if not line:
            continue
        elif line.startswith("Keypoint"):
            keypoints.append(line.split(":", 1)[1].strip())
        elif line.startswith("Details"):
            details.append(line.split(":", 1)[1].strip())
        else:
            success = False
    if len(keypoints) != len(details) or len(keypoints) != n_keypoints:
        success = False
    return title, keypoints, details, success

def write_latex(title, keypoints, details, n_keypoints, topic, index):
    """Write the slide as a Beamer .tex file with a randomly chosen theme and colour theme."""
    themes = ["default", "Darmstadt", "Malmoe", "AnnArbor", "Dresden", "Marburg", "Antibes", "Frankfurt",
              "Montpellier", "Bergen", "Goettingen", "PaloAlto", "Berkeley", "Hannover", "Pittsburgh",
              "Berlin", "Ilmenau", "Rochester", "Boadilla", "JuanLesPins", "Singapore", "CambridgeUS",
              "Luebeck", "Szeged", "Copenhagen", "Madrid"]
    color_themes = ["albatross", "beaver", "beetle", "crane", "default", "dolphin", "dove", "fly", "lily",
                    "orchid", "rose", "seagull", "seahorse", "spruce", "whale", "wolverine"]
    # Column width as a fraction of \textwidth, indexed by (number of keypoints - 1).
    block_width = [0.7, 0.45, 0.3]
    with open(f"./Synthetic Dataset/Dataset_Generation/Latex/{topic}_{index}.tex", "w+") as f:
        f.write("\\documentclass[5pt]{beamer}\n")
        f.write("\\usetheme{" + random.choice(themes) + "}\n")
        f.write("\\usecolortheme{" + random.choice(color_themes) + "}\n")
        f.write("\\setbeamerfont{block title}{size=\\small}\n")
        f.write("\\setbeamerfont{block body}{size=\\scriptsize}\n")
        f.write("\\begin{document}\n")
        f.write("\\begin{frame}\n")
        f.write("\\frametitle{" + title + "}\n")
        f.write("\\begin{columns}\n")
        # One column/block per keypoint, with "&" escaped so the LaTeX compiles.
        for i in range(n_keypoints):
            f.write("\\begin{column}{" + str(block_width[n_keypoints - 1]) + "\\textwidth}\n")
            f.write("\\begin{block}{\\textbf{" + keypoints[i].replace("&", r"\&") + "}}\n")
            f.write(details[i].replace("&", r"\&") + "\n")
            f.write("\\end{block}\n")
            f.write("\\end{column}\n")
        f.write("\\end{columns}\n")
        f.write("\\end{frame}\n")
        f.write("\\end{document}\n")
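
# For a single-keypoint slide, the emitted .tex has roughly this shape (illustrative sketch;
# the theme, colour theme, title, and block text vary per run):
#
#   \documentclass[5pt]{beamer}
#   \usetheme{Madrid}
#   \usecolortheme{seahorse}
#   \setbeamerfont{block title}{size=\small}
#   \setbeamerfont{block body}{size=\scriptsize}
#   \begin{document}
#   \begin{frame}
#   \frametitle{The Basics of Finance}
#   \begin{columns}
#   \begin{column}{0.7\textwidth}
#   \begin{block}{\textbf{Budgeting}}
#   A budget tracks income and expenses.
#   \end{block}
#   \end{column}
#   \end{columns}
#   \end{frame}
#   \end{document}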

def latex_2_image(topic, index):
    """Compile the .tex with pdflatex, move the resulting PDF, and rasterise its first page to a PNG."""
    subprocess.run([
        "pdflatex", "-interaction=batchmode",
        "--output-directory=./Synthetic Dataset/Dataset_Generation/Latex/aux_files",
        f"./Synthetic Dataset/Dataset_Generation/Latex/{topic}_{index}.tex"
    ])
    os.rename(f"./Synthetic Dataset/Dataset_Generation/Latex/aux_files/{topic}_{index}.pdf",
              f"./Synthetic Dataset/Dataset_Generation/pdfs/{topic}_{index}.pdf")
    images = convert_from_path(f"./Synthetic Dataset/Dataset_Generation/pdfs/{topic}_{index}.pdf")
    images[0].save(f"./Synthetic Dataset/Dataset/images/{topic}_{index}.png", "PNG")

def write_keypoints(topic, index, title, keypoints):
    with open(f"./Synthetic Dataset/Dataset/keypoints/{topic}_{index}.txt", "w+") as f:
        f.write(topic + "\n")
        f.write(title)
        for keypoint in keypoints:
            f.write("\n" + keypoint)
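
# The resulting keypoints file (e.g. "./Synthetic Dataset/Dataset/keypoints/Finance_0.txt") holds the topic,
# then the slide title, then one keypoint per line; an illustrative layout:
#
#   Finance
#   The Basics of Finance
#   Budgeting
#   Saving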

def handleRemoveReadonly(func, path, exc):
    '''Handle read-only files for Windows OS.'''
    excvalue = exc[1]
    if func in (os.rmdir, os.remove) and excvalue.errno == errno.EACCES:
        os.chmod(path, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)  # 0777
        func(path)
    else:
        raise

if __name__ == "__main__":
    # Rebuild the output directory tree from scratch on every run.
    if os.path.exists("./Synthetic Dataset/"):
        shutil.rmtree("./Synthetic Dataset/", ignore_errors=False, onerror=handleRemoveReadonly)
    os.mkdir("./Synthetic Dataset/")
    os.mkdir("./Synthetic Dataset/Dataset")
    os.mkdir("./Synthetic Dataset/Dataset_Generation")
    os.mkdir("./Synthetic Dataset/Dataset_Generation/Latex")
    os.mkdir("./Synthetic Dataset/Dataset_Generation/Latex/aux_files")
    os.mkdir("./Synthetic Dataset/Dataset_Generation/pdfs")
    os.mkdir("./Synthetic Dataset/Dataset/images")
    os.mkdir("./Synthetic Dataset/Dataset/keypoints")
    for j in tqdm(range(num_topics)):
        topic = topics[j]
        for i in range(slides_per_topic):
            response, n_keypoints = generate_text(topic)
            title, keypoints, details, success = parse_response(response, n_keypoints)
            if not success:
                print(f"Failed to parse response for {topic}, slide {i+1}/{slides_per_topic}")
                continue
            write_latex(title, keypoints, details, n_keypoints, topic, i)
            latex_2_image(topic, i)
            write_keypoints(topic, i, title, keypoints)