#!/usr/bin/env python
"""
This script extracts text from a PDF, HTML, or plain-text document, splits it
into sections and subsections, and summarizes each piece with the OpenAI API.

The extract_text_from_pdf() function takes a file path to a PDF and returns a
string of the extracted text. The split_into_sections() function takes a
string of text and uses a regular expression to split it into a list of
tuples, where each tuple contains a section header and the corresponding text.
The split_section_into_subsections() function takes a section header, the
corresponding text, and an encoder object, and splits the section into smaller
parts, each returned as a tuple containing a subsection header and the
corresponding text. The split_subsection_into_paragraphs() function takes a
subsection header, the corresponding text, the encoder object, and a maximum
token count, and splits the subsection into smaller parts, each returned as a
tuple containing a paragraph header and the corresponding text.

This script was written by ChatGPT with direction by Scott Leibrand,
then edited by Scott Leibrand w/ CoPilot and ChatGPT.
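
A minimal usage sketch (assuming OPENAI_API_KEY is set in the environment and
the pdfminer.six, transformers, openai, and html2text packages are installed;
the file names below are hypothetical):

    export OPENAI_API_KEY=sk-...
    python summarize.py paper.pdf                   # summarize a local PDF
    python summarize.py https://example.com/post    # summarize a web page
    python summarize.py notes.txt French            # request French output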
"""
import sys
import re
import os
import subprocess
import openai
import glob
#import tiktoken
from transformers import GPT2TokenizerFast
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import html2text
def extract_text_from_pdf(pdf_path):
"""Extracts the text from a PDF file and returns it as a string.
Parameters:
pdf_path (str): The file path to the PDF file.
Returns:
str: The extracted text.
"""
with open(pdf_path, 'rb') as fh:
# Create a PDF resource manager object that stores shared resources
rsrcmgr = PDFResourceManager()
# Create a StringIO object to store the extracted text
output = StringIO()
# Create a TextConverter object to convert PDF pages to text
device = TextConverter(rsrcmgr, output, laparams=LAParams())
# Create a PDF page interpreter object
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the PDF document
for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
interpreter.process_page(page)
# Get the extracted text as a string and close the StringIO object
text = output.getvalue()
output.close()
        # Close the text converter (the with block closes the PDF file itself)
        device.close()
# Remove ^L page breaks from the text
text = text.replace('\x0c', '\n')
return text
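# A usage sketch for extract_text_from_pdf(); the path below is hypothetical:
#
#   text = extract_text_from_pdf("/tmp/paper.pdf")
#   print(text[:200])  # first 200 characters of the extracted text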
def split_into_sections(text):
"""Splits a string of text into a list of tuples, where each tuple contains a section header and the corresponding text.
Parameters:
text (str): The input text to split into sections.
Returns:
list: A list of tuples, where each tuple contains a section header and the corresponding text.
"""
# Use a regular expression to match the "References" section
#pattern = r'(\n\nReferences[^\n]*)\n'
# Also match the "References" section if it is preceded by # or ## (markdown-style headers)
pattern = r'(\n\n(#+\s+)?References[^\n]*)\n'
match = re.search(pattern, text)
if match:
# Remove the "References" section and everything that follows
text = text[:match.start()]
# Use a regular expression to match and remove any base64 encoded data
text = re.sub(r'data:.+;base64[^)]+', '', text)
# Use a regular expression to split the text into sections
#pattern = r'\n\n(\d+[\.:]\s+[^\n]+)\n\n'
# Match section headers that start with a number followed by a period or colon,
# or markdown-style headers that start with one to six hash marks followed by a space
pattern = r'\n\n(#+\s+[^\n]+|\d+[\.:]\s+[^\n]+)\n\n'
sections = re.split(pattern, text)
    print("Found", (len(sections) + 1) // 2, "sections.")
# Extract the section headers and their corresponding text
headers = ["Title-Abstract"]
content = []
for i, section in enumerate(sections):
if i % 2 == 0:
# This is a section of content
content.append(section)
else:
# This is a section header
headers.append(section)
#print(section)
# Zip the section headers and content together
sections = list(zip(headers, content))
#print(headers)
return sections
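# A usage sketch for split_into_sections(), showing the (header, text) tuple
# shape it returns; the input string is illustrative:
#
#   sections = split_into_sections("Intro text\n\n1. Methods\n\nMethod text")
#   # -> [("Title-Abstract", "Intro text"), ("1. Methods", "Method text")]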
def split_section_into_subsections(section_header, section_content, enc, max_tokens=3000):
"""Splits a section of text into smaller parts, each of which is returned
as a tuple containing a subsection header and the corresponding text.
Parameters:
section_header (str): The header for the section to be split.
section_content (str): The content of the section to be split.
enc (object): An encoder object used to encode the section content as a sequence of tokens.
max_tokens (int, optional): The maximum number of tokens allowed in each subsection. Default is 3000.
Returns:
list: A list of tuples, where each tuple contains a subsection header and the corresponding text.
"""
# Encode the section content as a sequence of tokens
tokens = enc.encode(section_content)
if len(tokens) <= max_tokens:
# The section does not need to be split into subsections
return [(section_header, section_content)]
# Split the section into numbered subsections
pattern = r'\n\n(\d+\.\d+[\.:]\s+[^\n]+)\n\n'
subsections = re.split(pattern, section_content)
# Extract the subsection headers and their corresponding text
headers = [f"{section_header.split('.')[0]}. Section intro"]
content = []
for i, subsection in enumerate(subsections):
if i % 2 == 0:
# This is a subsection of content
content.append(subsection)
else:
# This is a subsection header
headers.append(subsection)
# Zip the subsection headers and content together
subsections = list(zip(headers, content))
# Split any subsections that are still too long into smaller parts
result = []
for header, content in subsections:
parts = split_subsection_into_paragraphs(header, content, enc, max_tokens)
result.extend(parts)
return result
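# A usage sketch (assumes the same GPT-2 tokenizer constructed in __main__
# below; the header and long_section_text are illustrative):
#
#   enc = GPT2TokenizerFast.from_pretrained("gpt2")
#   parts = split_section_into_subsections("2. Methods", long_section_text, enc)
#   # -> [("2. Section intro", "..."), ("2.1. Data", "..."), ...]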
def split_subsection_into_paragraphs(subsection_header, subsection_content, enc, max_tokens=3000):
    """Splits a subsection of text into parts of at most max_tokens tokens,
    preferring to break at paragraph (blank-line) boundaries.

    Parameters:
    subsection_header (str): The header for the subsection to be split.
    subsection_content (str): The content of the subsection to be split.
    enc (object): An encoder object used to encode the content as a sequence of tokens.
    max_tokens (int, optional): The maximum number of tokens allowed in each part. Default is 3000.

    Returns:
    list: A list of tuples, where each tuple contains the subsection header and one part of the text.
    """
    # Encode the subsection content as a sequence of tokens
    tokens = enc.encode(subsection_content)
    if len(tokens) <= max_tokens:
        # The subsection does not need to be split into parts
        return [(subsection_header, subsection_content)]
    # Split the subsection into parts
    start = 0
    parts = []
    while start < len(tokens):
        # Take at most max_tokens tokens for the next part
        end = min(start + max_tokens, len(tokens))
        part_content = enc.decode(tokens[start:end])
        # Unless this is the last part, back up to the nearest paragraph
        # boundary so parts end on a blank line rather than mid-sentence
        if end < len(tokens):
            newline_pos = part_content.rfind('\n\n')
            if newline_pos > 0:
                part_content = part_content[:newline_pos]
                end = start + len(enc.encode(part_content))
        # Add the part to the list of parts
        parts.append((subsection_header, part_content))
        # Update the start index, always advancing by at least one token
        start = max(end, start + 1)
    return parts
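# A usage sketch: every returned part reuses the subsection header, and each
# part stays within the token budget (names and values are illustrative):
#
#   parts = split_subsection_into_paragraphs("2.1. Data", long_text, enc, max_tokens=3000)
#   for header, part in parts:
#       assert len(enc.encode(part)) <= 3000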
def combine_subsections(subsections, enc):
    """Combines adjacent small subsections into chunks of roughly 1000-2000
    tokens, so that each summarization call gets enough context.

    Parameters:
    subsections (list): A list of (header, content) tuples.
    enc (object): An encoder object used to count tokens.

    Returns:
    list: A list of (header, content) tuples for the combined subsections.
    """
    # Initialize the list of combined subsections
    combined_subsections = []
    # Initialize the current combined subsection
    current_subsection_header = ""
    current_subsection_content = ""
    current_subsection_tokens = 0
    # Iterate through the subsections
    for header, content in subsections:
        # Encode the content as a sequence of tokens
        tokens = enc.encode(content)
        # If the combined chunk would stay under 2000 tokens and this
        # subsection is under 1000 tokens, merge it into the current chunk
        if current_subsection_tokens + len(tokens) < 2000 and len(tokens) < 1000:
# Update the current combined subsection header
if current_subsection_header == "":
current_subsection_header = header
current_subsection_content = header + "\n"
else:
if current_subsection_header != header:
current_subsection_content += "\n\n" + header + "\n"
#current_subsection_header += header
# Update the current combined subsection content
current_subsection_content += content
# Update the current combined subsection token count
current_subsection_tokens += len(tokens)
        else:
            # Add the current combined subsection to the list, skipping the
            # initial empty placeholder
            if current_subsection_content != "":
                combined_subsections.append((current_subsection_header, current_subsection_content))
# Reset the current combined subsection
current_subsection_header = header
current_subsection_content = header + "\n" + content
current_subsection_tokens = len(tokens)
# Add the final combined subsection to the list of combined subsections
combined_subsections.append((current_subsection_header, current_subsection_content))
return combined_subsections
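# A usage sketch: small adjacent (header, content) tuples are merged so each
# summarization call gets enough context (subsections/enc as defined above):
#
#   combined = combine_subsections(subsections, enc)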
def generate_summary(content, prompt, model_engine="text-davinci-003", max_tokens=3000):
    """Generates a summary using the OpenAI completion API.

    Note: `content` is currently unused; callers embed the content in `prompt`.
    """
    # Get the API key from the environment variable
    api_key = os.environ["OPENAI_API_KEY"]
    openai.api_key = api_key
    # Set the model to use, if not specified
    if model_engine is None:
        model_engine = "text-davinci-003"
    # Set the temperature for sampling (0 for deterministic output)
    temperature = 0
    # Cap the max token count for the summary based on the model
    if model_engine == "text-davinci-003":
        max_tokens = min(max_tokens, 1000)
    else:
        max_tokens = min(max_tokens, 500)
# Generate completions
completions = openai.Completion.create(
engine=model_engine,
prompt=prompt,
max_tokens=max_tokens,
temperature=temperature
)
# Get the summary from the first completion
summary = completions.choices[0].text
return summary
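# A usage sketch (requires OPENAI_API_KEY and makes a billable API call; the
# prompt string is illustrative, and the first argument is unused):
#
#   summary = generate_summary("", "Summarize this text: ...", "text-davinci-003")
#   print(summary)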
def extract_text_from_html(html_path):
# Read the HTML file
with open(html_path, "r") as html_file:
html = html_file.read()
# Extract the text from the HTML
text = html2text.html2text(html)
return text
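# A usage sketch (hypothetical path). html2text emits markdown-style text,
# which is why split_into_sections() also matches #-style headers:
#
#   text = extract_text_from_html("/tmp/page.html")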
def create_html_file(basename, url):
# Create the HTML file
html_file = open(basename + ".summary.html", "w")
# Strip the path from the basename to get the filename
filename = os.path.basename(basename)
# Write the HTML header
html_file.write("<html>\n")
html_file.write("<head>\n")
html_file.write("<title>" + filename + "</title>\n")
#html_file.write("<link rel='stylesheet' type='text/css' href='style.css'>\n")
html_file.write("</head>\n")
html_file.write("<body>\n")
html_file.write("<article id='" + filename + "'>\n")
#html_file.write("<h1>" + filename + "</h1>\n")
    html_file.write("<h1><a href='" + url + "'>" + filename + "</a></h1>\n")
# Write the overall summary section
html_file.write("<h2>Overall Summary</h2>\n")
overall_summary_file = open(basename + ".overall_summary.txt", "r")
overall_summary_content = overall_summary_file.read()
html_file.write("<p>" + overall_summary_content + "</p>\n")
overall_summary_file.close()
# Write the subsection summary section
html_file.write("<h2>Subsection Summary</h2>\n")
subsection_summary_files = glob.glob(basename + ".*.summary.txt")
subsection_summary_files.sort()
for subsection_summary_file in subsection_summary_files:
subsection_summary_file_handle = open(subsection_summary_file, "r")
subsection_summary_content = subsection_summary_file_handle.read()
html_file.write("<p>" + subsection_summary_content + "</p>\n")
subsection_summary_file_handle.close()
# Write the HTML footer
html_file.write("<a href='" + url + "'>Original URL</a>\n")
html_file.write("</article>\n")
html_file.write("</body>\n")
html_file.write("</html>\n")
# Print a message indicating that the HTML file was created
print("Created HTML file: " + basename + ".summary.html")
# Close the HTML file
html_file.close()
def download_html(url):
# Strip any trailing /'s from the end of the URL
stripped_url = url.rstrip("/")
# Get the base name of the URL
base_name = stripped_url.split("/")[-1]
    # Decide where to save the downloaded file
    if base_name.endswith(".pdf"):
        html_path = "/tmp/" + base_name
    else:
        html_path = "/tmp/" + base_name + ".html"
    print("HTML path: " + html_path)
    print("URL: " + url)
    # Download the file with curl, passing arguments as a list to avoid shell injection
    subprocess.run(["curl", "-s", "-o", html_path, url], check=True)
return html_path
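# A usage sketch (hypothetical URL; downloads land under /tmp/):
#
#   path = download_html("https://example.com/post")
#   # -> "/tmp/post.html"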
if __name__ == '__main__':
    model_engine = "text-davinci-003"
    max_tokens = 3000
    doctype = "document"
    # Default the link target to the input argument; overwritten below for URLs
    url = sys.argv[1]
# get the base filename of the first argument without the extension
base_name = os.path.splitext(sys.argv[1])[0]
# If the command line argument starts with http, use curl to download it to an HTML file
if sys.argv[1].startswith("http"):
# Get the URL from the command line arguments
url = sys.argv[1]
doctype="article"
# Strip any query parameters from the URL
url = url.split("?")[0]
# Download the HTML file
html_path = download_html(url)
print(html_path)
# Strip any trailing /'s from the end of the URL
url = url.rstrip("/")
# Set the base_name to a /tmp file containing the last part of the URL between /'s
base_name = "/tmp/" + url.split("/")[-1]
print(base_name)
        # Use the stripped URL (query parameters removed) to detect PDFs
        if url.endswith(".pdf"):
text = extract_text_from_pdf(html_path)
else:
# Extract the text from the HTML file
text = extract_text_from_html(html_path)
# If the command line argument references a pdf file
elif sys.argv[1].endswith(".pdf"):
# Get the PDF file path from the command line arguments
pdf_path = sys.argv[1]
doctype="paper"
# Extract the text from the PDF file
text = extract_text_from_pdf(pdf_path)
elif sys.argv[1].endswith(".html") or sys.argv[1].endswith(".htm"):
# Get the HTML file path from the command line arguments
html_path = sys.argv[1]
doctype="article"
# Extract the text from the HTML file
text = extract_text_from_html(html_path)
else:
# Get the text file path from the command line arguments
text_path = sys.argv[1]
# Read the text file
with open(text_path, "r") as text_file:
text = text_file.read()
    # Check whether an output language is specified; if not, leave any
    # language instructions out of the prompts
    if len(sys.argv) > 2:
        output_language_prompt = " Please use " + sys.argv[2] + " language for the output."
    else:
        output_language_prompt = ""
# Split the text into sections
sections = split_into_sections(text)
# encode the text as a sequence of tokens
#enc = tiktoken.get_encoding("gpt2")
enc = GPT2TokenizerFast.from_pretrained("gpt2")
tokens = enc.encode(text)
# Get the base name of the output file
#base_name, _ = os.path.splitext(pdf_path)
# Write the extracted text to the output file
with open(base_name + ".full.txt", 'w', encoding='utf-8') as f:
f.write(text)
print(f"Text extracted from {sys.argv[1]} and written to {base_name}.full.txt")
print(f"Total token count: {len(tokens)}")
# Write each section to a separate text file
for header, content in sections:
print("Header: ", header)
# Split the section into subsections if necessary
subsections = split_section_into_subsections(header, content, enc)
        # Combine adjacent subsections into chunks of roughly 1000-2000 tokens
        combined_subsections = combine_subsections(subsections, enc)
        # Track the previous subheader so that sequential identical subheaders
        # can be numbered and their output files do not collide
        previous_subheader = ""
        subheader_count = 1
        # Process each combined subsection
        for subheader, subcontent in combined_subsections:
            # Number the subheader if it repeats the previous one
            if subheader == previous_subheader:
                subheader_count += 1
                subheader += f"-part{subheader_count}"
            else:
                previous_subheader = subheader
                subheader_count = 1
            # Use the tokenizer to encode the subsection content as a sequence of tokens
subcontent_tokens = enc.encode(subcontent)
# Get the name of the output file
#print("Subheader: ",subheader)
section_name = re.sub(r'[^a-zA-Z0-9]', '', subheader.replace('/','-'))
#print("Section name: ",section_name)
output_path = f"{base_name}.{section_name}.full.txt"
            if len(subcontent) == 0:
                # Skip empty subsections without consuming a part number
                subheader_count -= 1
else:
# Write the content to the output file
with open(output_path, 'w', encoding='utf-8') as f:
f.write(subcontent)
print(f"{subheader} ({len(subcontent)} characters, {len(subcontent_tokens)} tokens) written to {output_path}")
# Get the name of the summary file
summary_path = f"{base_name}.{section_name}.summary.txt"
# If the summary file does not exist, generate a summary
if os.path.exists(summary_path):
print(f"Summary already exists at {summary_path}")
else:
# Set the prompt for the summary
prompt = f"Please provide a detailed summary of the following section, but if the section content is mostly website context/description, just return 'Section has no content':\n{subcontent}\nPlease provide a detailed summary of the section above. If the section content is mostly website context/description, just return 'Section has no content'.{output_language_prompt}"
# Generate a summary for the subsection
summary = generate_summary(subcontent, prompt, model_engine, max_tokens)
# Write the summary to a summary file
with open(summary_path, 'w', encoding='utf-8') as f:
f.write(summary)
print(f"Summary written to {summary_path}")
        # If there is more than one summary file matching {base_name}.*{section_number}.summary.txt, generate a combined section summary
        section_number = section_name.split('.')[0]
        section_pattern = f"{base_name}.*{section_number}.summary.txt"
        if len(glob.glob(section_pattern)) < 1:
            print(f"No summary files found for section {section_number}")
        elif len(glob.glob(section_pattern)) == 1:
print(f"Only one summary file found for section {section_number}, promoting it to section summary")
# Get the path of the summary file
#print(glob.glob(f"{base_name}.{section_number}.*.summary.txt"))
            summary_path = glob.glob(section_pattern)[0]
# Get the path of the section summary file
section_summary_path = f"{base_name}.{section_name}.section_summary.txt"
# Read the summary file and write it to the section summary file
with open(summary_path, 'r', encoding='utf-8') as f:
summary = f.read()
with open(section_summary_path, 'w', encoding='utf-8') as f:
f.write(summary)
print(f"Summary promoted to section summary at {section_summary_path}")
else:
# Read in the section summaries
summaries = []
            summary_pattern = section_pattern
            print(f"Reading summaries matching {summary_pattern}")
summary_paths = glob.glob(summary_pattern)
summary_paths.sort(key=os.path.getmtime) # sort file names by modification time, oldest first
for summary_path in summary_paths:
print(f"Reading summary from {summary_path}")
with open(summary_path, 'r', encoding='utf-8') as f:
summaries.append(f.read())
# Concatenate the summaries into a single string
subcontent = "\n\n".join(summaries)
# Tokenize the concatenated summaries
subcontent_tokens = enc.encode(subcontent)
print(f"Concatenated {len(summaries)} summaries into a single summary with {len(subcontent)} characters and {len(subcontent_tokens)} tokens")
if len(subcontent_tokens) == 0:
summary_pattern = f"{base_name}.*.summary.txt"
                print(f"Reading summaries matching {summary_pattern}")
summary_paths = glob.glob(summary_pattern)
summary_paths.sort(key=os.path.getmtime) # sort file names by modification time, oldest first
summaries = []
subcontent_tokens = []
for summary_path in summary_paths:
print(f"Reading summary from {summary_path}")
with open(summary_path, 'r', encoding='utf-8') as f:
summary = f.read()
summary_tokens = enc.encode(summary)
if len(subcontent_tokens) + len(summary_tokens) > max_tokens:
break
summaries.append(summary)
subcontent_tokens += summary_tokens
# Concatenate the summaries into a single string
subcontent = "\n\n".join(summaries)
# Tokenize the concatenated summaries
subcontent_tokens = enc.encode(subcontent)
print(f"Concatenated {len(summaries)} out of {len(summary_paths)} section summaries into a single summary with {len(subcontent)} characters and {len(subcontent_tokens)} tokens")
# Set the prompt for the overall section summary
prompt = f"Please provide a detailed summary of the following sections:\n{subcontent}\nPlease provide a detailed summary of the sections above.{output_language_prompt}"
# Get the path of the overall section summary file
section_summary_path = f"{base_name}.{section_number}.section_summary.txt"
# If the overall section summary file does not exist, generate a summary
if os.path.exists(section_summary_path):
print(f"Overall section summary already exists at {section_summary_path}")
else:
                # Generate the overall section summary from the concatenated summaries
                section_summary = generate_summary(subcontent, prompt, model_engine, max_tokens)
# Write the overall section summary to a file
with open(section_summary_path, 'w', encoding='utf-8') as f:
f.write(section_summary)
print(f"Overall section summary written to {section_summary_path}")
# Check if the overall summary file already exists
overall_summary_path = f"{base_name}.overall_summary.txt"
if os.path.exists(overall_summary_path):
print(f"Overall summary already exists at {overall_summary_path}")
else:
# Read in the abstract, if it exists
try:
abstract_filename=glob.glob(f"{base_name}.Title-Abstract*.full.txt")[0]
with open(f"{abstract_filename}", 'r', encoding='utf-8') as f:
abstract = f.read()
except IndexError:
print(f"No abstract found for {base_name}")
abstract = ""
# Read in the section summaries
summaries = []
summary_pattern = f"{base_name}.*.section_summary.txt"
for summary_path in glob.glob(summary_pattern):
with open(summary_path, 'r', encoding='utf-8') as f:
summaries.append(f.read())
        # Start with the abstract, then append section summaries until the token limit
        subcontent = abstract
        subcontent_tokens = enc.encode(subcontent)
        for summary in summaries:
            summary_tokens = enc.encode(summary)
            if len(subcontent_tokens) + len(summary_tokens) > max_tokens:
                print(f"Exceeded {max_tokens} tokens, stopping concatenation of summaries")
                break
            subcontent += "\n\n" + summary
            subcontent_tokens += summary_tokens
        print(f"Concatenated the abstract and section summaries into a single input with {len(subcontent)} characters and {len(subcontent_tokens)} tokens")
        if len(subcontent_tokens) < 500:
            print("Concatenated section summaries have less than 500 tokens, reading in all subsection summaries")
            summary_pattern = f"{base_name}.*.summary.txt"
            for summary_path in glob.glob(summary_pattern):
                with open(summary_path, 'r', encoding='utf-8') as f:
                    summary = f.read()
                summary_tokens = enc.encode(summary)
                if len(subcontent_tokens) + len(summary_tokens) > max_tokens:
                    print(f"Exceeded {max_tokens} tokens, stopping concatenation of summaries")
                    break
                subcontent += "\n\n" + summary
                subcontent_tokens += summary_tokens
# Set the prompt for the overall summary
prompt = f"Please provide a detailed summary of the following {doctype}, based on its abstract and summaries of each section:\n{subcontent}\nPlease provide a detailed summary of the {doctype} described above, based on the provided abstract/introduction and summaries of each section.{output_language_prompt}"
# Generate the overall summary
overall_summary = generate_summary(subcontent, prompt, model_engine, max_tokens)
# Append a newline to the overall summary
overall_summary += "\n"
# Write the overall summary to a file
with open(overall_summary_path, 'w', encoding='utf-8') as f:
f.write(overall_summary)
print(f"Overall summary written to {overall_summary_path}")
# Call create_html_file() to create an HTML file with the overall summary
create_html_file(base_name, url)