#!/usr/bin/env python3
"""
This script provides functionality for summarizing a given document using OpenAI's GPT-3.5-turbo model.
It includes the 'init()' function to initialize environment variables, the 'summarize()' function to generate summaries,
and the 'docusense()' function as the entry point for the script. The 'docusense()' function takes command-line arguments
for the document path, the summary file path, and optional chunk size and chunk overlap. It uses prompts and chains to perform the summarization process.
"""


def init():
    """
    Initializes the environment variables by loading the .env file.

    Returns:
        None
    """
    from dotenv import load_dotenv, find_dotenv

    load_dotenv(find_dotenv(), override=True)
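
# Note: the .env file loaded above is assumed to define OPENAI_API_KEY, which
# ChatOpenAI reads from the environment; the script does not check for it explicitly.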


def summarize(
    document: str,
    summary_file: str,
    chunk_size: int,
    chunk_overlap: int,
    max_single_shot_num_tokens: int = 2048,
) -> None:
    """
    Summarizes a given document using OpenAI's GPT-3.5-turbo model.

    Args:
        document (str): The path to the document to be summarized.
        summary_file (str): The path to the file the summary is written to.
        chunk_size (int): The size of each chunk of the document to be summarized.
        chunk_overlap (int): The amount of overlap between each chunk of the document.
        max_single_shot_num_tokens (int, optional): The maximum number of tokens allowed for a single-shot summarization. Defaults to 2048.

    Returns:
        None

    Raises:
        FileNotFoundError: If the specified document path does not exist.
    """
    from langchain.chat_models import ChatOpenAI
    from langchain import PromptTemplate
    from langchain.chains import LLMChain
    from langchain.chains.summarize import load_summarize_chain
    from document_loaders.document_loaders import (
        load_document,
        merge_document,
        chunk_data,
    )
    from text_utils.text_utils import num_tokens_and_cost

    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    map_prompt = """
    Write a concise summary of the following:
    Text: `{text}`
    CONCISE SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

    combine_prompt = """
    Write a concise summary of the following text that covers key points.
    Add a title to the summary.
    Start the summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED
    by BULLET POINTS if possible AND end the summary with a CONCLUSION.
    Text: `{text}`
    """
    combine_prompt_template = PromptTemplate(
        template=combine_prompt, input_variables=["text"]
    )
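
    # In the map_reduce chain used below, the map prompt summarizes each chunk and
    # the combine prompt merges those per-chunk summaries into the final summary.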
    doc = load_document(document)
    num_tokens, cost = num_tokens_and_cost(doc)
    print(f"Approximate summarization cost: ${cost:.4f}")
    if num_tokens <= max_single_shot_num_tokens:
        print("Running single-shot summarization")
        chain = LLMChain(llm=llm, prompt=combine_prompt_template)
        summary = chain.run({"text": merge_document(doc)})
    else:
        print("Running multi-shot summarization")
        chain = load_summarize_chain(
            llm=llm,
            chain_type="map_reduce",
            map_prompt=map_prompt_template,
            combine_prompt=combine_prompt_template,
        )
        summary = chain.run(
            chunk_data(data=doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        )

    print(f"Writing summary to {summary_file}... ", end="")
    with open(summary_file, "w") as f:
        f.write(summary)
    print("Done")


def docusense() -> None:
    """
    Entry point: parses the command-line arguments (document path, summary file
    path, and optional chunk size and overlap) and summarizes the document.

    Returns:
        None
    """
    import argparse

    parser = argparse.ArgumentParser(description="DocuSense")
    parser.add_argument(
        "document", type=str, help="Path to the document to be summarized."
    )
    parser.add_argument(
        "summary_file",
        type=str,
        help="Path to the file where the summary will be written.",
    )
    parser.add_argument(
        "--chunk_size", type=int, default=3300, help="Chunk size in tokens."
    )
    parser.add_argument(
        "--chunk_overlap", type=int, default=100, help="Chunk overlap in tokens."
    )
    args = parser.parse_args()

    document = args.document
    summary_file = args.summary_file
    chunk_size = args.chunk_size
    chunk_overlap = args.chunk_overlap

    print(f"Instantiating DocuSense for {document}")
    init()
    try:
        summarize(document, summary_file, chunk_size, chunk_overlap)
    except FileNotFoundError:
        print(f"File {document} not found")


if __name__ == "__main__":
    docusense()