-
Notifications
You must be signed in to change notification settings - Fork 4
/
duck_duck_go_search.py
178 lines (136 loc) · 6.25 KB
/
duck_duck_go_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import json
from itertools import islice
from typing import Type
from duckduckgo_search import DDGS
# from superagi.lib.logger import logger
from pydantic import BaseModel, Field
# from superagi.helper.token_counter import TokenCounter
# from superagi.llms.base_llm import BaseLlm
from superagi.tools.base_tool import BaseTool
# from superagi.helper.webpage_extractor import WebpageExtractor
# Module-level tuning constants.

# Used by get_raw_duckduckgo_results: how many results to keep, and how many
# times the search is re-issued while it keeps coming back empty.
NUM_RESULTS_TO_USE = 10
DUCKDUCKGO_MAX_ATTEMPTS = 3

# Only referenced by the (currently commented-out) webpage scraping code.
MAX_LINKS_TO_SCRAPE = 3
WEBPAGE_EXTRACTOR_MAX_ATTEMPTS = 2
class DuckDuckGoSearchSchema(BaseModel):
    # Argument schema for the DuckDuckGo search tool: one required string
    # field, `query` (the `...` default marks it as mandatory).
    query: str = Field(..., description="The search query for duckduckgo search.")
class DuckDuckGoSearchTool(BaseTool):
    """
    Duck Duck Go Search tool

    Runs a DuckDuckGo text search and returns the raw results. The webpage
    scraping and LLM summarisation steps are currently disabled (their code is
    commented out), so `_execute` returns the search results unchanged.

    Attributes:
        name : The name.
        description : The description.
        args_schema : The args schema.
    """
    # NOTE: the `llm` field is disabled; summarise_result still reads `self.llm`.
    # llm: Optional[BaseLlm] = None
    name = "DuckDuckGoSearch"
    description = (
        "A tool for performing a DuckDuckGo search and extracting snippets and webpages. "
        "Input should be a search query."
    )
    args_schema: Type[DuckDuckGoSearchSchema] = DuckDuckGoSearchSchema

    class Config:
        arbitrary_types_allowed = True

    def _execute(self, query: str) -> list:
        """
        Execute the DuckDuckGo search tool.

        Args:
            query : The query to search for.

        Returns:
            The raw search results as a list (at most NUM_RESULTS_TO_USE
            entries); an empty list when the query is empty or nothing was found.
        """
        search_results = self.get_raw_duckduckgo_results(query)
        # NOTE: scrape + summarise pipeline is disabled; kept for reference.
        # links = []
        # for result in search_results:
        #     links.append(result["href"])
        # webpages = self.get_content_from_url(links)
        # results = self.get_formatted_webpages(search_results,
        #                                       webpages)  # array to store objects with keys :{"title":snippet , "body":webpage content, "links":link URL}
        # summary = self.summarise_result(query, results)  # summarize the content gathered using the function
        # links = [result["links"] for result in results if len(result["links"]) > 0]
        # if len(links) > 0:
        #     return summary + "\n\nLinks:\n" + "\n".join("- " + link for link in links[:3])
        return search_results

    def get_formatted_webpages(self, search_results, webpages):
        """
        Generate an array of formatted webpages which can be passed to the summarizer function (summarise_result).

        Args:
            search_results : The array of objects which were fetched by DuckDuckGo;
                each entry is indexed by "title" and "href".
            webpages : The webpage contents, positionally aligned with search_results.

        Returns:
            A list of dicts with keys "title", "body" (the webpage content) and
            "links" (the result URL). Entries are paired by position; surplus
            entries on either side are ignored.
        """
        results = []
        # zip pairs each page with its search hit and cannot run past the
        # shorter sequence (the previous manual index could raise IndexError).
        for result, webpage in zip(search_results, webpages):
            results.append({"title": result["title"], "body": webpage, "links": result["href"]})
            # if TokenCounter.count_text_tokens(json.dumps(results)) > 3000:
            #     break
        return results

    def get_content_from_url(self, links):
        """
        Generates a webpage array which stores the content fetched from the links

        NOTE: the extraction code below is commented out, so this currently
        ignores `links` and always returns an empty list.

        Args:
            links : The array of URLs which were fetched by DuckDuckGo.

        Returns:
            Returns a webpage array which stores the content fetched from the links
        """
        webpages = []  # webpages array for storing the contents extracted from the links
        # if links:
        #     for i in range(0, MAX_LINKS_TO_SCRAPE):  # using first 3 (Value of MAX_LINKS_TO_SCRAPE) links
        #         time.sleep(3)
        #         content = WebpageExtractor().extract_with_bs4(
        #             links[i])  # takes in the link and returns content extracted from Webpage extractor
        #         max_length = len(' '.join(content.split(" ")[:500]))
        #         content = content[:max_length]  # formating the content
        #         attempts = 0
        #         while content == "" and attempts < WEBPAGE_EXTRACTOR_MAX_ATTEMPTS:
        #             attempts += 1
        #             content = WebpageExtractor().extract_with_bs4(links[i])
        #             content = content[:max_length]
        #         webpages.append(content)
        return webpages

    def get_raw_duckduckgo_results(self, query):
        """
        Gets raw search results from the duckduckgosearch python package

        The search is re-issued up to DUCKDUCKGO_MAX_ATTEMPTS times while it
        keeps returning nothing.

        Args:
            query : The query to search for.

        Returns:
            A list with at most NUM_RESULTS_TO_USE raw results. An empty list is
            returned for an empty query or when every attempt came back empty.
        """
        # Nothing to search for: return an empty list so the return type matches
        # every other path (this used to return the JSON string "[]").
        if not query:
            return []

        search_results = []
        for _ in range(DUCKDUCKGO_MAX_ATTEMPTS):
            # Keep only the first NUM_RESULTS_TO_USE results of this attempt.
            search_results = list(islice(DDGS().text(query), NUM_RESULTS_TO_USE))
            if search_results:  # got results, no need to retry
                break
            # time.sleep(1)
        return search_results

    def summarise_result(self, query, snippets):
        """
        Summarise the result of a DuckDuckGo search.

        NOTE(review): this reads `self.llm` and `self.max_token_limit`, neither of
        which is declared in this class (the `llm` field is commented out) —
        confirm they are provided elsewhere before re-enabling this path.

        Args:
            query : The query to search for.
            snippets (list): A list of snippets from the search.

        Returns:
            A summary of the search result.
        """
        summarize_prompt = """Summarize the following text `{snippets}`
Write a concise or as descriptive as necessary and attempt to
answer the query: `{query}` as best as possible. Use markdown formatting for
longer responses."""
        # Single-pass substitution: a literal "{query}" occurring inside the
        # snippet text is left untouched (chained .replace() calls rewrote it).
        summarize_prompt = summarize_prompt.format(snippets=str(snippets), query=query)

        messages = [{"role": "system", "content": summarize_prompt}]
        result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)
        return result["content"]