diff --git a/pyproject.toml b/pyproject.toml
index c7b17f6..a983ba5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@
   # Text to speech
   beautifulsoup4 = "^4.12.3"
   chime = "^0.7.0"
-  duckduckgo-search = "^4.4.3"
+  duckduckgo-search = "^4.5.0"
   gtts = "^2.5.1"
   httpx = "^0.26.0"
   ipinfo = "^5.0.1"
diff --git a/pyrobbot/internet_utils.py b/pyrobbot/internet_utils.py
index be5fe0b..e90faaa 100644
--- a/pyrobbot/internet_utils.py
+++ b/pyrobbot/internet_utils.py
@@ -1,12 +1,13 @@
 """Internet search module for the package."""
+import asyncio
 import re
 
 import numpy as np
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Comment
-from duckduckgo_search import DDGS
+from duckduckgo_search import AsyncDDGS
 from loguru import logger
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -67,56 +68,80 @@ def find_whole_word_index(my_string, my_substring):
     return -1  # Substring not found
 
 
+async def async_raw_websearch(
+    query: str,
+    max_results: int = 5,
+    region: str = GeneralDefinitions.IPINFO["country_name"],
+):
+    """Search the web using DuckDuckGo Search API."""
+    async with AsyncDDGS() as ddgs:
+        async_results = [
+            result
+            async for result in ddgs.text(
+                keywords=query,
+                region=region,
+                max_results=max_results,
+                backend="html",
+            )
+        ]
+    return async_results
+
+
 def raw_websearch(
     query: str,
     max_results: int = 5,
     region: str = GeneralDefinitions.IPINFO["country_name"],
 ):
     """Search the web using DuckDuckGo Search API."""
-    with DDGS() as ddgs:
-        for result in ddgs.text(
-            keywords=query,
-            region=region,
-            max_results=max_results,
-            backend="html",
-        ):
-            if not isinstance(result, dict):
-                logger.error("Expected a `dict`, got type {}: {}", type(result), result)
-                yield {}
-                continue
+    raw_results = asyncio.run(
+        async_raw_websearch(query=query, max_results=max_results, region=region)
+    )
 
-            if result["body"] is None:
-                continue
+    # Accumulate into a NEW list: appending to the list being iterated would
+    # re-visit the appended items and return raw hits mixed with processed ones.
+    results = []
+    for result in raw_results:
+        if not isinstance(result, dict):
+            logger.error("Expected a `dict`, got type {}: {}", type(result), result)
+            results.append({})
+            continue
 
-            try:
-                response = requests.get(result["href"], allow_redirects=False, timeout=10)
-            except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
+        if result.get("body") is None:
+            continue
+
+        try:
+            response = requests.get(result["href"], allow_redirects=False, timeout=10)
+        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
+            continue
+        else:
+            content_type = response.headers.get("content-type")
+            if (not content_type) or ("text/html" not in content_type):
                 continue
-            else:
-                content_type = response.headers.get("content-type")
-                if (not content_type) or ("text/html" not in content_type):
-                    continue
 
-            html = unidecode(extract_text_from_html(response.text))
+        html = unidecode(extract_text_from_html(response.text))
 
-            summary = unidecode(result["body"])
-            relevance = cosine_similarity_sentences(query.lower(), summary.lower())
+        summary = unidecode(result["body"])
+        relevance = cosine_similarity_sentences(query.lower(), summary.lower())
 
-            relevance_threshold = 1e-2
-            if relevance < relevance_threshold:
-                continue
+        relevance_threshold = 1e-2
+        if relevance < relevance_threshold:
+            continue
 
-            yield {
-                "href": result["href"],
-                "summary": summary,
-                "detailed": html,
-                "relevance": relevance,
-            }
+        results.append(
+            {
+                "href": result["href"],
+                "summary": summary,
+                "detailed": html,
+                "relevance": relevance,
+            }
+        )
+    return results
 
 
 @retry(error_msg="Error performing web search")
 def websearch(query, **kwargs):
     """Search the web using DuckDuckGo Search API."""
-    raw_results = list(raw_websearch(query, **kwargs))
+    raw_results = raw_websearch(query, **kwargs)
     raw_results = iter(
         sorted(raw_results, key=lambda x: x.get("relevance", 0.0), reverse=True)
     )