httpx branch create

deedy5 · Apr 3, 2024 · e6526da
1 parent b70fb38
commit e6526da
Show file tree

Hide file tree

Showing 6 changed files with 318 additions and 37 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -5,9 +5,9 @@ name: Python package
 
 on:
   push:
-    branches: [ main ]
+    branches: [ httpx ]
   pull_request:
-    branches: [ main ]
+    branches: [ httpx ]
 
 jobs:
   build:

diff --git a/duckduckgo_search/cli.py b/duckduckgo_search/cli.py
@@ -6,10 +6,10 @@
 from urllib.parse import unquote
 
 import click
-from curl_cffi import requests
+import httpx
 
 from .duckduckgo_search import DDGS
-from .utils import json_dumps
+from .utils import _get_ssl_context, json_dumps
 from .version import __version__
 
 logger = logging.getLogger(__name__)
@@ -81,12 +81,12 @@ def _sanitize_keywords(keywords):
 
 def _download_file(url, dir_path, filename, proxy):
     try:
-        resp = requests.get(url, proxy=proxy, impersonate="chrome", timeout=10)
+        resp = httpx.get(url, proxy=proxy, timeout=10, follow_redirects=True, verify=_get_ssl_context())
         resp.raise_for_status()
         with open(os.path.join(dir_path, filename[:200]), "wb") as file:
             file.write(resp.content)
     except Exception as ex:
-        logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
+        logger.info(f"download_file url={url} {type(ex).__name__} {ex}")
 
 
 def _download_results(keywords, results, images=False, proxy=None, threads=None):

diff --git a/duckduckgo_search/duckduckgo_search.py b/duckduckgo_search/duckduckgo_search.py
@@ -9,7 +9,7 @@
 
 class DDGS(AsyncDDGS):
     _loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
-    Thread(target=_loop.run_forever, daemon=True).start()  # Start the event loop run in a separate thread.
+    Thread(target=_loop.run_forever, daemon=True).start()  # Start the event loop in a separate thread.
 
     def __init__(
         self,
@@ -40,12 +40,12 @@ def __exit__(
         self._close_session()
 
     def __del__(self) -> None:
-        if self._asession._closed is False:
+        if self._asession.is_closed is False:
             self._close_session()
 
     def _close_session(self) -> None:
         """Close the curl-cffi async session."""
-        self._run_async_in_thread(self._asession.close())
+        self._run_async_in_thread(self._asession.aclose())
 
     def _run_async_in_thread(self, coro: Awaitable[Any]) -> Any:
         """Runs an async coroutine in a separate thread."""

diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py
@@ -8,13 +8,13 @@
 from functools import cached_property, partial
 from itertools import cycle, islice
 from types import TracebackType
-from typing import Dict, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
-from curl_cffi import requests
+import httpx
 
 try:
+    from lxml.html import Element, document_fromstring
     from lxml.html import HTMLParser as LHTMLParser
-    from lxml.html import document_fromstring
 
     LXML_AVAILABLE = True
 except ImportError:
@@ -24,6 +24,8 @@
 from .utils import (
     _calculate_distance,
     _extract_vqd,
+    _get_headers,
+    _get_ssl_context,
     _normalize,
     _normalize_url,
     _text_extract_json,
@@ -58,12 +60,13 @@ def __init__(
         if not proxy and proxies:
             warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
             self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
-        self._asession = requests.AsyncSession(
-            headers=headers,
+        self._asession = httpx.AsyncClient(
+            headers=_get_headers() if headers is None else headers,
             proxy=self.proxy,
             timeout=timeout,
-            impersonate="chrome",
-            allow_redirects=False,
+            follow_redirects=False,
+            http2=True,
+            verify=_get_ssl_context(),
         )
         self._asession.headers["Referer"] = "https://duckduckgo.com/"
         self._exception_event = asyncio.Event()
@@ -77,15 +80,15 @@ async def __aexit__(
         exc_val: Optional[BaseException] = None,
         exc_tb: Optional[TracebackType] = None,
     ) -> None:
-        await self._asession.close()
+        await self._asession.aclose()
 
     def __del__(self) -> None:
-        if self._asession._closed is False:
+        if self._asession.is_closed is False:
             with suppress(RuntimeError):
-                asyncio.create_task(self._asession.close())
+                asyncio.create_task(self._asession.aclose())
 
     @cached_property
-    def parser(self) -> Optional["LHTMLParser"]:
+    def parser(self) -> "LHTMLParser":
         """Get HTML parser."""
         return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)
 
@@ -97,28 +100,29 @@ def _get_executor(cls, max_workers: int = 1) -> ThreadPoolExecutor:
         return cls._executor
 
     @property
-    def executor(cls) -> Optional[ThreadPoolExecutor]:
+    def executor(cls) -> ThreadPoolExecutor:
         return cls._get_executor()
 
     async def _aget_url(
         self,
         method: str,
         url: str,
-        data: Optional[Union[Dict[str, str], bytes]] = None,
+        content: Optional[Union[str, bytes]] = None,
+        data: Optional[Dict[str, Any]] = None,
         params: Optional[Dict[str, str]] = None,
     ) -> bytes:
         if self._exception_event.is_set():
             raise DuckDuckGoSearchException("Exception occurred in previous call.")
         try:
-            resp = await self._asession.request(method, url, data=data, params=params)
+            resp = await self._asession.request(method, url, content=content, data=data, params=params)
+        except httpx.TimeoutException as ex:
+            self._exception_event.set()
+            raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
         except Exception as ex:
             self._exception_event.set()
-            if "time" in str(ex).lower():
-                raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
             raise DuckDuckGoSearchException(f"{url} {type(ex).__name__}: {ex}") from ex
-        logger.debug(f"_aget_url() {resp.url} {resp.status_code} {resp.elapsed:.2f} {len(resp.content)}")
         if resp.status_code == 200:
-            return cast(bytes, resp.content)
+            return resp.content
         self._exception_event.set()
         if resp.status_code in (202, 301, 403):
             raise RatelimitException(f"{resp.url} {resp.status_code} Ratelimit")
@@ -303,7 +307,7 @@ async def _text_html_page(s: int, page: int) -> None:
             if b"No  results." in resp_content:
                 return
 
-            tree = await self._asession.loop.run_in_executor(
+            tree: Element = await asyncio.get_running_loop().run_in_executor(
                 self.executor, partial(document_fromstring, resp_content, self.parser)
             )
 
@@ -382,7 +386,7 @@ async def _text_lite_page(s: int, page: int) -> None:
             if b"No more results." in resp_content:
                 return
 
-            tree = await self._asession.loop.run_in_executor(
+            tree: Element = await asyncio.get_running_loop().run_in_executor(
                 self.executor, partial(document_fromstring, resp_content, self.parser)
             )
 
@@ -854,7 +858,7 @@ async def maps(
         lat_b -= Decimal(radius) * Decimal(0.008983)
         lon_l -= Decimal(radius) * Decimal(0.008983)
         lon_r += Decimal(radius) * Decimal(0.008983)
-        logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
+        logger.info(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
 
         cache = set()
         results: List[Dict[str, str]] = []
@@ -981,7 +985,7 @@ async def _translate_keyword(keyword: str) -> None:
                 "POST",
                 "https://duckduckgo.com/translation.js",
                 params=payload,
-                data=keyword.encode(),
+                content=keyword,
             )
             page_data = json_loads(resp_content)
             page_data["original"] = keyword