Skip to content

Commit

Permalink
httpx branch create
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed Apr 3, 2024
1 parent b70fb38 commit e6526da
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 37 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Python package

on:
push:
branches: [ main ]
branches: [ httpx ]
pull_request:
branches: [ main ]
branches: [ httpx ]

jobs:
build:
Expand Down
8 changes: 4 additions & 4 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from urllib.parse import unquote

import click
from curl_cffi import requests
import httpx

from .duckduckgo_search import DDGS
from .utils import json_dumps
from .utils import _get_ssl_context, json_dumps
from .version import __version__

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -81,12 +81,12 @@ def _sanitize_keywords(keywords):

def _download_file(url, dir_path, filename, proxy):
try:
resp = requests.get(url, proxy=proxy, impersonate="chrome", timeout=10)
resp = httpx.get(url, proxy=proxy, timeout=10, follow_redirects=True, verify=_get_ssl_context())
resp.raise_for_status()
with open(os.path.join(dir_path, filename[:200]), "wb") as file:
file.write(resp.content)
except Exception as ex:
logger.debug(f"download_file url={url} {type(ex).__name__} {ex}")
logger.info(f"download_file url={url} {type(ex).__name__} {ex}")


def _download_results(keywords, results, images=False, proxy=None, threads=None):
Expand Down
6 changes: 3 additions & 3 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class DDGS(AsyncDDGS):
_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()
Thread(target=_loop.run_forever, daemon=True).start() # Start the event loop run in a separate thread.
Thread(target=_loop.run_forever, daemon=True).start() # Start the event loop in a separate thread.

def __init__(
self,
Expand Down Expand Up @@ -40,12 +40,12 @@ def __exit__(
self._close_session()

def __del__(self) -> None:
if self._asession._closed is False:
if self._asession.is_closed is False:
self._close_session()

def _close_session(self) -> None:
"""Close the httpx async client."""
self._run_async_in_thread(self._asession.close())
self._run_async_in_thread(self._asession.aclose())

def _run_async_in_thread(self, coro: Awaitable[Any]) -> Any:
"""Runs an async coroutine in a separate thread."""
Expand Down
48 changes: 26 additions & 22 deletions duckduckgo_search/duckduckgo_search_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from functools import cached_property, partial
from itertools import cycle, islice
from types import TracebackType
from typing import Dict, List, Optional, Tuple, Type, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Type, Union

from curl_cffi import requests
import httpx

try:
from lxml.html import Element, document_fromstring
from lxml.html import HTMLParser as LHTMLParser
from lxml.html import document_fromstring

LXML_AVAILABLE = True
except ImportError:
Expand All @@ -24,6 +24,8 @@
from .utils import (
_calculate_distance,
_extract_vqd,
_get_headers,
_get_ssl_context,
_normalize,
_normalize_url,
_text_extract_json,
Expand Down Expand Up @@ -58,12 +60,13 @@ def __init__(
if not proxy and proxies:
warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
self._asession = requests.AsyncSession(
headers=headers,
self._asession = httpx.AsyncClient(
headers=_get_headers() if headers is None else headers,
proxy=self.proxy,
timeout=timeout,
impersonate="chrome",
allow_redirects=False,
follow_redirects=False,
http2=True,
verify=_get_ssl_context(),
)
self._asession.headers["Referer"] = "https://duckduckgo.com/"
self._exception_event = asyncio.Event()
Expand All @@ -77,15 +80,15 @@ async def __aexit__(
exc_val: Optional[BaseException] = None,
exc_tb: Optional[TracebackType] = None,
) -> None:
await self._asession.close()
await self._asession.aclose()

def __del__(self) -> None:
if self._asession._closed is False:
if self._asession.is_closed is False:
with suppress(RuntimeError):
asyncio.create_task(self._asession.close())
asyncio.create_task(self._asession.aclose())

@cached_property
def parser(self) -> Optional["LHTMLParser"]:
def parser(self) -> "LHTMLParser":
"""Get HTML parser."""
return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

Expand All @@ -97,28 +100,29 @@ def _get_executor(cls, max_workers: int = 1) -> ThreadPoolExecutor:
return cls._executor

@property
def executor(cls) -> Optional[ThreadPoolExecutor]:
def executor(cls) -> ThreadPoolExecutor:
return cls._get_executor()

async def _aget_url(
self,
method: str,
url: str,
data: Optional[Union[Dict[str, str], bytes]] = None,
content: Optional[Union[str, bytes]] = None,
data: Optional[Dict[str, Any]] = None,
params: Optional[Dict[str, str]] = None,
) -> bytes:
if self._exception_event.is_set():
raise DuckDuckGoSearchException("Exception occurred in previous call.")
try:
resp = await self._asession.request(method, url, data=data, params=params)
resp = await self._asession.request(method, url, content=content, data=data, params=params)
except httpx.TimeoutException as ex:
self._exception_event.set()
raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
except Exception as ex:
self._exception_event.set()
if "time" in str(ex).lower():
raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
raise DuckDuckGoSearchException(f"{url} {type(ex).__name__}: {ex}") from ex
logger.debug(f"_aget_url() {resp.url} {resp.status_code} {resp.elapsed:.2f} {len(resp.content)}")
if resp.status_code == 200:
return cast(bytes, resp.content)
return resp.content
self._exception_event.set()
if resp.status_code in (202, 301, 403):
raise RatelimitException(f"{resp.url} {resp.status_code} Ratelimit")
Expand Down Expand Up @@ -303,7 +307,7 @@ async def _text_html_page(s: int, page: int) -> None:
if b"No results." in resp_content:
return

tree = await self._asession.loop.run_in_executor(
tree: Element = await asyncio.get_running_loop().run_in_executor(
self.executor, partial(document_fromstring, resp_content, self.parser)
)

Expand Down Expand Up @@ -382,7 +386,7 @@ async def _text_lite_page(s: int, page: int) -> None:
if b"No more results." in resp_content:
return

tree = await self._asession.loop.run_in_executor(
tree: Element = await asyncio.get_running_loop().run_in_executor(
self.executor, partial(document_fromstring, resp_content, self.parser)
)

Expand Down Expand Up @@ -854,7 +858,7 @@ async def maps(
lat_b -= Decimal(radius) * Decimal(0.008983)
lon_l -= Decimal(radius) * Decimal(0.008983)
lon_r += Decimal(radius) * Decimal(0.008983)
logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")
logger.info(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

cache = set()
results: List[Dict[str, str]] = []
Expand Down Expand Up @@ -981,7 +985,7 @@ async def _translate_keyword(keyword: str) -> None:
"POST",
"https://duckduckgo.com/translation.js",
params=payload,
data=keyword.encode(),
content=keyword,
)
page_data = json_loads(resp_content)
page_data["original"] = keyword
Expand Down
Loading

0 comments on commit e6526da

Please sign in to comment.