AsyncDDGS.__init__(): add 'self._parser' for reuse lxml HtmlParser wi…

…thin class instance
deedy5 · Mar 13, 2024 · e400583 · e400583
1 parent ebefde6
commit e400583
Showing 1 changed file with 23 additions and 3 deletions.
diff --git a/duckduckgo_search/duckduckgo_search_async.py b/duckduckgo_search/duckduckgo_search_async.py
@@ -4,6 +4,7 @@
 from contextlib import suppress
 from datetime import datetime, timezone
 from decimal import Decimal
+from functools import partial
 from itertools import cycle, islice
 from types import TracebackType
 from typing import Dict, List, Optional, Tuple, Union
@@ -28,7 +29,7 @@
 class AsyncDDGS:
     """DuckDuckgo_search async class to get search results from duckduckgo.com."""
 
-    _executor = ThreadPoolExecutor()
+    _executor: Optional[ThreadPoolExecutor] = None
 
     def __init__(
         self,
@@ -51,6 +52,7 @@ def __init__(
             headers=headers, proxies=self.proxies, timeout=timeout, impersonate="chrome"
         )
         self._asession.headers["Referer"] = "https://duckduckgo.com/"
+        self._parser: Optional[html.HTMLParser] = None
 
     async def __aenter__(self) -> "AsyncDDGS":
         """A context manager method that is called when entering the 'with' statement."""
@@ -62,6 +64,20 @@ async def __aexit__(
         """Closes the session."""
         await self._asession.close()
 
+    def _get_parser(self) -> html.HTMLParser:
+        """Get HTML parser."""
+        if self._parser is None:
+            self._parser = html.HTMLParser(
+                remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False
+            )
+        return self._parser
+
+    def _get_executor(self, workers: int = 10) -> ThreadPoolExecutor:
+        """Get ThreadPoolExecutor."""
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=workers)
+        return self._executor
+
     async def _aget_url(
         self,
         method: str,
@@ -248,7 +264,9 @@ async def _text_html_page(s: int, page: int) -> None:
             if b"No  results." in resp_content:
                 return
 
-            tree = await self._asession.loop.run_in_executor(self._executor, html.document_fromstring, resp_content)
+            tree = await self._asession.loop.run_in_executor(
+                self._get_executor(), partial(html.document_fromstring, resp_content, self._get_parser())
+            )
 
             for e in tree.xpath("//div[h2]"):
                 href = e.xpath("./a/@href")
@@ -323,7 +341,9 @@ async def _text_lite_page(s: int, page: int) -> None:
             if b"No more results." in resp_content:
                 return
 
-            tree = await self._asession.loop.run_in_executor(self._executor, html.document_fromstring, resp_content)
+            tree = await self._asession.loop.run_in_executor(
+                self._get_executor(), partial(html.document_fromstring, resp_content, self._get_parser())
+            )
 
             data = zip(cycle(range(1, 5)), tree.xpath("//table[last()]//tr"))
             for i, e in data: