From 9774700056e53d6367fef4b52e49e93defd9d5ac Mon Sep 17 00:00:00 2001 From: danieldotnl Date: Tue, 7 May 2024 13:35:07 +0000 Subject: [PATCH 1/2] Support cookies between requests --- custom_components/multiscrape/coordinator.py | 25 +++--- custom_components/multiscrape/form.py | 21 +++-- custom_components/multiscrape/http.py | 80 +++++++++++--------- 3 files changed, 63 insertions(+), 63 deletions(-) diff --git a/custom_components/multiscrape/coordinator.py b/custom_components/multiscrape/coordinator.py index 4f3008a..b526a5c 100644 --- a/custom_components/multiscrape/coordinator.py +++ b/custom_components/multiscrape/coordinator.py @@ -1,26 +1,22 @@ """Coordinator class for multiscrape integration.""" import logging -from datetime import timedelta from collections.abc import Callable +from datetime import timedelta +from homeassistant.const import (CONF_RESOURCE, CONF_RESOURCE_TEMPLATE, + CONF_SCAN_INTERVAL) from homeassistant.core import HomeAssistant -from homeassistant.const import ( - CONF_RESOURCE, - CONF_RESOURCE_TEMPLATE, - CONF_SCAN_INTERVAL, -) -from homeassistant.helpers.update_coordinator import DataUpdateCoordinator -from homeassistant.helpers.update_coordinator import event +from homeassistant.helpers.update_coordinator import (DataUpdateCoordinator, + event) from homeassistant.util.dt import utcnow -from .scraper import Scraper -from .http import HttpWrapper +from .const import DOMAIN from .file import LoggingFileManager from .form import FormSubmitter +from .http import HttpWrapper +from .scraper import Scraper from .util import create_renderer -from .const import DOMAIN - _LOGGER = logging.getLogger(__name__) # we don't want to go with the default 15 seconds defined in helpers/entity_component DEFAULT_SCAN_INTERVAL = timedelta(seconds=60) @@ -56,6 +52,7 @@ def __init__( self._http = http self._form_submitter = form self._resource_renderer = resource_renderer + self._cookies = None def notify_scrape_exception(self): """Notify the form_submitter of an exception so it will re-submit next trigger.""" @@ -68,7 +65,7 @@ async def get_content(self) -> str: if self._form_submitter: try: - result = await self._form_submitter.async_submit(resource) + result, self._cookies = await self._form_submitter.async_submit(resource) if result: _LOGGER.debug( @@ -83,7 +80,7 @@ async def get_content(self) -> str: ex, ) - response = await self._http.async_request("page", resource) + response = await self._http.async_request("page", resource, self._cookies) return response.text diff --git a/custom_components/multiscrape/form.py b/custom_components/multiscrape/form.py index 54e9a59..8073501 100644 --- a/custom_components/multiscrape/form.py +++ b/custom_components/multiscrape/form.py @@ -3,21 +3,15 @@ from urllib.parse import urljoin from bs4 import BeautifulSoup - -from homeassistant.core import HomeAssistant from homeassistant.const import CONF_RESOURCE +from homeassistant.core import HomeAssistant -from .const import ( - CONF_FORM_SELECT, - CONF_FORM_INPUT, - CONF_FORM_INPUT_FILTER, - CONF_FORM_SUBMIT_ONCE, - CONF_FORM_RESUBMIT_ERROR, -) +from .const import (CONF_FORM_INPUT, CONF_FORM_INPUT_FILTER, + CONF_FORM_RESUBMIT_ERROR, CONF_FORM_SELECT, + CONF_FORM_SUBMIT_ONCE) from .file import LoggingFileManager from .http import HttpWrapper - _LOGGER = logging.getLogger(__name__) @@ -76,6 +70,7 @@ def __init__( self._resubmit_error = resubmit_error self._parser = parser self._should_submit = True + self._cookies = None def notify_scrape_exception(self): """Make sure form is re-submitted after an exception.""" @@ -141,6 +136,7 @@ async def async_submit(self, main_resource): submit_resource, method=method, request_data=input_fields, + cookies=self._cookies ) _LOGGER.debug( "%s # Form seems to be submitted successfully (to be sure, use log_response and check file). Now continuing to retrieve target page.", @@ -151,9 +147,9 @@ async def async_submit(self, main_resource): self._should_submit = False if not self._form_resource: - return response.text + return response.text, response.cookies else: - return None + return None, response.cookies def _determine_submit_resource(self, action, main_resource): resource = main_resource @@ -182,6 +178,7 @@ async def _fetch_form_page(self, resource): resource, "GET", ) + self._cookies = response.cookies return response.text def _get_input_fields(self, form): diff --git a/custom_components/multiscrape/http.py b/custom_components/multiscrape/http.py index efe3005..bbca8f6 100644 --- a/custom_components/multiscrape/http.py +++ b/custom_components/multiscrape/http.py @@ -1,21 +1,15 @@ """HTTP request related functionality.""" +import asyncio import logging from collections.abc import Callable -import httpx +import httpx +from homeassistant.const import (CONF_AUTHENTICATION, CONF_HEADERS, + CONF_METHOD, CONF_PARAMS, CONF_PASSWORD, + CONF_PAYLOAD, CONF_TIMEOUT, CONF_USERNAME, + CONF_VERIFY_SSL, HTTP_DIGEST_AUTHENTICATION) from homeassistant.helpers.httpx_client import get_async_client -from homeassistant.const import ( - HTTP_DIGEST_AUTHENTICATION, - CONF_VERIFY_SSL, - CONF_USERNAME, - CONF_PASSWORD, - CONF_AUTHENTICATION, - CONF_TIMEOUT, - CONF_HEADERS, - CONF_PARAMS, - CONF_PAYLOAD, - CONF_METHOD, -) + from .util import create_dict_renderer, create_renderer _LOGGER = logging.getLogger(__name__) @@ -86,7 +80,7 @@ def set_authentication(self, username, password, auth_type): self._auth = (username, password) _LOGGER.debug("%s # Authentication configuration processed", self._config_name) - async def async_request(self, context, resource, method=None, request_data=None): + async def async_request(self, context, resource, method=None, request_data=None, cookies=None): """Execute a HTTP request.""" data = request_data or self._data_renderer() method = method or self._method or "GET" @@ -94,16 +88,19 @@ async def async_request(self, context, resource, method=None, request_data=None) params = self._params_renderer(None) _LOGGER.debug( - "%s # Executing %s-request with a %s to url: %s with headers: %s.", + "%s # Executing %s-request with a %s to url: %s with headers: %s and cookies: %s.", self._config_name, context, method, resource, headers, + cookies ) if self._file_manager: - await self._async_file_log("request_headers", context, headers) - await self._async_file_log("request_body", context, data) + task1 = self._async_file_log("request_headers", context, headers) + task2 = self._async_file_log("request_body", context, data) + task3 = self._async_file_log("request_cookies", context, cookies) + await asyncio.gather(task1, task2, task3) response = None @@ -117,6 +114,7 @@ async def async_request(self, context, resource, method=None, request_data=None) data=data, timeout=self._timeout, follow_redirects=True, + cookies=cookies ) _LOGGER.debug( @@ -125,10 +123,12 @@ async def async_request(self, context, resource, method=None, request_data=None) response.status_code, ) if self._file_manager: - await self._async_file_log( + task1 = self._async_file_log( "response_headers", context, response.headers ) - await self._async_file_log("response_body", context, response.text) + task2 = self._async_file_log("response_body", context, response.text) + task3 = self._async_file_log("response_cookies", context, response.cookies) + await asyncio.gather(task1, task2, task3) # bit of a hack since httpx also raises an exception for redirects: https://github.com/encode/httpx/blob/c6c8cb1fe2da9380f8046a19cdd5aade586f69c8/CHANGELOG.md#0200-13th-october-2021 if 400 <= response.status_code <= 599: @@ -168,36 +168,42 @@ async def async_request(self, context, resource, method=None, request_data=None) async def _handle_request_exception(self, context, response): try: if self._file_manager: - await self._async_file_log( + task1 = self._async_file_log( "response_headers_error", context, response.headers ) - await self._async_file_log( + task2 = self._async_file_log( "response_body_error", context, response.text ) + task3 = self._async_file_log( + "response_cookies_error", context, response.cookies + ) + await asyncio.gather(task1, task2, task3) except Exception as exc: _LOGGER.debug( - "%s # Unable to write headers and body to files during handling of exception.\n Error message:\n %s", + "%s # Unable to write headers, cookies and/or body to file during handling of exception.\n Error message:\n %s", self._config_name, repr(exc), ) async def _async_file_log(self, content_name, context, content): - try: - filename = f"{context}_{content_name}.txt" - await self._hass.async_add_executor_job( - self._file_manager.write, filename, content - ) - except Exception as ex: - _LOGGER.error( - "%s # Unable to write %s to file: %s. \nException: %s", + """Write content to a file if content is not None.""" + if content is not None: + try: + filename = f"{context}_{content_name}.txt" + await self._hass.async_add_executor_job( + self._file_manager.write, filename, content + ) + except Exception as ex: + _LOGGER.error( + "%s # Unable to write %s to file: %s. \nException: %s", + self._config_name, + content_name, + filename, + ex, + ) + _LOGGER.debug( + "%s # %s written to file: %s", self._config_name, content_name, filename, - ex, ) - _LOGGER.debug( - "%s # %s written to file: %s", - self._config_name, - content_name, - filename, - ) From 8f2a31a53415b332ac6ab59d0170f4618fe60b9e Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 11 May 2024 10:08:12 +0000 Subject: [PATCH 2/2] Fix assigning cookies to wrong variable --- custom_components/multiscrape/coordinator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_components/multiscrape/coordinator.py b/custom_components/multiscrape/coordinator.py index b526a5c..d9fa777 100644 --- a/custom_components/multiscrape/coordinator.py +++ b/custom_components/multiscrape/coordinator.py @@ -80,7 +80,7 @@ async def get_content(self) -> str: ex, ) - response = await self._http.async_request("page", resource, self._cookies) + response = await self._http.async_request("page", resource, cookies=self._cookies) return response.text