From df9ca8cc472ba90d62d6cc8f365bba7e81169ebb Mon Sep 17 00:00:00 2001 From: danieldotnl Date: Tue, 27 Aug 2024 08:09:46 +0000 Subject: [PATCH 1/5] Add different extract options --- custom_components/multiscrape/const.py | 3 +++ custom_components/multiscrape/schema.py | 14 ++++++++------ custom_components/multiscrape/scraper.py | 19 ++++++++++++++----- custom_components/multiscrape/selector.py | 9 +++++---- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/custom_components/multiscrape/const.py b/custom_components/multiscrape/const.py index ef87737..e7098a0 100644 --- a/custom_components/multiscrape/const.py +++ b/custom_components/multiscrape/const.py @@ -33,7 +33,10 @@ CONF_FORM_RESUBMIT_ERROR = "resubmit_on_error" CONF_FORM_VARIABLES = "variables" CONF_LOG_RESPONSE = "log_response" +CONF_EXTRACT = "extract" +EXTRACT_OPTIONS = ["text", "content", "tag"] DEFAULT_PARSER = "lxml" +DEFAULT_EXTRACT = "text" CONF_FIELDS = "fields" diff --git a/custom_components/multiscrape/schema.py b/custom_components/multiscrape/schema.py index 0263d97..2b641b7 100644 --- a/custom_components/multiscrape/schema.py +++ b/custom_components/multiscrape/schema.py @@ -24,9 +24,9 @@ HTTP_BASIC_AUTHENTICATION, HTTP_DIGEST_AUTHENTICATION) -from .const import (CONF_ATTR, CONF_FORM_INPUT, CONF_FORM_INPUT_FILTER, - CONF_FORM_RESUBMIT_ERROR, CONF_FORM_SELECT, - CONF_FORM_SUBMIT, CONF_FORM_SUBMIT_ONCE, +from .const import (CONF_ATTR, CONF_EXTRACT, CONF_FORM_INPUT, + CONF_FORM_INPUT_FILTER, CONF_FORM_RESUBMIT_ERROR, + CONF_FORM_SELECT, CONF_FORM_SUBMIT, CONF_FORM_SUBMIT_ONCE, CONF_FORM_VARIABLES, CONF_LOG_RESPONSE, CONF_ON_ERROR, CONF_ON_ERROR_DEFAULT, CONF_ON_ERROR_LOG, CONF_ON_ERROR_VALUE, CONF_ON_ERROR_VALUE_DEFAULT, @@ -34,9 +34,10 @@ CONF_PARSER, CONF_PICTURE, CONF_SELECT, CONF_SELECT_LIST, CONF_SENSOR_ATTRS, CONF_SEPARATOR, CONF_STATE_CLASS, DEFAULT_BINARY_SENSOR_NAME, DEFAULT_BUTTON_NAME, - DEFAULT_FORCE_UPDATE, DEFAULT_METHOD, DEFAULT_PARSER, - DEFAULT_SENSOR_NAME, 
DEFAULT_SEPARATOR, DEFAULT_VERIFY_SSL, - DOMAIN, LOG_ERROR, LOG_LEVELS, METHODS) + DEFAULT_EXTRACT, DEFAULT_FORCE_UPDATE, DEFAULT_METHOD, + DEFAULT_PARSER, DEFAULT_SENSOR_NAME, DEFAULT_SEPARATOR, + DEFAULT_VERIFY_SSL, DOMAIN, EXTRACT_OPTIONS, LOG_ERROR, + LOG_LEVELS, METHODS) from .scraper import DEFAULT_TIMEOUT _LOGGER = logging.getLogger(__name__) @@ -84,6 +85,7 @@ vol.Optional(CONF_ATTR): cv.string, vol.Optional(CONF_VALUE_TEMPLATE): cv.template, vol.Optional(CONF_ON_ERROR): vol.Schema(ON_ERROR_SCHEMA), + vol.Optional(CONF_EXTRACT, default=DEFAULT_EXTRACT): vol.In(EXTRACT_OPTIONS), } FORM_HEADERS_MAPPING_SCHEMA = {vol.Required(CONF_NAME): cv.string, **SELECTOR_SCHEMA} diff --git a/custom_components/multiscrape/scraper.py b/custom_components/multiscrape/scraper.py index 2f37aef..ae9cb68 100644 --- a/custom_components/multiscrape/scraper.py +++ b/custom_components/multiscrape/scraper.py @@ -126,7 +126,7 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}): ) values = [tag[selector.attribute] for tag in tags] else: - values = [tag.text for tag in tags] + values = [self.extract_tag_value(tag, selector) for tag in tags] value = self._separator.join(values) _LOGGER.debug("%s # List selector csv: %s", log_prefix, value) @@ -142,10 +142,7 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}): ) value = tag[selector.attribute] else: - if tag.name in ("style", "script", "template"): - value = tag.string - else: - value = tag.text + value = self.extract_tag_value(tag, selector) _LOGGER.debug("%s # Selector result: %s", log_prefix, value) if value is not None and selector.value_template is not None: @@ -161,6 +158,18 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}): ) return value + def extract_tag_value(self, tag, selector): + """Extract value from a tag.""" + if tag.name in ("style", "script", "template"): + return tag.string + else: + if selector.extract == "text": + return tag.text + elif 
selector.extract == "content": + return ''.join(map(str, tag.contents)) + elif selector.extract == "tag": + return tag + async def _async_file_log(self, content_name, content): try: filename = f"{content_name}.txt" diff --git a/custom_components/multiscrape/selector.py b/custom_components/multiscrape/selector.py index 55f3882..0bde787 100644 --- a/custom_components/multiscrape/selector.py +++ b/custom_components/multiscrape/selector.py @@ -3,10 +3,10 @@ from homeassistant.const import CONF_VALUE_TEMPLATE -from .const import (CONF_ATTR, CONF_ON_ERROR, CONF_ON_ERROR_DEFAULT, - CONF_ON_ERROR_LOG, CONF_ON_ERROR_VALUE, CONF_SELECT, - CONF_SELECT_LIST, DEFAULT_ON_ERROR_LOG, - DEFAULT_ON_ERROR_VALUE) +from .const import (CONF_ATTR, CONF_EXTRACT, CONF_ON_ERROR, + CONF_ON_ERROR_DEFAULT, CONF_ON_ERROR_LOG, + CONF_ON_ERROR_VALUE, CONF_SELECT, CONF_SELECT_LIST, + DEFAULT_ON_ERROR_LOG, DEFAULT_ON_ERROR_VALUE) class Selector: @@ -27,6 +27,7 @@ def __init__(self, hass, conf): if self.value_template and self.value_template.hass is None: self.value_template.hass = hass + self.extract = conf.get(CONF_EXTRACT) self.on_error = self.create_on_error(conf.get(CONF_ON_ERROR), hass) if ( From 80d5c6b0b418e9c26bce8302aefbdd4a0b76b708 Mon Sep 17 00:00:00 2001 From: danieldotnl Date: Tue, 27 Aug 2024 08:36:03 +0000 Subject: [PATCH 2/5] Return tag as string --- custom_components/multiscrape/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_components/multiscrape/scraper.py b/custom_components/multiscrape/scraper.py index ae9cb68..450881b 100644 --- a/custom_components/multiscrape/scraper.py +++ b/custom_components/multiscrape/scraper.py @@ -168,7 +168,7 @@ def extract_tag_value(self, tag, selector): elif selector.extract == "content": return ''.join(map(str, tag.contents)) elif selector.extract == "tag": - return tag + return str(tag) async def _async_file_log(self, content_name, content): try: From b05b74517e0c5d2e655b79ca78291051e37b98b3 Mon Sep 17 00:00:00 
2001 From: danieldotnl Date: Tue, 27 Aug 2024 08:38:37 +0000 Subject: [PATCH 3/5] Add scraper tests for extract setting --- tests/test_scraper.py | 108 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/test_scraper.py diff --git a/tests/test_scraper.py b/tests/test_scraper.py new file mode 100644 index 0000000..58595b3 --- /dev/null +++ b/tests/test_scraper.py @@ -0,0 +1,108 @@ +"""Tests for scraper class.""" +from homeassistant.core import HomeAssistant +from homeassistant.helpers.template import Template + +from custom_components.multiscrape.const import DEFAULT_SEPARATOR +from custom_components.multiscrape.scraper import Scraper +from custom_components.multiscrape.selector import Selector + + +async def test_scrape_extract_text(hass: HomeAssistant) -> None: + """Test scraping and extract text method.""" + scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR) + await scraper.set_content( + "
" + "

Current Version: 2024.8.3

Released: January 17, 2022" + "" + "
" + "" + "
" + "

Current Time:

2022-12-22T13:15:30Z" + "
" + ) + + selector_conf = { + "select": Template(".current-version h1", hass), + "extract": "text", + } + + selector = Selector(hass, selector_conf) + value = scraper.scrape(selector, "test_sensor") + assert value == "Current Version: 2024.8.3" + +async def test_scrape_extract_content(hass: HomeAssistant) -> None: + """Test scraping and extract contents method.""" + scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR) + await scraper.set_content( + "
" + "

Current Version: 2024.8.3

Released: January 17, 2022" + "" + "
" + "" + "
" + "

Current Time:

2022-12-22T13:15:30Z" + "
" + ) + + selector_conf = { + "select": Template(".links", hass), + "extract": "content", + } + + selector = Selector(hass, selector_conf) + value = scraper.scrape(selector, "test_sensor") + assert value == 'Release notes' + +async def test_scrape_extract_tag(hass: HomeAssistant) -> None: + """Test scraping and extract tag method.""" + scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR) + await scraper.set_content( + "
" + "

Current Version: 2024.8.3

Released: January 17, 2022" + "" + "
" + "" + "
" + "

Current Time:

2022-12-22T13:15:30Z" + "
" + ) + + selector_conf = { + "select": Template(".links", hass), + "extract": "tag", + } + + selector = Selector(hass, selector_conf) + value = scraper.scrape(selector, "test_sensor") + assert value == '' + +async def test_scrape_extract_attribute(hass: HomeAssistant) -> None: + """Test scraping and extract an attribute value.""" + scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR) + await scraper.set_content( + "
" + "

Current Version: 2024.8.3

Released: January 17, 2022" + "" + "
" + "" + "
" + "

Current Time:

2022-12-22T13:15:30Z" + "
" + ) + + selector_conf = { + "select": Template(".links a", hass), + "attribute": "href", + } + + selector = Selector(hass, selector_conf) + value = scraper.scrape(selector, "test_sensor") + assert value == '/latest-release-notes/' + + + + From 3f84c125b2330f0ef1fc87441c91c3e5f22df33c Mon Sep 17 00:00:00 2001 From: danieldotnl Date: Tue, 27 Aug 2024 08:43:36 +0000 Subject: [PATCH 4/5] Clarify comment --- tests/test_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 58595b3..b27c065 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -80,7 +80,7 @@ async def test_scrape_extract_tag(hass: HomeAssistant) -> None: assert value == '' async def test_scrape_extract_attribute(hass: HomeAssistant) -> None: - """Test scraping and extract an attribute value.""" + """Test scraping and extract an HTML attribute value.""" scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR) await scraper.set_content( "
" From 1cccc6b80ec65cb544f7d8bac260dcf7ea86995d Mon Sep 17 00:00:00 2001 From: danieldotnl Date: Tue, 27 Aug 2024 08:45:22 +0000 Subject: [PATCH 5/5] Update readme with extract setting --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 602aa73..8974c97 100644 --- a/README.md +++ b/README.md @@ -184,13 +184,14 @@ multiscrape: Used to configure scraping options. -| name | description | required | default | type | -| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | --------------- | -| select | CSS selector used for retrieving the value of the attribute. Only required when `select_list` or `value_template` is not provided. | False | | string/template | -| select_list | CSS selector for multiple values of multiple elements which will be returned as csv. Only required when `select` or `value_template` is not provided. | False | | string/template | -| attribute | Attribute from the selected element to read as value. | False | | string | -| value_template | Defines a template applied to extract the value from the result of the selector (if provided) or raw page (if selector not provided) | False | | string/template | -| on_error | See [On-error](#on-error) | False | | | +| name | description | required | default | type | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | ------- | --------------- | +| select | CSS selector used for retrieving the value of the attribute. Only required when `select_list` or `value_template` is not provided. 
| False | | string/template | +| select_list | CSS selector for multiple values of multiple elements which will be returned as csv. Only required when `select` or `value_template` is not provided. | False | | string/template | +| attribute | Attribute from the selected element to read as value. | False | | string | +| value_template | Defines a template applied to extract the value from the result of the selector (if provided) or raw page (if selector not provided) | False | | string/template | +| extract | Determines how the result of the CSS selector is extracted. Only applicable to HTML. `text` returns just text, `content` returns the HTML content of the selected tag and `tag` returns HTML including the selected tag. | False | text | string | +| on_error | See [On-error](#on-error) | False | | | ### On-error