From ae692108773f5a6f5c502faae3f698423a29374e Mon Sep 17 00:00:00 2001 From: Ashwin Rajeev Date: Mon, 6 Jan 2020 18:22:53 +0530 Subject: [PATCH 1/3] add image src extraction --- selectorlib/selectorlib.py | 2 ++ tests/data/input.yml | 3 +++ tests/data/output.yml | 1 + tests/test_selectorlib.py | 1 + 4 files changed, 7 insertions(+) diff --git a/selectorlib/selectorlib.py b/selectorlib/selectorlib.py index b7af1c4..32ae734 100644 --- a/selectorlib/selectorlib.py +++ b/selectorlib/selectorlib.py @@ -14,6 +14,8 @@ def extract_field(element, item_type, attribute=None, formatter=None): content = element.get() elif item_type == 'Attribute': content = element.attrib.get(attribute) + elif item_type == 'Image': + content = element.attrib.get('src') if formatter: content = formatter.format(content) return content diff --git a/tests/data/input.yml b/tests/data/input.yml index e48a3a4..9e98a16 100644 --- a/tests/data/input.yml +++ b/tests/data/input.yml @@ -4,6 +4,9 @@ name: price: css: p.price type: Text +image: + css: img.wp-post-image + type: Image stock: css: p.stock type: Text diff --git a/tests/data/output.yml b/tests/data/output.yml index 0eb2e34..c89fc3a 100644 --- a/tests/data/output.yml +++ b/tests/data/output.yml @@ -4,6 +4,7 @@ description: "Bulbasaur can be seen napping in bright sunlight. There is a seed \ its back. By soaking up the sun\u2019s rays, the seed grows progressively larger." name: Bulbasaur price: "\xA3 63.00" +image: 'https://scrapeme.live/wp-content/uploads/2018/08/001.png' related_products: - {name: Fearow, price: "\xA3 95.00", sku: 9127, url: 'https://scrapeme.live/shop/Fearow/'} - {name: Blastoise, price: "\xA3 76.00", sku: 5212, url: 'https://scrapeme.live/shop/Blastoise/'} diff --git a/tests/test_selectorlib.py b/tests/test_selectorlib.py index 526f044..164d81d 100644 --- a/tests/test_selectorlib.py +++ b/tests/test_selectorlib.py @@ -51,6 +51,7 @@ def test_content(html, input_yaml, output_yaml): formatters = formatter.Formatter.get_all() extractor = selectorlib.Extractor.from_yaml_string(input_yaml, formatters=formatters) output = extractor.extract(html, base_url=base_url) + print(output) assert output == yaml.safe_load(output_yaml) From eb14d7eeac6421936c890978543946d414af5aa4 Mon Sep 17 00:00:00 2001 From: Ashwin Rajeev Date: Mon, 6 Jan 2020 18:24:31 +0530 Subject: [PATCH 2/3] update dev requirements --- .travis.yml | 1 + requirements_dev.txt | 19 ++++++++----------- tox.ini | 3 ++- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5980ddf..a143dc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python sudo: required dist: xenial python: +- 3.8 - 3.7 - 3.6 - 3.5 diff --git a/requirements_dev.txt b/requirements_dev.txt index 8a80171..9a18226 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,12 +1,9 @@ -pip==19.1.1 -bumpversion==0.5.3 -wheel==0.33.4 -watchdog==0.9.0 -flake8==3.7.7 -tox==3.12.1 -coverage==4.5.3 -Sphinx==2.1.0 -twine==1.13.0 +wheel +flake8 +tox +coverage +Sphinx +twine -pytest==4.6.1 -pytest-runner==5.1 +pytest +pytest-runner diff --git a/tox.ini b/tox.ini index 4f85082..de6e77c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,9 @@ [tox] -envlist = py27, py34, py35, py36, flake8 +envlist = py35, py36, py37, py38, flake8 [travis] python = + 3.8: py38 3.7: py37 3.6: py36 3.5: py35 From bb34a684d07094e96bc7b9396570836dbdce0883 Mon Sep 17 00:00:00 2001 From: Rawa Date: Fri, 4 Mar 2022 12:03:24 +0300 Subject: [PATCH 3/3] Adding xpath_alias, string regex commands --- .idea/.gitignore | 8 + .idea/SelectorLib.iml | 17 ++ .idea/discord.xml | 7 + .idea/inspectionProfiles/Project_Default.xml | 201 ++++++++++++++++++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + selectorlib/__init__.py | 2 +- selectorlib/cli.py | 6 +- selectorlib/selectorlib.py | 15 +- 11 files changed, 274 insertions(+), 6 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/SelectorLib.iml create mode 100644 .idea/discord.xml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/SelectorLib.iml b/.idea/SelectorLib.iml new file mode 100644 index 0000000..5195124 --- /dev/null +++ b/.idea/SelectorLib.iml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/discord.xml b/.idea/discord.xml new file mode 100644 index 0000000..d8e9561 --- /dev/null +++ b/.idea/discord.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..e9c75c4 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,201 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d0f0c37 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a11b2cc --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/selectorlib/__init__.py b/selectorlib/__init__.py index 4b8c93b..2364f3e 100644 --- a/selectorlib/__init__.py +++ b/selectorlib/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -"""Top-level package for selectorlib.""" +"""Top-level package for Selectorlib.""" __author__ = """scrapehero""" __email__ = 'pypi@scrapehero.com' diff --git a/selectorlib/cli.py b/selectorlib/cli.py index 86082cc..7495957 100644 --- a/selectorlib/cli.py +++ b/selectorlib/cli.py @@ -1,15 +1,15 @@ # -*- coding: utf-8 -*- -"""Console script for selectorlib.""" +"""Console script for Selectorlib.""" import sys import click @click.command() def main(args=None): - """Console script for selectorlib.""" + """Console script for Selectorlib.""" click.echo("Replace this message by putting your code into " - "selectorlib.cli.main") + "Selectorlib.cli.main") click.echo("See click documentation at http://click.pocoo.org/") return 0 diff --git a/selectorlib/selectorlib.py b/selectorlib/selectorlib.py index 7e59f18..c4d21a2 100644 --- a/selectorlib/selectorlib.py +++ b/selectorlib/selectorlib.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- +import re import parsel import yaml import inspect - def extract_field(element, item_type, attribute=None, formatter=None): if item_type == 'Text': texts = [i.strip() for i in element.xpath('.//text()').getall() if i.strip()] @@ -35,7 +35,7 @@ def __init__(self, config, formatters=None): def from_yaml_string(cls, yaml_string: str, formatters=None): """create `Extractor` object from yaml string - >>> yaml_string = ''' + >>> yaml_string = '' title: css: "h1" type: Text @@ -77,13 +77,19 @@ def extract(self, html: str, base_url: str = None): def _extract_selector(self, field_config, parent_parser): if field_config.get("xpath") is not None: elements = parent_parser.xpath(field_config['xpath']) + if len(elements) == 0: + if field_config.get("xpath_alias") is not None: + elements = parent_parser.xpath(field_config['alias']) + else: css = field_config['css'] if css == '': elements = [parent_parser] else: elements = parent_parser.css(field_config['css']) + item_type = field_config.get('type', 'Text') + # print(field_config) # Returns all fields if not elements: return None values = [] @@ -100,6 +106,11 @@ def _extract_selector(self, field_config, parent_parser): value = extract_field(element, item_type, **kwargs) if field_config.get('multiple') is not True: + if 're' in field_config: + pattern = re.compile(f'{field_config.get("re")}') + regex = re.sub(pattern, '', value) + return regex + return value else: values.append(value)