From efe23283d0f6b7e004c75650b1be8c5644bfc87d Mon Sep 17 00:00:00 2001
From: Vinicius
Date: Tue, 8 Oct 2024 21:58:45 -0300
Subject: [PATCH 1/2] =?UTF-8?q?Implemented=20spider=20for=20S=C3=A3o=20Car?=
 =?UTF-8?q?los,=20SP?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/sp/sp_sao_carlos.py      | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 data_collection/gazette/spiders/sp/sp_sao_carlos.py

diff --git a/data_collection/gazette/spiders/sp/sp_sao_carlos.py b/data_collection/gazette/spiders/sp/sp_sao_carlos.py
new file mode 100644
index 000000000..f1eca3c8f
--- /dev/null
+++ b/data_collection/gazette/spiders/sp/sp_sao_carlos.py
@@ -0,0 +1,152 @@
+import datetime
+import locale
+import logging
+import re
+
+import bs4
+from dateutil.relativedelta import relativedelta
+from scrapy import Request
+from scrapy.http import Response
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+logger = logging.getLogger(__name__)
+
+
+class SpSaoCarlosSpider(BaseGazetteSpider):
+    name = "sp_sao_carlos"
+    TERRITORY_ID = "3548906"
+    allowed_domains = ["saocarlos.sp.gov.br"]
+    base_url = "http://www.saocarlos.sp.gov.br"
+    start_date = datetime.date(2009, 5, 10)
+
+    def start_requests(self):
+        year_diff = self.end_date.year - self.start_date.year
+        month_diff = self.end_date.month - self.start_date.month
+        months_between_start_and_end = year_diff * 12 + month_diff + 1
+        dates_of_interest = (
+            self.start_date + relativedelta(months=i)
+            for i in range(months_between_start_and_end)
+        )
+
+        # For converting month names to Portuguese
+        # Used in the URL generation and in the parsing of the gazette date
+        try:
+            locale.setlocale(locale.LC_TIME, "pt_BR.UTF-8")
+        except locale.Error:
+            locale.setlocale(locale.LC_TIME, "Portuguese_Brazil.1252")
+
+        def get_url(date_of_interest: datetime.date):
+            year = date_of_interest.year
+            month = date_of_interest.strftime("%B").lower().replace("ç", "c")
+            return f"{self.base_url}/index.php/diario-oficial-{year}/diario-oficial-{month}-{year}.html"
+
+        urls = map(get_url, dates_of_interest)
+        yield from map(Request, urls)
+
+    def find_gazette_rows(self, response: Response):
+        soup = bs4.BeautifulSoup(response.text, "html.parser")
+
+        gazette_table = soup.find(
+            "table",
+            style=re.compile(r"(border-color: #dfdfe1|rgb\(223, 223, 225\))"),
+            recursive=True,
+        )
+        if gazette_table:
+            gazette_rows = gazette_table.find_all("tr")
+            return gazette_rows
+
+        # They decided to split the table into multiple single-row tables at some point
+        gazette_tables = soup.find_all(
+            "table",
+            width=620,
+            recursive=True,
+        )
+        if gazette_tables:
+            gazette_rows = [table.find("tr") for table in gazette_tables]
+            return gazette_rows
+
+        logger.error("Could not find gazette data")
+        return []
+
+    # Raised to stop parsing a page once a gazette newer than end_date is seen
+    class EndDateReached(Exception):
+        pass
+
+    def parse(self, response: Response):
+        gazette_rows = self.find_gazette_rows(response)
+
+        for index, gazette_row in enumerate(gazette_rows):
+            # Sometimes there are empty rows
+            has_no_content = gazette_row is None or len(gazette_row.contents) == 1
+            if has_no_content:
+                logger.warning(f"Empty row at index {index}")
+                continue
+
+            try:
+                gazette = self.parse_gazette_row(gazette_row)
+                if gazette:
+                    yield gazette
+            except self.EndDateReached:
+                break
+
+    def get_default_match(self, text):
+        # Examples:
+        # 'Edição nº 25 • Ano 1 • 1ºde agosto de 2009'
+        # 'Edição n° 115 • Ano 1 •\xa0 27 defevereiro de 2010'
+        # 'Edição nº 1898 • Ano 14\xa0• 1º de Fevereiro
+        # de 2022'
+        pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?de ?([A-Za-zÀ-ú]+).*(\d{4})"
+        matched = re.search(pattern, text)
+        return matched
+
+    def get_out_of_order_match(self, text):
+        # Yes, this happens more than once
+        # Examples:
+        # 'Edição nº 901 • Ano 8\xa0• 01 Março de \xa0de 2016'
+        # 'Edição nº 911 • Ano 8\xa0• 01 Abril de \xa0de 2016'
+        pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?([A-Za-zÀ-ú]+).*(\d{4})"
+        matched = re.search(pattern, text)
+        return matched
+
+    # Edition numbers parsed so far; a repeated edition indicates a parsing problem
+    already_seen = set()
+
+    def parse_gazette_row(self, gazette_row: bs4.Tag):
+        matched = self.get_default_match(gazette_row.text)
+
+        if not matched:
+            matched = self.get_out_of_order_match(gazette_row.text)
+
+        if not matched:
+            logger.error(
+                "Gazette text does not match any known patterns: %s", gazette_row.text
+            )
+            return None
+
+        edition_number = matched.group(1)
+        if edition_number in self.already_seen:
+            raise ValueError(f"Duplicate edition number: {edition_number}")
+        self.already_seen.add(edition_number)
+        day = matched.group(2)
+        month = matched.group(3)
+        year = matched.group(4)
+        date = datetime.datetime.strptime(f"{day} {month} {year}", "%d %B %Y").date()
+
+        if date < self.start_date:
+            return None
+
+        if date > self.end_date:
+            raise self.EndDateReached(f"End date reached: {date}")
+
+        extra = "(extra)" in gazette_row.text
+
+        file_url = gazette_row.find("a").get("href")
+        if not file_url.startswith("http"):
+            file_url = f"{self.base_url}{file_url}"
+
+        return Gazette(
+            edition_number=edition_number,
+            date=date,
+            file_url=file_url,
+            file_urls=[file_url],
+            is_extra_edition=extra,
+            power="executive",
+        )

From f7800f28bf8e2958c7f82350c4664bcd72f9e109 Mon Sep 17 00:00:00 2001
From: Vinicius
Date: Tue, 8 Oct 2024 22:21:04 -0300
Subject: [PATCH 2/2] Added beautifulsoup4 to requirements

---
 data_collection/requirements.in  |  1 +
 data_collection/requirements.txt | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/data_collection/requirements.in b/data_collection/requirements.in
index 899f4d4ba..b87d04e7e 100644
--- a/data_collection/requirements.in
+++ b/data_collection/requirements.in
@@ -1,4 +1,5 @@
 awscli==1.25.90
+beautifulsoup4
 boto3==1.24.89
 click
 chompjs
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index 41bb67513..940021683 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
 # pip-compile --allow-unsafe --generate-hashes --no-annotate requirements.in
@@ -16,6 +16,9 @@ automat==22.10.0 \
 awscli==1.25.90 \
     --hash=sha256:51341ff0e4b1e93e34254f7585c40d6480034df77d6f198ff26418d4c9afd067 \
     --hash=sha256:ec2fa932bee68fe7b6ba83df2343844a7fd9bb74dc26a98386d185860ff8a913
+beautifulsoup4==4.12.3 \
+    --hash=sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051 \
+    --hash=sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed
 boto3==1.24.89 \
     --hash=sha256:346f8f0d101a4261dac146a959df18d024feda6431e1d9d84f94efd24d086cae \
     --hash=sha256:d0d8ffcdc10821c4562bc7f935cdd840033bbc342ac0e14b6bdd348b3adf4c04
@@ -815,6 +818,9 @@ service-identity==23.1.0 \
 six==1.16.0 \
     --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \
     --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
+soupsieve==2.6 \
+    --hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
+    --hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
 spidermon==1.20.0 \
     --hash=sha256:3fe01556bb3218e01d284be6137b3c55d700c58d1399edfdf785c39a061b7927 \
     --hash=sha256:7b183ac0c0390a00dbacfbc7764a7605ac567d3461a7aa7485501f2f24ad4ce9
@@ -921,3 +927,8 @@ zope-interface==6.0 \
     --hash=sha256:f299c020c6679cb389814a3b81200fe55d428012c5e76da7e722491f5d205990 \
     --hash=sha256:f72f23bab1848edb7472309e9898603141644faec9fd57a823ea6b4d1c4c8995 \
     --hash=sha256:fa90bac61c9dc3e1a563e5babb3fd2c0c1c80567e815442ddbe561eadc803b30
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==75.1.0 \
+    --hash=sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2 \
+    --hash=sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538