Feature - New spider for São Carlos, SP #1300

Open · wants to merge 2 commits into main
152 changes: 152 additions & 0 deletions data_collection/gazette/spiders/sp/sp_sao_carlos.py
@@ -0,0 +1,152 @@
import datetime
import locale
import logging
import re

import bs4
from dateutil.relativedelta import relativedelta
from scrapy import Request
from scrapy.http import Response

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider

logger = logging.getLogger(__name__)


class SpSaoCarlosSpider(BaseGazetteSpider):
name = "sp_sao_carlos"
TERRITORY_ID = "3548906"
allowed_domains = ["saocarlos.sp.gov.br"]
base_url = "http://www.saocarlos.sp.gov.br"
start_date = datetime.date(2009, 5, 10)

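# One listing page is published per month, so generate one request for each
# month between start_date and end_date.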
def start_requests(self):
year_diff = self.end_date.year - self.start_date.year
month_diff = self.end_date.month - self.start_date.month
months_between_start_and_end = year_diff * 12 + month_diff + 1
dates_of_interest = (
self.start_date + relativedelta(months=i)
for i in range(months_between_start_and_end)
)

# For converting month names to Portuguese
# Used in the URL generation and in the parsing of the gazette date
try:
locale.setlocale(locale.LC_TIME, "pt_BR.UTF-8")
except locale.Error:
locale.setlocale(locale.LC_TIME, "Portuguese_Brazil.1252")

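# Example of a generated URL (February 2022):
# http://www.saocarlos.sp.gov.br/index.php/diario-oficial-2022/diario-oficial-fevereiro-2022.html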
def get_url(date_of_interest: datetime.date):
year = date_of_interest.year
month = date_of_interest.strftime("%B").lower().replace("ç", "c")
return f"{self.base_url}/index.php/diario-oficial-{year}/diario-oficial-{month}-{year}.html"

urls = map(get_url, dates_of_interest)
yield from map(Request, urls)

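# Locate the <tr> elements that hold gazette entries. Older pages use a
# single table identified by its border color; newer pages split each entry
# into its own 620px-wide table.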
def find_gazette_rows(self, response: Response):
soup = bs4.BeautifulSoup(response.text, "html.parser")

gazette_table = soup.find(
"table",
style=re.compile(r"(border-color: #dfdfe1|rgb\(223, 223, 225\))"),
recursive=True,
)
if gazette_table:
gazette_rows = gazette_table.find_all("tr")
return gazette_rows

# They decided to split the table into multiple single-row tables at some point
gazette_tables = soup.find_all(
"table",
width=620,
recursive=True,
)
if gazette_tables:
gazette_rows = [table.find("tr") for table in gazette_tables]
return gazette_rows

logger.error("Could not find gazette data")

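# Raised by parse_gazette_row when a gazette dated after end_date is found,
# so that parse can stop processing the remaining rows of the page.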
class EndDateReached(Exception):
pass

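# Parse a monthly listing page, yielding a Gazette item for each non-empty row.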
def parse(self, response: Response):
gazette_rows = self.find_gazette_rows(response)

for index, gazette_row in enumerate(gazette_rows):
# Sometimes there are empty rows
has_no_content = gazette_row is None or len(gazette_row.contents) == 1
if has_no_content:
logger.warning(f"Empty row at index {index}")
continue

try:
gazette = self.parse_gazette_row(gazette_row)
if gazette:
yield gazette
except self.EndDateReached:
break

def get_default_match(self, text):
# Examples:
# 'Edição nº 25 • Ano 1 • 1ºde agosto de 2009'
# 'Edição n° 115 • Ano 1 •\xa0 27 defevereiro de 2010'
# 'Edição nº 1898 • Ano 14\xa0• 1º de Fevereiro de 2022'
pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?de ?([A-Za-zÀ-ú]+).*(\d{4})"
matched = re.search(pattern, text)
return matched

def get_out_of_order_match(self, text):
# Yes, this happens more than once
# Examples:
# 'Edição nº 901 • Ano 8\xa0• 01 Março de \xa0de 2016'
# 'Edição nº 911 • Ano 8\xa0• 01 Abril de \xa0de 2016'
pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?([A-Za-zÀ-ú]+).*(\d{4})"
matched = re.search(pattern, text)
return matched

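# Edition numbers already processed; a repeated number raises ValueError below.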
already_seen = set()

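# Extract the edition number, date, PDF link and extra-edition flag from a row.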
def parse_gazette_row(self, gazette_row: bs4.Tag):
matched = self.get_default_match(gazette_row.text)

if not matched:
matched = self.get_out_of_order_match(gazette_row.text)

if not matched:
logger.error(
"Gazzette text does not match any known patterns", gazette_row.text
)
return None

edition_number = matched.group(1)
if edition_number in self.already_seen:
raise ValueError(f"Duplicate edition number: {edition_number}")
self.already_seen.add(edition_number)
day = matched.group(2)
month = matched.group(3)
year = matched.group(4)
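# %B matches the Portuguese month name thanks to the locale set in start_requests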
date = datetime.datetime.strptime(f"{day} {month} {year}", "%d %B %Y").date()

if date < self.start_date:
return None

if date > self.end_date:
raise self.EndDateReached(f"End date reached: {date}")

extra = "(extra)" in gazette_row.text

file_url = gazette_row.find("a").get("href")
if not file_url.startswith("http"):
file_url = f"{self.base_url}{file_url}"

return Gazette(
edition_number=edition_number,
date=date,
file_url=file_url,
file_urls=[file_url],
is_extra_edition=extra,
power="executive",
)
1 change: 1 addition & 0 deletions data_collection/requirements.in
@@ -1,4 +1,5 @@
awscli==1.25.90
beautifulsoup4
boto3==1.24.89
click
chompjs
13 changes: 12 additions & 1 deletion data_collection/requirements.txt
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --allow-unsafe --generate-hashes --no-annotate requirements.in
@@ -16,6 +16,9 @@ automat==22.10.0 \
awscli==1.25.90 \
--hash=sha256:51341ff0e4b1e93e34254f7585c40d6480034df77d6f198ff26418d4c9afd067 \
--hash=sha256:ec2fa932bee68fe7b6ba83df2343844a7fd9bb74dc26a98386d185860ff8a913
beautifulsoup4==4.12.3 \
--hash=sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051 \
--hash=sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed
boto3==1.24.89 \
--hash=sha256:346f8f0d101a4261dac146a959df18d024feda6431e1d9d84f94efd24d086cae \
--hash=sha256:d0d8ffcdc10821c4562bc7f935cdd840033bbc342ac0e14b6bdd348b3adf4c04
@@ -815,6 +818,9 @@ service-identity==23.1.0 \
six==1.16.0 \
--hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
soupsieve==2.6 \
--hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
--hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
spidermon==1.20.0 \
--hash=sha256:3fe01556bb3218e01d284be6137b3c55d700c58d1399edfdf785c39a061b7927 \
--hash=sha256:7b183ac0c0390a00dbacfbc7764a7605ac567d3461a7aa7485501f2f24ad4ce9
@@ -921,3 +927,8 @@ zope-interface==6.0 \
--hash=sha256:f299c020c6679cb389814a3b81200fe55d428012c5e76da7e722491f5d205990 \
--hash=sha256:f72f23bab1848edb7472309e9898603141644faec9fd57a823ea6b4d1c4c8995 \
--hash=sha256:fa90bac61c9dc3e1a563e5babb3fd2c0c1c80567e815442ddbe561eadc803b30

# The following packages are considered to be unsafe in a requirements file:
setuptools==75.1.0 \
--hash=sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2 \
--hash=sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538