Feature - New spider for São Carlos, SP #1300

Open · wants to merge 2 commits into main
152 changes: 152 additions & 0 deletions data_collection/gazette/spiders/sp/sp_sao_carlos.py
@@ -0,0 +1,152 @@
import datetime
import locale
import logging
import re

import bs4
from dateutil.relativedelta import relativedelta
from scrapy import Request
from scrapy.http import Response

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider

logger = logging.getLogger(__name__)


class SpSaoCarlosSpider(BaseGazetteSpider):
name = "sp_sao_carlos"
TERRITORY_ID = "3548906"
allowed_domains = ["saocarlos.sp.gov.br"]
base_url = "http://www.saocarlos.sp.gov.br"
start_date = datetime.date(2009, 5, 10)

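# One listing page is published per month, so generate one request for each
# month between start_date and end_date.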
def start_requests(self):
year_diff = self.end_date.year - self.start_date.year
month_diff = self.end_date.month - self.start_date.month
months_between_start_and_end = year_diff * 12 + month_diff + 1
dates_of_interest = (
self.start_date + relativedelta(months=i)
for i in range(months_between_start_and_end)
)

# For converting month names to Portuguese
# Used in the URL generation and in the parsing of the gazette date
try:
locale.setlocale(locale.LC_TIME, "pt_BR.UTF-8")
except locale.Error:
locale.setlocale(locale.LC_TIME, "Portuguese_Brazil.1252")

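# Example of a generated URL (February 2022):
# http://www.saocarlos.sp.gov.br/index.php/diario-oficial-2022/diario-oficial-fevereiro-2022.html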
def get_url(date_of_interest: datetime.date):
year = date_of_interest.year
month = date_of_interest.strftime("%B").lower().replace("ç", "c")
return f"{self.base_url}/index.php/diario-oficial-{year}/diario-oficial-{month}-{year}.html"

urls = map(get_url, dates_of_interest)
yield from map(Request, urls)

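# Locate the <tr> elements that hold gazette entries. Older pages use a
# single table identified by its border color; newer pages split each entry
# into its own 620px-wide table.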
def find_gazette_rows(self, response: Response):
soup = bs4.BeautifulSoup(response.text, "html.parser")

gazette_table = soup.find(
"table",
style=re.compile(r"(border-color: #dfdfe1|rgb\(223, 223, 225\))"),
recursive=True,
)
if gazette_table:
gazette_rows = gazette_table.find_all("tr")
return gazette_rows

# They decided to split the table into multiple single-row tables at some point
gazette_tables = soup.find_all(
"table",
width=620,
recursive=True,
)
if gazette_tables:
gazette_rows = [table.find("tr") for table in gazette_tables]
return gazette_rows

logger.error("Could not find gazette data")

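# Raised by parse_gazette_row when a gazette dated after end_date is found,
# so that parse can stop processing the remaining rows of the page.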
class EndDateReached(Exception):
pass

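# Parse a monthly listing page, yielding a Gazette item for each non-empty row.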
def parse(self, response: Response):
gazette_rows = self.find_gazette_rows(response)

for index, gazette_row in enumerate(gazette_rows):
# Sometimes there are empty rows
has_no_content = gazette_row is None or len(gazette_row.contents) == 1
if has_no_content:
logger.warning(f"Empty row at index {index}")
continue

try:
gazette = self.parse_gazette_row(gazette_row)
if gazette:
yield gazette
except self.EndDateReached:
break

def get_default_match(self, text):
# Examples:
# 'Edição nº 25 • Ano 1 • 1ºde agosto de 2009'
# 'Edição n° 115 • Ano 1 •\xa0 27 defevereiro de 2010'
# 'Edição nº 1898 • Ano 14\xa0• 1º de Fevereiro de 2022'
pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?de ?([A-Za-zÀ-ú]+).*(\d{4})"
matched = re.search(pattern, text)
return matched

def get_out_of_order_match(self, text):
# Yes, this happens more than once
# Examples:
# 'Edição nº 901 • Ano 8\xa0• 01 Março de \xa0de 2016'
# 'Edição nº 911 • Ano 8\xa0• 01 Abril de \xa0de 2016'
pattern = r"n(?:º|°) (\d+).*Ano \d+.*?([1-3]?\d)º? ?([A-Za-zÀ-ú]+).*(\d{4})"
matched = re.search(pattern, text)
return matched

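# Edition numbers already processed; a repeated number raises ValueError below.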
already_seen = set()

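# Extract the edition number, date, PDF link and extra-edition flag from a row.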
def parse_gazette_row(self, gazette_row: bs4.Tag):
matched = self.get_default_match(gazette_row.text)

if not matched:
matched = self.get_out_of_order_match(gazette_row.text)

if not matched:
logger.error(
"Gazzette text does not match any known patterns", gazette_row.text
)
return None

edition_number = matched.group(1)
if edition_number in self.already_seen:
raise ValueError(f"Duplicate edition number: {edition_number}")
self.already_seen.add(edition_number)
day = matched.group(2)
month = matched.group(3)
year = matched.group(4)
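# %B matches the Portuguese month name thanks to the locale set in start_requests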
date = datetime.datetime.strptime(f"{day} {month} {year}", "%d %B %Y").date()

if date < self.start_date:
return None

if date > self.end_date:
raise self.EndDateReached(f"End date reached: {date}")

extra = "(extra)" in gazette_row.text

file_url = gazette_row.find("a").get("href")
if not file_url.startswith("http"):
file_url = f"{self.base_url}{file_url}"

return Gazette(
edition_number=edition_number,
date=date,
file_url=file_url,
file_urls=[file_url],
is_extra_edition=extra,
power="executive",
)
1 change: 1 addition & 0 deletions data_collection/requirements.in
@@ -1,4 +1,5 @@
awscli==1.25.90
beautifulsoup4
boto3==1.24.89
click
chompjs
13 changes: 12 additions & 1 deletion data_collection/requirements.txt
@@ -1,5 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --allow-unsafe --generate-hashes --no-annotate requirements.in
@@ -16,6 +16,9 @@ automat==22.10.0 \
awscli==1.25.90 \
--hash=sha256:51341ff0e4b1e93e34254f7585c40d6480034df77d6f198ff26418d4c9afd067 \
--hash=sha256:ec2fa932bee68fe7b6ba83df2343844a7fd9bb74dc26a98386d185860ff8a913
beautifulsoup4==4.12.3 \
--hash=sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051 \
--hash=sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed
boto3==1.24.89 \
--hash=sha256:346f8f0d101a4261dac146a959df18d024feda6431e1d9d84f94efd24d086cae \
--hash=sha256:d0d8ffcdc10821c4562bc7f935cdd840033bbc342ac0e14b6bdd348b3adf4c04
@@ -815,6 +818,9 @@ service-identity==23.1.0 \
six==1.16.0 \
--hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
soupsieve==2.6 \
--hash=sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb \
--hash=sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9
spidermon==1.20.0 \
--hash=sha256:3fe01556bb3218e01d284be6137b3c55d700c58d1399edfdf785c39a061b7927 \
--hash=sha256:7b183ac0c0390a00dbacfbc7764a7605ac567d3461a7aa7485501f2f24ad4ce9
@@ -921,3 +927,8 @@ zope-interface==6.0 \
--hash=sha256:f299c020c6679cb389814a3b81200fe55d428012c5e76da7e722491f5d205990 \
--hash=sha256:f72f23bab1848edb7472309e9898603141644faec9fd57a823ea6b4d1c4c8995 \
--hash=sha256:fa90bac61c9dc3e1a563e5babb3fd2c0c1c80567e815442ddbe561eadc803b30

# The following packages are considered to be unsafe in a requirements file:
setuptools==75.1.0 \
--hash=sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2 \
--hash=sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538