From bc382f0acf5ceed2972d44d78d096709a796ee65 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Sun, 12 Jan 2025 23:26:43 -0300
Subject: [PATCH 1/2] Add scraper for São Luís-MA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/ma/ma_sao_luis.py         | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 data_collection/gazette/spiders/ma/ma_sao_luis.py

diff --git a/data_collection/gazette/spiders/ma/ma_sao_luis.py b/data_collection/gazette/spiders/ma/ma_sao_luis.py
new file mode 100644
index 000000000..f64235fdc
--- /dev/null
+++ b/data_collection/gazette/spiders/ma/ma_sao_luis.py
@@ -0,0 +1,49 @@
+import re
+from datetime import date
+from urllib.parse import urlparse
+
+import dateparser
+from scrapy import Request
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class MASaoLuisSpider(BaseGazetteSpider):
+    name = "ma_sao_luis"
+    TERRITORY_ID = "2111300"
+    allowed_domains = ["diariooficial.saoluis.ma.gov.br"]
+    start_urls = ["https://diariooficial.saoluis.ma.gov.br/diario-oficial"]
+    start_date = date(1993, 1, 4)
+
+    def parse(self, response, page=1):
+        for item in response.css(".box-publicacao"):
+            raw_infos = "".join(item.css("::text").getall()).strip()
+
+            edition_number = re.search(r"(\d+)/", raw_infos).group(1)
+            is_extra_edition = "extra" in raw_infos.lower()
+
+            raw_edition_date = re.search(r",(.+)\s", raw_infos).group(1).strip()
+            edition_date = dateparser.parse(raw_edition_date, languages=["pt"]).date()
+
+            edition_path = item.css("a")[1].attrib["href"]
+            edition_url = (
+                urlparse(self.start_urls[0])._replace(path=edition_path).geturl()
+            )
+
+            if self.start_date <= edition_date <= self.end_date:
+                yield Gazette(
+                    date=edition_date,
+                    edition_number=edition_number,
+                    is_extra_edition=is_extra_edition,
+                    file_urls=[edition_url],
+                    power="executive_legislative",
+                )
+
+        last_page_number = int(response.css(".pagination .last a").attrib["data-page"])
+        if edition_date > self.start_date and page < last_page_number:
+            page += 1
+            yield Request(
+                f"https://diariooficial.saoluis.ma.gov.br/diario-oficial/index?page={page}",
+                cb_kwargs={"page": page},
+            )

From 02a073b95b51bb4e9b1e0d0df92c2e4d829d435f Mon Sep 17 00:00:00 2001
From: trevineju
Date: Sun, 12 Jan 2025 23:43:46 -0300
Subject: [PATCH 2/2] Add exception for missing document link
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/ma/ma_sao_luis.py         | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/data_collection/gazette/spiders/ma/ma_sao_luis.py b/data_collection/gazette/spiders/ma/ma_sao_luis.py
index f64235fdc..4c6b43694 100644
--- a/data_collection/gazette/spiders/ma/ma_sao_luis.py
+++ b/data_collection/gazette/spiders/ma/ma_sao_luis.py
@@ -20,18 +20,24 @@ def parse(self, response, page=1):
         for item in response.css(".box-publicacao"):
             raw_infos = "".join(item.css("::text").getall()).strip()
 
-            edition_number = re.search(r"(\d+)/", raw_infos).group(1)
-            is_extra_edition = "extra" in raw_infos.lower()
-
             raw_edition_date = re.search(r",(.+)\s", raw_infos).group(1).strip()
             edition_date = dateparser.parse(raw_edition_date, languages=["pt"]).date()
 
-            edition_path = item.css("a")[1].attrib["href"]
-            edition_url = (
-                urlparse(self.start_urls[0])._replace(path=edition_path).geturl()
-            )
-
             if self.start_date <= edition_date <= self.end_date:
+                edition_number = re.search(r"(\d+)/", raw_infos).group(1)
+                is_extra_edition = "extra" in raw_infos.lower()
+
+                try:
+                    edition_path = item.css("a")[1].attrib["href"]
+                    edition_url = (
+                        urlparse(self.start_urls[0])
+                        ._replace(path=edition_path)
+                        .geturl()
+                    )
+                except Exception:
+                    self.logger.error(f"Unable to retrieve PDF URL for {edition_date}.")
+                    continue
+
                 yield Gazette(
                     date=edition_date,
                     edition_number=edition_number,
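---

Testing notes: a minimal sketch for sanity-checking the parsing expressions
used in these patches outside of a full crawl. The sample string below only
assumes the general shape of the ".box-publicacao" text (an edition number,
a comma, a Portuguese date, and an optional "Extra" marker); it was not
captured from the live site.

    import re

    import dateparser

    # Assumed shape of the text extracted from one ".box-publicacao" element.
    raw_infos = "Diário Oficial 123/2025, 12 de janeiro de 2025 Extra"

    # The same expressions the spider applies to raw_infos.
    edition_number = re.search(r"(\d+)/", raw_infos).group(1)  # "123"
    is_extra_edition = "extra" in raw_infos.lower()  # True
    raw_edition_date = re.search(r",(.+)\s", raw_infos).group(1).strip()
    edition_date = dateparser.parse(raw_edition_date, languages=["pt"]).date()
    print(edition_number, is_extra_edition, edition_date)  # 123 True 2025-01-12

To exercise the spider end to end, assuming the standard querido-diario
development setup (the dates are arbitrary example values):

    cd data_collection
    scrapy crawl ma_sao_luis -a start_date=2025-01-01 -a end_date=2025-01-12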