From 4fee239add3d401c9928039e989b66447211b349 Mon Sep 17 00:00:00 2001
From: Gabriel Alves
Date: Wed, 22 Jan 2025 10:54:45 -0300
Subject: [PATCH] #1196 Create spider for rj_itaborai

---
 .../gazette/spiders/rj/rj_itaborai.py         | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 data_collection/gazette/spiders/rj/rj_itaborai.py

diff --git a/data_collection/gazette/spiders/rj/rj_itaborai.py b/data_collection/gazette/spiders/rj/rj_itaborai.py
new file mode 100644
index 000000000..9dc822069
--- /dev/null
+++ b/data_collection/gazette/spiders/rj/rj_itaborai.py
@@ -0,0 +1,47 @@
+from datetime import datetime as dt
+
+import scrapy
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class RjItaboraiSpider(BaseGazetteSpider):
+    name = "rj_itaborai"
+    TERRITORY_ID = "3301900"
+    allowed_domains = ["do.ib.itaborai.rj.gov.br"]
+    start_date = dt(2020, 1, 1).date()
+
+    def start_requests(self):
+        start_date = self.start_date.strftime("%Y-%m-%d")
+        end_date = self.end_date.strftime("%Y-%m-%d")
+        yield scrapy.FormRequest(
+            url="https://do.ib.itaborai.rj.gov.br/dados-portal-novo.php",
+            formdata={"acao": "3", "dado[]": [start_date, end_date]},
+        )
+
+    def parse(self, response):
+        gazettes = response.xpath('//div[contains(@class, "card-avulso-diario")]')
+
+        for gazette in gazettes:
+            raw_gazette_date = gazette.xpath(
+                './/p[contains(text(),"Postado em")]/text()'
+            ).re_first(r"\d{2}/\d{2}/\d{4}")
+
+            gazette_date = dt.strptime(raw_gazette_date, "%d/%m/%Y").date()
+
+            gazette_edition_number = gazette.xpath(
+                './/p[contains(text(),"Edição N°")]/text()'
+            ).re_first(r"\d+")
+
+            is_extra = "extra" in gazette.xpath("@class").get()
+
+            gazette_url = gazette.xpath(".//a/@href").get()
+
+            yield Gazette(
+                date=gazette_date,
+                edition_number=gazette_edition_number,
+                is_extra_edition=is_extra,
+                file_urls=[gazette_url],
+                power="executive",
+            )
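
Note (outside the patch): a minimal sketch of how the start_requests() payload is encoded.
Scrapy's FormRequest turns a list value into repeated form fields, so both dates travel
under the same "dado[]" key. The dates below are placeholders, and the body shown in the
comment is the expected default urlencoding, not something verified against the live endpoint.

    from scrapy import FormRequest

    req = FormRequest(
        url="https://do.ib.itaborai.rj.gov.br/dados-portal-novo.php",
        formdata={"acao": "3", "dado[]": ["2020-01-01", "2025-01-22"]},
    )
    print(req.method)  # POST (FormRequest defaults to POST when formdata is given)
    print(req.body)    # e.g. b'acao=3&dado%5B%5D=2020-01-01&dado%5B%5D=2025-01-22'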
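
Note (outside the patch): a self-contained sketch of the extraction logic in parse(), run with
parsel (the selector library Scrapy uses) against a hypothetical card fragment. The real markup
on do.ib.itaborai.rj.gov.br may differ; this only illustrates the XPath/regex combination, the
date parsing, and the "extra" class check used to flag extra editions.

    from datetime import datetime as dt

    from parsel import Selector

    # Hypothetical card fragment mirroring the fields the spider looks for.
    SAMPLE_HTML = """
    <div class="card-avulso-diario extra">
        <p>Postado em 22/01/2025</p>
        <p>Edição N° 123</p>
        <a href="https://do.ib.itaborai.rj.gov.br/exemplo/diario-123.pdf">Baixar</a>
    </div>
    """

    card = Selector(text=SAMPLE_HTML).xpath('//div[contains(@class, "card-avulso-diario")]')[0]

    raw_date = card.xpath('.//p[contains(text(),"Postado em")]/text()').re_first(r"\d{2}/\d{2}/\d{4}")
    edition = card.xpath('.//p[contains(text(),"Edição N°")]/text()').re_first(r"\d+")
    is_extra = "extra" in card.xpath("@class").get()
    file_url = card.xpath(".//a/@href").get()

    print(dt.strptime(raw_date, "%d/%m/%Y").date(), edition, is_extra, file_url)
    # 2025-01-22 123 True https://do.ib.itaborai.rj.gov.br/exemplo/diario-123.pdf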