Skip to content

Commit

Permalink
okfn-brasil#1196 Cria spider para rj_itaborai
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabriel-gag committed Jan 22, 2025
1 parent cf5a2b4 commit 4fee239
Showing 1 changed file with 47 additions and 0 deletions.
47 changes: 47 additions & 0 deletions data_collection/gazette/spiders/rj/rj_itaborai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from datetime import datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjItaboraiSpider(BaseGazetteSpider):
    """Spider that collects gazettes from do.ib.itaborai.rj.gov.br.

    A single form POST carrying the start and end dates is sent to the
    portal; each ``card-avulso-diario`` element in the response is turned
    into one ``Gazette`` item.
    """

    name = "rj_itaborai"
    # NOTE(review): presumably the IBGE municipality code -- confirm.
    TERRITORY_ID = "3301900"
    allowed_domains = ["do.ib.itaborai.rj.gov.br"]
    start_date = dt(2020, 1, 1).date()

    def start_requests(self):
        """Yield the form request that asks the portal for the date range.

        ``self.end_date`` is not defined in this class; it is assumed to be
        provided by ``BaseGazetteSpider`` -- TODO confirm.
        """
        start_date = self.start_date.strftime("%Y-%m-%d")
        end_date = self.end_date.strftime("%Y-%m-%d")
        yield scrapy.FormRequest(
            url="https://do.ib.itaborai.rj.gov.br/dados-portal-novo.php",
            # NOTE(review): the meaning of acao="3" is not visible here;
            # presumably it selects the "search by date range" action.
            formdata={"acao": "3", "dado[]": [start_date, end_date]},
        )

    def parse(self, response):
        """Yield one ``Gazette`` per gazette card found in ``response``.

        Cards lacking a parseable publication date or a download link are
        logged and skipped, so a single malformed card does not abort the
        extraction of the remaining ones.
        """
        gazettes = response.xpath('//div[contains(@class, "card-avulso-diario")]')

        for gazette in gazettes:
            raw_gazette_date = gazette.xpath(
                './/p[contains(text(),"Postado em")]/text()'
            ).re_first(r"\d{2}/\d{2}/\d{4}")
            gazette_href = gazette.xpath(".//a/@href").get()

            # re_first()/get() return None when nothing matches; strptime(None)
            # would raise TypeError and a None URL cannot be downloaded.
            if not raw_gazette_date or not gazette_href:
                self.logger.warning(
                    "Skipping gazette card without date or file link at %s",
                    response.url,
                )
                continue

            try:
                gazette_date = dt.strptime(raw_gazette_date, "%d/%m/%Y").date()
            except ValueError:
                # The regex only checks the digit layout, not calendar validity.
                self.logger.warning(
                    "Skipping gazette card with invalid date %r at %s",
                    raw_gazette_date,
                    response.url,
                )
                continue

            gazette_edition_number = gazette.xpath(
                './/p[contains(text(),"Edição N°")]/text()'
            ).re_first(r"\d+")

            # Extra editions are flagged through the card's CSS class.
            is_extra = "extra" in gazette.xpath("@class").get(default="")

            # Absolute URLs pass through unchanged; relative ones are resolved
            # against the response URL.
            gazette_url = response.urljoin(gazette_href)

            yield Gazette(
                date=gazette_date,
                edition_number=gazette_edition_number,
                is_extra_edition=is_extra,
                file_urls=[gazette_url],
                power="executive",
            )

0 comments on commit 4fee239

Please sign in to comment.