-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadiariosv1.py
68 lines (51 loc) · 2.02 KB
/
adiariosv1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import datetime
import scrapy
from mapeadores.spiders.bases.mapeador_semantico import MapeadorSemantico
from mapeadores.items import MapeamentoItem
class MapeadorAdiariosV1(MapeadorSemantico):
    """Map sites that follow the adiarios_v1 layout.

    Examples:
        https://www.buriticupu.ma.gov.br/diariooficial.php
        https://www.anajatuba.ma.gov.br/diariooficial.php
    """

    name = "adiarios_v1"
    url_patterns = [
        "https://www.nome_do_municipio.UF.gov.br/diariooficial.php",
    ]

    def parse(self, response, item):
        """Classify the first listing page and schedule the last one.

        When the response matches the layout, ``item`` is filled in place
        with ``url``, ``status`` and ``date_to`` (date of the first entry on
        this page) and a request for the last listing page is yielded, with
        ``item`` forwarded to ``parse_last_page``. Otherwise a
        ``MapeamentoItem`` built from ``self.make_invalid_item`` is yielded.

        Args:
            response: Scrapy response for the listing page.
            item: Mutable mapping carrying the fields collected so far.

        Yields:
            Either a ``scrapy.Request`` for the last page or a
            ``MapeamentoItem`` for a non-matching site.
        """
        if not self.belongs_to_pattern(response):
            yield MapeamentoItem(
                **self.make_invalid_item(item, response.url),
            )
            return

        item["url"] = response.url
        item["status"] = "valido"
        item["date_to"] = self.get_date(response, 0)
        yield scrapy.Request(
            f"{response.url}?pagina={self.get_last_page(response)}",
            callback=self.parse_last_page,
            cb_kwargs={"item": item},
        )

    def parse_last_page(self, response, item):
        """Emit the final item, adding ``date_from`` from the last entry.

        Args:
            response: Scrapy response for the last listing page.
            item: Mapping populated by ``parse``.

        Yields:
            A ``MapeamentoItem`` with every field of ``item`` plus
            ``date_from`` (date of the last entry on this page).
        """
        yield MapeamentoItem(
            **item,
            date_from=self.get_date(response, -1),
        )

    def belongs_to_pattern(self, response):
        """Return True when the page looks like adiarios_v1 and has records.

        A page matches when its text mentions ``assesi.com.br`` or
        ``siasp.com.br``, or when it contains an element whose class is
        exactly ``public_paginas``. A matching page that reports zero
        records is still rejected.
        """
        page_text = response.text
        has_marker = (
            "assesi.com.br" in page_text
            or "siasp.com.br" in page_text
            or bool(response.xpath('//*[@class="public_paginas"]'))
        )
        return has_marker and "Foram encontrados 0 registros" not in page_text

    def get_last_page(self, response):
        """Return the value to use in ``?pagina=`` for the last listing page.

        The value is the highest page number shown in the pagination widget
        minus one. Labels that are not plain decimal numbers are ignored,
        and a page without any numeric pagination label is treated as a
        single-page listing (result ``0``) instead of raising ``ValueError``.
        """
        labels = response.css(".pagination li a span::text").getall()
        page_numbers = [
            int(label) for label in labels if label.strip().isdecimal()
        ]
        # default=1 makes "no pagination" behave like a widget showing only "1".
        return max(page_numbers, default=1) - 1

    def get_date(self, response, position):
        """Return the date shown for the listing entry at ``position``.

        Args:
            response: Scrapy response for a listing page.
            position: Index into the ``#diario_lista`` matches (``0`` for
                the first entry, ``-1`` for the last).

        Returns:
            A ``datetime.date`` parsed from the ``.calendarioIcon`` text,
            which must be in ``dd/mm/YYYY`` format.

        Raises:
            IndexError: No ``#diario_lista`` match exists at ``position``.
            AttributeError: The entry has no ``.calendarioIcon`` text node.
            ValueError: The text is not a ``dd/mm/YYYY`` date.
        """
        entry = response.css("#diario_lista")[position]
        raw = entry.css(".calendarioIcon::text").get().strip()
        return datetime.datetime.strptime(raw, "%d/%m/%Y").date()