From 48800357693c069c2ec5139d18f8e52383450faf Mon Sep 17 00:00:00 2001 From: Alexandre Harano Date: Tue, 10 Oct 2023 18:58:55 -0300 Subject: [PATCH 01/11] =?UTF-8?q?Add=20Vit=C3=B3ria-ES=20spider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit resolve okfn-brasil/querido-diario#750 --- .../gazette/spiders/es/es_vitoria.py | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 data_collection/gazette/spiders/es/es_vitoria.py diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py new file mode 100644 index 000000000..de73e3ec9 --- /dev/null +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -0,0 +1,236 @@ +from datetime import date, datetime + +from scrapy import FormRequest, Request + +from gazette.items import Gazette +from gazette.spiders.base import BaseGazetteSpider + +BASE_URL = "https://diariooficial.vitoria.es.gov.br/" + + +class EsVitoriaSpider(BaseGazetteSpider): + name = "es_vitoria" + TERRITORY_ID = "3205309" + start_date = date(2014, 7, 21) + + allowed_domains = ["diariooficial.vitoria.es.gov.br"] + + # When there are too many requests, the server may return + # an HTTP 406 status code when trying to download a PDF file + # + # We set `custom_settings` to avoid triggering the 406 HTTP status code + # by spreading the downloads for this spider over time + + custom_settings = { + "DOWNLOAD_DELAY": 0.3, # 300 ms + "RANDOMIZE_DOWNLOAD_DELAY": True, + "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], + } + + def __init__(self, *args, **kwargs): + super(EsVitoriaSpider, self).__init__(*args, **kwargs) + + # Period queried for gazette source is based on specific year-month + # Within a queried period, it has a paging mechanism that can spread multiple files of the same date + # We collect all the entries for the year-month period to then generate the gazette entries + + # Considering the above descrition, we use a dict named `data_by_monthly_date_by_date` + # with its keys composed by a 2-tuple + # - year + # - month + # and its items is another nested dict composed by + # - gazette_date + # and its items is a list of str representing the URL of the collected files for that date + + # e.g. + # data_by_monthly_date_by_date = { + # (2022, 12): { + # date(2022, 12, 2): [ + # "https://diariooficial.vitoria.es.gov.br/ExibirArquivo.aspx" + # "?qs=nnmrXIDe5L4hR81FZwDXlD95Q%2fWHOCtXgeCw%2fnRIrFMxQA7S5mwuf0RM3mOCPGtiwqKwtsQd8WTWmli6Dukj2duE%2bcjGeiOYdOhFAaD2d4lajnB7Bs8eXyta5UTj79FJ", + # "https://diariooficial.vitoria.es.gov.br/ExibirArquivo.aspx" + # "?qs=nnmrXIDe5L4hR81FZwDXlD95Q%2fWHOCtXgeCw%2fnRIrFMxQA7S5mwuf0RM3mOCPGtiwqKwtsQd8WTWmli6Dukj2duE%2bcjGeiOY4xkUuS2BQabum9G9l8gOaMHLbesi83TO", + # ] + # } + # } + + self.data_by_monthly_date_by_date = {} + + def start_requests(self): + url = BASE_URL + + today = date.today() + year = today.year + month = today.month + + yield Request( + url=url, + callback=self.initial_parse, + meta={"cookiejar": f"{self.name}_{year}_{month}"}, + ) + + def initial_parse(self, response): + year_select = response.xpath("//select[contains(@id, 'ddlAno')]") + year_formkey = year_select.attrib["name"] + years_available = map(int, year_select.xpath("./option/@value").getall()) + chosen_year = int( + year_select.xpath("./option[contains(@selected, 'selected')]/@value").get() + ) + + for year in years_available: + if year < self.start_date.year or self.end_date.year < year: + continue + + if year == chosen_year: + yield from self.parse_year(response, year) + continue + + yield FormRequest.from_response( + response, + formdata={year_formkey: str(year)}, + callback=self.parse_year, + cb_kwargs={"year": year}, + # We are isolating cookiejar per name-year-month combination + # to avoid interference between concurrent requests + # Whenever we request a past year, it sets the month to December + meta={"cookiejar": f"{self.name}_{year}_12"}, + ) + + def parse_year(self, response, year): + year_select = response.xpath("//select[contains(@id, 'ddlAno')]") + year_formkey = year_select.attrib["name"] + + month_select = response.xpath("//select[contains(@id, 'ddlMes')]") + month_formkey = month_select.attrib["name"] + + chosen_month = int( + month_select.xpath("./option[contains(@selected, 'selected')]/@value").get() + ) + + first_day_of_start_date_month = date( + self.start_date.year, self.start_date.month, 1 + ) + + for month in range(1, 13): + first_day_of_month = date(year, month, 1) + if ( + first_day_of_month < first_day_of_start_date_month + or self.end_date < first_day_of_month + ): + continue + + current_year_month = (year, month) + + if month == chosen_month: + yield from self.parse_editions_list(response, current_year_month) + continue + + formdata = { + "__EVENTTARGET": month_formkey, + "__EVENTARGUMENT": "", + year_formkey: str(year), + month_formkey: str(month), + } + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + cb_kwargs={ + "current_year_month": current_year_month, + }, + # We are isolating cookiejar per name-year-month combination + # to avoid interference between concurrent requests + meta={"cookiejar": f"{self.name}_{year}_{month}"}, + ) + + def parse_editions_list( + self, + response, + current_year_month, # (year, month) + current_page=1, + ): + year_select = response.xpath("//select[contains(@id, 'ddlAno')]") + year_formkey = year_select.attrib["name"] + + month_select = response.xpath("//select[contains(@id, 'ddlMes')]") + month_formkey = month_select.attrib["name"] + + year, month = current_year_month + + for row in response.xpath( + "//ancestor::a[span[contains(@id, '_grdArquivos_')]]" + ): + raw_string = row.xpath("./span/text()").get() + date_string_from_text = raw_string.split()[-1] + gazette_date = self._parse_date(date_string_from_text) + + if not gazette_date: + self.logger.warning( + f"No valid date could be extracted from '{raw_string}'" + ) + continue + + if gazette_date > self.end_date: + continue + elif gazette_date < self.start_date: + return + + if gazette_date.timetuple()[:2] != current_year_month: + self.logger.warning( + f"Found {gazette_date.isoformat()} gazette while querying" + f" for {current_year_month[0]}-{current_year_month[1]:02}" + f" period. Skipping..." + ) + continue + + url = response.urljoin(row.attrib["href"]) + + file_urls = self.data_by_monthly_date_by_date.setdefault( + current_year_month, {} + ).setdefault(gazette_date, []) + + if url not in file_urls: + # We use this strategy to avoid duplicates while maintaining row order + file_urls.append(url) + + number_of_pages = len( + response.xpath("//ul[contains(@class, 'pagination')]/li").getall() + ) + + if current_page < number_of_pages: + formdata = { + "__EVENTARGUMENT": f"Page${current_page + 1}", + "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos", + year_formkey: str(year), + month_formkey: str(month), + } + + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + cb_kwargs={ + "current_year_month": current_year_month, + "current_page": current_page + 1, + }, + # We keep using the same cookiejar for the name_year_month combination + # because, if we don't, it can interfere with the paging data for + # a different name_year_month combination + meta={"cookiejar": f"{self.name}_{year}_{month}"}, + ) + else: + # After all the entries of the queried year-month period were collected, + # we finally yield the Gazette per date within that month + current_year_month_data = self.data_by_monthly_date_by_date.get( + current_year_month, {} + ) + for gazette_date, file_urls in current_year_month_data.items(): + yield Gazette( + date=gazette_date, + is_extra_edition=False, + file_urls=file_urls, + power="executive", + ) + + def _parse_date(self, raw_date): + return datetime.strptime(raw_date, "%d/%m/%Y").date() From 1c9b41baf2299a76892f6b3b33942052598f2e30 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 02:01:35 -0300 Subject: [PATCH 02/11] =?UTF-8?q?remo=C3=A7=C3=A3o=20de=20vari=C3=A1vel=20?= =?UTF-8?q?usada=20uma=20=C3=BAnica=20vez?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/es/es_vitoria.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index de73e3ec9..1d20606b2 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -5,8 +5,6 @@ from gazette.items import Gazette from gazette.spiders.base import BaseGazetteSpider -BASE_URL = "https://diariooficial.vitoria.es.gov.br/" - class EsVitoriaSpider(BaseGazetteSpider): name = "es_vitoria" @@ -57,14 +55,12 @@ def __init__(self, *args, **kwargs): self.data_by_monthly_date_by_date = {} def start_requests(self): - url = BASE_URL - today = date.today() year = today.year month = today.month yield Request( - url=url, + "https://diariooficial.vitoria.es.gov.br/", callback=self.initial_parse, meta={"cookiejar": f"{self.name}_{year}_{month}"}, ) @@ -157,10 +153,8 @@ def parse_editions_list( year, month = current_year_month - for row in response.xpath( - "//ancestor::a[span[contains(@id, '_grdArquivos_')]]" - ): - raw_string = row.xpath("./span/text()").get() + for row in response.xpath("//tbody//td/a[1]"): + raw_string = row.css("span::text")[0].get() date_string_from_text = raw_string.split()[-1] gazette_date = self._parse_date(date_string_from_text) From e424be9654d57ed670d5e83321551972dda4392b Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 02:04:08 -0300 Subject: [PATCH 03/11] =?UTF-8?q?remo=C3=A7=C3=A3o=20de=20=5F=5Finit=5F=5F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 35 +++---------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 1d20606b2..0b96f4292 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -25,47 +25,22 @@ class EsVitoriaSpider(BaseGazetteSpider): "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } - def __init__(self, *args, **kwargs): - super(EsVitoriaSpider, self).__init__(*args, **kwargs) - - # Period queried for gazette source is based on specific year-month - # Within a queried period, it has a paging mechanism that can spread multiple files of the same date - # We collect all the entries for the year-month period to then generate the gazette entries - - # Considering the above descrition, we use a dict named `data_by_monthly_date_by_date` - # with its keys composed by a 2-tuple - # - year - # - month - # and its items is another nested dict composed by - # - gazette_date - # and its items is a list of str representing the URL of the collected files for that date - - # e.g. - # data_by_monthly_date_by_date = { - # (2022, 12): { - # date(2022, 12, 2): [ - # "https://diariooficial.vitoria.es.gov.br/ExibirArquivo.aspx" - # "?qs=nnmrXIDe5L4hR81FZwDXlD95Q%2fWHOCtXgeCw%2fnRIrFMxQA7S5mwuf0RM3mOCPGtiwqKwtsQd8WTWmli6Dukj2duE%2bcjGeiOYdOhFAaD2d4lajnB7Bs8eXyta5UTj79FJ", - # "https://diariooficial.vitoria.es.gov.br/ExibirArquivo.aspx" - # "?qs=nnmrXIDe5L4hR81FZwDXlD95Q%2fWHOCtXgeCw%2fnRIrFMxQA7S5mwuf0RM3mOCPGtiwqKwtsQd8WTWmli6Dukj2duE%2bcjGeiOY4xkUuS2BQabum9G9l8gOaMHLbesi83TO", - # ] - # } - # } + data_by_monthly_date_by_date = None + def start_requests(self): self.data_by_monthly_date_by_date = {} - def start_requests(self): today = date.today() year = today.year month = today.month yield Request( "https://diariooficial.vitoria.es.gov.br/", - callback=self.initial_parse, - meta={"cookiejar": f"{self.name}_{year}_{month}"}, + callback=self.make_year_request, + meta={"cookiejar": f"{self.name}_{year}_{month}"}, # é necessário? ) - def initial_parse(self, response): + def make_year_request(self, response): year_select = response.xpath("//select[contains(@id, 'ddlAno')]") year_formkey = year_select.attrib["name"] years_available = map(int, year_select.xpath("./option/@value").getall()) From 707707ea75825dd9e84458d27e5af973d4d15386 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 02:54:03 -0300 Subject: [PATCH 04/11] =?UTF-8?q?Atualiza=20m=C3=A9todos=20de=20requisi?= =?UTF-8?q?=C3=A7=C3=B5es=20intermedi=C3=A1rias=20para=20evitar=20controle?= =?UTF-8?q?s=20de=20data=20ao=20carregar=20informa=C3=A7=C3=B5es=20no=20co?= =?UTF-8?q?okiejar?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 87 ++++++------------- 1 file changed, 26 insertions(+), 61 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 0b96f4292..386f66dce 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -1,5 +1,6 @@ from datetime import date, datetime +from dateutil.rrule import MONTHLY, rrule, rruleset from scrapy import FormRequest, Request from gazette.items import Gazette @@ -43,83 +44,47 @@ def start_requests(self): def make_year_request(self, response): year_select = response.xpath("//select[contains(@id, 'ddlAno')]") year_formkey = year_select.attrib["name"] - years_available = map(int, year_select.xpath("./option/@value").getall()) - chosen_year = int( - year_select.xpath("./option[contains(@selected, 'selected')]/@value").get() - ) - - for year in years_available: - if year < self.start_date.year or self.end_date.year < year: - continue - if year == chosen_year: - yield from self.parse_year(response, year) - continue + monthly_dates = rruleset() + monthly_dates.rrule( + rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1]) + ) + monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1)) + for monthly_date in monthly_dates: yield FormRequest.from_response( response, - formdata={year_formkey: str(year)}, - callback=self.parse_year, - cb_kwargs={"year": year}, - # We are isolating cookiejar per name-year-month combination + formdata={year_formkey: str(monthly_date.year)}, + callback=self.make_month_request, + # We are isolating cookiejar like (year, month) combination # to avoid interference between concurrent requests - # Whenever we request a past year, it sets the month to December - meta={"cookiejar": f"{self.name}_{year}_12"}, + meta={"cookiejar": (monthly_date.year, monthly_date.month)}, ) - def parse_year(self, response, year): + def make_month_request(self, response): year_select = response.xpath("//select[contains(@id, 'ddlAno')]") year_formkey = year_select.attrib["name"] month_select = response.xpath("//select[contains(@id, 'ddlMes')]") month_formkey = month_select.attrib["name"] - chosen_month = int( - month_select.xpath("./option[contains(@selected, 'selected')]/@value").get() - ) - - first_day_of_start_date_month = date( - self.start_date.year, self.start_date.month, 1 - ) - - for month in range(1, 13): - first_day_of_month = date(year, month, 1) - if ( - first_day_of_month < first_day_of_start_date_month - or self.end_date < first_day_of_month - ): - continue + year, month = response.meta.get("cookiejar") - current_year_month = (year, month) + formdata = { + "__EVENTTARGET": month_formkey, + "__EVENTARGUMENT": "", + year_formkey: str(year), + month_formkey: str(month), + } - if month == chosen_month: - yield from self.parse_editions_list(response, current_year_month) - continue - - formdata = { - "__EVENTTARGET": month_formkey, - "__EVENTARGUMENT": "", - year_formkey: str(year), - month_formkey: str(month), - } - yield FormRequest.from_response( - response, - formdata=formdata, - callback=self.parse_editions_list, - cb_kwargs={ - "current_year_month": current_year_month, - }, - # We are isolating cookiejar per name-year-month combination - # to avoid interference between concurrent requests - meta={"cookiejar": f"{self.name}_{year}_{month}"}, - ) + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + meta={"cookiejar": response.meta.get("cookiejar")}, + ) - def parse_editions_list( - self, - response, - current_year_month, # (year, month) - current_page=1, - ): + def parse_editions_list(self, response, current_page=1): year_select = response.xpath("//select[contains(@id, 'ddlAno')]") year_formkey = year_select.attrib["name"] From ed14bd665714eddd09ea3b26423892f7aa30f08b Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 04:06:34 -0300 Subject: [PATCH 05/11] =?UTF-8?q?Atualiza=20c=C3=B3digo=20para=20coletar?= =?UTF-8?q?=20par=C3=A2metros=20de=20formul=C3=A1rio=20apenas=20uma=20vez?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 386f66dce..278b95ce7 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -26,7 +26,8 @@ class EsVitoriaSpider(BaseGazetteSpider): "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } - data_by_monthly_date_by_date = None + FORM_PARAM_YEAR = None + FORM_PARAM_MONTH = None def start_requests(self): self.data_by_monthly_date_by_date = {} @@ -41,9 +42,8 @@ def start_requests(self): meta={"cookiejar": f"{self.name}_{year}_{month}"}, # é necessário? ) - def make_year_request(self, response): - year_select = response.xpath("//select[contains(@id, 'ddlAno')]") - year_formkey = year_select.attrib["name"] + def make_year_request(self, response): + self.set_form_params(response) monthly_dates = rruleset() monthly_dates.rrule( @@ -52,29 +52,36 @@ def make_year_request(self, response): monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1)) for monthly_date in monthly_dates: + + formdata={ + self.FORM_PARAM_YEAR: str(monthly_date.year) + } + yield FormRequest.from_response( response, - formdata={year_formkey: str(monthly_date.year)}, + formdata=formdata, callback=self.make_month_request, # We are isolating cookiejar like (year, month) combination # to avoid interference between concurrent requests meta={"cookiejar": (monthly_date.year, monthly_date.month)}, ) - def make_month_request(self, response): + def set_form_params(self, response): year_select = response.xpath("//select[contains(@id, 'ddlAno')]") - year_formkey = year_select.attrib["name"] + self.FORM_PARAM_YEAR = year_select.attrib["name"] month_select = response.xpath("//select[contains(@id, 'ddlMes')]") - month_formkey = month_select.attrib["name"] + self.FORM_PARAM_MONTH = month_select.attrib["name"] + + def make_month_request(self, response): year, month = response.meta.get("cookiejar") formdata = { - "__EVENTTARGET": month_formkey, + "__EVENTTARGET": self.FORM_PARAM_MONTH, "__EVENTARGUMENT": "", - year_formkey: str(year), - month_formkey: str(month), + self.FORM_PARAM_YEAR: str(year), + self.FORM_PARAM_MONTH: str(month), } yield FormRequest.from_response( @@ -85,14 +92,6 @@ def make_month_request(self, response): ) def parse_editions_list(self, response, current_page=1): - year_select = response.xpath("//select[contains(@id, 'ddlAno')]") - year_formkey = year_select.attrib["name"] - - month_select = response.xpath("//select[contains(@id, 'ddlMes')]") - month_formkey = month_select.attrib["name"] - - year, month = current_year_month - for row in response.xpath("//tbody//td/a[1]"): raw_string = row.css("span::text")[0].get() date_string_from_text = raw_string.split()[-1] @@ -133,10 +132,10 @@ def parse_editions_list(self, response, current_page=1): if current_page < number_of_pages: formdata = { - "__EVENTARGUMENT": f"Page${current_page + 1}", + "__EVENTARGUMENT": f"Page${next_page}", "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos", - year_formkey: str(year), - month_formkey: str(month), + self.FORM_PARAM_YEAR: str(year), + self.FORM_PARAM_MONTH: str(month), } yield FormRequest.from_response( From 17200a5c5b3332f07d2f3cdf77f9fcb50060ca4b Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 04:09:18 -0300 Subject: [PATCH 06/11] =?UTF-8?q?Atualiza=20l=C3=B3gica=20de=20pagina?= =?UTF-8?q?=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 81 +++++-------------- 1 file changed, 20 insertions(+), 61 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 278b95ce7..566e50be0 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -93,44 +93,25 @@ def make_month_request(self, response): def parse_editions_list(self, response, current_page=1): for row in response.xpath("//tbody//td/a[1]"): - raw_string = row.css("span::text")[0].get() - date_string_from_text = raw_string.split()[-1] - gazette_date = self._parse_date(date_string_from_text) + raw_date = row.css("span::text")[0].get().split()[-1] + gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date() - if not gazette_date: - self.logger.warning( - f"No valid date could be extracted from '{raw_string}'" - ) - continue - - if gazette_date > self.end_date: - continue - elif gazette_date < self.start_date: - return - - if gazette_date.timetuple()[:2] != current_year_month: - self.logger.warning( - f"Found {gazette_date.isoformat()} gazette while querying" - f" for {current_year_month[0]}-{current_year_month[1]:02}" - f" period. Skipping..." + if self.start_date <= gazette_date <= self.end_date: + url = response.urljoin(row.css("a").attrib["href"]) + + yield Gazette( + date=gazette_date, + edition_number="", + is_extra_edition=False, + file_urls=[url], + power="executive", ) - continue - - url = response.urljoin(row.attrib["href"]) - - file_urls = self.data_by_monthly_date_by_date.setdefault( - current_year_month, {} - ).setdefault(gazette_date, []) - - if url not in file_urls: - # We use this strategy to avoid duplicates while maintaining row order - file_urls.append(url) - - number_of_pages = len( - response.xpath("//ul[contains(@class, 'pagination')]/li").getall() - ) - - if current_page < number_of_pages: + + has_next_page = response.css(".pagination li")[-1].css("a::text").get() is not None + if has_next_page: + next_page = current_page + 1 + year, month = response.meta.get("cookiejar") + formdata = { "__EVENTARGUMENT": f"Page${next_page}", "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos", @@ -142,28 +123,6 @@ def parse_editions_list(self, response, current_page=1): response, formdata=formdata, callback=self.parse_editions_list, - cb_kwargs={ - "current_year_month": current_year_month, - "current_page": current_page + 1, - }, - # We keep using the same cookiejar for the name_year_month combination - # because, if we don't, it can interfere with the paging data for - # a different name_year_month combination - meta={"cookiejar": f"{self.name}_{year}_{month}"}, - ) - else: - # After all the entries of the queried year-month period were collected, - # we finally yield the Gazette per date within that month - current_year_month_data = self.data_by_monthly_date_by_date.get( - current_year_month, {} - ) - for gazette_date, file_urls in current_year_month_data.items(): - yield Gazette( - date=gazette_date, - is_extra_edition=False, - file_urls=file_urls, - power="executive", - ) - - def _parse_date(self, raw_date): - return datetime.strptime(raw_date, "%d/%m/%Y").date() + cb_kwargs={"current_page": next_page}, + meta={"cookiejar": response.meta.get("cookiejar")}, + ) \ No newline at end of file From 88a25c11ff981929fa62b51f7bd3347dc88207e6 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 04:34:05 -0300 Subject: [PATCH 07/11] =?UTF-8?q?Aplica=20modifica=C3=A7=C3=B5es=20do=20li?= =?UTF-8?q?nter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 566e50be0..ee5e70620 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -9,10 +9,9 @@ class EsVitoriaSpider(BaseGazetteSpider): name = "es_vitoria" - TERRITORY_ID = "3205309" - start_date = date(2014, 7, 21) - + TERRITORY_ID = "3205309" allowed_domains = ["diariooficial.vitoria.es.gov.br"] + start_date = date(2014, 7, 21) # When there are too many requests, the server may return # an HTTP 406 status code when trying to download a PDF file @@ -30,19 +29,12 @@ class EsVitoriaSpider(BaseGazetteSpider): FORM_PARAM_MONTH = None def start_requests(self): - self.data_by_monthly_date_by_date = {} - - today = date.today() - year = today.year - month = today.month - yield Request( "https://diariooficial.vitoria.es.gov.br/", callback=self.make_year_request, - meta={"cookiejar": f"{self.name}_{year}_{month}"}, # é necessário? ) - def make_year_request(self, response): + def make_year_request(self, response): self.set_form_params(response) monthly_dates = rruleset() @@ -52,10 +44,7 @@ def make_year_request(self, response): monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1)) for monthly_date in monthly_dates: - - formdata={ - self.FORM_PARAM_YEAR: str(monthly_date.year) - } + formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)} yield FormRequest.from_response( response, @@ -74,14 +63,14 @@ def set_form_params(self, response): self.FORM_PARAM_MONTH = month_select.attrib["name"] - def make_month_request(self, response): + def make_month_request(self, response): year, month = response.meta.get("cookiejar") formdata = { - "__EVENTTARGET": self.FORM_PARAM_MONTH, - "__EVENTARGUMENT": "", self.FORM_PARAM_YEAR: str(year), self.FORM_PARAM_MONTH: str(month), + "__EVENTTARGET": self.FORM_PARAM_MONTH, + "__EVENTARGUMENT": "", } yield FormRequest.from_response( @@ -98,7 +87,7 @@ def parse_editions_list(self, response, current_page=1): if self.start_date <= gazette_date <= self.end_date: url = response.urljoin(row.css("a").attrib["href"]) - + yield Gazette( date=gazette_date, edition_number="", @@ -106,17 +95,19 @@ def parse_editions_list(self, response, current_page=1): file_urls=[url], power="executive", ) - - has_next_page = response.css(".pagination li")[-1].css("a::text").get() is not None + + has_next_page = ( + response.css(".pagination li")[-1].css("a::text").get() is not None + ) if has_next_page: next_page = current_page + 1 year, month = response.meta.get("cookiejar") - + formdata = { - "__EVENTARGUMENT": f"Page${next_page}", - "__EVENTTARGET": "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos", self.FORM_PARAM_YEAR: str(year), self.FORM_PARAM_MONTH: str(month), + "__EVENTTARGET": self.FORM_PARAM_PAGINATION, + "__EVENTARGUMENT": f"Page${next_page}", } yield FormRequest.from_response( @@ -125,4 +116,4 @@ def parse_editions_list(self, response, current_page=1): callback=self.parse_editions_list, cb_kwargs={"current_page": next_page}, meta={"cookiejar": response.meta.get("cookiejar")}, - ) \ No newline at end of file + ) From 034395a9fef444982f4cc0f2d4192550b5bcc8c6 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 04:35:11 -0300 Subject: [PATCH 08/11] =?UTF-8?q?Torna=20par=C3=A2metros=20do=20formul?= =?UTF-8?q?=C3=A1rio=20hardcoded?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index ee5e70620..e83111be8 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -9,7 +9,7 @@ class EsVitoriaSpider(BaseGazetteSpider): name = "es_vitoria" - TERRITORY_ID = "3205309" + TERRITORY_ID = "3205309" allowed_domains = ["diariooficial.vitoria.es.gov.br"] start_date = date(2014, 7, 21) @@ -25,8 +25,9 @@ class EsVitoriaSpider(BaseGazetteSpider): "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } - FORM_PARAM_YEAR = None - FORM_PARAM_MONTH = None + FORM_PARAM_YEAR = "ctl00$conteudo$ucPesquisarDiarioOficial$ddlAno" + FORM_PARAM_MONTH = "ctl00$conteudo$ucPesquisarDiarioOficial$ddlMes" + FORM_PARAM_PAGINATION = "ctl00$conteudo$ucPesquisarDiarioOficial$grdArquivos" def start_requests(self): yield Request( @@ -35,8 +36,6 @@ def start_requests(self): ) def make_year_request(self, response): - self.set_form_params(response) - monthly_dates = rruleset() monthly_dates.rrule( rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1]) @@ -55,14 +54,6 @@ def make_year_request(self, response): meta={"cookiejar": (monthly_date.year, monthly_date.month)}, ) - def set_form_params(self, response): - year_select = response.xpath("//select[contains(@id, 'ddlAno')]") - self.FORM_PARAM_YEAR = year_select.attrib["name"] - - month_select = response.xpath("//select[contains(@id, 'ddlMes')]") - self.FORM_PARAM_MONTH = month_select.attrib["name"] - - def make_month_request(self, response): year, month = response.meta.get("cookiejar") From 78faf2043e0b226d1f00dc6b98be902868b54510 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 05:37:42 -0300 Subject: [PATCH 09/11] =?UTF-8?q?Ajusta=20custom=5Fsettings,=20pagina?= =?UTF-8?q?=C3=A7=C3=A3o=20e=20convers=C3=A3o=20de=20data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 56 ++++++++----------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index e83111be8..08e9ba12c 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -1,4 +1,4 @@ -from datetime import date, datetime +from datetime import date, datetime as dt from dateutil.rrule import MONTHLY, rrule, rruleset from scrapy import FormRequest, Request @@ -13,14 +13,8 @@ class EsVitoriaSpider(BaseGazetteSpider): allowed_domains = ["diariooficial.vitoria.es.gov.br"] start_date = date(2014, 7, 21) - # When there are too many requests, the server may return - # an HTTP 406 status code when trying to download a PDF file - # - # We set `custom_settings` to avoid triggering the 406 HTTP status code - # by spreading the downloads for this spider over time - custom_settings = { - "DOWNLOAD_DELAY": 0.3, # 300 ms + "DOWNLOAD_DELAY": 0.3, "RANDOMIZE_DOWNLOAD_DELAY": True, "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } @@ -40,7 +34,7 @@ def make_year_request(self, response): monthly_dates.rrule( rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1]) ) - monthly_dates.rdate(date(self.start_date.year, self.start_date.month, 1)) + monthly_dates.rdate(dt(self.start_date.year, self.start_date.month, 1)) for monthly_date in monthly_dates: formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)} @@ -49,7 +43,7 @@ def make_year_request(self, response): response, formdata=formdata, callback=self.make_month_request, - # We are isolating cookiejar like (year, month) combination + # We are isolating cookiejar in (year, month) combination # to avoid interference between concurrent requests meta={"cookiejar": (monthly_date.year, monthly_date.month)}, ) @@ -74,7 +68,7 @@ def make_month_request(self, response): def parse_editions_list(self, response, current_page=1): for row in response.xpath("//tbody//td/a[1]"): raw_date = row.css("span::text")[0].get().split()[-1] - gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date() + gazette_date = dt.strptime(raw_date, "%d/%m/%Y").date() if self.start_date <= gazette_date <= self.end_date: url = response.urljoin(row.css("a").attrib["href"]) @@ -87,24 +81,22 @@ def parse_editions_list(self, response, current_page=1): power="executive", ) - has_next_page = ( - response.css(".pagination li")[-1].css("a::text").get() is not None - ) - if has_next_page: - next_page = current_page + 1 - year, month = response.meta.get("cookiejar") - - formdata = { - self.FORM_PARAM_YEAR: str(year), - self.FORM_PARAM_MONTH: str(month), - "__EVENTTARGET": self.FORM_PARAM_PAGINATION, - "__EVENTARGUMENT": f"Page${next_page}", - } - - yield FormRequest.from_response( - response, - formdata=formdata, - callback=self.parse_editions_list, - cb_kwargs={"current_page": next_page}, - meta={"cookiejar": response.meta.get("cookiejar")}, - ) + if "pagination" in response.text: + if response.css(".pagination li")[-1].css("a::text").get(): + next_page = current_page + 1 + year, month = response.meta.get("cookiejar") + + formdata = { + self.FORM_PARAM_YEAR: str(year), + self.FORM_PARAM_MONTH: str(month), + "__EVENTTARGET": self.FORM_PARAM_PAGINATION, + "__EVENTARGUMENT": f"Page${next_page}", + } + + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + cb_kwargs={"current_page": next_page}, + meta={"cookiejar": response.meta.get("cookiejar")}, + ) From a14da77267c996ec284c530268d665a9fda61b87 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 09:13:03 -0300 Subject: [PATCH 10/11] =?UTF-8?q?Atualiza=20l=C3=B3gica=20de=20sele=C3=A7?= =?UTF-8?q?=C3=A3o=20de=20datas=20para=20serem=20requisitadas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/es/es_vitoria.py | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 08e9ba12c..86567e9a8 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -1,6 +1,6 @@ from datetime import date, datetime as dt -from dateutil.rrule import MONTHLY, rrule, rruleset +from dateutil.rrule import MONTHLY, YEARLY, rrule, rruleset from scrapy import FormRequest, Request from gazette.items import Gazette @@ -30,14 +30,12 @@ def start_requests(self): ) def make_year_request(self, response): - monthly_dates = rruleset() - monthly_dates.rrule( - rrule(MONTHLY, dtstart=self.start_date, until=self.end_date, bymonthday=[1]) - ) - monthly_dates.rdate(dt(self.start_date.year, self.start_date.month, 1)) - - for monthly_date in monthly_dates: - formdata = {self.FORM_PARAM_YEAR: str(monthly_date.year)} + for yearly_date in self._dates_of_interest(YEARLY): + formdata = { + self.FORM_PARAM_YEAR: str(yearly_date.year), + "__EVENTTARGET": self.FORM_PARAM_YEAR, + "__EVENTARGUMENT": "", + } yield FormRequest.from_response( response, @@ -45,25 +43,28 @@ def make_year_request(self, response): callback=self.make_month_request, # We are isolating cookiejar in (year, month) combination # to avoid interference between concurrent requests - meta={"cookiejar": (monthly_date.year, monthly_date.month)}, + meta={"cookiejar": (yearly_date.year)}, ) def make_month_request(self, response): - year, month = response.meta.get("cookiejar") - - formdata = { - self.FORM_PARAM_YEAR: str(year), - self.FORM_PARAM_MONTH: str(month), - "__EVENTTARGET": self.FORM_PARAM_MONTH, - "__EVENTARGUMENT": "", - } - - yield FormRequest.from_response( - response, - formdata=formdata, - callback=self.parse_editions_list, - meta={"cookiejar": response.meta.get("cookiejar")}, - ) + year = response.meta.get("cookiejar") + + for monthly_date in self._dates_of_interest(MONTHLY): + if dt(year, 1, 1) <= monthly_date <= dt(year, 12, 31): + + formdata = { + self.FORM_PARAM_YEAR: str(monthly_date.year), + self.FORM_PARAM_MONTH: str(monthly_date.month), + "__EVENTTARGET": self.FORM_PARAM_MONTH, + "__EVENTARGUMENT": "", + } + + yield FormRequest.from_response( + response, + formdata=formdata, + callback=self.parse_editions_list, + meta={"cookiejar": (monthly_date.year, monthly_date.month)}, + ) def parse_editions_list(self, response, current_page=1): for row in response.xpath("//tbody//td/a[1]"): @@ -100,3 +101,9 @@ def parse_editions_list(self, response, current_page=1): cb_kwargs={"current_page": next_page}, meta={"cookiejar": response.meta.get("cookiejar")}, ) + + def _dates_of_interest(self, recurrence): + dates = rruleset() + dates.rrule(rrule(recurrence, dtstart=self.start_date, until=self.end_date, bymonthday=[1])) + dates.rdate(dt(self.start_date.year, self.start_date.month, 1)) + return dates \ No newline at end of file From e75155f1c0d4d97a7d2721b353d92d96149711a3 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 13 Jan 2025 09:54:38 -0300 Subject: [PATCH 11/11] Simplifica custom_settings e campos de formrequest sendo solicitados --- data_collection/gazette/spiders/es/es_vitoria.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/data_collection/gazette/spiders/es/es_vitoria.py b/data_collection/gazette/spiders/es/es_vitoria.py index 86567e9a8..64fd82c85 100644 --- a/data_collection/gazette/spiders/es/es_vitoria.py +++ b/data_collection/gazette/spiders/es/es_vitoria.py @@ -16,7 +16,6 @@ class EsVitoriaSpider(BaseGazetteSpider): custom_settings = { "DOWNLOAD_DELAY": 0.3, "RANDOMIZE_DOWNLOAD_DELAY": True, - "RETRY_HTTP_CODES": [500, 502, 503, 504, 522, 524, 408, 429, 406], } FORM_PARAM_YEAR = "ctl00$conteudo$ucPesquisarDiarioOficial$ddlAno" @@ -51,12 +50,9 @@ def make_month_request(self, response): for monthly_date in self._dates_of_interest(MONTHLY): if dt(year, 1, 1) <= monthly_date <= dt(year, 12, 31): - formdata = { - self.FORM_PARAM_YEAR: str(monthly_date.year), self.FORM_PARAM_MONTH: str(monthly_date.month), "__EVENTTARGET": self.FORM_PARAM_MONTH, - "__EVENTARGUMENT": "", } yield FormRequest.from_response( @@ -85,11 +81,8 @@ def parse_editions_list(self, response, current_page=1): if "pagination" in response.text: if response.css(".pagination li")[-1].css("a::text").get(): next_page = current_page + 1 - year, month = response.meta.get("cookiejar") formdata = { - self.FORM_PARAM_YEAR: str(year), - self.FORM_PARAM_MONTH: str(month), "__EVENTTARGET": self.FORM_PARAM_PAGINATION, "__EVENTARGUMENT": f"Page${next_page}", } @@ -104,6 +97,10 @@ def parse_editions_list(self, response, current_page=1): def _dates_of_interest(self, recurrence): dates = rruleset() - dates.rrule(rrule(recurrence, dtstart=self.start_date, until=self.end_date, bymonthday=[1])) + dates.rrule( + rrule( + recurrence, dtstart=self.start_date, until=self.end_date, bymonthday=[1] + ) + ) dates.rdate(dt(self.start_date.year, self.start_date.month, 1)) - return dates \ No newline at end of file + return dates