Skip to content

Commit

Permalink
update download_cnefe
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Jan 10, 2025
1 parent 2abab31 commit 71eeff7
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 96 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ Imports:
purrr,
rlang,
tibble,
tools,
zip
tools
Suggests:
covr,
DBI,
Expand Down
64 changes: 27 additions & 37 deletions R/download_cnefe.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
#' for Statistical Purposes, in Portuguese) data set, purposefully built to be
#' used with this package.
#'
#' @param state A character vector. The states whose CNEFE data should be
#' downloaded. Either `"all"` (the default), in which case the data for all
#' states is downloaded, or a vector with the states abbreviations (e.g.
#' `c("RJ", "DF")` to download the data for Rio de Janeiro and the Federal
#' District).
#' @param progress A logical. Whether to display a download progress bar.
#' Defaults to `TRUE`.
#' @template cache
Expand All @@ -18,19 +13,34 @@
#' @family Support
#'
#' @examplesIf identical(tolower(Sys.getenv("NOT_CRAN")), "true")
#' download_cnefe(state = "AC", progress = FALSE)
#'
#' download_cnefe(state = c("AC", "AL"), progress = FALSE)
#' download_cnefe(progress = FALSE)
#'
#'
#' @export
download_cnefe <- function(state = "all", progress = TRUE, cache = TRUE) {
download_cnefe <- function(progress = TRUE, cache = TRUE) {

checkmate::assert_logical(progress, any.missing = FALSE, len = 1)
checkmate::assert_logical(cache, any.missing = FALSE, len = 1)
state <- assert_and_assign_state(state)

data_url <- glue::glue(
all_files <- c(
"municipio_logradouro_numero_localidade.parquet", # 4 largest files
"municipio.parquet",
"municipio_cep.parquet",
"municipio_cep_localidade.parquet",
"municipio_logradouro_numero_cep_localidade.parquet", # 4 largest files
"municipio_localidade.parquet",
"municipio_logradouro.parquet",
"municipio_logradouro_numero_cep.parquet", # 4 largest files
"municipio_logradouro_cep.parquet",
"municipio_logradouro_cep_localidade.parquet",
"municipio_logradouro_numero.parquet", # 4 largest files
"municipio_logradouro_localidade.parquet"
)

data_urls <- glue::glue(
"https://github.com/ipeaGIT/padronizacao_cnefe/releases/",
"download/{data_release}/estado.{state}.zip"
"download/{data_release}/{all_files}"
)

if (!cache) {
Expand All @@ -43,43 +53,23 @@ download_cnefe <- function(state = "all", progress = TRUE, cache = TRUE) {
# we only need to download data that hasn't been downloaded yet. note that if
# cache=FALSE data_dir is always empty, so we download all required data

existing_data <- list.files(data_dir)
existing_states <- substr(existing_data, start = 8, stop = 9)

states_to_download <- setdiff(state, existing_states)
files_to_download <- data_url[state %in% states_to_download]
existing_files <- list.files(data_dir)

zip_paths <- download_files(files_to_download, progress)
files_to_download <- setdiff(all_files, existing_files)
files_to_download <- data_urls[all_files %in% files_to_download]

purrr::walk(
zip_paths,
function(zipfile) zip::unzip(zipfile, exdir = data_dir)
)
downloaded_files <- download_files(files_to_download, progress)

return(invisible(data_dir))
}

assert_and_assign_state <- function(state) {
all_states <- c(
"RO", "AC", "AM", "RR", "PA", "AP", "TO", "MA", "PI", "CE", "RN", "PB",
"PE", "AL", "SE", "BA", "MG", "ES", "RJ", "SP", "PR", "SC", "RS", "MS",
"MT", "GO", "DF"
)

checkmate::assert_names(state, subset.of = c("all", all_states))

if ("all" %in% state) state <- all_states

return(state)
}

download_files <- function(files_to_download, progress) {
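# we always download the files to a temporary directory to prevent any
# potential "garbage" in our cache dir (in case the download fails for some
# reason or the unzipping process crashes mid-operation)
# NOTE(review): this rationale looks stale after this change -- download_dir
# is now the cache dir itself and nothing is unzipped anymore; confirm that a
# failed download cannot leave partial files in the cache, then update this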

download_dir <- tempfile("zipped_standardized_cnefe")
fs::dir_create(download_dir)
download_dir <- geocodebr::get_cache_dir()

requests <- lapply(files_to_download, httr2::request)

Expand Down Expand Up @@ -119,7 +109,7 @@ perform_requests_in_parallel <- function(requests, dest_files, progress) {
error_cnefe_download_failed <- function() {
geocodebr_error(
c(
"Could not download CNEFE data for one or more states.",
"Could not download CNEFE data for one or more files",
"i" = "Please try again later."
),
call = rlang::caller_env(n = 2)
Expand Down
2 changes: 1 addition & 1 deletion R/progress_bar.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ update_progress_bar <- function(n_rows_affected,
.envir = parent.frame()) {
cli::cli_progress_update(
inc = n_rows_affected,
status = glue::glue("Looking for case {formatted_case} matches"),
status = glue::glue("Looking for match type {formatted_case}"),
force = TRUE,
.envir = .envir
)
Expand Down
58 changes: 29 additions & 29 deletions inst/extdata/small_sample.csv
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
id,nm_logradouro,Numero,Complemento,Cep,Bairro,nm_municipio,code_muni,nm_uf
1,RUA MARIA LUCIA PACIFICO,17,CASA,26042-730,SANTA RITA,NOVA IGUACU,3303500,RIO DE JANEIRO
2,RUA LEOPOLDINA TOME,46,FUNDOS,25030-050,CENTENARIO,DUQUE DE CAXIAS,3301702,RIO DE JANEIRO
3,RUA DONA JUDITE,0,CASA,23915-700,CAPUTERA II,ANGRA DOS REIS,3300100,RIO DE JANEIRO
4,RUA ALEXANDRE AMARAL,0,QUADRA,23098-120,SANTISSIMO,RIO DE JANEIRO,3304557,RIO DE JANEIRO
5,AVENIDA E,300,CASA,23860-000,PRAIA GRANDE,MANGARATIBA,3302601,RIO DE JANEIRO
6,RUA PRINCESA ISABEL,263,APARTAMENTO,69921-026,ESTACAO EXPERIMENTAL,RIO BRANCO,1200401,ACRE
7,CAMINHO SEM DENOMINACAO,2530,NA,69980-000,MIRITIZAL,CRUZEIRO DO SUL,1200203,ACRE
8,RUA DO LIMAO,268,NA,69920-048,MOCINHA MAGALHAES,RIO BRANCO,1200401,ACRE
9,RUA GERALDO MESQUITA,0,FUNDOS,69918-202,NOVA ESTACAO,RIO BRANCO,1200401,ACRE
10,AVENIDA CASTELO BRANCO,3875,NA,69925-000,DEMOCRACIA,SENADOR GUIOMARD,1200450,ACRE
11,RUA LUCIO MARINHO CRUZ,91,NA,57180-000,CENTRO,BARRA DE SAO MIGUEL,2501708,ALAGOAS
12,CONJUNTO GISLENE MATEUS,12,QUADRA,57160-000,TAPERAGUA,MARECHAL DEODORO,2704708,ALAGOAS
13,RUA MONTEIROPOLIS,6,NA,57071-238,CLIMA BOM,MACEIO,2704302,ALAGOAS
14,RUA PRESIDENTE GETULIO VARGAS,1303,NA,57580-000,CENTRO,MAJOR ISIDORO,2704401,ALAGOAS
15,RUA FRANCISCO DE HOLANDA,9,NA,57073-735,CIDADE UNIVERSITARIA,MACEIO,2704302,ALAGOAS
16,RUA MARIA LUCIA PACIFICO,17,CASA,26042-730,,NOVA IGUACU,3303500,RIO DE JANEIRO
17,RUA PRINCESA ISABEL,263,APARTAMENTO,,ESTACAO EXPERIMENTAL,RIO BRANCO,1200401,ACRE
18,RUA LUCIO MARINHO CRUZ,91,NA,,,BARRA DE SAO MIGUEL,2501708,ALAGOAS
19,,,,,,Rio Branco,1200401,ACRE
20,RUA RUI BARBOSA,108,,,,Rio Branco,1200401,ACRE
21,RUA RUI BARBOSA,,,,,Rio Branco,1200401,ACRE
22,,,,,Copacabana,RIO DE JANEIRO,3304557,RIO DE JANEIRO
23,,,,22280-005,Botafogo,RIO DE JANEIRO,3304557,RIO DE JANEIRO
24,,,,22280-005,,RIO DE JANEIRO,3304557,RIO DE JANEIRO
25,RUA LEOPOLDINA TOME,,,25030-050,CENTENARIO,DUQUE DE CAXIAS,3301702,RIO DE JANEIRO
26,RUA LEOPOLDINA TOME,46,,25030-050,,DUQUE DE CAXIAS,3301702,RIO DE JANEIRO
27,CONJUNTO GISLENE MATEUS,,,57160-000,,MARECHAL DEODORO,2704708,ALAGOAS
28,RUA MARIA LUCIA PACIFICO,,,,SANTA RITA,NOVA IGUACU,3303500,RIO DE JANEIRO
id;nm_logradouro;Numero;Cep;Bairro;nm_municipio;code_muni;nm_uf
1;RUA MARIA LUCIA PACIFICO;17;26042-730;SANTA RITA;NOVA IGUACU;3303500;RIO DE JANEIRO
2;RUA LEOPOLDINA TOME;46;25030-050;CENTENARIO;DUQUE DE CAXIAS;3301702;RIO DE JANEIRO
3;RUA DONA JUDITE;0;23915-700;CAPUTERA II;ANGRA DOS REIS;3300100;RIO DE JANEIRO
4;RUA ALEXANDRE AMARAL;0;23098-120;SANTISSIMO;RIO DE JANEIRO;3304557;RIO DE JANEIRO
5;AVENIDA E;300;23860-000;PRAIA GRANDE;MANGARATIBA;3302601;RIO DE JANEIRO
6;RUA PRINCESA ISABEL;263;69921-026;ESTACAO EXPERIMENTAL;RIO BRANCO;1200401;ACRE
7;CAMINHO SEM DENOMINACAO;2530;69980-000;MIRITIZAL;CRUZEIRO DO SUL;1200203;ACRE
8;RUA DO LIMAO;268;69920-048;MOCINHA MAGALHAES;RIO BRANCO;1200401;ACRE
9;RUA GERALDO MESQUITA;0;69918-202;NOVA ESTACAO;RIO BRANCO;1200401;ACRE
10;AVENIDA CASTELO BRANCO;3875;69925-000;DEMOCRACIA;SENADOR GUIOMARD;1200450;ACRE
11;RUA LUCIO MARINHO CRUZ;91;57180-000;CENTRO;BARRA DE SAO MIGUEL;2501708;ALAGOAS
12;CONJUNTO GISLENE MATEUS;12;57160-000;TAPERAGUA;MARECHAL DEODORO;2704708;ALAGOAS
13;RUA MONTEIROPOLIS;6;57071-238;CLIMA BOM;MACEIO;2704302;ALAGOAS
14;RUA PRESIDENTE GETULIO VARGAS;1303;57580-000;CENTRO;MAJOR ISIDORO;2704401;ALAGOAS
15;RUA FRANCISCO DE HOLANDA;9;57073-735;CIDADE UNIVERSITARIA;MACEIO;2704302;ALAGOAS
16;RUA MARIA LUCIA PACIFICO;17;26042-730;;NOVA IGUACU;3303500;RIO DE JANEIRO
17;RUA PRINCESA ISABEL;263;;ESTACAO EXPERIMENTAL;RIO BRANCO;1200401;ACRE
18;RUA LUCIO MARINHO CRUZ;91;;;BARRA DE SAO MIGUEL;2501708;ALAGOAS
19;;;;;Rio Branco;1200401;ACRE
20;RUA RUI BARBOSA;108;;;Rio Branco;1200401;ACRE
21;RUA RUI BARBOSA;;;;Rio Branco;1200401;ACRE
22;;;;Copacabana;RIO DE JANEIRO;3304557;RIO DE JANEIRO
23;;;22280-005;Botafogo;RIO DE JANEIRO;3304557;RIO DE JANEIRO
24;;;22280-005;;RIO DE JANEIRO;3304557;RIO DE JANEIRO
25;RUA LEOPOLDINA TOME;;25030-050;CENTENARIO;DUQUE DE CAXIAS;3301702;RIO DE JANEIRO
26;RUA LEOPOLDINA TOME;46;25030-050;;DUQUE DE CAXIAS;3301702;RIO DE JANEIRO
27;CONJUNTO GISLENE MATEUS;;57160-000;;MARECHAL DEODORO;2704708;ALAGOAS
28;RUA MARIA LUCIA PACIFICO;;;SANTA RITA;NOVA IGUACU;3303500;RIO DE JANEIRO
12 changes: 3 additions & 9 deletions man/download_cnefe.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 16 additions & 16 deletions tests/testthat/test-download_cnefe.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
tester <- function(state = "all", progress = TRUE, cache = TRUE) {
download_cnefe(state, progress, cache)
tester <- function(progress = TRUE, cache = TRUE) {
download_cnefe(progress, cache)
}

test_that("errors with incorrect input", {
Expand All @@ -16,15 +16,15 @@ test_that("errors with incorrect input", {
})

test_that("returns the path to the directory where the files were saved", {
result <- tester("AL")
result <- tester()
expect_identical(result, file.path(get_cache_dir()))
})

test_that("cache usage is controlled by the cache argument", {
result <- tester("AL", cache = TRUE)
result <- tester(cache = TRUE)
expect_identical(result, file.path(get_cache_dir()))

result <- tester("AL", cache = FALSE)
result <- tester(cache = FALSE)
expect_true(
grepl(file.path(fs::path_norm(tempdir()), "standardized_cnefe"), result)
)
Expand All @@ -48,14 +48,14 @@ test_that("errors if could not download the data for one or more states", {
expect_snapshot(tester("AL", cache = FALSE), error = TRUE, cnd_class = TRUE)
})

test_that("would download the data of all states if state='all'", {
local_mocked_bindings(
perform_requests_in_parallel = function(requests, ...) {
if (length(requests) == 27) {
cli::cli_abort("Too much to download", class = "state_all_succeeded")
}
}
)

expect_error(tester("all", cache = FALSE), class = "state_all_succeeded")
})
# test_that("would download the data of all states if state='all'", {
# local_mocked_bindings(
# perform_requests_in_parallel = function(requests, ...) {
# if (length(requests) == 27) {
# cli::cli_abort("Too much to download", class = "state_all_succeeded")
# }
# }
# )
#
# expect_error(tester("all", cache = FALSE), class = "state_all_succeeded")
# })
6 changes: 4 additions & 2 deletions tests/testthat/test_cache.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ test_that("behaves correctly", {

expect_identical(list_cached_data(), character(0))

download_cnefe(c("AC", "AL"), progress = FALSE)
expect_true(all(grepl("estado=(AC|AL)", list_cached_data())))
download_cnefe( progress = FALSE)
expect_true(all(grepl(".parquet", list_cached_data())))

expect_true(sum(grepl(".parquet", list_cached_data()))==12)

# expect a tree-like message when print_tree=TRUE

Expand Down

0 comments on commit 71eeff7

Please sign in to comment.