From 046f5d32a0010906e7886edc3bcc80f17f0c2b2f Mon Sep 17 00:00:00 2001 From: Rafael Henrique Moraes Pereira Date: Thu, 19 Dec 2024 12:52:18 -0300 Subject: [PATCH] filtered_cnefe with arrow --- R/geocode.R | 29 +++++++++++------------- R/geocode_rafa.R | 5 ++-- tests/tests_rafa/tests_arrow_vs_duckdb.R | 11 ++++----- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/R/geocode.R b/R/geocode.R index 442cc00..e300194 100644 --- a/R/geocode.R +++ b/R/geocode.R @@ -97,19 +97,14 @@ geocode <- function(addresses_table, ) # downloading cnefe. we only need to download the states present in the - # addresses table, which may save us some time. we also subset cnefe to - # include only the municipalities present in the input table, reducing the - # search scope and consequently reducing processing time and memory usage + # addresses table, which may save us some time. present_states <- unique(standard_locations$estado_padr) download_cnefe(present_states, progress = progress, cache = cache) - cnefe <- arrow::open_dataset(get_cache_dir()) - cnefe <- dplyr::filter(cnefe, estado %in% present_states) - # creating a temporary db and registering both the input table and the cnefe - # data + # creating a temporary db and register the input table data tmpdb <- tempfile(fileext = ".duckdb") con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = tmpdb) @@ -123,17 +118,19 @@ geocode <- function(addresses_table, temporary = TRUE ) + # register cnefe data to db but only include states and municipalities + # present in the input table, reducing the search scope and + # consequently reducing processing time and memory usage + unique_muns <- unique(standard_locations$municipio_padr) - muns_list <- paste(glue::glue("'{unique_muns}'"), collapse = ", ") - duckdb::duckdb_register_arrow(con, "cnefe", cnefe) - DBI::dbExecute( - con, - glue::glue( - "CREATE OR REPLACE VIEW filtered_cnefe AS ", - "SELECT * FROM cnefe WHERE municipio IN ({muns_list})" - ) - ) + filtered_cnefe <- arrow::open_dataset(get_cache_dir()) |> + dplyr::filter(estado %in% present_states) |> + dplyr::filter(municipio %in% unique_muns) |> + dplyr::compute() + + duckdb::duckdb_register_arrow(con, "filtered_cnefe", filtered_cnefe) + # to find the coordinates of the addresses, we merge the input table with the # cnefe data. the column names used in the input table are different than the diff --git a/R/geocode_rafa.R b/R/geocode_rafa.R index 01b9e41..97de150 100644 --- a/R/geocode_rafa.R +++ b/R/geocode_rafa.R @@ -96,7 +96,7 @@ geocode_rafa <- function(input_table, # Convert input data frame to DuckDB table duckdb::dbWriteTable(con, "input_padrao_db", input_padrao, - temporary = TRUE, overwrite=TRUE) + temporary = TRUE) input_states <- unique(input_padrao$estado) @@ -123,7 +123,8 @@ geocode_rafa <- function(input_table, if(is.null(input_municipio)){ input_municipio <- "*"} # Load CNEFE data and write to DuckDB - filtered_cnefe <- arrow_open_dataset(geocodebr::get_cache_dir()) |> + filtered_cnefe <- arrow::open_dataset(get_cache_dir()) |> + dplyr::filter(estado %in% input_states) |> dplyr::filter(municipio %in% input_municipio) |> dplyr::compute() diff --git a/tests/tests_rafa/tests_arrow_vs_duckdb.R b/tests/tests_rafa/tests_arrow_vs_duckdb.R index 8b7eb22..5a27fc2 100644 --- a/tests/tests_rafa/tests_arrow_vs_duckdb.R +++ b/tests/tests_rafa/tests_arrow_vs_duckdb.R @@ -58,7 +58,6 @@ fields <- geocodebr::setup_address_fields( estado = 'nm_uf' ) - df_duck_dani <- geocodebr:::geocode( addresses_table = input_df, address_fields = fields, @@ -79,7 +78,7 @@ df_duck_rafa <- geocodebr:::geocode_rafa( municipio = "nm_municipio", estado = "nm_uf", output_simple = F, - ncores=7, + n_cores=7, progress = T ) tictoc::toc() @@ -147,17 +146,16 @@ query_aggregate_and_match <- sprintf( rafa <- function(){ - df_duck_rafa <- geocode( + df_duck_rafa <- geocodebr:::geocode_rafa( input_table = input_df, logradouro = "nm_logradouro", numero = "Numero", - complemento = "Complemento", cep = "Cep", bairro = "Bairro", municipio = "nm_municipio", estado = "nm_uf", output_simple = F, - ncores=7, + n_cores=7, progress = T ) } @@ -166,7 +164,6 @@ rafa <- function(){ fields <- geocodebr::setup_address_fields( logradouro = 'nm_logradouro', numero = 'Numero', - complemento = 'Complemento', cep = 'Cep', bairro = 'Bairro', municipio = 'nm_municipio', @@ -174,7 +171,7 @@ rafa <- function(){ ) - df_duck_dani <- geocodebr:::geocode2( +df_duck_dani <- geocodebr:::geocode( addresses_table = input_df, address_fields = fields, n_cores = 7,