diff --git a/NEWS.md b/NEWS.md index 67b79c2..852bba6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,7 @@ # censobr v0.3.29999 dev * Major changes - * Some functions (`read_population`, `read_mortality`, `read_families`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Closes [#31](https://github.com/ipeaGIT/censobr/issues/31) + * Some functions (`read_mortality`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Partially closes [#31](https://github.com/ipeaGIT/censobr/issues/31) * {censobr} now imports the {duckplyr} package, which is used for merging household data. Closes issue [#31](https://github.com/ipeaGIT/censobr/issues/31). * New vignette showing how to work with larger-than-memory data. Closes [#42](https://github.com/ipeaGIT/censobr/issues/42). The vignette still needs to be expanded with more examples, though. 
diff --git a/R/merge_household.R b/R/merge_household.R index 1443d6c..4035586 100644 --- a/R/merge_household.R +++ b/R/merge_household.R @@ -23,11 +23,13 @@ merge_household_var <- function(df, if (year == 1970) { key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state', 'code_region', 'name_region', 'id_household') + key_key <- 'id_household' } if (year == 1980) { key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state', 'code_region', 'name_region', 'V6', 'V601') + key_key <- 'V601' # rename weight var df_household <- dplyr::rename(df_household, 'V603_household' = 'V603') @@ -37,6 +39,7 @@ merge_household_var <- function(df, key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state', 'code_region', 'name_region', 'V0109') + key_key <- 'V0109' # rename weight var df_household <- dplyr::rename(df_household, 'V7300_household' = 'V7300') } @@ -44,12 +47,14 @@ merge_household_var <- function(df, if (year == 2000) { key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state', 'code_region', 'name_region', 'code_weighting', 'V0300') + key_key <- 'V0300' } if (year == 2010) { key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state', 'code_region', 'name_region', 'code_weighting', 'V0300') + key_key <- 'V0300' # rename weight var df_household <- dplyr::rename(df_household, 'V0010_household' = 'V0010') |> dplyr::compute() @@ -62,6 +67,15 @@ merge_household_var <- function(df, df_household <- dplyr::select(df_household, -all_of(vars_to_drop)) |> dplyr::compute() + # pre-filter the right-hand table to rows whose key appears in the left-hand table; + # this improves performance a bit + df <- dplyr::compute(df) + key_values <- as.vector(unique(df$GetColumnByName(key_key))) + df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |> + dplyr::compute() + +# nrow(df_household) +# [1] 6192332 # convert to duckdb # df <- arrow::to_duckdb(df) diff --git a/R/read_families.R b/R/read_families.R index 5605777..7b4169b 100644 --- 
a/R/read_families.R +++ b/R/read_families.R @@ -7,7 +7,6 @@ #' @param year Numeric. Year of reference in the format `yyyy`. Defaults to `2000`. #' @template columns #' @template add_labels -#' @template merge_households #' @template as_data_frame #' @template showProgress #' @template cache @@ -24,7 +23,6 @@ read_families <- function(year = 2000, columns = NULL, add_labels = NULL, - merge_households = FALSE, as_data_frame = FALSE, showProgress = TRUE, cache = TRUE){ @@ -33,7 +31,7 @@ read_families <- function(year = 2000, checkmate::assert_numeric(year) checkmate::assert_vector(columns, null.ok = TRUE) checkmate::assert_logical(as_data_frame) - checkmate::assert_logical(merge_households) + # checkmate::assert_logical(merge_households) checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE) # data available for the years: @@ -58,13 +56,13 @@ read_families <- function(year = 2000, ### read data df <- arrow_open_dataset(local_file) - ### merge household data - if (isTRUE(merge_households)) { - df <- merge_household_var(df, - year = year, - add_labels = add_labels, - showProgress) - } + # ### merge household data + # if (isTRUE(merge_households)) { + # df <- merge_household_var(df, + # year = year, + # add_labels = add_labels, + # showProgress) + # } ### Select if (!is.null(columns)) { # columns <- c('V0002','V0011') diff --git a/R/read_population.R b/R/read_population.R index 167b512..b46eb56 100644 --- a/R/read_population.R +++ b/R/read_population.R @@ -7,7 +7,6 @@ #' @template year #' @template columns #' @template add_labels -#' @template merge_households #' @template as_data_frame #' @template showProgress #' @template cache @@ -24,7 +23,6 @@ read_population <- function(year = 2010, columns = NULL, add_labels = NULL, - merge_households = FALSE, as_data_frame = FALSE, showProgress = TRUE, cache = TRUE){ @@ -33,7 +31,7 @@ read_population <- function(year = 2010, checkmate::assert_numeric(year) checkmate::assert_vector(columns, null.ok = TRUE) 
checkmate::assert_logical(as_data_frame) - checkmate::assert_logical(merge_households) + # checkmate::assert_logical(merge_households) checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE) # data available for the years: @@ -58,13 +56,13 @@ read_population <- function(year = 2010, ### read data df <- arrow_open_dataset(local_file) - ### merge household data - if (isTRUE(merge_households)) { - df <- merge_household_var(df, - year = year, - add_labels = add_labels, - showProgress = showProgress) - } + # ### merge household data + # if (isTRUE(merge_households)) { + # df <- merge_household_var(df, + # year = year, + # add_labels = add_labels, + # showProgress = showProgress) + # } ### Select if (!is.null(columns)) { # columns <- c('V0002','V0011') diff --git a/man/read_families.Rd b/man/read_families.Rd index 0d30f0e..ad56520 100644 --- a/man/read_families.Rd +++ b/man/read_families.Rd @@ -8,7 +8,6 @@ read_families( year = 2000, columns = NULL, add_labels = NULL, - merge_households = FALSE, as_data_frame = FALSE, showProgress = TRUE, cache = TRUE @@ -24,9 +23,6 @@ columns are not read. Defaults to \code{NULL} and read all columns.} responses of categorical variables. When \code{add_labels = "pt"}, the function adds labels in Portuguese. Defaults to \code{NULL}.} -\item{merge_households}{Logical. Indicate whether the function should merge -household variables to the output data. Defaults to \code{FALSE}.} - \item{as_data_frame}{Logical. When \code{FALSE} (Default), the function returns an Arrow Dataset, which allows users to work with larger-than-memory data. 
If \code{TRUE}, the function returns \code{data.frame}.} diff --git a/man/read_population.Rd b/man/read_population.Rd index eba7a50..cdd1cd0 100644 --- a/man/read_population.Rd +++ b/man/read_population.Rd @@ -8,7 +8,6 @@ read_population( year = 2010, columns = NULL, add_labels = NULL, - merge_households = FALSE, as_data_frame = FALSE, showProgress = TRUE, cache = TRUE @@ -24,9 +23,6 @@ columns are not read. Defaults to \code{NULL} and read all columns.} responses of categorical variables. When \code{add_labels = "pt"}, the function adds labels in Portuguese. Defaults to \code{NULL}.} -\item{merge_households}{Logical. Indicate whether the function should merge -household variables to the output data. Defaults to \code{FALSE}.} - \item{as_data_frame}{Logical. When \code{FALSE} (Default), the function returns an Arrow Dataset, which allows users to work with larger-than-memory data. If \code{TRUE}, the function returns \code{data.frame}.} diff --git a/tests/testthat/test_read_families.R b/tests/testthat/test_read_families.R index 3276eb3..e171661 100644 --- a/tests/testthat/test_read_families.R +++ b/tests/testthat/test_read_families.R @@ -53,15 +53,15 @@ test_that("read_families read", { # Merge households vars ----------------------- -test_that("families merge_households_vars", { - - for(y in c(2000)){ # y = 2000 - message(y) - df_hou <- read_households(year = y) - df_test <- tester(year = y, merge_households = TRUE) - testthat::expect_true( all(names(df_hou) %in% names(df_test)) ) - } -}) +# test_that("families merge_households_vars", { +# +# for(y in c(2000)){ # y = 2000 +# message(y) +# df_hou <- read_households(year = y) +# df_test <- tester(year = y, merge_households = TRUE) +# testthat::expect_true( all(names(df_hou) %in% names(df_test)) ) +# } +# }) # ERRORS and messages ----------------------- diff --git a/tests/testthat/test_read_population.R b/tests/testthat/test_read_population.R index 4858680..887a165 100644 --- a/tests/testthat/test_read_population.R 
+++ b/tests/testthat/test_read_population.R @@ -131,17 +131,17 @@ test_that("read_population check totals", { # Merge households vars ----------------------- -test_that("population merge_households_vars", { - - for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010 - message(y) - df_hou <- censobr::read_households(year = y) - df_test <- tester(year = y, - merge_households = TRUE, - showProgress = FALSE) - testthat::expect_true( all(names(df_hou) %in% names(df_test)) ) - } -}) +# test_that("population merge_households_vars", { +# +# for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010 +# message(y) +# df_hou <- censobr::read_households(year = y) +# df_test <- tester(year = y, +# merge_households = TRUE, +# showProgress = FALSE) +# testthat::expect_true( all(names(df_hou) %in% names(df_test)) ) +# } +# }) # ERRORS and messages -----------------------