From a63d657604067a34f0a4e79075e52e178e54a77f Mon Sep 17 00:00:00 2001
From: rafapereirabr <rafa.pereira.br@gmail.com>
Date: Wed, 11 Sep 2024 23:06:44 -0300
Subject: [PATCH] remove merge_households from population and families
 functions

---
 NEWS.md                               |  2 +-
 R/merge_household.R                   | 14 ++++++++++++++
 R/read_families.R                     | 18 ++++++++----------
 R/read_population.R                   | 18 ++++++++----------
 man/read_families.Rd                  |  4 ----
 man/read_population.Rd                |  4 ----
 tests/testthat/test_read_families.R   | 18 +++++++++---------
 tests/testthat/test_read_population.R | 22 +++++++++++-----------
 8 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 67b79c2..852bba6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,7 @@
 # censobr v0.3.29999  dev
 
 * Major changes
-  * Some functions (`read_population`, `read_mortality`, `read_families`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
+  * Some functions (`read_mortality`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Partially closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
   * {censobr} now imports the {duckplyr} package, which is used for merging household data. Closes issue [#31](https://github.com/ipeaGIT/censobr/issues/31).
   * New vignette showing how to work with larger-than-memory data. Closes [#42](https://github.com/ipeaGIT/censobr/issues/42). The vignette still needs to be expanded with more examples, though.
 
diff --git a/R/merge_household.R b/R/merge_household.R
index 1443d6c..4035586 100644
--- a/R/merge_household.R
+++ b/R/merge_household.R
@@ -23,11 +23,13 @@ merge_household_var <- function(df,
   if (year == 1970) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'id_household')
+    key_key <- 'id_household'
     }
 
   if (year == 1980) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'V6', 'V601')
+    key_key <- 'V601'
 
     # rename weight var
     df_household <- dplyr::rename(df_household, 'V603_household' = 'V603')
@@ -37,6 +39,7 @@ merge_household_var <- function(df,
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'V0109')
 
+    key_key <- 'V0109'
     # rename weight var
     df_household <- dplyr::rename(df_household, 'V7300_household' = 'V7300')
     }
@@ -44,12 +47,14 @@ merge_household_var <- function(df,
   if (year == 2000) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'code_weighting', 'V0300')
+    key_key <- 'V0300'
   }
 
   if (year == 2010) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'code_weighting', 'V0300')
 
+    key_key <- 'V0300'
     # rename weight var
     df_household <- dplyr::rename(df_household, 'V0010_household' = 'V0010') |>
                     dplyr::compute()
@@ -62,6 +67,15 @@ merge_household_var <- function(df,
   df_household <- dplyr::select(df_household, -all_of(vars_to_drop)) |>
                   dplyr::compute()
 
+  # pre-filter right-hand table to rows whose key appears in left-hand table
+  # this improves performance a bit
+  df <- dplyr::compute(df)
+  key_values <- as.vector(unique(df$GetColumnByName(key_key)))
+  df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |>
+                  dplyr::compute()
+
+  # debugging note: at this point nrow(df_household)
+  # returned 6192332
 
   # convert to duckdb
   # df <- arrow::to_duckdb(df)
diff --git a/R/read_families.R b/R/read_families.R
index 5605777..7b4169b 100644
--- a/R/read_families.R
+++ b/R/read_families.R
@@ -7,7 +7,6 @@
 #' @param year Numeric. Year of reference in the format `yyyy`. Defaults to `2000`.
 #' @template columns
 #' @template add_labels
-#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -24,7 +23,6 @@
 read_families <- function(year = 2000,
                           columns = NULL,
                           add_labels = NULL,
-                          merge_households = FALSE,
                           as_data_frame = FALSE,
                           showProgress = TRUE,
                           cache = TRUE){
@@ -33,7 +31,7 @@ read_families <- function(year = 2000,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
-  checkmate::assert_logical(merge_households)
+  # checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # data available for the years:
@@ -58,13 +56,13 @@ read_families <- function(year = 2000,
   ### read data
   df <- arrow_open_dataset(local_file)
 
-  ### merge household data
-  if (isTRUE(merge_households)) {
-    df <- merge_household_var(df,
-                              year = year,
-                              add_labels = add_labels,
-                              showProgress)
-  }
+  # ### merge household data
+  # if (isTRUE(merge_households)) {
+  #   df <- merge_household_var(df,
+  #                             year = year,
+  #                             add_labels = add_labels,
+  #                             showProgress)
+  # }
 
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
diff --git a/R/read_population.R b/R/read_population.R
index 167b512..b46eb56 100644
--- a/R/read_population.R
+++ b/R/read_population.R
@@ -7,7 +7,6 @@
 #' @template year
 #' @template columns
 #' @template add_labels
-#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -24,7 +23,6 @@
 read_population <- function(year = 2010,
                             columns = NULL,
                             add_labels = NULL,
-                            merge_households = FALSE,
                             as_data_frame = FALSE,
                             showProgress = TRUE,
                             cache = TRUE){
@@ -33,7 +31,7 @@ read_population <- function(year = 2010,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
-  checkmate::assert_logical(merge_households)
+  # checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # data available for the years:
@@ -58,13 +56,13 @@ read_population <- function(year = 2010,
   ### read data
   df <- arrow_open_dataset(local_file)
 
-  ### merge household data
-  if (isTRUE(merge_households)) {
-     df <- merge_household_var(df,
-                               year = year,
-                               add_labels = add_labels,
-                               showProgress = showProgress)
-    }
+  # ### merge household data
+  # if (isTRUE(merge_households)) {
+  #    df <- merge_household_var(df,
+  #                              year = year,
+  #                              add_labels = add_labels,
+  #                              showProgress = showProgress)
+  #   }
 
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
diff --git a/man/read_families.Rd b/man/read_families.Rd
index 0d30f0e..ad56520 100644
--- a/man/read_families.Rd
+++ b/man/read_families.Rd
@@ -8,7 +8,6 @@ read_families(
   year = 2000,
   columns = NULL,
   add_labels = NULL,
-  merge_households = FALSE,
   as_data_frame = FALSE,
   showProgress = TRUE,
   cache = TRUE
@@ -24,9 +23,6 @@ columns are not read. Defaults to \code{NULL} and read all columns.}
 responses of categorical variables. When \code{add_labels = "pt"}, the
 function adds labels in Portuguese. Defaults to \code{NULL}.}
 
-\item{merge_households}{Logical. Indicate whether the function should merge
-household variables to the output data. Defaults to \code{FALSE}.}
-
 \item{as_data_frame}{Logical. When \code{FALSE} (Default), the function returns an
 Arrow Dataset, which allows users to work with larger-than-memory data.
 If \code{TRUE}, the function returns \code{data.frame}.}
diff --git a/man/read_population.Rd b/man/read_population.Rd
index eba7a50..cdd1cd0 100644
--- a/man/read_population.Rd
+++ b/man/read_population.Rd
@@ -8,7 +8,6 @@ read_population(
   year = 2010,
   columns = NULL,
   add_labels = NULL,
-  merge_households = FALSE,
   as_data_frame = FALSE,
   showProgress = TRUE,
   cache = TRUE
@@ -24,9 +23,6 @@ columns are not read. Defaults to \code{NULL} and read all columns.}
 responses of categorical variables. When \code{add_labels = "pt"}, the
 function adds labels in Portuguese. Defaults to \code{NULL}.}
 
-\item{merge_households}{Logical. Indicate whether the function should merge
-household variables to the output data. Defaults to \code{FALSE}.}
-
 \item{as_data_frame}{Logical. When \code{FALSE} (Default), the function returns an
 Arrow Dataset, which allows users to work with larger-than-memory data.
 If \code{TRUE}, the function returns \code{data.frame}.}
diff --git a/tests/testthat/test_read_families.R b/tests/testthat/test_read_families.R
index 3276eb3..e171661 100644
--- a/tests/testthat/test_read_families.R
+++ b/tests/testthat/test_read_families.R
@@ -53,15 +53,15 @@ test_that("read_families read", {
 
 # Merge households vars -----------------------
 
-test_that("families merge_households_vars", {
-
-  for(y in c(2000)){ # y = 2000
-    message(y)
-    df_hou <- read_households(year = y)
-    df_test <- tester(year = y, merge_households = TRUE)
-    testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
-  }
-})
+# test_that("families merge_households_vars", {
+#
+#   for(y in c(2000)){ # y = 2000
+#     message(y)
+#     df_hou <- read_households(year = y)
+#     df_test <- tester(year = y, merge_households = TRUE)
+#     testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
+#   }
+# })
 
 
 # ERRORS and messages  -----------------------
diff --git a/tests/testthat/test_read_population.R b/tests/testthat/test_read_population.R
index 4858680..887a165 100644
--- a/tests/testthat/test_read_population.R
+++ b/tests/testthat/test_read_population.R
@@ -131,17 +131,17 @@ test_that("read_population check totals", {
 
 # Merge households vars -----------------------
 
-test_that("population merge_households_vars", {
-
-  for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010
-    message(y)
-    df_hou <- censobr::read_households(year = y)
-    df_test <- tester(year = y,
-                      merge_households = TRUE,
-                      showProgress = FALSE)
-    testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
-  }
-})
+# test_that("population merge_households_vars", {
+#
+#   for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010
+#     message(y)
+#     df_hou <- censobr::read_households(year = y)
+#     df_test <- tester(year = y,
+#                       merge_households = TRUE,
+#                       showProgress = FALSE)
+#     testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
+#   }
+# })
 
 
 # ERRORS and messages  -----------------------