new parameter merge_households in #31

ipeaGIT · May 5, 2024 · 19b0799 · 19b0799
1 parent d86ee81
commit 19b0799
Show file tree

Hide file tree

Showing 27 changed files with 196 additions and 47 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -12,3 +12,4 @@
 .RData
 ^_pkgdown\.yml$
 ^pkgdown$
+/data_prep/*
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# censobr v0.3.29999 dev
+
+* Minor changes
+  * The functions `read_population`, `read_mortality`, `read_families` and `read_emigration` now include a new parameter `merge_households` (logical) that indicates whether household variables should be merged into the output data. Defaults to `FALSE`. Closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
+
 # censobr v0.3.2
 
 * Minor changes

diff --git a/tests/tests_rafa/merge_household.R → R/merge_household.R b/tests/tests_rafa/merge_household.R → R/merge_household.R
@@ -44,7 +44,7 @@ merge_household_var <- function(df,
   if (year == 2000) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
                   'code_region', 'name_region', 'code_weighting', 'V0300')
-    }
+  }
 
   if (year == 2010) {
     key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
@@ -64,5 +64,5 @@ merge_household_var <- function(df,
   # merge
   temp_df <- dplyr::left_join(df, df_household)
 
-return(temp_df)
-  }
+  return(temp_df)
+}
diff --git a/R/read_emigration.R b/R/read_emigration.R
@@ -7,6 +7,7 @@
 #' @template year
 #' @template columns
 #' @template add_labels
+#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -29,6 +30,7 @@
 read_emigration <- function(year = 2010,
                             columns = NULL,
                             add_labels = NULL,
+                            merge_households = FALSE,
                             as_data_frame = FALSE,
                             showProgress = TRUE,
                             cache = TRUE){
@@ -37,6 +39,7 @@ read_emigration <- function(year = 2010,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
+  checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # data available for the years:
@@ -61,6 +64,14 @@ read_emigration <- function(year = 2010,
   ### read data
   df <- arrow_open_dataset(local_file)
 
+  ### merge household data
+  if (isTRUE(merge_households)) {
+    df <- merge_household_var(df,
+                              year = year,
+                              add_labels = add_labels,
+                              showProgress)
+  }
+
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
     df <- dplyr::select(df, dplyr::all_of(columns))

diff --git a/R/read_families.R b/R/read_families.R
@@ -7,6 +7,7 @@
 #' @param year Numeric. Year of reference in the format `yyyy`. Defaults to `2000`.
 #' @template columns
 #' @template add_labels
+#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -23,6 +24,7 @@
 read_families <- function(year = 2000,
                           columns = NULL,
                           add_labels = NULL,
+                          merge_households = FALSE,
                           as_data_frame = FALSE,
                           showProgress = TRUE,
                           cache = TRUE){
@@ -31,6 +33,7 @@ read_families <- function(year = 2000,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
+  checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # data available for the years:
@@ -55,6 +58,14 @@ read_families <- function(year = 2000,
   ### read data
   df <- arrow_open_dataset(local_file)
 
+  ### merge household data
+  if (isTRUE(merge_households)) {
+    df <- merge_household_var(df,
+                              year = year,
+                              add_labels = add_labels,
+                              showProgress)
+  }
+
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
     df <- dplyr::select(df, dplyr::all_of(columns))

diff --git a/R/read_mortality.R b/R/read_mortality.R
@@ -7,6 +7,7 @@
 #' @template year
 #' @template columns
 #' @template add_labels
+#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -33,6 +34,7 @@
 read_mortality <- function(year = 2010,
                            columns = NULL,
                            add_labels = NULL,
+                           merge_households = FALSE,
                            as_data_frame = FALSE,
                            showProgress = TRUE,
                            cache = TRUE){
@@ -41,6 +43,7 @@ read_mortality <- function(year = 2010,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
+  checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # available for the years:
@@ -66,6 +69,14 @@ read_mortality <- function(year = 2010,
   ### read data
   df <- arrow_open_dataset(local_file)
 
+  ### merge household data
+  if (isTRUE(merge_households)) {
+    df <- merge_household_var(df,
+                            year = year,
+                            add_labels = add_labels,
+                            showProgress)
+    }
+
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
     df <- dplyr::select(df, dplyr::all_of(columns))

diff --git a/R/read_population.R b/R/read_population.R
@@ -7,6 +7,7 @@
 #' @template year
 #' @template columns
 #' @template add_labels
+#' @template merge_households
 #' @template as_data_frame
 #' @template showProgress
 #' @template cache
@@ -23,6 +24,7 @@
 read_population <- function(year = 2010,
                             columns = NULL,
                             add_labels = NULL,
+                            merge_households = FALSE,
                             as_data_frame = FALSE,
                             showProgress = TRUE,
                             cache = TRUE){
@@ -31,6 +33,7 @@ read_population <- function(year = 2010,
   checkmate::assert_numeric(year)
   checkmate::assert_vector(columns, null.ok = TRUE)
   checkmate::assert_logical(as_data_frame)
+  checkmate::assert_logical(merge_households)
   checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)
 
   # data available for the years:
@@ -56,6 +59,14 @@ read_population <- function(year = 2010,
   ### read data
   df <- arrow_open_dataset(local_file)
 
+  ### merge household data
+  if (isTRUE(merge_households)) {
+    df <- merge_household_var(df,
+                              year = year,
+                              add_labels = add_labels,
+                              showProgress)
+  }
+
   ### Select
   if (!is.null(columns)) { # columns <- c('V0002','V0011')
     df <- dplyr::select(df, dplyr::all_of(columns))

diff --git a/R/read_tracts.R b/R/read_tracts.R
@@ -15,6 +15,8 @@
 #' @export
 #' @family Census tract data
 #' @examplesIf identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+#' library(censobr)
+#'
 #' # return data as arrow Dataset
 #' df <- read_tracts(year = 2010,
 #'                   dataset = 'PessoaRenda',

diff --git a/man/merge_household_var.Rd b/man/merge_household_var.Rd
diff --git a/man/read_emigration.Rd b/man/read_emigration.Rd
diff --git a/man/read_families.Rd b/man/read_families.Rd
diff --git a/man/read_mortality.Rd b/man/read_mortality.Rd
diff --git a/man/read_population.Rd b/man/read_population.Rd
diff --git a/man/read_tracts.Rd b/man/read_tracts.Rd
diff --git a/man/roxygen/templates/merge_households.R b/man/roxygen/templates/merge_households.R
@@ -0,0 +1,3 @@
+#' @param merge_households Logical. Indicates whether household variables
+#'        should be merged into the output data. Defaults to `FALSE`.
+
diff --git a/tests/tests_rafa/merge_household_notes.R b/tests/tests_rafa/merge_household_notes.R
@@ -14,14 +14,6 @@ df <- censobr::read_population(year = year,
 # censobr::data_dictionary(year = year, dataset = 'households')
 
 
-NA FUNCAO
-### merge household vars
-if (isTRUE(merge_households)) {
-  df <- merge_household_var(df,
-                            year = year,
-                            add_labels = add_labels,
-                            showProgress = showProgress)
-}
 
 
 
@@ -100,9 +92,7 @@ merge_household_var <- function(df, year, add_labels=NULL, showProgress=T){
   df_household <- dplyr::select(df_household, -all_of(vars_to_drop))
 
   # merge
-  nrow(df)
   temp_df <- dplyr::left_join(df, df_household)
-  nrow(df)
 
   return(temp_df)
   }
diff --git a/tests/tests_rafa/test_rafa.R b/tests/tests_rafa/test_rafa.R
@@ -406,6 +406,7 @@ urlchecker::url_update()
 # Check package errors
 
 # run only the tests
+Sys.setenv(NOT_CRAN = "true")
 testthat::test_local()
 
 # LOCAL

diff --git a/tests/testthat/test_labels_emigration.R b/tests/testthat/test_labels_emigration.R
@@ -12,14 +12,14 @@ test_that("add_labels_emigration", {
 
   # sem labels
   test1a <- read_emigration(year = 2010, add_labels = NULL, columns = c('abbrev_state', 'V1006')) |>
-            filter(abbrev_state == 'RO')
+            dplyr::filter(abbrev_state == 'RO')
 
   # com labels
   test1b <- censobr:::add_labels_emigration(arrw = test1a, year=2010, lang = 'pt') |>
-            filter(abbrev_state == 'RO')
+            dplyr::filter(abbrev_state == 'RO')
 
-  test1a <- as.data.frame(test1a)
-  test1b <- as.data.frame(test1b)
+  test1a <- dplyr::collect(test1a)
+  test1b <- dplyr::collect(test1b)
   # add labels
   testthat::expect_true('1' %in% test1a$V1006)
   testthat::expect_true('Urbana' %in% test1b$V1006)

diff --git a/tests/testthat/test_labels_families.R b/tests/testthat/test_labels_families.R
@@ -12,21 +12,19 @@ test_that("add_labels_families", {
 
   # sem labels
   test1a <- read_families(year = 2000, add_labels = NULL, columns = c('abbrev_state', 'CODV0404_2')) |>
-            filter(abbrev_state == 'RO')
+            dplyr::filter(abbrev_state == 'RO')
 
   # com labels
   test1b <- censobr:::add_labels_families(arrw = test1a, year=2000, lang = 'pt') |>
-            filter(abbrev_state == 'RO')
+            dplyr::filter(abbrev_state == 'RO')
 
-  test1a <- as.data.frame(test1a)
-  test1b <- as.data.frame(test1b)
+  test1a <- dplyr::collect(test1a)
+  test1b <- dplyr::collect(test1b)
 
   # add labels
   testthat::expect_true('01' %in% test1a$CODV0404_2)
   testthat::expect_true('Casal sem filhos' %in% test1b$CODV0404_2)
 
-
-
  })
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#' @param merge_households Logical. Indicates whether household variables
		#' should be merged into the output data. Defaults to `FALSE`.