diff --git a/NAMESPACE b/NAMESPACE index 5648661..88fab1e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,7 @@ S3method(tbl_format_footer,deinterlaced_df) S3method(tbl_format_header,deinterlaced_df) S3method(tbl_format_setup,deinterlaced_df) -export(coalesce_missing_reasons) +export(coalesce_channels) export(deinterlace_type_convert) export(drop_missing_cols) export(drop_value_cols) @@ -16,7 +16,7 @@ export(icol_integer) export(icol_logical) export(icol_number) export(icol_time) -export(interlace_missing_reasons) +export(interlace_channels) export(interlacer_example) export(missing_cols) export(missing_names) diff --git a/R/coalesce_missing_reasons.R b/R/coalesce_channels.R similarity index 92% rename from R/coalesce_missing_reasons.R rename to R/coalesce_channels.R index e26c013..1e727cb 100644 --- a/R/coalesce_missing_reasons.R +++ b/R/coalesce_channels.R @@ -4,7 +4,7 @@ #' #' Mutations of deinterlaced data frames can result in variables that either #' have both values and missing reasons, or no values and no missing reasons. -#' `coalesce_missing_reasons()` takes care of both situations. In the case where +#' `coalesce_channels()` takes care of both situations. In the case where #' there is both a value and missing reason, it will choose which to keep based #' on the `keep` paramter. In case where no value or missing reason exists, it #' will fill the missing reason with the `default_reason` parameter. @@ -25,10 +25,10 @@ #' @return A deinterlaced tibble. 
#' #' @export -coalesce_missing_reasons <- function( +coalesce_channels <- function( x, - keep = c("values", "missing"), - default_reason = getOption("default_missing_reason") + default_reason = getOption("default_missing_reason"), + keep = c("values", "missing") ) { default_reason <- factor(default_reason %||% "UNKNOWN_REASON") keep <- match.arg(keep) diff --git a/R/deinterlaced_df.R b/R/deinterlaced_df.R index a4bedd3..3a0fe97 100644 --- a/R/deinterlaced_df.R +++ b/R/deinterlaced_df.R @@ -73,7 +73,7 @@ abort_if_deinterlace_df_problems <- function(x, call = caller_call()) { if (length(df_problems) > 0) { cli_abort( - c(df_problems[[1]], "i" = "Run `coalesce_missing_reasons()` to fix."), + c(df_problems[[1]], "i" = "Run `coalesce_channels()` to fix."), call = call ) } @@ -101,7 +101,7 @@ tbl_format_footer.deinterlaced_df <- function(x, setup, ...) { extra <- format_bullets_raw( c( "x" = glue("Warning: {setup$interlaced_probs[[1]]}"), - "i" = glue("Run `coalesce_missing_reasons()` to fix.") + "i" = glue("Run `coalesce_channels()` to fix.") ) ) } else { diff --git a/R/interlace_missing_reasons.R b/R/interlace_channels.R similarity index 95% rename from R/interlace_missing_reasons.R rename to R/interlace_channels.R index 4260a52..74b8f0b 100644 --- a/R/interlace_missing_reasons.R +++ b/R/interlace_channels.R @@ -12,7 +12,7 @@ #' that contain both values and missing reasons. 
#' #' @export -interlace_missing_reasons <- function(x) { +interlace_channels <- function(x) { abort_if_deinterlace_df_problems(x) # TODO: this is another function that would benefit from native speedup diff --git a/man/coalesce_missing_reasons.Rd b/man/coalesce_channels.Rd similarity index 78% rename from man/coalesce_missing_reasons.Rd rename to man/coalesce_channels.Rd index 7f9aeb1..553a8a0 100644 --- a/man/coalesce_missing_reasons.Rd +++ b/man/coalesce_channels.Rd @@ -1,24 +1,24 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/coalesce_missing_reasons.R -\name{coalesce_missing_reasons} -\alias{coalesce_missing_reasons} +% Please edit documentation in R/coalesce_channels.R +\name{coalesce_channels} +\alias{coalesce_channels} \title{Coalesce missing reasons in a dataframe} \usage{ -coalesce_missing_reasons( +coalesce_channels( x, - keep = c("values", "missing"), - default_reason = getOption("default_missing_reason") + default_reason = getOption("default_missing_reason"), + keep = c("values", "missing") ) } \arguments{ \item{x}{A dataframe} +\item{default_reason}{When a variable is missing a value and a missing +reason, the default missing reason to fill in.} + \item{keep}{When a variable has both a value and missing reason, choose which to keep. (A properly formed deinterlaced dataframe has values OR missing reasons)} - -\item{default_reason}{When a variable is missing a value and a missing -reason, the default missing reason to fill in.} } \value{ A deinterlaced tibble. @@ -26,7 +26,7 @@ A deinterlaced tibble. \description{ Mutations of deinterlaced data frames can result in variables that either have both values and missing reasons, or no values and no missing reasons. -`coalesce_missing_reasons()` takes care of both situations. In the case where +`coalesce_channels()` takes care of both situations. In the case where there is both a value and missing reason, it will choose which to keep based on the `keep` paramter. 
In case where no value or missing reason exists, it will fill the missing reason with the `default_reason` parameter. diff --git a/man/interlace_missing_reasons.Rd b/man/interlace_channels.Rd similarity index 73% rename from man/interlace_missing_reasons.Rd rename to man/interlace_channels.Rd index 79b68c6..f952265 100644 --- a/man/interlace_missing_reasons.Rd +++ b/man/interlace_channels.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/interlace_missing_reasons.R -\name{interlace_missing_reasons} -\alias{interlace_missing_reasons} +% Please edit documentation in R/interlace_channels.R +\name{interlace_channels} +\alias{interlace_channels} \title{Re-interlacce a deinterlaced dataframe} \usage{ -interlace_missing_reasons(x) +interlace_channels(x) } \arguments{ \item{x}{A deinterlaced dataframe} diff --git a/tests/testthat/test-coalesce_missing_reasons.R b/tests/testthat/test-coalesce_channels.R similarity index 88% rename from tests/testthat/test-coalesce_missing_reasons.R rename to tests/testthat/test-coalesce_channels.R index 993bc8a..9c8148c 100644 --- a/tests/testthat/test-coalesce_missing_reasons.R +++ b/tests/testthat/test-coalesce_channels.R @@ -3,7 +3,7 @@ test_that("nop if no changes are necessary", { a = c(1, NA), .a. = factor(c(NA, "UNKNOWN_REASON")) ) |> - coalesce_missing_reasons() + coalesce_channels() expect_equal(result, result) }) @@ -13,7 +13,7 @@ test_that("new missing value reasons make values disappear when keep=missing", { a = c(1, 2), .a. = factor(c(NA, "UNKNOWN_REASON")) ) |> - coalesce_missing_reasons(keep = "missing") + coalesce_channels(keep = "missing") expected <- tibble( a = c(1, NA), @@ -28,7 +28,7 @@ test_that("new missing value reasons disappear if value available", { a = c(1, 2), .a. 
= factor(c(NA, "UNKNOWN_REASON")) ) |> - coalesce_missing_reasons() + coalesce_channels() expected <- tibble( a = c(1, 2), @@ -43,7 +43,7 @@ test_that("missing (missing value) reasons result in default reason", { a = c(1, NA), .a. = factor(c(NA, NA)) ) |> - coalesce_missing_reasons() + coalesce_channels() expected <- tibble( a = c(1, NA), diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index f19717e..4b2676f 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -38,7 +38,7 @@ test_that("global missing reasons load properly", { expect_equal( result_raw, - interlace_missing_reasons(result), + interlace_channels(result), ignore_attr = TRUE ) }) @@ -84,7 +84,7 @@ test_that("column-level missing reasons can be specified with icol_*", { expect_equal( result_raw, - interlace_missing_reasons(result), + interlace_channels(result), ignore_attr = TRUE ) }) diff --git a/vignettes/mutations.Rmd b/vignettes/mutations.Rmd index 36838d9..af39719 100644 --- a/vignettes/mutations.Rmd +++ b/vignettes/mutations.Rmd @@ -12,6 +12,7 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) +library(dplyr) library(interlacer) ``` @@ -24,7 +25,7 @@ Similarly, if a variable is missing its value AND its missing reason, it's probably a sign we made a mistake somewhere. This means whenever we `mutate()` the values of a variable, the missing reasons -are properly updated, and vice versa. To illustrate this, let's load some +must also be updated, and vice versa. To illustrate this, let's load some example data: ```{r} @@ -100,18 +101,18 @@ reason are absent. In next part of the mutation, we fill in the `TECHNICAL_ERROR` missing reason for these rows into `.favorite_color.`, resulting in a well-formed deinterlaced dataframe. -## An easier way +## An easier way with `coalesce_channels()` As you can imagine, manually fixing the value & missing reason structure of your dataframe for every mutation you do can get cumbersome! 
Luckily, -interlacer provides an easier way via `coalesce_missing_reasons()`: +interlacer provides an easier way via `coalesce_channels()`: ```{r} df |> mutate( .age. = "REDACTED", ) |> - coalesce_missing_reasons(keep = "missing") + coalesce_channels(keep = "missing") df |> mutate( @@ -121,10 +122,10 @@ df |> NA ) ) |> - coalesce_missing_reasons(default_reason = "TECHNICAL_ERROR") + coalesce_channels(default_reason = "TECHNICAL_ERROR") ``` -`coalesce_missing_reasons()` should be run every time you mutate something in +`coalesce_channels()` should be run every time you mutate something in a deinterlaced dataframe. It accepts two arguments `keep`, and `default_reason`. With these paramters set, it fixes both possible problem cases as follows: @@ -139,12 +140,12 @@ Case 2: NEITHER a value nor a missing reason exists These rules allow us to mutate our deinterlaced variables without needing to specify BOTH the values and missing reason actions -- we only need to think -about our operation one channel, and then a call to `coalesce_missing_reasons()` +about our operation on one channel, and then a call to `coalesce_channels()` takes care of the other. ## Creating New Columns -`coalesce_missing_reasons()` will also automatically create missing reason +`coalesce_channels()` will also automatically create missing reason columns if they don't automatically exist. This is useful for adding new variables to your dataframe: @@ -152,8 +153,9 @@ variables to your dataframe: df |> mutate( person_type = if_else(age < 18, "CHILD", "ADULT"), + .after = person_id ) %>% - coalesce_missing_reasons(default_reason = "AGE_UNAVAILABLE") + coalesce_channels(default_reason = "AGE_UNAVAILABLE") ``` ## Writing interlaced files @@ -168,15 +170,15 @@ write_interlaced_csv(df, "interlaced_output.csv") This will combine the value and missing reasons into interlaced character columns, and write the result as a csv.
Alternatively, if you want to re-interlace the columns without writing to a file for more control in the -writing process, you can use `interlace_missing_reasons()`: +writing process, you can use `interlace_channels()`: ```{r} -interlace_missing_reasons(df) +interlace_channels(df) ``` ## Final note: Setting the global default reason -By default, `coalesce_missing_reasons()` will use `UNKNOWN_REASON` as the +By default, `coalesce_channels()` will use `UNKNOWN_REASON` as the default missing reason. Sometimes you want to use a different default value, to act as the "catch-all" missing reason, so you don't have to constantly specify it. To do this, set the global `default_missing_reason` option: @@ -187,5 +189,5 @@ options(default_missing_reason = -99) tibble( a = c(1,2,3, NA, 5) ) |> - coalesce_missing_reasons() + coalesce_channels() ```