diff --git a/.Rbuildignore b/.Rbuildignore index 92f2733..19d97fc 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,3 +6,4 @@ ^pkgdown$ ^\.github$ ^README.Rmd +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index cfa1c64..ac047c9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: interlacer Title: Read Tabular Data With Interlaced Values And Missing Reasons -Version: 0.1.0 +Version: 0.2.0 Authors@R: person("Kyle", "Husmann", , "kdh38@psu.edu", role = c("aut", "cre")) Description: Textual tabular data sources often encode values and missing @@ -16,18 +16,20 @@ Depends: Imports: cli, dplyr, - glue, + generics, pillar, - purrr, readr, rlang, tibble, - tidyselect + tidyselect, + vctrs, + vroom Suggests: knitr, rmarkdown, forcats, haven, + declared, testthat (>= 3.0.0) Config/testthat/edition: 3 Encoding: UTF-8 @@ -35,3 +37,4 @@ LazyData: true RoxygenNote: 7.2.3 URL: http://kylehusmann.com/interlacer/ VignetteBuilder: knitr +Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 290386a..e29cb15 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,30 +1,140 @@ # Generated by roxygen2: do not edit by hand -S3method(tbl_format_header,deinterlaced_df) -S3method(tbl_format_setup,deinterlaced_df) -export(coalesce_channels) -export(deinterlace_type_convert) -export(drop_missing_cols) -export(drop_value_cols) -export(icol_character) -export(icol_date) -export(icol_datetime) -export(icol_double) -export(icol_factor) -export(icol_integer) -export(icol_logical) -export(icol_number) -export(icol_time) -export(interlace_channels) +S3method("$",interlacer_interlaced) +S3method("$<-",interlacer_interlaced) +S3method("[",interlacer_interlaced) +S3method("[<-",interlacer_interlaced) +S3method("[[",interlacer_interlaced) +S3method("[[<-",interlacer_interlaced) +S3method("length<-",interlacer_interlaced) +S3method("levels<-",interlacer_interlaced) +S3method(as.character,interlacer_interlaced) +S3method(as.double,interlacer_interlaced) 
+S3method(as.factor,interlacer_interlaced) +S3method(as.integer,interlacer_interlaced) +S3method(as.interlaced,data.frame) +S3method(as.interlaced,default) +S3method(as.interlaced,interlacer_interlaced) +S3method(as.logical,interlacer_interlaced) +S3method(as.na_col_spec,default) +S3method(as.na_col_spec,interlacer_na_col_spec) +S3method(as.na_col_spec,list) +S3method(as.ordered,interlacer_interlaced) +S3method(flatten_channels,data.frame) +S3method(flatten_channels,default) +S3method(flatten_channels,interlacer_interlaced) +S3method(format,interlacer_interlaced) +S3method(format,interlacer_na_col_spec) +S3method(is.empty,default) +S3method(is.empty,interlacer_interlaced) +S3method(is.finite,interlacer_interlaced) +S3method(is.infinite,interlacer_interlaced) +S3method(is.na,interlacer_interlaced) +S3method(levels,interlacer_interlaced) +S3method(max,interlacer_interlaced) +S3method(median,interlacer_interlaced) +S3method(min,interlacer_interlaced) +S3method(na.exclude,interlacer_interlaced) +S3method(na.fail,interlacer_interlaced) +S3method(na.omit,interlacer_interlaced) +S3method(na_channel,data.frame) +S3method(na_channel,default) +S3method(na_channel,interlacer_interlaced) +S3method(obj_print_footer,interlacer_interlaced) +S3method(pillar_shaft,interlacer_interlaced) +S3method(print,interlacer_na_col_spec) +S3method(quantile,interlacer_interlaced) +S3method(range,interlacer_interlaced) +S3method(rep,interlacer_interlaced) +S3method(type_to_col,Date) +S3method(type_to_col,POSIXct) +S3method(type_to_col,default) +S3method(type_to_col,double) +S3method(type_to_col,factor) +S3method(type_to_col,hms) +S3method(type_to_col,integer) +S3method(type_to_col,logical) +S3method(value_channel,data.frame) +S3method(value_channel,default) +S3method(value_channel,interlacer_interlaced) +S3method(vec_arith,interlacer_interlaced) +S3method(vec_arith.interlacer_interlaced,default) +S3method(vec_arith.interlacer_interlaced,interlacer_interlaced) 
+S3method(vec_arith.logical,interlacer_interlaced) +S3method(vec_arith.numeric,interlacer_interlaced) +S3method(vec_cast,character.interlacer_interlaced) +S3method(vec_cast,double.interlacer_interlaced) +S3method(vec_cast,factor.interlacer_interlaced) +S3method(vec_cast,integer.interlacer_interlaced) +S3method(vec_cast,interlacer_interlaced.character) +S3method(vec_cast,interlacer_interlaced.double) +S3method(vec_cast,interlacer_interlaced.factor) +S3method(vec_cast,interlacer_interlaced.integer) +S3method(vec_cast,interlacer_interlaced.interlacer_interlaced) +S3method(vec_cast,interlacer_interlaced.logical) +S3method(vec_cast,logical.interlacer_interlaced) +S3method(vec_math,interlacer_interlaced) +S3method(vec_proxy,interlacer_interlaced) +S3method(vec_proxy_compare,interlacer_interlaced) +S3method(vec_proxy_equal,interlacer_interlaced) +S3method(vec_proxy_order,interlacer_interlaced) +S3method(vec_ptype2,character.interlacer_interlaced) +S3method(vec_ptype2,double.interlacer_interlaced) +S3method(vec_ptype2,factor.interlacer_interlaced) +S3method(vec_ptype2,integer.interlacer_interlaced) +S3method(vec_ptype2,interlacer_interlaced.character) +S3method(vec_ptype2,interlacer_interlaced.double) +S3method(vec_ptype2,interlacer_interlaced.factor) +S3method(vec_ptype2,interlacer_interlaced.integer) +S3method(vec_ptype2,interlacer_interlaced.interlacer_interlaced) +S3method(vec_ptype2,interlacer_interlaced.logical) +S3method(vec_ptype2,logical.interlacer_interlaced) +S3method(vec_ptype_abbr,interlacer_interlaced) +S3method(vec_ptype_full,interlacer_interlaced) +S3method(vec_restore,interlacer_interlaced) +S3method(xtfrm,interlacer_interlaced) +export("na_levels<-") +export(as.col_spec) +export(as.factor) +export(as.interlaced) +export(as.na_col_spec) +export(as.ordered) +export(col_character) +export(col_date) +export(col_datetime) +export(col_double) +export(col_factor) +export(col_guess) +export(col_integer) +export(col_logical) +export(col_number) +export(col_skip) 
+export(col_time) +export(cols) +export(cols_condense) +export(cols_only) +export(flatten_channels) +export(interlaced) +export(interlaced_vroom) export(interlacer_example) -export(missing_cols) -export(missing_names) +export(is.empty) +export(is.interlaced) +export(is.na_col_spec) +export(map_na_channel) +export(map_value_channel) +export(na) +export(na_channel) +export(na_cols) +export(na_levels) +export(na_spec) +export(parse_interlaced) export(read_interlaced_csv) export(read_interlaced_csv2) export(read_interlaced_delim) export(read_interlaced_tsv) -export(value_cols) -export(value_names) +export(spec) +export(value_channel) +export(vec_c) export(write_interlaced_csv) export(write_interlaced_csv2) export(write_interlaced_delim) @@ -32,12 +142,33 @@ export(write_interlaced_excel_csv) export(write_interlaced_excel_csv2) export(write_interlaced_tsv) import(cli) -import(dplyr) -import(readr) import(rlang) -import(tibble) -importFrom(glue,glue) -importFrom(pillar,tbl_format_header) -importFrom(pillar,tbl_format_setup) -importFrom(purrr,list_flatten) -importFrom(tidyselect,peek_vars) +import(vctrs) +importFrom(dplyr,if_else) +importFrom(generics,as.factor) +importFrom(generics,as.ordered) +importFrom(pillar,pillar_shaft) +importFrom(readr,as.col_spec) +importFrom(readr,col_character) +importFrom(readr,col_date) +importFrom(readr,col_datetime) +importFrom(readr,col_double) +importFrom(readr,col_factor) +importFrom(readr,col_guess) +importFrom(readr,col_integer) +importFrom(readr,col_logical) +importFrom(readr,col_number) +importFrom(readr,col_skip) +importFrom(readr,col_time) +importFrom(readr,cols) +importFrom(readr,cols_condense) +importFrom(readr,cols_only) +importFrom(readr,spec) +importFrom(stats,median) +importFrom(stats,na.exclude) +importFrom(stats,na.fail) +importFrom(stats,na.omit) +importFrom(stats,quantile) +importFrom(tibble,as_tibble) +importFrom(tibble,tibble) +importFrom(vctrs,vec_c) diff --git a/R/coalesce_channels.R b/R/coalesce_channels.R deleted 
file mode 100644 index d091681..0000000 --- a/R/coalesce_channels.R +++ /dev/null @@ -1,88 +0,0 @@ -#' Coalesce missing reasons in a data frame -#' -#' @description -#' -#' Mutations of deinterlaced data frames can result in variables that either -#' have both values and missing reasons, or no values and no missing reasons. -#' `coalesce_channels()` takes care of both situations. In the case where -#' there is both a value and missing reason, it will choose which to keep based -#' on the `keep` paramter. In case where no value or missing reason exists, it -#' will fill the missing reason with the `missing_reason` parameter. -#' -#' Mutations can also create new value columns without companion missing reason -#' columns. In that case, a new missing reason will be created and filled with -#' `missing_reason` wherever there are missing values in the value column. ( -#' This behavior can also be used to stub missing reason columns for value-only -#' data frames) -#' -#' @param x A data frame -#' @param keep When a variable has both a value and missing reason, choose which -#' to keep. (A properly formed deinterlaced data frame has values OR missing -#' reasons) -#' @param missing_reason When a variable is missing a value and a missing -#' reason, the missing reason to fill in. -#' -#' @return A deinterlaced tibble. -#' -#' @export -coalesce_channels <- function( - x, - missing_reason = getOption("interlacer.default_missing_reason"), - keep = c("values", "missing") -) { - missing_reason <- factor(missing_reason %||% "UNKNOWN_REASON") - keep <- match.arg(keep) - - for (missing_name in missing_names(x)) { - value_name <- to_value_name(missing_name) - if (is.null(x[[value_name]])) { - cli_abort( - glue( - paste( - "Column `{missing_name}` implies `{value_name}` should exist, but", - "`{value_name}` not found." 
- ) - ) - ) - } - } - - lapply(value_names(x), function(value_name) { - values <- x[[value_name]] - - missing_name <- to_missing_name(value_name) - - missing_values <- x[[missing_name]] %||% - if_else(is.na(values), missing_reason, NA) - - # Ensure missing reason column is always a factor - if (!is.factor((missing_values))) { - missing_values <- factor(missing_values) - } - - if (keep == "values") { - new_values <- values - - new_missing_values <- case_when( - !is.na(values) ~ NA, - !is.na(missing_values) ~ missing_values, - T ~ missing_reason - ) - } else { - new_values <- if_else( - !is.na(values) & !is.na(missing_values), NA, values - ) - new_missing_values <- if_else( - is.na(values) & is.na(missing_values), missing_reason, missing_values - ) - } - - set_names( - list(new_values, new_missing_values), - c(value_name, missing_name) - ) - }) |> - list_flatten() |> - bind_cols() |> - as_deinterlaced_df() -} diff --git a/R/collectors.R b/R/collectors.R index 4c73fbc..790048a 100644 --- a/R/collectors.R +++ b/R/collectors.R @@ -1,74 +1,203 @@ -interlaced_collector <- function(col_type, na) { - col_type_name <- class(col_type)[[1]] - class(col_type) <- c(col_type_name, "interlaced_collector", "collector") - col_type$na <- na - col_type -} - -is_interlaced_collector <- function(x) inherits(x, "interlaced_collector") - -#' Interlaced collectors for read_interlaced_* -#' -#' @description +#' Create an NA column specification #' -#' Interlaced collector extend `readr` collector types (e.g. `col_double()`) to -#' allow column-level missing value specifications. +#' `na_cols()` creates a specification for the NA channel missing reason when +#' loading data with the `read_interlaced_*()` family of functions. #' -#' @param na Character vector of strings to interpret as column-level missing -#' values +#' @param ... Named vectors to use as missing reasons when loading interlaced +#' columns. Use name `.default` to set default `NA` values for the columns. 
+#' @param x Named list to construct a NA spec with, or a vector of values that +#' should be used in a spec with `.default` equal to those values. #' -#' @inheritParams readr::col_factor -#' @inheritParams readr::col_date #' @export -icol_logical = function(na) { - interlaced_collector(col_logical(), na) -} +na_cols <- function(...) { + xs <- quos(...) + col_names <- names2(xs) -#' @rdname icol_logical -#' @export -icol_integer <- function(na) { - interlaced_collector(col_integer(), na) + output <- set_names(rep_along(xs, list()), col_names) + + env <- new_environment() + mask <- new_data_mask(env) + + for (j in seq_along(xs)) { + res <- eval_tidy(xs[[j]], mask) + + output[j] <- list(res) + + if (col_names[[j]] != "") { + env[[col_names[[j]]]] <- res + } + } + + na_col_spec(output) } -#' @rdname icol_logical +#' @rdname na_cols #' @export -icol_double <- function(na) { - interlaced_collector(col_double(), na) +as.na_col_spec <- function(x) { + UseMethod("as.na_col_spec") } -#' @rdname icol_logical #' @export -icol_character <- function(na) { - interlaced_collector(col_character(), na) +as.na_col_spec.default <- function(x) { + na_col_spec(list(.default = x)) } -#' @rdname icol_logical #' @export -icol_factor <- function(na, levels = NULL, ordered = FALSE) { - interlaced_collector(col_factor(levels, ordered), na) +as.na_col_spec.interlacer_na_col_spec <- function(x) { + x } -#' @rdname icol_logical #' @export -icol_date <- function(na, format = "") { - interlaced_collector(col_date(format), na) +as.na_col_spec.list <- function(x) { + do.call(na_cols, x) } -#' @rdname icol_logical + +na_col_spec <- function(na_list) { + for (j in seq_along(na_list)) { + if (!is.numeric(j) && !is.character(j)) { + cli_abort( + paste0( + "na collector `{names(na_list)[[j]]}` is not a numeric", + " or character vector" + ) + ) + } + } + + default <- na_list$.default + na_list$.default <- NULL + + structure( + list(cols = na_list, default = default), + class = "interlacer_na_col_spec" + 
) +} + +#' @rdname na_cols #' @export -icol_time <- function(na, format = "") { - interlaced_collector(col_time(format), na) +is.na_col_spec <- function(x) { + inherits(x, "interlacer_na_col_spec") } -#' @rdname icol_logical #' @export -icol_datetime <- function(na, format = "") { - interlaced_collector(col_datetime(format), na) +print.interlacer_na_col_spec <- function (x, ...) { + cat(format(x)) + invisible(x) } -#' @rdname icol_logical #' @export -icol_number <- function(na) { - interlaced_collector(col_number(), na) +format.interlacer_na_col_spec <- function (x, ...) { + cols <- x$cols + + cols_args <- c( + map_chr(seq_along(cols), function(i) { + col_value <- format_na_collector(cols[[i]]) + col_name <- names(cols)[[i]] %||% "" + + if (col_name != "") { + col_name <- paste0(fix_non_syntactic(col_name), " = ") + } + + paste0(col_name, col_value) + }) + ) + + if (!is.null(x$default)) { + cols_args <- c( + paste0(".default = ", format_na_collector(x$default)), cols_args + ) + } + + out <- paste0( + "na_cols(\n", + " ", paste0(cols_args, collapse = ",\n "), + "\n)\n" + ) +} + +format_na_collector <- function(x, ...) { + if (is.null(x)) { + col_red("NULL") + } else if (is.numeric(x)) { + col_green(fmt_vec(x, quote = FALSE)) + } else if (is.character(x)) { + col_magenta(fmt_vec(x, quote = TRUE)) + } else { + "???" 
+ } +} + +fmt_vec <- function(x, quote) { + if (quote) { + x <- paste0("\"", x, "\"") + } + paste0( + "c(", paste0(x, collapse= ", "), ")" + ) } + +#' @importFrom readr col_character +#' @export +readr::col_character + +#' @importFrom readr col_date +#' @export +readr::col_date + +#' @importFrom readr col_datetime +#' @export +readr::col_datetime + +#' @importFrom readr col_double +#' @export +readr::col_double + +#' @importFrom readr col_factor +#' @export +readr::col_factor + +#' @importFrom readr col_guess +#' @export +readr::col_guess + +#' @importFrom readr col_integer +#' @export +readr::col_integer + +#' @importFrom readr col_logical +#' @export +readr::col_logical + +#' @importFrom readr col_number +#' @export +readr::col_number + +#' @importFrom readr col_skip +#' @export +readr::col_skip + +#' @importFrom readr col_time +#' @export +readr::col_time + +#' @importFrom readr cols_condense +#' @export +readr::cols_condense + +#' @importFrom readr cols_only +#' @export +readr::cols_only + +#' @importFrom readr cols +#' @export +readr::cols + +#' @importFrom readr as.col_spec +#' @export +readr::as.col_spec + +#' @importFrom readr spec +#' @export +readr::spec diff --git a/R/deinterlace_type_convert.R b/R/deinterlace_type_convert.R deleted file mode 100644 index 4616974..0000000 --- a/R/deinterlace_type_convert.R +++ /dev/null @@ -1,59 +0,0 @@ -#' Convert character columns and deinterlace missing reasons in existing data -#' frame -#' -#' @description -#' -#' This is a simple wrapper for `readr::type_convert()` that deinterlaces -#' missing reasons in addition to parsing values. -#' -#' @param x A data frame -#' @param col_types One of `NULL`, a [readr::cols()] specification, or a string. -#' @param na Character vector of strings to interpret as missing values. -#' @param ... additional parameters to pass to `readr::type_convert()` -#' -#' @return A [tibble()].with separate columns for values and missing reasons -#' for each variable. 
-#' -#' @export -deinterlace_type_convert <- function( - x, - col_types = NULL, - na = c("", "NA"), - ... -) { - # I thought about adding an `na_labels` to this API, but decided against it - # because col_factor() doesn't provide `labels`. It's probably because it's - # considered better to explicitly `fct_recode` instead. - - col_spec <- as.col_spec(col_types) - - lapply(names(x), function(value_name) { - curr_column <- x[[value_name]] - - missing_name <- to_missing_name(value_name) - - value_collector <- col_spec$cols[[value_name]] %||% col_spec$default - - all_na_values <- unique(c(value_collector$na, na)) - - values <- if_else(curr_column %in% all_na_values, NA, curr_column) - - missing_values <- if_else(curr_column %in% all_na_values, curr_column, NA) - - converted_values <- type_convert_col(values, value_collector, na, ...) - - converted_missing_values <- factor(missing_values, levels = all_na_values) - - set_names( - list2(converted_values, converted_missing_values), - c(value_name, missing_name) - ) - }) |> - list_flatten() |> - bind_cols() -} - -type_convert_col <- function(x, col, na, ...) { - type_convert(tibble(x), col_types = cols(x = col), na, ...)$x -} - diff --git a/R/deinterlaced_df.R b/R/deinterlaced_df.R deleted file mode 100644 index ce9116c..0000000 --- a/R/deinterlaced_df.R +++ /dev/null @@ -1,109 +0,0 @@ -deinterlaced_df <- function(x) { - if (inherits(x, "deinterlaced_df")) { - x - } - new_tibble(x, class = "deinterlaced_df") -} - -as_deinterlaced_df <- function(x) { - deinterlaced_df(x) -} - -is_deinterlaced_df <- function(x) { - inherits(x, "deinterlaced_df") -} - -deinterlaced_df_problems <- function(x) { - missing_probs <- lapply(missing_names(x), function(missing_name) { - value_name <- to_value_name(missing_name) - if (is.null(x[[value_name]])) { - return( - glue( - paste( - "Column `{missing_name}` implies `{value_name}` should exist, but", - "`{value_name}` not found." 
- ) - ) - ) - } - }) - - value_probs <- lapply(value_names(x), function(value_name) { - missing_name <- to_missing_name(value_name) - - if (!(missing_name %in% names(x))) { - return( - glue( - paste( - "Column `{value_name}` implies `{missing_name}` should exist, but", - "`{missing_name}` not found." - ) - ) - ) - } - - # This check is surprisingly expensive! - # Consider writing a native function here to speed things up - if (any(is.na(x[[value_name]]) & is.na(x[[missing_name]]))) { - return( - glue( - "Column `{value_name}` has rows without values or missing reasons" - ) - ) - } - - # This check is surprisingly expensive! - # Consider writing a native function here to speed things up - if (any(!is.na(x[[value_name]]) & !is.na(x[[missing_name]]))) { - return( - glue( - "Column `{value_name}` has rows with both values and missing reasons" - ) - ) - } - }) - - probs <- c(value_probs, missing_probs) - is_prob <- sapply(probs, \(x) !is.null(x)) - probs[is_prob] -} - -abort_if_deinterlace_df_problems <- function(x, call = caller_call()) { - df_problems <- deinterlaced_df_problems(x) - - if (length(df_problems) > 0) { - cli_abort( - c(df_problems[[1]], "i" = "Run `coalesce_channels()` to fix."), - call = call - ) - } -} - -#' @export -tbl_format_setup.deinterlaced_df <- function(x, width, ...) { - setup <- NextMethod() - - if (getOption("interlacer.print_validation", default = TRUE)) { - interlaced_probs <- deinterlaced_df_problems(x) - - if (length(interlaced_probs) > 0) { - cli_warn( - format_bullets_raw( - c( - glue("{interlaced_probs[[1]]}"), - "i" = glue("Run `coalesce_channels()` to fix.") - ) - ) - ) - } - } - - setup -} - -#' @export -tbl_format_header.deinterlaced_df <- function(x, setup, ...) 
{ - pillar::style_subtle( - glue("# A deinterlaced tibble: {nrow(x)} {symbol$times} {ncol(x)}") - ) -} diff --git a/R/example.R b/R/example.R index d2427ec..315411c 100644 --- a/R/example.R +++ b/R/example.R @@ -1,4 +1,4 @@ -#' Get path to interlacer example +#' Get a path to one of interlacer's example data sets #' #' interlacer comes bundled with a number of sample files in its `inst/extdata` #' directory. This function make them easy to access diff --git a/R/import-internal-readr.R b/R/import-internal-readr.R new file mode 100644 index 0000000..1074c0b --- /dev/null +++ b/R/import-internal-readr.R @@ -0,0 +1,29 @@ +## Misc internal functions from readr + +# nocov start + +type_to_col <- function(x, ...) UseMethod("type_to_col") +#' @export +type_to_col.default <- function(x, ...) col_character() +#' @export +type_to_col.logical <- function(x, ...) col_logical() +#' @export +type_to_col.integer <- function(x, ...) col_integer() +#' @export +type_to_col.double <- function(x, ...) col_double() +#' @export +type_to_col.factor <- function(x, ...) { + col_factor( + levels = levels(x), + ordered = is.ordered(x), + include_na = any(is.na(levels(x))) + ) +} +#' @export +type_to_col.Date <- function(x, ...) col_date() +#' @export +type_to_col.POSIXct <- function(x, ...) col_datetime() +#' @export +type_to_col.hms <- function(x, ...) 
col_time() + +# nocov end diff --git a/R/import-internal-vroom.R b/R/import-internal-vroom.R new file mode 100644 index 0000000..9c316ba --- /dev/null +++ b/R/import-internal-vroom.R @@ -0,0 +1,72 @@ +## Misc internal functions from vroom + +# nocov start + +vroom_should_show_col_types <- function(has_col_types, show_col_types) { + if (is.null(show_col_types)) { + return(isTRUE(!has_col_types)) + } + isTRUE(show_col_types) +} + +vroom_show_col_types <- function(x, locale) { + show_dims(x) + summary(spec(x), locale = locale) + cli_block(class = "vroom_spec_message", { + cli::cli_verbatim("\n\n") + cli::cli_alert_info("Use {.fn spec} to retrieve the full column specification for this data.") + cli::cli_alert_info("Specify the column types or set {.arg show_col_types = FALSE} to quiet this message.") + }) +} + +show_dims <- function(x) { + cli_block(class = "vroom_dim_message", { + cli::cli_text(" + {.strong Rows: }{.val {NROW(x)}} + {.strong Columns: }{.val {NCOL(x)}} + ") + }) +} + + +cli_block <- function(expr, class = NULL, type = rlang::inform) { + msg <- "" + withCallingHandlers( + expr, + message = function(x) { + msg <<- paste0(msg, x$message) + invokeRestart("muffleMessage") + } + ) + msg <- sub("^\n", "", msg) + msg <- sub("\n+$", "", msg) + + type(msg, class = class) +} + +vroom_col_select_map <- function(col_select, col_spec) { + col_select <- vroom_enquo(enquo(col_select)) + if (inherits(col_select, "quosures") || !quo_is_null(col_select)) { + if (inherits(col_select, "quosures")) { + vars <- tidyselect::vars_select( + names(col_spec$cols), !!!col_select + ) + } else { + vars <- tidyselect::vars_select( + names(col_spec$cols), !!col_select + ) + } + } else { + vars <- set_names(names(col_spec$cols), names(col_spec$cols)) + } + vars +} + +vroom_enquo <- function(x) { + if (quo_is_call(x, "c") || quo_is_call(x, "list")) { + return(as_quosures(get_expr(x)[-1], get_env(x))) + } + x +} + +# nocov end diff --git a/R/import-standalone-purrr.R 
b/R/import-standalone-purrr.R new file mode 100644 index 0000000..42e132d --- /dev/null +++ b/R/import-standalone-purrr.R @@ -0,0 +1,239 @@ +# Standalone file: do not edit by hand +# Source: +# ---------------------------------------------------------------------- +# +# --- +# repo: r-lib/rlang +# file: standalone-purrr.R +# last-updated: 2023-02-23 +# license: https://unlicense.org +# --- +# +# This file provides a minimal shim to provide a purrr-like API on top of +# base R functions. They are not drop-in replacements but allow a similar style +# of programming. +# +# ## Changelog +# +# 2023-02-23: +# * Added `list_c()` +# +# 2022-06-07: +# * `transpose()` is now more consistent with purrr when inner names +# are not congruent (#1346). +# +# 2021-12-15: +# * `transpose()` now supports empty lists. +# +# 2021-05-21: +# * Fixed "object `x` not found" error in `imap()` (@mgirlich) +# +# 2020-04-14: +# * Removed `pluck*()` functions +# * Removed `*_cpl()` functions +# * Used `as_function()` to allow use of `~` +# * Used `.` prefix for helpers +# +# nocov start + +map <- function(.x, .f, ...) { + .f <- as_function(.f, env = global_env()) + lapply(.x, .f, ...) +} +walk <- function(.x, .f, ...) { + map(.x, .f, ...) + invisible(.x) +} + +map_lgl <- function(.x, .f, ...) { + .rlang_purrr_map_mold(.x, .f, logical(1), ...) +} +map_int <- function(.x, .f, ...) { + .rlang_purrr_map_mold(.x, .f, integer(1), ...) +} +map_dbl <- function(.x, .f, ...) { + .rlang_purrr_map_mold(.x, .f, double(1), ...) +} +map_chr <- function(.x, .f, ...) { + .rlang_purrr_map_mold(.x, .f, character(1), ...) +} +.rlang_purrr_map_mold <- function(.x, .f, .mold, ...) { + .f <- as_function(.f, env = global_env()) + out <- vapply(.x, .f, .mold, ..., USE.NAMES = FALSE) + names(out) <- names(.x) + out +} + +map2 <- function(.x, .y, .f, ...) 
{ + .f <- as_function(.f, env = global_env()) + out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) + if (length(out) == length(.x)) { + set_names(out, names(.x)) + } else { + set_names(out, NULL) + } +} +map2_lgl <- function(.x, .y, .f, ...) { + as.vector(map2(.x, .y, .f, ...), "logical") +} +map2_int <- function(.x, .y, .f, ...) { + as.vector(map2(.x, .y, .f, ...), "integer") +} +map2_dbl <- function(.x, .y, .f, ...) { + as.vector(map2(.x, .y, .f, ...), "double") +} +map2_chr <- function(.x, .y, .f, ...) { + as.vector(map2(.x, .y, .f, ...), "character") +} +imap <- function(.x, .f, ...) { + map2(.x, names(.x) %||% seq_along(.x), .f, ...) +} + +pmap <- function(.l, .f, ...) { + .f <- as.function(.f) + args <- .rlang_purrr_args_recycle(.l) + do.call("mapply", c( + FUN = list(quote(.f)), + args, MoreArgs = quote(list(...)), + SIMPLIFY = FALSE, USE.NAMES = FALSE + )) +} +.rlang_purrr_args_recycle <- function(args) { + lengths <- map_int(args, length) + n <- max(lengths) + + stopifnot(all(lengths == 1L | lengths == n)) + to_recycle <- lengths == 1L + args[to_recycle] <- map(args[to_recycle], function(x) rep.int(x, n)) + + args +} + +keep <- function(.x, .f, ...) { + .x[.rlang_purrr_probe(.x, .f, ...)] +} +discard <- function(.x, .p, ...) { + sel <- .rlang_purrr_probe(.x, .p, ...) + .x[is.na(sel) | !sel] +} +map_if <- function(.x, .p, .f, ...) { + matches <- .rlang_purrr_probe(.x, .p) + .x[matches] <- map(.x[matches], .f, ...) + .x +} +.rlang_purrr_probe <- function(.x, .p, ...) { + if (is_logical(.p)) { + stopifnot(length(.p) == length(.x)) + .p + } else { + .p <- as_function(.p, env = global_env()) + map_lgl(.x, .p, ...) 
+ } +} + +compact <- function(.x) { + Filter(length, .x) +} + +transpose <- function(.l) { + if (!length(.l)) { + return(.l) + } + + inner_names <- names(.l[[1]]) + + if (is.null(inner_names)) { + fields <- seq_along(.l[[1]]) + } else { + fields <- set_names(inner_names) + .l <- map(.l, function(x) { + if (is.null(names(x))) { + set_names(x, inner_names) + } else { + x + } + }) + } + + # This way missing fields are subsetted as `NULL` instead of causing + # an error + .l <- map(.l, as.list) + + map(fields, function(i) { + map(.l, .subset2, i) + }) +} + +every <- function(.x, .p, ...) { + .p <- as_function(.p, env = global_env()) + + for (i in seq_along(.x)) { + if (!rlang::is_true(.p(.x[[i]], ...))) return(FALSE) + } + TRUE +} +some <- function(.x, .p, ...) { + .p <- as_function(.p, env = global_env()) + + for (i in seq_along(.x)) { + if (rlang::is_true(.p(.x[[i]], ...))) return(TRUE) + } + FALSE +} +negate <- function(.p) { + .p <- as_function(.p, env = global_env()) + function(...) !.p(...) +} + +reduce <- function(.x, .f, ..., .init) { + f <- function(x, y) .f(x, y, ...) + Reduce(f, .x, init = .init) +} +reduce_right <- function(.x, .f, ..., .init) { + f <- function(x, y) .f(y, x, ...) + Reduce(f, .x, init = .init, right = TRUE) +} +accumulate <- function(.x, .f, ..., .init) { + f <- function(x, y) .f(x, y, ...) + Reduce(f, .x, init = .init, accumulate = TRUE) +} +accumulate_right <- function(.x, .f, ..., .init) { + f <- function(x, y) .f(y, x, ...) 
+ Reduce(f, .x, init = .init, right = TRUE, accumulate = TRUE) +} + +detect <- function(.x, .f, ..., .right = FALSE, .p = is_true) { + .p <- as_function(.p, env = global_env()) + .f <- as_function(.f, env = global_env()) + + for (i in .rlang_purrr_index(.x, .right)) { + if (.p(.f(.x[[i]], ...))) { + return(.x[[i]]) + } + } + NULL +} +detect_index <- function(.x, .f, ..., .right = FALSE, .p = is_true) { + .p <- as_function(.p, env = global_env()) + .f <- as_function(.f, env = global_env()) + + for (i in .rlang_purrr_index(.x, .right)) { + if (.p(.f(.x[[i]], ...))) { + return(i) + } + } + 0L +} +.rlang_purrr_index <- function(x, right = FALSE) { + idx <- seq_along(x) + if (right) { + idx <- rev(idx) + } + idx +} + +list_c <- function(x) { + inject(c(!!!x)) +} + +# nocov end diff --git a/R/interlace_channels.R b/R/interlace_channels.R deleted file mode 100644 index 9cdd8af..0000000 --- a/R/interlace_channels.R +++ /dev/null @@ -1,33 +0,0 @@ -#' Re-interlace a deinterlaced data frame -#' -#' @description -#' -#' This function will take a deinterlaced data frame and re-interlace it by -#' combining value and missing reason column pairs into single character -#' columns. -#' -#' @param x A deinterlaced data frame -#' -#' @returns An interlaced data frame, that is, a data frame with character -#' columns that contain both values and missing reasons. 
-#' -#' @export -interlace_channels <- function(x) { - abort_if_deinterlace_df_problems(x) - - # TODO: this is another function that would benefit from native speedup - - lapply(value_names(x), function(value_name) { - values <- x[[value_name]] - missing_name <- to_missing_name(value_name) - missing_values <- x[[missing_name]] - - if_else( - is.na(missing_values), - as.character(values), - as.character(missing_values), - ) - }) |> - set_names(value_names(x)) |> - bind_cols() -} diff --git a/R/interlaced.R b/R/interlaced.R new file mode 100644 index 0000000..6ddc8ee --- /dev/null +++ b/R/interlaced.R @@ -0,0 +1,775 @@ +#' Construct an `interlaced` vector +#' +#' The `interlaced` type extends vectors by adding a "missing reason" channel +#' which can be used to distinguish different types of missingness. The +#' `interlaced()` function constructs a new `interlaced` vector from a vector +#' or list of values. +#' +#' @param x A vector or list of values +#' @param na A vector of values to interpret as missing values +#' @param ... Additional arguments, not used +#' +#' @returns An `interlaced` vector +#' +#' @export +interlaced <- function(x, na=NULL) { + if (is.character(na)) { + na <- factor(na, levels = unique(na)) + } + + m <- na[match(x, na)] + v <- x + v[x %in% na] <- NA + new_interlaced(list_c(v), m) +} + +#' @rdname interlaced +#' @export +as.interlaced <- function(x, na = NULL, ...) { + UseMethod("as.interlaced") +} + +#' @rdname interlaced +#' @export +as.interlaced.default <- function(x, na = NULL, ...) { + interlaced(x, na) +} + +#' @rdname interlaced +#' @export +as.interlaced.interlacer_interlaced <- function(x, ...) { + x +} + +#' @rdname interlaced +#' @export +as.interlaced.data.frame <- function(x, ...) 
{ + x[] <- map(x, \(c) as.interlaced(c, ...)) + x +} + +#' @rdname interlaced +#' @export +is.interlaced <- function(x) { + inherits(x, "interlacer_interlaced") +} + +#' Parse a `character` vector into an `interlaced` vector type +#' +#' `parse_interlaced` converts a character vector to an `interlaced` vector +#' by parsing it with a readr `collector` type. +#' +#' @param x A character vector +#' @param na A vector of values to interpret as missing values +#' @param .value_col A collector to parse the character values (e.g. +#' `readr::col_double()`, `readr::col_integer()`, etc.) +#' +#' @returns An `interlaced` vector +#' +#' @export +parse_interlaced <- function( + x, na, + .value_col = col_guess() +) { + if (!is.character(x)) { + cli_abort("{.arg x} must be a character vector") + } + + v <- type_convert_col(x, .value_col, na = as.character(na)) + + m <- na[match(x, na)] + + new_interlaced(v, m) +} + +new_interlaced <- function(value_channel, na_channel, ...) { + obj_check_vector(value_channel) + obj_check_vector(na_channel) + + if (vec_size(value_channel) != vec_size(na_channel)) { + cli_abort("value_channel and na_channel must be the same size") + } + + if (any(!is.na(value_channel) & !is.na(na_channel))) { + cli_abort( + "value_channel and na_channel cannot simultaneously have valid values" + ) + } + + if (is.interlaced(value_channel) || is.interlaced(na_channel)) { + cli_abort("interlaced types cannot be nested") + } + + if (is.character(na_channel)) { + na_channel <- vec_cast(na_channel, factor(levels=unique(na_channel))) + } + + if (is.numeric(na_channel)) { + na_channel <- vec_cast(na_channel, integer()) + } + + if ( + !is.factor(na_channel) && + !is.integer(na_channel) && + !inherits(na_channel, "vctrs_unspecified") + ) { + cli_abort("na_channel must be factor or integer") + } + + v <- new_vctr( + value_channel, + na_channel_values = na_channel, + class = "interlacer_interlaced" + ) + + for (i in names(attributes(value_channel))) { + if (i != "class") { + 
if (!is.null(attr(v, i))) { + cli_abort("attribute {i} in value vector conflicts with interlaced") + } + attr(v, i) <- attr(value_channel, i) + } + } + + if (!inherits(value_channel, "vctrs_unspecified")) { + class(v) <- c(class(v), class(value_channel)) + } + + v +} + +#' Lift values to missing reasons +#' +#' `na()` lifts a value into an `interlaced` missing reason channel. +#' +#' @param x A character or numeric value +#' +#' @returns An `interlaced` value +#' +#' @export +na <- function(x = unspecified()) { + if (is.logical(x) && all(is.na(x))) { + x <- unspecified(vec_size(x)) + } + new_interlaced(unspecified(vec_size(x)), x) +} + +#' Access the channels of an `interlaced` vector +#' +#' * `value_channel()` returns the value channel of an `interlaced` vector +#' * `na_channel()` returns the missing reason channel of an `interlaced` vector +#' +#' @param x An `interlaced` vector +#' @param ... Additional arguments, not used +#' +#' @returns The value or missing reasons channel +#' +#' @export +value_channel <- function(x, ...) { + UseMethod("value_channel") +} + +#' @export +value_channel.default <- function(x, ...) { + x +} + +#' @export +value_channel.interlacer_interlaced <- function(x, ...) { + attr(x, "na_channel_values") <- NULL + cls <- class(x) + cls_idx <- match("vctrs_vctr", cls) + if (cls_idx == length(cls)) { + # When this uses vec_size(), it generates a stack overflow... + x <- unspecified(length(x)) + } else { + class(x) <- cls[(cls_idx+1):length(cls)] + } + x +} + +#' @export +value_channel.data.frame <- function(x, ...) { + x[] <- map(x, value_channel) + x +} + +#' @rdname value_channel +#' @export +na_channel <- function(x, ...) { + UseMethod("na_channel") +} + +#' @export +na_channel.default <- function(x, ...) { + unspecified(vec_size(x)) +} + +#' @export +na_channel.interlacer_interlaced <- function(x, ...) { + attr(x, "na_channel_values") +} + +#' @export +na_channel.data.frame <- function(x, ...) 
{ + x[] <- map(x, na_channel) + x +} + +#' Flatten an `interlaced` vector +#' +#' `flatten_channels()` flattens an `interlaced` vector into a single channel. +#' This is useful as a step right before writing an `interlaced` vector to a +#' file, for example. +#' +#' @param x An `interlaced` vector +#' @param ... Additional arguments, not used +#' +#' @returns The vector, flattened: missing values become their missing reasons +#' +#' @export +flatten_channels <- function(x, ...) { + UseMethod("flatten_channels") +} + +#' @export +flatten_channels.default <- function(x, ...) { + x +} + +#' @export +flatten_channels.data.frame <- function(x, ...) { + x[] <- map(x, flatten_channels) + x +} + +#' @export +flatten_channels.interlacer_interlaced <- function(x, ...) { + v <- value_channel(x) + m <- na_channel(x) + + if (!(is.numeric(v) && is.numeric(m)) && !(is.factor(v) && is.factor(m))) { + v <- as.character(v) + m <- as.character(m) + } + + isect <- na.omit(intersect(v, m)) + if (length(isect) > 0) { + cli_abort("value and na channels have items that overlap: {isect}") + } + + if_else(!is.na(v), v, m) +} + + +# Functional utilities ---------------------------------------------------- + +#' `interlaced` functional utilities +#' +#' `map_value_channel()` modifies the values of an `interlaced` +#' vector. `map_na_channel()` modifies the missing reason channel of an +#' `interlaced` vector. 
+#' +#' @param x an interlaced vector +#' @param fn a function that maps values or missing reasons to new values +#' +#' @returns a new interlaced vector, modified according to the supplied function +#' @export +map_value_channel <- function(x, fn) { + new_interlaced( + fn(value_channel(x)), + na_channel(x) + ) +} + + +#' @rdname map_value_channel +#' @export +map_na_channel <- function(x, fn) { + new_interlaced( + value_channel(x), + fn(na_channel(x)) + ) +} + +# Utility helper, not exported +bimap_interlaced <- function(x, fn) { + new_interlaced( + fn(value_channel(x)), + fn(na_channel(x)) + ) +} + +bimap2_interlaced <- function(x, y, fn) { + new_interlaced( + fn(value_channel(x), value_channel(y)), + fn(na_channel(x), na_channel(y)), + ) +} + +# Display --------------------------------------------------------------- + +#' @export +vec_ptype_full.interlacer_interlaced <- function(x, ...) { + paste0( + "interlaced<", + vec_ptype_abbr(value_channel(x)), + ", ", + vec_ptype_abbr(na_channel(x)), + ">" + ) +} + +#' @export +vec_ptype_abbr.interlacer_interlaced <- function(x, ...) { + paste0(vec_ptype_abbr(value_channel(x)), ",", vec_ptype_abbr(na_channel(x))) +} + +#' @export +format.interlacer_interlaced <- function(x, ...) { + map_chr(x, function(i) { + if (is.empty(i)) { + return(paste0("<<", format(na_channel(i)), ">>")) + } + if (is.na(i)) { + return(paste0("<", format(na_channel(i)), ">")) + } + format(value_channel(i)) + }) +} + +#' @export +obj_print_footer.interlacer_interlaced <- function(x, ...) { + if (!is.null(levels(x))) { + cat("Levels:", paste(levels(x), collapse = " "), "\n") + } + if (!is.null(na_levels(x))) { + cat("NA levels:", paste(na_levels(x), collapse = " "), "\n") + } +} + +style_empty <- function(x) { + cli::col_blue(x) +} + +#' @importFrom pillar pillar_shaft +#' @export +pillar_shaft.interlacer_interlaced <- function(x, ...) 
{ + align <- if (is_character(x)) "left" else "right" + items <- map(x, function(i) { + if (is.empty(i)) { + return(style_empty(format(i))) + } + if (is.na(i)) { + return(pillar::style_na(format(i))) + } + format(i) + }) + pillar::new_pillar_shaft_simple(items, align = align) +} + +# Proxies -------------------------------------------------------------- + +#' @export +vec_proxy.interlacer_interlaced <- function(x, ...) { + data_frame( + v = value_channel(x), + m = na_channel(x), + ) +} + +#' @export +vec_restore.interlacer_interlaced <- function(x, to, ...) { + new_interlaced(x$v, x$m) +} + +#' @export +vec_proxy_equal.interlacer_interlaced <- function(x, ...) { + map(x, function(i) { + if (is.empty(i)) { + return(NULL) + } else { + as.list(i) + } + }) +} + +#' @export +vec_proxy_compare.interlacer_interlaced <- function(x, ...) { + value_channel(x) +} + +#' @export +vec_proxy_order.interlacer_interlaced <- function(x, ...) { + vec_proxy(x) +} + + +# Subsetting -------------------------------------------------------------- + +#' @export +`[.interlacer_interlaced` <- function(x, i, ...) { + if (!missing(...)) { + cli_abort("Can't index interlaced vectors on dimensions greater than 1.") + } + i <- maybe_missing(i, TRUE) + bimap_interlaced(x, \(v) v[i]) +} + +#' @export +`[[.interlacer_interlaced` <- function(x, i, ...) { + if (!missing(...)) { + cli_abort("Can't index interlaced vectors on dimensions greater than 1.") + } + bimap_interlaced(x, \(v) v[[i]]) +} + +#' @export +`$.interlacer_interlaced` <- function(x, i, ...) 
{ + stop_unsupported(x, "subsetting with $") +} + +#' @export +`[<-.interlacer_interlaced` <- function(x, i, ..., value) { + if (!missing(...)) { + cli_abort("Can't index interlaced vectors on dimensions greater than 1.") + } + bimap2_interlaced(x, value, \(v, new_v) `[<-`(v, i, value=new_v)) +} + +#' @export +`[[<-.interlacer_interlaced` <- function(x, i, ..., value) { + if (!missing(...)) { + cli_abort("Can't index interlaced vectors on dimensions greater than 1.") + } + bimap2_interlaced(x, value, \(v, new_v) `[[<-`(v, i, value=new_v)) +} + +#' @export +`$<-.interlacer_interlaced` <- function(x, i, value) { + stop_unsupported(x, "subset assignment with $") +} + +# Misc ----------------------------------------------------------- + +#' @export +rep.interlacer_interlaced <- function(x, ...) { + bimap_interlaced(x, \(v) rep(v, ...)) +} + +#' @export +`length<-.interlacer_interlaced` <- function(x, value) { + bimap_interlaced(x, \(v) `length<-`(v, value)) +} + +#' Get the factor levels of the value or missing reason channel +#' +#' The base S3 `levels()` function is overloaded for `interlaced` vectors, so +#' when the value channel is a factor type, `levels()` will return its levels. +#' Similarly `na_levels()` will return the levels for the missing reason +#' channel, when it is a `factor` type. 
+#' +#' @param x an `interlaced` vector +#' @param value new levels to set +#' +#' @returns The levels of the values or missing reason channel +#' +#' @export +na_levels <- function(x) { + levels(na_channel(x)) +} + +#' @rdname na_levels +#' @export +`na_levels<-` <- function(x, value) { + map_na_channel(x, \(v) `levels<-`(v, value)) +} + +#' @rdname na_levels +#' @export +levels.interlacer_interlaced <- function(x) { + levels(value_channel(x)) +} + +#' @rdname na_levels +#' @export +`levels<-.interlacer_interlaced` <- function(x, value) { + map_value_channel(x, \(v) `levels<-`(v, value)) +} + + +# NA functions --------------------------------------------------------- + +#' Test for empty elements +#' +#' When an element has neither a value nor a missing reason, it is considered +#' "empty". `is.empty()` checks for these types of elements. Regular `NA` values +#' (with no missing reasons) are also considered "empty". +#' +#' @param x a vector +#' +#' @returns a logical vector the same length as x, containing TRUE for all +#' empty elements, and FALSE otherwise. +#' +#' @export +is.empty <- function(x) { + UseMethod("is.empty") +} + +#' @export +is.empty.default <- function(x) { + is.na(x) +} + +#' @export +is.empty.interlacer_interlaced <- function(x) { + is.na(value_channel(x)) & is.na(na_channel(x)) +} + +# TODO: Remove these if not inheriting from vctrs_vctr + +#' @export +is.na.interlacer_interlaced <- function(x) { + is.na(value_channel(x)) +} + +#' @importFrom stats na.omit +#' @export +na.omit.interlacer_interlaced <- function(object, ...) { + na.omit(value_channel(object), ...) +} + +#' @importFrom stats na.exclude +#' @export +na.exclude.interlacer_interlaced <- function(object, ...) { + na.exclude(value_channel(object), ...) +} + +#' @importFrom stats na.fail +#' @export +na.fail.interlacer_interlaced <- function(object, ...) { + na.fail(value_channel(object), ...) 
+} + +# Comparison & Order ---------------------------------------------------- + +# Min, max, and range, etc. have to be redefined here because they are +# implemented in the original vctrs_vctr by finding the index of the desired +# value, then returning the vctr indexed at that location. Problem is, this +# causes it to sometimes return na(Reason)s instead of a base NA when na.rm=F + +#' @export +min.interlacer_interlaced <- function(x, ...) { + min(value_channel(x), ...) +} + +#' @export +max.interlacer_interlaced <- function(x, ...) { + max(value_channel(x), ...) +} + +#' @export +range.interlacer_interlaced <- function(x, ...) { + range(value_channel(x), ...) +} + +#' @importFrom stats median +#' @export +median.interlacer_interlaced <- function(x, ...) { + median(value_channel(x), ...) +} + +#' @importFrom stats quantile +#' @export +quantile.interlacer_interlaced <- function(x, ...) { + quantile(value_channel(x), ...) +} + +#' @export +xtfrm.interlacer_interlaced <- function(x) { + xtfrm(value_channel(x)) +} + +# Math / Arith ------------------------------------------------------- + +#' @export +vec_math.interlacer_interlaced <- function(.fn, .x, ...) { + vec_math(.fn, value_channel(.x), ...) +} + +#' @export +#' @method vec_arith interlacer_interlaced +vec_arith.interlacer_interlaced <- function(op, x, y, ...) { + UseMethod("vec_arith.interlacer_interlaced", y) +} + +arith_unwrap <- function(op, x, y, ...) 
{ + vec_arith(op, value_channel(x), value_channel(y)) +} + +#' @export +#' @method vec_arith.interlacer_interlaced default +vec_arith.interlacer_interlaced.default <- arith_unwrap + +#' @export +#' @method vec_arith.interlacer_interlaced interlacer_interlaced +vec_arith.interlacer_interlaced.interlacer_interlaced <- arith_unwrap + +## + +#' @export +#' @method vec_arith.numeric interlacer_interlaced +vec_arith.numeric.interlacer_interlaced <- arith_unwrap + +#' @export +#' @method vec_arith.logical interlacer_interlaced +vec_arith.logical.interlacer_interlaced <- arith_unwrap + +# Coercion ---------------------------------------------------------------- + +vec_ptype2_interlaced <- function(x, y, ...) { + bimap2_interlaced(x, y, vec_ptype2) +} + +#' @export +vec_ptype2.interlacer_interlaced.character <- vec_ptype2_interlaced + +#' @export +vec_ptype2.interlacer_interlaced.double <- vec_ptype2_interlaced + +#' @export +vec_ptype2.interlacer_interlaced.integer <- vec_ptype2_interlaced + +#' @export +vec_ptype2.interlacer_interlaced.logical <- vec_ptype2_interlaced + +#' @export +vec_ptype2.interlacer_interlaced.factor <- vec_ptype2_interlaced + +####### + +#' @export +vec_ptype2.interlacer_interlaced.interlacer_interlaced <- vec_ptype2_interlaced + +####### + +#' @export +vec_ptype2.character.interlacer_interlaced <- vec_ptype2_interlaced + +#' @export +vec_ptype2.integer.interlacer_interlaced <- vec_ptype2_interlaced + +#' @export +vec_ptype2.double.interlacer_interlaced <- vec_ptype2_interlaced + +#' @export +vec_ptype2.logical.interlacer_interlaced <- vec_ptype2_interlaced + +#' @export +vec_ptype2.factor.interlacer_interlaced <- vec_ptype2_interlaced + +# Casting ----------------------------------------------------------------- + +cast_lift <- function(x, to, ...) 
{ + bimap2_interlaced(x, to, vec_cast) +} + +#' @export +vec_cast.interlacer_interlaced.character <- cast_lift + +#' @export +vec_cast.interlacer_interlaced.double <- cast_lift + +#' @export +vec_cast.interlacer_interlaced.integer <- cast_lift + +#' @export +vec_cast.interlacer_interlaced.logical <- cast_lift + +#' @export +vec_cast.interlacer_interlaced.factor <- cast_lift + +#### + +#' @export +vec_cast.interlacer_interlaced.interlacer_interlaced <- cast_lift + +#### + +cast_unwrap <- function(x, to, ...) { + vec_cast(value_channel(x), to, ...) +} + +#' @export +vec_cast.character.interlacer_interlaced <- cast_unwrap + +#' @export +vec_cast.double.interlacer_interlaced <- cast_unwrap + +#' @export +vec_cast.integer.interlacer_interlaced <- cast_unwrap + +#' @export +vec_cast.logical.interlacer_interlaced <- cast_unwrap + +#' @export +vec_cast.factor.interlacer_interlaced <- cast_unwrap + +# Override misc operations carried from vctrs_vctr that cause problems +# +# TODO: Maybe we should move away from vctrs_vctr so it more reliably uses +# the functions for the base type? + +#' @export +is.infinite.interlacer_interlaced <- function(x, ...) { + is.infinite(value_channel(x), ...) +} + +#' @export +is.finite.interlacer_interlaced <- function(x, ...) { + is.finite(value_channel(x), ...) +} + +#' @export +as.logical.interlacer_interlaced <- function(x, ...) { + as.logical(value_channel(x), ...) +} + +#' @export +as.character.interlacer_interlaced <- function(x, ...) { + as.character(value_channel(x), ...) +} + +#' @export +as.integer.interlacer_interlaced <- function(x, ...) { + as.integer(value_channel(x), ...) +} + +#' @export +as.double.interlacer_interlaced <- function(x, ...) { + as.double(value_channel(x), ...) +} + +#' @importFrom generics as.factor +#' @export +generics::as.factor + +#' @export +as.factor.interlacer_interlaced <- function(x, ...) { + as.factor(value_channel(x), ...) 
+} + +#' @importFrom generics as.ordered +#' @export +generics::as.ordered + +#' @export +as.ordered.interlacer_interlaced <- function(x, ...) { + as.ordered(value_channel(x), ...) +} + +# Helpers ----------------------------------------------------------------- + +stop_unsupported <- function(x, method) { + cli_abort("`{method}.{class(x)[[1]]}()` not supported.") +} diff --git a/R/interlacer.R b/R/interlacer.R index f1eb8ac..7180a47 100644 --- a/R/interlacer.R +++ b/R/interlacer.R @@ -1,11 +1,7 @@ #' @import rlang -#' @import dplyr +#' @import vctrs #' @import cli -#' @import readr -#' @import tibble -#' @importFrom pillar tbl_format_setup tbl_format_header -#' @importFrom purrr list_flatten -#' @importFrom tidyselect peek_vars -#' @importFrom glue glue +#' @importFrom tibble tibble as_tibble +#' @importFrom dplyr if_else NULL diff --git a/R/read.R b/R/read.R index 49e3534..5ad6efd 100644 --- a/R/read.R +++ b/R/read.R @@ -3,103 +3,456 @@ #' The `read_interlaced_*()`, family of functions extend `readr`'s #' `read_delim()`, `read_csv`, etc. functions for use on data sources where #' values are interlaced with missing reasons. These functions return a tibble -#' with two columns for each interlaced source column: a column with -#' values, and a column with missing reasons. Missing reason columns are named -#' by taking the value column name and surrounding it by dots -#' (e.g. missing reasons for "col_name" are read into a column named -#' ".col_name.") +#' with `interlaced` columns. #' -#' @param file Either a path to a file, a connection, or literal data (either -#' a single string or a raw vector). -#' @param delim Single character used to separate fields within a record. -#' @param col_types One of `NULL`, a [readr::cols()] specification, or a string. In -#' addition to the `col_*` specifiers provided by `readr`, `icol_*()` -#' specifiers may be used. See `vignette("interlacer")` for more details. -#' @param col_select Columns to include in the results. 
As with -#' [reader::read_delim], you can use the same mini-language as -#' [dplyr::select()] to refer to the columns by name. -#' @param na Character vector of strings to interpret as missing values. These -#' values will become the factor levels of the missing reason column. -#' @param ... Additional parameters to pass to `read_delim` +#' @inheritParams readr::read_delim +#' @inheritParams vroom::vroom #' -#' @return A deinterlaced [tibble()], that is, a tibble with separate columns -#' for values and missing reasonskfor each variable. +#' @param na A NA col spec defined by `na_cols()` or a character or numeric +#' vector of values to interpret as missing values. +#' +#' @return A [tibble()], with interlaced columns. #' #' @export #' @examples #' # Beep boop read_interlaced_delim <- function( - file, - delim = NULL, - col_types = NULL, - col_select = NULL, - na = c("", "NA"), - ... + file, + delim = NULL, + quote = "\"", + escape_backslash = FALSE, + escape_double = TRUE, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + id = NULL, + locale = readr::default_locale(), + na = c("", "NA"), + comment = "", + trim_ws = FALSE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + # num_threads = readr::readr_threads(), + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE + # lazy = should_read_lazy() ) { - read_delim( - file, delim, col_types = cols(.default = "c"), na = NULL, ... 
- ) |> - read_interlaced_helper(col_types, {{ col_select }}, na) + interlaced_vroom( + file = file, + delim = delim, + col_names = col_names, + col_types = col_types, + col_select = {{col_select}}, + id = id, + skip = skip, + n_max = n_max, + na = na, + quote = quote, + comment = comment, + skip_empty_rows = skip_empty_rows, + trim_ws = trim_ws, + escape_double = escape_double, + escape_backslash = escape_backslash, + locale = locale, + guess_max = guess_max, + # altrep + # num_threads = num_threads, + progress = progress, + show_col_types = show_col_types, + .name_repair = name_repair + ) } #' @rdname read_interlaced_delim #' @export read_interlaced_csv <- function( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... + quote = "\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + # num_threads = readr::readr_threads(), + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE + # lazy = should_read_lazy() ) { - read_csv( - file, col_types = cols(.default = "c"), na = character(), ... - ) |> - read_interlaced_helper(col_types, {{ col_select }}, na) + interlaced_vroom( + file = file, + delim = ",", + col_names = col_names, + col_types = col_types, + col_select = {{col_select}}, + id = id, + skip = skip, + n_max = n_max, + na = na, + quote = quote, + comment = comment, + skip_empty_rows = skip_empty_rows, + trim_ws = trim_ws, + escape_double = TRUE, + escape_backslash = FALSE, + locale = locale, + guess_max = guess_max, + # altrep + # num_threads = num_threads, + progress = progress, + show_col_types = show_col_types, + .name_repair = name_repair + ) } #' @rdname read_interlaced_delim #' @export read_interlaced_csv2 <- function( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... 
+ quote = "\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + # num_threads = readr::num_threads(), + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE + # lazy = should_read_lazy() ) { - read_csv2( - file, col_types = cols(.default = "c"), na = character(), ... - ) |> - read_interlaced_helper(col_types, {{ col_select }}, na) + interlaced_vroom( + file = file, + delim = ";", + col_names = col_names, + col_types = col_types, + col_select = {{col_select}}, + id = id, + skip = skip, + n_max = n_max, + na = na, + quote = quote, + comment = comment, + skip_empty_rows = skip_empty_rows, + trim_ws = trim_ws, + escape_double = TRUE, + escape_backslash = FALSE, + locale = locale, + guess_max = guess_max, + # altrep + # num_threads = num_threads, + progress = progress, + show_col_types = show_col_types, + .name_repair = name_repair + ) } #' @rdname read_interlaced_delim #' @export read_interlaced_tsv <- function( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... + quote = "\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + # num_threads = readr::num_threads(), + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE + # lazy = should_read_lazy() ) { - read_tsv( - file, col_types = cols(.default = "c"), na = character(), ... 
- ) |> - read_interlaced_helper(col_types, {{ col_select }}, na) + interlaced_vroom( + file = file, + delim = "\t", + col_names = col_names, + col_types = col_types, + col_select = {{col_select}}, + id = id, + skip = skip, + n_max = n_max, + na = na, + quote = quote, + comment = comment, + skip_empty_rows = skip_empty_rows, + trim_ws = trim_ws, + escape_double = TRUE, + escape_backslash = FALSE, + locale = locale, + guess_max = guess_max, + # altrep + # num_threads = num_threads, + progress = progress, + show_col_types = show_col_types, + .name_repair = name_repair + ) } -read_interlaced_helper <- function( - x, - col_types, - col_select, - na +#' @rdname read_interlaced_delim +#' @export +interlaced_vroom <- function( + file, + delim = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + id = NULL, + skip = 0, + n_max = Inf, + na = c("", "NA"), + quote = "\"", + comment = "", + skip_empty_rows = TRUE, + trim_ws = TRUE, + escape_double = TRUE, + escape_backslash = FALSE, + locale = vroom::default_locale(), + guess_max = 100, + # altrep = TRUE, + # num_threads = vroom_threads(), + progress = vroom::vroom_progress(), + show_col_types = NULL, + .name_repair = "unique" ) { - col_select <- enquo(col_select) - if (quo_is_null(col_select)){ - col_select <- expr(everything()) + std_opts <- list2( + file = file, + delim = delim, + col_names = col_names, + # col_types + # col_select + # id + skip = skip, + n_max = n_max, + # na + quote = quote, + comment = comment, + skip_empty_rows = skip_empty_rows, + trim_ws = trim_ws, + escape_double = escape_double, + escape_backslash = escape_backslash, + locale = locale, + guess_max = guess_max, + altrep = FALSE, + # num_threads + progress = FALSE, + show_col_types = FALSE, + .name_repair = "check_unique", + ) + + col_spec <- as.col_spec(col_types) + na_col_spec <- as.na_col_spec(na) + + check_col_spec(col_spec, "col_types") + check_col_spec(na_col_spec, "na_col_types") + + if (!is.null(id)) { + cli_abort( + "{.arg id} 
arg not supported (yet)" + ) } - x |> - select(!!col_select) |> - deinterlace_type_convert(col_types, na) |> - as_deinterlaced_df() + # Step 1: Read everything as string + + df_chr <- inject( + vroom::vroom( + !!!std_opts, + col_types = cols(.default = "c"), + col_select = {{ col_select }}, + id = NULL, + na = character(), + # num_threads = num_threads + ) + ) + + # IMMEDIATELY rename the cols back to their original names. + # (We will do the final rename after loading the values) + # `vars` has the original column names in the file, and + # `names(vars)` is what we want to rename them to. + vars <- vroom_col_select_map({{col_select}}, spec(df_chr)) + names(df_chr) <- vars + + # Set names of unnamed col_specs according to the columns vroom found + col_spec <- fix_col_spec_names( + col_spec, + names(spec(df_chr)$cols), + col_guess(), + "col_types" + ) + + na_col_spec <- fix_col_spec_names( + na_col_spec, + names(spec(df_chr)$cols), + NULL, + "na" + ) + + # Step 2: For each of the resulting columns, go back and convert values + + if (progress) { + p <- cli_progress_bar("Loading", total = length(vars)) + } + + out <- map(set_names(vars, vars), function(i) { + collector <- col_spec$cols[[i]] %||% col_spec$default + + if (i %in% names2(na_col_spec$cols)) { + # Col is explicitly overridden; don't use .default + na_collector <- na_col_spec$cols[[i]] + } else { + na_collector <- na_col_spec$cols[[i]] %||% na_col_spec$default + } + + vroom_call <- withWarnings( + inject( + vroom::vroom( + !!!std_opts, + col_types = col_spec, + col_select = tidyselect::all_of(i), + id = NULL, + na = as.character(na_collector), + # num_threads = 1 + ) + ) + ) + + value_df <- vroom_call$value + + used_value_collector <- spec(value_df)$cols[[i]] + values <- value_df[[1]] + + if (is.null(na_collector)) { + out_value <- values + } else { + na_idx <- match(df_chr[[i]], na_collector) + na_values <- factor(na_collector[na_idx], levels=na_collector) + out_value <- new_interlaced(values, na_values) + 
} + + if (progress) { + cli_progress_update(id = p) + } + + list( + values = out_value, + problems = vroom_call$warnings, + spec = used_value_collector + ) + }) + + df <- as_tibble(map(out, \(i) i$values), .name_repair = .name_repair) + + # Replace spec cols from chr spec into values col specs + attr(df, "spec") <- update_col_spec( + spec(df_chr), map(out, \(i) i$spec), col_spec$default + ) + + attr(df, "na_spec") <- na_col_spec + + # Rename result to names from col_select + names(df) <- names(vars) + + # Show col types if requested + if ( + !is_testing() && + vroom_should_show_col_types(!is.null(col_types), show_col_types) + ) { + vroom_show_col_types(df, locale) + } + + # I'd like to hoover up all the vroom problems and put them together as a + # `problems` attr on the result, but I can't because of this bug: + # https://github.com/tidyverse/vroom/issues/534 + # + # Instead, I just warn if there was an issue... + for (i in seq_along(out)) { + for (w in out[[i]]$problems) { + if (inherits(w, "vroom_parse_issue")) { + cli_warn("column `{names(df)[[i]]}` had a vroom parse issue") + } else { + cli_warn( + c( + "unexpected vroom warning on column `{names(df)[[i]]}`", + ">" = "{w}" + ) + ) + } + } + } + + df +} + +#' Examine the NA spec of a data frame +#' +#' Like `readr::spec()`, `na_spec()` extracts the NA column specification from +#' a tibble created by `read_interlaced_*` +#' +#' @param x The data frame object to extract from +#' +#' @returns An `na_col_spec` object +#' +#' @export +na_spec <- function(x) { + attr(x, "na_spec") +} + +check_col_spec <- function(col_spec, arg) { + if (any(names2(col_spec$cols) == "") && any(names2(col_spec$cols) != "")) { + cli_abort( + "{.arg {arg}} cannot have a mix of named and unnamed values" + ) + } +} + +update_col_spec <- function(col_spec, update_list, default) { + col_spec$cols[names(update_list)] <- update_list + col_spec$default <- default + col_spec +} + +fix_col_spec_names <- function(col_spec, spec_names, default, arg) { + is_unnamed_col_spec <- 
all(names2(col_spec$cols) == "") + + if (length(col_spec$cols) > 0 && is_unnamed_col_spec) { + if (length(col_spec$cols) != length(spec_names)) { + cli_warn( + paste0( + "mismatch between number of unnamed columns defined in ", + "`{arg}` ({length(col_spec$cols)}) and columns found in ", + "file ({length(spec_names)})" + ) + ) + } + + col_spec$cols <- map( + set_names(seq_along(spec_names), spec_names), + \(i) col_spec$cols[i][[1]] %||% default + ) + } + col_spec } diff --git a/R/util.R b/R/util.R new file mode 100644 index 0000000..676ca56 --- /dev/null +++ b/R/util.R @@ -0,0 +1,36 @@ +is_testing <- function() { + identical(Sys.getenv("TESTTHAT"), "true") +} + +#' @importFrom vctrs vec_c +#' @export +vctrs::vec_c + +is_syntactic <- function(x) make.names(x) == x + +fix_non_syntactic <- function(x) { + non_syntactic <- !is_syntactic(x) + x[non_syntactic] <- paste0("`", gsub("`", "\\\\`", x[non_syntactic]), "`") + x +} + +# Source: +# https://stackoverflow.com/questions/3903157/how-can-i-check-whether-a-function-call-results-in-a-warning +withWarnings <- function(expr) { + myWarnings <- list() + wHandler <- function(w) { + myWarnings <<- c(myWarnings, list(w)) + invokeRestart("muffleWarning") + } + val <- withCallingHandlers(expr, warning = wHandler) + list(value = val, warnings = myWarnings) +} + +type_convert_col <- function(x, col, na) { + out <- readr::type_convert(tibble(x), col_types = list(x = col), na=na)$x + if (all(is.na(out)) && inherits(col, "collector_guess")) { + unspecified(length(out)) + } else { + out + } +} diff --git a/R/utils.R b/R/utils.R deleted file mode 100644 index 8d25f80..0000000 --- a/R/utils.R +++ /dev/null @@ -1,94 +0,0 @@ -#' The names of an deinterlaced data frame -#' -#' @description -#' -#' Functions to get the names of missing reason columns or value columns in -#' an deinterlaced data frame -#' -#' @param x A deinterlaced data frame -#' @return A vector of missing reason or value column names. 
-#' -#' @export -missing_names <- function(x) { - names(x)[is_missing_name(names(x))] -} - -#' @rdname missing_names -#' @export -value_names <- function(x) { - names(x)[is_value_name(names(x))] -} - -#' Selection helpers for deinterlaced data frames -#' -#' @description -#' -#' These tidy selection helpers match missing reason or value columns in an -#' deinterlaced data frame -#' -#' * `missing_cols()` selects missing reason columns. -#' -#' * `value_cols()` selects value columns. -#' -#' @inheritParams tidyselect::starts_with -#' @export -missing_cols <- function(vars = NULL) { - vars <- vars %||% peek_vars(fn = "missing_cols") - vars[is_missing_name(vars)] -} - -#' @rdname missing_cols -#' @export -value_cols <- function(vars = NULL) { - vars <- vars %||% peek_vars(fn = "value_cols") - vars[is_value_name(vars)] -} - -to_missing_name <- function(x) { - if (!all(is_value_name(x))) { - cli_abort(glue("Expected value column names, got: {x}")) - } - glue(".{x}.") -} - -to_value_name <- function(x) { - if (!all(is_missing_name(x))) { - cli_abort(glue("Expected missing column names, got: {x}")) - } - gsub("^\\.(.*)\\.$", "\\1", x) -} - -is_missing_name <- function(x) { - grepl("^\\..*\\.$", x) -} - -is_value_name <- function(x) { - !is_missing_name(x) -} - -#' Drop missing reasons from a deinterlaced data frame -#' -#' @description -#' -#' Drop the missing reason or value columns from a deinterlaced data frame, -#' turning it into a regular data frame with unlabelled `NA` values. -#' -#' @param x A data frame -#' -#' @return A tibble without missing reason columns. 
-#' -#' @export -drop_missing_cols <- function(x) { - x |> - select(value_cols()) |> - as_tibble() -} - -#' @rdname drop_missing_cols -#' @export -drop_value_cols <- function(x) { - x |> - select(missing_cols()) |> - as_tibble() -} - diff --git a/R/write.R b/R/write.R index 4761598..6cefcad 100644 --- a/R/write.R +++ b/R/write.R @@ -2,74 +2,189 @@ #' #' @description #' -#' The `write_interlaced_*()` family of functions will take a deinterlaced -#' data frame, re-interlace it, and write it to a flie. The behavior of these +#' The `write_interlaced_*()` family of functions will take a data frame +#' with interlaced columns, flatten all interlaced columns, then write it to +#' a file. Non-interlaced columns just pass through. The behavior of these #' functions match their similarly named counterparts in [readr]. #' -#' @param x A data frame or tibble to write to disk -#' @param file File or connection to write to -#' @param delim Delimiter used to separate values. Defaults to " " for -#' `write_interlaced_delim()`, "," for `write_interlaced_excel_csv()` and ";" -#' for `write_interlaced_excel_csv2()`. Must be a single character. -#' @param ... Additional parameters to pass to [readr] +#' @inheritParams readr::write_delim +#' +#' @param empty String used for empty values (or `NA` values in non-interlaced +#' columns). Defaults to NA. #' #' @returns `write_interlaced_*` returns the input x invisibly #' @export -write_interlaced_delim <- function(x, file, delim = " ", ...) { - write_delim( - interlace_channels(x), - file, - delim, - ... 
+write_interlaced_delim <- function( + x, + file, + delim = " ", + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_delim( + x = flatten_channels(x), + file = file, + delim = delim, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } #' @rdname write_interlaced_delim #' @export -write_interlaced_csv <- function(x, file, ...) { - write_csv( - interlace_channels(x), +write_interlaced_csv <- function( + x, file, - ... + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_csv( + x = flatten_channels(x), + file = file, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } #' @rdname write_interlaced_delim #' @export -write_interlaced_csv2 <- function(x, file, ...) { - write_csv2( - interlace_channels(x), +write_interlaced_csv2 <- function( + x, file, - ... + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_csv2( + x = flatten_channels(x), + file = file, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } #' @rdname write_interlaced_delim #' @export -write_interlaced_excel_csv <- function(x, file, ...) 
{ - write_excel_csv( - interlace_channels(x), +write_interlaced_excel_csv <- function( + x, file, - ... + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_excel_csv( + x = flatten_channels(x), + file = file, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } + #' @rdname write_interlaced_delim #' @export -write_interlaced_excel_csv2 <- function(x, file, ...) { - write_excel_csv2( - interlace_channels(x), +write_interlaced_excel_csv2 <- function( + x, file, - ... + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_excel_csv2( + x = flatten_channels(x), + file = file, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } #' @rdname write_interlaced_delim #' @export -write_interlaced_tsv <- function(x, file, ...) { - write_tsv( - interlace_channels(x), +write_interlaced_tsv <- function( + x, file, - ... 
+ empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) { + readr::write_tsv( + x = flatten_channels(x), + file = file, + na = empty, + append = append, + col_names = col_names, + quote = quote, + escape = escape, + eol = eol, + num_threads = num_threads, + progress = progress ) + invisible(x) } diff --git a/README.Rmd b/README.Rmd index 9f71a6a..7f53d2c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -10,7 +10,7 @@ knitr::opts_chunk$set( comment = "#>", fig.path = "man/figures/README-" ) -library(dplyr) +options(warn=-1) ``` # interlacer @@ -19,8 +19,6 @@ library(dplyr) [![R-CMD-check](https://github.com/khusmann/interlacer/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/khusmann/interlacer/actions/workflows/check-standard.yaml) [![codecov](https://codecov.io/gh/khusmann/interlacer/graph/badge.svg?token=R4WNWH5NXU)](https://codecov.io/gh/khusmann/interlacer) -## Overview - When a value is missing in your data, sometimes you want to know *why* it is missing. Many textual tabular data sources will encode missing reasons as special values *interlaced* with the regular values in a column (e.g. `N/A`, @@ -30,25 +28,26 @@ with missing reasons in R traditionally requires loading variables as character vectors and doing a bunch of string comparisons and type conversions to make sense of them. -Interlacer was created based on the insight that values and missing reasons -can be handled as separate *channels* of the same variable. Interlacer -provides functions that load variables from interlaced data sources into two -separate columns: -One containing the variable's values, the other containing its missing reasons. 
-As it turns out, this structure gives us an extremely powerful and expressive -way to simultaneously work with values and missing reasons in tidy pipelines, -as described in `vignette("interlacer")`. (tldr: It allows us to interact with -a variable as a [`Result` type](https://en.wikipedia.org/wiki/Result_type), -an abstraction often found in functional programming) +interlacer provides functions that load variables from interlaced data sources +into a special `interlaced` column type that holds values and `NA` +reasons in separate *channels* of the same variable. In most contexts, you +can treat `interlaced` columns as if they were regular values: if you take +the `mean` of an interlaced column, for example, +you get the mean of its values, without its missing reasons interfering in +the computation. -Although this may seem like a simple premise on the surface, it has deep -implications! In addition to `vignette("interlacer")`, be sure to also -check out: +Unlike a regular column, however, the missing reasons are still +available. This means you can still filter data frames on variables +by specific missing reasons, or generate +summary statistics with +breakdowns by missing reason. In other words, you no longer have to constantly +manually include / exclude missing reasons in computations by filtering them +with awkward string comparisons or type conversions... everything just works! -- `vignette("mutations")` for a discussion on how to motify data frames when in -this format +In addition to the introduction in `vignette("interlacer")` +be sure to also check out: -- `vignette("column-types")` to see how to handle variable-level missing reasons +- `vignette("na-column-types")` to see how to handle variable-level missing reasons - `vignette("coded-data")` for some recipies for working with coded data (e.g. 
data produced by SPSS, SAS or Stata) @@ -57,16 +56,19 @@ data produced by SPSS, SAS or Stata) compares to other approaches for representing and manipulating missing reasons alongside data values -This library is currently in its experimental stages, so be aware that its -interface is likely to change in the future. In the meantime, please try it out -and +### ⚠️ ⚠️ ⚠️ WARNING ⚠️ ⚠️ ⚠️ + +This library is currently in its experimental stages, so be aware +that its +interface is quite likely to change in the future. In the meantime, please try +it out and [let me know what you think](mailto:kdh38@psu.edu)! ## Installation -```{r, eval = FALSE} -# The easiest way to get interlacer is to install via devtools: +The easiest way to get interlacer is to install via devtools: +```{r, eval = FALSE} install.packages("devtools") # If devtools is not already installed devtools::install_github("khusmann/interlacer") @@ -77,7 +79,7 @@ devtools::install_github("khusmann/interlacer") To use interlacer, load it into your current R session: ```{r} -library(interlacer) +library(interlacer, warn.conflicts = FALSE) ``` interlacer supports the following file formats with these `read_interlaced_*()` @@ -91,6 +93,7 @@ functions, which extend the `readr::read_*()` family of functions: As a quick demo, consider the following example file bundled with interlacer: ```{r} +library(dplyr, warn.conflicts = FALSE) library(readr) read_file(interlacer_example("colors.csv")) |> @@ -100,74 +103,128 @@ read_file(interlacer_example("colors.csv")) |> In this csv file, values are interlaced with three possible missing reasons: `REFUSED`, `OMITTED`, and `N/A`. 
-With readr, loading these data would result in a data frame like this: +With `readr`, loading these data would result in a data frame where all missing +reasons are replaced with `NA`: ```{r} read_csv( interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A") + na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE, ) ``` -With interlacer, we get a "deinterlaced data frame" instead: +With interlacer, missing reasons are preserved: ```{r} (ex <- read_interlaced_csv( interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A") + na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE, )) ``` -Deinterlaced data frames have two columns for each variable: one for -values, and another for missing reasons. Missing reason columns are denoted -by column names surrounded by dots (e.g. `.age.` is the missing reason for the -`age` column). When a value is `NA`, it always has a reason in the missing -reason column. Similarly, when a missing reason is `NA`, it always has a -value in the value column. +As you can see, in the printout above each column is defined by *two* types: a type +for values, and a type for missing reasons. The `age` column, for example, has type +`double` for its values, and type `factor` for its missing reasons: + +```{r} +ex$age +``` + +Computations automatically operate on values: + +```{r} +mean(ex$age, na.rm=TRUE) +``` + +But the missing reasons are still there! To indicate a value should be treated +as a missing reason instead of a regular value, you can use the `na()` function. +The following, for example, +will filter the data set for all individuals that `REFUSED` to give their +favorite color: -This allows us to separately reference values and missing reasons -in a tidy and type-aware manner. 
For example, if I wanted to get a -breakdown of the mean age of respondents missing a report of their -favorite color, grouped by the missing reason, it would simply be: +```{r} +ex |> + filter(favorite_color == na("REFUSED")) +``` + +And here's a pipeline that will compute a breakdown of the mean age of +respondents for each favorite color, with separate categories for each missing +reason: ```{r} ex |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = .favorite_color. - ) + .by = favorite_color + ) %>% + arrange(favorite_color) ``` -(Note that the `` category in the result refers to the mean age of responses -*without* missing color values, i.e. with available favorite color responses). - But this just scratches the surface of what can be done with interlacer... check out `vignette("interlacer")` for a more complete overview! ## Known Issues -Large data frames (many columns & rows) are slow to run and print with -interlacer. Deinterlaced data frames are validated to check that they conform -to the rule of "one value OR missing reason per row", and this check is done -completely in R. There are a few key places -(noted in the source) that would extremely benefit from a native implementation, -and make the library much more snappy. Before I invest the time in that though, -I want to get enough feedback from users of this package to stabilize the -current approach / API. (If you find this package useful, please -[let me know](mailto:kdh38@psu.edu)!) - -In the meantime, if your deinterlaced data frames are too slow to print, -you can disable row-level validation by setting this option in your current -session: - -```{r, eval=FALSE} -options(interlacer.print_validation = FALSE) +1. 
Some base functions, like `base::ifelse()`, drop the missing reason channel +on interlaced types, converting them into regular vectors + +For example: + +```{r} +ex |> + mutate( + favorite_color = ifelse(age < 18, na("REDACTED"), favorite_color) + ) ``` -When `interlacer.print_validation = FALSE`, you will need to be extra careful -during mutations, because if you create an invalid state you won't get a -warning! +This is due to a [limitation of R](https://vctrs.r-lib.org/#motivation). +If you run into this, use the tidyverse equivalent of the function. Tidyverse +functions are designed to more correctly handle type conversions. +In this example, we can use `dplyr::if_else()`: + +```{r} +ex |> + mutate( + favorite_color = if_else( + age < 18, + na("REDACTED_UNDERAGE"), + favorite_color, + missing = na("REDACTED_MISSING_AGE") + ) + ) +``` + +2. Performance with large data sets + +You may notice that on large datasets `interlacer` runs significantly slower +than `readr` / `vroom`. Although `interlacer` uses `vroom` under the hood to load +delimited data, it is not able to take advantage of many of its optimizations +because `vroom` +[does not currently support](https://github.com/tidyverse/vroom/issues/532) +column-level missing values. As soon as `vroom` supports column-level +missing values, I will be able to remedy this! + +## Related work + +interlacer was inspired by the [`haven`](https://haven.tidyverse.org/), +[`labelled`](https://larmarange.github.io/labelled/), and +[`declared`](https://dusadrian.github.io/declared/) packages. These packages +provide similar functionality to interlacer, but are more focused on +providing compatibility with missing reason data imported from SPSS, SAS, and +Stata. interlacer has slightly different aims: + +1. Be fully generic: Add a missing value channel to *any* vector type. +2. Provide functions for reading / writing interlaced CSV files (not just SPSS +/ SAS / Stata files) +3.
Provide a functional API that integrates well into tidy pipelines + +Future versions of interlacer will provide functions to convert to and from +these other packages' types. + +For a more detailed discussion, see `vignette("other-approaches")`. ## Acknowledgements diff --git a/README.md b/README.md index 920eb0f..fe6c0fc 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostat [![R-CMD-check](https://github.com/khusmann/interlacer/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/khusmann/interlacer/actions/workflows/check-standard.yaml) [![codecov](https://codecov.io/gh/khusmann/interlacer/graph/badge.svg?token=R4WNWH5NXU)](https://codecov.io/gh/khusmann/interlacer) -## Overview - When a value is missing in your data, sometimes you want to know *why* it is missing. Many textual tabular data sources will encode missing reasons as special values *interlaced* with the regular values in a @@ -20,27 +18,27 @@ type. Working with missing reasons in R traditionally requires loading variables as character vectors and doing a bunch of string comparisons and type conversions to make sense of them. -Interlacer was created based on the insight that values and missing -reasons can be handled as separate *channels* of the same variable. -Interlacer provides functions that load variables from interlaced data -sources into two separate columns: One containing the variable’s values, -the other containing its missing reasons. As it turns out, this -structure gives us an extremely powerful and expressive way to -simultaneously work with values and missing reasons in tidy pipelines, -as described in `vignette("interlacer")`. 
(tldr: It allows us to -interact with a variable as a [`Result` -type](https://en.wikipedia.org/wiki/Result_type), an abstraction often -found in functional programming) +interlacer provides functions that load variables from interlaced data +sources into a special `interlaced` column type that holds values and +`NA` reasons in separate *channels* of the same variable. In most +contexts, you can treat `interlaced` columns as if they were regular +values: if you take the `mean` of an interlaced column, for example, you +get the mean of its values, without its missing reasons interfering in +the computation. -Although this may seem like a simple premise on the surface, it has deep -implications! In addition to `vignette("interlacer")`, be sure to also -check out: +Unlike a regular column, however, the missing reasons are still +available. This means you can still filter data frames on variables by +specific missing reasons, or generate summary statistics with breakdowns +by missing reason. In other words, you no longer have to constantly +manually include / exclude missing reasons in computations by filtering +them with awkward string comparisons or type conversions… everything +just works! -- `vignette("mutations")` for a discussion on how to motify data frames - when in this format +In addition to the introduction in `vignette("interlacer")` be sure to +also check out: -- `vignette("column-types")` to see how to handle variable-level missing - reasons +- `vignette("na-column-types")` to see how to handle variable-level + missing reasons - `vignette("coded-data")` for some recipies for working with coded data (e.g. data produced by SPSS, SAS or Stata) @@ -49,15 +47,18 @@ check out: approach compares to other approaches for representing and manipulating missing reasons alongside data values +### ⚠️ ⚠️ ⚠️ WARNING ⚠️ ⚠️ ⚠️ + This library is currently in its experimental stages, so be aware that -its interface is likely to change in the future. 
In the meantime, please -try it out and [let me know what you think](mailto:kdh38@psu.edu)! +its interface is quite likely to change in the future. In the meantime, +please try it out and [let me know what you +think](mailto:kdh38@psu.edu)! ## Installation -``` r -# The easiest way to get interlacer is to install via devtools: +The easiest way to get interlacer is to install via devtools: +``` r install.packages("devtools") # If devtools is not already installed devtools::install_github("khusmann/interlacer") @@ -68,7 +69,7 @@ devtools::install_github("khusmann/interlacer") To use interlacer, load it into your current R session: ``` r -library(interlacer) +library(interlacer, warn.conflicts = FALSE) ``` interlacer supports the following file formats with these @@ -84,6 +85,7 @@ As a quick demo, consider the following example file bundled with interlacer: ``` r +library(dplyr, warn.conflicts = FALSE) library(readr) read_file(interlacer_example("colors.csv")) |> @@ -105,21 +107,15 @@ read_file(interlacer_example("colors.csv")) |> In this csv file, values are interlaced with three possible missing reasons: `REFUSED`, `OMITTED`, and `N/A`. -With readr, loading these data would result in a data frame like this: +With `readr`, loading these data would result in a data frame where all +missing reasons are replaced with `NA`: ``` r read_csv( interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A") + na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE, ) -#> Rows: 11 Columns: 3 -#> ── Column specification ──────────────────────────────────────────────────────── -#> Delimiter: "," -#> chr (1): favorite_color -#> dbl (2): person_id, age -#> -#> ℹ Use `spec()` to retrieve the full column specification for this data. -#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. 
#> # A tibble: 11 × 3 #> person_id age favorite_color #> @@ -136,87 +132,179 @@ read_csv( #> 11 11 10 ``` -With interlacer, we get a “deinterlaced data frame” instead: +With interlacer, missing reasons are preserved: ``` r (ex <- read_interlaced_csv( interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A") + na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE, )) -#> # A deinterlaced tibble: 11 × 6 -#> person_id .person_id. age .age. favorite_color .favorite_color. -#> -#> 1 1 20 BLUE -#> 2 2 NA REFUSED BLUE -#> 3 3 21 REFUSED -#> 4 4 30 OMITTED -#> 5 5 1 N/A -#> 6 6 41 RED -#> 7 7 50 OMITTED -#> 8 8 30 YELLOW -#> 9 9 NA REFUSED REFUSED -#> 10 10 NA OMITTED RED -#> 11 11 10 REFUSED +#> # A tibble: 11 × 3 +#> person_id age favorite_color +#> +#> 1 1 20 BLUE +#> 2 2 BLUE +#> 3 3 21 +#> 4 4 30 +#> 5 5 1 +#> 6 6 41 RED +#> 7 7 50 +#> 8 8 30 YELLOW +#> 9 9 +#> 10 10 RED +#> 11 11 10 +``` + +As you can see, in the printout above each column is defined by *two* +types: a type for values, and a type for missing reasons. The `age` +column, for example, has type `double` for its values, and type `factor` +for its missing reasons: + +``` r +ex$age +#> [11]> +#> [1] 20 21 30 1 41 50 +#> [8] 30 10 +#> NA levels: REFUSED OMITTED N/A ``` -Deinterlaced data frames have two columns for each variable: one for -values, and another for missing reasons. Missing reason columns are -denoted by column names surrounded by dots (e.g. `.age.` is the missing -reason for the `age` column). When a value is `NA`, it always has a -reason in the missing reason column. Similarly, when a missing reason is -`NA`, it always has a value in the value column. +Computations automatically operate on values: -This allows us to separately reference values and missing reasons in a -tidy and type-aware manner. 
For example, if I wanted to get a breakdown -of the mean age of respondents missing a report of their favorite color, -grouped by the missing reason, it would simply be: +``` r +mean(ex$age, na.rm=TRUE) +#> [1] 25.375 +``` + +But the missing reasons are still there! To indicate a value should be +treated as a missing reason instead of a regular value, you can use the +`na()` function. The following, for example, will filter the data set +for all individuals that `REFUSED` to give their favorite color: + +``` r +ex |> + filter(favorite_color == na("REFUSED")) +#> # A tibble: 3 × 3 +#> person_id age favorite_color +#> +#> 1 3 21 +#> 2 9 +#> 3 11 10 +``` + +And here’s a pipeline that will compute a breakdown of the mean age of +respondents for each favorite color, with separate categories for each +missing reason: ``` r ex |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = .favorite_color. - ) -#> # A tibble: 4 × 3 -#> .favorite_color. mean_age n -#> -#> 1 30.3 5 -#> 2 REFUSED 15.5 3 -#> 3 OMITTED 40 2 -#> 4 N/A 1 1 + .by = favorite_color + ) %>% + arrange(favorite_color) +#> # A tibble: 6 × 3 +#> favorite_color mean_age n +#> +#> 1 BLUE 20 2 +#> 2 RED 41 2 +#> 3 YELLOW 30 1 +#> 4 15.5 3 +#> 5 40 2 +#> 6 1 1 ``` -(Note that the `` category in the result refers to the mean age of -responses *without* missing color values, i.e. with available favorite -color responses). - But this just scratches the surface of what can be done with interlacer… check out `vignette("interlacer")` for a more complete overview! ## Known Issues -Large data frames (many columns & rows) are slow to run and print with -interlacer. Deinterlaced data frames are validated to check that they -conform to the rule of “one value OR missing reason per row”, and this -check is done completely in R. There are a few key places (noted in the -source) that would extremely benefit from a native implementation, and -make the library much more snappy. 
Before I invest the time in that -though, I want to get enough feedback from users of this package to -stabilize the current approach / API. (If you find this package useful, -please [let me know](mailto:kdh38@psu.edu)!) +1. Some base functions, like `base::ifelse()`, drop the missing reason + channel on interlaced types, converting them into regular vectors -In the meantime, if your deinterlaced data frames are too slow to print, -you can disable row-level validation by setting this option in your -current session: +For example: ``` r -options(interlacer.print_validation = FALSE) +ex |> + mutate( + favorite_color = ifelse(age < 18, na("REDACTED"), favorite_color) + ) +#> # A tibble: 11 × 3 +#> person_id age favorite_color +#> +#> 1 1 20 BLUE +#> 2 2 +#> 3 3 21 +#> 4 4 30 +#> 5 5 1 +#> 6 6 41 RED +#> 7 7 50 +#> 8 8 30 YELLOW +#> 9 9 +#> 10 10 +#> 11 11 10 ``` -When `interlacer.print_validation = FALSE`, you will need to be extra -careful during mutations, because if you create an invalid state you -won’t get a warning! +This is due to a [limitation of R](https://vctrs.r-lib.org/#motivation). +If you run into this, use the tidyverse equivalent of the function. +Tidyverse functions are designed to more correctly handle type +conversions. In this example, we can use `dplyr::if_else()`: + +``` r +ex |> + mutate( + favorite_color = if_else( + age < 18, + na("REDACTED_UNDERAGE"), + favorite_color, + missing = na("REDACTED_MISSING_AGE") + ) + ) +#> # A tibble: 11 × 3 +#> person_id age favorite_color +#> +#> 1 1 20 BLUE +#> 2 2 +#> 3 3 21 +#> 4 4 30 +#> 5 5 1 +#> 6 6 41 RED +#> 7 7 50 +#> 8 8 30 YELLOW +#> 9 9 +#> 10 10 +#> 11 11 10 +``` + +2. Performance with large data sets + +You may notice that on large datasets `interlacer` runs significantly +slower than `readr` / `vroom`. 
Although `interlacer` uses `vroom` under +the hood to load delimited data, it is not able to take advantage of +many of its optimizations because `vroom` [does not currently +support](https://github.com/tidyverse/vroom/issues/532) column-level +missing values. As soon as `vroom` supports column-level missing values, +I will be able to remedy this! + +## Related work + +interlacer was inspired by the [`haven`](https://haven.tidyverse.org/), +[`labelled`](https://larmarange.github.io/labelled/), and +[`declared`](https://dusadrian.github.io/declared/) packages. These +packages provide similar functionality to interlacer, but are more +focused on providing compatibility with missing reason data imported +from SPSS, SAS, and Stata. interlacer has slightly different aims: + +1. Be fully generic: Add a missing value channel to *any* vector type. +2. Provide functions for reading / writing interlaced CSV files (not + just SPSS / SAS / Stata files) +3. Provide a functional API that integrates well into tidy pipelines + +Future versions of interlacer will provide functions to convert to and +from these other packages’ types. + +For a more detailed discussion, see `vignette("other-approaches")`. 
## Acknowledgements diff --git a/_pkgdown.yml b/_pkgdown.yml index 302e41a..ae26bba 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,7 +10,26 @@ articles: - title: Articles navbar: ~ contents: - - mutations - - column-types - - coded-data - - other-approaches + - na-column-types + - coded-data + - other-approaches + +reference: +- title: Reading and writing interlaced data +- contents: + - starts_with("read_") + - starts_with("write_") + - na_cols + - na_spec + - parse_interlaced + - interlacer_example +- title: The `interlaced` type + desc: Functions for working with the `interlaced` type +- contents: + - interlaced + - na + - value_channel + - is.empty + - na_levels + - map_value_channel + - flatten_channels diff --git a/man/coalesce_channels.Rd b/man/coalesce_channels.Rd deleted file mode 100644 index 24df09e..0000000 --- a/man/coalesce_channels.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/coalesce_channels.R -\name{coalesce_channels} -\alias{coalesce_channels} -\title{Coalesce missing reasons in a data frame} -\usage{ -coalesce_channels( - x, - missing_reason = getOption("interlacer.default_missing_reason"), - keep = c("values", "missing") -) -} -\arguments{ -\item{x}{A data frame} - -\item{missing_reason}{When a variable is missing a value and a missing -reason, the missing reason to fill in.} - -\item{keep}{When a variable has both a value and missing reason, choose which -to keep. (A properly formed deinterlaced data frame has values OR missing -reasons)} -} -\value{ -A deinterlaced tibble. -} -\description{ -Mutations of deinterlaced data frames can result in variables that either -have both values and missing reasons, or no values and no missing reasons. -`coalesce_channels()` takes care of both situations. In the case where -there is both a value and missing reason, it will choose which to keep based -on the `keep` paramter. 
In case where no value or missing reason exists, it -will fill the missing reason with the `missing_reason` parameter. - -Mutations can also create new value columns without companion missing reason -columns. In that case, a new missing reason will be created and filled with -`missing_reason` wherever there are missing values in the value column. ( -This behavior can also be used to stub missing reason columns for value-only -data frames) -} diff --git a/man/deinterlace_type_convert.Rd b/man/deinterlace_type_convert.Rd deleted file mode 100644 index 2cb9784..0000000 --- a/man/deinterlace_type_convert.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/deinterlace_type_convert.R -\name{deinterlace_type_convert} -\alias{deinterlace_type_convert} -\title{Convert character columns and deinterlace missing reasons in existing data -frame} -\usage{ -deinterlace_type_convert(x, col_types = NULL, na = c("", "NA"), ...) -} -\arguments{ -\item{x}{A data frame} - -\item{col_types}{One of `NULL`, a [readr::cols()] specification, or a string.} - -\item{na}{Character vector of strings to interpret as missing values.} - -\item{...}{additional parameters to pass to `readr::type_convert()`} -} -\value{ -A [tibble()].with separate columns for values and missing reasons -for each variable. -} -\description{ -This is a simple wrapper for `readr::type_convert()` that deinterlaces -missing reasons in addition to parsing values. 
-} diff --git a/man/drop_missing_cols.Rd b/man/drop_missing_cols.Rd deleted file mode 100644 index 6179754..0000000 --- a/man/drop_missing_cols.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{drop_missing_cols} -\alias{drop_missing_cols} -\alias{drop_value_cols} -\title{Drop missing reasons from a deinterlaced data frame} -\usage{ -drop_missing_cols(x) - -drop_value_cols(x) -} -\arguments{ -\item{x}{A data frame} -} -\value{ -A tibble without missing reason columns. -} -\description{ -Drop the missing reason or value columns from a deinterlaced data frame, -turning it into a regular data frame with unlabelled `NA` values. -} diff --git a/man/flatten_channels.Rd b/man/flatten_channels.Rd new file mode 100644 index 0000000..f20df34 --- /dev/null +++ b/man/flatten_channels.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{flatten_channels} +\alias{flatten_channels} +\title{Flatten an \code{interlaced} vector} +\usage{ +flatten_channels(x, ...) +} +\arguments{ +\item{x}{An \code{interlaced} vector} + +\item{...}{Additional arguments, not used} +} +\value{ +The vector, flattened +} +\description{ +\code{flatten_channels()} flattens an \code{interlaced} vector into a single channel. +This is useful as a step right before writing an \code{interlaced} vector to a +file, for example. 
+} diff --git a/man/icol_logical.Rd b/man/icol_logical.Rd deleted file mode 100644 index ab45028..0000000 --- a/man/icol_logical.Rd +++ /dev/null @@ -1,53 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/collectors.R -\name{icol_logical} -\alias{icol_logical} -\alias{icol_integer} -\alias{icol_double} -\alias{icol_character} -\alias{icol_factor} -\alias{icol_date} -\alias{icol_time} -\alias{icol_datetime} -\alias{icol_number} -\title{Interlaced collectors for read_interlaced_*} -\usage{ -icol_logical(na) - -icol_integer(na) - -icol_double(na) - -icol_character(na) - -icol_factor(na, levels = NULL, ordered = FALSE) - -icol_date(na, format = "") - -icol_time(na, format = "") - -icol_datetime(na, format = "") - -icol_number(na) -} -\arguments{ -\item{na}{Character vector of strings to interpret as column-level missing -values} - -\item{levels}{Character vector of the allowed levels. When \code{levels = NULL} -(the default), \code{levels} are discovered from the unique values of \code{x}, in -the order in which they appear in \code{x}.} - -\item{ordered}{Is it an ordered factor?} - -\item{format}{A format specification, as described below. If set to "", -date times are parsed as ISO8601, dates and times used the date and -time formats specified in the \code{\link[readr:locale]{locale()}}. - -Unlike \code{\link[=strptime]{strptime()}}, the format specification must match -the complete string.} -} -\description{ -Interlaced collector extend `readr` collector types (e.g. `col_double()`) to -allow column-level missing value specifications. 
-} diff --git a/man/interlace_channels.Rd b/man/interlace_channels.Rd deleted file mode 100644 index 4b773ab..0000000 --- a/man/interlace_channels.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/interlace_channels.R -\name{interlace_channels} -\alias{interlace_channels} -\title{Re-interlace a deinterlaced data frame} -\usage{ -interlace_channels(x) -} -\arguments{ -\item{x}{A deinterlaced data frame} -} -\value{ -An interlaced data frame, that is, a data frame with character -columns that contain both values and missing reasons. -} -\description{ -This function will take a deinterlaced data frame and re-interlace it by -combining value and missing reason column pairs into single character -columns. -} diff --git a/man/interlaced.Rd b/man/interlaced.Rd new file mode 100644 index 0000000..ef42c63 --- /dev/null +++ b/man/interlaced.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{interlaced} +\alias{interlaced} +\alias{as.interlaced} +\alias{as.interlaced.default} +\alias{as.interlaced.interlacer_interlaced} +\alias{as.interlaced.data.frame} +\alias{is.interlaced} +\title{Construct an \code{interlaced} vector} +\usage{ +interlaced(x, na = NULL) + +as.interlaced(x, na = NULL, ...) + +\method{as.interlaced}{default}(x, na = NULL, ...) + +\method{as.interlaced}{interlacer_interlaced}(x, ...) + +\method{as.interlaced}{data.frame}(x, ...) + +is.interlaced(x) +} +\arguments{ +\item{x}{A vector or list of values} + +\item{na}{A vector of values to interpret as missing values} + +\item{...}{Additional arguments, not used} +} +\value{ +An \code{interlaced} vector +} +\description{ +The \code{interlaced} type extends vectors by adding a "missing reason" channel +which can be used to distinguish different types of missingness. The +\code{interlaced()} function constructs a new \code{interlaced} vector from a vector +or list of values. 
+} diff --git a/man/interlacer_example.Rd b/man/interlacer_example.Rd index 99dbb55..a8e0c29 100644 --- a/man/interlacer_example.Rd +++ b/man/interlacer_example.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/example.R \name{interlacer_example} \alias{interlacer_example} -\title{Get path to interlacer example} +\title{Get a path to one of interlacer's example data sets} \usage{ interlacer_example(file = NULL) } @@ -10,7 +10,7 @@ interlacer_example(file = NULL) \item{file}{Name of file. If \code{NULL}, the example files will be listed.} } \description{ -interlacer comes bundled with a number of sample files in its `inst/extdata` +interlacer comes bundled with a number of sample files in its \code{inst/extdata} directory. This function make them easy to access } \examples{ diff --git a/man/is.empty.Rd b/man/is.empty.Rd new file mode 100644 index 0000000..011412c --- /dev/null +++ b/man/is.empty.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{is.empty} +\alias{is.empty} +\title{NA missing reasons} +\usage{ +is.empty(x) +} +\arguments{ +\item{x}{a vector} +} +\value{ +a logical vector the same length as x, containing TRUE for all +empty elements, and FALSE otherwise. +} +\description{ +When an element is missing both a value and a missing reason, it is considered +"empty". \code{is.empty()} checks for these types of values. Regular \code{NA} values +(with no missing reasons) are also considered "empty". 
+} diff --git a/man/map_value_channel.Rd b/man/map_value_channel.Rd new file mode 100644 index 0000000..aa67415 --- /dev/null +++ b/man/map_value_channel.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{map_value_channel} +\alias{map_value_channel} +\alias{map_na_channel} +\title{\code{interlaced} functional utilities} +\usage{ +map_value_channel(x, fn) + +map_na_channel(x, fn) +} +\arguments{ +\item{x}{an interlaced vector} + +\item{fn}{a function that maps values or missing reasons to new values} +} +\value{ +a new interlaced vector, modified according to the supplied function +} +\description{ +\code{map_value_channel()} modifies the values of an \code{interlaced} +vector. \code{map_na_channel()} modifies the missing reason channel of an +\code{interlaced} vector. +} diff --git a/man/missing_cols.Rd b/man/missing_cols.Rd deleted file mode 100644 index d3cd0e2..0000000 --- a/man/missing_cols.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{missing_cols} -\alias{missing_cols} -\alias{value_cols} -\title{Selection helpers for deinterlaced data frames} -\usage{ -missing_cols(vars = NULL) - -value_cols(vars = NULL) -} -\arguments{ -\item{vars}{A character vector of variable names. If not supplied, -the variables are taken from the current selection context (as -established by functions like \code{select()} or \code{pivot_longer()}).} -} -\description{ -These tidy selection helpers match missing reason or value columns in an -deinterlaced data frame - -* `missing_cols()` selects missing reason columns. - -* `value_cols()` selects value columns. 
-} diff --git a/man/missing_names.Rd b/man/missing_names.Rd deleted file mode 100644 index 29bc7de..0000000 --- a/man/missing_names.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{missing_names} -\alias{missing_names} -\alias{value_names} -\title{The names of an deinterlaced data frame} -\usage{ -missing_names(x) - -value_names(x) -} -\arguments{ -\item{x}{A deinterlaced data frame} -} -\value{ -A vector of missing reason or value column names. -} -\description{ -Functions to get the names of missing reason columns or value columns in -an deinterlaced data frame -} diff --git a/man/na.Rd b/man/na.Rd new file mode 100644 index 0000000..6966623 --- /dev/null +++ b/man/na.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{na} +\alias{na} +\title{Lift values to missing reasons} +\usage{ +na(x = unspecified()) +} +\arguments{ +\item{x}{A character or numeric value} +} +\value{ +An \code{interlaced} value +} +\description{ +\code{na()} lifts a value into an \code{interlaced} missing reason channel. +} diff --git a/man/na_cols.Rd b/man/na_cols.Rd new file mode 100644 index 0000000..3d0eb27 --- /dev/null +++ b/man/na_cols.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collectors.R +\name{na_cols} +\alias{na_cols} +\alias{as.na_col_spec} +\alias{is.na_col_spec} +\title{Create an NA column specification} +\usage{ +na_cols(...) + +as.na_col_spec(x) + +is.na_col_spec(x) +} +\arguments{ +\item{...}{Named vectors to use as missing reasons when loading interlaced +columns. 
Use name \code{.default} to set default \code{NA} values for the columns.} + +\item{x}{Named list to construct a NA spec with, or a vector of values that +should be used in a spec with \code{.default} equal to those values.} +} +\description{ +\code{na_cols()} creates a specification for the NA channel missing reason when +loading data with the \verb{read_interlaced_*()} family of functions. +} diff --git a/man/na_levels.Rd b/man/na_levels.Rd new file mode 100644 index 0000000..3125354 --- /dev/null +++ b/man/na_levels.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{na_levels} +\alias{na_levels} +\alias{na_levels<-} +\alias{levels.interlacer_interlaced} +\alias{levels<-.interlacer_interlaced} +\title{Get the factor levels of the value or missing reason channel} +\usage{ +na_levels(x) + +na_levels(x) <- value + +\method{levels}{interlacer_interlaced}(x) + +\method{levels}{interlacer_interlaced}(x) <- value +} +\arguments{ +\item{x}{an \code{interlaced} vector} + +\item{value}{new levels to set} +} +\value{ +The levels of the values or missing reason channel +} +\description{ +The base S3 \code{levels()} function is overloaded for \code{interlaced} vectors, so +when the value channel is a factor type, \code{levels()} will return its levels. +Similarly \code{na_levels()} will return the levels for the missing reason +channel, when it is a \code{factor} type. 
+} diff --git a/man/na_spec.Rd b/man/na_spec.Rd new file mode 100644 index 0000000..0b486a7 --- /dev/null +++ b/man/na_spec.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R +\name{na_spec} +\alias{na_spec} +\title{Examine the NA spec of a data frame} +\usage{ +na_spec(x) +} +\arguments{ +\item{x}{The data frame object to extract from} +} +\value{ +An \code{na_col_spec} object +} +\description{ +Like \code{readr::spec()}, \code{na_spec()} extracts the NA column specification from +a tibble created by \verb{read_interlaced_*} +} diff --git a/man/parse_interlaced.Rd b/man/parse_interlaced.Rd new file mode 100644 index 0000000..ea8581c --- /dev/null +++ b/man/parse_interlaced.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/interlaced.R +\name{parse_interlaced} +\alias{parse_interlaced} +\title{Parse a \code{character} vector into an \code{interlaced} vector type} +\usage{ +parse_interlaced(x, na, .value_col = col_guess()) +} +\arguments{ +\item{x}{A character vector} + +\item{na}{A vector of values to interpret as missing values} + +\item{.value_col}{A collector to parse the character values (e.g. +\code{readr::col_double()}, \code{readr::col_integer()}, etc.)} +} +\value{ +An \code{interlaced} vector +} +\description{ +\code{parse_interlaced} converts a character vector to an \code{interlaced} vector +by parsing it with a readr \code{collector} type. 
+} diff --git a/man/read_interlaced_delim.Rd b/man/read_interlaced_delim.Rd index e32b0ec..399b512 100644 --- a/man/read_interlaced_delim.Rd +++ b/man/read_interlaced_delim.Rd @@ -5,73 +5,270 @@ \alias{read_interlaced_csv} \alias{read_interlaced_csv2} \alias{read_interlaced_tsv} +\alias{interlaced_vroom} \title{Read an delimited file with interlaced missing reasons into a tibble} \usage{ read_interlaced_delim( file, delim = NULL, + quote = "\\"", + escape_backslash = FALSE, + escape_double = TRUE, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... + comment = "", + trim_ws = FALSE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE ) read_interlaced_csv( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... + quote = "\\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE ) read_interlaced_csv2( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... + quote = "\\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE ) read_interlaced_tsv( file, + col_names = TRUE, col_types = NULL, col_select = NULL, + id = NULL, + locale = readr::default_locale(), na = c("", "NA"), - ... 
+ quote = "\\"", + comment = "", + trim_ws = TRUE, + skip = 0, + n_max = Inf, + guess_max = min(1000, n_max), + name_repair = "unique", + progress = readr::show_progress(), + show_col_types = readr::should_show_types(), + skip_empty_rows = TRUE +) + +interlaced_vroom( + file, + delim = NULL, + col_names = TRUE, + col_types = NULL, + col_select = NULL, + id = NULL, + skip = 0, + n_max = Inf, + na = c("", "NA"), + quote = "\\"", + comment = "", + skip_empty_rows = TRUE, + trim_ws = TRUE, + escape_double = TRUE, + escape_backslash = FALSE, + locale = vroom::default_locale(), + guess_max = 100, + progress = vroom::vroom_progress(), + show_col_types = NULL, + .name_repair = "unique" ) } \arguments{ -\item{file}{Either a path to a file, a connection, or literal data (either -a single string or a raw vector).} +\item{file}{Either a path to a file, a connection, or literal data +(either a single string or a raw vector). + +Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will +be automatically uncompressed. Files starting with \verb{http://}, +\verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically +downloaded. Remote gz files can also be automatically downloaded and +decompressed. + +Literal data is most useful for examples and tests. To be recognised as +literal data, the input must be either wrapped with \code{I()}, be a string +containing at least one new line, or be a vector containing at least one +string with a new line. + +Using a value of \code{\link[readr:clipboard]{clipboard()}} will read from the system clipboard.} \item{delim}{Single character used to separate fields within a record.} -\item{col_types}{One of `NULL`, a [readr::cols()] specification, or a string. In -addition to the `col_*` specifiers provided by `readr`, `icol_*()` -specifiers may be used. 
See `vignette("interlacer")` for more details.} +\item{quote}{Single character used to quote strings.} + +\item{escape_backslash}{Does the file use backslashes to escape special +characters? This is more general than \code{escape_double} as backslashes +can be used to escape the delimiter character, the quote character, or +to add special characters like \verb{\\\\n}.} + +\item{escape_double}{Does the file escape quotes by doubling them? +i.e. If this option is \code{TRUE}, the value \verb{""""} represents +a single quote, \verb{\\"}.} + +\item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector +of column names. + +If \code{TRUE}, the first row of the input will be used as the column +names, and will not be included in the data frame. If \code{FALSE}, column +names will be generated automatically: X1, X2, X3 etc. + +If \code{col_names} is a character vector, the values will be used as the +names of the columns, and the first row of the input will be read into +the first row of the output data frame. + +Missing (\code{NA}) column names will generate a warning, and be filled +in with dummy names \code{...1}, \code{...2} etc. Duplicate column names +will generate a warning and be made unique, see \code{name_repair} to control +how this is done.} + +\item{col_types}{One of \code{NULL}, a \code{\link[readr:cols]{cols()}} specification, or +a string. See \code{vignette("readr")} for more details. + +If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the +input, interspersed throughout the file. This is convenient (and fast), +but not robust. If the guessed types are wrong, you'll need to increase +\code{guess_max} or supply the correct types yourself. + +Column specifications created by \code{\link[=list]{list()}} or \code{\link[readr:cols]{cols()}} must contain +one column specification for each column. If you only want to read a +subset of the columns, use \code{\link[readr:cols_only]{cols_only()}}. 
+ +Alternatively, you can use a compact string representation where each +character represents one column: +\itemize{ +\item c = character +\item i = integer +\item n = number +\item d = double +\item l = logical +\item f = factor +\item D = date +\item T = date time +\item t = time +\item ? = guess +\item _ or - = skip +} + +By default, reading a file without a column specification will print a +message showing what \code{readr} guessed they were. To remove this message, +set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} + +\item{col_select}{Columns to include in the results. You can use the same +mini-language as \code{dplyr::select()} to refer to the columns by name. Use +\code{c()} to use more than one selection expression. Although this +usage is less common, \code{col_select} also accepts a numeric column index. See +\code{\link[tidyselect:language]{?tidyselect::language}} for full details on the +selection language.} + +\item{id}{The name of a column in which to store the file path. This is +useful when reading multiple input files and there is data in the file +paths, such as the data collection date. If \code{NULL} (the default) no extra +column is created.} + +\item{locale}{The locale controls defaults that vary from place to place. +The default locale is US-centric (like R), but you can use +\code{\link[readr:locale]{locale()}} to create your own locale that controls things like +the default time zone, encoding, decimal mark, big mark, and day/month +names.} + +\item{na}{A NA col spec defined by \code{na_cols()} or a character or numeric +vector of values to interpret as missing values.} + +\item{comment}{A string used to identify comments. Any text after the +comment characters will be silently ignored.} + +\item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from +each field before parsing it?} + +\item{skip}{Number of lines to skip before reading data. 
If \code{comment} is +supplied any commented lines are ignored \emph{after} skipping.} + +\item{n_max}{Maximum number of lines to read.} + +\item{guess_max}{Maximum number of lines to use for guessing column types. +Will never use more than the number of lines read. +See \code{vignette("column-types", package = "readr")} for more details.} + +\item{name_repair, .name_repair}{Handling of column names. The default behaviour is to +ensure column names are \code{"unique"}. Various repair strategies are +supported: +\itemize{ +\item \code{"minimal"}: No name repair or checks, beyond basic existence of names. +\item \code{"unique"} (default value): Make sure names are unique and not empty. +\item \code{"check_unique"}: No name repair, but check they are \code{unique}. +\item \code{"unique_quiet"}: Repair with the \code{unique} strategy, quietly. +\item \code{"universal"}: Make the names \code{unique} and syntactic. +\item \code{"universal_quiet"}: Repair with the \code{universal} strategy, quietly. +\item A function: Apply custom name repair (e.g., \code{name_repair = make.names} +for names in the style of base R). +\item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}}. +} + +This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. +See there for more details on these terms and the strategies used +to enforce them.} -\item{col_select}{Columns to include in the results. As with -[reader::read_delim], you can use the same mini-language as -[dplyr::select()] to refer to the columns by name.} +\item{progress}{Display a progress bar? By default it will only display +in an interactive session and not while knitting a document. The automatic +progress bar can be disabled by setting option \code{readr.show_progress} to +\code{FALSE}.} -\item{na}{Character vector of strings to interpret as missing values. 
These -values will become the factor levels of the missing reason column.} +\item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If +\code{TRUE} always show the column types, even if they are supplied. If \code{NULL} +(the default) only show the column types if they are not explicitly supplied +by the \code{col_types} argument.} -\item{...}{Additional parameters to pass to `read_delim`} +\item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this +option is \code{TRUE} then blank rows will not be represented at all. If it is +\code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \value{ -A deinterlaced [tibble()], that is, a tibble with separate columns -for values and missing reasonskfor each variable. +A \code{\link[=tibble]{tibble()}}, with interlaced columns. } \description{ -The `read_interlaced_*()`, family of functions extend `readr`'s -`read_delim()`, `read_csv`, etc. functions for use on data sources where +The \verb{read_interlaced_*()} family of functions extends \code{readr}'s +\code{read_delim()}, \code{read_csv()}, etc. functions for use on data sources where values are interlaced with missing reasons. These functions return a tibble -with two columns for each interlaced source column: a column with -values, and a column with missing reasons. Missing reason columns are named -by taking the value column name and surrounding it by dots -(e.g. missing reasons for "col_name" are read into a column named -".col_name.") +with \code{interlaced} columns. 
} \examples{ # Beep boop diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 0000000..bada92f --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collectors.R, R/interlaced.R, R/util.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{col_character} +\alias{col_date} +\alias{col_datetime} +\alias{col_double} +\alias{col_factor} +\alias{col_guess} +\alias{col_integer} +\alias{col_logical} +\alias{col_number} +\alias{col_skip} +\alias{col_time} +\alias{cols_condense} +\alias{cols_only} +\alias{cols} +\alias{as.col_spec} +\alias{spec} +\alias{as.factor} +\alias{as.ordered} +\alias{vec_c} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{generics}{\code{\link[generics:coercion-factor]{as.factor}}, \code{\link[generics:coercion-factor]{as.ordered}}} + + \item{readr}{\code{\link[readr]{as.col_spec}}, \code{\link[readr:parse_atomic]{col_character}}, \code{\link[readr:parse_datetime]{col_date}}, \code{\link[readr:parse_datetime]{col_datetime}}, \code{\link[readr:parse_atomic]{col_double}}, \code{\link[readr:parse_factor]{col_factor}}, \code{\link[readr:parse_guess]{col_guess}}, \code{\link[readr:parse_atomic]{col_integer}}, \code{\link[readr:parse_atomic]{col_logical}}, \code{\link[readr:parse_number]{col_number}}, \code{\link[readr]{col_skip}}, \code{\link[readr:parse_datetime]{col_time}}, \code{\link[readr]{cols}}, \code{\link[readr:spec]{cols_condense}}, \code{\link[readr:cols]{cols_only}}, \code{\link[readr]{spec}}} + + \item{vctrs}{\code{\link[vctrs]{vec_c}}} +}} + diff --git a/man/value_channel.Rd b/man/value_channel.Rd new file mode 100644 index 0000000..788c47c --- /dev/null +++ b/man/value_channel.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/interlaced.R +\name{value_channel} +\alias{value_channel} +\alias{na_channel} +\title{Access the channels of an \code{interlaced} vector} +\usage{ +value_channel(x, ...) + +na_channel(x, ...) +} +\arguments{ +\item{x}{An \code{interlaced} vector} + +\item{...}{Additional arguments, not used} +} +\value{ +The value or missing reasons channel +} +\description{ +\itemize{ +\item \code{value_channel()} returns the value channel of an \code{interlaced} vector +\item \code{na_channel()} returns the missing reason channel of an \code{interlaced} vector +} +} diff --git a/man/write_interlaced_delim.Rd b/man/write_interlaced_delim.Rd index 1ff127c..726790c 100644 --- a/man/write_interlaced_delim.Rd +++ b/man/write_interlaced_delim.Rd @@ -9,34 +9,138 @@ \alias{write_interlaced_tsv} \title{Interlace a deinterlaced data frame and write it to a file} \usage{ -write_interlaced_delim(x, file, delim = " ", ...) +write_interlaced_delim( + x, + file, + delim = " ", + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) -write_interlaced_csv(x, file, ...) +write_interlaced_csv( + x, + file, + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) -write_interlaced_csv2(x, file, ...) +write_interlaced_csv2( + x, + file, + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) -write_interlaced_excel_csv(x, file, ...) 
+write_interlaced_excel_csv( + x, + file, + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) -write_interlaced_excel_csv2(x, file, ...) +write_interlaced_excel_csv2( + x, + file, + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) -write_interlaced_tsv(x, file, ...) +write_interlaced_tsv( + x, + file, + empty = "NA", + append = FALSE, + col_names = !append, + quote = c("needed", "all", "none"), + escape = c("double", "backslash", "none"), + eol = "\\n", + num_threads = readr::readr_threads(), + progress = readr::show_progress() +) } \arguments{ -\item{x}{A data frame or tibble to write to disk} +\item{x}{A data frame or tibble to write to disk.} -\item{file}{File or connection to write to} +\item{file}{File or connection to write to.} -\item{delim}{Delimiter used to separate values. Defaults to " " for -`write_interlaced_delim()`, "," for `write_interlaced_excel_csv()` and ";" -for `write_interlaced_excel_csv2()`. Must be a single character.} +\item{delim}{Delimiter used to separate values. Defaults to \code{" "} for \code{write_delim()}, \code{","} for \code{write_excel_csv()} and +\code{";"} for \code{write_excel_csv2()}. Must be a single character.} -\item{...}{Additional parameters to pass to [readr]} +\item{empty}{String used for empty values (or \code{NA} values in non-interlaced +columns). Defaults to NA.} + +\item{append}{If \code{FALSE}, will overwrite existing file. If \code{TRUE}, +will append to existing file. In both cases, if the file does not exist a new +file is created.} + +\item{col_names}{If \code{FALSE}, column names will not be included at the top of the file. 
If \code{TRUE}, +column names will be included. If not specified, \code{col_names} will take the opposite value given to \code{append}.} + +\item{quote}{How to handle fields which contain characters that need to be +quoted. +\itemize{ +\item \code{needed} - Values are only quoted if needed: if they contain a delimiter, +quote, or newline. +\item \code{all} - Quote all fields. +\item \code{none} - Never quote fields. +}} + +\item{escape}{The type of escape to use when quotes are in the data. +\itemize{ +\item \code{double} - quotes are escaped by doubling them. +\item \code{backslash} - quotes are escaped by a preceding backslash. +\item \code{none} - quotes are not escaped. +}} + +\item{eol}{The end of line character to use. Most commonly either \code{"\n"} for +Unix style newlines, or \code{"\r\n"} for Windows style newlines.} + +\item{num_threads}{Number of threads to use when reading and materializing +vectors. If your data contains newlines within fields the parser will +automatically be forced to use a single thread only.} + +\item{progress}{Display a progress bar? By default it will only display +in an interactive session and not while knitting a document. The display +is updated every 50,000 values and will only display if estimated reading +time is 5 seconds or more. The automatic progress bar can be disabled by +setting option \code{readr.show_progress} to \code{FALSE}.} } \value{ -`write_interlaced_*` returns the input x invisibly +\verb{write_interlaced_*} returns the input x invisibly } \description{ -The `write_interlaced_*()` family of functions will take a deinterlaced -data frame, re-interlace it, and write it to a flie. The behavior of these -functions match their similarly named counterparts in [readr]. +The \verb{write_interlaced_*()} family of functions will take a data frame +with interlaced columns, flatten all interlaced columns, then write it to +a file. Non-interlaced columns just pass through. 
The behavior of these +functions match their similarly named counterparts in \link{readr}. } diff --git a/tests/testthat/_snaps/type-interlaced.md b/tests/testthat/_snaps/type-interlaced.md new file mode 100644 index 0000000..2860dec --- /dev/null +++ b/tests/testthat/_snaps/type-interlaced.md @@ -0,0 +1,660 @@ +# interlaced type coercion is symmetric and unchanging + + Code + mat + Output + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + 
interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + 
interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + +# base type coercion is symmetric and unchanging + + Code + mat + Output + logical integer + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced "interlaced" 
"interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + double factor + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced "interlaced" NA + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + interlaced NA "interlaced" + character + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced NA + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + interlaced "interlaced" + +--- + + Code + mat2 + Output + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + 
double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + +# base type casting unwraps / lifts and is unchanging + + Code + lifts + Output + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical "interlaced" "interlaced" + integer "interlaced" "interlaced" + double "interlaced" "interlaced" + factor NA NA + character NA NA + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character 
"interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + interlaced interlaced + logical NA NA + integer NA NA + double NA NA + factor "interlaced" "interlaced" + character "interlaced" "interlaced" + +--- + + Code + unwraps + Output + logical integer double factor character + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced "logical" "integer" "double" NA NA + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + interlaced NA NA NA "factor" "character" + +# interlaced type casting is unchanging + + Code + mat + Output + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + 
interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA 
+ interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + 
interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced interlaced + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + interlaced NA NA + interlaced NA NA + interlaced "interlaced" "interlaced" + interlaced "interlaced" "interlaced" + diff --git a/tests/testthat/helper-types.R b/tests/testthat/helper-types.R new file mode 100644 index 0000000..0f1e454 --- /dev/null +++ b/tests/testthat/helper-types.R @@ -0,0 +1,72 @@ +EXAMPLE_FCT <- factor(c(), levels=c("a", "b", "c")) + +INTERLACED_TYPES <- list2( +# new_interlaced(logical(), logical()), + new_interlaced(logical(), integer()), + new_interlaced(logical(), double()), + new_interlaced(logical(), EXAMPLE_FCT), + new_interlaced(logical(), character()), + +# new_interlaced(integer(), logical()), + new_interlaced(integer(), integer()), + new_interlaced(integer(), double()), + new_interlaced(integer(), EXAMPLE_FCT), + new_interlaced(integer(), character()), + +# new_interlaced(double(), logical()), + new_interlaced(double(), integer()), + new_interlaced(double(), double()), + new_interlaced(double(), EXAMPLE_FCT), + new_interlaced(double(), character()), + +# new_interlaced(EXAMPLE_FCT, logical()), + new_interlaced(EXAMPLE_FCT, integer()), + new_interlaced(EXAMPLE_FCT, double()), + new_interlaced(EXAMPLE_FCT, EXAMPLE_FCT), + new_interlaced(EXAMPLE_FCT, character()), + +# new_interlaced(character(), logical()), + new_interlaced(character(), integer()), + new_interlaced(character(), double()), + new_interlaced(character(), EXAMPLE_FCT), + new_interlaced(character(), character()), +) + +BASE_TYPES = list2( + logical(), + integer(), + double(), + 
EXAMPLE_FCT, + character(), +) + +coerces_to <- function(x, y, using = "strict") { + type_max <- switch(using, + strict = vec_ptype2, + base_c = c, + base_unlist = function(x, y) unlist(list(x, y)), + base_modify = function(x, y) `[<-`(x, 2, value = y), + cast = vec_cast + ) + + tryCatch({ + type <- suppressWarnings(type_max(x, y)) + vec_ptype_full(type) + }, error = function(e) { + NA_character_ + }) +} + +maxtype_mat <- function(x_types, y_types = x_types, using = "strict") { + names(x_types) <- map_chr(x_types, function(x) vec_ptype_full(vec_ptype(x))) + names(y_types) <- map_chr(y_types, function(x) vec_ptype_full(vec_ptype(x))) + + grid <- expand.grid(x = x_types, y = y_types) + grid$max <- map2_chr(grid$x, grid$y, coerces_to, using = using) + + matrix( + grid$max, + nrow = length(x_types), + dimnames = list(names(x_types), names(y_types)) + ) +} diff --git a/tests/testthat/test-annoying-interlaced.R b/tests/testthat/test-annoying-interlaced.R new file mode 100644 index 0000000..1c12d5d --- /dev/null +++ b/tests/testthat/test-annoying-interlaced.R @@ -0,0 +1,65 @@ +# Annoying things I can't fix. 
+ +test_that("right combining with base type doesn't become interlaced", { + # This is annoying, but no way to fix base R + expect_equal( + c(1, 2, na("reason")), + c(1, 2, NA) + ) + + # (use vec_c instead) + expect_equal( + vec_c(1, 2, na("reason")), + new_interlaced(c(1, 2, NA), c(NA, NA, "reason")) + ) +}) + +test_that( + "index assignment on base types with interlaced are not promoted", { + # Annoying, no alternative :( + foo <- c(1, 2) + foo[[1]] <- na("hello") + + expect_equal(foo, c(NA, 2)) +}) + +test_that( + "ifelse doesn't promote interlaced types", { + # ...because it relies on base index assignment + expect_equal( + ifelse(c(TRUE, FALSE), na(4), 5), + c(NA, 5) + ) + + # (use if_else instead) + expect_equal( + if_else(c(TRUE, FALSE), na(4), 5), + vec_c(na(4), 5) + ) +}) + +test_that("tibble subassign loses NA reason", { + # This is due to the line: + # https://github.com/tidyverse/tibble/blob/b7a8b70c076c2d87c0c3d2e7071b45cce17df9e9/R/subassign-backend.R#L181 + # + # To see what's going on, run: + # `tibble:::vectbl_assign(c("a", "b", "c"), 3, na("reason"))` + # + # It checks if it's logical, and then looks to see if it's a valid + # cast. Because the value channel is unspecified (and therefore the + # values don't matter), it goes ahead and + # `vec_slice`s into an NA value, thus removing the missing reason. + + bar <- tibble( + a = vec_c(1, na("z"), 3) + ) + + bar[1, "a"] <- na("z") + + expect_equal(bar$a, vec_c(NA, na("z"), 3)) + + # Interestingly, using `$` for this situation works! + bar$a[1] <- na("z") + + expect_equal(bar$a, vec_c(na("z"), na("z"), 3)) +}) diff --git a/tests/testthat/test-coalesce_channels.R b/tests/testthat/test-coalesce_channels.R deleted file mode 100644 index 9c8148c..0000000 --- a/tests/testthat/test-coalesce_channels.R +++ /dev/null @@ -1,54 +0,0 @@ -test_that("nop if no changes are necessary", { - result <- tibble( - a = c(1, NA), - .a. 
= factor(c(NA, "UNKNOWN_REASON")) - ) |> - coalesce_channels() - - expect_equal(result, result) -}) - -test_that("new missing value reasons make values disappear when keep=missing", { - result <- tibble( - a = c(1, 2), - .a. = factor(c(NA, "UNKNOWN_REASON")) - ) |> - coalesce_channels(keep = "missing") - - expected <- tibble( - a = c(1, NA), - .a. = factor(c(NA, "UNKNOWN_REASON")) - ) - - expect_equal(result, expected, ignore_attr = TRUE) -}) - -test_that("new missing value reasons disappear if value available", { - result <- tibble( - a = c(1, 2), - .a. = factor(c(NA, "UNKNOWN_REASON")) - ) |> - coalesce_channels() - - expected <- tibble( - a = c(1, 2), - .a. = factor(c(NA, NA), levels = "UNKNOWN_REASON") - ) - - expect_equal(result, expected, ignore_attr = TRUE) -}) - -test_that("missing (missing value) reasons result in default reason", { - result <- tibble( - a = c(1, NA), - .a. = factor(c(NA, NA)) - ) |> - coalesce_channels() - - expected <- tibble( - a = c(1, NA), - .a. = factor(c(NA, "UNKNOWN_REASON")) - ) - - expect_equal(result, expected, ignore_attr = TRUE) -}) diff --git a/tests/testthat/test-deinterlaced_df.R b/tests/testthat/test-deinterlaced_df.R deleted file mode 100644 index 0ff7e89..0000000 --- a/tests/testthat/test-deinterlaced_df.R +++ /dev/null @@ -1,44 +0,0 @@ - -test_that("warnings on invalid states", { - df_good <- tibble(a = 1) |> - coalesce_channels() - - df_both <- df_good |> - mutate(.a. 
= "FOO") - - df_none <- df_good |> - mutate(a = NA) - - df_missing_col <- df_good |> - select(a) - - expect_no_warning(capture_output(print(df_good))) - expect_warning(capture_output(print(df_both))) - expect_warning(capture_output(print(df_none))) - expect_warning(capture_output(print(df_missing_col))) -}) - -test_that("enable / disable warnings", { - df_good <- tibble(a = 1) |> - coalesce_channels() - - df_bad <- df_good |> - mutate(a = NA) - - expect_no_warning(capture_output(print(df_good))) - expect_warning(capture_output(print(df_bad))) - - options(interlacer.print_validation = TRUE) - - expect_no_warning(capture_output(print(df_good))) - expect_warning(capture_output(print(df_bad))) - - options(interlacer.print_validation = FALSE) - - expect_no_warning(capture_output(print(df_good))) - expect_no_warning(capture_output(print(df_bad))) - - - options(interlacer.print_validation = NULL) -}) - diff --git a/tests/testthat/test-interlaced.R b/tests/testthat/test-interlaced.R new file mode 100644 index 0000000..a185621 --- /dev/null +++ b/tests/testthat/test-interlaced.R @@ -0,0 +1,162 @@ + +test_that("interlaced type cannot construct with different sizes", { + expect_error(new_interlaced(c(1, 2), 1)) +}) + +test_that("interlaced type cannot construct with simultaneous valid and na", { + expect_error(new_interlaced(c(1, 2), c(1, 2))) +}) + +test_that("interlaced type cannot construct with existing 'na_values' attr", { + foo <- c(1, 2) + attr(foo, "na_channel_values") <- "bar" + expect_error(new_interlaced(foo, c(NA, NA))) +}) + +test_that("parse_interlaced returns unspecified types", { + expect_error( + parse_interlaced("a", NA), + ) + expect_equal( + parse_interlaced("a", "a"), + new_interlaced(unspecified(1), "a") + ) +}) + + +test_that("is.na() and is.empty() detect correct values", { + foo <- new_interlaced(c("a", NA, NA), c(NA, "reason", NA)) + expect_equal(is.empty(foo), c(FALSE, FALSE, TRUE)) + expect_equal(is.na(foo), c(FALSE, TRUE, TRUE)) +}) + 
+test_that("map_value_channel works", { + foo <- vec_c("a", "b", "c", na("reason")) + + expect_equal( + map_value_channel(foo, \(x) if_else(x == "a", "z", x)), + vec_c("z", "b", "c", na("reason")) + ) + + expect_error( + map_value_channel(foo, \(x) if_else(x == "a", na("z"), x)) + ) +}) + +test_that("rep() works", { + foo <- vec_c("a", na("b")) + expect_equal( + rep(foo, 3), + vec_c("a", na("b"), "a", na("b"), "a", na("b")) + ) +}) + +test_that("length<-() works", { + foo <- new_interlaced(c("a", NA, "c"), c(NA, "b", NA)) + + length(foo) <- 5 + + expect_equal( + foo, vec_c("a", na("b"), "c", NA, NA) + ) +}) + +test_that("levels() returns and sets levels of values channel", { + foo <- interlaced(factor(c("a", "b", "c")), na="reason") + + expect_equal( + levels(foo), + c("a", "b", "c") + ) + + levels(foo) <- c("d", "e", "f") + + expect_equal( + levels(foo), + c("d", "e", "f") + ) + + expect_null( + levels(interlaced(1, na="reason")) + ) +}) + +test_that("indexing operates on underlying channels", { + foo <- vec_c(1, 2, 3, na("reason")) + + expect_equal(foo[], foo) + expect_equal(foo[2], interlaced(2, na = "reason")) + expect_equal(foo[[2]], interlaced(2, na = "reason")) + + expect_equal(foo[c(2, 4)], vec_c(2, na("reason"))) + expect_error(foo[[c(2, 4)]]) + + # Out of bounds indexing creates blanks + expect_equal(foo[1:6], vec_c(foo, NA, NA)) + + # named indexing ($) errors + expect_error(foo$bar) + # multidimensional indexing errors + expect_error(foo[1, 2]) + expect_error(foo[[1, 2]]) +}) + +test_that("indexing assignment casts & operates on underlying channels", { + foo <- vec_c(1, 2, 3, na("reason")) + + foo[2] <- 5 + + expect_equal(foo, vec_c(1, 5, 3, na("reason"))) + + foo[[3]] <- 6 + + expect_equal(foo, vec_c(1, 5, 6, na("reason"))) + + foo[c(1, 4)] <- vec_c(na("reason"), 9) + + expect_equal(foo, vec_c(na("reason"), 5, 6, 9)) + + expect_error(foo[[c(1, 4)]] <- 5) +}) + + +test_that("comparisons and ordering works", { + foo <- vec_c(10, 2, na("reason 1"), 3, 
na("reason 2"), 9, 1) + + expect_equal(min(foo, na.rm=TRUE), 1) + expect_equal(max(foo, na.rm=TRUE), 10) + expect_equal(range(foo, na.rm=TRUE), c(1, 10)) + expect_equal(median(foo, na.rm=TRUE), 3) + expect_equal(quantile(foo, na.rm=TRUE), c(1,2,3,9,10), ignore_attr=TRUE) + expect_equal(xtfrm(foo), c(10, 2, NA, 3, NA, 9, 1)) + + expect_equal(min(foo), NA_real_) + expect_equal(max(foo), NA_real_) + expect_equal(range(foo), c(NA_real_, NA_real_)) + expect_equal(median(foo), NA_real_) + expect_error(quantile(foo)) +}) + +test_that("vec_math works", { + foo <- vec_c(10, 2, na("reason 1"), 3, na("reason 2"), 9, 1) + expect_equal(mean(foo, na.rm=TRUE), 5) + expect_equal(var(foo, na.rm=TRUE), 17.5) + + expect_equal(mean(foo), NA_real_) + expect_equal(var(foo), NA_real_) +}) + +test_that("vec_arith works", { + foo <- vec_c(10, 2, na("reason 1"), 3, na("reason 2"), 9, 1) + foo_plus_one <- c(11, 3, NA, 4, NA, 10, 2) + + expect_equal(foo + 1, foo_plus_one) + expect_equal(1 + foo, foo_plus_one) + expect_equal(foo + TRUE, foo_plus_one) + expect_equal(TRUE + foo, foo_plus_one) + + expect_equal( + foo + foo, + c(20, 4, NA, 6, NA, 18, 2) + ) +}) diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 3e90976..587ff16 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -1,132 +1,462 @@ +all_na_reasons <- function() { + c("REASON_1", "REASON_2", "REASON_3") +} -test_that("default missing reasons are overrided", { - result <- tibble(a = "NA") |> - deinterlace_type_convert(na = "FOO") +to_na_reason_factor <- function(c) { + factor(c, all_na_reasons()) +} + +basic_df_expected <- function() { + tibble( + a = interlaced(list("REASON_1", TRUE, TRUE, "REASON_3"), na = all_na_reasons()), + b = interlaced(list(7, "REASON_2", 8, 10), na = all_na_reasons()), + c = interlaced( + c( + 0.181526642525569, + 0.833227441413328, + 0.926790483295918, + 0.375270307529718 + ), na = all_na_reasons() + ), + d = interlaced(list("m", "z", "r", "REASON_3"), na = 
all_na_reasons()) + ) +} + +test_that("basic reading works", { + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = all_na_reasons() + ) + + expected <- basic_df_expected() + + expect_equal(result, expected, ignore_attr = TRUE) expect_equal( - result, tibble(a = "NA", .a. = factor(NA, levels = "FOO")) + spec(result)$cols, + list( + a = col_logical(), + b = col_double(), + c = col_double(), + d = col_character() + ) + ) + + expect_equal( + na_spec(result)$default, + all_na_reasons() ) }) -test_that("global missing reasons load properly", { - missing_levels <- c("REASON_1", "REASON_2", "REASON_3") +test_that("NULL column-level missing reasons override default", { + df <- read_interlaced_csv( + interlacer_example("colors_coded.csv"), + na = na_cols( + .default = c(-99, -98, -97), + person_id = NULL, + ), + ) + + expect_false(is.interlaced(df$person_id)) +}) +test_that("column-level missing reasons can be specified na arg", { col_types <- cols( a = col_logical(), - b = col_integer(), + b = col_double(), c = col_double(), d = col_character(), ) - expected_col_types <- cols( - a = col_logical(), - b = col_integer(), - c = col_double(), - d = col_character(), - .default = col_factor(levels = missing_levels), + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = na_cols( + .default = c("REASON_3"), + a = c(.default, "REASON_1"), + b = c(.default, "REASON_2"), + ), + col_types = col_types, ) + expected <- basic_df_expected() |> + dplyr::mutate( + a = map_na_channel(a, \(v) vec_cast(v, factor(levels=c("REASON_3", "REASON_1")))), + b = map_na_channel(b, \(v) vec_cast(v, factor(levels=c("REASON_3", "REASON_2")))), + c = map_na_channel(c, \(v) vec_cast(v, factor(levels="REASON_3"))), + d = map_na_channel(d, \(v) vec_cast(v, factor(levels="REASON_3"))) + ) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal(spec(result)$cols, col_types$cols) +}) + +### col_select + +test_that("col_select selects columns", { result <- 
read_interlaced_csv( test_path("basic-df.csv"), - na = missing_levels, - col_types = col_types, + na = all_na_reasons(), + col_select = a, ) - result_expected <- read_csv( - test_path("basic-df.expected.csv"), - na = "NA", - col_types = expected_col_types, + expected <- basic_df_expected() |> + dplyr::select(a) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_logical(), + b = col_skip(), + c = col_skip(), + d = col_skip() + ) ) - expect_equal(result, result_expected, ignore_attr = TRUE) + expect_equal( + na_spec(result)$default, + all_na_reasons() + ) +}) - result_raw <- read_csv( +test_that("col_select renames columns", { + result <- read_interlaced_csv( test_path("basic-df.csv"), - col_types = cols(.default = "c") + na = all_na_reasons(), + col_select = c(z = a), + ) + + expected <- basic_df_expected() |> + dplyr::select(z = a) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_logical(), + b = col_skip(), + c = col_skip(), + d = col_skip() + ) ) expect_equal( - result_raw, - interlace_channels(result), - ignore_attr = TRUE + na_spec(result)$default, + all_na_reasons() ) }) +test_that("col_select reorders columns", { + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = all_na_reasons(), + col_select = c(b, c, a), + ) + + expected <- basic_df_expected() |> + dplyr::select(b, c, a) -test_that("column-level missing reasons can be specified with icol_*", { - col_types <- cols( - a = icol_logical(na = "REASON_1"), - b = icol_integer(na = "REASON_2"), - c = col_double(), - d = col_character(), + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_logical(), + b = col_double(), + c = col_double(), + d = col_skip() + ) ) - expected_col_types <- cols( - a = col_logical(), - .a. = col_factor(levels = c("REASON_1", "REASON_3")), - b = col_integer(), - .b. 
= col_factor(levels = c("REASON_2", "REASON_3")), - c = col_double(), - .c. = col_factor(levels = c("REASON_3")), - d = col_character(), - .d. = col_factor(levels = c("REASON_3")), + expect_equal( + na_spec(result)$default, + all_na_reasons() ) +}) +test_that("col_select reorders and renames columns", { result <- read_interlaced_csv( test_path("basic-df.csv"), - na = c("REASON_3"), - col_types = col_types, + na = all_na_reasons(), + col_select = c(x = b, y = c, z = a), + ) + + expected <- basic_df_expected() |> + dplyr::select(x = b, y = c, z = a) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_logical(), + b = col_double(), + c = col_double(), + d = col_skip() + ) ) - result_expected <- read_csv( - test_path("basic-df.expected.csv"), - na = "NA", - col_types = expected_col_types, + expect_equal( + na_spec(result)$default, + all_na_reasons() ) +}) - expect_equal(result, result_expected, ignore_attr = TRUE) +### Unnamed col_types - result_raw <- read_csv( +test_that("unnamed col_types work", { + result <- read_interlaced_csv( test_path("basic-df.csv"), - col_types = cols(.default = "c") + na = all_na_reasons(), + col_types = "cccc" ) + expected <- basic_df_expected() |> + dplyr::mutate( + dplyr::across( + everything(), + \(v) map_value_channel(v, as.character) + ) + ) + + expect_equal(result, expected, ignore_attr = TRUE) + expect_equal( - result_raw, - interlace_channels(result), - ignore_attr = TRUE + spec(result)$cols, + list( + a = col_character(), + b = col_character(), + c = col_character(), + d = col_character() + ) ) }) +test_that("incomplete unnamed col_types work with warning", { + expect_warning( + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = all_na_reasons(), + col_types = "c" + ) + ) -test_that("col_select correctly selects columns", { - missing_levels <- c("REASON_1", "REASON_2", "REASON_3") + expected <- basic_df_expected() |> + dplyr::mutate( + a = 
map_value_channel(a, as.character) + ) - col_types <- cols( - a = col_logical(), - b = col_integer(), - c = col_double(), - d = col_character(), + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_character(), + b = col_double(), + c = col_double(), + d = col_character() + ) ) +}) - expected_col_types <- cols( - a = col_logical(), - .default = col_factor(levels = missing_levels), +test_that("overcomplete na spec work with warning", { + expect_warning( + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = all_na_reasons(), + col_types = "ccccc" + ) ) - result <- read_interlaced_csv( + expected <- basic_df_expected() |> + dplyr::mutate( + dplyr::across( + everything(), + \(v) map_value_channel(v, as.character) + ) + ) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + spec(result)$cols, + list( + a = col_character(), + b = col_character(), + c = col_character(), + d = col_character() + ) + ) +}) + +test_that("cannot mix unnamed and named col_types", { + expect_error( + read_interlaced_csv( + test_path("basic-df.csv"), + na = all_na_reasons(), + col_types = list("c", a = "b") + ) + ) +}) + +test_that("factor na values works", { + result <- read_interlaced_csv( test_path("basic-df.csv"), - na = missing_levels, - col_types = col_types, - col_select = a, + na = all_na_reasons(), + ) + + expect_equal(result, basic_df_expected(), ignore_attr = TRUE) + + expect_equal( + na_spec(result)$default, + all_na_reasons() + ) +}) + +test_that("incomplete unnamed na_col_types work with warning", { + expect_warning( + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = list(all_na_reasons()) + ) + ) + + expected <- basic_df_expected() |> + dplyr::mutate( + a = map_na_channel(a, to_na_reason_factor), + b = flatten_channels(b), + c = as.numeric(flatten_channels(c)), + d = flatten_channels(d) + ) + + expect_equal(result, expected, ignore_attr = TRUE) + + expect_equal( + 
na_spec(result)$cols, + list( + a = all_na_reasons(), + b = NULL, + c = NULL, + d = NULL + ) ) +}) - result_expected <- read_csv( - test_path("basic-df.expected.csv"), - na = "NA", - col_types = expected_col_types, - col_select = c(a, .a.), +test_that("overcomplete unnamed na_col_types work with warning", { + expect_warning( + result <- read_interlaced_csv( + test_path("basic-df.csv"), + na = list( + all_na_reasons(), + all_na_reasons(), + all_na_reasons(), + all_na_reasons(), + all_na_reasons() + ) + ) ) - expect_equal(result, result_expected, ignore_attr = TRUE) + expect_equal(result, basic_df_expected(), ignore_attr = TRUE) + + expect_equal( + na_spec(result)$cols, + list( + a = all_na_reasons(), + b = all_na_reasons(), + c = all_na_reasons(), + d = all_na_reasons() + ) + ) +}) + +test_that("cannot mix unnamed and named na_col_types", { + expect_error( + read_interlaced_csv( + test_path("basic-df.csv"), + na = list("c", a = "b") + ) + ) +}) + +# special cases + +test_that("all read & write fn variants work", { + test_fns <- tibble( + txt = list( + "aZb\n1Z2\nNAZ\n5Z6\n", + "a,b\n1,2\nNA,\n5,6\n", + "a;b\n1;2\nNA;\n5;6\n", + "a\tb\n1\t2\nNA\t\n5\t6\n" + ), + read_fn = list( + \(f) read_interlaced_delim(f, delim = "Z"), + read_interlaced_csv, + read_interlaced_csv2, + read_interlaced_tsv + ), + write_fn = list( + list(\(f, o) write_interlaced_delim(f, o, delim = "Z")), + list(write_interlaced_csv, write_interlaced_excel_csv), + list(write_interlaced_csv2, write_interlaced_csv2), + list(write_interlaced_tsv) + ) + ) + + expected <- tibble( + a = vec_c(1, na("NA"), 5), + b = vec_c(2, na(""), 6), + ) + + pmap(test_fns, function(txt, read_fn, write_fns) { + # Test read + result <- read_fn(I(txt)) + expect_equal(result, expected, ignore_attr = TRUE) + + # Test write + for (write_fn in write_fns) { + out <- tempfile() + on.exit(unlink(out)) + + out_inv <- write_fn(result, out) + + expect_equal(out_inv, result) # Write fns return invisible(x) + expect_equal(txt, 
readr::read_file(out)) + + unlink(out) + } + }) +}) + +test_that("columns with NA as the na reason read properly", { + result <- read_interlaced_csv(I("a,b\n1,2\nNA,\n5,6\n")) + expected <- tibble( + a = factor(c(NA, "NA", NA), levels = c("", "NA")), + b = factor(c(NA, "", NA), levels = c("", "NA")) + ) + expect_equal(na_channel(result), expected, ignore_attr = TRUE) +}) + +test_that("duplicate columns fail", { + expect_error( + read_interlaced_csv(I("a,a\n1,2\nNA,\n5,6\n")) + ) +}) + +test_that("type_convert_col() returns unspecified", { + chr_values <- rep("", 5) + expect_equal( + type_convert_col(chr_values, col_guess(), na = character()), + unspecified(5) + ) + expect_equal( + type_convert_col(chr_values, col_integer(), na = character()), + rep(NA_integer_, 5) + ) + expect_equal( + type_convert_col(chr_values, col_logical(), na = character()), + rep(NA, 5) + ) }) diff --git a/tests/testthat/test-type-interlaced.R b/tests/testthat/test-type-interlaced.R new file mode 100644 index 0000000..ef9f657 --- /dev/null +++ b/tests/testthat/test-type-interlaced.R @@ -0,0 +1,33 @@ + +test_that("interlaced type coercion is symmetric and unchanging", { + mat <- maxtype_mat(INTERLACED_TYPES, INTERLACED_TYPES) + + expect_true(isSymmetric(mat)) + expect_snapshot(mat) +}) + +test_that("base type coercion is symmetric and unchanging", { + mat <- maxtype_mat(INTERLACED_TYPES, BASE_TYPES) + mat2 <- maxtype_mat(BASE_TYPES, INTERLACED_TYPES) + + expect_true(all(mat == t(mat2), na.rm = TRUE)) + expect_snapshot(mat) + expect_snapshot(mat2) +}) + +test_that("base type casting unwraps / lifts and is unchanging", { + lifts <- maxtype_mat(BASE_TYPES, INTERLACED_TYPES, using = "cast") + unwraps <- maxtype_mat(INTERLACED_TYPES, BASE_TYPES, using = "cast") + + expect_true(all(grepl("^interlaced", na.omit(lifts)))) + expect_true(!any(grepl("^interlaced", na.omit(unwraps)))) + + expect_snapshot(lifts) + expect_snapshot(unwraps) +}) + +test_that("interlaced type casting is unchanging", { + mat 
<- maxtype_mat(INTERLACED_TYPES, INTERLACED_TYPES, using = "cast") + expect_true(all(grepl("^interlaced", na.omit(mat)))) + expect_snapshot(mat) +}) diff --git a/vignettes/coded-data.Rmd b/vignettes/coded-data.Rmd index 4c5e768..d00c435 100644 --- a/vignettes/coded-data.Rmd +++ b/vignettes/coded-data.Rmd @@ -37,7 +37,7 @@ example: ```{r} library(readr) -library(interlacer) +library(interlacer, warn.conflicts = FALSE) read_file( interlacer_example("colors_coded.csv") @@ -77,14 +77,14 @@ library(dplyr, warn.conflicts = FALSE) df_coded |> mutate( - favorite_color_missing = if_else(favorite_color < 0, favorite_color, NA), age = if_else(age > 0, age, NA) ) |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = favorite_color_missing - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` The downsides of this approach are twofold: 1) all of your values and @@ -99,14 +99,14 @@ compute aggregations using the negative numbers! ```{r} df_coded |> mutate( - favorite_color_missing = if_else(favorite_color < 0, favorite_color, NA), # age = if_else(age > 0, age, NA) ) |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = favorite_color_missing - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` In fact, ANY math you do without filtering for missing codes potentially ruins @@ -118,14 +118,12 @@ the integrity of your data: df_coded |> mutate( age_next_year = age + 1, - .after = person_id ) # This will give you your intended result, but it's easy to forget df_coded |> mutate( age_next_year = if_else(age < 0, age, age + 1), - .after = person_id ) ``` @@ -133,31 +131,35 @@ Have you ever thought you had a significant result, only to find that it's only because there are some stray missing reason codes still interlaced with your values? It's a bad time. 
-You're much better off loading these formats with interlacer, then converting -the codes into labelled factor levels: +You're much better off loading these formats with interlacer as factors, then +converting the codes into labels: ```{r} -library(forcats) - -(df_decoded_deinterlaced <- read_interlaced_csv( +(df_decoded <- read_interlaced_csv( interlacer_example("colors_coded.csv"), - na = c("-99", "-98", "-97") + na = c(-99, -98, -97), + show_col_types = FALSE, ) |> mutate( across( - missing_cols(), - \(x) fct_recode(x, - `N/A` = "-99", - REFUSED = "-98", - OMITTED = "-97", + everything(), + \(x) map_na_channel( + x, + \(v) factor( + v, + levels = c(-99, -98, -97), + labels = c("N/A", "REFUSED", "OMITTED"), + ) + ) + ), + favorite_color = map_value_channel( + favorite_color, + \(v) factor( + v, + levels = c(1, 2, 3), + labels = c("BLUE", "RED", "YELLOW") ) ), - favorite_color = fct_recode( - as.character(favorite_color), - BLUE = "1", - RED = "2", - YELLOW = "3", - ) )) ``` @@ -165,23 +167,22 @@ Now aggregations won't mix up values and missing codes, and you won't have to keep cross-referencing your codebook to know what values mean: ```{r} -df_decoded_deinterlaced |> +df_decoded |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm=TRUE), n = n(), - .by = .favorite_color. - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` Other operations work with similar ease: ```{r} -df_decoded_deinterlaced |> +df_decoded |> mutate( age_next_year = age + 1, - .after = person_id - ) |> - coalesce_channels(missing_reason = "AGE_UNAVAILABLE") + ) ``` @@ -225,18 +226,14 @@ you know it's a missing code. If it is successful, you know it's a coded value. 
```{r} df_coded_char |> mutate( - favorite_color_missing = if_else( - is.na(as.numeric(favorite_color)), - favorite_color, - NA - ), age = if_else(!is.na(as.numeric(age)), as.numeric(age), NA) ) |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = favorite_color_missing - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` Although the character missing codes help prevent us from mistakenly including @@ -248,46 +245,60 @@ decode the values and missing reasons: ```{r} read_interlaced_csv( interlacer_example("colors_coded_char.csv"), - na = c(".", ".a", ".b") + na = c(".", ".a", ".b"), + show_col_types = FALSE, ) |> mutate( across( - missing_cols(), - \(x) fct_recode(x, - `N/A` = ".", - REFUSED = ".a", - OMITTED = ".b", + everything(), + \(x) map_na_channel( + x, + \(v) factor( + v, + levels = c(".", ".a", ".b"), + labels = c("N/A", "REFUSED", "OMITTED") + ) ) ), - favorite_color = fct_recode( - as.character(favorite_color), - BLUE = "1", - RED = "2", - YELLOW = "3", + favorite_color = map_value_channel( + favorite_color, + \(v) factor( + v, + levels = c(1, 2, 3), + labels = c("BLUE", "RED", "YELLOW") + ) ) ) ``` ## Encoding a decoded & deinterlaced data frame. 
-Re-coding and re-interlacing a data frame is easily done as follows: +Re-coding and re-interlacing a data frame can be done as follows: ```{r, eval = FALSE} -df_decoded_deinterlaced |> +library(forcats) + +df_decoded |> mutate( across( - missing_cols(), - \(x) fct_recode(x, - `-99` = "N/A", - `-98` = "REFUSED", - `-97` = "OMITTED" + everything(), + \(x) map_na_channel( + x, + \(v) fct_recode(v, + `-99` = "N/A", + `-98` = "REFUSED", + `-97` = "OMITTED" + ) ) ), - favorite_color = fct_recode( + favorite_color = map_value_channel( favorite_color, - `1` = "BLUE", - `2` = "RED", - `3` = "YELLOW" + \(v) fct_recode( + v, + `1` = "BLUE", + `2` = "RED", + `3` = "YELLOW" + ) ) ) |> write_interlaced_csv("output.csv") @@ -298,10 +309,5 @@ df_decoded_deinterlaced |> The [haven](https://haven.tidyverse.org/) package has functions for loading native SPSS, SAS, and Stata native file formats into special data frames that use column attributes and special values to keep track -of interlaces values and missing reasons. For a complete discussion of how this +of interlaced values and missing reasons. For a complete discussion of how this compares to interlacer's approach, see `vignette("other-approaches")`. - -Future versions of interlacer could have the ability to convert haven data -frames to and from deinterlaced data frames, but I want to gauge interest for -this feature before I invest the time to implement it. If this is a a feature -you'd use, [please let me know](mailto:kdh38@psu.edu)! 
diff --git a/vignettes/column-types.Rmd b/vignettes/column-types.Rmd deleted file mode 100644 index 0be879e..0000000 --- a/vignettes/column-types.Rmd +++ /dev/null @@ -1,136 +0,0 @@ ---- -title: "Interlaced Column Types" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Interlaced Column Types} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -Like the `readr::read_*()` family of functions, `read_interlaced_*()` will -automatically guess column types by default: - -```{r} -library(interlacer) - -(read_interlaced_csv( - interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A"), -)) -``` - -As with readr, these column type guess can be overridden using the `col_types`, -using readr's `readr::col_*()` column specifiers: - -```{r} -library(readr) - -(read_interlaced_csv( - interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A"), - col_types = cols( - person_id = col_integer(), - age = col_number(), - favorite_color = col_factor(levels = c("BLUE", "RED", "YELLOW", "GREEN")) - ) -)) -``` - -## Interlaced Column Types - -In addition to the standard `readr::col_*` column specification types, -interlacer provides *interlaced* column types that enable you to specify missing -reasons at the column level. - -This is useful when you have missing reasons that only apply to particular items -as opposed to the file as a whole. For example, say we had a measure with the -following two items: - -> 1. What is your current stress level? -> a. Low -> b. Moderate -> c. High -> d. I don't know -> e. I don't understand the question -> -> 2. How well do you feel you manage your time and responsibilities today? -> a. Poorly -> b. Fairly well -> c. Well -> d. Very well -> e. Does not apply (Today was a vacation day) -> f. 
Does not apply (Other reason) - -As you can see, both items have two selection choices that should be mapped to -missing reasons. To specify missing reasons at the variable level, the -`icol_*()` family -of column specification types can be used. These extend all of readr's -`col_*()` column types by adding a parameter for specifying missing values -unique to that particular variable: - -```{r} -(df_stress <- read_interlaced_csv( - interlacer_example("stress.csv"), - col_types = cols( - person_id = col_integer(), - current_stress = icol_factor( - levels = c("LOW", "MODERATE", "HIGH"), - na = c("DONT_KNOW", "DONT_UNDERSTAND") - ), - time_management = icol_factor( - levels = c("POORLY", "FAIRLY_WELL", "WELL", "VERY_WELL"), - na = c("NA_VACATION", "NA_OTHER") - ) - ), - na = c( - "REFUSED", - "OMITTED", - "N/A" - ) -)) - -``` - -The `icol_factor()` column spec works just like -`readr::col_factor()`, but additionally accepts an `na` argument for specifying -missing values at the variable level. When you specify missing -reasons at the variable-level like this, the available levels in the resulting -missing reason column correctly show only the possible missing reasons for -that variable: - -```{r} -levels(df_stress$.person_id.) -levels(df_stress$.current_stress.) -levels(df_stress$.time_management.) -``` - -For comparison, if we loaded all of these variable-level missing reasons as -file-level level missing reasons, each variable would have ALL missing -reasons as possible levels, even if they didn't apply to that particular -variable: - -```{r} -df_stress_file <- read_interlaced_csv( - interlacer_example("stress.csv"), - na = c( - "REFUSED", - "OMITTED", - "N/A", - "DONT_KNOW", - "DONT_UNDERSTAND", - "NA_VACATION", - "NA_OTHER" - ) -) - -levels(df_stress_file$.person_id.) -levels(df_stress_file$.current_stress.) -levels(df_stress_file$.time_management.) 
-``` diff --git a/vignettes/interlacer.Rmd b/vignettes/interlacer.Rmd index 7c360ba..047ea66 100644 --- a/vignettes/interlacer.Rmd +++ b/vignettes/interlacer.Rmd @@ -19,7 +19,7 @@ values or codes. For example, consider the following CSV: ```{r} library(readr) -library(interlacer) +library(interlacer, warn.conflicts = FALSE) read_file(interlacer_example("colors.csv")) |> cat() @@ -34,9 +34,10 @@ To load the values of this data source, it is an easy call to the venerable `readr::read_csv()`: ```{r} -(df <- read_csv( +(df_simple <- read_csv( interlacer_example("colors.csv"), na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE, )) ``` @@ -60,15 +61,13 @@ Our current dataframe only gets us part way: ```{r} library(dplyr, warn.conflicts = FALSE) -df |> - mutate( - favorite_color_missing = is.na(favorite_color) - ) |> +df_simple |> summarize( mean_age = mean(age, na.rm = T), n = n(), - .by = favorite_color_missing - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` As you can see, because we converted all our missing reasons into a single `NA`, @@ -81,7 +80,8 @@ something else: the type information of the values. ```{r} (df_with_missing <- read_csv( interlacer_example("colors.csv"), - col_types = cols(.default = "c") + col_types = cols(.default = "c"), + show_col_types = FALSE )) ``` @@ -96,129 +96,300 @@ reasons <- c("REFUSED", "OMITTED", "N/A") df_with_missing |> mutate( age_values = as.numeric(if_else(age %in% reasons, NA, age)), - favorite_color_missing_reasons = if_else( - favorite_color %in% reasons, - favorite_color, - NA - ) ) |> summarize( mean_age = mean(age_values, na.rm=T), n = n(), - .by = favorite_color_missing_reasons - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` -This gives us the information we want, but it is cumbersome and starts to get +This gives us the information we want, but it is cumbersome. Notice how there's +no distinction between favorite color values and missing reasons! 
Things start +to get really complex when different columns have different sets of possible missing reasons. It means you have to do a lot of type conversion gymnastics to switch between value types and missing types. ### The interlacer approach -Interlacer was built based on the insight that everything becomes much more +interlacer was built based on the insight that everything becomes much more tidy, simple, and expressive when we explicitly work with values and missing -reasons as separate *channels* of the same variable. The functions -the `read_interlaced_*` functions in interlacer do this for you: -they *deinterlace* variables from interlaced data sources into two columns per -variable: one for holding values, one for holding missing reasons. +reasons as separate *channels* of the same variable. interlacer +introduces a new `interlaced` column type that facilitates this. +The `read_interlaced_*` functions in interlacer import data with this new +column type. ```{r} -(df_deinterlaced <- read_interlaced_csv( +(df <- read_interlaced_csv( interlacer_example("colors.csv"), na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE )) ``` -As you can see, missing reasons columns are denoted by names surrounded by -dots: the `.age.` column holds the missing reasons for the `age` variable, -and so on. -Now, all the missing reason information you need is right at your fingertips, -AND the value types are preserved. To make the same report as we did before, -we would run: +As you can see by the column headers, each column loaded is composed of two +channels: a value channel, +and missing reason channel. Each channel can have its own type. 
The `age` +column, for example, has `double` values and `factor` missing reasons: + +```{r} +df$age +``` + +These channels can be explicitly accessed using the `value_channel()` and +`na_channel()` helper functions: ```{r} -df_deinterlaced |> +value_channel(df$age) +na_channel(df$age) +``` + +These helpers are rarely needed, however, because computations automatically +operate on an `interlaced` column's value channel, +and ignore the missing reasons channel. The following will compute the mean +age, without the missing reasons interfering: + +```{r} +mean(df$age, na.rm = TRUE) +``` + +(We could have equivalently used the `value_channel()` helper to achieve the +same result, albeit with more verbosity): + +```{r} +mean(value_channel(df$age), na.rm = TRUE) +``` + +Although missing reasons are excluded in computations, they are still treated as +unique values. This means that when we group by favorite color we get a breakdown by +the unique missing reasons, rather than being lumped into a single `NA`: + +```{r} +df |> summarize( mean_age = mean(age, na.rm=T), n = n(), - .by = .favorite_color. - ) + .by = favorite_color + ) |> + arrange(favorite_color) ``` -We get the same results as before but without needing to do any type gymnastics! +As you can see, we can generate the same report as we did before, but without +needing to do any type gymnastics! Also, the values are neatly distinguished +from the missing reasons. ## Filtering based on missing reasons -Having separate columns for values and missing reasons also helpful for creating +`interlaced` columns are also helpful when creating samples with inclusion / exclusion criteria based on missing reasons. For example, using our example data, say we wanted to create a sample of respondents -that `REFUSED` to give their age? +that `REFUSED` to give their age. To indicate that a value should be interpreted +as a missing reason, you can use the `na()` function with the value: ```{r} -df_deinterlaced |> - filter(.age.
== "REFUSED") +df |> + filter(age == na("REFUSED")) + +# na_channel() can also be used to get an equivalent result: +df |> + filter(na_channel(age) == "REFUSED") ``` How about people who `REFUSED` to report their age AND favorite color? ```{r} -df_deinterlaced |> - filter(.age. == "REFUSED" & .favorite_color. == "REFUSED") +df |> + filter(age == na("REFUSED") & favorite_color == na("REFUSED")) ``` -With separate columns, we can combine value conditions with missing reason +It's also possible to combine value conditions with missing reason conditions. For example, this will select everyone who `REFUSED` to give their favorite color, and was over 20 years old: ```{r} -df_deinterlaced |> - filter(age > 20 & .favorite_color. == "REFUSED") +df |> + filter(age > 20 & favorite_color == na("REFUSED")) ``` -After we've created our sample, and are ready to start analyzing our data, -we typically don't need to keep the missing reasons around anymore. Interlacer -provides a convenient `drop_missing_cols()` function to take care of this: +## Mutations + +As you might expect, the `na()` function can be used with values in mutations. The +following pipeline will replace the favorite color of respondents with the missing +value `"REDACTED"` ```{r} -df_deinterlaced |> - filter(.age. == "REFUSED") |> - drop_missing_cols() +df |> + mutate( + favorite_color = na("REDACTED") + ) ``` -## Next steps +Conditionals also work exactly as you would expect in mutations. +The following will replace the favorite color of respondents with `age < 18` +with the missing reason `"REDACTED_UNDERAGE"`. Respondents missing an age will +be replaced with `"REDACTED_MISSING_AGE"` -So far, we've covered how interlacer's `read_interlaced_*` family -of functions enabled us to deinterlace value and missing reason channels from -interlaced data sources into separate dataframe columns. 
Separate value and -missing reason columns enable us to create tidy and type-aware aggregation -and filtering pipelines that can simultaneously consider a variable's value -AND missing reasons. +```{r} +df |> + mutate( + favorite_color = if_else( + age < 18, + na("REDACTED_UNDERAGE"), + favorite_color, + missing = na("REDACTED_MISSING_AGE") + ) + ) +``` + +The following mutation will create a new column called `person_type` +that will be `"CHILD"` when `age < 18`, `"ADULT"` when `age >= 18`, and missing +with reason `"AGE_UNAVAILABLE"` when `age` is missing: + +```{r} +df |> + mutate( + person_type = if_else( + age < 18, + "CHILD", + "ADULT", + missing = na("AGE_UNAVAILABLE") + ), + ) +``` + +Important note: You must use `dplyr::if_else()` with `interlaced` vectors +instead of R's `base::ifelse()` function, because the base function strips out +the missing reason channel due to a +[fundamental limitation in base R](https://vctrs.r-lib.org/#motivation). -That's all well and good, but what happens when we want to make modifications -to our data? What if we want to add variables to our dataframe, replace -values with missing reasons, or missing reasons with values? Inevitably, we'll -create situations where we simultaneously have a value and a missing reason, -or neither a value nor a missing reason: +## Empty cells (`NA` missing reasons) + +When a cell in a column is missing both a value and a missing reason, the cell +is considered "empty". Such values can occur when missing reasons are not +specified. For example, if we did not include a `missing = ` argument in the +second example in the previous section, we would get the following result: ```{r} -# Value and missing reason: -df_deinterlaced |> +df |> mutate( - .age. 
= "REDACTED" + favorite_color = if_else( + age < 18, + na("REDACTED_UNDERAGE"), + favorite_color, + ) ) +``` + +Empty values can be detected by using the `is.empty()` function: + +```{r} +df |> + mutate( + favorite_color = if_else( + age < 18, + na("REDACTED_UNDERAGE"), + favorite_color, + ) + ) |> + filter(is.empty(favorite_color)) +``` + +Raw `NA` values are also considered "empty": + +```{r} +# regular values are neither missing nor empty +is.na(42) +is.empty(42) + +# na("REASON") is a missing value, but is not an empty value +is.na(na("REASON")) +is.empty(na("REASON")) + +# na(NA) values are considered missing and empty +is.na(na(NA)) +is.empty(na(NA)) + +# regular NA values are also missing and empty +is.na(NA) +is.empty(NA) +``` + +Empty values often occur as the result of joins, because the `dplyr::*_join()` +family of functions do not have a `missing = ` parameter, +like `dplyr::if_else()` does. For example, +say we had the following data frame we wanted to join to our sample: + +```{r} +conditions <- tribble( + ~person_id, ~condition, + 1, "TREATMENT", + 2, "CONTROL", + 3, na("TECHNICAL_ERROR"), + 6, "CONTROL", + 8, "TREATMENT", +) +``` + +Because we're missing condition information for some of the respondents, +these show up as empty values when we join the data frame to our sample: + +```{r} +df |> + left_join(conditions, by = join_by(person_id)) +``` -# No value, no missing reason: -df_deinterlaced |> +We can remedy this by replacing these empty values after the join: + +```{r} +df |> + left_join(conditions, by = join_by(person_id)) |> mutate( - favorite_color = na_if(favorite_color, "BLUE") + condition = if_else(is.empty(condition), na("LEFT_STUDY"), condition), ) ``` -Notice the warnings! These operations produce dataframes that don't conform -to the rule of "one value -OR missing reason per variable row". 
We could manually solve this by manually -fixing the corresponding column, but as the above output hints, -interlacer provides an easier way by way of the function -`coalesce_channels()`. The next vignette, -`vignette("mutations")`, will show how this works! +## Writing interlaced files + +After you've made changes to your data, you probably want to save them. +interlacer provides the `write_interlaced_*` family of functions for this: + +```{r, eval = FALSE} +write_interlaced_csv(df, "interlaced_output.csv") +``` + +This will combine the value and missing reasons into interlaced character +columns, and write the result as a csv. Alternatively, if you want to +re-interlace the columns without writing to a file for more control in the +writing process, you can use `flatten_channels()`: + +```{r} +flatten_channels(df) + +# (it works on single vectors as well) +flatten_channels(df$age) +``` + +The value and missing reason channels of data frames with `interlaced` vectors +can similarly be accessed using the `value_channel()` and `na_channel()` helper +functions: + +```{r} +value_channel(df) +na_channel(df) +``` + +## Next steps + +So far, we've covered how interlacer's `read_interlaced_*` family +of functions enabled us to load `interlaced` columns that contain +separate channels for value and missing reasons. The `interlaced` type enables +us to create tidy and type-aware pipelines that can flexibly consider a +variable's value AND missing reasons. + +In all the examples in this vignette, column types were automatically detected. +To explicitly specify value and missing column types (and specify individual +missing reasons for specific columns), interlacer extends +`readr`'s `collector()` system.
This will be covered in the next vignette, +`vignette("na-column-types")` diff --git a/vignettes/mutations.Rmd b/vignettes/mutations.Rmd deleted file mode 100644 index 94a9718..0000000 --- a/vignettes/mutations.Rmd +++ /dev/null @@ -1,297 +0,0 @@ ---- -title: "Mutating Values and Missing Reasons" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Mutating Values and Missing Reasons} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -When working with a "deinterlaced data frame", care must be taken to ensure that -variables have a missing reason whenever a value is `NA`, and a -value whenever a missing reason is `NA`. When this rule is violated, it creates -ambiguous states. For example if a variable has a values AND a missing reason, -it's not clear which one represents the "correct" state of the variable. -Similarly, if a variable is missing its value AND its missing reason, it's -probably a sign we made a mistake somewhere. - -This means whenever we `mutate()` the values of a variable, the missing reasons -must also be updated, and vice versa. To illustrate this, let's load some -example data: - -```{r} -library(interlacer) - -(df <- read_interlaced_csv( - interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A"), -)) -``` - -Say we wanted to redact the values in the `age` variable, by setting their -missing reason to `REDACTED`: - -```{r} -library(dplyr, warn.conflicts = FALSE) - -df |> - mutate( - .age. = "REDACTED" - ) -``` - -As you can see by the warning message, we've created an ambigous situation: -there are now rows where `age` and `.age.` both have values! We need to get rid -of all the `age` values: - -```{r} -df |> - mutate( - .age. = "REDACTED", - age = NA, - ) -``` - -Before I reveal a better way to go about this, let's look at another motivating -example. 
Say our study was supposed to only let -participants choose between `RED` and `YELLOW` for their favorite colors -- -but for some reason `BLUE` was included as an option because of a technical -glitch. In this situation, we'd want to set all responses that weren't `RED` and -`YELLOW` to be considered missing: - -```{r} -df |> - mutate( - favorite_color = if_else( - favorite_color %in% c("RED", "YELLOW"), - favorite_color, - NA - ) - ) -``` - -As you can see by the warning, we've created another invalid state, with some -`favorite_color` responses having neither values nor missing reasons. To fix -this, we need to make a corresponding mutation to the missing reason column: - -```{r} -df |> - mutate( - favorite_color = if_else( - favorite_color %in% c("RED", "YELLOW"), - favorite_color, - NA - ), - .favorite_color. = if_else( - is.na(favorite_color) & is.na(.favorite_color.), - "TECHNICAL_ERROR", - .favorite_color. - ) - ) -``` - -To understand what's going on here, let's look at the mutation step by step: -First, where `favorite_color` is not `RED` or `YELLOW`, we set as a missing -value. In doing this, we've created a bunch of rows where both the value and -missing reason are absent. In next part of the mutation, we fill in the -`TECHNICAL_ERROR` missing reason for these rows into `.favorite_color.`, -resulting in a well-formed deinterlaced data frame. - -## An easier way with `coalesce_channels()` - -As you can imagine, manually fixing the value & missing reason structure -of your data frame for every mutation you do can get cumbersome! Luckily, -interlacer provides an easier way via `coalesce_channels()`: - -`coalesce_channels()` should be run every time you mutate something in -a deinterlaced data frame. It accepts two arguments `keep`, and -`missing_reason`. 
It fixes both possible problem cases as follows: - -Case 1: BOTH a value and a missing reason exists - -- Keep the value when `keep = 'value'` -- Keep the missing reason when `keep = 'missing'` - -Case 2: NEITHER a value nor a missing reason exists - -- Fill in the missing reason with `missing_reason` - -These rules allow us to mutate our deinterlaced variables without needing to -specify BOTH the values and missing reason actions -- we only need to think -about our intended operation in the context of one channel, and then a call to -`coalesce_channels()` can take care of the other for us. - -Here's how we'd use `coalesce_channels()` in the two examples from the previous -section: - -```{r} -df |> - mutate( - .age. = "REDACTED", - ) |> - coalesce_channels(keep = "missing") - -df |> - mutate( - favorite_color = if_else( - favorite_color %in% c("RED", "YELLOW"), - favorite_color, - NA - ) - ) |> - coalesce_channels(missing_reason = "TECHNICAL_ERROR") -``` - - -## Creating new columns - -`coalesce_channels()` will also automatically create missing reason -columns if they don't automatically exist. 
This is useful for adding new -variables to your data frame: - -```{r} -df |> - mutate( - person_type = if_else(age < 18, "CHILD", "ADULT"), - .after = person_id - ) |> - coalesce_channels(missing_reason = "AGE_UNAVAILABLE") -``` - -## Joining columns - -`coalesce_channels()` should also be used when joining new columns onto -an interlaced data frame, to fill in missing reasons when no matches are found: - -```{r} -conditions <- tribble( - ~person_id, ~condition, - 1, "TREATMENT", - 2, "CONTROL", - 3, "CONTROL", - 6, "CONTROL", - 8, "TREATMENT", -) - -df |> - left_join(conditions, by = join_by(person_id)) |> - relocate(condition, .after = person_id) |> - coalesce_channels(missing_reason = "LEFT_STUDY") -``` - -Deinterlaced data frames can be joined as well, but you need to include -both the value and missing reason columns for the key in `join_by()`: - -```{r} -conditions <- tribble( - ~person_id, ~.person_id., ~condition, ~.condition., - 1, NA, "TREATMENT", NA, - 2, NA, "CONTROL", NA, - 3, NA, NA, "TECHNICAL_ERROR", - 6, NA, "CONTROL", NA, - 8, NA, "TREATMENT", NA, -) - -df |> - left_join(conditions, by = join_by(person_id, .person_id.)) |> - relocate(condition, .after = person_id) |> - coalesce_channels(missing_reason = "LEFT_STUDY") -``` - -Use caution when your keys have missing reasons though: - -```{r} -df_left <- tribble( - ~a, ~.a., ~b, ~.b., - 1, NA, "a", NA, - NA, "REASON_1", "b", NA, - 3, NA, NA, "REASON_2", - 4, NA, "c", NA, - 5, NA, "d", NA, -) - -df_right <- tribble( - ~a, ~.a., ~c, ~.c., - 1, NA, "e", NA, - 4, NA, "g", NA, - 5, NA, "h", NA, - NA, "REASON_1", "i", NA, - NA, "REASON_1", "j", NA, - NA, "REASON_1", "k", NA, -) - -left_join(df_left, df_right, by = join_by(a, .a.)) |> - coalesce_channels(missing_reason = "REASON_3") -``` - -When keys have missing reasons, missing reasons will be matched as well as -values! 
Before you get mad at interlacer though, note how this situation echoes -a similar situation with missing values in regular data frames: - -```{r} -df_left <- tribble( - ~a, ~b, - 1, "a", - NA, "b", - 3, NA, - 4, "c", - 5, "d", -) - -df_right <- tribble( - ~a, ~c, - 1, "e", - 4, "g", - 5, "h", - NA, "i", - NA, "j", - NA, "k", -) - -left_join(df_left, df_right, by = join_by(a)) -``` - -In short, you always need to exercise caution when you have missing join keys, -regardless of if you're using interlacer or not! - -## Writing interlaced files - -After you've made made changes to your data, you probably want to save them. -Interlacer provides the `write_interlaced_*` family of functions for this: - -```{r, eval = FALSE} -write_interlaced_csv(df, "interlaced_output.csv") -``` - -This will combine the value and missing reasons into interlaced character -columns, and write the result as a csv. Alternatively, if you want to -re-interlace the columns without writing to a file for more control in the -writing process, you can use `interlace_channels()`: - -```{r} -interlace_channels(df) -``` - -## Final note: Setting the global default reason - -By default, `coalesce_channels()` will use `UNKNOWN_REASON` as the -default missing reason. Sometimes you want to use a different default value, -to act as the "catch-all" missing reason, so you don't have to constantly -specify it. 
To do this, set the global `interlacer.default_missing_reason` -option: - -```{r} -options(interlacer.default_missing_reason = -99) - -tibble( - a = c(1,2,3, NA, 5) -) |> - coalesce_channels() -``` diff --git a/vignettes/na-column-types.Rmd b/vignettes/na-column-types.Rmd new file mode 100644 index 0000000..98116e1 --- /dev/null +++ b/vignettes/na-column-types.Rmd @@ -0,0 +1,122 @@ +--- +title: "NA Column Types" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{NA Column Types} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +Like the `readr::read_*()` family of functions, `read_interlaced_*()` will +automatically guess column types by default: + +```{r} +library(interlacer, warn.conflicts = FALSE) + +(read_interlaced_csv( + interlacer_example("colors.csv"), + na = c("REFUSED", "OMITTED", "N/A"), + show_col_types = FALSE +)) +``` + +As with readr, these column type guesses can be overridden using the `col_types` +parameter with readr's `readr::col_*()` column specifiers: + +```{r} +library(readr) + +(read_interlaced_csv( + interlacer_example("colors.csv"), + na = c("REFUSED", "OMITTED", "N/A"), + col_types = cols( + person_id = col_integer(), + age = col_number(), + favorite_color = col_factor(levels = c("BLUE", "RED", "YELLOW", "GREEN")) + ) +)) +``` + +## `NA` collector types + +In addition to the standard `readr::col_*` column specification types, +interlacer provides the ability to specify missing +reasons at the column level, using the `na` parameter. + +This is useful when you have missing reasons that only apply to particular items +as opposed to the file as a whole. For example, say we had a measure with the +following two items: + +> 1. What is your current stress level? +> a. Low +> b. Moderate +> c. High +> d. I don't know +> e. I don't understand the question +> +> 2.
How well do you feel you manage your time and responsibilities today? +> a. Poorly +> b. Fairly well +> c. Well +> d. Very well +> e. Does not apply (Today was a vacation day) +> f. Does not apply (Other reason) + +As you can see, both items have two selection choices that should be mapped to +missing reasons. These can be specified with the `na_cols()` function, which +works similarly to readr's `cols()` function: + +```{r} +(df_stress <- read_interlaced_csv( + interlacer_example("stress.csv"), + col_types = cols( + person_id = col_integer(), + current_stress = col_factor( + levels = c("LOW", "MODERATE", "HIGH") + ), + time_management = col_factor( + levels = c("POORLY", "FAIRLY_WELL", "WELL", "VERY_WELL") + ) + ), + na = na_cols( + .default = c("REFUSED", "OMITTED", "N/A"), + current_stress = c(.default, "DONT_KNOW", "DONT_UNDERSTAND"), + time_management = c(.default, "NA_VACATION", "NA_OTHER") + ) +)) +``` + + +Setting na type to `NULL` indicates the column should be loaded +as a regular type instead of an `interlaced` one. The following +will load `person_id` as a regular, non-interlaced type: + +```{r} +read_interlaced_csv( + interlacer_example("colors_coded.csv"), + na = na_cols( + .default = c(-99, -98, -97), + person_id = NULL, + ), + show_col_types = FALSE +) +``` + +## Next steps + +In this vignette we covered how the column types for values and missing reasons +can be explicitly specified using collectors. We also illustrated how +column-level missing values can be specified by creating a missing channel +specification using `na_cols()`. + +In the final example, we used an example data set with coded values and missing +reasons. Coded values are especially common in data sets produced by SPSS, SAS, +and Stata. For some recipes for working with coded data like this, check +out the next vignette, `vignette("coded-data")`. 
diff --git a/vignettes/other-approaches.Rmd b/vignettes/other-approaches.Rmd index 5200cb4..4477559 100644 --- a/vignettes/other-approaches.Rmd +++ b/vignettes/other-approaches.Rmd @@ -14,27 +14,36 @@ knitr::opts_chunk$set( ) ``` -The [haven](https://haven.tidyverse.org/) package has functions for loading -native SPSS, SAS, and Stata file formats into -special data frames that use column attributes and special values to keep track -of interlaced values and missing reasons. In this section I discuss the -advantages and disadvantages of these approaches, and how they compare with -interlacer. In the final section, I describe what I think would be an "ideal" -way to work with missingness in R, and the technical challenges we'd have to -overcome in order to implement it. - -(Note: Future versions of interlacer could have the ability to convert haven -data frames to and from deinterlaced data frames, but I want to gauge interest -for this feature before I invest the time to implement it. If this is a -feature you'd use, [please let me know](mailto:kdh38@psu.edu)!) - -## "Labelled" missing values +interlacer was inspired by the [haven](https://haven.tidyverse.org/), +[labelled](https://larmarange.github.io/labelled/), and +[declared](https://dusadrian.github.io/declared/) packages. These packages +provide similar functionality to interlacer, but are more focused on +providing compatibility with missing reason data imported from SPSS, SAS, and +Stata. + +In this section I discuss some of the particularities of these +approaches, and how they compare with interlacer. + +(Note: Future versions of interlacer will have the ability to convert +`haven_labelled` and `declared` types to and from `interlaced` types.) 
+ +## haven and labelled + +The [haven](https://haven.tidyverse.org/) and +[labelled](https://larmarange.github.io/labelled/) packages rely on two +functions +for creating vectors that interlace values and missing reasons: +`haven::labelled_spss()` and `haven::tagged_na()`. Although they both create +`haven_labelled` vectors, they use very different methods for representing +missing values. + +### "Labelled" missing values (`haven::labelled_spss()`) When SPSS files are loaded with haven via `haven::read_spss()`, values and missing reasons are loaded into a single interlaced numeric vector: ```{r} -library(interlacer) +library(interlacer, warn.conflicts = FALSE) library(haven) library(dplyr) @@ -83,19 +92,18 @@ df_spss |> It's a little bit of an improvement to working with raw coded values, because you can use `is.na()`, and your codes get labels, so you don't have be -constantly looking up codes in your codebook. But it still falls short of the -interlacer approach for two key reasons: +constantly looking up codes in your codebook. But it still falls short of +interlacer's functionality for two key reasons: -Reason 1: With the interlacer approach -of having separate columns for values and missing reasons, your value column +Reason 1: With interlacer, your value column can be whatever type you want: numeric, character, factor, etc. With labelled missing reasons, values and missing reasons need to be the same type, usually numeric codes. This creates a lot more type gymnastics and potential errors when you're manipulating them. -Reason 2: Keeping interlaced columns, even when the missing values are labelled, -means aggregations and other math operatiosn are not protected. If you forget +Reason 2: Even when the missing values are labelled in the `labelled_spss` type, +aggregations and other math operations are not protected.
If you forget to take out your missing values, you get incorrect results / corrupted data: ```{r} @@ -119,7 +127,7 @@ df_spss |> ``` -## "Tagged" missing values +### "Tagged" missing values (`haven::tagged_na()`) For loading Stata and SAS files, haven uses a "tagged missingness" approach to mirror how these values are handled in Stata and SAS: @@ -165,108 +173,193 @@ df_stata |> Another limitation of this approach is that it requires values types to be numeric, because the trick of "tagging" the `NA` values depends on the -peculiarities of how floating point values are stored in memory. Again, -keeping separate columns for values and missing reasons solves all these issues. +peculiarities of how floating point values are stored in memory. -## The "ideal" approach +## declared -The biggest downside of keeping separate columns for values and missing reasons -are the invalid states that come up when you start trying to mutate your data -frames. `coalesce_channels()` helps a lot, but it's a pragmatic solution, -not an ideal one. +The [declared](https://dusadrian.github.io/declared/) package uses the +function `declared::declared()` for constructing interlaced vectors: -I think the ideal way to handle missing reasons would be to implement a proper -generic [`Result` type](https://en.wikipedia.org/wiki/Result_type) natively -into R's type system. A real `Result` type would act similar to haven's -`haven::tagged_na()`, but be a container for any type of value, not only -missing values.
+```{r} +library(declared) -In an early attempt of this library, I tried using nested data frames for this -effect: +(dcl <- declared(c(1, 2, 3, -99, -98), na_values = c(-99, -98))) +``` + +`declared` vectors are similar to `haven_labelled_spss` vectors, except with +a critical innovation: they store actual `NA` values where there are missing +values, and then keep track of the missing reasons entirely in the attributes +of the object: ```{r} -df_interlaced <- read_interlaced_csv( - interlacer_example("colors.csv"), - na = c("REFUSED", "OMITTED", "N/A") -) +# All the missing reason info is tracked in the attributes +attributes(dcl) -(df_nested <- tibble( - person_id = tibble( - v = df_interlaced$person_id, - m = df_interlaced$.person_id., - ), - age = tibble( - v = df_interlaced$age, - m = df_interlaced$.age., - ), - favorite_color = tibble( - v = df_interlaced$favorite_color, - m = df_interlaced$.favorite_color., - ) -)) + +# The data stored has actual NA values, so it works as you would expect +# with summary stats like `mean()`, etc. +attributes(dcl) <- NULL +dcl ``` -This sort of works, because we can use `$v` and `$m` to reference separate -channels of the data frame. Unfortunately it requires creating separate columns -when grouping: +This means aggregations work exactly as you would expect! ```{r} -df_nested |> - mutate( - favorite_color_missing = favorite_color$m - ) |> - summarize( - mean_age = mean(age$v, na.rm=T), - n = n(), - .by = favorite_color_missing - ) +dcl <- declared(c(1, 2, 3, -99, -98), na_values = c(-99, -98)) + +sum(dcl, na.rm=TRUE) ``` -And mutations get ugly, even though they're more "correct" from a strongly-typed -functional programming perspective... +## interlacer + +interlacer builds on the ideas of haven, labelled, and declared with the following +goals: + +1. Be fully generic: Add a missing value channel to *any* vector type.
+ +As mentioned above, `haven::labelled_spss()` only works with `numeric` +and `character` types, and `haven::tagged_na()` only works with `numeric` types. +`declared::declared()` supports `numeric`, `character` and `date` types. + +`interlaced` types, by contrast, can imbue *any* vector type with a missing value +channel: ```{r} -df_nested |> - mutate( - favorite_color = if_else( - favorite_color$v %in% c("RED", "YELLOW"), - tibble(v = favorite_color$v, m = NA), - tibble(v = NA, m = "TECHNICAL_ERROR") - ) - ) +interlaced(list(TRUE, FALSE, "reason"), na = "reason") + +interlaced(c("2020-01-01", "2020-01-02", "reason"), na = "reason") |> + map_value_channel(as.Date) + + +interlaced(c("red", "green", "reason"), na = "reason") |> + map_value_channel(factor) ``` -If we were to implement this somehow as a custom native type in R, I'd want -syntax something like this instead: +Like `declared` vectors, the missing reasons are tracked in the attributes. But +unlike `declared`, missing reasons are stored as an entirely separate *channel* +rather than by tracking their indices: -```{r, eval = FALSE} -df_mutated <- df |> - mutate( - favorite_color = if_else( - favorite_color %in% c("RED", "YELLOW"), - favorite_color, - missing_reason("TECHNICAL_ERROR") +```{r} +(int <- interlaced(c(1,2,3, -99, -98), na = c(-99, -98))) + +attributes(int) + +attributes(int) <- NULL +int +``` + +This data structure drives their functional API, described in (3) below. + +2. Provide functions for reading / writing interlaced CSV files (not just SPSS +/ SAS / Stata files) + +(See `interlacer::read_interlaced_csv()`, etc.) + +3. Provide a functional API that integrates well into tidy pipelines + +interlacer provides functions to facilitate working with the `interlaced` type +as a [Result type](https://en.wikipedia.org/wiki/Result_type), +a well-understood abstraction in functional programming. The functions `na()`, +`map_value_channel()`, and `map_na_channel()` all come from this influence.
+ +The `na()` function creates an `interlaced` type by "lifting" a value into +the missing reason channel. This approach helps create a safer separation between +the value and missing reason channels, because it's always clear which channel +you're making comparisons on. + +For example: + +```{r} +# haven +labelled_spss(c(-99, 1, 2), na_values = -99) == 1 # value channel comparison +labelled_spss(c(-99, 1, 2), na_values = -99) == -99 # na channel comparison + +# declared +declared(c(-99, 1, 2), na_values = -99) == 1 # value channel comparison +declared(c(-99, 1, 2), na_values = -99) == -99 # na channel comparison + +# interlacer +interlaced(c(-99, 1, 2), na = -99) == 1 # value channel comparison +interlaced(c(-99, 1, 2), na = -99) == na(-99) # na channel comparison +``` + +Similarly, `map_value_channel()` and `map_na_channel()` allow you to safely +mutate a particular channel, without touching the values of the +other channel. This interface is especially useful in tidy pipelines. + +Finally, because the `interlaced` type is based on the `vctrs` type system, it +plays nicely with all the packages in the tidyverse. + +## Questions for the future + +1. More flexible missing reason channel types? + +Earlier versions allowed arbitrary types to occupy +the missing reason channel (i.e. it was a fully generic Result +type). I ended +up restricting the missing reason channel to only allow `integer` or `factor` +types to help simplify the `na_cols()` specifications. When arbitrary types +are allowed, the `na_cols()` specs become quite long (e.g. +`column_name = factor(levels=c("REASON_1", "REASON_2"))`). As far as I can +tell, 99.9% of the time, it is preferable to use `integer` and `factor` +missing reason channels over `double` and `character` ones, so for now I've +made the executive decision to only allow `integer` and `factor` types. + +2. A better `na_cols()` specification? + +Right now, missing values are supplied in `na`, a separate argument from +`col_types`.
This means custom missing values get pretty far separated from +their `col_types` definitions: + +```{r} +read_interlaced_csv( + interlacer_example("stress.csv"), + col_types = cols( + person_id = col_integer(), + current_stress = col_factor( + levels = c("LOW", "MODERATE", "HIGH") + ), + time_management = col_factor( + levels = c("POORLY", "FAIRLY_WELL", "WELL", "VERY_WELL") ) + ), + na = na_cols( + .default = c("REFUSED", "OMITTED", "N/A"), + current_stress = c(.default, "DONT_KNOW", "DONT_UNDERSTAND"), + time_management = c(.default, "NA_VACATION", "NA_OTHER") ) - -df_mutated |> - summarize( - mean_age = mean(age, na.rm=T), - n = n(), - .by = missing_reason(favorite_color) - ) +) +``` + +In an earlier version I created an extension of readr collectors, a family +of `icol_*` types, that allowed you to do something like this: + +```{r, eval=FALSE} +read_interlaced_csv( + interlacer_example("stress.csv"), + col_types = cols( + person_id = col_integer(), + current_stress = icol_factor( + levels = c("LOW", "MODERATE", "HIGH"), + na = c("DONT_KNOW", "DONT_UNDERSTAND") + ), + time_management = icol_factor( + levels = c("POORLY", "FAIRLY_WELL", "WELL", "VERY_WELL"), + na = c("NA_VACATION", "NA_OTHER") + ) + ), + na = c("REFUSED", "OMITTED", "N/A") +) ``` -This would be "ideal" in my book: we can use values as usual, but anytime -we want to access the "missing reason" channel, we can wrap it in a -`missing_reason()` (similar to how `haven::tagged_na()` works). It's type safe -and super ergonomic. But implementing this would be a major headache and -involve very intimate knowledge of R internals... (@Hadley Wickham if by some -miracle you're reading this, could we talk sometime??) - -So this is why I'm using the present current "deinterlaced data frame" -approach. It is easy to understand and use, even though it's not "perfect" -from a strongly typed functional programming perspective.
If there's enough -demand for missing-reason-aware tooling in R though, it might convince me -to go down the "generic tagged type" rabbit hole... -[Please drop me a line](mailto:kdh38@psu.edu) to let me know what you think! +...I can't decide which interface I like better. Although the latter approach +feels cleaner because it folds custom missing reasons into the `cols` +definition, one disadvantage is that it cannot override missing values (e.g. +I cannot set the missing reason on `person_id` to `NULL` as long as there's a +default missing reason specified). +It also feels a little "hackish" to extend readr's types in this way; I think +making use of the `na` parameter in my own `na_cols()` function provides me +with a little bit more insulation from changes in readr. + +Anyway, if you have thoughts or opinions on any of these things, I'd really +[appreciate your feedback](mailto:kdh38@psu.edu)!