From 022c67f02eb57e5376ae9d937e5ee9896629416d Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 11:06:23 +0200 Subject: [PATCH 1/7] Fix typo --- README.Rmd | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.Rmd b/README.Rmd index 7f53d2c..99b6ef9 100644 --- a/README.Rmd +++ b/README.Rmd @@ -202,7 +202,7 @@ ex |> You may notice that on large datasets `interlacer` runs significantly slower than `readr` / `vroom`. Although `interlacer` uses `vroom` under the hood to load delimited data, it is not able to take advantage of many of its optimizations -because `vroom` does not +because `vroom` [does not currently support](https://github.com/tidyverse/vroom/issues/532) column-level missing values. As soon as `vroom` supports column-level missing values, I will be able to remedy this! diff --git a/README.md b/README.md index fe6c0fc..6cafbb5 100644 --- a/README.md +++ b/README.md @@ -282,7 +282,7 @@ ex |> You may notice that on large datasets `interlacer` runs significantly slower than `readr` / `vroom`. Although `interlacer` uses `vroom` under the hood to load delimited data, it is not able to take advantage of -many of its optimizations because `vroom` does not [does not currently +many of its optimizations because `vroom` [does not currently support](https://github.com/tidyverse/vroom/issues/532) column-level missing values. As soon as `vroom` supports column-level missing values, I will be able to remedy this! From 2976b7cbe9af024a6af0baaa7f63904e4cacdad7 Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 12:27:56 +0200 Subject: [PATCH 2/7] Use `na.rm = TRUE`, not shorthand --- README.Rmd | 4 ++-- README.md | 4 ++-- vignettes/coded-data.Rmd | 8 ++++---- vignettes/interlacer.Rmd | 6 +++--- vignettes/other-approaches.Rmd | 10 +++++----- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/README.Rmd b/README.Rmd index 99b6ef9..17f26b0 100644 --- a/README.Rmd +++ b/README.Rmd @@ -135,7 +135,7 @@ ex$age Computations automatically operate on values: ```{r} -mean(ex$age, na.rm=TRUE) +mean(ex$age, na.rm = TRUE) ``` But the missing reasons are still there! To indicate a value should be treated @@ -156,7 +156,7 @@ reason: ```{r} ex |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) %>% diff --git a/README.md b/README.md index 6cafbb5..111e829 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ ex$age Computations automatically operate on values: ``` r -mean(ex$age, na.rm=TRUE) +mean(ex$age, na.rm = TRUE) #> [1] 25.375 ``` @@ -199,7 +199,7 @@ missing reason: ``` r ex |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) %>% diff --git a/vignettes/coded-data.Rmd b/vignettes/coded-data.Rmd index d00c435..d7d0f55 100644 --- a/vignettes/coded-data.Rmd +++ b/vignettes/coded-data.Rmd @@ -80,7 +80,7 @@ df_coded |> age = if_else(age > 0, age, NA) ) |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> @@ -102,7 +102,7 @@ df_coded |> # age = if_else(age > 0, age, NA) ) |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> @@ -169,7 +169,7 @@ keep cross-referencing your codebook to know what values mean: ```{r} df_decoded |> summarize( - mean_age = mean(age, na.rm=TRUE), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> @@ -229,7 +229,7 @@ df_coded_char |> age = if_else(!is.na(as.numeric(age)), as.numeric(age), NA) ) |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> diff --git a/vignettes/interlacer.Rmd b/vignettes/interlacer.Rmd index 047ea66..15a0637 100644 --- a/vignettes/interlacer.Rmd +++ b/vignettes/interlacer.Rmd @@ -63,7 +63,7 @@ library(dplyr, warn.conflicts = FALSE) df_simple |> summarize( - mean_age = mean(age, na.rm = T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> @@ -98,7 +98,7 @@ df_with_missing |> age_values = as.numeric(if_else(age %in% reasons, NA, age)), ) |> summarize( - mean_age = mean(age_values, na.rm=T), + mean_age = mean(age_values, na.rm = TRUE), n = n(), .by = favorite_color ) |> @@ -169,7 +169,7 @@ the unique missing reasons, rather than being lumped into a single `NA`: ```{r} df |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color ) |> diff --git a/vignettes/other-approaches.Rmd b/vignettes/other-approaches.Rmd index 4477559..6743ebd 100644 --- a/vignettes/other-approaches.Rmd +++ b/vignettes/other-approaches.Rmd @@ -78,7 +78,7 @@ df_spss |> ) ) |> summarize( - mean_age = mean(age_values, na.rm=T), + mean_age = mean(age_values, na.rm = TRUE), n = n(), .by = favorite_color_missing_reasons ) @@ -114,7 +114,7 @@ df_spss |> ) ) |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color_missing_reasons ) @@ -151,7 +151,7 @@ character "tag" (usually a letter from a-z). This means that they work with ```{r} is.na(df_stata$age) -mean(df_stata$age, na.rm=TRUE) +mean(df_stata$age, na.rm = TRUE) ``` Unfortunately, you can't group by them, because `dplyr::group_by()` is not @@ -165,7 +165,7 @@ df_stata |> ) ) |> summarize( - mean_age = mean(age, na.rm=T), + mean_age = mean(age, na.rm = TRUE), n = n(), .by = favorite_color_missing_reasons ) @@ -207,7 +207,7 @@ This means aggregations work exactly as you would expect! ```{r} dcl <- declared(c(1, 2, 3, -99, -98), na_values = c(-99, -98)) -sum(dcl, na.rm=TRUE) +sum(dcl, na.rm = TRUE) ``` ## interlacer From abc98b12412e7381363f981d3d3d0f29abc9d07b Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 12:28:03 +0200 Subject: [PATCH 3/7] Add period --- vignettes/interlacer.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/interlacer.Rmd b/vignettes/interlacer.Rmd index 15a0637..0c53365 100644 --- a/vignettes/interlacer.Rmd +++ b/vignettes/interlacer.Rmd @@ -392,4 +392,4 @@ In all the examples in this vignette, column types were automatically detected. To explicitly specify value and missing column types, (and specify individual missing reasons for specific columns), interlacer extends `readr`'s `collector()` system. This will be covered in the next vignette, -`vignette("na-column-types")` +`vignette("na-column-types")`. From 03de813fa2217ed2beea5cf80750ea0c827b73a9 Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 12:37:27 +0200 Subject: [PATCH 4/7] Use somewhat more readable layout for stress example Follows format in coded-data --- vignettes/na-column-types.Rmd | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vignettes/na-column-types.Rmd b/vignettes/na-column-types.Rmd index 98116e1..fd5324d 100644 --- a/vignettes/na-column-types.Rmd +++ b/vignettes/na-column-types.Rmd @@ -54,14 +54,16 @@ This is useful when you have missing reasons that only apply to particular items as opposed to the file as a whole. For example, say we had a measure with the following two items: -> 1. What is your current stress level? +1. What is your current stress level? + > a. Low > b. Moderate > c. High > d. I don't know > e. I don't understand the question -> -> 2. How well do you feel you manage your time and responsibilities today? + +2. How well do you feel you manage your time and responsibilities today? + > a. Poorly > b. Fairly well > c. Well From 232e745d03a32b589ab20f6f1a4f16a54dd2e5a7 Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 12:39:01 +0200 Subject: [PATCH 5/7] Wrap values in backticks --- vignettes/coded-data.Rmd | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/vignettes/coded-data.Rmd b/vignettes/coded-data.Rmd index d7d0f55..a7c3969 100644 --- a/vignettes/coded-data.Rmd +++ b/vignettes/coded-data.Rmd @@ -47,19 +47,19 @@ read_file( Where missing reasons are: -> -99: N/A +> `-99`: N/A > -> -98: REFUSED +> `-98`: REFUSED > -> -97: OMITTED +> `-97`: OMITTED And colors are coded: -> 1: BLUE +> `1`: BLUE > -> 2: RED +> `2`: RED > -> 3: YELLOW +> `3`: YELLOW This format gives you the ability to load everything as a numeric type: @@ -185,8 +185,6 @@ df_decoded |> ) ``` - - ## Numeric codes with character missing reasons (SAS, Stata) Like SPSS, SAS and Stata will encode factor levels as numeric values, but @@ -203,12 +201,11 @@ read_file( Here, the same value codes are used as the previous example, except the missing reasons are coded as follows: -> ".": N/A +> `"."`: N/A > -> ".a": REFUSED +> `".a"`: REFUSED > -> ".b": OMITTED - +> `".b"`: OMITTED To handle these missing reasons without interlacer, columns must be loaded as character vectors: From 053f078f977e6952ce15efa27182982d2a5fb76b Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 13:03:24 +0200 Subject: [PATCH 6/7] Use h3 headers --- vignettes/other-approaches.Rmd | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vignettes/other-approaches.Rmd b/vignettes/other-approaches.Rmd index 6743ebd..a7ee062 100644 --- a/vignettes/other-approaches.Rmd +++ b/vignettes/other-approaches.Rmd @@ -195,7 +195,6 @@ of the object: # All the missing reason info is tracked in the attributes attributes(dcl) - # The data stored has actual NA values, so it works as you would expect # with summary stats like `mean()`, etc. attributes(dcl) <- NULL @@ -215,7 +214,7 @@ sum(dcl, na.rm = TRUE) interlacer builds on the ideas of haven, labelled, and declared with following goals: -1. Be fully generic: Add a missing value channel to *any* vector type. +### 1. Be fully generic: Add a missing value channel to *any* vector type As mentioned above, `haven::labelled_spss()` only works with `numeric` and `character` types, and `haven::tagged_na()` only works with `numeric` types. @@ -250,12 +249,12 @@ int This data structure drives their functional API, described in (3) below. -2. Provide functions for reading / writing interlaced CSV files (not just SPSS +### 2. Provide functions for reading / writing interlaced CSV files (not just SPSS / SAS / Stata files) -(See `interlacer::read_interlaced_csv()`, etc.) +See `interlacer::read_interlaced_csv()`, etc. -3. Provide a functional API that integrates well into tidy pipelines +### 3. Provide a functional API that integrates well into tidy pipelines interlacer provides functions to facilitate working with the `interlaced` type as a [Result type](https://en.wikipedia.org/wiki/Result_type), @@ -292,7 +291,7 @@ plays nicely with all the packages in the tidyverse. ## Questions for the future -1. More flexible missing reason channel types? +### 1. More flexible missing reason channel types? Earlier versions allowed arbitrary types to occupy the missing reason channel (i.e. it was a fully generic Result @@ -305,7 +304,7 @@ tell, in 99.9% of the time, it is preferable to use `integer` and `factor` missing reason channels over `double` and `character` ones, so for now I've made the executive decision to only allow `integer` and `factor` types. -2. A better `na_cols()` specification? +### 2. A better `na_cols()` specification? Right now, missing values are supplied in `na` a separate argument from `col_types`. This means custom missing values get pretty far separated from From de009861abe6e090b4dbfcf107e0a831c37904cc Mon Sep 17 00:00:00 2001 From: Peter Desmet Date: Fri, 7 Jun 2024 13:03:34 +0200 Subject: [PATCH 7/7] Fix typos --- vignettes/other-approaches.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/other-approaches.Rmd b/vignettes/other-approaches.Rmd index a7ee062..fe31cd8 100644 --- a/vignettes/other-approaches.Rmd +++ b/vignettes/other-approaches.Rmd @@ -103,7 +103,7 @@ This creates a lot more type gymnastics and potential errors when you're manipulating them. Reason 2: Even when the missing values are labelled in the `labelled_spss` type, -aggregations and other math operatiosn are not protected. If you forget +aggregations and other math operations are not protected. If you forget to take out your missing values, you get incorrect results / corrupted data: ```{r} @@ -155,7 +155,7 @@ mean(df_stata$age, na.rm = TRUE) ``` Unfortunately, you can't group by them, because `dplyr::group_by()` is not -missing tag-aware :( +tag-aware. :( ```{r} df_stata |> @@ -306,7 +306,7 @@ made the executive decision to only allow `integer` and `factor` types. ### 2. A better `na_cols()` specification? -Right now, missing values are supplied in `na` a separate argument from +Right now, missing values are supplied in a separate argument from `col_types`. This means custom missing values get pretty far separated from their `col_type` definitions: