diff --git a/R/lists.R b/R/lists.R index 35e42e1..103832d 100644 --- a/R/lists.R +++ b/R/lists.R @@ -26,7 +26,7 @@ #' unlisted_length(l) # length of vector if we unlist #' paste0("length: ", length(print(unlist(l)))) #' -#' unlisted_length(l) - num_na(l) # Number of non-NA elements +#' unlisted_length(l) - na_count(l) # Number of non-NA elements #' #' # We can create and initialise a new list with a default value #' l <- new_list(20, 0L) diff --git a/R/overview.R b/R/overview.R index 1972e21..3bb621d 100644 --- a/R/overview.R +++ b/R/overview.R @@ -145,7 +145,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi n_false = NA_integer_[value_size])) lgl_out <- df_add_cols(lgl_out, list(p_true = NA_real_[value_size])) if (N > 0L && length(which_lgl) > 0) { - lgl_out$n_missing <- pluck_row(summarise_all(lgl_data, num_na), 1) + lgl_out$n_missing <- pluck_row(summarise_all(lgl_data, na_count), 1) lgl_out$p_complete <- pluck_row(summarise_all(lgl_data, prop_complete), 1) lgl_out$n_true <- pluck_row(summarise_all(lgl_data, function(x) sum(x, na.rm = TRUE)), 1) lgl_out$n_false <- N - lgl_out[["n_missing"]] - lgl_out[["n_true"]] @@ -173,7 +173,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi num_data <- transform_all(num_data, as.double, int64_vars) if (N > 0L && length(which_num) > 0) { - num_out$n_missing <- pluck_row(summarise_all(num_data, num_na), 1) + num_out$n_missing <- pluck_row(summarise_all(num_data, na_count), 1) num_out$p_complete <- pluck_row(summarise_all(num_data, prop_complete), 1) num_out$n_unique <- pluck_row(summarise_all(num_data, n_unique), 1) num_out$n_unique <- num_out$n_unique - (num_out$n_missing > 0L) @@ -210,7 +210,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi date_out <- df_add_cols(date_out, list(min = .Date(NA_real_[value_size]), max = .Date(NA_real_[value_size]))) if (N > 0L && length(which_date) > 0) { - date_out$n_missing <- 
pluck_row(summarise_all(date_data, num_na), 1) + date_out$n_missing <- pluck_row(summarise_all(date_data, na_count), 1) date_out$p_complete <- pluck_row(summarise_all(date_data, prop_complete), 1) date_out$n_unique <- pluck_row(summarise_all(date_data, n_unique), 1) date_out$n_unique <- date_out$n_unique - (date_out$n_missing > 0L) @@ -235,7 +235,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi datetime_out <- df_add_cols(datetime_out, list(min = .POSIXct(NA_real_[value_size]), max = .POSIXct(NA_real_[value_size]))) if (N > 0L && length(which_datetime) > 0) { - datetime_out$n_missing <- pluck_row(summarise_all(datetime_data, num_na), 1) + datetime_out$n_missing <- pluck_row(summarise_all(datetime_data, na_count), 1) datetime_out$p_complete <- pluck_row(summarise_all(datetime_data, prop_complete), 1) datetime_out$n_unique <- pluck_row(summarise_all(datetime_data, n_unique), 1) datetime_out$n_unique <- datetime_out$n_unique - (datetime_out$n_missing > 0L) @@ -280,7 +280,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi cat_out <- df_add_cols(cat_out, list(min = NA_character_[value_size], max = NA_character_[value_size])) if (N > 0L && length(which_cat) > 0) { - cat_out$n_missing <- pluck_row(summarise_all(cat_data, num_na), 1) + cat_out$n_missing <- pluck_row(summarise_all(cat_data, na_count), 1) cat_out$p_complete <- pluck_row(summarise_all(cat_data, prop_complete), 1) cat_out$n_unique <- pluck_row(summarise_all(cat_data, n_unique), 1) cat_out$n_unique <- cat_out$n_unique - (cat_out$n_missing > 0L) @@ -306,7 +306,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi other_out <- df_add_cols(other_out, list(n_unique = NA_integer_[value_size])) if (N > 0L && length(which_other) > 0) { other_out$n_missing <- pluck_row(summarise_all( - other_data, function(x) num_na(x, recursive = FALSE) + other_data, function(x) na_count(x, recursive = FALSE) ), 1) 
other_out$p_complete <- pluck_row(summarise_all( other_data, function(x) prop_complete(x, recursive = FALSE) @@ -421,7 +421,7 @@ prop_missing <- function(x, recursive = TRUE){ } else { N <- cpp_vec_length(x) } - num_na(x, recursive = recursive) / N + na_count(x, recursive = recursive) / N } prop_complete <- function(x, recursive = TRUE){ 1 - prop_missing(x, recursive = recursive) diff --git a/R/scalars.R b/R/scalars.R index 0c8bd59..008ae53 100644 --- a/R/scalars.R +++ b/R/scalars.R @@ -15,14 +15,36 @@ #' @param replace Replacement scalar value. #' #' @details +#' The `val_` functions allow you to very efficiently work with +#' scalars, i.e. length 1 vectors. Many common operations like +#' counting the occurrence of `NA` or zeros, e.g. `sum(x == 0)` or +#' `sum(is.na(x))` can be replaced more efficiently with +#' `val_count(x, 0)` and `na_count(x)` respectively. +#' #' At the moment these functions only work for #' integer, double and character vectors with the exception of the `NA` #' functions. #' They are intended mainly for developers who wish to write cheaper code -#' and reduce expensive vector operations. For example -#' `val_count(x, 0)` will always be cheaper than `sum(x == 0)`. +#' and reduce expensive vector operations. +#' +#' * `val_count()` - Counts occurrences of a value +#' * `val_find()` - Finds locations (indices) of a value +#' * `val_replace()` - Replaces value with another value +#' * `val_rm()` - Removes occurrences of value from an object +#' +#' There are `NA` equivalent convenience functions. +#' +#' * `na_count()` == `val_count(x, NA)` +#' * `na_find()` == `val_find(x, NA)` +#' * `na_replace()` == `val_replace(x, NA)` +#' * `na_rm()` == `val_rm(x, NA)` +#' +#' `val_count()` and `val_replace()` can work recursively. For example, +#' when applied to a data frame, `na_replace` will replace `NA` values across +#' the entire data frame with the specified replacement value.
#' -#' Historically function naming has not been consistent, though going forward +#' In 'cheapr' function-naming conventions have not been consistent but +#' going forward #' all scalar functions (including the `NA` convenience functions) will be #' prefixed with 'val_' and 'na_' respectively. #' Functions named with the older naming scheme like `which_na` may be diff --git a/man/lists.Rd b/man/lists.Rd index 21f4e47..fca2002 100644 --- a/man/lists.Rd +++ b/man/lists.Rd @@ -42,7 +42,7 @@ lengths_(l) # Faster lengths() unlisted_length(l) # length of vector if we unlist paste0("length: ", length(print(unlist(l)))) -unlisted_length(l) - num_na(l) # Number of non-NA elements +unlisted_length(l) - na_count(l) # Number of non-NA elements # We can create and initialise a new list with a default value l <- new_list(20, 0L) diff --git a/man/scalars.Rd b/man/scalars.Rd index 8e64485..86d8ba2 100644 --- a/man/scalars.Rd +++ b/man/scalars.Rd @@ -65,14 +65,38 @@ They are particularly useful for working with \code{NA} values in a fast and efficient manner. } \details{ +The \code{val_} functions allow you to very efficiently work with +scalars, i.e. length 1 vectors. Many common operations like +counting the occurrence of \code{NA} or zeros, e.g. \code{sum(x == 0)} or +\code{sum(is.na(x))} can be replaced more efficiently with +\code{val_count(x, 0)} and \code{na_count(x)} respectively. + At the moment these functions only work for integer, double and character vectors with the exception of the \code{NA} functions. They are intended mainly for developers who wish to write cheaper code -and reduce expensive vector operations. For example -\code{val_count(x, 0)} will always be cheaper than \code{sum(x == 0)}. +and reduce expensive vector operations.
+\itemize{ +\item \code{val_count()} - Counts occurrences of a value +\item \code{val_find()} - Finds locations (indices) of a value +\item \code{val_replace()} - Replaces value with another value +\item \code{val_rm()} - Removes occurrences of value from an object +} + +There are \code{NA} equivalent convenience functions. +\itemize{ +\item \code{na_count()} == \code{val_count(x, NA)} +\item \code{na_find()} == \code{val_find(x, NA)} +\item \code{na_replace()} == \code{val_replace(x, NA)} +\item \code{na_rm()} == \code{val_rm(x, NA)} +} + +\code{val_count()} and \code{val_replace()} can work recursively. For example, +when applied to a data frame, \code{na_replace} will replace \code{NA} values across +the entire data frame with the specified replacement value. -Historically function naming has not been consistent, though going forward +In 'cheapr' function-naming conventions have not been consistent but +going forward all scalar functions (including the \code{NA} convenience functions) will be prefixed with 'val_' and 'na_' respectively. Functions named with the older naming scheme like \code{which_na} may be