diff --git a/.travis.yml b/.travis.yml index 6b2494bd..55e90b40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,13 @@ warnings_are_errors: true env: global: - NOT_CRAN=false +addons: + apt: + packages: + - libudunits2-dev + - gdal-bin + - libgdal1-dev + - libproj-dev before_install: - echo "options(repos = c(CRAN='http://cran.rstudio.com'))" > ~/.Rprofile r_github_packages: diff --git a/DESCRIPTION b/DESCRIPTION index 8062adde..6ae3c8dd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,11 +49,12 @@ Imports: tidyr (>= 1.0), tidyselect (>= 0.2.5) Suggests: + covr, extrafont, rmarkdown, + sf, testthat (>= 2.0.0), - withr, - covr + withr License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/skim_with.R b/R/skim_with.R index adb73edf..1eb23bc1 100644 --- a/R/skim_with.R +++ b/R/skim_with.R @@ -1,10 +1,7 @@ #' Set or add the summary functions for a particular type of data #' #' While skim is designed around having an opinionated set of defaults, you -#' can use these functions to change the summary statistics that it returns. -#' To do that, provide type you wish to change as an argument to this function, -#' along with a list of named functions that you want to use instead of the -#' defaults. +#' can use this function to change the summary statistics that it returns. #' #' `skim_with()` is a closure: a function that returns a new function. This #' lets you have several skimming functions in a single R session, but it @@ -14,7 +11,11 @@ #' You assign values within `skim_with` by using the [sfl()] helper (`skimr` #' function list). This helper behaves mostly like [dplyr::funs()], but lets #' you also identify which skimming functions you want to remove, by setting -#' them to `NULL`. +#' them to `NULL`. Assign an `sfl` to each column type that you wish to modify. +#' Functions that summarize all data types, and always return the same type +#' of value, can be assigned to the `base` argument. 
The default base skimmers +#' compute the number of missing values `n_missing` and the rate of values being +#' complete, i.e. not missing. #' #' When `append = TRUE` and local skimmers have names matching the names of #' entries in the default `skim_function_list`, the values in the default list @@ -68,6 +69,7 @@ skim_with <- function(..., local_skimmers <- validate_assignment(...) function(data, ...) { + data_name <- rlang::expr_label(substitute(data)) if (!is.data.frame(data)) { data <- as.data.frame(data) } @@ -110,7 +112,7 @@ skim_with <- function(..., class = c("skim_df", "tbl_df", "tbl", "data.frame"), data_rows = nrow(data), data_cols = ncol(data), - df_name = rlang::expr_label(substitute(data)), + df_name = data_name, groups = dplyr::groups(data), base_skimmers = names(base$funs), skimmers_used = get_skimmers_used(unique_skimmers) @@ -193,8 +195,12 @@ get_final_skimmers <- function(column, data, local_skimmers, append) { if (is.null(locals$funs)) { if (defaults$skim_type == "default") { - warning("Couldn't find skimmers for class: %s; No user-defined `sfl` ", - "provided. Falling back to `character`.", + msg <- sprintf( + "Couldn't find skimmers for class: %s;", + paste(all_classes, collapse = ", ") + ) + warning(msg, + " No user-defined `sfl` provided. Falling back to `character`.", call. = FALSE ) data[[column]] <- as.character(data[[column]]) @@ -284,39 +290,48 @@ mangle_names <- function(skimmers, base_names) { #' mangle the function names. That way, each set of relevant columns begin #' with the column name + `_` + our internal delimiter. #' +#' @param mangled_skimmers The `sfl`s whose function names have been mangled. +#' @param variable_names The names of columns in the original data, matching a +#' data type, that will be summarized. +#' @param data The original data. 
#' @keywords internal #' @noRd -skim_by_type <- function(mangled, columns, data) { +skim_by_type <- function(mangled_skimmers, variable_names, data) { UseMethod("skim_by_type", data) } #' @export -skim_by_type.grouped_df <- function(mangled, columns, data) { +skim_by_type.grouped_df <- function(mangled_skimmers, variable_names, data) { group_columns <- dplyr::groups(data) grouped <- dplyr::group_by(data, !!!group_columns) - skimmed <- dplyr::summarize_at(grouped, columns, mangled$funs) - build_results(skimmed, columns, group_columns) + skimmed <- dplyr::summarize_at(grouped, variable_names, mangled_skimmers$funs) + build_results(skimmed, variable_names, group_columns) } #' @export -skim_by_type.data.frame <- function(mangled, columns, data) { - skimmed <- dplyr::summarize_at(data, columns, mangled$funs) - build_results(skimmed, columns, NULL) +skim_by_type.data.frame <- function(mangled_skimmers, variable_names, data) { + skimmed <- dplyr::summarize_at(data, variable_names, mangled_skimmers$funs) + build_results(skimmed, variable_names, NULL) } #' Summarize returns a single row data frame, make it tall. 
#' @noRd -build_results <- function(skimmed, data_cols, groups) { - if (length(data_cols) > 1) { +build_results <- function(skimmed, variable_names, groups) { + if (length(variable_names) > 1) { out <- tibble::tibble( - skim_variable = data_cols, - by_variable = purrr::map(data_cols, reshape_skimmed, skimmed, groups) + skim_variable = variable_names, + by_variable = purrr::map(variable_names, reshape_skimmed, skimmed, groups) ) tidyr::unnest(out, .data$by_variable) } else { + out <- dplyr::select( + as.data.frame(skimmed), + !!!groups, + tidyselect::contains(NAME_DELIMETER) + ) tibble::tibble( - skim_variable = data_cols, - !!!set_clean_names(skimmed) + skim_variable = variable_names, + !!!set_clean_names(out) ) } } @@ -324,7 +339,7 @@ build_results <- function(skimmed, data_cols, groups) { reshape_skimmed <- function(column, skimmed, groups) { delim_name <- paste0(column, "_", NAME_DELIMETER) out <- dplyr::select( - skimmed, + as.data.frame(skimmed), !!!groups, tidyselect::starts_with(delim_name) ) diff --git a/R/summary.R b/R/summary.R index 64b3f273..fe5559d7 100644 --- a/R/summary.R +++ b/R/summary.R @@ -13,12 +13,12 @@ summary.skim_df <- function(object, ...) { if (is.null(object)) { stop("dataframe is null.") } - df_name <- df_name(object) - df_name <- ifelse(df_name %in% c("`.`", ".data"), "Piped data", df_name) - df_name <- gsub("`", "", df_name) - df_name <- ifelse(nchar(df_name) > 25, - paste0(substring(df_name, 1, 25), "..."), - df_name + data_name <- df_name(object) + data_name <- ifelse(data_name %in% c("`.`", ".data"), "Piped data", data_name) + data_name <- gsub("`", "", data_name) + data_name <- ifelse(nchar(data_name) > 25, + paste0(substring(data_name, 1, 25), "..."), + data_name ) duplicated <- duplicated(object$skim_variable) @@ -32,7 +32,7 @@ summary.skim_df <- function(object, ...) 
{ ) summary_object <- c( - df_name, + data_name, data_rows(object), data_cols(object), " ", diff --git a/codemeta.json b/codemeta.json index 71fd2f74..cb25d9af 100644 --- a/codemeta.json +++ b/codemeta.json @@ -162,6 +162,18 @@ } ], "softwareSuggestions": [ + { + "@type": "SoftwareApplication", + "identifier": "covr", + "name": "covr", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=covr" + }, { "@type": "SoftwareApplication", "identifier": "extrafont", @@ -188,40 +200,40 @@ }, { "@type": "SoftwareApplication", - "identifier": "testthat", - "name": "testthat", - "version": ">= 2.0.0", + "identifier": "sf", + "name": "sf", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", "name": "Comprehensive R Archive Network (CRAN)", "url": "https://cran.r-project.org" }, - "sameAs": "https://CRAN.R-project.org/package=testthat" + "sameAs": "https://CRAN.R-project.org/package=sf" }, { "@type": "SoftwareApplication", - "identifier": "withr", - "name": "withr", + "identifier": "testthat", + "name": "testthat", + "version": ">= 2.0.0", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", "name": "Comprehensive R Archive Network (CRAN)", "url": "https://cran.r-project.org" }, - "sameAs": "https://CRAN.R-project.org/package=withr" + "sameAs": "https://CRAN.R-project.org/package=testthat" }, { "@type": "SoftwareApplication", - "identifier": "covr", - "name": "covr", + "identifier": "withr", + "name": "withr", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", "name": "Comprehensive R Archive Network (CRAN)", "url": "https://cran.r-project.org" }, - "sameAs": "https://CRAN.R-project.org/package=covr" + "sameAs": "https://CRAN.R-project.org/package=withr" } ], "softwareRequirements": [ @@ -391,7 +403,7 @@ ], "releaseNotes": 
"https://github.com/ropensci/skimr/blob/master/NEWS.md", "readme": "https://github.com/ropensci/skimr/blob/master/README.md", - "fileSize": "4120.816KB", + "fileSize": "3788.726KB", "contIntegration": [ "https://travis-ci.org/ropenscilabs/skimr", "https://codecov.io/gh/ropenscilabs/skimr" diff --git a/man/skim_with.Rd b/man/skim_with.Rd index befd8d0b..5b1a0e4b 100644 --- a/man/skim_with.Rd +++ b/man/skim_with.Rd @@ -22,10 +22,7 @@ more details. } \description{ While skim is designed around having an opinionated set of defaults, you -can use these functions to change the summary statistics that it returns. -To do that, provide type you wish to change as an argument to this function, -along with a list of named functions that you want to use instead of the -defaults. +can use this function to change the summary statistics that it returns. } \details{ \code{skim_with()} is a closure: a function that returns a new function. This @@ -36,7 +33,11 @@ you can use it. You assign values within \code{skim_with} by using the \code{\link[=sfl]{sfl()}} helper (\code{skimr} function list). This helper behaves mostly like \code{\link[dplyr:funs]{dplyr::funs()}}, but lets you also identify which skimming functions you want to remove, by setting -them to \code{NULL}. +them to \code{NULL}. Assign an \code{sfl} to each column type that you wish to modify. +Functions that summarize all data types, and always return the same type +of value, can be assigned to the \code{base} argument. The default base skimmers +compute the number of missing values \code{n_missing} and the rate of values being +complete, i.e. not missing. 
When \code{append = TRUE} and local skimmers have names matching the names of entries in the default \code{skim_function_list}, the values in the default list diff --git a/tests/testthat/helper-expectations.R b/tests/testthat/helper-expectations.R index 7acaa92b..2f17a99d 100644 --- a/tests/testthat/helper-expectations.R +++ b/tests/testthat/helper-expectations.R @@ -30,14 +30,15 @@ expect_NA <- function(object) { expect_print_matches_file <- function(object, filename, - skip_on_windows = TRUE) { + skip_on_windows = TRUE, + width = 100) { if (skip_on_windows) testthat::skip_on_os("windows") - withr::with_options(list(crayon.enabled = FALSE), { + withr::with_options(list(crayon.enabled = FALSE, width = width), { testthat::expect_known_output( print(object), filename, update = FALSE, - width = 100 + width = width ) }) } diff --git a/tests/testthat/print/smaller.txt b/tests/testthat/print/smaller.txt index e3aac13c..e0b78845 100644 --- a/tests/testthat/print/smaller.txt +++ b/tests/testthat/print/smaller.txt @@ -10,13 +10,20 @@ Column type frequency: ________________________ Group variables None -── Variable type: factor ─────────────────────────────────────────────────────────────────────────── - skim_variable n_missing complete_rate ordered n_unique top_counts -1 Species 0 1 FALSE 3 set: 50, ver: 50, vir: 50 +── Variable type: factor ───────────────────────── + skim_variable n_missing complete_rate ordered +1 Species 0 1 FALSE + n_unique top_counts +1 3 set: 50, ver: 50, vir: 50 -── Variable type: numeric ────────────────────────────────────────────────────────────────────────── - skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist -1 Sepal.Length 0 1 5.84 0.828 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂ -2 Sepal.Width 0 1 3.06 0.436 2 2.8 3 3.3 4.4 ▁▆▇▂▁ -3 Petal.Length 0 1 3.76 1.77 1 1.6 4.35 5.1 6.9 ▇▁▆▇▂ -4 Petal.Width 0 1 1.20 0.762 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃ +── Variable type: numeric ──────────────────────── + skim_variable n_missing complete_rate mean +1 
Sepal.Length 0 1 5.84 +2 Sepal.Width 0 1 3.06 +3 Petal.Length 0 1 3.76 +4 Petal.Width 0 1 1.20 + sd p0 p25 p50 p75 p100 hist +1 0.828 4.3 5.1 5.8 6.4 7.9 ▆▇▇▅▂ +2 0.436 2 2.8 3 3.3 4.4 ▁▆▇▂▁ +3 1.77 1 1.6 4.35 5.1 6.9 ▇▁▆▇▂ +4 0.762 0.1 0.3 1.3 1.8 2.5 ▇▁▇▅▃ diff --git a/tests/testthat/test-skim_print.R b/tests/testthat/test-skim_print.R index be573731..dcc70095 100644 --- a/tests/testthat/test-skim_print.R +++ b/tests/testthat/test-skim_print.R @@ -86,10 +86,8 @@ test_that("Print focused objects appropriately", { }) test_that("Metadata is stripped from smaller consoles", { - withr::with_options(list(width = 50), { - skimmed <- skim(iris) - expect_print_matches_file(skimmed, "print/smaller.txt") - }) + skimmed <- skim(iris) + expect_print_matches_file(skimmed, "print/smaller.txt", width = 50) }) test_that("Crayon is supported", { diff --git a/vignettes/Supporting_additional_objects.Rmd b/vignettes/Supporting_additional_objects.Rmd index 47e46730..bb1868ef 100644 --- a/vignettes/Supporting_additional_objects.Rmd +++ b/vignettes/Supporting_additional_objects.Rmd @@ -30,31 +30,28 @@ example. However to run it on your own you can install `sf` and then run the following code. Note that code in this vignette was not evaluated when rendering the vignette in order to avoid forcing installation of sf. 
-```{r, eval = FALSE} +```{r} +library(skimr) library(sf) nc <- st_read(system.file("shape/nc.shp", package = "sf")) ``` -``` -Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0 -Reading layer `nc' from data source -`/path/to/library/sf/shape/nc.shp' using driver `ESRI Shapefile' -Simple feature collection with 100 features and 14 fields -geometry type: MULTIPOLYGON -dimension: XY -bbox: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965 -epsg (SRID): 4267 -proj4string: +proj=longlat +datum=NAD27 +no_defs -``` -```{r, eval=FALSE} + +```{r} class(nc) ``` -``` -[1] "sf" "data.frame" -``` + Unlike the example of having a new type of data in a column of a simple data frame in the "Using skimr" vignette, this is a different type of object -with special attributes . +with special attributes. + +In this object there is also a column of a class that does not have default +skimmers. By default, skimr falls back to use the sfl for character variables. + +```{r} +skim(nc$geometry) +``` + ## Experiment interactively @@ -70,7 +67,7 @@ this case, you're best off adding your data type with `skim_with()`. Before we begin, we'll be using the following custom summary statistic throughout. It's a naive example, but covers the requirements of what we need. -```{r, eval = FALSE} +```{r} funny_sf <- function(x) { length(x) + 1 } @@ -100,7 +97,7 @@ will build support for `sfc_MULTIPOLYGON`, but note that we'll have to eventually think about `sfc_LINESTRING`, `sfc_POLYGON`, `sfc_MULTIPOINT` and others if we want to fully support `sf`. -```{r, eval = FALSE} +```{r} skim_sf <- skim_with( sfc_MULTIPOLYGON = sfl( n_unique = n_unique, @@ -114,96 +111,30 @@ The example above creates a new *function*, and you can call that function on a specific column with `sfc_MULTIPOLYGON` data to get the appropriate summary statistics. -```{r, eval = FALSE} +```{r} skim_sf(nc$geometry) ``` -``` -── Data Summary ──────────────────────── - Values -Name structure(list(geometry =... 
-Number of rows 100 -Number of columns 1 -_______________________ -Column type frequency: - sfc_MULTIPOLYGON 1 -________________________ -Group variables None - -── Variable type: sfc_MULTIPOLYGON ────────────────────────────────────────────────────── - skim_variable n_missing complete_rate n_unique valid funny missing n -1 geometry 0 1 100 100 101 0 100 -``` + Creating a function that is a method of the skim_by_type generic for the data type allows skimming of an entire data frame that contains some columns of that type. -```{r, eval = FALSE} +```{r} skim_by_type.sfc_MULTIPOLYGON <- function(mangled, columns, data) { skimmed <- dplyr::summarize_at(data, columns, mangled$funs) build_results(skimmed, columns, NULL) } ``` -```{r, eval = FALSE} -skim(nc) +```{r} +skim_sf(nc) ``` -``` -── Data Summary ──────────────────────── - Values -Name nc -Number of rows 100 -Number of columns 15 -_______________________ -Column type frequency: - factor 2 - numeric 12 - sfc_MULTIPOLYGON 1 -________________________ -Group variables None - -── Variable type: factor ──────────────────────────────────────────────────────────────── - skim_variable n_missing complete_rate ordered n_unique top_counts -1 NAME 0 1 FALSE 100 Ala: 1, Ale: 1, All: 1, Ans: 1 -2 FIPS 0 1 FALSE 100 370: 1, 370: 1, 370: 1, 370: 1 - -── Variable type: numeric─────────────────────────────────────────────────────── - skim_variable n_missing complete_rate mean sd p0 p25 - 1 AREA 0 1 0.126 0.0492 0.042 0.091 - 2 PERIMETER 0 1 1.67 0.482 0.999 1.32 - 3 CNTY_ 0 1 1986. 107. 1825 1902. - 4 CNTY_ID 0 1 1986. 107. 1825 1902. - 5 FIPSNO 0 1 37100 58.0 37001 37050. - 6 CRESS_ID 0 1 50.5 29.0 1 25.8 - 7 BIR74 0 1 3300. 3848. 248 1077 - 8 SID74 0 1 6.67 7.78 0 2 - 9 NWBIR74 0 1 1051. 1433. 1 190 -10 BIR79 0 1 4224. 5179. 319 1336. -11 SID79 0 1 8.36 9.43 0 2 -12 NWBIR79 0 1 1353. 1976. 3 250. - p50 p75 p100 hist - 1 0.120 0.154 0.241 ▆▇▆▃▂ - 2 1.61 1.86 3.64 ▇▇▂▁▁ - 3 1982 2067. 2241 ▇▆▆▅▁ - 4 1982 2067. 
2241 ▇▆▆▅▁ - 5 37100 37150. 37199 ▇▇▇▇▇ - 6 50.5 75.2 100 ▇▇▇▇▇ - 7 2180. 3936 21588 ▇▁▁▁▁ - 8 4 8.25 44 ▇▂▁▁▁ - 9 698. 1168. 8027 ▇▁▁▁▁ -10 2636 4889 30757 ▇▁▁▁▁ -11 5 10.2 57 ▇▂▁▁▁ -12 874. 1407. 11631 ▇▁▁▁▁ - -── Variable type: sfc_MULTIPOLYGON────────────────────────────────────────────── - skim_variable n_missing complete_rate n_unique valid funny -1 geometry 0 1 100 100 101 -``` Sharing these functions within a separate package requires an export. The simplest way to do this is with Roxygen. -```{r, eval = FALSE} +```{r} #' Skimming functions for `sfc_MULTIPOLYGON` objects. #' @export skim_sf <- skim_with( @@ -220,7 +151,7 @@ skim_sf <- skim_with( #' @export skim_by_type.sfc_MULTIPOLYGON <- function(mangled, columns, data) { skimmed <- dplyr::summarize_at(data, columns, mangled$funs) - build_results(skimmed, columns, NULL) + skimr::build_results(skimmed, columns, NULL) } ``` @@ -240,7 +171,7 @@ function list. This is the same list-like data structure used in the `skim_with()` example above. But note! There is one key difference. When adding a generic we also want to identify the `skim_type` in the `sfl`. -```{r, eval = FALSE} +```{r} #' @importFrom skimr get_skimmers #' @export get_skimmers.sfc_MULTIPOLYGON <- function(column) { @@ -259,7 +190,7 @@ The same strategy follows for other data types. * return an `sfl` * make sure that the `skim_type` is there -```{r, eval = FALSE} +```{r} #' @export get_skimmers.sfc_POINT <- function(column) { sfl( @@ -274,18 +205,11 @@ Users of your package should load `skimr` to get the `skim()` function. Once loaded, a call to `get_default_skimmer_names()` will return defaults for your data types as well! -```{r, eval = FALSE} +```{r} get_default_skimmer_names() ``` -``` -... -$sfc_MULTIPOLYGON -[1] "n_unique" "valid" "funny" -$sfc_POINT -[1] "n_unique" "valid" -``` ## Conclusion This is a very simple example. For a package such as sf the custom statistics