diff --git a/.gitignore b/.gitignore index f573519..11f9805 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ checks.R big_tests.R *.gitignore Untitled.R -*.dll \ No newline at end of file +*.dll +new_cor.R \ No newline at end of file diff --git a/NEWS.Rmd b/NEWS.Rmd index 652c6b3..af3e64b 100644 --- a/NEWS.Rmd +++ b/NEWS.Rmd @@ -8,6 +8,7 @@ knitr::opts_chunk$set(echo = TRUE) # `inspectdf` 0.0.7.9000 +- Added `pcnt_nna` column to `inspect_cor()` output containing the percentage of pairwise complete observations used to calculate correlations. Thanks to Theo Broekman for the suggestion. - Fixed bug causing order of grouping variable in grouped `inspect_` statements to be incorrect. Thanks to the report from Theo Broekman. - Removed erroneous print statement form `inspect_num()`. diff --git a/NEWS.md b/NEWS.md index 2ba43bb..fdb9324 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,9 @@ # `inspectdf` 0.0.7.9000 + - Added `pcnt_nna` column to `inspect_cor()` output containing the + percentage of pairwise complete observations used to calculate + correlations. Thanks to Theo Broekman for the suggestion. - Fixed bug causing order of grouping variable in grouped `inspect_` statements to be incorrect. Thanks to the report from Theo Broekman. - Removed erroneous print statement form `inspect_num()`. 
diff --git a/R/cor_test.R b/R/cor_test.R index eac27e0..ab043f0 100644 --- a/R/cor_test.R +++ b/R/cor_test.R @@ -43,13 +43,13 @@ cor_test_2 <- function(df_input, df_name, with_col, alpha, method){ as_tibble(rownames = 'col_1') %>% gather(key = "col_2", value = "corr", -col_1) %>% filter(!corr == Inf | is.na(corr)) %>% - mutate(nna = nna_df$nna, se = (1 / sqrt(nna - 3))) %>% + mutate(nna = nna_df$nna, se = (1 / sqrt(nna - 3)), pcnt_nna = 100 * nna / nrow(df_input)) %>% arrange(desc(abs(corr))) %>% mutate(p_value = 2 * pnorm(-abs(corr / se))) %>% mutate(lower = tanh(atanh(corr) - qnorm(1 - (alpha/2)) * se)) %>% mutate(upper = tanh(atanh(corr) + qnorm(1 - (alpha/2)) * se)) %>% mutate(pair = paste(col_1, col_2, sep = " & ")) %>% - select(-nna, -se) + select(col_1, col_2, pair, corr, p_value, lower, upper, pcnt_nna) # return tibble of correlations return(cor_out) diff --git a/R/inspect_cor.R b/R/inspect_cor.R index 350d448..a498e69 100644 --- a/R/inspect_cor.R +++ b/R/inspect_cor.R @@ -18,12 +18,14 @@ #' \item \code{col_1}, \code{co1_2} character vectors containing names of numeric #' columns in \code{df1}. #' \item \code{corr} the calculated correlation coefficient. -#' \item \code{lower}, \code{upper} lower and upper values of the confidence interval -#' for the correlations. #' \item \code{p_value} p-value associated with a test where the null hypothesis is that #' the numeric pair have 0 correlation. +#' \item \code{lower}, \code{upper} lower and upper values of the confidence interval +#' for the correlations. +#' \item \code{pcnt_nna} the percentage of pairs of observations that were non-missing for each +#' pair of columns. The correlation calculation used by \code{inspect_cor()} uses only +#' pairwise complete observations. #' } -#' #' If \code{df1} has class \code{grouped_df}, then correlations will be calculated within the grouping levels #' and the tibble returned will have an additional column corresponding to the group labels. 
#' diff --git a/man/inspect_cor.Rd b/man/inspect_cor.Rd index c4d78f3..662d9e7 100644 --- a/man/inspect_cor.Rd +++ b/man/inspect_cor.Rd @@ -38,12 +38,14 @@ returned: \item \code{col_1}, \code{co1_2} character vectors containing names of numeric columns in \code{df1}. \item \code{corr} the calculated correlation coefficient. - \item \code{lower}, \code{upper} lower and upper values of the confidence interval - for the correlations. \item \code{p_value} p-value associated with a test where the null hypothesis is that the numeric pair have 0 correlation. + \item \code{lower}, \code{upper} lower and upper values of the confidence interval + for the correlations. + \item \code{pcnt_nna} the percentage of pairs of observations that were non-missing for each + pair of columns. The correlation calculation used by \code{inspect_cor()} uses only + pairwise complete observations. } - If \code{df1} has class \code{grouped_df}, then correlations will be calculated within the grouping levels and the tibble returned will have an additional column corresponding to the group labels. 
diff --git a/tests/testthat/test_inspect_cor_single.R b/tests/testthat/test_inspect_cor_single.R index 6920f89..c8847fe 100644 --- a/tests/testthat/test_inspect_cor_single.R +++ b/tests/testthat/test_inspect_cor_single.R @@ -12,8 +12,8 @@ test_that("Output format checks", { z2 <- inspect_cor(tech) expect_is(z1, "data.frame") expect_is(z2, "data.frame") - expect_equal(colnames(z1), c("col_1", "col_2", "corr", "p_value", "lower", "upper")) - expect_equal(colnames(z2), c("col_1", "col_2", "corr", "p_value", "lower", "upper")) + expect_equal(colnames(z1), c("col_1", "col_2", "corr", "p_value", "lower", "upper", "pcnt_nna")) + expect_equal(colnames(z2), c("col_1", "col_2", "corr", "p_value", "lower", "upper", "pcnt_nna")) }) diff_correlatations <- function(data_input, method){ @@ -82,3 +82,11 @@ test_that("kendal and spearman work", { expect_is(y, "data.frame") }) +test_that("pcnt_nna is the percentage of pairwise complete observations", { + z1 <- inspect_cor(starwars) + nna_1 <- mean((is.na(starwars$birth_year) + is.na(starwars$mass)) > 0) * 100 + nna_2 <- mean((is.na(starwars$birth_year) + is.na(starwars$height)) > 0) * 100 + expect_equal(z1$pcnt_nna[1], 100 - nna_1) + expect_equal(z1$pcnt_nna[2], 100 - nna_2) +}) +