add column pcnt_nna to inspect_cor pairwise complete percentage.

alastairrushworth · Oct 13, 2019 · 3511a55 · 3511a55
1 parent c7217f0
commit 3511a55
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,5 @@ checks.R
 big_tests.R
 *.gitignore
 Untitled.R
-*.dll
+*.dll
+new_cor.R
diff --git a/NEWS.Rmd b/NEWS.Rmd
@@ -8,6 +8,7 @@ knitr::opts_chunk$set(echo = TRUE)
 
 # `inspectdf` 0.0.7.9000
 
+- Added `pcnt_nna` column to `inspect_cor()` output containing the percentage of pairwise complete observations used to calculate correlations.  Thanks to Theo Broekman for the suggestion.
 - Fixed bug causing order of grouping variable in grouped `inspect_` statements to be incorrect.  Thanks to the report from Theo Broekman.
 - Removed erroneous print statement from `inspect_num()`.
 

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,9 @@
 
 # `inspectdf` 0.0.7.9000
 
+  - Added `pcnt_nna` column to `inspect_cor()` output containing the
+    percentage of pairwise complete observations used to calculate
+    correlations. Thanks to Theo Broekman for the suggestion.
   - Fixed bug causing order of grouping variable in grouped `inspect_`
     statements to be incorrect. Thanks to the report from Theo Broekman.
   - Removed erroneous print statement from `inspect_num()`.

diff --git a/R/cor_test.R b/R/cor_test.R
@@ -43,13 +43,13 @@ cor_test_2 <- function(df_input, df_name, with_col, alpha, method){
     as_tibble(rownames = 'col_1') %>% 
     gather(key = "col_2", value = "corr", -col_1) %>%
     filter(!corr == Inf | is.na(corr)) %>%
-    mutate(nna = nna_df$nna, se = (1 / sqrt(nna - 3))) %>%
+    mutate(nna = nna_df$nna, se = (1 / sqrt(nna - 3)), pcnt_nna = 100 * nna / nrow(df_input)) %>%
     arrange(desc(abs(corr))) %>%
     mutate(p_value = 2 * pnorm(-abs(corr / se))) %>%
     mutate(lower = tanh(atanh(corr) - qnorm(1 - (alpha/2)) * se)) %>%
     mutate(upper = tanh(atanh(corr) + qnorm(1 - (alpha/2)) * se)) %>%
     mutate(pair = paste(col_1, col_2, sep = " & ")) %>%
-    select(-nna, -se)
+    select(col_1, col_2, pair, corr, p_value, lower, upper, pcnt_nna)
 
   # return tibble of correlations
   return(cor_out)

diff --git a/R/inspect_cor.R b/R/inspect_cor.R
@@ -18,12 +18,14 @@
 #'   \item \code{col_1}, \code{col_2} character vectors containing names of numeric 
 #'   columns in \code{df1}.
 #'   \item \code{corr} the calculated correlation coefficient.
-#'   \item \code{lower}, \code{upper} lower and upper values of the confidence interval 
-#'   for the correlations.
 #'   \item \code{p_value} p-value associated with a test where the null hypothesis is that 
 #'   the numeric pair have 0 correlation. 
+#'   \item \code{lower}, \code{upper} lower and upper values of the confidence interval 
+#'   for the correlations.
+#'   \item \code{pcnt_nna} the percentage of pairs of observations that were non missing for each 
+#'   pair of columns.  The correlation calculation used by \code{inspect_cor()} uses only 
+#'   pairwise complete observations.  
 #' }
-#' 
 #' If \code{df1} has class \code{grouped_df}, then correlations will be calculated within the grouping levels 
 #' and the tibble returned will have an additional column corresponding to the group labels.
 #' 

diff --git a/man/inspect_cor.Rd b/man/inspect_cor.Rd
diff --git a/tests/testthat/test_inspect_cor_single.R b/tests/testthat/test_inspect_cor_single.R
@@ -12,8 +12,8 @@ test_that("Output format checks", {
   z2 <- inspect_cor(tech)
   expect_is(z1, "data.frame")
   expect_is(z2, "data.frame")
-  expect_equal(colnames(z1), c("col_1", "col_2", "corr", "p_value", "lower", "upper"))
-  expect_equal(colnames(z2), c("col_1", "col_2", "corr", "p_value", "lower", "upper"))
+  expect_equal(colnames(z1), c("col_1", "col_2", "corr", "p_value", "lower", "upper", "pcnt_nna"))
+  expect_equal(colnames(z2), c("col_1", "col_2", "corr", "p_value", "lower", "upper", "pcnt_nna"))
 })
 
 diff_correlatations <- function(data_input, method){
@@ -82,3 +82,11 @@ test_that("kendal and spearman work", {
   expect_is(y, "data.frame")
 })
 
+test_that("pcnt_nna is the percentage of pairwise complete observations", {
+  z1 <- inspect_cor(starwars)
+  pcnt_na_1 <- mean(is.na(starwars$birth_year) | is.na(starwars$mass)) * 100  # % of rows where either value is missing
+  pcnt_na_2 <- mean(is.na(starwars$birth_year) | is.na(starwars$height)) * 100
+  expect_equal(z1$pcnt_nna[1], 100 - pcnt_na_1)  # assumes row 1 is the birth_year/mass pair
+  expect_equal(z1$pcnt_nna[2], 100 - pcnt_na_2)  # assumes row 2 is the birth_year/height pair
+})
+