Merge pull request #35 from pharmaverse/fix-supp_com_fp

Correcting issue #31 and #33 in combine_supp
pharmaverse · Jun 21, 2022 · 0c5c356 · 0c5c356
2 parents 0920b6b + 494264d
commit 0c5c356
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 39 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -22,6 +22,7 @@ export(sort_by_key)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
 importFrom(dplyr,all_of)
+importFrom(dplyr,anti_join)
 importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
@@ -69,3 +70,4 @@ importFrom(stringr,str_remove_all)
 importFrom(stringr,str_to_lower)
 importFrom(stringr,str_to_upper)
 importFrom(tidyr,pivot_wider)
+importFrom(utils,capture.output)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # metatools 0.1.2
 * correct bug with `combine_supp()` when the data and the supp have different classes for the IDVARVAL
+* add error to `combine_supp()` to report when not all the rows in the supp have merged
+* add `floating_pt_correction` argument to `combine_supp()`, for use when floating point errors stop `IDVARVAL` from matching the domain data
 
 
 # metatools 0.1.1

diff --git a/R/supp.R b/R/supp.R
@@ -119,20 +119,24 @@ make_supp_qual <- function(dataset, metacore, dataset_name = NULL){
 #'
 #' @param dataset Domain dataset
 #' @param supp Supplemental Qualifier dataset
+#' @param floating_pt_correction By default this is `FALSE`, but can be set to
+#'   `TRUE` if the IDVAR is a double and `combine_supp` is not merging correctly
+#'   due to floating point.
 #'
 #' @return a dataset with the supp variables added to it
 #' @export
 #'
 #' @importFrom purrr discard map reduce
-#' @importFrom dplyr if_else select group_by group_split pull rename left_join any_of
+#' @importFrom dplyr if_else select group_by group_split pull rename left_join
+#'   any_of
 #' @importFrom tidyr pivot_wider
 #' @importFrom rlang sym
 #'
 #' @examples
 #' library(safetyData)
 #' library(tibble)
 #' combine_supp(sdtm_ae, sdtm_suppae)  %>% as_tibble()
-combine_supp <- function(dataset, supp){
+combine_supp <- function(dataset, supp, floating_pt_correction = FALSE){
    if(!is.data.frame(dataset) | !is.data.frame(supp)){
       stop("You must supply a domain and supplemental dataset", call. = FALSE)
    }
@@ -163,43 +167,86 @@ combine_supp <- function(dataset, supp){
       rename(DOMAIN = .data$RDOMAIN) %>%
       group_by(.data$IDVAR) %>% #For when there are multiple IDs
       group_split() %>%
-      map(function(x) {
-         # Get the IDVAR value to allow for renaming of IDVARVAL
-         id_var <- x %>%
-            pull(.data$IDVAR) %>%
-            unique()
-
-         wide_x <- x %>%
-            pivot_wider(
-               names_from = .data$QNAM,
-               values_from = .data$QVAL) %>%
-            select(-.data$IDVAR)
-
-
-         if(!is.na(id_var) && id_var  != ""){
-            # the type the new variable needs to be
-            type_convert <- dataset %>%
-               pull(all_of(id_var)) %>%
-               mode() %>%
-               paste0("as.", .) %>%
-               match.fun()
-            wide_x <- wide_x %>%
-               mutate(IDVARVAL = type_convert(.data$IDVARVAL)) %>%
-               rename(!!sym(id_var) := .data$IDVARVAL) #Given there is only one ID per df we can just rename
-
-            by <- c("STUDYID", "DOMAIN", "USUBJID", id_var)
-
-            out <- left_join(dataset, wide_x,
-                      by = by)
-         } else {
-            wide_x <- wide_x %>%
-               select(-.data$IDVARVAL)
-            out <- left_join(dataset, wide_x,
-                             by = c("STUDYID", "DOMAIN", "USUBJID"))
-         }
-         out
-      }) %>%
+      map(~combine_supp_by_idvar(dataset, ., floating_pt_correction)) %>%
       reduce(full_join, by= by)
+}
+
+
+#' Handles the combining of datasets and supps for a single IDVAR
+#'
+#' Pivots the supplemental qualifiers so each `QNAM` becomes a column and left
+#' joins the result onto the domain dataset. When the supp has an IDVAR, any
+#' supp row without a matching row in the dataset raises an error.
+#'
+#' @param dataset Domain dataset
+#' @param supp Supplemental Qualifier dataset with a single IDVAR
+#' @param floating_pt_correction By default this is `FALSE`, but can be set to
+#'   `TRUE` if the IDVAR is a double and `combine_supp` is not merging correctly
+#'   due to floating point.
+#'
+#' @return the dataset with the supp variables for this IDVAR added to it
+#' @noRd
+#' @importFrom dplyr anti_join
+#' @importFrom utils capture.output
+combine_supp_by_idvar <- function(dataset, supp, floating_pt_correction){
+   # Get the IDVAR value to allow for renaming of IDVARVAL
+   id_var <- supp %>%
+      pull(.data$IDVAR) %>%
+      unique()
+
+   wide_x <- supp %>%
+      pivot_wider(
+         names_from = .data$QNAM,
+         values_from = .data$QVAL) %>%
+      select(-.data$IDVAR)
+
+
+   if(!is.na(id_var) && id_var  != ""){
+      id_var_sym <- sym(id_var)
+
+      if(floating_pt_correction){
+         # Join on the character form of the key so that doubles which differ
+         # only by floating point error still match the supp value
+         by <- c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL")
+         wide_x <- wide_x %>%
+            mutate(IDVARVAL = as.character(.data$IDVARVAL))
+         dataset_chr <- dataset %>%
+            mutate(IDVARVAL = as.character(!!id_var_sym))
+
+         out <- left_join(dataset_chr, wide_x,
+                          by = by) %>%
+            select(-.data$IDVARVAL)
+         # The key is still called IDVARVAL here; rename it so the error
+         # report below can select the id variable in both branches
+         unmatched <- anti_join(wide_x, dataset_chr, by = by) %>%
+            rename(!!id_var_sym := .data$IDVARVAL)
+      } else {
+         # the type the new variable needs to be
+         type_convert <- dataset %>%
+            pull(all_of(id_var)) %>%
+            mode() %>%
+            paste0("as.", .) %>%
+            match.fun()
+
+         by <- c("STUDYID", "DOMAIN", "USUBJID", id_var)
+         wide_x <- wide_x %>%
+            mutate(IDVARVAL = type_convert(.data$IDVARVAL)) %>%
+            rename(!!id_var_sym := .data$IDVARVAL) #Given there is only one ID per df we can just rename
+
+         out <- left_join(dataset, wide_x,
+                          by = by)
+         unmatched <- anti_join(wide_x, dataset, by = by)
+      }
+
+      # Stop when there are rows in the supp that didn't get merged
+      if(nrow(unmatched) > 0){
+         unmatched_txt <- capture.output(unmatched %>%
+            select(.data$USUBJID, !!id_var_sym) %>%
+            print()) %>%
+            paste0(collapse = "\n")
+         stop(paste0("Not all rows of the Supp were merged. The following rows are missing:\n",
+                        unmatched_txt),
+              call. = FALSE)
+      }
+
+   } else {
+      # No IDVAR: the qualifiers apply at subject level, so IDVARVAL is dropped
+      wide_x <- wide_x %>%
+         select(-.data$IDVARVAL)
+      out <- left_join(dataset, wide_x,
+                       by = c("STUDYID", "DOMAIN", "USUBJID"))
+   }
+   out
 
 }
 
diff --git a/man/combine_supp.Rd b/man/combine_supp.Rd
diff --git a/tests/testthat/test-supp.R b/tests/testthat/test-supp.R
@@ -201,3 +201,31 @@ test_that("combine_supp works with different IDVARVAL classes", {
 test_that("combine_supp works with without QEVAL", {
    expect_silent(combine_supp(admiral_tr, admiral_supptr))
 })
+
+test_that("supp data that does not match the main data will raise an error", {
+   # Re-point the first supp row at an IDVARVAL that is expected to have no
+   # matching row in sdtm_ae (assumes 99 is not a valid value for that subject)
+   sdtm_suppae_extra <- sdtm_suppae
+   sdtm_suppae_extra$IDVARVAL[1] <- 99
+   # Pin the message so an unrelated failure cannot satisfy the expectation
+   expect_error(
+      combine_supp(sdtm_ae, sdtm_suppae_extra),
+      regexp = "Not all rows of the Supp were merged"
+   )
+})
+
+test_that("Floating point correction works", {
+   # Summing 0.1 ten times gives a double that prints as 1 but is not
+   # exactly equal to 1
+   fp1 <- 0.1 + 0.1 + 0.1 + 0.1 + 0.1 + 0.1 + 0.1 + 0.1 + 0.1 + 0.1
+   sdtm_ae_fp <- sdtm_ae %>%
+      mutate(AESEQ = case_when(AESEQ == 1 ~ fp1,
+                               TRUE ~ as.double(AESEQ)))
+   # Make sure a FP error is induced
+   expect_error(combine_supp(sdtm_ae_fp, sdtm_suppae))
+   # correction
+   combo_ae <- combine_supp(sdtm_ae_fp, sdtm_suppae,
+                            floating_pt_correction = TRUE) %>%
+      select(USUBJID, AESEQ, AETRTEM) %>%
+      distinct() %>%
+      arrange(USUBJID, AESEQ)
+   supp_check <- sdtm_suppae %>%
+      select(USUBJID, AESEQ = IDVARVAL, AETRTEM = QVAL) %>%
+      arrange(USUBJID, AESEQ)
+   expect_equal(combo_ae, supp_check)
+})
+
+