Merge pull request #93 from TESTgroup-BNL/group_nComps

Adding in functionality for using stratified sampling in pls_permutation and component selection
plantphys · Jan 28, 2022 · 36c77ae · 36c77ae
2 parents 59ce2c8 + dc6b9e3
commit 36c77ae
Show file tree

Hide file tree

Showing 91 changed files with 2,624 additions and 675 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# R test files
+Rplots.pdf
+
 # History files
 .Rhistory
 .Rapp.history

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: spectratrait
 Title: A simple add-on package to aid in the fitting of leaf-level spectra-trait PLSR models
-Version: 1.0.5
+Version: 1.1.0
 Authors@R:
   c(person(given = "Julien",
            family = "Lamour",
@@ -33,12 +33,13 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.1.2
 Imports: 
     httr (>= 1.4.2),
     readr (>= 1.3.1),
     pls (>= 2.7-2),
     dplyr (>= 1.0.1),
+    magrittr (>= 2.0.1),
     reshape2 (>= 1.4.4),
     here (>= 0.1),
     plotrix (>= 3.7-8),

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,13 +7,36 @@ export(create_data_split)
 export(f.coef.valid)
 export(f.plot.coef)
 export(f.plot.spec)
+export(find_optimal_comp_by_groups)
 export(find_optimal_components)
 export(get_ecosis_data)
 export(percent_rmse)
 export(pls_permutation)
+export(pls_permutation_by_groups)
 export(source_GitHubData)
+import(ggplot2)
 import(httr)
+importFrom(dplyr,all_of)
+importFrom(dplyr,group_by_at)
+importFrom(dplyr,mutate)
+importFrom(dplyr,n)
+importFrom(dplyr,row_number)
+importFrom(dplyr,slice)
+importFrom(dplyr,vars)
+importFrom(graphics,box)
+importFrom(graphics,legend)
+importFrom(graphics,lines)
+importFrom(graphics,polygon)
+importFrom(magrittr,"%>%")
 importFrom(pls,plsr)
+importFrom(pls,selectNcomp)
+importFrom(readr,read_csv)
+importFrom(reshape2,melt)
+importFrom(stats,as.formula)
+importFrom(stats,coef)
+importFrom(stats,predict)
+importFrom(stats,quantile)
+importFrom(stats,t.test)
 importFrom(utils,flush.console)
 importFrom(utils,read.table)
 importFrom(utils,setTxtProgressBar)

diff --git a/R/create_data_split.R b/R/create_data_split.R
@@ -5,15 +5,19 @@
 ##' @param split_seed random seed to use for splitting data
 ##' @param prop the proportion of data to preserve for calibration (e.g. 0.8) and validation (0.2). 
 ##' This sets the calibration proportion
-##' @param group_variables Use factor variables to conduct a stratfied sampling for cal/val
+##' @param group_variables Use factor variables to conduct a stratified sampling for cal/val
 ##' 
 ##' @return output_list A list containing the calibration dataset (cal_data)
 ##' and validation dataset (val_data)
 ##' 
+##' @importFrom magrittr %>%
+##' @importFrom dplyr mutate group_by_at slice n vars all_of 
+##' 
 ##' @author Julien Lamour, Jeremiah Anderson, Shawn P. Serbin
 ##' @export
 create_data_split <- function(dataset=NULL, approach=NULL, split_seed=123456789, prop=0.8,
                               group_variables=NULL) {
+  # TODO: import only required functions from dplyr
   set.seed(split_seed)
 
   # outer if/else to stop if approach set to NULL

diff --git a/R/f.plot.coef.R b/R/f.plot.coef.R
@@ -7,6 +7,9 @@
 ##' @param type Name of the y axis and of the legend
 ##' @param plot_label optional plot label to include with the figure
 ##' 
+##' @importFrom stats quantile
+##' @importFrom graphics polygon lines legend box
+##' 
 ##' @author Julien Lamour
 ##' @export
 f.plot.coef <- function(

diff --git a/R/f.plot.spec.R b/R/f.plot.spec.R
@@ -7,6 +7,9 @@
 ##' @param type Name of the y axis and of the legend. E.g. Reflectance, Transmittance
 ##' @param plot_label optional plot label to include with the figure
 ##' 
+##' @importFrom stats quantile
+##' @importFrom graphics polygon lines legend box
+##' 
 ##' @author Julien Lamour, Shawn P. Serbin
 ##' @export
 f.plot.spec <- function(

diff --git a/R/find_optimal_components.R b/R/find_optimal_components.R
@@ -1,30 +1,41 @@
-##' Apply different methods to determing the optimal number of PLSR model components
+##' Applies different methods for the determination of the optimal number of PLSR model components
 ##' 
 ##' @param dataset input full PLSR dataset. Usually just the calibration dataset
+##' @param targetVariable What object or variable to use as the Y (predictand) in the PLSR model? 
+##' Usually the "inVar" variable set at the beginning of a PLS script
 ##' @param method Which approach to use to find optimal components. Options: pls, firstPlateau, firstMin
 ##' @param maxComps maximum number of components to consider
 ##' @param iterations how many different permutations to run
 ##' @param seg For the built-in pls method, how many different data segments to select from the input dataset
 ##' @param prop proportion of data to preserve for each permutation
 ##' @param random_seed random seed to use for splitting data
 ##' 
+##' @importFrom stats as.formula coef predict quantile t.test
+##' @importFrom pls plsr selectNcomp
+##' @importFrom reshape2 melt
+##' @import ggplot2
+##' 
 ##' @return nComps the optimal number of PLSR components
 ##' 
 ##' @author Julien Lamour, Jeremiah Anderson, Shawn P. Serbin
 ##' @export 
-find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, iterations=20, seg=100, 
-                                    prop=0.70, random_seed=123456789) {
+find_optimal_components <- function(dataset=NULL, targetVariable=NULL, method="pls", maxComps=20, 
+                                    iterations=20, seg=100, prop=0.70, random_seed=123456789) {
+
   set.seed(random_seed)
+  inVar <- targetVariable
+  print("*** Identifying optimal number of PLSR components ***")
+
   if(method=="pls") {
     print("*** Running PLS permutation test ***")
 
     plsr.out <- pls::plsr(as.formula(paste(inVar,"~","Spectra")), scale=FALSE, center=TRUE, ncomp=maxComps, 
                           validation="CV", segments = seg, segment.type="interleaved", trace=FALSE, 
-                          jackknife=TRUE, data=cal.plsr.data)
-    nComps <- selectNcomp(plsr.out, method = "onesigma", plot = TRUE)
+                          jackknife=TRUE, data=dataset)
+    nComps <- pls::selectNcomp(plsr.out, method = "onesigma", plot = TRUE)
   }
   if(method=="firstPlateau") {
-    press.out <- spectratrait::pls_permutation(dataset=dataset, maxComps=maxComps, 
+    press.out <- spectratrait::pls_permutation(dataset=dataset, targetVariable=inVar, maxComps=maxComps, 
                                                iterations=iterations, prop=prop)
     # PRESS plot
     pressDF <- as.data.frame(press.out$PRESS)
@@ -50,7 +61,7 @@ find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, ite
     print(bp)
   }
   if(method=="firstMin") {
-    press.out <- spectratrait::pls_permutation(dataset=dataset, maxComps=maxComps, 
+    press.out <- spectratrait::pls_permutation(dataset=dataset, targetVariable=inVar, maxComps=maxComps, 
                                                iterations=iterations, prop=prop)
     # PRESS plot
     pressDF <- as.data.frame(press.out$PRESS)
@@ -84,4 +95,103 @@ find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, ite
     print(bp)
   }
   return(nComps)
+}
+
+##' Uses the firstMin and firstPlateau methods for the determination of the optimal number of PLSR model components,
+##' by group (i.e. optimal selection by stratification)
+##' 
+##' The PRESS values come from pls_permutation_by_groups. A boxplot of PRESS by number of
+##' components is printed, with a dashed line marking the selected number of components.
+##' 
+##' @param dataset input full PLSR dataset. Usually just the calibration dataset
+##' @param targetVariable What object or variable to use as the Y (predictand) in the PLSR model? 
+##' Usually the "inVar" variable set at the beginning of a PLS script
+##' @param method Which approach to use to find optimal components. Options: firstPlateau, firstMin.
+##' firstPlateau picks the first component whose PRESS does not differ (t-test P > 0.05) from the
+##' previous one; firstMin picks the smallest component that does not differ from the one with the
+##' lowest mean PRESS. Any other value, including "pls", is an error
+##' @param maxComps maximum number of components to consider
+##' @param iterations how many different permutations to run
+##' @param prop proportion of data to preserve for each permutation
+##' @param random_seed random seed to use for splitting data
+##' @param group_variables character vector of the form c("var1", "var2"..."varn") 
+##' providing the factors used for stratified sampling. 
+##' 
+##' @importFrom stats as.formula coef predict quantile t.test
+##' @import ggplot2
+##' @importFrom reshape2 melt
+##' 
+##' @return nComps the optimal number of PLSR components. Falls back (with a warning) to maxComps
+##' when firstPlateau finds no plateau, and to the lowest mean PRESS component when firstMin
+##' finds no smaller component that is statistically similar
+##' 
+##' @author asierrl, Shawn P. Serbin
+##' @export 
+find_optimal_comp_by_groups <- function (dataset = NULL, targetVariable = NULL, method = "firstPlateau",
+                                         maxComps = 20, iterations = 20, prop = 0.7, random_seed = 123456789, 
+                                         group_variables=NULL) {
+  set.seed(random_seed)
+  # TODO - really should merge this with the original and have an if/else if not NULL and select either
+  # pls_permutation OR pls_permutation_by_groups.
+  print("*** Identifying optimal number of PLSR components using stratified resampling by group_variables ***")
+  if (method == "pls") {
+    stop("*** Please select either the firstMin and firstPlateau. The pls package approach is not compatible ***")
+  }
+  if (!method %in% c("firstPlateau", "firstMin")) {
+    # an unrecognised method used to fall through and fail with "object 'nComps' not found"
+    stop("*** Unknown method '", method, "'. Options: firstPlateau, firstMin ***")
+  }
+
+  # Both methods work from the same permuted PRESS values (one column per number of components)
+  press.out <- spectratrait::pls_permutation_by_groups(dataset=dataset, targetVariable=targetVariable,
+                                                       maxComps=maxComps, iterations=iterations, 
+                                                       prop=prop, group_variables=group_variables)
+  pressDF <- as.data.frame(press.out$PRESS)
+  names(pressDF) <- as.character(seq(maxComps))
+  pressDFres <- reshape2::melt(pressDF)
+
+  if (method == "firstPlateau") {
+    # t-test the PRESS of each number of components against the next one up
+    p_values <- vapply(seq_len(maxComps - 1), function(i) {
+      round(t.test(pressDF[[i]], pressDF[[i + 1]])$p.value, 6)
+    }, numeric(1))
+    plateau <- which(p_values > 0.05)
+    if (length(plateau) > 0) {
+      # the comparison at position i belongs to component i + 1
+      nComps <- plateau[1] + 1
+    } else {
+      # min() over an empty selection returned Inf here; use the largest model considered instead
+      warning("No PRESS plateau found (all P.value <= 0.05); returning maxComps", call. = FALSE)
+      nComps <- maxComps
+    }
+  } else {
+    mean_PRESS_comp <- apply(X = pressDF, MARGIN = 2, FUN = mean)
+    lowest_PRESS <- unname(which.min(mean_PRESS_comp))
+    # t-test every smaller number of components against the one with the lowest mean PRESS;
+    # seq_len() makes this a no-op when the first component is already the minimum, where the
+    # previous loop over 1:(lowest_PRESS - 1) iterated over 1:0 and seq(1, 0, 1) then raised an error
+    p_values <- vapply(seq_len(lowest_PRESS - 1), function(i) {
+      round(t.test(pressDF[[i]], pressDF[[lowest_PRESS]])$p.value, 8)
+    }, numeric(1))
+    similar <- which(p_values > 0.05)
+    # with no statistically similar smaller model, keep the minimum itself rather than NA
+    nComps <- if (length(similar) > 0) similar[1] else lowest_PRESS
+  }
+  print(paste0("*** Optimal number of components based on t.test: ", nComps))
+
+  # theme() must be chained with "+": as a separate statement it was evaluated and discarded,
+  # so the text sizes and panel border were never applied to the printed plot
+  bp <- ggplot(pressDFres, aes(x = variable, y = value)) + 
+    theme_bw() + geom_boxplot(notch = FALSE) + labs(x = "Number of Components", 
+                                                    y = "PRESS") + 
+    stat_boxplot(geom = "errorbar", width = 0.2) + 
+    geom_vline(xintercept = nComps, linetype = "dashed", 
+               color = "blue", size = 1) +
+    theme(axis.text = element_text(size = 18), legend.position = "none", 
+          axis.title = element_text(size = 20, face = "bold"), 
+          axis.text.x = element_text(angle = 0, vjust = 0.5), 
+          panel.border = element_rect(linetype = "solid", 
+                                      fill = NA, size = 1.5))
+  print(bp)
+  return(nComps)
 }
diff --git a/R/get_ecosis_data.R b/R/get_ecosis_data.R
@@ -10,6 +10,8 @@
 ##' names(dat_raw)[1:40]
 ##' }
 ##' 
+##' @importFrom readr read_csv
+##' 
 ##' @return EcoSIS spectral dataset object
 ##' 
 ##' @author Shawn P. Serbin, Alexey Shiklomanov