Skip to content

Commit

Permalink
Merge pull request #93 from TESTgroup-BNL/group_nComps
Browse files Browse the repository at this point in the history
Adding in functionality for using stratified sampling in pls_permutation and component selection
  • Loading branch information
Shawn P. Serbin authored Jan 28, 2022
2 parents 59ce2c8 + dc6b9e3 commit 36c77ae
Show file tree
Hide file tree
Showing 91 changed files with 2,624 additions and 675 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# R test files
Rplots.pdf

# History files
.Rhistory
.Rapp.history
Expand Down
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: spectratrait
Title: A simple add-on package to aid in the fitting of leaf-level spectra-trait PLSR models
Version: 1.0.5
Version: 1.1.0
Authors@R:
c(person(given = "Julien",
family = "Lamour",
Expand Down Expand Up @@ -33,12 +33,13 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.1.2
Imports:
httr (>= 1.4.2),
readr (>= 1.3.1),
pls (>= 2.7-2),
dplyr (>= 1.0.1),
magrittr (>= 2.0.1),
reshape2 (>= 1.4.4),
here (>= 0.1),
plotrix (>= 3.7-8),
Expand Down
23 changes: 23 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,36 @@ export(create_data_split)
export(f.coef.valid)
export(f.plot.coef)
export(f.plot.spec)
export(find_optimal_comp_by_groups)
export(find_optimal_components)
export(get_ecosis_data)
export(percent_rmse)
export(pls_permutation)
export(pls_permutation_by_groups)
export(source_GitHubData)
import(ggplot2)
import(httr)
importFrom(dplyr,all_of)
importFrom(dplyr,group_by_at)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,row_number)
importFrom(dplyr,slice)
importFrom(dplyr,vars)
importFrom(graphics,box)
importFrom(graphics,legend)
importFrom(graphics,lines)
importFrom(graphics,polygon)
importFrom(magrittr,"%>%")
importFrom(pls,plsr)
importFrom(pls,selectNcomp)
importFrom(readr,read_csv)
importFrom(reshape2,melt)
importFrom(stats,as.formula)
importFrom(stats,coef)
importFrom(stats,predict)
importFrom(stats,quantile)
importFrom(stats,t.test)
importFrom(utils,flush.console)
importFrom(utils,read.table)
importFrom(utils,setTxtProgressBar)
Expand Down
6 changes: 5 additions & 1 deletion R/create_data_split.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
##' @param split_seed random seed to use for splitting data
##' @param prop the proportion of data to preserve for calibration (e.g. 0.8) and validation (0.2).
##' This sets the calibration proportion
##' @param group_variables Use factor variables to conduct a stratfied sampling for cal/val
##' @param group_variables Use factor variables to conduct a stratified sampling for cal/val
##'
##' @return output_list A list containing the calibration dataset (cal_data)
##' and validation dataset (val_data)
##'
##' @importFrom magrittr %>%
##' @importFrom dplyr mutate group_by_at slice n vars all_of
##'
##' @author Julien Lamour, Jeremiah Anderson, Shawn P. Serbin
##' @export
create_data_split <- function(dataset=NULL, approach=NULL, split_seed=123456789, prop=0.8,
group_variables=NULL) {
# TODO: import only required functions from dplyr
set.seed(split_seed)

# outer if/else to stop if approach set to NULL
Expand Down
3 changes: 3 additions & 0 deletions R/f.plot.coef.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
##' @param type Name of the y axis and of the legend
##' @param plot_label optional plot label to include with the figure
##'
##' @importFrom stats quantile
##' @importFrom graphics polygon lines legend box
##'
##' @author Julien Lamour
##' @export
f.plot.coef <- function(
Expand Down
3 changes: 3 additions & 0 deletions R/f.plot.spec.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
##' @param type Name of the y axis and of the legend. E.g. Reflectance, Transmittance
##' @param plot_label optional plot label to include with the figure
##'
##' @importFrom stats quantile
##' @importFrom graphics polygon lines legend box
##'
##' @author Julien Lamour, Shawn P. Serbin
##' @export
f.plot.spec <- function(
Expand Down
124 changes: 117 additions & 7 deletions R/find_optimal_components.R
Original file line number Diff line number Diff line change
@@ -1,30 +1,41 @@
##' Apply different methods to determing the optimal number of PLSR model components
##' Applies different methods for the determination of the optimal number of PLSR model components
##'
##' @param dataset input full PLSR dataset. Usually just the calibration dataset
##' @param targetVariable What object or variable to use as the Y (predictand) in the PLSR model?
##' Usually the "inVar" variable set at the beginning of a PLS script
##' @param method Which approach to use to find optimal components. Options: pls, firstPlateau, firstMin
##' @param maxComps maximum number of components to consider
##' @param iterations how many different permutations to run
##' @param seg For the built-in pls method, how many different data segments to select from the input dataset
##' @param prop proportion of data to preserve for each permutation
##' @param random_seed random seed to use for splitting data
##'
##' @importFrom stats as.formula coef predict quantile t.test
##' @importFrom pls plsr selectNcomp
##' @importFrom reshape2 melt
##' @import ggplot2
##'
##' @return nComps the optimal number of PLSR components
##'
##' @author Julien Lamour, Jeremiah Anderson, Shawn P. Serbin
##' @export
find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, iterations=20, seg=100,
prop=0.70, random_seed=123456789) {
find_optimal_components <- function(dataset=NULL, targetVariable=NULL, method="pls", maxComps=20,
iterations=20, seg=100, prop=0.70, random_seed=123456789) {

set.seed(random_seed)
inVar <- targetVariable
print("*** Identifying optimal number of PLSR components ***")

if(method=="pls") {
print("*** Running PLS permutation test ***")

plsr.out <- pls::plsr(as.formula(paste(inVar,"~","Spectra")), scale=FALSE, center=TRUE, ncomp=maxComps,
validation="CV", segments = seg, segment.type="interleaved", trace=FALSE,
jackknife=TRUE, data=cal.plsr.data)
nComps <- selectNcomp(plsr.out, method = "onesigma", plot = TRUE)
jackknife=TRUE, data=dataset)
nComps <- pls::selectNcomp(plsr.out, method = "onesigma", plot = TRUE)
}
if(method=="firstPlateau") {
press.out <- spectratrait::pls_permutation(dataset=dataset, maxComps=maxComps,
press.out <- spectratrait::pls_permutation(dataset=dataset, targetVariable=inVar, maxComps=maxComps,
iterations=iterations, prop=prop)
# PRESS plot
pressDF <- as.data.frame(press.out$PRESS)
Expand All @@ -50,7 +61,7 @@ find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, ite
print(bp)
}
if(method=="firstMin") {
press.out <- spectratrait::pls_permutation(dataset=dataset, maxComps=maxComps,
press.out <- spectratrait::pls_permutation(dataset=dataset, targetVariable=inVar, maxComps=maxComps,
iterations=iterations, prop=prop)
# PRESS plot
pressDF <- as.data.frame(press.out$PRESS)
Expand Down Expand Up @@ -84,4 +95,103 @@ find_optimal_components <- function(dataset=NULL, method="pls", maxComps=20, ite
print(bp)
}
return(nComps)
}

##' Find the optimal number of PLSR model components using stratified resampling
##'
##' Uses the firstMin or firstPlateau method for the determination of the optimal number of
##' PLSR model components, by group (i.e. optimal selection by stratification). The PRESS
##' statistic is obtained from \code{pls_permutation_by_groups} and a boxplot of PRESS per
##' number of components is printed with the selected number of components marked.
##'
##' @param dataset input full PLSR dataset. Usually just the calibration dataset
##' @param targetVariable What object or variable to use as the Y (predictand) in the PLSR model?
##' Usually the "inVar" variable set at the beginning of a PLS script
##' @param method Which approach to use to find optimal components. Options: firstPlateau, firstMin.
##' The "pls" option of \code{find_optimal_components} is not supported here and raises an error.
##' @param maxComps maximum number of components to consider
##' @param iterations how many different permutations to run
##' @param prop proportion of data to preserve for each permutation
##' @param random_seed random seed to use for splitting data
##' @param group_variables character vector of the form c("var1", "var2"..."varn")
##' providing the factors used for stratified sampling.
##'
##' @importFrom stats as.formula coef predict quantile t.test
##' @import ggplot2
##' @importFrom reshape2 melt
##'
##' @return nComps the optimal number of PLSR components
##'
##' @author asierrl, Shawn P. Serbin
##' @export
find_optimal_comp_by_groups <- function (dataset = NULL, targetVariable = NULL, method = "firstPlateau",
                                         maxComps = 20, iterations = 20, prop = 0.7, random_seed = 123456789,
                                         group_variables=NULL) {
  set.seed(random_seed)
  inVar <- targetVariable
  # TODO - really should merge this with the original and have an if/else if not NULL and select either
  # pls_permutation OR pls_permutation_by_groups.
  print("*** Identifying optimal number of PLSR components using stratified resampling by group_variables ***")

  # Validate the method up front so an unsupported value fails with a clear message
  # instead of a late "object 'nComps' not found" error.
  if (!is.character(method) || length(method) != 1L || is.na(method)) {
    stop("*** method must be a single character string: firstPlateau or firstMin ***")
  }
  if (method == "pls") {
    stop("*** Please select either the firstMin and firstPlateau. The pls package approach is not compatible ***")
  }
  if (!method %in% c("firstPlateau", "firstMin")) {
    stop(paste0("*** Unknown method: ", method, ". Please select either firstMin or firstPlateau ***"))
  }

  # Both methods work from the same permutation output: one PRESS column per component.
  press.out <- spectratrait::pls_permutation_by_groups(dataset=dataset, targetVariable=inVar,
                                                       maxComps=maxComps, iterations=iterations,
                                                       prop=prop, group_variables=group_variables)
  pressDF <- as.data.frame(press.out$PRESS)
  names(pressDF) <- as.character(seq_len(maxComps))
  pressDFres <- reshape2::melt(pressDF)
  mean_PRESS_comp <- vapply(pressDF, mean, numeric(1))

  if (method == "firstPlateau") {
    # Compare each component with the next one; the first non-significant change marks the plateau.
    # seq_len() keeps this safe when maxComps is 1 (no comparisons to make).
    left_comps <- seq_len(maxComps - 1)
    p_values <- vapply(left_comps, function(i) {
      t.test(pressDF[[i]], pressDF[[i + 1]])$p.value
    }, numeric(1))
    results <- data.frame(Component = left_comps + 1, P.value = round(p_values, 6))
    plateau <- results$Component[!is.na(results$P.value) & results$P.value > 0.05]
    if (length(plateau) > 0) {
      nComps <- min(plateau)
    } else {
      # No plateau detected: fall back to the component with the lowest mean PRESS
      # rather than returning Inf.
      nComps <- unname(which.min(mean_PRESS_comp))
      warning("No PRESS plateau found (all successive t.test p-values <= 0.05); ",
              "using the component with the lowest mean PRESS", call. = FALSE)
    }
  } else {
    # firstMin: smallest number of components whose PRESS is not significantly
    # different from the PRESS of the component with the lowest mean PRESS.
    lowest_PRESS <- unname(which.min(mean_PRESS_comp))
    earlier_comps <- seq_len(lowest_PRESS - 1)
    p_values <- vapply(earlier_comps, function(i) {
      t.test(pressDF[[i]], pressDF[[lowest_PRESS]])$p.value
    }, numeric(1))
    results <- data.frame(Component = earlier_comps, P.value = round(p_values, 8))
    not_different <- results$Component[!is.na(results$P.value) & results$P.value > 0.05]
    if (length(not_different) > 0) {
      nComps <- min(not_different)
    } else {
      # Either the minimum is at component 1 or every earlier component is
      # significantly worse: the minimum itself is the answer.
      nComps <- lowest_PRESS
    }
  }
  print(paste0("*** Optimal number of components based on t.test: ", nComps))

  # Every layer, including the final theme(), must be joined with `+`; otherwise
  # the theme() call is evaluated on its own and never applied to the plot.
  bp <- ggplot(pressDFres, aes(x = variable, y = value)) +
    theme_bw() +
    geom_boxplot(notch = FALSE) +
    labs(x = "Number of Components", y = "PRESS") +
    stat_boxplot(geom = "errorbar", width = 0.2) +
    geom_vline(xintercept = nComps, linetype = "dashed",
               color = "blue", size = 1) +
    theme(axis.text = element_text(size = 18), legend.position = "none",
          axis.title = element_text(size = 20, face = "bold"),
          axis.text.x = element_text(angle = 0, vjust = 0.5),
          panel.border = element_rect(linetype = "solid",
                                      fill = NA, size = 1.5))
  print(bp)

  return(nComps)
}
2 changes: 2 additions & 0 deletions R/get_ecosis_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
##' names(dat_raw)[1:40]
##' }
##'
##' @importFrom readr read_csv
##'
##' @return EcoSIS spectral dataset object
##'
##' @author Shawn P. Serbin, Alexey Shiklomanov
Expand Down
Loading

0 comments on commit 36c77ae

Please sign in to comment.