Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

correct_colinvar can be used with species points #394

Merged
merged 1 commit into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 197 additions & 49 deletions R/correct_colinvar.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,20 @@
#' @param maxcell numeric. Number of raster cells to be randomly sampled. Taking a sample could be
#' useful to reduce memory usage for large rasters. If NULL, the function will use all
#' raster cells. Default NULL. Usage maxcell = 50000.
#' @param based_on_points logical. If TRUE, collinearity reduction method will be based on species points data (i.e., presences, and absences, pseudo-absences or background points). If TRUE, data, x and y arguments must be provided. Default FALSE.
#' @param data tibble or data.frame. Database with species data used to model
#' (i.e., presence + absence, or presence + pseudo-absence + background points)
#' with x and y coordinates
#' @param x character. Column name with spatial x coordinates
#' @param y character. Column name with spatial y coordinates
#'
#' @return
#' #' If 'pearson', returns a list with the following elements:
#' \itemize{
#' \item cor_table: a matrix object with pairwise correlation values of the environmental variables
#' \item cor_variables: a list object with the same length of the number of environmental values containing the pairwise relations that exceeded the correlation threshold for each one of the environmental variables
#' \item cor_variables: a list object with the same length of the number of environmental
#' values containing the pairwise relations that exceeded the correlation
#' threshold for each one of the environmental variables
#' }
#'
#' If 'vif' method, returns a list with the following elements:
Expand Down Expand Up @@ -91,6 +99,15 @@
#' ├── UKESM_2050_ssp370\cr
#' │ └── pcs.tif
#'
#' Perform collinearity reduction based on points
#'
#' Evaluating collinearity based on all environmental conditions of a
#' calibration area or study area could yield different results than evaluating
#' collinearity based on points used to construct the models. If you want to
#' perform collinearity reduction based on species points data, it is strongly
#' recommended to use all the point data used for modeling
#' (i.e., presence + absence or presence + pseudo-absence/background points).
#'
#' @export
#' @importFrom dplyr tibble
#' @importFrom stats na.omit cor lm prcomp factanal
Expand Down Expand Up @@ -129,8 +146,8 @@
#'
#'
#' # Perform pca collinearity control with different projections
#' ## Below will be created a set of folders to simulate the structure of the directory where
#' ## environmental variables are stored for different scenarios
#' ## Below will be created a set of folders to simulate the structure of the
#' ## directory where environmental variables are stored for different scenarios
#' dir_sc <- file.path(tempdir(), "projections")
#' dir.create(dir_sc)
#' dir_sc <- file.path(dir_sc, c("scenario_1", "scenario_2"))
Expand Down Expand Up @@ -167,6 +184,7 @@
#' #### considering cell restricted to a region ####
#' # #
#' ## %######################################################%##
#' data("abies")
#'
#' # Define a calibration area
#' abies2 <- abies %>%
Expand Down Expand Up @@ -208,6 +226,61 @@
#' plot(pca_fr$env_layer) # PCA with all cells
#' plot(pca_rr$env_layer) # PCA with calibration area cell but predicted for entire region
#' plot(pca_rrp$env_layer) # PCA performed and predicted for cells within calibration area (ca)
#'
#'
#'
#' ##%######################################################%##
#' # #
#' #### Use correct_colinvar with points data ####
#' # #
#' ##%######################################################%##
#' data("abies")
#'
#' # Presence-absence database
#' abies2 <- abies %>%
#' dplyr::select(x, y, pr_ab)
#'
#' # Perform collinearity control
#' # Pearson
#' correct_colinvar(
#' env_layer = somevar,
#' method = c("pearson", th = "0.6"),
#' based_on_points = TRUE,
#' data = abies2,
#' x = "x",
#' y = "y"
#' )
#'
#' # VIF
#' correct_colinvar(
#' env_layer = somevar,
#' method = c("vif", th = "8"),
#' based_on_points = TRUE,
#' data = abies2,
#' x = "x",
#' y = "y"
#' )
#'
#' # PCA
#' correct_colinvar(
#' env_layer = somevar,
#' method = c("pca"),
#' based_on_points = TRUE,
#' data = abies2,
#' x = "x",
#' y = "y"
#' )
#'
#' # FA
#' correct_colinvar(
#' env_layer = somevar,
#' method = "fa",
#' based_on_points = TRUE,
#' data = abies2,
#' x = "x",
#' y = "y"
#' )
#'
#' }
#'
correct_colinvar <- function(env_layer,
Expand All @@ -216,14 +289,34 @@ correct_colinvar <- function(env_layer,
save_proj = NULL,
restric_to_region = NULL,
restric_pca_proj = FALSE,
maxcell = NULL) {
maxcell = NULL,
based_on_points = FALSE,
data = NULL,
x = NULL,
y = NULL) {
. <- NULL
if (!any(c("pearson", "vif", "pca", "fa") %in% method)) {
stop(
"argument 'method' was misused, select one of the available methods: pearson, vif, pca, fa"
)
}

if (based_on_points) {
if (is.null(data) | is.null(x) | is.null(y)) {
stop("If based_on_points is TRUE, data, x, and y arguments must be provided")
}
data <- sdm_extract(
data = data,
x = x,
y = y,
env_layer = env_layer,
variables = NULL,
filter_na = TRUE
)
data <- data[names(env_layer)]
}


if (class(env_layer)[1] != "SpatRaster") {
env_layer <- terra::rast(env_layer)
}
Expand All @@ -239,15 +332,18 @@ correct_colinvar <- function(env_layer,
}
}

#### VIF ####
#### Pearson ####
if (any(method %in% "pearson")) {
if (is.na(method["th"])) {
th <- 0.7
} else {
th <- as.numeric(method["th"])
}

if (is.null(maxcell)) {
if(based_on_points){
h <- data
} else {
if (is.null(maxcell)) {
h <- terra::as.data.frame(env_layer) %>% stats::na.omit()
} else {
# Raster random sample
Expand All @@ -258,12 +354,14 @@ correct_colinvar <- function(env_layer,
h <- env_layer[h] %>%
stats::na.omit()
}
}

h <- abs(stats::cor(h, method = "pearson"))
diag(h) <- 0

cor_var <- h > th
cor_var <- apply(cor_var, 2, function(x) colnames(h)[x])
if (length(cor_var) == 0) {
cor_var <- apply(cor_var, 2, function(x) colnames(h)[x], simplify = FALSE)
if (all(sapply(cor_var, length)==0)) {
cor_var <- "No pair of variables reached the specified correlation threshold."
}

Expand All @@ -273,24 +371,28 @@ correct_colinvar <- function(env_layer,
)
}

#### Peason ####
#### VIF ####
if (any(method %in% "vif")) {
if (is.null(method["th"])) {
th <- 10
} else {
th <- as.numeric(method["th"])
}

if (is.null(maxcell)) {
x <- terra::as.data.frame(env_layer)
if (based_on_points) {
x <- data %>% as.data.frame()
} else {
# Raster random sample
set.seed(10)
x <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
x <- env_layer[x] %>%
stats::na.omit()
if (is.null(maxcell)) {
x <- terra::as.data.frame(env_layer)
} else {
# Raster random sample
set.seed(10)
x <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
x <- env_layer[x] %>%
stats::na.omit()
}
}

LOOP <- TRUE
Expand Down Expand Up @@ -350,30 +452,45 @@ correct_colinvar <- function(env_layer,
env_layer_original <- env_layer
}

# mean
means <- t(terra::global(env_layer, "mean", na.rm = T)) %>% c()
names(means) <- names(env_layer)
# SD
stds <- t(terra::global(env_layer, "sd", na.rm = T)) %>% c()
names(stds) <- names(env_layer)
if(based_on_points){
# mean
means <- apply(data, 2, mean, na.rm=TRUE)
# SD
stds <- apply(data, 2, sd, na.rm=TRUE)
} else {
# mean
means <- t(terra::global(env_layer, "mean", na.rm = T)) %>% c()
names(means) <- names(env_layer)
# SD
stds <- t(terra::global(env_layer, "sd", na.rm = T)) %>% c()
names(stds) <- names(env_layer)
}


# Standardize raster values
env_layer <- terra::scale(env_layer, center = means, scale = stds)
vnmes <- names(means)


if (is.null(maxcell)) {
p0 <- terra::as.data.frame(env_layer, xy = FALSE, na.rm = TRUE)
if(based_on_points){
# Statandarize data
p0 <- data %>%
scale(center = means, scale = stds)
rm(data)
} else {
# Raster random sample
set.seed(10)
p0 <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
p0 <- env_layer[p0] %>%
stats::na.omit()
if (is.null(maxcell)) {
p0 <- terra::as.data.frame(env_layer, xy = FALSE, na.rm = TRUE)
} else {
# Raster random sample
set.seed(10)
p0 <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
p0 <- env_layer[p0] %>%
stats::na.omit()
}
}


p <- stats::prcomp(p0,
retx = TRUE,
scale. = FALSE,
Expand All @@ -389,7 +506,13 @@ correct_colinvar <- function(env_layer,


# p <- terra::as.data.frame(env_layer, xy = FALSE, na.rm = TRUE)
p <- stats::prcomp(p0, retx = TRUE, scale. = FALSE, center = FALSE, rank. = naxis)
p <- stats::prcomp(
p0,
retx = TRUE,
scale. = FALSE,
center = FALSE,
rank. = naxis
)

# env_layer <- terra::predict(env_layer, p)
if (restric_pca_proj & is.null(restric_to_region)) {
Expand Down Expand Up @@ -454,19 +577,24 @@ correct_colinvar <- function(env_layer,

#### FA ####
if (any(method %in% "fa")) {
p <- terra::scale(env_layer, center = TRUE, scale = TRUE)


if (is.null(maxcell)) {
p <- terra::as.data.frame(p, xy = FALSE, na.rm = TRUE)
if(based_on_points){
p <- as.data.frame(data) %>%
scale(center = TRUE, scale = TRUE)
rm(data)
} else {
# Raster random sample
set.seed(10)
p <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
p <- env_layer[p] %>%
stats::na.omit()
p <- terra::scale(env_layer, center = TRUE, scale = TRUE)

if (is.null(maxcell)) {
p <- terra::as.data.frame(p, xy = FALSE, na.rm = TRUE)
} else {
# Raster random sample
set.seed(10)
p <- terra::as.data.frame(env_layer[[1]], cells = TRUE)[, 1] %>%
sample(., size = maxcell, replace = FALSE) %>%
sort()
p <- env_layer[p] %>%
stats::na.omit()
}
}

# if (nrow(p) > 200000) {
Expand All @@ -485,7 +613,7 @@ correct_colinvar <- function(env_layer,

ns <- length(which(r > a))

lwr <- seq(0.001, 2, length.out = 50)
lwr <- seq(0.001, 3, length.out = 100)
fit <- NULL
for (tt in 1:length(lwr)) {
tryCatch(
Expand All @@ -498,12 +626,32 @@ correct_colinvar <- function(env_layer,
error = function(e) {
}
)

if (!is.null(fit)) {
break
}
}

if(is.null(fit)){
message("Factorial analysis could not be performed because ", ns, " factors are too many for ", ncol(p), " variables")
message("It will tested with ", ns-1, " factors")
ns <- ns - 1
for (tt in 1:length(lwr)) {
tryCatch(
fit <- stats::factanal(
x = p,
factors = ns,
rotation = "varimax",
lower = lwr[tt]
),
error = function(e) {
}
)
if (!is.null(fit)) {
break
}
}
}

sel <-
row.names(fit$loadings)[apply(fit$loadings, 2, which.max)]
rem <-
Expand Down
Loading
Loading