2. Spatial band cross-validati
#> Creating basic raster mask...
#> Searching for the optimal number of bands...
-plot(sp_part2$grid, col = gray.colors(20))
-points(sp_part2$part[c("x", "y")],
+plot(sp_part2$grid, col = gray.colors(20))
+points(sp_part2$part[c("x", "y")],
col = rainbow(8)[sp_part2$part$.part],
cex = 0.9,
pch = c(1, 19)[sp_part2$part$pr_ab + 1]
@@ -786,8 +795,8 @@ 3. Spatial block cross-valid
#> Creating basic raster mask...
#> Searching for the optimal grid size...
-plot(sp_part3$grid)
-points(sp_part3$part[c("x", "y")],
+plot(sp_part3$grid)
+points(sp_part3$part[c("x", "y")],
col = c("blue", "red")[sp_part3$part$.part],
cex = 0.5,
pch = 19
@@ -801,16 +810,16 @@ 3. Spatial block cross-valid
really useful for generating pseudo-absence or background sample points,
which we will explore in the next section.
-terra::res(sp_part3$grid)
+terra::res(sp_part3$grid)
#> [1] 881131 881131
-terra::res(somevar)
+terra::res(somevar)
#> [1] 1890 1890
grid_env <- get_block(env_layer = somevar, best_grid = sp_part3$grid)
-plot(grid_env) # this is a block layer with the same layer
+plot(grid_env) # this is a block layer with the same layer
# properties as environmental variables.
-points(sp_part3$part[c("x", "y")],
+points(sp_part3$part[c("x", "y")],
col = c("blue", "red")[sp_part3$part$.part],
cex = 0.5,
pch = 19
@@ -844,9 +853,9 @@ 4. Environmental a
#> 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10
#> Searching best partition...
-plot(regions, col = gray.colors(9))
-points(sp_part4$part[c("x", "y")],
- col = hcl.colors(length(unique(sp_part4$part)))[sp_part4$part$.part],
+plot(regions, col = gray.colors(9))
+points(sp_part4$part[c("x", "y")],
+ col = hcl.colors(length(unique(sp_part4$part)))[sp_part4$part$.part],
cex = 1,
pch = 19
)
@@ -898,13 +907,13 @@ 1. Sample background
par(mfrow = c(2, 1))
-plot(grid_env, main = "Presence points")
-plot(ca_1, add = TRUE)
-points(p_data, cex = .7, pch = 19)
+plot(grid_env, main = "Presence points")
+plot(ca_1, add = TRUE)
+points(p_data, cex = .7, pch = 19)
-plot(grid_env, main = "Background points")
-plot(ca_1, add = TRUE)
-points(bg, cex = .1, pch = 19)
+plot(grid_env, main = "Background points")
+plot(ca_1, add = TRUE)
+points(bg, cex = .1, pch = 19)
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 359e0b3e..856b2d5e 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -8,7 +8,7 @@ articles:
v04_Red_fir_example: v04_Red_fir_example.html
v05_Rare_species_example: v05_Rare_species_example.html
v06_Extrapolation_example: v06_Extrapolation_example.html
-last_built: 2024-03-13T13:14Z
+last_built: 2024-04-26T16:17Z
urls:
reference: https://sjevelazco.github.io/flexsdm/reference
article: https://sjevelazco.github.io/flexsdm/articles
diff --git a/docs/reference/correct_colinvar.html b/docs/reference/correct_colinvar.html
index ff9ce18d..32b222a0 100644
--- a/docs/reference/correct_colinvar.html
+++ b/docs/reference/correct_colinvar.html
@@ -75,7 +75,14 @@
Usage
-correct_colinvar(env_layer, method, proj = NULL, maxcell = NULL)
+correct_colinvar(
+  env_layer,
+  method,
+  proj = NULL,
+  restric_to_region = NULL,
+  restric_pca_proj = FALSE,
+  maxcell = NULL
+)
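The usage change above introduces two new arguments, restric_to_region and restric_pca_proj, which are not exercised elsewhere in this changeset. The sketch below shows how the updated signature might be called; the assumption that restric_to_region takes a SpatVector delimiting a region (for example a calibration area) and that restric_pca_proj controls whether the PCA projection is restricted to that region is inferred from the argument names only, so consult the rendered reference page for the exact semantics.

# Sketch only: the meaning of restric_to_region / restric_pca_proj is inferred
# from the argument names in the diff above; see ?correct_colinvar for details.
library(terra)
library(flexsdm)

somevar <- terra::rast(system.file("external/somevar.tif", package = "flexsdm"))
regions <- terra::vect(system.file("external/clusters.shp", package = "flexsdm"))

pca_var <- correct_colinvar(
  env_layer = somevar,
  method = c("pca"),
  proj = NULL,
  restric_to_region = regions, # hypothetical: SpatVector delimiting the region of interest
  restric_pca_proj = FALSE,
  maxcell = NULL
)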
diff --git a/docs/search.json b/docs/search.json
index a1a12bb7..fa324ca3 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -1 +1 @@
-[{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"flexsdm: Overview of Pre-modeling functions","text":"Species distribution modeling (SDM) become standard tool many research areas, including ecology, conservation biology, biogeography, paleobiogeography, epidemiology. SDM active area theoretical methodological research. flexsdm package provides users ability manipulate parameterize models variety ways meet unique research needs. flexibility enables users define complete partial modeling procedure specific modeling situation (e.g., number variables, number records, different algorithms ensemble methods, algorithms tuning, etc.). vignette, users learn first set functions flexsdm package fall “pre-modeling” umbrella (see full list). pre-modeling functions calib_area() Delimit calibration area constructing species distribution models correct_colinvar() Collinearity reduction predictors env_outliers() Integration outliers detection methods environmental space part_random() Data partitioning training testing models part_sblock() Spatial block cross-validation part_sband() Spatial band cross-validation part_senv() Environmental cross-validation plot_res() Plot different resolutions used part_sblock get_block() Transform spatial partition layer spatial properties environmental variables sample_background() Sample background points sample_pseudoabs() Sample pseudo-absence sdm_directory() Create directories saving outputs flexsdm sdm_extract() Extract environmental data based x y coordinates occfilt_env() Perform environmental filtering species occurrences occfilt_geo() Perform geographical filtering species occurrences","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"installation","dir":"Articles","previous_headings":"","what":"Installation","title":"flexsdm: Overview of Pre-modeling functions","text":"First, install flexsdm package. can install released version flexsdm github :","code":"# devtools::install_github('sjevelazco/flexsdm') library(flexsdm) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union library(terra) #> terra 1.7.55 #> #> Attaching package: 'terra' #> The following object is masked from 'package:knitr': #> #> spin"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"project-directory-setup","dir":"Articles","previous_headings":"","what":"Project Directory Setup","title":"flexsdm: Overview of Pre-modeling functions","text":"building SDM’s, organizing folders (directories) project save time confusion. project directory main project folder store relevant data results current project. Now, let’s create project directory initial data model results stored. function sdm_directory() can , based types model algorithms want use /types projections like make. First decide computer like store inputs outputs project (main directory) use dir.create() create main directory. 
Next, specify whether want include folders projections, calibration areas, algorithms, ensembles, thresholds.","code":"my_project <- file.path(file.path(tempdir(), \"flex_sdm_project\")) dir.create(my_project) project_directory <- sdm_directory( main_dir = my_project, projections = NULL, calibration_area = TRUE, algorithm = c(\"fit_max\", \"tune_raf\"), ensemble = c(\"mean\"), threshold = TRUE, return_vector = TRUE )"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"data-species-occurrence-and-background-data","dir":"Articles","previous_headings":"","what":"Data, species occurrence and background data","title":"flexsdm: Overview of Pre-modeling functions","text":"tutorial, using species occurrences available flexsdm package. “spp” example dataset includes pr_ab column (presence = 1, absence = 0), location columns (x, y). can load “spp” data local R environment using code :","code":"data(\"spp\") spp #> # A tibble: 1,150 × 4 #> species x y pr_ab #>
#> 1 sp1 -5541. -145138. 0 #> 2 sp1 -51981. 16322. 0 #> 3 sp1 -269871. 69512. 1 #> 4 sp1 -96261. -32008. 0 #> 5 sp1 269589. -566338. 0 #> 6 sp1 29829. -328468. 0 #> 7 sp1 -152691. 393782. 0 #> 8 sp1 -195081. 253652. 0 #> 9 sp1 -951. -277978. 0 #> 10 sp1 145929. -271498. 0 #> # ℹ 1,140 more rows"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"geographic-region","dir":"Articles","previous_headings":"","what":"Geographic region","title":"flexsdm: Overview of Pre-modeling functions","text":"species occurrences located California Floristic Province (far western USA). “regions” dataset can used visualize study area geographic space. points distributed across study area?","code":"regions <- system.file(\"external/regions.tif\", package = \"flexsdm\") regions <- terra::rast(regions) try(plot(regions), silent=TRUE) points(spp[, 2:3], pch = 19, cex = 0.5, col = as.factor(spp$species))"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"calibration-area","dir":"Articles","previous_headings":"","what":"Calibration area","title":"flexsdm: Overview of Pre-modeling functions","text":"important decision SDM delimit model’s calibration area, geographic space use train model(s). Choice calibration area affects modeling steps, including sampling pseudo-absence background points, performance metrics, geographic patterns habitat suitability. want train SDM using entire extent United States interested geographic distribution environmental controls rare plant species found mountaintops Sierra Nevada, California! Let’s use presence locations one species exercise. calib_area() function offers three methods defining calibration area: buffer, mcp, bmcp, mask. briefly go .","code":"spp1 <- spp %>% dplyr::filter(species == \"sp1\") %>% dplyr::filter(pr_ab == 1) %>% dplyr::select(-pr_ab)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"buffer","dir":"Articles","previous_headings":"Calibration area","what":"1. Buffer","title":"flexsdm: Overview of Pre-modeling functions","text":"calibration area defined using buffers around presence points. User’s can specify distance around points using “width” argument. buffer width value interpreted m CRS longitude/latitude, map units cases.","code":"crs(regions, proj=TRUE) #> [1] \"+proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs\" ca_1 <- calib_area( data = spp1, x = \"x\", y = \"y\", method = c(\"buffer\", width = 40000), crs = crs(regions) ) plot(regions, main = \"Buffer method\") plot(ca_1, add = TRUE) points(spp1[, 2:3], pch = 19, cex = 0.5)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"minimum-convex-polygon","dir":"Articles","previous_headings":"Calibration area","what":"2. Minimum convex polygon","title":"flexsdm: Overview of Pre-modeling functions","text":"minimum convex polygon (mcp) method produces much simpler shape.","code":"ca_2 <- calib_area( data = spp1, x = \"x\", y = \"y\", method = c(\"mcp\"), crs = crs(regions) ) plot(regions, main = \"Minimum convex polygon method\") plot(ca_2, add = TRUE) points(spp1[, 2:3], pch = 19, cex = 0.5)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"buffered-minimum-convex-polygon","dir":"Articles","previous_headings":"Calibration area","what":"3. 
Buffered minimum convex polygon","title":"flexsdm: Overview of Pre-modeling functions","text":"can also create buffer around minimum convex polygon.","code":"ca_3 <- calib_area( data = spp1, x = \"x\", y = \"y\", method = c(\"bmcp\", width = 40000), crs = crs(regions) ) plot(regions, main = \"Buffered minimum convex polygon\") plot(ca_3, add = TRUE) points(spp1[, 2:3], pch = 19, cex = 0.5)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"mask","dir":"Articles","previous_headings":"Calibration area","what":"4. Mask","title":"flexsdm: Overview of Pre-modeling functions","text":"mask method allows polygons selected intersect species locations delineate calibration area. useful expect species distributions associated ecologically significant (mapped) ecoregions, interested distributions within political boundaries. use random set polygons named “clusters” illustrate mask method. original polygons left polygons contain points (“mask” calibration area) right.","code":"clusters <- system.file(\"external/clusters.shp\", package = \"flexsdm\") clusters <- terra::vect(clusters) ca_4 <- calib_area( data = spp1, x = \"x\", y = \"y\", method = c(\"mask\", clusters, \"clusters\"), crs = crs(regions) ) par(mfrow = c(1, 2)) plot(clusters, main = \"Original polygons\") plot(ca_4, main = \"Polygons with points (mask)\") points(spp1[, 2:3], pch = 19, cex = 0.5)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"reducing-collinearity-among-the-predictors","dir":"Articles","previous_headings":"","what":"Reducing collinearity among the predictors","title":"flexsdm: Overview of Pre-modeling functions","text":"Predictor collinearity common issue SDMs, can lead model overfitting inaccurate tests significance predictors (De Marco & Nóbrega, 2018; Dormann et al., 2013).","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"environmental-predictors","dir":"Articles","previous_headings":"Reducing collinearity among the predictors","what":"Environmental predictors","title":"flexsdm: Overview of Pre-modeling functions","text":"use four climatic variables available flexsdm package: actual evapotranspiration (CFP_1), climatic water deficit (CFP_2), maximum temperature warmest month (CFP_3), minimum temperature coldest month (CFP_4). relationship different environmental variables can visualized pairs() function terra package. Several variables highly correlated (.89 predictors tmx tmn). can correct reduce collinearity? function correct_colinvar() four methods deal collinearity: pearson, vif, pca, fa. method returns 1) raster object (SpatRaster) selected predictors 2) useful outputs relevant method. functions used supplementary tools, predictor selection SDMs complicated ultimately based relationship environment species’ biology. said, functions offer options exploring relationships predictor variables can aid predictor selection process. Let’s look method:","code":"somevar <- system.file(\"external/somevar.tif\", package = \"flexsdm\") somevar <- terra::rast(somevar) names(somevar) <- c(\"aet\", \"cwd\", \"tmx\", \"tmn\") plot(somevar) terra::pairs(somevar)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"pearson-correlation","dir":"Articles","previous_headings":"Reducing collinearity among the predictors","what":"1. 
Pearson correlation","title":"flexsdm: Overview of Pre-modeling functions","text":"method returns three objects 1) SpatRaster environmental variables correlation given threshold (default 0.7), 2) names variables correlation given threshold “removed” environmental data, 3) correlation matrix environmental variables. However, strongly urge users use information along knowledge specific species-environment relationships select ecologically-relevant predictors SDMs. example, , modeling distribution plant species water-limited Mediterranean-type ecosystem, may want include climatic water deficit (cwd) actual evapotranspiration (aet). Despite highly correlated, variables capture water availability evaporative demand, respectively (Stephenson 1998). Additionally, minimum absolute temperature strongly controls vegetation distributions (Woodward, Lomas, Kelly 2004), select tmn (minimum temperature coldest month) example. references, see:","code":""},{"path":[]},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"woodward-f--i--m--r--lomas-and-c--k--kelly--2004--global-climate-and-the-distribution-of-plant-biomes--philosophical-transactions-of-the-royal-society-of-london--series-b-biological-sciences-35914651476-","dir":"Articles","previous_headings":"Reducing collinearity among the predictors > 1. Pearson correlation","what":"2. Woodward, F. I., M. R. Lomas, and C. K. Kelly. 2004. Global climate and the distribution of plant biomes. Philosophical transactions of the Royal Society of London. Series B, Biological sciences 359:1465–1476.","title":"flexsdm: Overview of Pre-modeling functions","text":"","code":"pearson_var <- correct_colinvar(somevar, method = c(\"pearson\", th = \"0.7\")) pearson_var$cor_table #> aet cwd tmx tmn #> aet 0.0000000 0.7689893 0.7924813 0.7845401 #> cwd 0.7689893 0.0000000 0.4168956 0.5881831 #> tmx 0.7924813 0.4168956 0.0000000 0.7323259 #> tmn 0.7845401 0.5881831 0.7323259 0.0000000 pearson_var$cor_variables #> $aet #> [1] \"cwd\" \"tmx\" \"tmn\" #> #> $cwd #> [1] \"aet\" #> #> $tmx #> [1] \"aet\" \"tmn\" #> #> $tmn #> [1] \"aet\" \"tmx\" chosen_variables <- somevar[[c('cwd','aet','tmn')]]"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"variance-inflation-factor","dir":"Articles","previous_headings":"Reducing collinearity among the predictors","what":"2. Variance inflation factor","title":"flexsdm: Overview of Pre-modeling functions","text":"method removes predictors variance inflation factor higher chosen threshold. , users can specify threshold (default 10). method retains predictors aet, tmx, tmn removes cwd. output method matches produced pearson method: 1) environmental layer retained variables, 2) list removed variables, 3) correlation matrix variables.","code":"vif_var <- correct_colinvar(somevar, method = c(\"vif\", th = \"10\")) vif_var$env_layer #> class : SpatRaster #> dimensions : 558, 394, 4 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. 
: +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source : somevar.tif #> names : aet, cwd, tmx, tmn #> min values : 0.000, -9.39489, 22.44685, 0.2591429 #> max values : 1357.865, 14.20047, 614.69125, 64.3747588 vif_var$removed_variables #> NULL vif_var$vif_table #> # A tibble: 4 × 2 #> Variables VIF #> #> 1 aet 7.62 #> 2 cwd 3.29 #> 3 tmx 3.95 #> 4 tmn 2.89"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"principal-component-analysis","dir":"Articles","previous_headings":"Reducing collinearity among the predictors","what":"3. Principal component analysis","title":"flexsdm: Overview of Pre-modeling functions","text":"Finally, “pca” method performs principal components analysis predictors returns axis accounts 95% total variance system. method returns 1) SpatRaster object selected environmental variables, 2) matrix coefficients principal components predictors, 3) tibble cumulative variance explained selected principal components.","code":"pca_var <- correct_colinvar(somevar, method = c(\"pca\")) pca_var$env_layer #> class : SpatRaster #> dimensions : 558, 394, 3 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> names : PC1, PC2, PC3 #> min values : -8.453273, -4.260147, -1.525085 #> max values : 2.827164, 3.337545, 4.342864 pca_var$coefficients #> # A tibble: 4 × 5 #> variable PC1 PC2 PC3 PC4 #> #> 1 aet 0.550 -0.0722 0.296 -0.778 #> 2 cwd 0.450 -0.777 0.103 0.429 #> 3 tmx -0.485 -0.594 -0.450 -0.459 #> 4 tmn -0.511 -0.198 0.836 -0.0241 pca_var$cumulative_variance #> # A tibble: 4 × 2 #> PC cvar #> #> 1 1 0.764 #> 2 2 0.915 #> 3 3 0.979 #> 4 4 1"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"factorial-analysis","dir":"Articles","previous_headings":"Reducing collinearity among the predictors","what":"4. Factorial analysis","title":"flexsdm: Overview of Pre-modeling functions","text":"Selecting “fa” method performs factorial analysis reduce dimensionality selects predictor(s) highest correlation axis. outputs method similar produced ‘pca’ method.","code":"fa_var <- correct_colinvar(env_layer = somevar, method = c(\"fa\")) fa_var$env_layer fa_var$number_factors fa_var$removed_variables fa_var$uniqueness fa_var$loadings"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"data-filtering","dir":"Articles","previous_headings":"","what":"Data filtering","title":"flexsdm: Overview of Pre-modeling functions","text":"Sample bias species occurrence data common issue ecological studies filtering occurrence data can reduce bias. flexsdm provides two functions different types filtering, based geographical environmental “thinning”, randomly removing points dense (oversampling) geographical environmental space. can improve model performance reduce redundancy data.","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"environmental-filtering","dir":"Articles","previous_headings":"Data filtering","what":"Environmental filtering","title":"flexsdm: Overview of Pre-modeling functions","text":"function occfilt_env(), performs environmental filtering species occurrence data. method basically reduces environmental redundancy data based methods outlined Valera et al. (2014). 
However, function unique flexsdm, able use number environmental dimensions perform PCA filtering. example, use original environmental data (somevar) occurrence data single species (spp1). filtering occurrences, important row species data unique code (example: idd). function also gives user option specifying number classes used split environmental condition. explore results using 5, 8, 12 bins. Increasing number bins increases number occurrence points retained.","code":"spp1$idd <- 1:nrow(spp1) filt_env5 <- occfilt_env( data = spp1, x = \"x\", y = \"y\", id = \"idd\", env_layer = somevar, nbins = 5 ) #> Extracting values from raster ... #> 12 records were removed because they have NAs for some variables #> Number of unfiltered records: 238 #> Number of filtered records: 57 filt_env8 <- occfilt_env( data = spp1, x = \"x\", y = \"y\", id = \"idd\", env_layer = somevar, nbins = 8 ) #> Extracting values from raster ... #> 12 records were removed because they have NAs for some variables #> Number of unfiltered records: 238 #> Number of filtered records: 112 filt_env12 <- occfilt_env( data = spp1, x = \"x\", y = \"y\", id = \"idd\", env_layer = somevar, nbins = 12 ) #> Extracting values from raster ... #> 12 records were removed because they have NAs for some variables #> Number of unfiltered records: 238 #> Number of filtered records: 173 par(mfrow = c(2, 2)) somevar[[1]] %>% plot(main = \"Original occurrence data\") points(spp1 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with 5 bins\") points(filt_env5 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with 8 bins\") points(filt_env8 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with 12 bins\") points(filt_env12 %>% select(x, y))"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"geographical-filtering","dir":"Articles","previous_headings":"Data filtering","what":"Geographical filtering","title":"flexsdm: Overview of Pre-modeling functions","text":"Next, look occfilt_geo(), three alternatives determine distance threshold pair points: “moran” determines threshold distance points minimizes spatial autocorrelation occurrence data; “cellsize” filters occurrences based resolution predictors (specified coarser resolution); finally, “determined” allows users manually determine distance threshold.","code":"filt_geo1 <- occfilt_geo( data = spp1, x = \"x\", y = \"y\", env_layer = somevar, method = c(\"moran\"), prj = crs(somevar) ) #> Extracting values from raster ... #> 16 records were removed because they have NAs for some variables #> Number of unfiltered records: 234 #> Threshold for Moran: 0.1 #> Distance threshold(km): 345.859 #> Number of filtered records: 4 filt_geo2 <- occfilt_geo( data = spp1, x = \"x\", y = \"y\", env_layer = somevar, method = c(\"cellsize\", factor = \"3\"), # coarser resolution than the provided raster prj = crs(somevar) ) #> Extracting values from raster ... #> 16 records were removed because they have NAs for some variables #> Number of unfiltered records: 234 #> Distance threshold(km): 4.617 #> Number of filtered records: 212 filt_geo3 <- occfilt_geo( data = spp1, x = \"x\", y = \"y\", env_layer = somevar, method = c(\"defined\", d = \"30\"), prj = crs(somevar) ) #> Extracting values from raster ... 
#> 16 records were removed because they have NAs for some variables #> Number of unfiltered records: 234 #> Distance threshold(km): 30 #> Number of filtered records: 78 par(mfrow = c(2, 2)) somevar[[1]] %>% plot(main = \"Original occurrence data\") points(spp1 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with Moran's I\") points(filt_geo1 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with cell size\") points(filt_geo2 %>% select(x, y)) somevar[[1]] %>% plot(main = \"Filtering with defined distance (30km)\") points(filt_geo3 %>% select(x, y))"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"data-partitioning","dir":"Articles","previous_headings":"","what":"Data partitioning","title":"flexsdm: Overview of Pre-modeling functions","text":"Data partitioning, splitting data testing training groups, key step building SDMs. flexsdm offers multiple options data partitioning, including part_random(), part_sband(), part_sblock(), part_senv(). Let’s explore methods.","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"conventional-data-partitioning-methods-part_random","dir":"Articles","previous_headings":"Data partitioning","what":"1. Conventional data partitioning methods (part_random)","title":"flexsdm: Overview of Pre-modeling functions","text":"part_random() function provides users ability divide species occurrence data based conventional partition methods including k-folds, repeated k-folds, leave-one-cross-validation, bootstrap partitioning. , use “kfold” method 10 folds divide data. results 10 folds occurrence data 25 observations fold.","code":"spp1$pr_ab <- 1 # Add a column with 1 to denote that this is presences only data sp_part1 <- part_random( data = spp1, pr_ab = \"pr_ab\", method = c(method = \"kfold\", folds = 10) ) sp_part1$.part %>% table() #> . #> 1 2 3 4 5 6 7 8 9 10 #> 25 25 25 25 25 25 25 25 25 25"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"spatial-band-cross-validation-part_sband","dir":"Articles","previous_headings":"Data partitioning","what":"2. Spatial band cross-validation (part_sband)","title":"flexsdm: Overview of Pre-modeling functions","text":"part_sband() part_sblock() partition data based position geographic space. Geographically structured data partitioning methods especially useful users want evaluate model transferability different regions time periods. function part_sband tests different numbers spatial partitions using latitudinal longitudinal bands selects best number bands given presence, presence-absence, presence-background dataset. procedure based spatial autocorrelation, environmental similarity, number presence/absence records band partition. function’s output includes 1) tibble presence/absence locations assigned partition number, 2) tibble information best partition, 3) SpatRaster showing selected grid.","code":"set.seed(1) sp_part2 <- part_sband( env_layer = somevar, data = spp1, x = \"x\", y = \"y\", pr_ab = \"pr_ab\", type = \"lat\", # specify bands across different degrees of longitude 'lon' or latitude 'lat'. min_bands = 2, # minimum number of spatial bands to be tested max_bands = 20, # maximum number of spatial bands to be tested n_part = 2, prop = 0.5 ) #> 12 rows were excluded from database because NAs were found #> The following number of bands will be tested: #> 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 #> Creating basic raster mask... 
#> Searching for the optimal number of bands... plot(sp_part2$grid, col = gray.colors(20)) points(sp_part2$part[c(\"x\", \"y\")], col = rainbow(8)[sp_part2$part$.part], cex = 0.9, pch = c(1, 19)[sp_part2$part$pr_ab + 1] )"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"spatial-block-cross-validation-part_sblock","dir":"Articles","previous_headings":"Data partitioning","what":"3. Spatial block cross-validation (part_sblock)","title":"flexsdm: Overview of Pre-modeling functions","text":"part_sblock() function similar part_sband() instead bands explores spatial blocks different raster cells sizes returns one best suited input dataset. , can see data divided different “blocks” training testing. However, notice grid partition produced part_sblock different resolution original environmental variables. want map layer properties (.e. resolution, extent, NAs) original environmental variables, apply get_block() function grid resulting part_sblock(). layer can really useful generating pseudo-absence background sample points, explore next section.","code":"sp_part3 <- part_sblock( env_layer = somevar, data = spp1, x = \"x\", y = \"y\", pr_ab = \"pr_ab\", min_res_mult = 10, # Minimum value used for multiplying raster resolution and define the finest resolution to be tested max_res_mult = 500, # Maximum value used for multiplying raster resolution and define the coarsest resolution to be tested num_grids = 30, # Number of grid to be tested between min_res_mult X (raster resolution) and max_res_mult X (raster resolution) n_part = 2, # Number of partitions prop = 0.5 # Proportion of points used for testing autocorrelation between groupds (0-1) ) #> 12 rows were excluded from database because NAs were found #> The following grid cell sizes will be tested: #> 18900 | 50834.48 | 82768.97 | 114703.45 | 146637.93 | 178572.41 | 210506.9 | 242441.38 | 274375.86 | 306310.34 | 338244.83 | 370179.31 | 402113.79 | 434048.28 | 465982.76 | 497917.24 | 529851.72 | 561786.21 | 593720.69 | 625655.17 | 657589.66 | 689524.14 | 721458.62 | 753393.1 | 785327.59 | 817262.07 | 849196.55 | 881131.03 | 913065.52 | 945000 #> Creating basic raster mask... #> Searching for the optimal grid size... plot(sp_part3$grid) points(sp_part3$part[c(\"x\", \"y\")], col = c(\"blue\", \"red\")[sp_part3$part$.part], cex = 0.5, pch = 19 ) terra::res(sp_part3$grid) #> [1] 881131 881131 terra::res(somevar) #> [1] 1890 1890 grid_env <- get_block(env_layer = somevar, best_grid = sp_part3$grid) plot(grid_env) # this is a block layer with the same layer # properties as environmental variables. points(sp_part3$part[c(\"x\", \"y\")], col = c(\"blue\", \"red\")[sp_part3$part$.part], cex = 0.5, pch = 19 )"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"environmental-and-spatial-cross-validation-part_senv","dir":"Articles","previous_headings":"Data partitioning","what":"4. Environmental and spatial cross-validation (part_senv)","title":"flexsdm: Overview of Pre-modeling functions","text":"final partitioning function flexsdm part_senv(), explores different numbers environmental partitions based K-means clustering algorithm returns one best-suited particular dataset, considering spatial autocorrelation, environmental similarity, number presence /absence records partition. 
map shows partitioning based environmental spatial factors.","code":"sp_part4 <- part_senv( env_layer = somevar, data = spp1, x = \"x\", y = \"y\", pr_ab = \"pr_ab\", min_n_groups = 2, # Minimum number of groups to be tested max_n_groups = 10, # Maximum number of groups to be tested prop = 0.5 # Proportion of points used for testing autocorrelation between groups (0-1) ) #> 12 rows were excluded from database because NAs were found #> The following grid cell sizes will be tested: #> 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 #> Searching best partition... plot(regions, col = gray.colors(9)) points(sp_part4$part[c(\"x\", \"y\")], col = hcl.colors(length(unique(sp_part4$part)))[sp_part4$part$.part], cex = 1, pch = 19 )"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"background-and-pseudo-absence-sampling","dir":"Articles","previous_headings":"","what":"Background and pseudo-absence sampling","title":"flexsdm: Overview of Pre-modeling functions","text":"Presence-occurrence data quite common ecology researchers may adequate “absence” data species interest. Sometimes building species distribution models, need able generate background pseudo-absence points modeling goals. flexsdm package allows users using sample_background() sample_pseudoabs().","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"sample-background","dir":"Articles","previous_headings":"Background and pseudo-absence sampling","what":"1. Sample background","title":"flexsdm: Overview of Pre-modeling functions","text":"function sample_background() allows slection background sample points based different geographic restrictions sampling methods. , sample set background points based earlier spatial block partitioning using “random” method. Using lapply() case ensures generate background points spatial blocks (n = 2). also specifying want ten times amount background points original occurrences calibration area buffer area around presence points (see section “Calibration area”).","code":"p_data <- sp_part3$part # presence data from spatial block partition example set.seed(10) bg <- lapply(1:2, function(x) { sample_background( data = p_data, x = \"x\", y = \"y\", n = sum(p_data == x) * 10, # number of background points to be sampled method = \"random\", rlayer = grid_env, maskval = x, calibarea = ca_1 # A SpatVector which delimit the calibration area used for a given species ) }) %>% bind_rows() %>% mutate(pr_ab = 0) par(mfrow = c(2, 1)) plot(grid_env, main = \"Presence points\") plot(ca_1, add = TRUE) points(p_data, cex = .7, pch = 19) plot(grid_env, main = \"Background points\") plot(ca_1, add = TRUE) points(bg, cex = .1, pch = 19)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"sample-pseudo-absences","dir":"Articles","previous_headings":"Background and pseudo-absence sampling","what":"2. Sample pseudo-absences","title":"flexsdm: Overview of Pre-modeling functions","text":"Similarly, function sample_pseudoabs allows random pseudo-absence sampling based environmental /geographical constraints. example, specifying method = “env_const” selects pseudo-absences environmentally constrained regions lower suitability values predicted Bioclim model. Additionally, function allows users specify calibration area generate pseudo-absence points. , use buffer area around presence points (ca_1) show might look like. can see, generated pseudo-absence points general vicinity presence points, concentrated areas lower environmental suitability. 
specific method chosen sampling background /pseudo-absence points vary depending research goals.","code":"set.seed(10) psa <- lapply(1:2, function(x) { sample_pseudoabs( data = p_data, x = \"x\", y = \"y\", n = sum(p_data == x), # number of pseudo-absence points to be sampled method = c(\"env_const\", env = somevar), rlayer = grid_env, maskval = x, calibarea = ca_1 ) }) %>% bind_rows() %>% mutate(pr_ab = 0) #> Extents do not match, raster layers used were croped to minimum extent #> Extents do not match, raster layers used were croped to minimum extent par(mfrow = c(2, 1)) plot(grid_env, main = \"Presence points\") plot(ca_1, add = TRUE) points(p_data, cex = .7, pch = 19) plot(grid_env, main = \"Pseudo-absence points\") plot(ca_1, add = TRUE) points(psa, cex = .7, pch = 19)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v01_pre_modeling.html","id":"extracting-environmental-values","dir":"Articles","previous_headings":"","what":"Extracting environmental values","title":"flexsdm: Overview of Pre-modeling functions","text":"Finally, modeling species geographic distributions, must extract environmental data presences + absences/pseudo-absences/background point locations. function sdm_extract() extracts environmental data values based x y coordinates returns tibble original data + additional columns extracted environmental variables locations. Let’s original presence points (spp1) background locations (bg). #=========#=========#=========#=========#=========#=========#=========# Vignette still construction changes #=========#=========#=========#=========#=========#=========#=========#","code":"all_points <- bind_rows(spp1 %>% dplyr::select(-idd), bg) ex_spp <- sdm_extract( data = all_points, x = \"x\", y = \"y\", env_layer = somevar, # Raster with environmental variables variables = NULL, # Vector with the variable names of predictor variables Usage variables. = c(\"aet\", \"cwd\", \"tmin\"). If no variable is specified, function will return data for all layers. filter_na = TRUE ) ex_spp"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"flexsdm: Overview of Modeling functions","text":"Species distribution modeling (SDM) become standard tool multiple research areas, including ecology, conservation biology, biogeography, paleobiogeography, epidemiology. SDM area active theoretical methodological research. flexsdm package provides users ability manipulate parameterize models variety ways meet unique research needs. flexibility enables users define complete partial modeling procedure specific modeling situations (e.g., number variables, number records, different algorithms ensemble methods, algorithms tuning, etc.). vignette, users learn second set functions flexsdm package fall “modeling” umbrella. functions designed construct validate different types models can grouped fit_* , tune_* , esm_* family functions. addition function perform ensemble modeling. fit_* functions construct validate models default hyper-parameter values. tune_* functions construct validate models searching best combination hyper-parameter values, esm_ functions can used constructing validating Ensemble Small Models. Finally, fit_ensemble() function fitting validating ensemble models. 
functions model construction validation: fit_* functions family fit_gam() Fit validate Generalized Additive Models fit_gau() Fit validate Gaussian Process models fit_gbm() Fit validate Generalized Boosted Regression models fit_glm() Fit validate Generalized Linear Models fit_max() Fit validate Maximum Entropy models fit_net() Fit validate Neural Networks models fit_raf() Fit validate Random Forest models fit_svm() Fit validate Support Vector Machine models tune_* functions family tune_gbm() Fit validate Generalized Boosted Regression models exploration hyper-parameters tune_max() Fit validate Maximum Entropy models exploration hyper-parameters tune_net() Fit validate Neural Networks models exploration hyper-parameters tune_raf() Fit validate Random Forest models exploration hyper-parameters tune_svm() Fit validate Support Vector Machine models exploration hyper-parameters model ensemble fit_ensemble() Fit validate ensemble models different ensemble methods esm_* functions family esm_gam() Fit validate Generalized Additive Models Ensemble Small Model approach esm_gau() Fit validate Gaussian Process models Models Ensemble Small Model approach esm_gbm() Fit validate Generalized Boosted Regression models Ensemble Small Model approach esm_glm() Fit validate Generalized Linear Models Ensemble Small Model approach esm_max() Fit validate Maximum Entropy models Ensemble Small Model approach esm_net() Fit validate Neural Networks models Ensemble Small Model approach esm_svm() Fit validate Support Vector Machine models Ensemble Small Model approach","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"installation","dir":"Articles","previous_headings":"","what":"Installation","title":"flexsdm: Overview of Modeling functions","text":"First, install flexsdm package. can install released version flexsdm github :","code":"# devtools::install_github('sjevelazco/flexsdm') require(flexsdm) #> Loading required package: flexsdm require(terra) #> Loading required package: terra #> terra 1.7.55 #> #> Attaching package: 'terra' #> The following object is masked from 'package:knitr': #> #> spin require(dplyr) #> Loading required package: dplyr #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:terra': #> #> intersect, union #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"project-directory-setup","dir":"Articles","previous_headings":"","what":"Project directory setup","title":"flexsdm: Overview of Modeling functions","text":"Decide computer like store inputs outputs project (main directory). Use existing one use dir.create() create main directory. specify whether include folders projections, calibration areas, algorithms, ensembles, thresholds. details see Vignette 01_pre_modeling","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"data-species-occurrence-and-background-data","dir":"Articles","previous_headings":"","what":"Data, species occurrence and background data","title":"flexsdm: Overview of Modeling functions","text":"tutorial, using species occurrences environmental data available flexsdm package. “abies” example dataset includes pr_ab column (presence = 1, absence = 0), location columns (x, y) environmental data. 
can load “abies” data local R environment using code : (EXAMPLE LOOKS LITTLE STRANGE ALSO USING BACKGROUND DATA, ABIES DATASET CLEARLY ABSENCES…) want replace abies dataset data, make sure dataset contains environmental conditions related presence-absence data. use pre-modeling family function k-fold partition method (used cross-validation). partition method number folds replications must presence-absence background points datasets. Now, abies2 object new column called “.part” 5 k-folds (1, 2, 3, 4, 5), indicating partition record (row) member . Next, apply partition method number folds environmental conditions background points. backg2 object new column called “.part” 5 k-folds (1, 2, 3, 4, 5).","code":"data(\"abies\") data(\"backg\") dplyr::glimpse(abies) #> Rows: 1,400 #> Columns: 13 #> $ id 715, 5680, 7907, 1850, 1702, 10036, 12384, 6513, 9884, 8651, … #> $ pr_ab 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0… #> $ x -95417.134, 98986.536, 121474.257, -39976.221, 111372.261, -2… #> $ y 314240.13, -159415.18, -99463.44, -17456.11, -91404.05, 39222… #> $ aet 323.1133, 447.5567, 182.2833, 372.3867, 209.4567, 308.3000, 5… #> $ cwd 546.1400, 815.4033, 271.1800, 946.2933, 398.5500, 534.9533, 3… #> $ tmin 1.2433, 9.4267, -4.9500, 8.7767, -4.0333, 4.6600, 4.3800, 4.9… #> $ ppt_djf 62.7257, 129.6406, 150.7003, 116.0236, 164.9327, 166.2220, 48… #> $ ppt_jja 17.7941, 6.4317, 11.2294, 2.7020, 9.2686, 16.5310, 41.2494, 8… #> $ pH 5.773341, 5.600000, 0.000000, 6.411796, 0.000000, 5.700000, 5… #> $ awc 0.10837019, 0.16000000, 0.00000000, 0.09719457, 0.00000000, 0… #> $ depth 152.000000, 201.000000, 0.000000, 59.759930, 0.000000, 112.99… #> $ landform 7, 11, 15, 14, 15, 15, 7, 15, 4, 10, 6, 10, 10, 15, 10, 11, 1… dplyr::glimpse(backg) #> Rows: 5,000 #> Columns: 13 #> $ pr_ab 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … #> $ x 160779.16, 36849.16, -240170.84, -152420.84, -193190.84, … #> $ y -449968.33, 24151.67, 90031.67, -143518.33, 24151.67, 223… #> $ aet 280.4567, 259.7800, 400.1767, 367.4833, 397.3667, 385.263… #> $ cwd 1137.2433, 381.5367, 699.6500, 843.4467, 842.3833, 637.35… #> $ tmin 13.5100, -3.1733, 8.6800, 9.0133, 8.9700, 4.9333, 6.2933,… #> $ ppt_djf 71.2741, 171.4537, 285.0893, 72.0309, 125.2467, 226.1534,… #> $ ppt_jja 1.1920, 17.5193, 5.0158, 1.2047, 1.9778, 8.1554, 18.4182,… #> $ pH 0.0000000, 0.2122687, 5.7222223, 7.5350823, 6.1963525, 5.… #> $ awc 0.000000000, 0.003473487, 0.080370426, 0.170000002, 0.131… #> $ depth 0.00000, 201.00000, 50.07409, 154.39426, 122.39575, 56.17… #> $ percent_clay 0.0000000, 0.4438345, 18.4111176, 46.9751244, 37.1873169,… #> $ landform 13, 10, 6, 6, 10, 14, 8, 14, 6, 7, 11, 14, 14, 10, 6, 6, … abies2 <- part_random( data = abies, pr_ab = \"pr_ab\", method = c(method = \"kfold\", folds = 5) ) dplyr::glimpse(abies2) #> Rows: 1,400 #> Columns: 14 #> $ id 715, 5680, 7907, 1850, 1702, 10036, 12384, 6513, 9884, 8651, … #> $ pr_ab 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0… #> $ x -95417.134, 98986.536, 121474.257, -39976.221, 111372.261, -2… #> $ y 314240.13, -159415.18, -99463.44, -17456.11, -91404.05, 39222… #> $ aet 323.1133, 447.5567, 182.2833, 372.3867, 209.4567, 308.3000, 5… #> $ cwd 546.1400, 815.4033, 271.1800, 946.2933, 398.5500, 534.9533, 3… #> $ tmin 1.2433, 9.4267, -4.9500, 8.7767, -4.0333, 4.6600, 4.3800, 4.9… #> $ ppt_djf 62.7257, 129.6406, 150.7003, 116.0236, 164.9327, 166.2220, 48… #> $ ppt_jja 17.7941, 6.4317, 11.2294, 2.7020, 9.2686, 16.5310, 41.2494, 8… #> $ pH 5.773341, 5.600000, 0.000000, 
6.411796, 0.000000, 5.700000, 5… #> $ awc 0.10837019, 0.16000000, 0.00000000, 0.09719457, 0.00000000, 0… #> $ depth 152.000000, 201.000000, 0.000000, 59.759930, 0.000000, 112.99… #> $ landform 7, 11, 15, 14, 15, 15, 7, 15, 4, 10, 6, 10, 10, 15, 10, 11, 1… #> $ .part 2, 2, 3, 4, 2, 1, 5, 5, 2, 2, 4, 4, 1, 5, 4, 5, 5, 5, 1, 3, 1… backg2 <- part_random( data = backg, pr_ab = \"pr_ab\", method = c(method = \"kfold\", folds = 5) ) dplyr::glimpse(backg2) #> Rows: 5,000 #> Columns: 14 #> $ pr_ab 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … #> $ x 160779.16, 36849.16, -240170.84, -152420.84, -193190.84, … #> $ y -449968.33, 24151.67, 90031.67, -143518.33, 24151.67, 223… #> $ aet 280.4567, 259.7800, 400.1767, 367.4833, 397.3667, 385.263… #> $ cwd 1137.2433, 381.5367, 699.6500, 843.4467, 842.3833, 637.35… #> $ tmin 13.5100, -3.1733, 8.6800, 9.0133, 8.9700, 4.9333, 6.2933,… #> $ ppt_djf 71.2741, 171.4537, 285.0893, 72.0309, 125.2467, 226.1534,… #> $ ppt_jja 1.1920, 17.5193, 5.0158, 1.2047, 1.9778, 8.1554, 18.4182,… #> $ pH 0.0000000, 0.2122687, 5.7222223, 7.5350823, 6.1963525, 5.… #> $ awc 0.000000000, 0.003473487, 0.080370426, 0.170000002, 0.131… #> $ depth 0.00000, 201.00000, 50.07409, 154.39426, 122.39575, 56.17… #> $ percent_clay 0.0000000, 0.4438345, 18.4111176, 46.9751244, 37.1873169,… #> $ landform 13, 10, 6, 6, 10, 14, 8, 14, 6, 7, 11, 14, 14, 10, 6, 6, … #> $ .part 2, 3, 4, 4, 1, 4, 5, 4, 3, 1, 5, 1, 4, 2, 5, 4, 2, 5, 1, …"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"fit-and-validate-models","dir":"Articles","previous_headings":"Data, species occurrence and background data","what":"1. Fit and validate models","title":"flexsdm: Overview of Modeling functions","text":"fit validate models: . maximum entropy model default hyper-parameter values (flexsdm::fit_max) II. random forest model exploration hyper-parameters (flexsdm::tune_raf). . Maximum Entropy models default hyper-parameter values. function returns list object following elements: model: “MaxEnt” class object. object can used predicting. predictors: tibble quantitative (c column names) qualitative (f column names) variables use modeling. performance: performance metric (see sdm_eval). metrics threshold dependent calculated based threshold specified argument. can see selected threshold values. Predicted suitability test partition (row) based best model. database used fit_ensemble. II- Random forest models exploration hyper-parameters. First, create data.frame provides hyper-parameters values tested. recommended generate data.frame. Hyper-parameter needed tuning ‘mtry’. maximum mtry must equal total number predictors. use data object abies2, k-fold partition method: Let’s see output object contains. function returns list object following elements: model: “randomForest” class object. object can used see formula details, basic summary o fthe model, predicting. predictors: tibble quantitative (c column names) qualitative (f column names) variables use modeling. performance: performance metric (see sdm_eval). metrics threshold dependent calculated based threshold specified argument. can see selected threshold values. Predicted suitability test partition (row) based best model. database used fit_ensemble. 
model objects can used flexsdm::fit_ensemble().","code":"max_t1 <- fit_max( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), predictors_f = c(\"landform\"), partition = \".part\", background = backg2, thr = c(\"max_sens_spec\", \"equal_sens_spec\", \"max_sorensen\"), clamp = TRUE, classes = \"default\", pred_type = \"cloglog\", regmult = 1 ) #> Formula used for model fitting: #> ~aet + ppt_jja + pH + awc + depth + I(aet^2) + I(ppt_jja^2) + I(pH^2) + I(awc^2) + I(depth^2) + hinge(aet) + hinge(ppt_jja) + hinge(pH) + hinge(awc) + hinge(depth) + ppt_jja:aet + pH:aet + awc:aet + depth:aet + pH:ppt_jja + awc:ppt_jja + depth:ppt_jja + awc:pH + depth:pH + depth:awc + categorical(landform) - 1 #> Replica number: 1/1 #> Partition number: 1/5 #> Partition number: 2/5 #> Partition number: 3/5 #> Partition number: 4/5 #> Partition number: 5/5 names(max_t1) #> [1] \"model\" \"predictors\" \"performance\" \"data_ens\" options(max.print = 20) max_t1$model #> #> Call: glmnet::glmnet(x = mm, y = as.factor(p), family = \"binomial\", weights = weights, lambda = 10^(seq(4, 0, length.out = 200)) * sum(reg)/length(reg) * sum(p)/sum(weights), standardize = F, penalty.factor = reg) #> #> Df %Dev Lambda #> 1 0 0.00 21.3700 #> 2 0 0.00 20.4100 #> 3 0 0.00 19.4800 #> 4 0 0.00 18.6000 #> 5 0 0.00 17.7600 #> 6 0 0.00 16.9600 #> [ reached getOption(\"max.print\") -- omitted 194 rows ] max_t1$predictors #> # A tibble: 1 × 6 #> c1 c2 c3 c4 c5 f #> #> 1 aet ppt_jja pH awc depth landform max_t1$performance #> # A tibble: 3 × 25 #> model threshold thr_value n_presences n_absences TPR_mean TPR_sd TNR_mean #> #> 1 max equal_sens_sp… 0.573 700 700 0.669 0.0288 0.669 #> 2 max max_sens_spec 0.416 700 700 0.877 0.0609 0.56 #> 3 max max_sorensen 0.335 700 700 0.951 0.0362 0.457 #> # ℹ 17 more variables: TNR_sd , SORENSEN_mean , SORENSEN_sd , #> # JACCARD_mean , JACCARD_sd , FPB_mean , FPB_sd , #> # OR_mean , OR_sd , TSS_mean , TSS_sd , AUC_mean , #> # AUC_sd , BOYCE_mean , BOYCE_sd , IMAE_mean , #> # IMAE_sd max_t1$data_ens #> # A tibble: 1,400 × 5 #> rnames replicates part pr_ab pred #> #> 1 6 .part 1 0 0.656 #> 2 13 .part 1 0 0.0405 #> 3 19 .part 1 0 0.779 #> 4 21 .part 1 0 0.407 #> 5 25 .part 1 0 0.851 #> 6 27 .part 1 0 0.706 #> 7 31 .part 1 0 0.395 #> 8 33 .part 1 0 0.0456 #> 9 35 .part 1 0 0.412 #> 10 36 .part 1 0 0.130 #> # ℹ 1,390 more rows tune_grid <- expand.grid(mtry = seq(1, 7, 1)) rf_t <- tune_raf( data = abies2, response = \"pr_ab\", predictors = c( \"aet\", \"cwd\", \"tmin\", \"ppt_djf\", \"ppt_jja\", \"pH\", \"awc\", \"depth\" ), predictors_f = c(\"landform\"), partition = \".part\", grid = tune_grid, thr = \"max_sens_spec\", metric = \"TSS\", ) #> Formula used for model fitting: #> pr_ab ~ aet + cwd + tmin + ppt_djf + ppt_jja + pH + awc + depth + landform #> Tuning model... #> Replica number: 1/1 #> Formula used for model fitting: #> pr_ab ~ aet + cwd + tmin + ppt_djf + ppt_jja + pH + awc + depth + landform #> Replica number: 1/1 #> Partition number: 1/5 #> Partition number: 2/5 #> Partition number: 3/5 #> Partition number: 4/5 #> Partition number: 5/5 names(rf_t) #> [1] \"model\" \"predictors\" \"performance\" #> [4] \"hyper_performance\" \"data_ens\" rf_t$model #> #> Call: #> randomForest(formula = formula1, data = data, mtry = mtry, ntree = 500, importance = FALSE, ) #> Type of random forest: classification #> Number of trees: 500 #> No. 
of variables tried at each split: 1 #> #> OOB estimate of error rate: 11.64% #> Confusion matrix: #> 0 1 class.error #> 0 588 112 0.16000000 #> 1 51 649 0.07285714 rf_t$predictors #> # A tibble: 1 × 9 #> c1 c2 c3 c4 c5 c6 c7 c8 f #> #> 1 aet cwd tmin ppt_djf ppt_jja pH awc depth landform rf_t$performance #> # A tibble: 1 × 26 #> mtry model threshold thr_value n_presences n_absences TPR_mean TPR_sd #> #> 1 1 raf max_sens_spec 0.606 700 700 0.93 0.0333 #> # ℹ 18 more variables: TNR_mean , TNR_sd , SORENSEN_mean , #> # SORENSEN_sd , JACCARD_mean , JACCARD_sd , FPB_mean , #> # FPB_sd , OR_mean , OR_sd , TSS_mean , TSS_sd , #> # AUC_mean , AUC_sd , BOYCE_mean , BOYCE_sd , #> # IMAE_mean , IMAE_sd rf_t$data_ens #> # A tibble: 1,400 × 5 #> rnames replicates part pr_ab pred #> #> 1 6 .part 1 0 0.27 #> 2 13 .part 1 0 0.032 #> 3 19 .part 1 0 0.09 #> 4 21 .part 1 0 0.09 #> 5 25 .part 1 0 0.24 #> 6 27 .part 1 0 0.27 #> 7 31 .part 1 0 0.272 #> 8 33 .part 1 0 0.02 #> 9 35 .part 1 0 0.156 #> 10 36 .part 1 0 0.018 #> # ℹ 1,390 more rows"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"model-ensemble","dir":"Articles","previous_headings":"Data, species occurrence and background data","what":"2. Model Ensemble","title":"flexsdm: Overview of Modeling functions","text":"example fit validate ensemble model using two model objects just created.","code":"# Fit and validate ensemble model an_ensemble <- fit_ensemble( models = list(max_t1, rf_t), ens_method = \"meansup\", thr = NULL, thr_model = \"max_sens_spec\", metric = \"TSS\" ) #> | | | 0% | |======================================================================| 100% # Outputs names(an_ensemble) #> [1] \"models\" \"thr_metric\" \"predictors\" \"performance\" an_ensemble$thr_metric #> [1] \"max_sens_spec\" \"TSS_mean\" an_ensemble$predictors #> # A tibble: 2 × 9 #> c1 c2 c3 c4 c5 f c6 c7 c8 #> #> 1 aet ppt_jja pH awc depth landform NA NA NA #> 2 aet cwd tmin ppt_djf ppt_jja landform pH awc depth an_ensemble$performance #> # A tibble: 7 × 25 #> model threshold thr_value n_presences n_absences TPR_mean TPR_sd TNR_mean #> #> 1 meansup equal_sens_… 0.596 700 700 0.879 0.0220 0.88 #> 2 meansup lpt 0.05 700 700 1 0 0.414 #> 3 meansup max_fpb 0.568 700 700 0.931 0.0322 0.86 #> 4 meansup max_jaccard 0.568 700 700 0.931 0.0322 0.86 #> 5 meansup max_sens_sp… 0.568 700 700 0.93 0.0333 0.861 #> 6 meansup max_sorensen 0.568 700 700 0.931 0.0322 0.86 #> 7 meansup sensitivity 0.55 700 700 0.9 0 0.861 #> # ℹ 17 more variables: TNR_sd , SORENSEN_mean , SORENSEN_sd , #> # JACCARD_mean , JACCARD_sd , FPB_mean , FPB_sd , #> # OR_mean , OR_sd , TSS_mean , TSS_sd , AUC_mean , #> # AUC_sd , BOYCE_mean , BOYCE_sd , IMAE_mean , #> # IMAE_sd "},{"path":"https://sjevelazco.github.io/flexsdm/articles/v02_modeling.html","id":"fit-and-validate-models-with-ensemble-of-small-model-approach","dir":"Articles","previous_headings":"Data, species occurrence and background data","what":"3. Fit and validate models with Ensemble of Small Model approach","title":"flexsdm: Overview of Modeling functions","text":"method consists creating bivariate models pair-wise combinations predictors perform ensemble based average suitability weighted Somers’ D metric (D = 2 x (AUC -0.5)). ESM recommended modeling species occurrences. function allow categorical variables use types variables problematic applied species occurrences. detail see Breiner et al. (2015, 2018) can use different methods flexsdm::part_random function according data. See part_random details. 
function constructs Generalized Additive Models using Ensembles Small Models (ESM) approach (Breiner et al., 2015, 2018). function returns list object following elements: esm_model: list “GAM” class object bivariate model. object can used predicting using ESM approachwith sdm_predict function. predictors: tibble variables use modeling. performance: Performance metric (see sdm_eval). threshold dependent metrics calculated based threshold specified argument. Now, test rep_kfold partition method. method ‘folds’ refers number partitions data partitioning ‘replicate’ refers number replicates. assume values >=1. use new rep_kfold partition gam model Test random bootstrap partitioning. method ‘replicate’ refers number replicates (assumes value >=1), ‘proportion’ refers proportion occurrences used model fitting (assumes value >0 <=1). method can configure proportion training testing data according species occurrences. example, proportion=‘0.7’ indicates 70% data used model training, 30% used model testing. method, function return .partX columns “train” “test” words entries. Use new rep_kfold partition gam model #=========#=========#=========#=========#=========#=========#=========# Vignette still construction changes #=========#=========#=========#=========#=========#=========#=========#","code":"data(\"abies\") library(dplyr) # Create a smaller subset of occurrences set.seed(10) abies2 <- abies %>% na.omit() %>% group_by(pr_ab) %>% dplyr::slice_sample(n = 10) %>% group_by() # Using k-fold partition method for model cross validation abies2 <- part_random( data = abies2, pr_ab = \"pr_ab\", method = c(method = \"kfold\", folds = 3) ) abies2 #> # A tibble: 20 × 14 #> id pr_ab x y aet cwd tmin ppt_djf ppt_jja pH awc #> #> 1 12040 0 -308909. 384248. 573. 332. 4.84 521. 48.8 5.63 0.108 #> 2 10361 0 -254286. 417158. 260. 469. 2.93 151. 15.1 6.20 0.0950 #> 3 9402 0 -286979. 386206. 587. 376. 6.45 333. 15.7 5.5 0.160 #> 4 9815 0 -291849. 445595. 443. 455. 4.39 332. 19.1 6 0.0700 #> 5 10524 0 -256658. 184438. 355. 568. 5.87 303. 10.6 5.20 0.0800 #> 6 8860 0 121343. -164170. 354. 733. 3.97 182. 9.83 0 0 #> 7 6431 0 107903. -122968. 461. 578. 4.87 161. 7.66 5.90 0.0900 #> 8 11730 0 -333903. 431238. 561. 364. 6.73 387. 25.2 5.80 0.130 #> 9 808 0 -150163. 357180. 339. 564. 2.64 220. 15.3 6.40 0.100 #> 10 11054 0 -293663. 340981. 477. 396. 3.89 332. 26.4 4.60 0.0634 #> 11 2960 1 -49273. 181752. 512. 275. 0.920 319. 17.3 5.92 0.0900 #> 12 3065 1 126907. -198892. 322. 544. 0.700 203. 10.6 5.60 0.110 #> 13 5527 1 116751. -181089. 261. 537. 0.363 178. 7.43 0 0 #> 14 4035 1 -31777. 115940. 394. 440. 2.07 298. 11.2 6.01 0.0769 #> 15 4081 1 -5158. 90159. 301. 502. 0.703 203. 14.6 6.11 0.0633 #> 16 3087 1 102151. -143976. 299. 425. -2.08 205. 13.4 3.88 0.110 #> 17 3495 1 -19586. 89803. 438. 419. 2.13 189. 15.2 6.19 0.0959 #> 18 4441 1 49405. -60502. 362. 582. 2.42 218. 7.84 5.64 0.0786 #> 19 301 1 -132516. 270845. 367. 196. -2.56 422. 26.3 6.70 0.0300 #> 20 3162 1 59905. -53634. 319. 626. 1.99 212. 4.50 4.51 0.0396 #> # ℹ 3 more variables: depth , landform , .part # We set the model without threshold specification and with the kfold created above esm_gam_t1 <- esm_gam( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"cwd\", \"tmin\", \"ppt_djf\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), partition = \".part\", thr = NULL ) #> #> Model has more coefficients than data used for training it. 
Try to reduce k names(esm_gam_t1) #> NULL options(max.print = 10) # If you don't want to see printed all the output esm_gam_t1$esm_model #> NULL esm_gam_t1$predictors #> NULL esm_gam_t1$performance #> NULL # Remove the previous k-fold partition abies2 <- abies2 %>% select(-starts_with(\".\")) # Test with rep_kfold partition using 3 folds and 5 replicates set.seed(10) abies2 <- part_random( data = abies2, pr_ab = \"pr_ab\", method = c(method = \"rep_kfold\", folds = 3, replicates = 5) ) abies2 #> # A tibble: 20 × 18 #> id pr_ab x y aet cwd tmin ppt_djf ppt_jja pH awc #> #> 1 12040 0 -308909. 384248. 573. 332. 4.84 521. 48.8 5.63 0.108 #> 2 10361 0 -254286. 417158. 260. 469. 2.93 151. 15.1 6.20 0.0950 #> 3 9402 0 -286979. 386206. 587. 376. 6.45 333. 15.7 5.5 0.160 #> 4 9815 0 -291849. 445595. 443. 455. 4.39 332. 19.1 6 0.0700 #> 5 10524 0 -256658. 184438. 355. 568. 5.87 303. 10.6 5.20 0.0800 #> 6 8860 0 121343. -164170. 354. 733. 3.97 182. 9.83 0 0 #> 7 6431 0 107903. -122968. 461. 578. 4.87 161. 7.66 5.90 0.0900 #> 8 11730 0 -333903. 431238. 561. 364. 6.73 387. 25.2 5.80 0.130 #> 9 808 0 -150163. 357180. 339. 564. 2.64 220. 15.3 6.40 0.100 #> 10 11054 0 -293663. 340981. 477. 396. 3.89 332. 26.4 4.60 0.0634 #> 11 2960 1 -49273. 181752. 512. 275. 0.920 319. 17.3 5.92 0.0900 #> 12 3065 1 126907. -198892. 322. 544. 0.700 203. 10.6 5.60 0.110 #> 13 5527 1 116751. -181089. 261. 537. 0.363 178. 7.43 0 0 #> 14 4035 1 -31777. 115940. 394. 440. 2.07 298. 11.2 6.01 0.0769 #> 15 4081 1 -5158. 90159. 301. 502. 0.703 203. 14.6 6.11 0.0633 #> 16 3087 1 102151. -143976. 299. 425. -2.08 205. 13.4 3.88 0.110 #> 17 3495 1 -19586. 89803. 438. 419. 2.13 189. 15.2 6.19 0.0959 #> 18 4441 1 49405. -60502. 362. 582. 2.42 218. 7.84 5.64 0.0786 #> 19 301 1 -132516. 270845. 367. 196. -2.56 422. 26.3 6.70 0.0300 #> 20 3162 1 59905. -53634. 319. 626. 1.99 212. 4.50 4.51 0.0396 #> # ℹ 7 more variables: depth , landform , .part1 , .part2 , #> # .part3 , .part4 , .part5 esm_gam_t2 <- esm_gam( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"cwd\", \"tmin\", \"ppt_djf\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), partition = \".part\", thr = NULL ) #> #> Model has more coefficients than data used for training it. Try to reduce k # Remove the previous k-fold partition abies2 <- abies2 %>% select(-starts_with(\".\")) # Test with bootstrap partition using 10 replicates set.seed(10) abies2 <- part_random( data = abies2, pr_ab = \"pr_ab\", method = c(method = \"boot\", replicates = 10, proportion = 0.7) ) abies2 #> # A tibble: 20 × 23 #> id pr_ab x y aet cwd tmin ppt_djf ppt_jja pH awc #> #> 1 12040 0 -308909. 384248. 573. 332. 4.84 521. 48.8 5.63 0.108 #> 2 10361 0 -254286. 417158. 260. 469. 2.93 151. 15.1 6.20 0.0950 #> 3 9402 0 -286979. 386206. 587. 376. 6.45 333. 15.7 5.5 0.160 #> 4 9815 0 -291849. 445595. 443. 455. 4.39 332. 19.1 6 0.0700 #> 5 10524 0 -256658. 184438. 355. 568. 5.87 303. 10.6 5.20 0.0800 #> 6 8860 0 121343. -164170. 354. 733. 3.97 182. 9.83 0 0 #> 7 6431 0 107903. -122968. 461. 578. 4.87 161. 7.66 5.90 0.0900 #> 8 11730 0 -333903. 431238. 561. 364. 6.73 387. 25.2 5.80 0.130 #> 9 808 0 -150163. 357180. 339. 564. 2.64 220. 15.3 6.40 0.100 #> 10 11054 0 -293663. 340981. 477. 396. 3.89 332. 26.4 4.60 0.0634 #> 11 2960 1 -49273. 181752. 512. 275. 0.920 319. 17.3 5.92 0.0900 #> 12 3065 1 126907. -198892. 322. 544. 0.700 203. 10.6 5.60 0.110 #> 13 5527 1 116751. -181089. 261. 537. 0.363 178. 7.43 0 0 #> 14 4035 1 -31777. 115940. 394. 440. 2.07 298. 11.2 6.01 0.0769 #> 15 4081 1 -5158. 90159. 301. 502. 
0.703 203. 14.6 6.11 0.0633 #> 16 3087 1 102151. -143976. 299. 425. -2.08 205. 13.4 3.88 0.110 #> 17 3495 1 -19586. 89803. 438. 419. 2.13 189. 15.2 6.19 0.0959 #> 18 4441 1 49405. -60502. 362. 582. 2.42 218. 7.84 5.64 0.0786 #> 19 301 1 -132516. 270845. 367. 196. -2.56 422. 26.3 6.70 0.0300 #> 20 3162 1 59905. -53634. 319. 626. 1.99 212. 4.50 4.51 0.0396 #> # ℹ 12 more variables: depth , landform , .part1 , .part2 , #> # .part3 , .part4 , .part5 , .part6 , .part7 , #> # .part8 , .part9 , .part10 esm_gam_t3 <- esm_gam( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"cwd\", \"tmin\", \"ppt_djf\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), partition = \".part\", thr = NULL ) #> #> Model has more coefficients than data used for training it. Try to reduce k"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"introduction","dir":"Articles","previous_headings":"","what":"Introduction","title":"flexsdm: Overview of Post-modeling functions","text":"Species distribution modeling (SDM) become standard tool multiple research areas, including ecology, conservation biology, biogeography, paleobiogeography, epidemiology. SDM area active theoretical methodological research flexsdm package provides users ability manipulate parameterize models variety ways meet unique research needs. flexibility enables users define complete partial modeling procedure specific modeling situations (e.g., number variables, number records, different algorithms ensemble methods, algorithms tuning, etc.). vignette, users learn post-modeling set functions flexsdm package. functions designed aim assisting flexsdm user predicting, evaluating, correcting SDMs. functions created model prediction, evaluation correction: Post-modeling functions sdm_predict() Spatial predictions individual ensemble models sdm_summarize() Merge model performance tables interp() Raster interpolation SDM predictions two time periods extra_eval() Measure model extrapolation extra_correct() Constraint suitability values given extrapolation value msdm_priori() Create spatial predictor variables reduce overprediction species distribution models msdm_posteriori() Methods correct overprediction species distribution models based occurrences suitability patterns","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"installation","dir":"Articles","previous_headings":"","what":"Installation","title":"flexsdm: Overview of Post-modeling functions","text":"Install flexsdm package. can install released version flexsdm github :","code":"# devtools::install_github('sjevelazco/flexsdm') library(flexsdm) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union library(terra) #> terra 1.7.55 #> #> Attaching package: 'terra' #> The following object is masked from 'package:knitr': #> #> spin"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"project-directory-setup","dir":"Articles","previous_headings":"","what":"Project directory setup","title":"flexsdm: Overview of Post-modeling functions","text":"Decide computer like store inputs outputs project (main directory). Use existing one use dir.create() create main directory. specify whether include folders projections, calibration areas, algorithms, ensembles, thresholds. 
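A hedged sketch of that directory setup follows. The argument names are assumptions inferred from the folder types listed above (projections, calibration areas, algorithms, ensembles, thresholds); check ?sdm_directory for the exact interface before using it.

# Sketch of a project-directory setup; argument names are assumptions
# based on the folder types mentioned above -- see ?sdm_directory.
main_dir <- file.path(tempdir(), "flexsdm_results")  # replace with a real project path
dir.create(main_dir, showWarnings = FALSE)
project_dirs <- sdm_directory(
  main_dir = main_dir,
  projections = NULL,               # no projection scenarios in this example
  calibration_area = TRUE,          # keep a folder for calibration areas
  algorithm = c("glm", "raf", "gbm"),
  ensemble = "meansup",
  threshold = TRUE
)
project_dirs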
details see Vignette 01_pre_modeling","code":""},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"species-occurrence-presenceabsense-and-environmental-data","dir":"Articles","previous_headings":"","what":"Species occurrence, presence/absense and environmental data","title":"flexsdm: Overview of Post-modeling functions","text":"tutorial, using “spp” example dataset includes pr_ab (presence = 1, absence = 0), location (x, y) data 3 plant species found California raster environmental data. can load data local R environment using code : want replace spp dataset data, make sure contains coordinates, species presence = 1 / absence = 0 raster environmental data. First, prepare occurrences, environmental conditions partitions Next, fit different models","code":"data(\"spp\") somevar <- system.file(\"external/somevar.tif\", package = \"flexsdm\") somevar <- terra::rast(somevar) # Select only one species some_sp <- spp %>% filter(species == \"sp3\") # Extract the environmental condition from the rsater for sp3 some_sp <- sdm_extract( data = some_sp, x = \"x\", y = \"y\", env_layer = somevar ) #> 4 rows were excluded from database because NAs were found # Make a partition defining the method, folds and replicates some_sp <- part_random( data = some_sp, pr_ab = \"pr_ab\", method = c(method = \"rep_kfold\", folds = 3, replicates = 5) ) # Fit and validate a [generalized linear model](https://sjevelazco.github.io/flexsdm/reference/fit_glm.html) mglm <- fit_glm( data = some_sp, response = \"pr_ab\", predictors = c(\"CFP_1\", \"CFP_2\", \"CFP_3\", \"CFP_4\"), partition = \".part\", poly = 2 ) #> Formula used for model fitting: #> pr_ab ~ CFP_1 + CFP_2 + CFP_3 + CFP_4 + I(CFP_1^2) + I(CFP_2^2) + I(CFP_3^2) + I(CFP_4^2) #> Replica number: 1/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 2/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 3/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 4/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 5/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 # Fit and validate a [random forest model](https://sjevelazco.github.io/flexsdm/reference/fit_raf.html) mraf <- fit_raf( data = some_sp, response = \"pr_ab\", predictors = c(\"CFP_1\", \"CFP_2\", \"CFP_3\", \"CFP_4\"), partition = \".part\", ) #> Formula used for model fitting: #> pr_ab ~ CFP_1 + CFP_2 + CFP_3 + CFP_4 #> Replica number: 1/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 2/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 3/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 4/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 5/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 # Fit and validate a [general boosted regression model](https://sjevelazco.github.io/flexsdm/reference/fit_gbm.html) mgbm <- fit_gbm( data = some_sp, response = \"pr_ab\", predictors = c(\"CFP_1\", \"CFP_2\", \"CFP_3\", \"CFP_4\"), partition = \".part\" ) #> Formula used for model fitting: #> pr_ab ~ CFP_1 + CFP_2 + CFP_3 + CFP_4 #> Replica number: 1/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 2/5 #> Partition number: 1/3 #> 
Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 3/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 4/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3 #> Replica number: 5/5 #> Partition number: 1/3 #> Partition number: 2/3 #> Partition number: 3/3"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"fit-and-ensemble-the-models-above","dir":"Articles","previous_headings":"Species occurrence, presence/absense and environmental data","what":"1. Fit and ensemble the models above","title":"flexsdm: Overview of Post-modeling functions","text":"can also fit model using Ensembles Small Models approach. example, fit without threshold specification k-fold cross-validation. Finally, can predict different kinds models data (some_sp). sdm_predict can used predicting one models fitted fit_ tune_ functions. output list SpatRaster continuous /binary predictions.","code":"# Fit and ensemble the models. To choose the arguments that best fit your own data, see all options available in [fit_ensemble](https://sjevelazco.github.io/flexsdm/reference/fit_ensemble.html) mensemble <- fit_ensemble( models = list(mglm, mraf, mgbm), ens_method = \"meansup\", thr = NULL, thr_model = \"max_sens_spec\", metric = \"TSS\" ) #> | | | 0% | |======================================================================| 100% msmall <- esm_gam( data = some_sp, response = \"pr_ab\", predictors = c(\"CFP_1\", \"CFP_2\", \"CFP_3\", \"CFP_4\"), partition = \".part\", thr = NULL ) #> | | | 0% | |============ | 17% | |======================= | 33% | |=================================== | 50% | |=============================================== | 67% | |========================================================== | 83% | |======================================================================| 100% # Predict using a single model, which is an mglm model in this example, # and a threshold type for binary predictions ind_p <- sdm_predict( models = mglm, pred = somevar, thr = \"max_fpb\", con_thr = FALSE, predict_area = NULL ) #> Predicting individual models # Inspect the object. It's a SpatRaster with 2 layers: glm, max_fpb # These are the continuous and binary prediction from the model ind_p #> $glm #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> varnames : somevar #> somevar #> names : glm, max_fpb #> min values : 2.220446e-16, TRUE #> max values : 1.000000e+00, TRUE # Plot to see this layers ind_p_rst <- terra::rast(ind_p) plot(ind_p_rst) # Predict a list of more than one model, specifying a threshold type list_p <- sdm_predict( models = list(mglm, mraf, mgbm), pred = somevar, thr = \"max_fpb\", con_thr = FALSE, predict_area = NULL ) #> Predicting list of individual models # Inspect the object. It's a list with 3 SpatRaster, one for each model, # each of which contains 2 layers, for the continuous and thresholded binary predictions. list_p #> $glm #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. 
: +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> varnames : somevar #> somevar #> names : glm, max_fpb #> min values : 2.220446e-16, TRUE #> max values : 1.000000e+00, TRUE #> #> $raf #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> varnames : somevar #> somevar #> names : raf, max_fpb #> min values : 0, FALSE #> max values : 1, TRUE #> #> $gbm #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> varnames : somevar #> somevar #> names : gbm, max_fpb #> min values : 0.0002949323, FALSE #> max values : 0.9986537352, TRUE # Plot to see this layers list_p_rst <- terra::rast(list_p) plot(list_p_rst) # Predict an ensemble model. This is only possible using one fit_ensemble object. It's not possible to include e.g., list(fit_ensemble1, fit_ensemble2) in the model argument. ensemble_p <- sdm_predict( models = mensemble, pred = somevar, thr = \"max_fpb\", con_thr = FALSE, predict_area = NULL ) #> Predicting ensembles # Inspect the object. It's a SpatRaster with 2 layers, mensemble and max_fpb # These are the continuous and binary prediction from the ensemble model ensemble_p #> $meansup #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> names : meansup, max_fpb #> min values : 0.0001474662, FALSE #> max values : 0.9972242977, TRUE # Plot to see this layers ensemble_p_rst <- terra::rast(ensemble_p) plot(ensemble_p_rst) # Predict an ensembles of small models. small_p <- sdm_predict( models = msmall, pred = somevar, thr = \"max_fpb\", con_thr = FALSE, predict_area = NULL ) #> Predicting ensemble of small models # Inspect the object It's a SpatRaster with 2 layers, msmall and max_fpb # These are the continuous and binary prediction from the ESM model small_p #> $esm_gam #> class : SpatRaster #> dimensions : 558, 394, 2 (nrow, ncol, nlyr) #> resolution : 1890, 1890 (x, y) #> extent : -373685.8, 370974.2, -604813.3, 449806.7 (xmin, xmax, ymin, ymax) #> coord. ref. : +proj=aea +lat_0=0 +lon_0=-120 +lat_1=34 +lat_2=40.5 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs #> source(s) : memory #> names : esm_gam, max_fpb #> min values : 1.961046e-05, FALSE #> max values : 8.644150e-01, TRUE # Plot to see this layers small_p_rst <- terra::rast(small_p) plot(small_p_rst)"},{"path":"https://sjevelazco.github.io/flexsdm/articles/v03_post_modeling.html","id":"merge-model-performance-tables","dir":"Articles","previous_headings":"Species occurrence, presence/absense and environmental data","what":"2. Merge model performance tables","title":"flexsdm: Overview of Post-modeling functions","text":"function combines model performance tables input models. 
function requires list one models fitted fit_ tune_ functions, fit_ensemble output, esm_ family function output. Build models use performance table merge Finally, merge three sdm performance tables.","code":"# Load abies data data(abies) abies #> # A tibble: 1,400 × 13 #> id pr_ab x y aet cwd tmin ppt_djf ppt_jja pH awc #> #> 1 715 0 -95417. 314240. 323. 546. 1.24 62.7 17.8 5.77 0.108 #> 2 5680 0 98987. -159415. 448. 815. 9.43 130. 6.43 5.60 0.160 #> 3 7907 0 121474. -99463. 182. 271. -4.95 151. 11.2 0 0 #> 4 1850 0 -39976. -17456. 372. 946. 8.78 116. 2.70 6.41 0.0972 #> 5 1702 0 111372. -91404. 209. 399. -4.03 165. 9.27 0 0 #> 6 10036 0 -255715. 392229. 308. 535. 4.66 166. 16.5 5.70 0.0777 #> 7 12384 0 -311765. 380213. 568. 352. 4.38 480. 41.2 5.80 0.110 #> 8 6513 0 111360. -120229. 327. 633. 4.93 163. 8.91 1.18 0.0116 #> 9 9884 0 -284326. 442136. 377. 446. 3.99 296. 16.8 5.96 0.0900 #> 10 8651 0 137640. -110538. 215. 265. -4.62 180. 9.57 0 0 #> # ℹ 1,390 more rows #> # ℹ 2 more variables: depth , landform # We will partition the data with the k-fold method abies2 <- part_random( data = abies, pr_ab = \"pr_ab\", method = c(method = \"kfold\", folds = 5) ) # Build a generalized additive model, and a generalized linear model using fit_ family functions gam_t1 <- fit_gam( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), predictors_f = c(\"landform\"), partition = \".part\", thr = c(\"max_sens_spec\", \"equal_sens_spec\", \"max_sorensen\") ) #> Formula used for model fitting: #> pr_ab ~ s(aet, k = -1) + s(ppt_jja, k = -1) + s(pH, k = -1) + s(awc, k = -1) + s(depth, k = -1) + landform #> Replica number: 1/1 #> Partition number: 1/5 #> Partition number: 2/5 #> Partition number: 3/5 #> Partition number: 4/5 #> Partition number: 5/5 glm_t1 <- fit_glm( data = abies2, response = \"pr_ab\", predictors = c(\"aet\", \"ppt_jja\", \"pH\", \"awc\", \"depth\"), predictors_f = c(\"landform\"), partition = \".part\", thr = c(\"max_sens_spec\", \"equal_sens_spec\", \"max_sorensen\"), poly = 0, inter_order = 0 ) #> Formula used for model fitting: #> pr_ab ~ aet + ppt_jja + pH + awc + depth + landform #> Replica number: 1/1 #> Partition number: 1/5 #> Partition number: 2/5 #> Partition number: 3/5 #> Partition number: 4/5 #> Partition number: 5/5 # Build a tuned model using tune_ family functions # Prepare the grid object to use in grid argument tune_grid <- expand.grid(mtry = seq(1, 7, 1)) # Build a tuned random forest model rf_t1 <- tune_raf( data = abies2, response = \"pr_ab\", predictors = c( \"aet\", \"cwd\", \"tmin\", \"ppt_djf\", \"ppt_jja\", \"pH\", \"awc\", \"depth\" ), predictors_f = c(\"landform\"), partition = \".part\", grid = tune_grid, thr = c(\"max_sens_spec\", \"equal_sens_spec\", \"max_sorensen\"), metric = \"TSS\", ) #> Formula used for model fitting: #> pr_ab ~ aet + cwd + tmin + ppt_djf + ppt_jja + pH + awc + depth + landform #> Tuning model... #> Replica number: 1/1 #> Formula used for model fitting: #> pr_ab ~ aet + cwd + tmin + ppt_djf + ppt_jja + pH + awc + depth + landform #> Replica number: 1/1 #> Partition number: 1/5 #> Partition number: 2/5 #> Partition number: 3/5 #> Partition number: 4/5 #> Partition number: 5/5 rf_t1$performance #> # A tibble: 1 × 26 #> mtry model threshold thr_value n_presences n_absences TPR_mean TPR_sd #>
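The sdm_summarize() call that actually merges the three performance tables is cut off above. As a minimal sketch of that final step, assuming the gam_t1, glm_t1, and rf_t1 objects built in this section (the argument name is an assumption; see ?sdm_summarize):

# Merge the performance tables of the three models built above into one tibble
merged_perf <- sdm_summarize(models = list(gam_t1, glm_t1, rf_t1))
merged_perf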