Skip to content

Commit c4e9637

Browse files
committedFeb 4, 2021
tools
1 parent 4aa903f commit c4e9637

File tree

3 files changed

+541
-0
lines changed

3 files changed

+541
-0
lines changed
 

‎GDM_Unifrac/gdm_helper_tutorial.r

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# 0. Load up dependencies, make fake data so tutorial works
library(gdm)
set.seed(12345)
pc <- runif(26, 0, 1)
unif_dm <- as.matrix(dist(pc))
rownames(unif_dm) <- colnames(unif_dm) <- letters
metadata <- data.frame(
  Elevation = pc + runif(26, 0, 1) * 0.5,
  NDVI = pc + runif(26, 0, 1) * 0.7,
  Rainfall = pc - runif(26, 0, 1) * 0.4,
  latitude = 1:26,
  longitude = 1:26
)
rownames(metadata) <- letters

# 1. Let's say you have your UniFrac distance matrix in R, as an n x n numeric
# matrix with row names and column names included (sample IDs). It's called
# 'unif_dm'. Let's say you also have some metadata, as a data frame with n
# rows, and it includes row names, which are the same (same order!!!) as the
# row names of your UniFrac distance matrix. It's called 'metadata'.

# 2. test some assumptions:
# do our objects match up? Every condition below must be TRUE; stopifnot()
# halts the script at the first one that is not, so mismatched inputs can't
# silently flow into the site-pair table.
stopifnot(
  nrow(unif_dm) == ncol(unif_dm),
  nrow(unif_dm) == nrow(metadata),
  all(rownames(unif_dm) == colnames(unif_dm)),
  all(rownames(unif_dm) == rownames(metadata))
)
# gdm assumes 0-1 range of distances, but UniFrac doesn't always provide that.
# usually max is like 1.2 or something, not a huge re-scale.
# fix is to re-scale:
unif_dm <- unif_dm / max(unif_dm)

# 3. make site-pair table
# now we'll need the helper scripts.
source("jld_gdm_helpers.r")
# and we'll make a list of the predictors we want to use.
# note that location data are a list, with labels Lat and Lon. Must have those
# names if location is included (but it doesn't have to be).
# NOTE - distance matrix inputs to GDM just go in this list.
predictor_list <- list(
  Elevation = metadata$Elevation,
  NDVI = metadata$NDVI,
  Rainfall = metadata$Rainfall,
  Location = list(Lat = metadata$latitude, Lon = metadata$longitude) # MUST be named Lat and Lon!!
)

# 4. make site-pair table from predictor_list
spt <- site_pair_from_list(responseMat = unif_dm, predList = predictor_list)

# 5. run GDM
# make geo=FALSE if you didn't include location in predictor_list
# (spelled out as TRUE/FALSE because T and F are ordinary, reassignable variables)
fit1 <- gdm(spt, geo = TRUE)
# you can also use gdm.varImp to do backward elimination, TAKES FOREVER
# variables_importance <- gdm.varImp(spt, geo=TRUE, fullModelOnly=FALSE, parallel=TRUE, cores=4)


# 6. plot GDM
plot_gdm_jld(fit1, pred_colors = "auto")
59+

‎GDM_Unifrac/jld_gdm_helpers.r

+331
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
## GDM helper functions
2+
# make gdm syntax a little less arcane
3+
4+
## gdmize() prepends a sample-ID column to a matrix or data.frame.
# gdm wants the site identifier as a regular column; this helper exists because
# the IDs usually live in the row names instead.
#   x               matrix or data.frame to convert (passed through as.data.frame)
#   sampleids       vector of sample IDs, one per row of x
#   sampleids_name  name given to the new first column (default "SampleID")
# Returns a data.frame whose first column holds sampleids, followed by the
# columns of x.
gdmize <- function(x, sampleids, sampleids_name="SampleID"){
  with_ids <- data.frame(sampleids, as.data.frame(x))
  names(with_ids)[1] <- sampleids_name
  with_ids
}
11+
12+
13+
## plot_gdm_jld() makes a friendly two-panel plot of a fitted gdm model.
# Top panel: observed vs. predicted community dissimilarity, with a 1:1 line.
# Bottom panel: one spline curve per predictor. Each predictor's knots are
# rescaled to 0-1 so all curves share one x axis, and coefficients are
# multiplied by 100 so the y axis reads as a percentage.
#
# Arguments:
#   x               a gdm model, result from function gdm(). Fields read here:
#                   predicted, observed, predictors, splines, coefficients, knots.
#   points_color    color of the points in the top panel.
#   pred_colors     "auto" (colors from rainbow()) or a vector of colors, which
#                   is recycled to the number of predictors actually drawn.
#   line_back_col   color of the wide solid 1:1 line.
#   line_front_col  color of the narrow dashed 1:1 line drawn on top of it.
#   PSAMPLE         number of points used for the 1:1 line and for each curve.
#   top_blank       logical; if TRUE the top panel is drawn empty (no points,
#                   no 1:1 line).
#   coef_threshold  predictors whose summed coefficients (in percent) fall
#                   below this value are left out of the bottom panel.
#
# Side effects: sets options(warn.FPU=FALSE) and par(mfrow, mai) and does NOT
# restore them afterwards.
plot_gdm_jld <- function(x, points_color="darkslategray4", pred_colors="auto",
  line_back_col="black", line_front_col="white", PSAMPLE=200, top_blank=FALSE,
  coef_threshold=0){

  # the .C() call further down needs gdm's compiled code, so stop right away
  # if the package can't be loaded (require() alone only returns FALSE)
  if(!require(gdm)){
    stop("plot_gdm_jld() needs the 'gdm' package.", call.=FALSE)
  }
  # setting from original plot.gdm, not sure what it does so I'm leaving it
  options(warn.FPU = FALSE)

  ## define plot area type (1 col, 2 rows)
  par(mfrow=c(2,1), mai=c(1, 1, 0.1, 0.1))

  ## First plot - observed vs predicted compositional dissimilarity

  # make plot - blank if top_blank is TRUE
  ptype <- if(top_blank) "n" else "p"

  plot(x$predicted, x$observed, xlab = "Predicted community dissimilarity",
    ylab = "Observed community dissimilarity",
    ylim = c(0, 1),
    pch = 20,
    cex = 0.25,
    col = points_color,
    type = ptype
  )

  # add model fit (1:1 line over the range of predicted values)
  if(!top_blank){
    overlayX <- overlayY <- seq(from = min(x$predicted), to = max(x$predicted), length.out = PSAMPLE)
    lines(overlayX, overlayY, lwd = 6, col=line_back_col)
    lines(overlayX, overlayY, lwd = 2, col=line_front_col, lty=2)
  }

  ## Organize spline data

  # figure out how many predictors we need to plot
  n_preds <- length(x$predictors)

  # one row per spline: which predictor it belongs to, its coefficient, its knot
  spline_df <- data.frame(
    pred_ind=rep(seq_along(x$predictors), x$splines),
    pred_name=rep(x$predictors, x$splines),
    coefficient=x$coefficients,
    knot=x$knots
  )

  # standardize all knots to 0-1 range
  # this allows all splines to be plotted together
  for(p in x$predictors){
    knots_i <- spline_df$knot[spline_df$pred_name == p]
    knots_i <- (knots_i - min(knots_i)) / (max(knots_i) - min(knots_i))
    spline_df$knot[spline_df$pred_name == p] <- knots_i
  }

  # change community dissimilarity explained to percent
  spline_df$coefficient <- spline_df$coefficient * 100

  # make a list of predictor plot data, one entry per predictor
  pred_plot_list <- vector("list", n_preds)
  for(i in seq_len(n_preds)){
    rows_i <- spline_df$pred_ind == i
    # output buffer handed to the C routine; comes back filled as $pdata
    preddata_i <- rep(0, times = PSAMPLE)
    # C function from gdm that computes the curve for one predictor.
    # Argument order was reverse-engineered: buffer, buffer length,
    # coefficients, knots, number of splines.
    pred_plot_list[[i]] <- .C("GetPredictorPlotData",
      pdata = as.double(preddata_i),
      as.integer(PSAMPLE),
      as.double(spline_df$coefficient[rows_i]),
      as.double(spline_df$knot[rows_i]),
      as.integer( sum(rows_i) ),
      PACKAGE = "gdm"
    )
  }
  # named lists are nice, add names
  names(pred_plot_list) <- x$predictors

  # drop variables whose summed coefficients are below threshold
  maxcoef <- vapply(
    X=x$predictors,
    FUN=function(p){ sum(spline_df$coefficient[spline_df$pred_name == p]) },
    FUN.VALUE=numeric(1),
    USE.NAMES=FALSE
  )
  goodpreds <- x$predictors[maxcoef >= coef_threshold]
  pred_plot_list <- pred_plot_list[names(pred_plot_list) %in% goodpreds]
  n_preds <- length(pred_plot_list)

  ## Second plot - plot splines
  # make empty plot frame (y range covers ALL predictors, even dropped ones)
  plot(x=NULL, y=NULL, type="n",
    xlim=c(0, 1),
    ylim=c(0, max(maxcoef)),
    xlab="Variable range",
    ylab="% cum. dissimilarity explained"
  )

  # make colors
  if(pred_colors[1] == "auto"){
    # if auto (default), make some colors
    pred_colors <- rainbow(n_preds, start=0, end=0.60)
  }else{
    # recycle/trim user colors to exactly one per plotted predictor
    # (rep_len can't loop forever on an empty vector, unlike repeated doubling)
    pred_colors <- rep_len(pred_colors, n_preds)
  }

  # plot 'em - seq_along() so that nothing is drawn (instead of an error)
  # when the threshold removed every predictor
  for(i in seq_along(pred_plot_list)){
    points(
      x=seq(from=0, to=1, length.out=PSAMPLE),
      y=pred_plot_list[[i]]$pdata,
      type="l",
      col=pred_colors[i],
      lwd=6
    )
  }

  # legend: height reached by each curve, rounded to 0.1, plus predictor name
  if(n_preds > 0){
    maxvals <- vapply(
      X=pred_plot_list,
      FUN=function(p){ round(max(p$pdata), 1) },
      FUN.VALUE=numeric(1)
    )
    legend_labels <- paste0(sprintf("%04.1f", maxvals), "% - ", names(pred_plot_list))

    # add legend
    legend(x=0, y=max(spline_df$coefficient), legend=legend_labels, col=pred_colors,
      lty=1, lwd=6, bty = "n")
  }
}
145+
146+
## resetPar() returns the default graphical parameters of a fresh device.
# It briefly opens a new graphics device, reads its par() settings, closes it
# again and re-selects whichever device was active before the call.
# The settings are returned invisibly; apply them with:
#   par(resetPar())
# (Previously the captured settings were thrown away, so nothing could be
# reset with the result.)
resetPar <- function() {
  prev_dev <- dev.cur()
  dev.new()
  scratch_dev <- dev.cur()
  op <- par(no.readonly = TRUE)
  dev.off(scratch_dev)
  # dev.off() activates the "next" device, which is not necessarily the one
  # the caller was using. Device 1 is the null device (nothing was open).
  if (prev_dev > 1 && prev_dev %in% dev.list()) {
    dev.set(prev_dev)
  }
  invisible(op)
}
152+
153+
## visualize pairwise relationships among variables in a nice and minimalist way
# r's pairs() isn't good enough for me, and chart.Correlation from performanceAnalytics is too messy
# Draws an n x n grid of panels (n = number of columns of df):
#   below the diagonal - scatterplot of column j (x) against column i (y)
#   on the diagonal    - the variable name, rotated 45 degrees
#   above the diagonal - the correlation coefficient, printed in red when its
#                        absolute value exceeds cor_red_lim
# Arguments:
#   df           a data frame where each column is a numeric variable to plot
#   label_cex    text size of the variable names
#   point_cex    point size in the scatterplots
#   cor_cex      text size of the correlation coefficients
#   cor_red_lim  absolute correlation above which the coefficient is red
#   mthd         passed to cor() as 'method'
# The par() settings changed here (mfrow, oma, mar) are put back on exit.
# Returns NULL invisibly.
plot_pairwise_corrs <- function(df, label_cex=1, point_cex=1, cor_cex=2, cor_red_lim=0.70, mthd="pearson"){
  n <- ncol(df)
  if(is.null(n) || n < 1){
    stop("df must have at least one column.", call.=FALSE)
  }

  # par() returns the previous values of whatever it sets, so they can be
  # restored on the same device no matter how the function exits
  old_par <- par(mfrow = c(n,n), oma = c(5,4,0,0), mar = c(0,0,0,0) )
  on.exit(par(old_par), add=TRUE)

  # empty panel with a fixed -1..1 coordinate system, used for text panels
  blank_panel <- function(){
    plot(1, type="n", xlim=c(-1, 1), ylim=c(-1, 1), axes = FALSE, xlab="", ylab="", pch=20)
  }

  # panels are filled row by row: i is the grid row, j the grid column
  for(i in seq_len(n)){for(j in seq_len(n)){
    if(i > j){
      # lower tri - do scatterplot
      plot(x=df[,j], y=df[,i], axes = FALSE, xlab="", ylab="", pch=20, cex=point_cex)
      box()
    }else if(i == j){
      # diag - write variable name
      blank_panel()
      text(x=0, y=0, labels=colnames(df)[i], cex=label_cex, srt=-45)
      box()
    }else{
      # upper tri - nicely display correlation coefficient (r)
      cor_ij <- cor(df[,j], df[,i], use="complete.obs", method=mthd)
      # cor() gives NA for a constant column; show it in black rather than
      # letting the comparison below fail on NA
      if(!is.na(cor_ij) && abs(cor_ij) > cor_red_lim){
        col_ij <- "red"
      }else{
        col_ij <- "black"
      }
      cor_ij <- sprintf("%.2f", round(cor_ij,2))

      blank_panel()
      text(x=0, y=0, labels=cor_ij, cex=cor_cex, col=col_ij)
      box()
    }
  }}
  invisible(NULL)
}
191+
192+
193+
194+
195+
## site_pair_from_list generates GDM's sitepair table from a list of objects
# Arguments:
#   responseMat  square distance matrix with sample IDs as row names (the
#                response; passed to formatsitepair() with bioFormat=3)
#   predList     named list of predictors. Valid types in the list are:
#                  "numeric"/"integer" vector - one value per site
#                  "matrix"                   - site x site distance predictor
#                  "list"                     - location, with elements Lat and Lon
#   preds2use    optional character vector of names in predList; only those
#                predictors are used, in the order given here.
# a strength of this approach is that one can use only a subset of all the
# predictors with the preds2use argument.
# Returns the site-pair table from formatsitepair(), with NA rows removed.
# Needs gdm's formatsitepair() and gdmize() (from this file) to be available.
site_pair_from_list <- function(responseMat, predList, preds2use=NULL){
  # if preds2use is specified, simplify predList accordingly
  if(!is.null(preds2use)){
    # drop unused items from predList
    predList <- predList[names(predList) %in% preds2use]
    # get predList into the same order as preds2use (only matters for metadata column vectors...)
    predList <- predList[order(match(names(predList), preds2use))]
  }

  # classify each predictor. class() can return more than one string
  # (a matrix is c("matrix", "array") on R >= 4.0), so test membership
  # instead of comparing with ==, which would not match such objects.
  has_class <- function(obj, cls){ any(class(obj) %in% cls) }
  isVector <- vapply(predList, has_class, logical(1), cls=c("integer", "numeric"))
  isMatrix <- vapply(predList, has_class, logical(1), cls="matrix")
  isGeo <- vapply(predList, has_class, logical(1), cls="list")

  sampleIDs <- rownames(responseMat)
  nSites <- nrow(responseMat)

  # make table - if there are no vector predictors (ONLY MATRIX and/or
  # location), make data with fake variable instead.
  if(any(isVector)){
    predDF <- data.frame(
      SampleID=sampleIDs, # siteColumn
      simplify2array(predList[isVector]) # data columns
    )
  }else{
    predDF <- data.frame(
      SampleID=sampleIDs, # siteColumn
      FakeData=rep(0, nSites) # fake data column
    )
  }

  # check if geo is included (one and only one time!)
  # if so, add geo information to predDF
  # if not, add fake geo information (because formatsitepair() needs the columns)
  if(sum(isGeo) == 1){
    geo <- predList[[which(isGeo)]]
    geoLat <- geo$Lat
    geoLon <- geo$Lon
    # a missing element would silently drop the column from predDF below
    if(is.null(geoLat) || is.null(geoLon)){
      stop("The location entry of predList must contain elements named 'Lat' and 'Lon'.", call.=FALSE)
    }
  }else{
    if(sum(isGeo) > 1){
      warning("More than one location list found in predList; ignoring all of them and using placeholder coordinates.", call.=FALSE)
    }
    geoLat <- rep(1, nSites)
    geoLon <- rep(1, nSites)
  }
  # add real or fake lat/longs to predDF
  predDF <- data.frame(
    predDF,
    Lat=geoLat,
    Lon=geoLon
  )

  # format distance matrices (each gets a SampleID column, like the response)
  if(any(isMatrix)){
    matrixList <- lapply(X=predList[isMatrix], FUN=gdmize, sampleids=sampleIDs)
  }else{
    matrixList <- NULL
  }

  # make sitepair table
  spt <- formatsitepair(
    bioData=gdmize(responseMat, sampleIDs), bioFormat=3,
    predData=predDF,
    XColumn="Lon", YColumn="Lat",
    distPreds=matrixList,
    siteColumn="SampleID"
  )

  # remove NAs
  spt <- na.omit(spt)

  return(spt)
}
263+
264+
## fwd_adonis
# forward model selection for adonis
# At each step, every variable not yet in the model is tried as the next term,
# and the one giving the highest total R2 (sum over all model terms) is added.
# This continues until every column of rhs is in the model.
# Arguments:
#   lhs     a dist object, maybe a community data matrix would work, not tested.
#   rhs     all RHS vars must be named columns of one object, which is passed
#           to adonis() as 'data'. no interaction terms are considered.
#   ncores  number of cores handed to parallel::mclapply().
# Returns a list:
#   formula_all  formula with all variables, in the order they were added
#   formula_sig  formula of the LAST step at which every term had P < 0.05,
#                or the string "No significant models."
#   Pvals, R2s   matrices (rows = variables, columns = steps) with the
#                P-value / R2 of each term in the model chosen at that step;
#                NA where the variable was not yet in the model.
# adonis() must be available when this is called (it is not loaded here).
fwd_adonis <- function(lhs, rhs, ncores=4){
  lhs_name <- deparse(substitute(lhs))
  candidates <- colnames(rhs)
  n_vars <- length(candidates)
  if(n_vars < 1){
    stop("rhs must have at least one named column.", call.=FALSE)
  }
  vars_in_model <- NULL
  Ps <- R2s <- matrix(data=NA, nrow=n_vars, ncol=n_vars, dimnames=list(candidates, NULL))

  # this function takes names of variables and returns a formula.
  # the default y="lhs" refers to this function's own lhs argument, which the
  # formula finds through its environment.
  makefrmla <- function(v, y="lhs"){ as.formula(paste(y, "~", paste(v, collapse=" + "))) }

  # start progress bar (one tick per candidate model fitted)
  pb <- txtProgressBar(min=0, max=sum(seq_len(n_vars)), style=3)
  on.exit(close(pb), add=TRUE)
  n_completed <- 0

  # do model selection
  for(j in seq_len(n_vars)){
    newvars <- candidates[! candidates %in% vars_in_model]
    # get list of aov tables for each potential new model
    aovs_newvars <- parallel::mclapply(
      X=newvars,
      FUN=function(x){ as.data.frame(adonis(makefrmla(c(vars_in_model, x)), data=rhs)$aov.tab) },
      mc.cores=ncores
    )
    # mclapply hands back "try-error" objects for workers that failed;
    # surface that instead of failing later with an obscure message
    failed <- vapply(aovs_newvars, inherits, logical(1), what="try-error")
    if(any(failed)){
      stop("adonis() failed when adding: ", paste(newvars[failed], collapse=", "), call.=FALSE)
    }
    # calculate total R2 for each potential model
    total_r2s <- vapply(
      X=aovs_newvars,
      FUN=function(x){ sum(x$R2[! rownames(x) %in% c("Residuals", "Total")]) },
      FUN.VALUE=numeric(1)
    )
    # choose which term to add based on total R2 of model
    toadd <- which.max(total_r2s)
    best <- aovs_newvars[[toadd]]
    # add term to vars_in_model, and put Pvals and R2s in output matrices
    vars_in_model <- c(vars_in_model, newvars[toadd])
    in_model <- rownames(R2s) %in% rownames(best)
    hit <- match(rownames(R2s)[in_model], rownames(best))
    R2s[in_model, j] <- round(best$R2[hit], 3)
    Ps[in_model, j] <- round(best[["Pr(>F)"]][hit], 3)
    # update progress bar
    n_completed <- n_completed + length(newvars)
    setTxtProgressBar(pb, n_completed)
  }

  # which is the last model that had only significant variables?
  # (max() of the matching step numbers; which.max() would instead give the
  # position inside that vector, i.e. how many such steps there were)
  allsig <- apply(X=Ps, MARGIN=2, FUN=function(x){ all(x[!is.na(x)] < 0.05) })
  sig_steps <- which(allsig)
  if(length(sig_steps) == 0){
    formula_sig <- "No significant models."
  }else{
    formula_sig <- makefrmla(v=vars_in_model[seq_len(max(sig_steps))], y=lhs_name)
  }
  formula_all <- makefrmla(v=vars_in_model, y=lhs_name)
  # return relevant objects (first two are still reachable by position)
  return(list(
    formula_all=formula_all,
    formula_sig=formula_sig,
    Pvals=Ps,
    R2s=R2s
  ))

}
330+
331+

0 commit comments

Comments
 (0)