|
| 1 | +## GDM helper functions |
| 2 | + # make gdm syntax a little less arcane |
| 3 | + |
## gdmize() takes a matrix or data.frame and prepends a sample-ID column, which
# is required by gdm. I usually have my rows named instead, hence the function.
#   x               matrix or data.frame, one row per sample
#   sampleids       vector of sample IDs, one per row of x (default: the row
#                   names of x)
#   sampleids_name  name given to the new first column
# Returns a data.frame whose first column holds the sample IDs, followed by
# the columns of x.
gdmize <- function(x, sampleids = rownames(x), sampleids_name = "SampleID") {
  x_df <- as.data.frame(x)

  # data.frame() silently drops a NULL argument; without this check the rename
  # below would overwrite the name of the first *data* column instead.
  if (is.null(sampleids)) {
    stop("sampleids is NULL: supply sample IDs or give x row names.",
         call. = FALSE)
  }
  # data.frame() would otherwise recycle IDs whose length divides nrow(x).
  if (length(sampleids) != nrow(x_df)) {
    stop("sampleids has length ", length(sampleids), " but x has ",
         nrow(x_df), " rows.", call. = FALSE)
  }

  out <- data.frame(sampleids, x_df)
  colnames(out)[1] <- sampleids_name
  out
}
| 11 | + |
| 12 | + |
## plot_gdm_jld() makes a friendly two-panel gdm plot.
# Top panel: observed vs. predicted community dissimilarity, with a dashed
# 1:1 line drawn over a thicker backing line. Bottom panel: one spline curve
# per predictor, all on a common 0-1 x axis, with a legend giving each curve's
# maximum (in percent).
#   x               a gdm model, result from function gdm()
#   points_color    color of the points in the top panel
#   pred_colors     "auto" (rainbow colors) or a vector of colors; recycled
#                   to the number of predictors drawn
#   line_back_col   color of the thick backing 1:1 line
#   line_front_col  color of the dashed 1:1 line drawn on top
#   PSAMPLE         number of points used for the 1:1 line and each spline
#   top_blank       if TRUE the top panel is drawn empty (no points, no line)
#   coef_threshold  predictors whose summed coefficients (in percent) are
#                   below this value are not drawn
# Note: par(mfrow, mai) is changed and deliberately left that way after the
# call, exactly as before.
plot_gdm_jld <- function(x, points_color = "darkslategray4",
                         pred_colors = "auto", line_back_col = "black",
                         line_front_col = "white", PSAMPLE = 200,
                         top_blank = FALSE, coef_threshold = 0) {

  # require() only returns FALSE when the package is missing; fail loudly
  # here rather than later inside the .C() call.
  if (!require(gdm)) {
    stop("plot_gdm_jld() needs the 'gdm' package.", call. = FALSE)
  }
  # setting from original plot.gdm, not sure what it does so I'm leaving it,
  # but the previous value is put back when the function exits
  old_opts <- options(warn.FPU = FALSE)
  on.exit(options(old_opts), add = TRUE)

  ## define plot area type (1 col, 2 rows)
  par(mfrow = c(2, 1), mai = c(1, 1, 0.1, 0.1))

  ## First plot - observed vs predicted compositional dissimilarity

  # make plot - blank if top_blank is TRUE
  draw_top <- !isTRUE(as.logical(top_blank))
  ptype <- if (draw_top) "p" else "n"

  plot(x$predicted, x$observed,
    xlab = "Predicted community dissimilarity",
    ylab = "Observed community dissimilarity",
    ylim = c(0, 1),
    pch = 20,
    cex = 0.25,
    col = points_color,
    type = ptype
  )

  # add model fit (1:1 line across the range of predicted values)
  if (draw_top) {
    overlayX <- overlayY <- seq(
      from = min(x$predicted), to = max(x$predicted), length.out = PSAMPLE
    )
    lines(overlayX, overlayY, lwd = 6, col = line_back_col)
    lines(overlayX, overlayY, lwd = 2, col = line_front_col, lty = 2)
  }

  ## Organize spline data

  # figure out how many predictors we need to plot
  n_preds <- length(x$predictors)

  # one row per spline: which predictor it belongs to, its coefficient and
  # its knot (this makes the code for extracting plot data far more legible)
  spline_df <- data.frame(
    pred_ind = rep(seq_len(n_preds), x$splines),
    pred_name = rep(x$predictors, x$splines),
    coefficient = x$coefficients,
    knot = x$knots
  )

  # standardize all knots to 0-1 range
  # this allows all splines to be plotted together
  for (p in x$predictors) {
    rows_p <- spline_df$pred_name == p
    knots_p <- spline_df$knot[rows_p]
    spline_df$knot[rows_p] <-
      (knots_p - min(knots_p)) / (max(knots_p) - min(knots_p))
  }

  # change community dissimilarity explained to percent
  spline_df$coefficient <- spline_df$coefficient * 100

  # make a list of predictor plot data, one element per predictor
  pred_plot_list <- vector("list", n_preds)
  for (i in seq_len(n_preds)) {
    rows_i <- spline_df$pred_ind == i
    # output buffer handed to the C routine (filled in as `pdata`)
    preddata_i <- rep(0, times = PSAMPLE)
    # C function from gdm that returns the predictor plot data; the argument
    # order was reverse-engineered from plot.gdm.
    pred_plot_list[[i]] <- .C("GetPredictorPlotData",
      pdata = as.double(preddata_i),
      as.integer(PSAMPLE),
      as.double(spline_df$coefficient[rows_i]),
      as.double(spline_df$knot[rows_i]),
      as.integer(sum(rows_i)),
      PACKAGE = "gdm"
    )
  }
  # named lists are nice, add names
  names(pred_plot_list) <- x$predictors

  # drop variables whose summed coefficients are below the threshold
  maxcoef <- vapply(
    x$predictors,
    function(p) sum(spline_df$coefficient[spline_df$pred_name == p]),
    numeric(1)
  )
  goodpreds <- x$predictors[maxcoef >= coef_threshold]
  pred_plot_list <- pred_plot_list[names(pred_plot_list) %in% goodpreds]
  n_preds <- length(pred_plot_list)

  ## Second plot - plot splines
  # make empty plot frame (y range covers all predictors, dropped or not)
  plot(x = NULL, y = NULL, type = "n",
    xlim = c(0, 1),
    ylim = c(0, max(maxcoef)),
    xlab = "Variable range",
    ylab = "% cum. dissimilarity explained"
  )

  # nothing survived the threshold: leave the frame empty instead of failing
  # with "subscript out of bounds" in the loops below
  if (n_preds == 0) {
    warning("No predictor reaches coef_threshold; spline panel left empty.",
            call. = FALSE)
    return(invisible(NULL))
  }

  # make colors
  if (isTRUE(pred_colors[1] == "auto")) {
    # if auto (default), make some colors
    pred_colors <- rainbow(n_preds, start = 0, end = 0.60)
  } else {
    # recycle / trim user colors to exactly one per predictor
    pred_colors <- rep(pred_colors, length.out = n_preds)
  }

  # plot 'em
  curve_x <- seq(from = 0, to = 1, length.out = PSAMPLE)
  for (i in seq_along(pred_plot_list)) {
    points(
      x = curve_x,
      y = pred_plot_list[[i]]$pdata,
      type = "l",
      col = pred_colors[i],
      lwd = 6
    )
  }

  # legend labels: maximum of each curve, zero-padded to one decimal place
  maxvals <- vapply(
    pred_plot_list,
    function(pp) round(max(pp$pdata), 1),
    numeric(1)
  )
  legend_labels <- paste0(
    sprintf("%04.1f", maxvals), "% - ", names(pred_plot_list)
  )

  # add legend
  legend(x = 0, y = max(spline_df$coefficient), legend = legend_labels,
         col = pred_colors, lty = 1, lwd = 6, bty = "n")
}
| 145 | + |
## function to reset par()
# Opens a throw-away graphics device to read the default graphical
# parameters, closes it, and applies those defaults to the device that was
# active when resetPar() was called. If no device was open, nothing is
# applied (that would open a new device).
# Returns the list of default parameters invisibly, so the idiom
# par(resetPar()) also works.
resetPar <- function() {
  prev_dev <- dev.cur()

  dev.new()
  defaults <- par(no.readonly = TRUE)
  dev.off()

  # device 1 is the null device, i.e. nothing was open before
  if (prev_dev > 1) {
    # dev.off() makes the *next* device current, not necessarily the one we
    # came from, so switch back explicitly before resetting
    dev.set(prev_dev)
    par(defaults)
  }

  invisible(defaults)
}
| 152 | + |
## visualize pairwise relationships among variables in a nice and minimalist way
# r's pairs() isn't good enough for me, and chart.Correlation from
# performanceAnalytics is too messy.
# Draws an n x n grid of panels: scatterplots below the diagonal, variable
# names on the diagonal, correlation coefficients above it.
#   df           data frame where each column is a numeric variable to plot
#   label_cex    text size of the variable names on the diagonal
#   point_cex    point size in the scatterplots
#   cor_cex      text size of the correlation coefficients
#   cor_red_lim  coefficients with absolute value above this are drawn in red
#   mthd         correlation method, passed to cor()
# The par() settings changed here are restored when the function exits.
# Returns NULL invisibly.
plot_pairwise_corrs <- function(df, label_cex = 1, point_cex = 1, cor_cex = 2,
                                cor_red_lim = 0.70, mthd = "pearson") {
  n <- ncol(df)
  if (is.null(n) || n < 1) {
    stop("df must have at least one column.", call. = FALSE)
  }

  # par() returns the previous values of what it sets; put them back on exit
  old_par <- par(mfrow = c(n, n), oma = c(5, 4, 0, 0), mar = c(0, 0, 0, 0))
  on.exit(par(old_par), add = TRUE)

  # empty boxed panel used for both the names and the coefficients
  blank_panel <- function() {
    plot(1, type = "n", xlim = c(-1, 1), ylim = c(-1, 1), axes = FALSE,
         xlab = "", ylab = "", pch = 20)
  }

  # panels fill row by row, so i is the grid row and j the grid column
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      if (i > j) {
        # lower tri - do scatterplot
        plot(x = df[, j], y = df[, i], axes = FALSE, xlab = "", ylab = "",
             pch = 20, cex = point_cex)
        box()
      } else if (i == j) {
        # diag - write variable name
        blank_panel()
        text(x = 0, y = 0, labels = colnames(df)[i], cex = label_cex,
             srt = -45)
        box()
      } else {
        # upper tri - nicely display correlation coefficient (r)
        cor_ij <- cor(df[, j], df[, i], use = "complete.obs", method = mthd)
        # cor() gives NA for a constant column; isTRUE() keeps that from
        # breaking the if() and the panel then shows "NA" in black
        col_ij <- if (isTRUE(abs(cor_ij) > cor_red_lim)) "red" else "black"
        cor_label <- sprintf("%.2f", round(cor_ij, 2))

        blank_panel()
        text(x = 0, y = 0, labels = cor_label, cex = cor_cex, col = col_ij)
        box()
      }
    }
  }

  invisible(NULL)
}
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | + |
## site_pair_from_list generates GDM's sitepair table from a list of objects
# valid types in the list are : "numeric" (or "integer") vectors, "matrix",
# or "list".
#   - numeric/integer vectors become ordinary predictor columns
#   - matrices are passed to formatsitepair() as distance predictors
#   - a single list with elements $Lat and $Lon supplies the coordinates
# a strength of this approach is that one can use only a subset of all the
# predictors with the preds2use argument.
#   responseMat  site x site response matrix with sample IDs as row names
#                (passed to formatsitepair() with bioFormat = 3)
#   predList     named list of predictors, see above
#   preds2use    optional character vector of names in predList to keep, in
#                the order they should appear
# Returns the sitepair table from formatsitepair() with NA rows removed.
site_pair_from_list <- function(responseMat, predList, preds2use = NULL) {
  site_ids <- rownames(responseMat)
  if (is.null(site_ids)) {
    stop("responseMat must have row names (they are used as SampleID).",
         call. = FALSE)
  }

  # if preds2use is specified, simplify predList accordingly
  if (!is.null(preds2use)) {
    # drop unused items from predList
    predList <- predList[names(predList) %in% preds2use]
    # get predList into the same order as preds2use (only matters for
    # metadata column vectors...)
    predList <- predList[order(match(names(predList), preds2use))]
  }

  # classify the predictors. inherits() is used instead of class(x) == "..."
  # because a matrix has class c("matrix", "array") in R >= 4.0, which made
  # the equality test miss every distance matrix.
  is_vector <- vapply(predList, inherits, logical(1),
                      what = c("integer", "numeric"))
  is_matrix <- vapply(predList, inherits, logical(1), what = "matrix")
  is_geo <- vapply(predList, inherits, logical(1), what = "list")

  # make table - if there are no vector predictors, make data with a fake
  # variable instead.
  if (any(is_vector)) {
    predDF <- data.frame(
      SampleID = site_ids,                    # siteColumn
      simplify2array(predList[is_vector])     # data columns
    )
  } else {
    predDF <- data.frame(
      SampleID = site_ids,                    # siteColumn
      FakeData = rep(0, nrow(responseMat))    # fake data column
    )
  }

  # check if geo is included (one and only one time!)
  # if so, add geo information to predDF
  # if not, add fake geo information (formatsitepair() insists on coordinates)
  n_geo <- sum(is_geo)
  if (n_geo == 1) {
    geo <- predList[[which(is_geo)]]
    geoLat <- geo$Lat
    geoLon <- geo$Lon
    # data.frame() would silently drop a NULL column
    if (is.null(geoLat) || is.null(geoLon)) {
      stop("The geo list in predList needs elements named Lat and Lon.",
           call. = FALSE)
    }
  } else {
    if (n_geo > 1) {
      warning("More than one list in predList; using placeholder ",
              "coordinates instead.", call. = FALSE)
    }
    geoLat <- rep(1, nrow(responseMat))
    geoLon <- rep(1, nrow(responseMat))
  }
  # add real or fake lat/longs to predDF
  predDF <- data.frame(predDF, Lat = geoLat, Lon = geoLon)

  # format distance matrices (each gets a leading SampleID column)
  if (any(is_matrix)) {
    matrixList <- lapply(X = predList[is_matrix], FUN = gdmize,
                         sampleids = site_ids)
  } else {
    matrixList <- NULL
  }

  # make sitepair table
  spt <- gdm::formatsitepair(
    bioData = gdmize(responseMat, site_ids), bioFormat = 3,
    predData = predDF,
    XColumn = "Lon", YColumn = "Lat",
    distPreds = matrixList,
    siteColumn = "SampleID"
  )

  # remove NAs
  na.omit(spt)
}
| 263 | + |
## fwd_adonis
# forward model selection for adonis
# all RHS vars must be named columns of rhs (which is also passed to adonis()
# as `data`). no interaction terms are considered.
# LHS is a dist object, maybe a community data matrix would work, not tested.
#   lhs     response, used on the left-hand side of every candidate formula
#   rhs     table of candidate predictors, one per named column
#   ncores  number of cores handed to parallel::mclapply()
# At each step every remaining variable is tried and the one giving the
# highest total R2 is added, until all variables are in the model.
# Returns a list:
#   formula_all  formula with every variable, in the order they were added
#   formula_sig  formula of the last (largest) model in which every term had
#                P < 0.05, or the string "No significant models."
#   Pvals, R2s   variable x step matrices of per-term P values and R2
# The first two elements are still available by position ([[1]], [[2]]).
# NOTE(review): adonis() and its $aov.tab come from the vegan package, which
# must already be attached; newer vegan releases favour adonis2() - confirm
# the installed version still provides adonis().
fwd_adonis <- function(lhs, rhs, ncores = 4) {
  lhs_name <- deparse(substitute(lhs))
  n_vars <- ncol(rhs)
  if (is.null(n_vars) || n_vars < 1) {
    stop("rhs must have at least one column.", call. = FALSE)
  }

  vars_in_model <- NULL
  Ps <- R2s <- matrix(data = NA, nrow = n_vars, ncol = n_vars,
                      dimnames = list(colnames(rhs)))

  # this function takes names of variables and returns a formula. The default
  # y = "lhs" refers to this function's own `lhs` argument, which adonis()
  # finds through the formula's environment.
  makefrmla <- function(v, y = "lhs") {
    as.formula(paste(y, "~", paste(v, collapse = " + ")))
  }

  # start progress bar (one tick per candidate model: n + (n - 1) + ... + 1)
  pb <- txtProgressBar(min = 0, max = sum(seq_len(n_vars)), style = 3)
  n_completed <- 0

  # do model selection
  for (j in seq_len(n_vars)) {
    newvars <- colnames(rhs)[!colnames(rhs) %in% vars_in_model]
    # get list of aov tables for each potential new model
    aovs_newvars <- parallel::mclapply(
      X = newvars,
      FUN = function(v) {
        as.data.frame(adonis(makefrmla(c(vars_in_model, v)), data = rhs)$aov.tab)
      },
      mc.cores = ncores
    )
    # mclapply() returns failures as "try-error" objects instead of raising
    failed <- vapply(aovs_newvars, inherits, logical(1), what = "try-error")
    if (any(failed)) {
      stop("adonis() failed for: ", paste(newvars[failed], collapse = ", "),
           "\n", as.character(aovs_newvars[[which(failed)[1]]]),
           call. = FALSE)
    }
    # calculate total R2 for each potential model
    total_r2s <- vapply(
      aovs_newvars,
      function(tab) sum(tab$R2[!rownames(tab) %in% c("Residuals", "Total")]),
      numeric(1)
    )
    # choose which term to add based on total R2 of model
    toadd <- which.max(total_r2s)
    vars_in_model <- c(vars_in_model, newvars[toadd])

    # put Pvals and R2s of the chosen model's terms in column j
    best <- aovs_newvars[[toadd]]
    hit <- match(rownames(R2s), rownames(best))
    found <- !is.na(hit)
    R2s[found, j] <- round(best$R2[hit[found]], 3)
    Ps[found, j] <- round(best[["Pr(>F)"]][hit[found]], 3)

    # update progress bar
    n_completed <- n_completed + length(newvars)
    setTxtProgressBar(pb, n_completed)
  }
  message("")

  # which is the last model that had only significant variables?
  allsig <- apply(X = Ps, MARGIN = 2,
                  FUN = function(p) all(p[!is.na(p)] < 0.05))
  sig_steps <- which(allsig)
  if (length(sig_steps) == 0) {
    formula_sig <- "No significant models."
  } else {
    # max(), not which.max(): which.max(which(allsig)) is only the *count*
    # of all-significant models, not the step number of the last one
    lastallsig <- max(sig_steps)
    formula_sig <- makefrmla(v = vars_in_model[seq_len(lastallsig)],
                             y = lhs_name)
  }
  formula_all <- makefrmla(v = vars_in_model, y = lhs_name)

  # return relevant objects
  list(
    formula_all = formula_all,
    formula_sig = formula_sig,
    Pvals = Ps,
    R2s = R2s
  )
}
| 330 | + |
| 331 | + |
0 commit comments