#CAWatershedBands.R
library("plyr")
library(dplyr)
library("ggplot2")
library(lubridate)
library("ape")
library("vegan")
library("microbiome")
library(data.table)
library(tidyr)
#This script examines co-occurrence networks generated at the HUC-8 watershed scale from the SCCWRP
#California data set. Each watershed is divided into land use bands, and the core networks
#that persist across all land use bands within a watershed are compared to infer the relationship
#between co-occurrence network structure and environmental stress.
setwd("~/Desktop/SCCWRP")
#Read in site data containing biological counts, water chemistry, and land usage values.
GISBioData <- read.table("CAGISBioData.csv", header=TRUE, sep=",",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
#Read in sample metadata.
SCCWRP <- read.table("CSCI.csv", header=TRUE, sep=",",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
#How many samples per group for generating a co-occurrence network?
groupNum=10
#Run the analysis on the SCCWRP archive at the watershed scale.
#Select watersheds with a large enough set of samples for analysis.
watersheds <- subset(as.data.frame(table(SCCWRP$Watershed)),Freq>=3*groupNum)
colnames(watersheds) <- c("Watershed","Samples")
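#Requiring at least 3*groupNum samples overall is a necessary condition for a watershed
#to supply groupNum samples in each of the three land use bands defined below.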
#Watersheds with a large enough set of samples to generate co-occurrence networks
#for each land use band.
LargeWS <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(LargeWS) <- c("Watershed","Samples")
#This portion of code determines which watersheds have enough samples in all land use bands
#to generate networks.
#Set land use thresholds.
PD <- 5 #Partially degraded land use threshold.
HD <- 15 #Highly degraded land use threshold.
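#Together these thresholds split samples into three bands of the LU_2000_5K score
#(apparently the land use intensity in 2000 within a 5 km buffer): <=PD, (PD,HD], and >HD.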
#Subset data by watershed, and then by land use bands.
for(WS in watersheds$Watershed){
  #Get samples per watershed.
  GISBioDataSubset <- subset(GISBioData,Watershed==WS)
  j=0
  for(i in 1:3){
    if(i==1){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K<=PD)
      if(length(unique(GISBioDataSubsetLU$UniqueID))>=groupNum){
        j=j+1
      }
    }
    if(i==2){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K > PD & LU_2000_5K<=HD)
      if(length(unique(GISBioDataSubsetLU$UniqueID))>=groupNum){
        j=j+1
      }
    }
    if(i==3){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K > HD)
      if(length(unique(GISBioDataSubsetLU$UniqueID))>=groupNum){
        j=j+1
      }
    }
  }
  #Get watersheds with a minimum number of samples per watershed and land use band.
  if(j==3){
    LargeWS <- rbind(LargeWS,subset(watersheds,watersheds$Watershed==WS))
  }
}
#Generate eLSA input files for each watershed that contains a sufficient number of
#samples in each land use band.
for(WS in LargeWS$Watershed){
  GISBioDataSubset <- subset(GISBioData,Watershed==WS)
  for(i in 1:3){
    if(i==1){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K<=PD)
    }
    if(i==2){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K > PD & LU_2000_5K<=HD)
    }
    if(i==3){
      GISBioDataSubsetLU <- subset(GISBioDataSubset,LU_2000_5K > HD)
    }
    selected <- GISBioDataSubsetLU
    eLSAInput <- as.data.frame(unique(selected$FinalID))
    colnames(eLSAInput)<-c("FinalID")
    eLSAInput <- as.data.frame(eLSAInput[order(as.character(eLSAInput$FinalID)),])
    colnames(eLSAInput)<-c("FinalID")
    #Add the relative taxa abundances by column to a new dataframe.
    #The rows are the unique taxa in a given subset of data.
    selected <- selected[order(selected$Year,selected$UniqueID,selected$FinalID),]
    for(ID in unique(selected$UniqueID)){
      tmp <- filter(selected, UniqueID == ID)[,c("FinalID","Measurement","UniqueID")]
      tmp <- as.data.frame(tmp[order(tmp$FinalID),])
      tmp <- tmp[-c(3)]
      colnames(tmp)<-c("FinalID",paste("Measurement",ID,sep=" "))
      eLSAInput <- join(eLSAInput,tmp,by="FinalID")
      eLSAInput$FinalID=as.character(eLSAInput$FinalID)
      eLSAInput <- eLSAInput %>% group_by(FinalID) %>% summarise_if(is.numeric,mean,na.rm=TRUE)
    }
    eLSAInput[is.na(eLSAInput)] <- "NA"
    #Determine the number of time points in the eLSA input file.
    spotNum = length(unique(selected$Year))
    #Determine the number of replicates per time point in the eLSA input file.
    #In order to ensure a uniform number of replicates per year, this needs to
    #be the maximum number of replicates across all of the years available.
    repMax = 0
    for(year in unique(selected$Year)){
      tmp <- filter(selected, Year == year)[,c("UniqueID","Year")]
      repNum = length(unique(tmp$UniqueID))
      if(repNum >= repMax){repMax = repNum}
    }
    repNum = repMax
    #Now insert the replicates with actual data in between the "NA" dummy columns,
    #which ensures that the final eLSA input file has a uniform number of replicates
    #per year regardless of the variations in the actual number of sites (replicates)
    #sampled per year.
    eLSAtmp <- eLSAInput[,1]
    j=1
    k=1
    for(year in unique(selected$Year)){
      tmp <- filter(selected, Year == year)
      rep = length(unique(tmp$UniqueID))
      #Note: this inner loop reuses the index i; R reassigns the loop variable from its
      #sequence on each iteration, so the outer band loop is unaffected.
      for(i in 1:repNum){
        if(i <= rep){
          repLabel = paste(year,"DoneRep",i,sep="")
          j=j+1
          k=k+1
          eLSAtmp[,k] <- eLSAInput[,j]
        } else{
          repLabel = as.character(paste(year,"Rep",i,sep=""))
          k=k+1
          eLSAtmp[,k] <- "NA"
        }
      }
    }
    eLSAInput <- eLSAtmp
    #Designate a unique filename.
    #S is the number of spots, or years, represented in the subsample group.
    #R is the number of replicates per year. Many of the years will have null replicates, but a uniform number is needed for eLSA.
    #M is the mean LU_2000_5K score per subsample group.
    sampleNum <- length(unique(selected$UniqueID))
    meanLU = mean(na.omit(selected$LU_2000_5K))
    filename <- paste("Watershed",WS,"NSamples",sampleNum,"S",spotNum,"R",repNum,"M",meanLU,sep="")
    #Output file for use in eLSA.
    write.table(eLSAInput,paste(filename,".txt",sep=""),quote=FALSE,sep="\t",row.names = FALSE)
    eLSACommand = paste("lsa_compute ",filename,".txt ","-r ",repNum," -s ",spotNum," ",filename,"Network.txt;",sep="")
    print(eLSACommand)
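    #The printed lsa_compute commands can then be run in a shell where eLSA is
    #installed (assuming lsa_compute is on the PATH); each run produces the
    #corresponding *Network.txt file that the analysis section below reads back in.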
  }
}
#Read in eLSA output.
#Compute network statistics of the likeliest association networks between taxa.
library(igraph)
library(network)
library(stringr)
#Read in site data.
GISBioData <- read.table("CAGISBioData.csv", header=TRUE, sep=",",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
#Ensure that all sites have a land use value.
GISBioData <- subset(GISBioData, LU_2000_5K != "NA")
#Filter to the taxonomic group of interest (benthic macroinvertebrates).
GISBioData <- subset(GISBioData, MeasurementType == "benthic macroinvertebrate relative abundance")
#Get list of taxa of interest.
taxaList <- as.data.frame(table(GISBioData$FinalID))
colnames(taxaList) <- c("FinalID","Freq")
#Get network files to analyze.
networkfiles <- as.data.frame(Sys.glob("Watershed*NSamples*Network.txt"))
colnames(networkfiles) <- c("filename")
#Extract the watershed name from each network filename.
networkfiles$Watershed <- str_split_fixed(str_split_fixed(networkfiles$filename,"Watershed",2)[,2],"NSamples",2)[,1]
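#For example, a hypothetical file "WatershedSantaAnaNSamples30S5R6M12.5Network.txt" yields "SantaAna".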
networkfiles <- arrange(networkfiles,Watershed)
#Analyze network topologies for co-occurrence networks generated from taxa which occur
#across all land use bands on a per watershed basis.
networkAnalysis <- data.frame()
for(WS in unique(networkfiles$Watershed)){
  WSSubset <- subset(networkfiles,Watershed==WS)
  coreTaxa <- list()
  #Get the taxa which are common to a watershed across all land use bands.
  for(networkFile in unique(WSSubset$filename)){
    networkdata <- read.delim(networkFile,header=TRUE, sep="\t",as.is=T,check.names=FALSE)
    #Filter out taxa not being focused on in a particular study.
    networkdata <- networkdata[networkdata$X %in% as.vector(taxaList$FinalID) | networkdata$Y %in% as.vector(taxaList$FinalID),]
    #Keep association network edges whose P and Q scores, for the local similarity
    #between two factors, fall below a particular threshold.
    networkdata <- filter(networkdata, P <= 1e-2)
    networkdata <- filter(networkdata, Q <= 1e-2)
    names(networkdata)[names(networkdata)=="LS"]<-"weight"
    #Get taxa present for each co-occurrence network.
    uniqueTaxa <- append(unique(networkdata$X),unique(networkdata$Y))
    if(length(coreTaxa)==0){
      coreTaxa <- uniqueTaxa
    } else{
      coreTaxa <- Reduce(intersect,list(coreTaxa,uniqueTaxa))
    }
  }
  for(networkFile in unique(WSSubset$filename)){
    networkdata <- read.delim(networkFile,header=TRUE, sep="\t",as.is=T,check.names=FALSE)
    #Keep only the associations involving taxa common to all land use bands in a watershed.
    #Note that this retains edges with at least one core-taxon endpoint; requiring both
    #endpoints to be core taxa would use & in place of |.
    networkdata <- networkdata[networkdata$X %in% as.vector(coreTaxa) | networkdata$Y %in% as.vector(coreTaxa),]
    #Keep association network edges whose P and Q scores, for the local similarity
    #between two factors, fall below a particular threshold.
    networkdata <- filter(networkdata, P <= 1e-2)
    networkdata <- filter(networkdata, Q <= 1e-2)
    names(networkdata)[names(networkdata)=="LS"]<-"weight"
    #Get the mean land use score of the sample group from the filename.
    options(digits=15)
    meanLU <- as.numeric(str_match(networkFile,"M(.*?)Network")[2])
    #Initialize all network metrics to NA so that an empty covariant or contravariant
    #network yields missing values rather than values carried over from the previous file.
    cov_nodes <- cov_C <- cov_Edge <- cov_meanStrength <- cov_zeta <- cov_lr_zeta <- cov_M <- cov_L <- cov_lr_L <- cov_Cl <- cov_lr_Cl <- NA
    con_nodes <- con_C <- con_Edge <- con_meanStrength <- con_zeta <- con_lr_zeta <- con_M <- con_L <- con_lr_L <- con_Cl <- con_lr_Cl <- NA
    #Generate network graph and begin calculating network parameters.
    networkgraph=graph.data.frame(networkdata,directed=FALSE)
    #Covariant (positive association) co-occurrence network.
    networkgraphCov <- graph.data.frame(subset(networkdata,weight>0),directed=FALSE)
    if(ecount(networkgraphCov)>0){
      # Generate adjacency matrix of relative taxa abundance correlations
      adj= as.network(get.adjacency(networkgraphCov,attr='weight',sparse=FALSE),directed=FALSE,loops=FALSE,matrix.type="adjacency")
      cov_C = network.density(adj)
      #Calculate the degree heterogeneity.
      networkmatrix <- as.matrix(get.adjacency(networkgraphCov,attr='weight'))
      #Mean interaction strength
      cov_meanStrength <- mean(abs(networkmatrix))
      networkmatrix[upper.tri(networkmatrix)] <- 0
      networkmatrix <- ifelse(networkmatrix!=0,1,networkmatrix)
      cov_zeta <- mean(colSums(networkmatrix)^2)/mean(colSums(networkmatrix))^2
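      #zeta is the degree heterogeneity <k^2>/<k>^2: it equals 1 when every node has
      #the same degree and grows as the degree distribution broadens.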
      #Generate a randomized version of the binarized adjacency matrix.
      #Calculate the degree heterogeneity of the corresponding random network.
      set.seed(1)
      randnetworkmatrixCov <- matrix(sample(as.vector((networkmatrix))),nrow=nrow(networkmatrix),ncol=ncol(networkmatrix))
      randnetworkmatrixCov[upper.tri(randnetworkmatrixCov)] <- 0
      randnetworkmatrixCov <- ifelse(randnetworkmatrixCov!=0,1,randnetworkmatrixCov)
      cov_rand_zeta <- mean(colSums(randnetworkmatrixCov)^2)/mean(colSums(randnetworkmatrixCov))^2
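      #Note that ones landing in the upper triangle of the shuffled matrix are zeroed out,
      #so the randomized network can end up with fewer links than the original.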
      # Log response ratio of degree heterogeneity.
      cov_lr_zeta <- log(cov_zeta/cov_rand_zeta)
      #Calculate modularity
      cov_M <- modularity(cluster_edge_betweenness(networkgraphCov, weights=NULL,directed=FALSE))
      #Edge counts
      cov_Edge <- ecount(networkgraphCov)
      #Node counts
      cov_nodes <- vcount(networkgraphCov)
      #Get the average degree per node.
      cov_k <- (2*cov_Edge)/cov_nodes
      #Characteristic path length.
      cov_L <- mean_distance(networkgraphCov,directed=FALSE)
      #Log ratio of characteristic path length to its random counterpart.
      cov_lr_L <- log(mean_distance(networkgraphCov,directed=FALSE)/(0.5+((log(cov_nodes)-0.5772156649)/log(cov_k))))
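      #The random counterpart 0.5+((ln(N)-gamma)/ln(<k>)), with gamma ~ 0.5772 the
      #Euler-Mascheroni constant, approximates the mean path length of an Erdos-Renyi
      #random graph with N nodes and mean degree <k>.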
      #Clustering coefficient.
      cov_Cl <- transitivity(networkgraphCov,type="globalundirected",isolate="zero")
      #Log ratio of clustering coefficient to its random counterpart.
      cov_lr_Cl <- log(transitivity(networkgraphCov,type="globalundirected",isolate="zero")/(cov_k/cov_nodes))
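      #The random counterpart <k>/N approximates the expected clustering coefficient of
      #an Erdos-Renyi random graph of the same size and mean degree.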
    }
    #Contravariant (negative association) co-occurrence network.
    networkgraphCon <- graph.data.frame(subset(networkdata,weight<0),directed=FALSE)
    if(ecount(networkgraphCon)>0){
      # Generate adjacency matrix of relative taxa abundance correlations
      adj= as.network(get.adjacency(networkgraphCon,attr='weight',sparse=FALSE),directed=FALSE,loops=FALSE,matrix.type="adjacency")
      con_C = network.density(adj)
      #Calculate the degree heterogeneity.
      networkmatrix <- as.matrix(get.adjacency(networkgraphCon,attr='weight'))
      #Mean interaction strength
      con_meanStrength <- mean(abs(networkmatrix))
      networkmatrix[upper.tri(networkmatrix)] <- 0
      networkmatrix <- ifelse(networkmatrix!=0,1,networkmatrix)
      con_zeta <- mean(colSums(networkmatrix)^2)/mean(colSums(networkmatrix))^2
      #Generate a randomized version of the binarized adjacency matrix.
      #Calculate the degree heterogeneity of the corresponding random network.
      set.seed(1)
      randnetworkmatrixCon <- matrix(sample(as.vector((networkmatrix))),nrow=nrow(networkmatrix),ncol=ncol(networkmatrix))
      randnetworkmatrixCon[upper.tri(randnetworkmatrixCon)] <- 0
      randnetworkmatrixCon <- ifelse(randnetworkmatrixCon!=0,1,randnetworkmatrixCon)
      con_rand_zeta <- mean(colSums(randnetworkmatrixCon)^2)/mean(colSums(randnetworkmatrixCon))^2
      # Log response ratio of degree heterogeneity.
      con_lr_zeta <- log(con_zeta/con_rand_zeta)
      #Calculate modularity
      con_M <- modularity(cluster_edge_betweenness(networkgraphCon, weights=NULL,directed=FALSE))
      #Edge counts
      con_Edge <- ecount(networkgraphCon)
      #Node counts
      con_nodes <- vcount(networkgraphCon)
      #Get the average degree per node.
      con_k <- (2*con_Edge)/con_nodes
      #Characteristic path length.
      con_L <- mean_distance(networkgraphCon,directed=FALSE)
      #Log ratio of characteristic path length to its random counterpart.
      con_lr_L <- log(mean_distance(networkgraphCon,directed=FALSE)/(0.5+((log(con_nodes)-0.5772156649)/log(con_k))))
      #Clustering coefficient.
      con_Cl <- transitivity(networkgraphCon,type="globalundirected",isolate="zero")
      #Log ratio of clustering coefficient to its random counterpart.
      con_lr_Cl <- log(transitivity(networkgraphCon,type="globalundirected",isolate="zero")/(con_k/con_nodes))
    }
    #Add all network parameters to a dataframe, including the log ratio forms of the
    #path length and clustering coefficient so the downstream column references resolve.
    networkAnalysis <- rbindlist(list(networkAnalysis,list(networkFile,WS,meanLU,cov_nodes,cov_C,cov_Edge,cov_meanStrength,cov_zeta,cov_lr_zeta,cov_M,cov_L,cov_lr_L,cov_Cl,cov_lr_Cl,con_nodes,con_C,con_Edge,con_meanStrength,con_zeta,con_lr_zeta,con_M,con_L,con_lr_L,con_Cl,con_lr_Cl)))
    print(paste(networkFile,WS,meanLU,cov_nodes,cov_C,cov_Edge,cov_meanStrength,cov_zeta,cov_lr_zeta,cov_M,cov_lr_L,cov_lr_Cl,con_nodes,con_C,con_Edge,con_meanStrength,con_zeta,con_lr_zeta,con_M,con_lr_L,con_lr_Cl))
  }
}
colnames(networkAnalysis) <- c("filename","Watershed","meanLU","cov_nodes","cov_C","cov_Edge","cov_meanStrength","cov_zeta","cov_lr_zeta","cov_M","cov_L","cov_lr_L","cov_Cl","cov_lr_Cl","con_nodes","con_C","con_Edge","con_meanStrength","con_zeta","con_lr_zeta","con_M","con_L","con_lr_L","con_Cl","con_lr_Cl")
networkAnalysis <- dplyr::arrange(networkAnalysis,Watershed,meanLU)
#Remove any watersheds with spurious results due to insufficient statistics.
networkAnalysis <- subset(networkAnalysis,Watershed!="LakeTahoe")
#Replace infinite values with NA.
networkAnalysis[networkAnalysis=="-Inf"] <- NA
networkAnalysis[networkAnalysis=="Inf"] <- NA
#Regression between network parameters.
library(Hmisc)
library(corrplot)
library("PerformanceAnalytics")
#Each significance level is associated with a symbol: p-values (0, 0.001, 0.01, 0.05, 0.1, 1) <=> symbols ("***", "**", "*", ".", " ")
chart.Correlation(networkAnalysis[,c("meanLU","cov_nodes","cov_C","cov_Edge","cov_meanStrength","cov_zeta","cov_lr_zeta","cov_M","cov_lr_L")], histogram=FALSE, method="spearman")
chart.Correlation(networkAnalysis[,c("meanLU","con_nodes","con_C","con_Edge","con_meanStrength","con_zeta","con_lr_zeta","con_M","con_lr_L")], histogram=FALSE, method="spearman")
#Check how network topologies change within watersheds.
test <- networkAnalysis
test$diff_meanLU <- ave(test$meanLU, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_nodes <- ave(test$cov_nodes, test$Watershed, FUN=function(x) c(NA,diff(x)))
test$diff_con_nodes <- ave(test$con_nodes, test$Watershed, FUN=function(x) c(NA,diff(x)))
test$diff_cov_C <- ave(test$cov_C, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_C <- ave(test$con_C, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_zeta <- ave(test$cov_zeta, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_zeta <- ave(test$con_zeta, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_lr_zeta <- ave(test$cov_lr_zeta, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_lr_zeta <- ave(test$con_lr_zeta, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_meanStrength <- ave(test$cov_meanStrength, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_meanStrength <- ave(test$con_meanStrength, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_M <- ave(test$cov_M, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_M <- ave(test$con_M, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_L <- ave(test$cov_L, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_L <- ave(test$con_L, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_Cl <- ave(test$cov_Cl, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_Cl <- ave(test$con_Cl, test$Watershed, FUN=function(x) c(NA, diff(x)))
chart.Correlation(test[,c("diff_meanLU","diff_cov_nodes","diff_cov_C","diff_cov_meanStrength","diff_cov_zeta","diff_cov_lr_zeta","diff_cov_M","diff_cov_L","diff_cov_Cl")], histogram=FALSE, method="spearman")
chart.Correlation(test[,c("diff_meanLU","diff_con_nodes","diff_con_C","diff_con_meanStrength","diff_con_zeta","diff_con_lr_zeta","diff_con_M","diff_con_L","diff_con_Cl")], histogram=FALSE, method="spearman")