-
Notifications
You must be signed in to change notification settings - Fork 2
/
SCCWRPWSbyLUZeta.R
151 lines (140 loc) · 7.77 KB
/
SCCWRPWSbyLUZeta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
library("plyr")
library(dplyr)
library("ggplot2")
library(lubridate)
library("ape")
library("vegan")
library("microbiome")
library(data.table)
library(tidyr)
library(hierDiversity)
library(zetadiv)
#This script focuses on zeta diversity patterns across the SCCWRP California streams data set.
setwd("~/Desktop/SCCWRP")
#Read in site data containing biological counts, water chemistry, and land usage
#values. If this file is not yet generated then proceed with the following commands
#to generate it in the first place.
GISBioData <- read.table("CAGISBioData.csv", header=TRUE, sep=",",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
#Filter out to taxonomic groups of interest.
GISBioData <- subset(GISBioData, MeasurementType == "benthic macroinvertebrate relative abundance")
#Remove duplicate measures.
GISBioData <- GISBioData[!duplicated(GISBioData[,c("UniqueID","FinalID","Count")]),]
#Order data by LU_2000_5K.
GISBioData <- arrange(GISBioData,LU_2000_5K)
#Add taxa counts by sample.
tmp <- data.frame(table(GISBioData$UniqueID))
colnames(tmp) <- c("UniqueID","nTaxa")
GISBioData <- join(GISBioData,tmp,by=c("UniqueID"))
#Read in sample metadata.
SCCWRP <- read.table("CSCI.csv", header=TRUE, sep=",",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
#Get latitude and longitude bounds of data set.
latMin <- min(SCCWRP$Latitude)
latMax <- max(SCCWRP$Latitude)
lonMin <- min(SCCWRP$Longitude)
lonMax <- max(SCCWRP$Longitude)
#Merge in sample altitude.
GISBioData <- join(GISBioData,SCCWRP[,c("UniqueID","altitude")],by=c("UniqueID"))
#Get samples per watershed.
watersheds <- as.data.frame((table(SCCWRP$Watershed)))
colnames(watersheds) <- c("Watershed","Samples")
#Get the samples per watershed for watersheds with at least a certain number of samples.
LargeWatersheds <- subset(watersheds,Samples>=40)
#Taxa frequency table.
taxaFreq <- as.data.frame(table(GISBioData$FinalID))
colnames(taxaFreq) <- c("FinalID","Freq")
#Find the total number of taxa in the full data set.
taxaMax <- length(unique(GISBioData$FinalID))
#Iterate over watersheds, and then high and low land use bands.
#Find zeta diversity decay parameters within each subset of stream samples.
zetaAnalysis <- data.frame()
for(WS in LargeWatersheds$Watershed){
#Get all samples for a given watershed.
GISBioDataWS <- subset(GISBioData,Watershed==WS)
#Set land use threshold.
threshold <- 5
#Get samples above and below threshold.
GISBioDataWSLow <- subset(GISBioDataWS,LU_2000_5K < threshold)
GISBioDataWSHigh <- subset(GISBioDataWS,LU_2000_5K >= threshold)
#Get the number of samples which fit the land use profile for a given watersheds.
nLow <- length(unique(GISBioDataWSLow$UniqueID))
nHigh <- length(unique(GISBioDataWSHigh$UniqueID))
#Find watersheds with at least a certain number of samples in low and high land use bands.
if(nLow >= 20 & nHigh >= 20){
for(i in 1:2){
if(i==1){
selected <- arrange(GISBioDataWSLow,Year,UniqueID)
}
if(i==2){
selected <- arrange(GISBioDataWSHigh,Year,UniqueID)
}
meanLU <- mean(na.omit(selected$LU_2000_5K))
print(paste(WS,meanLU))
#Generating a presence/absence matrix for California SCCWRP data.
eLSAInput <- as.data.frame(unique(selected$FinalID))
colnames(eLSAInput)<-c("FinalID")
eLSAInput <- as.data.frame(eLSAInput[order(as.character(eLSAInput$FinalID)),])
colnames(eLSAInput)<-c("FinalID")
taxa <- eLSAInput
#Add the relative taxa abundances by column to a new dataframe.
#The rows are the unique taxa in a given subset of data.
selected <- selected[order(selected$Year,selected$UniqueID,selected$FinalID),]
for(ID in unique(selected$UniqueID)){
tmp <- filter(selected, UniqueID == ID)[,c("FinalID","Measurement","UniqueID")]
tmp <- as.data.frame(tmp[order(tmp$FinalID),])
tmp <- tmp[-c(3)]
colnames(tmp)<-c("FinalID",ID)
tmp <- tmp %>% group_by(FinalID) %>% summarise_if(is.numeric,mean,na.rm=TRUE)
tmp <- join(tmp,taxa,type="full",by=c("FinalID"))
tmp <- as.data.frame(tmp[order(tmp$FinalID),])
eLSAInput <- cbind(eLSAInput,tmp)
eLSAInput <- eLSAInput[,!duplicated(colnames(eLSAInput))]
}
#Generate a presence/absence dataframe for zeta diversity analysis.
#Rows for samples, columns for taxa IDs.
eLSAInput[is.na(eLSAInput)] <- 0
eLSANames <- eLSAInput$FinalID
data.SCCWRP <- as.data.frame(t(eLSAInput[,-c(1)]))
colnames(data.SCCWRP) <- eLSANames
data.SCCWRP[data.SCCWRP > 0] <- 1
#Computes zeta diversity, the number of species shared by multiple assemblages, for a range of orders (number of assemblages or sites),
#using combinations of sampled sites, and fits the decline to an exponential and a power law relationship.
zetaDecay <- Zeta.decline.mc(data.SCCWRP,xy=NULL,orders=1:10,sam=1000)
dat <- data.frame()
dat[1,1] <- zetaDecay$zeta.exp$coefficients[1] #Zeta diversity exponential decay intercept.
dat[1,2] <- zetaDecay$zeta.exp$coefficients[2] #Zeta diversity exponential decay exponent.
dat[1,3] <- zetaDecay$aic$AIC[1] #AIC coefficient Zeta diversity exponential decay.
dat[1,4] <- zetaDecay$zeta.pl$coefficients[1] #Zeta diversity power law decay intercept.
dat[1,5] <- zetaDecay$zeta.pl$coefficients[2] #Zeta diversity power law decay exponent.
dat[1,6] <- zetaDecay$aic$AIC[2] #AIC coefficient Zeta diversity power law decay.
zetaAnalysis <- rbind(zetaAnalysis,dat)
print(dat)
}
}
}
colnames(zetaAnalysis) <- c("ZetaExponentialIntercept","ZetaExponentialExponent","ZetaExponentialAIC","ZetaPLIntercept","ZetaPLExponent","ZetaPLAIC")
write.table(zetaAnalysis,"SCCWRPWSByLU20SamplesZeta.txt",quote=FALSE,sep="\t",row.names = FALSE)
networkAnalysis <- read.table("SCCWRPWatershedByLU20Samples.txt", header=TRUE, sep="\t",as.is=T,skip=0,fill=TRUE,check.names=FALSE)
networkAnalysis <- arrange(networkAnalysis,Watershed,meanLU)
networkAnalysis <- cbind(networkAnalysis,zetaAnalysis)
#Check how network topologies change within watersheds.
test <- networkAnalysis
#Optional step if one wants to subsample results by groups of watersheds.
#test <- networkAnalysis[networkAnalysis$Watershed %in% HighLUWS$Watershed,]
#Calculate change in network topological measures.
test$diff_meanLU <- ave(test$meanLU, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_zeta_Cov <- ave(test$zeta_Cov, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_zeta_Con <- ave(test$zeta_Con, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_l_cov_rM <- ave(test$l_cov_rM, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_l_con_rM <- ave(test$l_con_rM, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_M <- ave(test$cov_M, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_M <- ave(test$con_M, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_cov_C <- ave(test$cov_C, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_con_C <- ave(test$con_C, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_meanStrength_Cov <- ave(test$meanStrength_Cov, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_meanStrength_Con <- ave(test$meanStrength_Con, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_lambda_network_m_Cov <- ave(test$lambda_network_m_Cov, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_lambda_network_m_Con <- ave(test$lambda_network_m_Con, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_lambda_network_m <- ave(test$lambda_network_m, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_Network_size <- ave(test$Network_size, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_Pm <- ave(test$Pm, test$Watershed, FUN=function(x) c(NA, diff(x)))
test$diff_ZetaPLExponent <- ave(test$ZetaPLExponent, test$Watershed, FUN=function(x) c(NA, diff(x)))