# check_cutoff_effect.R

#
#	Supp-step 2: Check the normalization cutoff's effect on p-values
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Terms of the PERMANOVA whose p-values are tracked.
# The strings must match the row names of the 'adonis' table produced in the loop below,
# which is why they carry the 'normalizedData$' prefix used in the model formula.
treatments <- c("normalizedData$Plants", "normalizedData$Plants:normalizedData$Water", "normalizedData$Season")
treatmentsLength <- length(treatments)

# 'order_counts.csv' is generated by 'compile_raw.R'
# The first CSV column supplies the row names (library identifiers).
# 'FALSE' is spelled out because 'F' is an ordinary variable that can be reassigned.
rawData <- read.csv('data/order_counts.csv', row.names=1, stringsAsFactors=FALSE)
# Everything downstream filters on the 'reads' column, so fail early with a clear message if it is absent
if (!("reads" %in% colnames(rawData))) {
	stop("'data/order_counts.csv' must contain a 'reads' column", call.=FALSE)
}

# Every observed read count is tried as a rarefaction cutoff, from smallest to largest
cutoffs <- sort(rawData$reads)
cutoffsLength <- length(cutoffs)
# Number of times the random rarefaction/balancing is repeated for each cutoff
reps <- 100

# Rarefaction ('rrarefy'), 'decostand' and 'adonis' are provided by 'vegan'; load it once, not on every iteration
library(vegan)

# pVals[treatment, cutoff, replicate] holds the PERMANOVA p-value of each tracked term.
# Cells start at 1 so that cutoffs which leave too few libraries to analyse stay at p = 1.
# 'dimnames' must be a list (one entry per dimension); a plain character vector is rejected by 'array'.
pVals <- array(
	data=1,
	dim=c(treatmentsLength, cutoffsLength, reps),
	dimnames=list("Treatment"=treatments, "Read Count"=as.character(cutoffs), "Replicate"=NULL)
)

# Columns 3:5 hold the treatment factors. Each combination is identified by one space-separated key.
# The keys are built the same way for the reference table and for the libraries so they always agree.
# The set of combinations depends only on 'rawData', so it is computed once, outside the loops.
trtColumns <- 3:5
trtCombos <- unique(rawData[, trtColumns])
trtNames <- do.call(paste, c(unname(as.list(trtCombos)), sep=" "))
trtLength <- length(trtNames)

for (repIndex in seq_len(reps)) {
	for (cutoffIndex in seq_len(cutoffsLength)) {
		minReads <- cutoffs[cutoffIndex]
		# keep only the libraries that have at least 'minReads' reads (rows with a missing read count are dropped)
		keepLibrary <- !is.na(rawData$reads) & rawData$reads >= minReads
		rawTreatments <- rawData[keepLibrary, 1:6, drop=FALSE]
		rawReads <- rawData[keepLibrary, -(1:6), drop=FALSE]

		# rarefaction (provided by 'vegan') selects a random subsample, n='minReads', of the raw data from each library
		normalizedData <- cbind(rawTreatments, rrarefy(rawReads, minReads))
		normalizedData$reads <- minReads

		# Randomly remove libraries until there is an equal number for all treatment combinations
		#>>>START
		libraryKeys <- do.call(paste, c(unname(as.list(normalizedData[, trtColumns])), sep=" "))

		# find the total number of libraries in each treatment combination
		# (combinations without any library at this cutoff get a total of 0)
		trtCounts <- matrix(0, nrow=trtLength, ncol=2, dimnames=list(trtNames, c("Total", "n to remove")))
		trtCounts[, "Total"] <- tabulate(match(libraryKeys, trtNames), nbins=trtLength)

		# find the minimum library count from all the treatment combinations
		minCount <- min(trtCounts[, "Total"])
		trtCounts[, "n to remove"] <- trtCounts[, "Total"] - minCount

		# select n libraries to remove from every over-represented treatment combination
		librariesChosenForRemoval <- character(0)
		for (i in seq_len(trtLength)) {
			nToRemove <- trtCounts[i, "n to remove"]
			if (nToRemove > 0) {
				# candidates are exactly the libraries that were counted for this combination
				potentialLibraries <- rownames(normalizedData)[libraryKeys == trtNames[i]]
				librariesChosenForRemoval <- c(librariesChosenForRemoval, sample(potentialLibraries, nToRemove))
			}
		}

		# perform the removal
		# An empty negative index would select zero rows (i.e. drop everything), so only subset when needed
		if (length(librariesChosenForRemoval) > 0) {
			normalizedData <- normalizedData[!(rownames(normalizedData) %in% librariesChosenForRemoval), , drop=FALSE]
		}
		#>>>FINISH

		if (nrow(normalizedData) > 1) {
			# 'decostand' in the package 'vegan' normalizes community data (counts of orders per sample)
			# The method 'total' performs this normalization by dividing each order's count by the sample total
			# i.e. each order's datum is now the relative abundance of that order in the sample
			decostandData <- decostand(normalizedData[, -(1:6), drop=FALSE], "total")

			# 'adonis' in the package 'vegan' performs a permutational analysis of variance
			# In short, it shuffles community data between treatments 9999 times (see 'permutations' below)
			#  For each shuffle, it quantifies, proportionally, how much of the variance in dissimilarity is explained by various treatment combinations
			#  Finally, it gives the proportion of random shuffles for which the amount of variance explained by the treatments was equal to or greater than the amount of variance explained when the data were not shuffled
			# The proportion of random shuffles with equally effective treatments is the p-value, the probability of obtaining our observed results under the null hypothesis that treatment doesn't affect dissimilarity
			# NOTE(review): recent 'vegan' releases replace 'adonis' with 'adonis2', whose result is structured differently - confirm the installed version
			adonisResult <- adonis(decostandData~normalizedData$Plants*normalizedData$Water*normalizedData$Season, permutations=9999)
			aov.tab <- adonisResult$aov.tab
			for (treatmentIndex in seq_len(treatmentsLength)) {
				pVals[treatmentIndex, cutoffIndex, repIndex] <- aov.tab[treatments[treatmentIndex], "Pr(>F)"]
			}
		}
	}
}

# Summarise the replicates: mean and standard deviation of the p-value for every treatment x cutoff cell
means <- apply(pVals, MARGIN=c(1,2), FUN=mean)
stdDevs <- apply(pVals, MARGIN=c(1,2), FUN=sd)

# Only the lowest cutoffs are plotted.
# NOTE(review): 38 is presumably the number of cutoffs of interest in the original data set - confirm.
# The count is capped at the number of available cutoffs so smaller data sets do not fail with
# 'subscript out of bounds', and 'drop=FALSE' keeps the matrix shape when a single column remains.
shownCutoffs <- seq_len(min(38, ncol(means)))
significantMeans <- means[, shownCutoffs, drop=FALSE]
significantStdDevs <- stdDevs[, shownCutoffs, drop=FALSE]

# Grouped bars (one group per cutoff, one bar per treatment) with +/- one standard deviation drawn as vertical segments;
# 'barplot' returns the x positions of the bars, which are reused to place the segments
brplt <- barplot(significantMeans, col=rainbow(3), beside=TRUE)
segments(brplt, significantMeans-significantStdDevs, brplt, significantMeans+significantStdDevs, col="black")

# Quit R, saving the workspace (including 'pVals') for later inspection
q(save="yes")