forked from andreas-schlicker/OncoScape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcnamap.r
324 lines (282 loc) · 13.8 KB
/
cnamap.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
##' Queries Biomart to obtain the chromosomal location for each input gene.
##' Only genes located on chromosomes 1-22, X and Y are kept. If Biomart reports
##' several locations for a gene, the first one is used and a note is appended
##' to the warnings file.
##' @param genes character vector with HGNC symbols
##' @param warningsfile file to save warning messages in; default: "warnings.txt".
##' If the file doesn't exist, it will be created. Otherwise, new warnings are appended.
##' @param genome genome version to be used; "hg19" queries the current Ensembl
##' mart, every other value queries the May 2009 archive (NCBI36/hg18); default: hg18
##' @return Returns a data.frame with the HGNC symbol, chromosome, start and end
##' positions; row names are the HGNC symbols and chromosomes X and Y are
##' recoded as 23 and 24
##' @author Andreas Schlicker
getGeneLocs = function(genes, warningsfile="warnings.txt", genome="hg18") {
  require(biomaRt) || stop("Can't load package \"biomaRt\"!")

  # Connect only to the mart that is actually needed, so that an unreachable
  # archive server cannot break hg19 queries.
  if (genome == "hg19") {
    mart = useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
  } else {
    # List all Biomarts available through the corresponding archive
    # listMarts(host="may2009.archive.ensembl.org",path="/biomart/martservice")
    # Use the latest Biomart that is based on NCBI36/hg18
    mart = useMart(host="may2009.archive.ensembl.org", path="/biomart/martservice", biomart="ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl")
  }

  # Find out the gene locations
  res = getBM(attributes=c("hgnc_symbol", "chromosome_name", "start_position", "end_position"), filters=c("hgnc_symbol"), values=genes, mart=mart)
  # Drop everything that is not on a regular chromosome
  res = res[res$chromosome_name %in% c(1:22, "X", "Y"), , drop=FALSE]

  duplicatedSymbol = duplicated(res$hgnc_symbol)
  if (any(duplicatedSymbol)) {
    wf = file(warningsfile, "a")
    # Make sure the connection is released even if writing fails
    on.exit(close(wf), add=TRUE)
    # One line per duplicated gene; sep="" avoids a stray leading blank on
    # every line after the first.
    cat(paste0("Found several genomic locations for genes: ", res$hgnc_symbol[duplicatedSymbol], "\n"), sep="", file=wf)
    cat("Using the first location for each gene!\n", file=wf)
    res = res[!duplicatedSymbol, , drop=FALSE]
  }

  rownames(res) = res[, "hgnc_symbol"]

  # Recode the sex chromosomes as 23 and 24
  onX = res[, "chromosome_name"] %in% "X"
  if (any(onX)) {
    res[onX, "chromosome_name"] = 23
  }
  onY = res[, "chromosome_name"] %in% "Y"
  if (any(onY)) {
    res[onY, "chromosome_name"] = 24
  }

  res
}
##' Find the copy number segments overlapped by a gene and summarise their copy number.
##' @param start start location of the gene
##' @param stop end location of the gene
##' @param segments matrix with one row per segment; column 1 is not used here
##' (mapGenes2CN passes the chromosome in it), column 2 is the segment start,
##' column 3 the segment end and column 4 the copy number value; all segments
##' have to be on the same chromosome as the position of interest. A single
##' segment may also be given as a plain vector with these four values.
##' If a gene overlaps several segments, the mean copy number of all these
##' segments is returned.
##' @return named list with two elements: cnvalue: copy number value (NA if the
##' gene overlaps no segment); case: integer code; 1=gene is not part of any
##' segment; 2=gene is part of exactly one segment; 3=start of gene lies outside
##' of all segments; 4=end of gene lies outside of all segments; 5=gene spans
##' several segments
##' @author Andreas Schlicker
getCopyNumberValue = function(start, stop, segments) {
  if (is.null(dim(segments))) {
    # A single segment arrives as a plain vector (row subsetting dropped the
    # matrix dimensions), so turn it back into a one-row matrix.
    segments = matrix(segments, nrow=1)
  }

  # as.numeric() keeps the comparisons numeric even if the matrix was built
  # from a data.frame that contained a character column.
  segStart = as.numeric(segments[, 2])
  segEnd = as.numeric(segments[, 3])
  segValue = as.numeric(segments[, 4])

  # A segment is touched by the gene if it begins before the gene ends and
  # ends after the gene begins. which() drops NA comparisons, so a gene with
  # an unknown position overlaps nothing.
  hits = which(segStart <= stop & segEnd >= start)

  if (length(hits) == 0) {
    # The gene doesn't touch any segments
    return(list(cnvalue=as.double(NA), case=1))
  }

  # Every hit ends at or after the gene start, so the gene start lies inside
  # a segment as soon as one hit also begins at or before it (same for the end).
  startWithin = any(segStart[hits] <= start)
  endWithin = any(segEnd[hits] >= stop)

  if (!startWithin) {
    # The gene's start lies before or between segments
    case = 3
  } else if (!endWithin) {
    # The gene's end lies between or behind segments
    case = 4
  } else if (length(hits) == 1) {
    # The gene lies within one segment
    case = 2
  } else {
    # The gene spans different segments
    case = 5
  }

  # Average over exactly those segments the gene overlaps
  list(cnvalue=as.double(mean(segValue[hits])), case=case)
}
##' Maps all input genes to the copy number value of the corresponding segment.
##' @param genes character vector containing HGNC symbols
##' @param cnData matrix with segmented copy number variation data; formatted
##' as CBS output. Column 1 is read as the sample, columns 2, 3 and 4 as
##' chromosome, segment start and segment end, and column 6 as the copy number
##' value.
##' @param cases boolean; if set to TRUE, an additional matrix containing
##' information on how each CN value was determined is returned; default: FALSE
##' @param genome version of the genome; default: "hg18"
##' @return named list with up to two elements; cn.values: gene-by-sample copy
##' number matrix (NA for genes without a location); cn.cases: gene-by-sample
##' conversion case matrix (-1 for genes without a location), only present if
##' cases is TRUE
##' @author Andreas Schlicker
mapGenes2CN = function (genes, cnData, cases=FALSE, genome="hg18") {
  # Keep the flag in its own variable. The case matrix used to be stored in
  # "cases" itself, which turned the later "if (cases)" into a test on a whole
  # matrix (an error since R 4.2).
  reportCases = isTRUE(as.logical(cases))

  # Map all genes to their chromosomal location using biomaRt
  genes2loc = getGeneLocs(genes, genome=genome)

  # List of samples
  allSamples = unique(cnData[, 1])

  # Matrix that will contain the copy number for each gene in each sample
  cn.matrix = matrix(NA, nrow=length(genes), ncol=length(allSamples))
  rownames(cn.matrix) = genes
  colnames(cn.matrix) = allSamples

  # Matrix with the conversion cases, if it is to be reported
  case.matrix = NULL
  if (reportCases) {
    case.matrix = matrix(-1, nrow=length(genes), ncol=length(allSamples))
    rownames(case.matrix) = genes
    colnames(case.matrix) = allSamples
  }

  # The genes per chromosome do not depend on the sample, so collect them once
  chromosomes = unique(genes2loc[, "chromosome_name"])
  genesByChrom = lapply(chromosomes, function(chrom) {
    rownames(genes2loc)[which(genes2loc[, "chromosome_name"] == chrom)]
  })

  # Go through all samples
  for (samp in allSamples) {
    # Always address the result columns by name; a numeric sample ID would
    # otherwise be interpreted as a column position.
    sampName = as.character(samp)
    # Get all segments with their copy number for the current sample
    samp.segs = as.matrix(cnData[which(cnData[, 1] == samp), c(2, 3, 4, 6)])

    # Go through the chromosomes
    for (i in seq_along(chromosomes)) {
      # Get the segments for that particular chromosome
      segs = samp.segs[which(samp.segs[, 1] == chromosomes[i]), ]

      # Go through all genes on that chromosome
      for (gene in genesByChrom[[i]]) {
        # Get the copy number
        res = getCopyNumberValue(as.integer(genes2loc[gene, "start_position"]), as.integer(genes2loc[gene, "end_position"]), segs)
        cn.matrix[gene, sampName] = res[["cnvalue"]]
        # Remember the case if that information is wanted
        if (reportCases) {
          case.matrix[gene, sampName] = res[["case"]]
        }
      }
    }
  }

  res = list(cn.values=cn.matrix)
  if (reportCases) {
    res[["cn.cases"]] = case.matrix
  }

  res
}
##' Calculates correlation between copy number and expression of the corresponding genes.
##' Spearman correlation is calculated using all samples contained in both data
##' matrices. Genes that are missing from either matrix, or for which the test
##' fails, keep NA values and an empty gene name.
##' @param cna.data matrix with genes in rows and samples in columns
##' @param exprs.data matrix with genes in rows and samples in columns
##' @param genes character vector with genes to test
##' @return data.frame with one row per entry of genes (row names are the genes)
##' and the columns gene, cor (correlation estimate) and cor.p (p-value)
corCnaExprs = function(cna.data, exprs.data, genes) {
  # Genes of interest that are contained in both data sets
  common.genes = intersect(genes, intersect(rownames(cna.data), rownames(exprs.data)))
  commonSamples = intersect(colnames(cna.data), colnames(exprs.data))

  cna.data = cna.data[, commonSamples, drop=FALSE]
  exprs.data = exprs.data[, commonSamples, drop=FALSE]

  cors = data.frame(gene=character(length(genes)),
                    cor=rep(NA_real_, times=length(genes)),
                    cor.p=rep(NA_real_, times=length(genes)),
                    stringsAsFactors=FALSE)
  rownames(cors) = genes

  for (x in common.genes) {
    # cor.test() itself drops incomplete pairs, so no "use" argument is needed.
    # A failing test yields NULL; the gene then simply keeps its NA values.
    tempCor = tryCatch(cor.test(exprs.data[x, ],
                                cna.data[x, ],
                                method="spearman",
                                exact=FALSE),
                       error=function(e) NULL)
    if (!is.null(tempCor)) {
      # Fill the columns one by one so the numbers are never converted to
      # character and back.
      cors[x, "gene"] = x
      cors[x, "cor"] = unname(tempCor$estimate)
      cors[x, "cor.p"] = tempCor$p.value
    }
  }

  cors
}
##' Run copy number alteration analysis for each gene.
##' @param tumors matrix with tumor copy number matrix, genes in rows and samples in columns
##' @param normals matrix with normal sample copy number matrix, genes in rows and samples in columns
##' @param exprs expression data matrix, genes in rows and samples in columns
##' @param genes character vector of genes; default: NULL (test all genes in both tumors and normals)
##' @param samples vector with sample names to use for the analysis. If this is NULL, all samples will
##' be used; default: NULL
##' @param paired boolean indicating whether doing paired or unpaired analysis; default: TRUE.
##' If a paired analysis is requested but no paired samples are found, an unpaired analysis is
##' performed and a warning is issued.
##' @return named list with the difference in mean copy number ("diffs"), the correlation
##' results ("cors") and the Wilcoxon test results ("wilcox")
##' @author Andreas Schlicker
doCnaAnalysis = function(tumors,
                         normals,
                         exprs,
                         genes=NULL,
                         samples=NULL,
                         paired=TRUE) {
  selected.genes = doFilter(rownames(tumors), rownames(normals), genes, TRUE)
  # doFilter() is used below as a list holding the tumor genes in element 1 and
  # the normal genes in element 2, so the list itself is never empty; the gene
  # vectors inside have to be checked.
  # NOTE(review): doFilter() is defined elsewhere - confirm its return value.
  if (length(selected.genes) == 0 ||
      length(selected.genes[[1]]) == 0 ||
      length(selected.genes[[2]]) == 0) {
    stop("doCnaAnalysis: No gene of interest is contained in copy number data of both tumors and normals!")
  }

  filtered.samples = doFilter(colnames(tumors), colnames(normals), samples, paired)
  if (paired && (length(filtered.samples[[1]]) == 0 || length(filtered.samples[[2]]) == 0)) {
    # Fall back to an unpaired analysis if there is nothing to pair
    paired = FALSE
    filtered.samples = doFilter(colnames(tumors), colnames(normals), samples, FALSE)
    warning("doCnaAnalysis: No paired copy number samples found. Performing unpaired analysis!")
  }

  tumors = tumors[selected.genes[[1]], filtered.samples[[1]], drop=FALSE]
  normals = normals[selected.genes[[2]], filtered.samples[[2]], drop=FALSE]

  # Calculate the difference in copy number between tumors and normals
  # tumor - normal
  # --> diff < 0 implies that the gene has higher mean copy number values in normals than in tumors
  # --> diff > 0 implies that the gene has lower mean copy number values in normals than in tumors
  mean.diff = meanDiff(tumors, normals)

  # Calculate correlation between copy number and expression
  cors = corCnaExprs(tumors, exprs, selected.genes[[1]])

  # Run Wilcoxon tests
  wilcox = doWilcox(tumors, normals, paired)

  list(diffs=mean.diff, cors=cors, wilcox=wilcox)
}
##' Score genes and find out which samples are affected by corresponding aberrations.
##' A gene receives a score of 1 if its copy number is positively and significantly
##' correlated with expression, its copy number difference passes the cut-off in the
##' requested direction and the Wilcoxon test is significant; otherwise its score is 0.
##' @param tumors matrix with tumor copy number matrix, genes in rows and samples in columns
##' @param normals matrix with normal sample copy number matrix, genes in rows and samples in columns
##' @param cna.analysis list returned by doCnaAnalysis()
##' @param genes character vector of genes; default: NULL (test all genes in both tumors and normals)
##' @param wilcox.FDR significance cut-off for Wilcoxon FDR; default=0.05
##' @param cor.FDR significance cut-off for correlation FDR; default=0.05
##' @param diff.cutoff copy number difference needs to be smaller or greater than this cut-off to be considered significant; default=-0.1
##' @param regulation either "down" or "up" for finding genes that are regulated in the corresponding direction; default="down"
##' @param stddev how many standard deviations does a sample have to be away from the mean to be considered affected; default=1
##' @return named list with scores for genes ("scores"), number of affected samples ("summary"), the lists of
##' affected samples ("samples") and the filtered analysis results extended by the FDR values ("cna.analysis")
##' @author Andreas Schlicker
summarizeCna = function(tumors,
                        normals,
                        cna.analysis,
                        genes=NULL,
                        wilcox.FDR=0.05,
                        cor.FDR=0.05,
                        diff.cutoff=-0.1,
                        regulation=c("down", "up"),
                        stddev=1) {
  regulation = match.arg(regulation)

  # Get the correct comparison function
  # If we want to find genes with higher copy number in tumors get the greaterThan function
  # If we want to find genes with lower copy number in tumors, get the smallerThan function
  compare = switch(regulation, down=smallerThan, up=greaterThan)

  significant.genes = doFilter(rownames(tumors), rownames(normals), genes, TRUE)[[1]]
  if (is.null(genes)) {
    genes = significant.genes
  }

  # Restrict all analysis results to the genes of interest
  cna.analysis$cors = cna.analysis$cors[which(rownames(cna.analysis$cors) %in% significant.genes), ]
  cna.analysis$wilcox = cna.analysis$wilcox[which(names(cna.analysis$wilcox) %in% significant.genes)]
  cna.analysis$diffs = cna.analysis$diffs[which(names(cna.analysis$diffs) %in% significant.genes)]

  ## Integrate results
  # Correlation significance filter
  cna.analysis$cors = cbind(cna.analysis$cors, cor.FDR=p.adjust(cna.analysis$cors[, "cor.p"], method="BH"))
  significant.genes = rownames(cna.analysis$cors)[which(cna.analysis$cors[, "cor.FDR"] <= cor.FDR & cna.analysis$cors[, "cor"] > 0)]

  # Difference filter
  # The Wilcoxon p-values are only corrected across the genes that passed the
  # correlation filter; all other genes get an FDR of NA.
  cna.analysis$wilcox = cbind(wilcox.p=cna.analysis$wilcox,
                              wilcox.FDR=p.adjust(cna.analysis$wilcox[significant.genes], method="BH")[names(cna.analysis$wilcox)])
  significant.genes = names(which(compare(cna.analysis$diffs[significant.genes], diff.cutoff)))

  # Select by position instead of by name: picking a single row of the matrix
  # yields an unnamed value, so names(which(...)) used to lose the gene
  # whenever exactly one gene was left at this point.
  remaining.FDR = cna.analysis$wilcox[significant.genes, "wilcox.FDR"]
  significant.genes = significant.genes[which(remaining.FDR <= wilcox.FDR)]

  gene.scores = rep(0, length(genes))
  names(gene.scores) = genes
  gene.scores[significant.genes] = 1

  affected.samples = countAffectedSamples(genes, significant.genes, tumors, normals, regulation, stddev, TRUE)

  list(scores=gene.scores, summary=affected.samples$summary, samples=affected.samples$samples, cna.analysis=cna.analysis)
}