forked from esherm/genomicHeatmapMaker
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenomicHeatmapMaker.R
126 lines (107 loc) · 5.15 KB
/
genomicHeatmapMaker.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# This source code file is a component of the larger INSPIIRED genomic analysis software package.
# Copyright (C) 2016 Frederic Bushman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
library(colorspace)
library(hiAnnotator)
library(intSiteRetriever)
library(GCcontent)
library(BSgenome)
library(BSgenome.Hsapiens.UCSC.hg18)
library(BSgenome.Mmusculus.UCSC.mm9)
#library(pipeUtils)
codeDir <- dirname(sub("--file=", "", grep("--file=", commandArgs(trailingOnly=FALSE), value=T)))
make_heatmap <- function(sampleName_GTSP, referenceGenome, output_dir, connection) {
sites_mrcs <- get_sites_controls_from_db(
sampleName_GTSP, referenceGenome, connection)
sites_to_heatmap(sites_mrcs, referenceGenome, output_dir)
}
#' generate heatmap
#'
#' @param sites_mrcs real sites and mrcs GRanges object with metacolumns:
#' sitID, type, label
#' @param referenceGenome reference genome name, e.g. "hg18"
#' @param output_dir name of the directory
sites_to_heatmap <- function(sites_mrcs, referenceGenome, output_dir) {
reference_genome_sequence <- get_reference_genome(referenceGenome)
# TODO: populate from local database, at present pulled from UCSC web-site
refSeq_genes <- getRefSeq_genes(referenceGenome)
CpG_islands <- getCpG_islands(referenceGenome)
### oncogene_file <- "allonco_no_pipes.csv"
oncogene_file <- file.path(codeDir, "allonco_no_pipes.csv")
if (grepl("^mm", referenceGenome)) {
### oncogene_file <- "allonco_no_pipes.mm.csv"
oncogene_file <- file.path(codeDir, "allonco_no_pipes.mm.csv")
}
# @return vector of gene symbols
get_oncogene_from_file <- function(filename) {
onco <- read.csv(filename, header=FALSE, stringsAsFactors=FALSE)
as.character(onco$V1)
}
oncogenes <- get_oncogene_from_file(oncogene_file)
# END annotation loading
# is there oncogene closer than 50k
refSeq_gene_symbols <- refSeq_genes$name2
#' check if gene is onco gene list(curated by Bushman's lab)
#' @return TRUE if onco-gene FALSE if not
is_onco_gene <- function(gene_symbol_sites, oncogenes) {
toupper(gene_symbol_sites) %in% toupper(oncogenes)
}
is_refSeq_oncogene <- is_onco_gene(refSeq_gene_symbols, oncogenes)
refSeq_oncogene <- refSeq_genes[is_refSeq_oncogene]
sites_mrcs <- getNearestFeature(
sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco")
#sites_mrcs, refSeq_oncogene, dists.only=TRUE, colnam="onco.100k")
sites_mrcs$onco.100k <- abs(sites_mrcs$oncoDist) <= 50000
sites_mrcs$oncoDist <- NULL
# end oncogene
# need within_refSeq_gene for getPositionalValuesOfFeature
sites_mrcs <- getSitesInFeature(
sites_mrcs, refSeq_genes, "within_refSeq_gene", asBool=TRUE)
sites_mrcs <- getPositionalValuesOfFeature(sites_mrcs, refSeq_genes)
# redo the work to move it after positional values
mcols(sites_mrcs)$within_refSeq_gene <- NULL
sites_mrcs <- getSitesInFeature(
sites_mrcs, refSeq_genes, "within_refSeq_gene", asBool=TRUE)
window_size_refSeq <- c("10k"=1e4, "100k"=1e5, "1M"=1e6)
sites_mrcs <- getFeatureCounts(sites_mrcs, refSeq_genes, "refSeq_counts",
width=window_size_refSeq)
window_size_GC <- c("100"=100,
"1k"=1000, "10k"=1e4, "100k"=1e5, "1M"=1e6)
sites_mrcs <- getGCpercentage(
sites_mrcs, "GC", window_size_GC, reference_genome_sequence)
window_size_CpG_counts <- c("1k"=1e3, "10k"=1e4)
sites_mrcs <- getFeatureCounts(sites_mrcs, CpG_islands, "CpG_counts",
width=window_size_CpG_counts)
window_size_CpG_density <- c("10k"=1e4, "100k"=1e5, "1M"=1e6)
sites_mrcs <- getFeatureCounts(sites_mrcs, CpG_islands, "CpG_density",
width=window_size_CpG_density)
sites_mrcs <- from_counts_to_density(sites_mrcs,
"CpG_density", window_size_CpG_density)
if( ! grepl("^mm", referenceGenome)) { # mouse does not have DNaseI track
DNaseI <- getDNaseI(referenceGenome)
window_size_DNaseI <- c("1k"=1e3, "10k"=1e4, "100k"=1e5, "1M"=1e6)
sites_mrcs <- getFeatureCounts(sites_mrcs, DNaseI, "DNaseI_count",
width=window_size_DNaseI)
}
if (config$rocControls == 'unmatched')
{
sites_to_ROC_ordinary(sites_mrcs, output_dir)
} else if (config$rocControls == 'matched') {
sites_to_ROC_matched(sites_mrcs, output_dir)
} else {
stop('Error, rocControls is not properly defined in the configuration file.')
}
}