-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_massage_offical_annotation.R
296 lines (238 loc) · 12.9 KB
/
1_massage_offical_annotation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#!/usr/local/bin/Rscript
# NOTE: run script from command with -h option to get help page
###################################################
########### Sandelin Lab Cage Pipeline ############
######## Annotation of CAGE peaks (1 of 2) ########
## Script for preparing the official annotation ###
############ Kristoffer Vitting-Seerup ############
###################################################
### Idea
# This script takes the path to a GTF file as input.
# The GTF is then massaged into the GRanges needed for the annotation, and these
# are saved to an Rdata file which is later loaded by the script that annotates the
# CAGE data
### Use argparse to make help file and parse input
if(TRUE) {
    suppressMessages( library('argparser') )

    ### Build the parser; the description below is what the help page prints.
    # NB: the description is a multi-line string literal, so its continuation
    # lines must stay at column 0 to keep the text unchanged.
    argParse <- arg_parser(
        name = '1_massage_official_annotation.R',
        description = 'Description:\nPart 1 (of 2) of the Sandelin Lab
Cage Pipeline Annoation scripts. This script that massage the offical gene annoation so it can be used with
part 2. This script produces two files with the same name as the GTF file, except they are called \'.Rdata\'
and \'.parsed.Rdata\' instead of \'.gtf\'. The second file is used for the CAGE cluster annotation by the
second script. The first file have two functions: First it allows for very fasts loading of the GRange version
of the GTF file. Secondly it allows the user to manually parse non-standard GTF files and save them to an Rdata
file, which is then used instead of the GTF file. The details for the Rdata version of the GTF file are as
follows. It must be a standard GRanges object with chr start end and strand. Furthermore it must have 4 metadata
collumns: 1) A collum called \'type\' indicating whether it is a Exon, a CDS or a UTR regions (other are ignored).
2) A collumn called \'gene_id\' giving the unique gene id (fx ENSG00001. 3) A column called \'gene_name\' giving
the gene name (Fx Rac1) and 4) a collumn called \'transcript_id\' with the unique transcript id. Lastly note that
the GRange object must be called orgGTF.'
    )

    ### Register the command line arguments
    argParse <- add_argument(
        parser = argParse,
        short  = '-g',
        arg    = '-gtf',
        help   = 'The FULL path to the GTF containing the official annotation.'
    )
    argParse <- add_argument(
        parser  = argParse,
        arg     = '-forceGTFimport',
        default = FALSE,
        help    = 'A logic (TRUE/FALSE) which incates whether to import the GTF file even though a \".Rdata\" file containing the corresponding GRanges exists.'
    )

    ### Parse the command line into a named list of arguments
    # Note: argparser ( v 0.1 ) gives a warning message for arguments starting with certain types of names
    inputedArguments <- parse_args(argParse, argv = commandArgs(trailingOnly = TRUE))

    ### Abort if any argument was left unset (NA)
    # The first two entries are skipped - they are always the help and opts entries
    inputTest <- sapply(inputedArguments[-c(1:2)], is.na)
    if( any(inputTest) ) {
        stop(
            paste0(
                'The following input arguments are missing: -',
                paste(names(inputTest)[inputTest], collapse = ', -')
            )
        )
    }
}
### For devel
if(FALSE) {
    ### Development only: manually create the argument list instead of parsing the
    ### command line. Flip the condition to TRUE when running interactively.
    # forceGTFimport is included because the GTF reading step evaluates it inside
    # an if() - leaving it out gives a zero-length condition and an error.
    inputedArguments <- list(
        gtf='/Volumes/BINF-Sandelin/projects/CAGE/genome/annotations/ASM294v2_r26/Schizosaccharomyces_pombe.ASM294v2.26.gtf',
        forceGTFimport=FALSE
    )
    # scratch copy of the path prefix (a bare string - evaluating it does nothing)
    '/Volumes/BINF-Sandelin/projects/CAGE/genome/annotations/ASM294v2_r26/Schizosaccharomyces_pombe.ASM294v2.26.'
    load('/Volumes/BINF-Sandelin/people/rtl144/test/gencodev19hg19.Rdata')# orgGTF
}
### Load dependencies
message('Loading dependencies...')
suppressMessages( library('rtracklayer') )
suppressMessages( library('GenomicRanges') )
suppressMessages( library('XVector') )
suppressMessages( library('plyr') )
### Read in gtf file
if(TRUE) {
    ### Path of the cached GRanges version of the GTF: <name>.gtf -> <name>.Rdata
    # The dot is escaped so that only a literal '.gtf' extension is stripped
    # (an unescaped '.' matches any character, e.g. the '_' in 'foo_gtf').
    gtfAsGRanges <- paste0( gsub('\\.gtf$', '', inputedArguments$gtf, perl = TRUE, ignore.case = TRUE), '.Rdata')

    # isTRUE() keeps the if() condition a single TRUE/FALSE even when the argument is missing or NA
    forceImport <- isTRUE( as.logical(inputedArguments$forceGTFimport) )

    if( file.exists(gtfAsGRanges) && !forceImport ) {
        message('Step 1 of 3: Importing the GRange format of the GTF file from the .Rdata file')
        loadedObjects <- load(gtfAsGRanges)
        # the cached (possibly user made) file must provide the GRanges under the name orgGTF
        if( ! 'orgGTF' %in% loadedObjects ) {
            stop('The file ', gtfAsGRanges, ' does not contain an object called orgGTF')
        }
    } else {
        message('Step 1 of 3: Importing GTF. This may take a while...')
        orgGTF <- import(con = inputedArguments$gtf, format = 'gtf')
        orgGTF <- sort(orgGTF)

        # save for (potential) later usage
        save(orgGTF, file=gtfAsGRanges)
    }

    ### Test input
    # All four metadata columns are used by the subsetting below, so test for all of them
    requiredColumns <- c('type','gene_id','gene_name','transcript_id')
    missingColumns  <- setdiff(requiredColumns, colnames(mcols(orgGTF)))
    if( length(missingColumns) > 0 ) {
        stop(
            'The following metadata columns were not found: ',
            paste(missingColumns, collapse = ', '),
            ' - please massage manually and save the Rdata file yourself'
        )
    }

    ### Massage
    # subset to exon, cds and utr (case insensitive) and only extract the meta columns needed
    canonicalType <- c(exon = 'exon', cds = 'CDS', utr = 'UTR')
    featureType   <- tolower(as.character(orgGTF$type))
    keepIndex     <- which( featureType %in% names(canonicalType) )
    orgGTF <- orgGTF[keepIndex , requiredColumns]

    # The later steps compare type against exactly 'CDS' and 'UTR', so store the
    # canonical spelling - otherwise e.g. 'cds' passes the filter above but is
    # silently ignored afterwards.
    orgGTF$type <- unname( canonicalType[ featureType[keepIndex] ] )

    ### The GRanges that is imported should have the following columns
    # seqnames (chromosome name), ranges (start stop), strand - corresponding to standard GTF file format
    # 4 meta data columns called:
    # 'type' - containing basic annotation, exon, CDS, UTR etc - standard for GTF file
    # 'gene_id' - a unique id for each gene
    # 'gene_name' - containing the official gene name (hgnc, mgi etc)
    # 'transcript_id' - containing unique name of the transcript.
}
### Modify the GTF file to get a set of GRanges shaped exactly like I need (to avoid having to do this each time)
message('Step 2 of 3: Massaging GTF. Usually takes a couple of minuts...')
if(TRUE) {
### Base gene and transcript regions
if(TRUE) {
    ### Gene edges - takes 6 secs
    # range() over the per-gene split of the (metadata free) features
    geneEdges <- suppressMessages(
        unlist(
            range(
                split(orgGTF[,0], f=orgGTF$gene_id)
            )
        )
    )
    # unlist() leaves the gene id as the element name - move it to a metadata column
    mcols(geneEdges)$gene_id <- names(geneEdges)
    names(geneEdges) <- NULL
    mcols(geneEdges)$gene_name <- orgGTF$gene_name[ match(mcols(geneEdges)$gene_id, orgGTF$gene_id) ]

    ### Transcript edges - takes 16 secs
    # same approach, split per transcript instead of per gene
    transcriptEdges <- unlist(
        range(
            split(orgGTF[,0], f=orgGTF$transcript_id)
        )
    )
    mcols(transcriptEdges)$transcript_id <- names(transcriptEdges)
    names(transcriptEdges) <- NULL
    # gene annotation is looked up from the first feature carrying the transcript id
    mcols(transcriptEdges)$gene_id   <- orgGTF$gene_id  [ match(mcols(transcriptEdges)$transcript_id, orgGTF$transcript_id) ]
    mcols(transcriptEdges)$gene_name <- orgGTF$gene_name[ match(mcols(transcriptEdges)$transcript_id, orgGTF$transcript_id) ]
}
### TSS
if(TRUE) {
    ### TSS of every transcript: promoters() with 0 upstream / 1 downstream
    transciptTSS <- GenomicRanges::promoters(transcriptEdges, upstream = 0, downstream = 1)

    ### TSS of every gene, taken from the gene edges - takes < 1 sec
    primaryTSS <- GenomicRanges::promoters(geneEdges, upstream = 0, downstream = 1)

    ### Alternative TSS = transcript TSS that do not overlap any gene level TSS
    # NOTE(review): the overlap test does not compare gene ids, so a transcript
    # TSS coinciding with the primary TSS of a different gene also counts as
    # primary - confirm this is intended.
    alternativeTSS <- transciptTSS[ !overlapsAny(transciptTSS, primaryTSS) ]

    ### Primary TSS = the remaining transcript TSS (these carry the transcript annotation)
    primaryTSS <- transciptTSS[ overlapsAny(transciptTSS, primaryTSS) ]
}
### Coding regions
if(TRUE) {
    # takes 3 secs
    ### Keep the CDS features (gene id only) and merge them per gene
    codingRegions <- orgGTF[ which(orgGTF$type == 'CDS'), 'gene_id' ]
    codingRegions <- split(codingRegions, f = codingRegions$gene_id)
    codingRegions <- unlist( reduce(codingRegions) )

    ### The gene id ends up as element name - store it as metadata and add the gene name
    mcols(codingRegions)$gene_id <- names(codingRegions)
    names(codingRegions) <- NULL
    mcols(codingRegions)$gene_name <- orgGTF$gene_name[ match(codingRegions$gene_id, orgGTF$gene_id) ]
}
### UTR regions - Deviding into 5' and 3' have to be done one transcript at the time.
if(TRUE) {
    # Takes the longest time - 4 min

    ### Divide into 5' and 3' - this is done with data.frames since applying over these is much faster
    utrRegionsDF <- as.data.frame( orgGTF[which(orgGTF$type %in% c('CDS','UTR')) , c('type','transcript_id')] )
    utrRegionsDF$type <- as.vector(utrRegionsDF$type) # unfactor

    suppressMessages(
        utrRegionsDF <- ddply(utrRegionsDF, .variables = 'transcript_id', .progress = 'none', .fun = function(aDF) {
            # The labelling below looks at the first and the last run of feature
            # types, so the rows must be in genomic order. A freshly imported GTF
            # is sorted, but a user supplied .Rdata file is not guaranteed to be.
            aDF <- aDF[order(aDF$start, aDF$end), , drop = FALSE]

            # strand is taken from the first feature of the transcript
            isPlusStrand <- aDF$strand[1] == '+'

            # Run length encode the types: a UTR run at the low coordinate end is
            # the 5'UTR on the plus strand (3'UTR on minus); a UTR run at the high
            # coordinate end is the opposite. UTR runs in between are left as 'UTR'.
            typeRuns <- rle(aDF$type)
            lastRun  <- length(typeRuns$values)

            if( typeRuns$values[1] == 'UTR' ) {
                typeRuns$values[1] <- if(isPlusStrand) '5UTR' else '3UTR'
            }
            if( typeRuns$values[lastRun] == 'UTR' ) {
                typeRuns$values[lastRun] <- if(isPlusStrand) '3UTR' else '5UTR'
            }

            # overwrite in data.frame
            aDF$type <- inverse.rle(typeRuns)

            # subset to only UTR regions
            aDF[which(aDF$type != 'CDS'), , drop = FALSE]
        })
    )

    ### convert back to GRanges and add gene info
    utrRegionsGr <- GRanges(
        utrRegionsDF$seqnames,
        IRanges(utrRegionsDF$start, utrRegionsDF$end),
        strand=utrRegionsDF$strand,
        type=utrRegionsDF$type,
        transcript_id=utrRegionsDF$transcript_id
    )
    utrRegionsGr$gene_id <- orgGTF$gene_id[match(utrRegionsGr$transcript_id , orgGTF$transcript_id)]

    ### divide into 5UTR and 3UTR
    utr5reg <- utrRegionsGr[which(utrRegionsGr$type == '5UTR'),]
    utr3reg <- utrRegionsGr[which(utrRegionsGr$type == '3UTR'),]

    ### reduce on gene level
    utr5regGene <- sort( unlist( reduce( split(utr5reg, f=utr5reg$gene_id) )) )
    utr5regGene$gene_id <- names(utr5regGene)
    utr5regGene$gene_name <- orgGTF$gene_name[match( utr5regGene$gene_id, orgGTF$gene_id)]
    names(utr5regGene) <- NULL

    utr3regGene <- sort( unlist( reduce( split(utr3reg, f=utr3reg$gene_id) )) )
    utr3regGene$gene_id <- names(utr3regGene)
    utr3regGene$gene_name <- orgGTF$gene_name[match( utr3regGene$gene_id, orgGTF$gene_id)]
    names(utr3regGene) <- NULL

    ### remove those parts overlapping with CDS
    #utr5regGene <- GenomicRanges::setdiff(utr5regGene, codingRegions)
    #utr3regGene <- GenomicRanges::setdiff(utr3regGene, codingRegions)
}
### Exons
if(TRUE) {
    # takes 8 sec
    ### Merge all kept features per gene; the per-gene list is kept since the
    ### intron step reads it as well
    reducedGeneList <- reduce(
        split(orgGTF[,0], f=orgGTF$gene_id)
    )

    ### Flatten to a single GRanges and turn the element names into gene annotation
    geneExons <- unlist(reducedGeneList)
    mcols(geneExons)$gene_id <- names(geneExons)
    names(geneExons) <- NULL
    mcols(geneExons)$gene_name <- orgGTF$gene_name[ match(geneExons$gene_id, orgGTF$gene_id) ]
}
### Intron
if(TRUE) {
# Builds geneIntrons: the gaps between the merged exons of each gene, as a
# sorted GRanges annotated with gene_id and gene_name.
# Relies on reducedGeneList created in the exon step above.
# takes < 1 sec
# gaps() is applied per gene to the plain ranges of the merged exons; unlist()
# flattens the result, keeping the gene id as the name of each intron range
# NOTE(review): ranges() carries no seqnames/strand, so this presumably assumes
# all features of a gene lie on one chromosome and strand - confirm
geneIntronsIRanges <- unlist( gaps(ranges( reducedGeneList )) )
# convert the IRanges back to GRanges
# match() gives, for each intron, the index of the first feature in orgGTF with
# the same gene id; chromosome, strand and gene annotation are copied from it
matchingGRangesIndex <- match(names(geneIntronsIRanges), orgGTF$gene_id)
geneIntrons <- GRanges(
seqnames=seqnames(orgGTF)[matchingGRangesIndex],
ranges=geneIntronsIRanges,
strand=strand(orgGTF)[matchingGRangesIndex],
gene_id =orgGTF$gene_id[matchingGRangesIndex],
gene_name=orgGTF$gene_name[matchingGRangesIndex]
)
# the gene id is now stored as metadata, so the element names are dropped
names(geneIntrons) <- NULL
geneIntrons <- sort(geneIntrons)
### remove those parts overlapping with exons
# (disabled - kept for reference)
#geneIntrons <- GenomicRanges::setdiff(geneIntrons, geneExons)
}
}
message('Step 3 of 3: Preparing output...')
### Save all files in a Rdata object
if(TRUE) {
    ### Sanity check that it worked
    # collect every subset that is written to file so empty ones can be reported
    testList <- list(
        geneEdges=geneEdges,
        transcriptEdges=transcriptEdges,
        primaryTSS=primaryTSS,
        alternativeTSS=alternativeTSS,
        utr5regGene=utr5regGene,
        codingRegions=codingRegions,
        utr3regGene=utr3regGene,
        geneExons=geneExons,
        geneIntrons=geneIntrons
    )
    # vapply() guarantees a named integer vector whatever the list contains
    grLength <- vapply(testList, length, integer(1))
    isOfLengthNull <- names(grLength)[grLength == 0]

    # an empty subset is only warned about - the file is still written
    if( length(isOfLengthNull) > 0 ) {
        warning('The parsing of the GTF migh not be sucessfull. The folloing subsets were empty:', paste(isOfLengthNull, collapse = ','))
    }

    ### Create path: <name>.gtf -> <name>.parsed.Rdata
    # The dot is escaped so that only a literal '.gtf' extension is stripped
    # (an unescaped '.' matches any character).
    parsedGRangesPath <- paste0( gsub('\\.gtf$', '', inputedArguments$gtf, perl = TRUE, ignore.case = TRUE), '.parsed.Rdata')

    save(
        geneEdges,
        transcriptEdges,
        primaryTSS,
        alternativeTSS,
        utr5regGene,
        codingRegions,
        utr3regGene,
        geneExons,
        geneIntrons,
        file=parsedGRangesPath
    )
}
#