-
Notifications
You must be signed in to change notification settings - Fork 2
/
parseMPileup.py
243 lines (205 loc) · 9.07 KB
/
parseMPileup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/python
#############################################################################
## Title: parseMPileup.py
## Description: Parse output of samtools mpileup
##
## Author: Pascal Belleau and Astrid Deschenes
## Creation: 2017-09-14
## License: GPL-3
#############################################################################
##################
## IMPORT
##################
import sys
import getopt
from re import split
from re import match
def extractArguments():
    """
    Extract argument values as input by user.

    The command line (sys.argv[1:]) is parsed with getopt. The usage message
    is printed and the program exits with status 2 when an option is not
    valid or when no option is given. The usage message is printed and the
    program exits with status 0 when the help is requested.

    Keyword arguments:
    none

    Returns:
    a tuple (inputFile, outputPrefix, separatedFiles) where inputFile and
    outputPrefix are strings (empty when the option is not given) and
    separatedFiles is a boolean set to True when -s is present
    """
    usage = 'usage: parseMPileup.py -i <inputFile> -p <outputPrefix> [-s] [-h]'

    ## Valid arguments are:
    ##    -i or --ifile for the input file
    ##    -p or --pfile for the prefix of the output file
    ##    -s for the creation of separated files
    ##    -h or --help for help
    try:
        ## Positional leftovers are not used
        opts, _ = getopt.getopt(sys.argv[1:], "hi:p:s",
                                ["help", "ifile=", "pfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        raise

    ## At least one option is needed to do something useful
    if not opts:
        print(usage)
        sys.exit(2)

    inputFile = ''
    outputPrefix = ''
    separatedFiles = False

    for opt, arg in opts:
        ## getopt returns long options with their leading dashes ("--help")
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit(0)
        elif opt in ("-i", "--ifile"):
            inputFile = arg
        elif opt in ("-p", "--pfile"):
            outputPrefix = arg
        elif opt == "-s":
            separatedFiles = True

    print('Input file is " %s "' % inputFile)
    print('Output prefix file is " %s "' % outputPrefix)

    return (inputFile, outputPrefix, separatedFiles)
def open_read_file(inputfile):
    """
    Return a file pointer opened in text reading mode on the given file.

    An IOError carrying the name of the file is raised when the file
    cannot be opened.

    Keyword arguments:
    inputfile -- the name of the input file
    """
    try:
        ## Hand the file pointer straight back to the caller on success
        return open(inputfile, 'r')
    except IOError:
        message = "Cannot open file : %s \n\n" % (inputfile)
        raise IOError(message)
def open_write_file(outputFile):
    """
    Return a file pointer opened in text writing mode on the given file.

    The file is created, or truncated when it already exists. An IOError
    carrying the name of the file is raised when the file cannot be opened.

    Keyword arguments:
    outputFile -- the name of the output file
    """
    try:
        ## Hand the file pointer straight back to the caller on success
        return open(outputFile, 'w')
    except IOError:
        message = "Cannot open file : %s \n\n" % (outputFile)
        raise IOError(message)
def extractCigarSeq(sequence, phred, mapq, info):
    """
    Extract the (Phred score, MAPQ) tuples of the bases aligned at one
    position, grouped by base.

    The read bases string is scanned from left to right. Matches to the
    reference ("." and ",") are stored under the reference base, explicit
    bases are stored under their own letter and each indel adds a (-1, -1)
    entry under 'O'. The program exits with status 2 when the read bases
    string contains a character, or an indel, that cannot be interpreted.

    Keyword arguments:
    sequence -- the read bases aligned at a specific position
    phred -- the base qualities for the same position
    mapq -- the map qualities for the same position
    info -- a dictionary containing the information about the reference base, the position, the chromosome and the number of bases (only the 'ref' entry is used here)

    Returns:
    a dictionary with the keys 'A', 'C', 'G', 'T', 'N' and 'O', each one
    associated to a list of (Phred score, MAPQ) tuples
    """
    ## Offset used to obtain the Phred and MAPQ values
    asciiOffset = 33

    ## The reference base is upper-cased like the sequence; a soft-masked
    ## (lower case) reference would otherwise not match any key
    ref = info['ref'].upper()

    ## The dictionary that is going to contain all (Phred score, MAPQ) tuple for each base of the sequence
    letters = dict((base, list()) for base in ('A', 'C', 'G', 'T', 'N', 'O'))

    sequence = sequence.upper()
    sequenceLength = len(sequence)

    positionSeq = 0
    ## The Phred and MAPQ strings always advance together, one entry per read
    positionQual = 0

    while positionSeq < sequenceLength:
        currentData = sequence[positionSeq]

        if currentData == "," or currentData == "." or currentData in letters:
            ## "," and "." are matches to the reference base
            base = currentData if currentData in letters else ref
            phredVal = ord(phred[positionQual]) - asciiOffset
            mapqVal = ord(mapq[positionQual]) - asciiOffset
            letters[base].append((phredVal, mapqVal))
            positionSeq += 1
            positionQual += 1
        elif currentData == "+" or currentData == "-":
            res = match(r"[+-](\d+)", sequence[positionSeq:])
            if res is None:
                ## An indel sign must be followed by the length of the indel
                print("Problem with letter: " + currentData)
                print("in this cigar string: " + sequence)
                sys.exit(2)
            letters['O'].append((-1, -1))
            indelLength = res.group(1)
            ## Skip the sign, the digits of the length and the indel bases
            positionSeq += 1 + len(indelLength) + int(indelLength)
            ## No change in positionQual because there is no Phred/MAPQ value for an indel
        elif currentData in (">", "<", "*", "#"):
            ## ">" and "<" : reference skip (CIGAR "N")
            ## "*" and "#" : placeholder for a base deleted in the read
            ## The read is counted in the depth, so a Phred value and a MAPQ
            ## value are present for it; both are skipped to keep the
            ## following bases aligned with their own qualities
            positionSeq += 1
            positionQual += 1
        elif currentData == "$":
            positionSeq += 1
            ## No change in positionQual because the end-of-read marker has no Phred/MAPQ value
        elif currentData == "^":
            ## The beginning is composed of 2 characters : "^" followed by the MAPQ value
            positionSeq += 2
            ## No change in positionQual because this MAPQ value is in the sequence field
        else:
            print("Problem with letter: " + currentData)
            print("in this cigar string: " + sequence)
            sys.exit(2)

    return letters
def parsePileup(inputFile, outputPrefix, separatedFiles):
    """
    Parse a pileup file and write the number of reads supporting each base
    at each position.

    Each line of the input file must be tab-separated and contain at least
    7 columns: chromosome, position, reference base, number of reads, read
    bases, base qualities and map qualities.

    The global counts (A, C, G, T, Other, N) are written in
    <outputPrefix>.txt. For each combination of minimum Phred score
    (15, 20, 30, 25) and minimum MAPQ (0, 1, 5, 10), the counts (A, C, G, T)
    of the bases reaching both minimums are written in
    <outputPrefix>_<phred>_<mapq>.txt.

    When separatedFiles is False, each output file starts with a header and
    each line starts with the chromosome and the position. When
    separatedFiles is True, the output files only contain the counts and the
    chromosome and position are written once in <outputPrefix>_pos.txt.

    Keyword arguments:
    inputFile -- the name of the input file
    outputPrefix -- the prefix of the output files
    separatedFiles -- a boolean indicating if the position file should be created separately
    """
    phredThresholds = (15, 20, 30, 25)
    mapqThresholds = (0, 1, 5, 10)
    filteredBases = ('A', 'C', 'G', 'T')
    allBases = ('A', 'C', 'G', 'T', 'O', 'N')

    ## Every file opened is kept here so that all of them are closed even
    ## when the parsing fails
    openedFiles = []

    try:
        ## Open input file
        iFile = open_read_file(inputFile)
        openedFiles.append(iFile)

        ## Open global output file
        oFile = open_write_file(outputPrefix + ".txt")
        openedFiles.append(oFile)

        posFile = None
        if separatedFiles:
            ## Open separated file to contain information about position
            posFile = open_write_file(outputPrefix + "_pos.txt")
            openedFiles.append(posFile)
        else:
            ## Write header when no separated file is created
            oFile.write("Chromosome\tPosition\tA\tC\tG\tT\tOther\tN\n")

        ## One output file for each (Phred, MAPQ) combination
        oExtraFile = dict()
        for phred in phredThresholds:
            for mapq in mapqThresholds:
                extraFile = open_write_file(outputPrefix + "_" + str(phred) + "_" + str(mapq) + ".txt")
                openedFiles.append(extraFile)
                oExtraFile[(phred, mapq)] = extraFile
                if not separatedFiles:
                    ## Header only present when files are not separated
                    extraFile.write("Chromosome\tPosition\tA\tC\tG\tT\n")

        for line in iFile:
            ## Blank lines (such as a trailing empty line) carry no position
            if not line.strip():
                continue

            data = line.rstrip("\r\n").split("\t")
            info = dict([('chr', data[0]), ('pos', data[1]), ('ref', data[2]), ('NB', data[3])])
            lettersCount = extractCigarSeq(data[4], data[5], data[6], info)

            ## Write information for all bases aligned
            globalCounts = "\t".join(str(len(lettersCount[base])) for base in allBases)
            if separatedFiles:
                posFile.write("%s\t%s\n" % (info['chr'], info['pos']))
                oFile.write(globalCounts + "\n")
            else:
                oFile.write("%s\t%s\t%s\n" % (info['chr'], info['pos'], globalCounts))

            ## Write information for the bases passing each quality filter
            for phred in phredThresholds:
                for mapq in mapqThresholds:
                    outputF = oExtraFile[(phred, mapq)]
                    if not separatedFiles:
                        outputF.write("%s\t%s\t" % (info['chr'], info['pos']))
                    ## A base is kept when both its Phred score and its MAPQ
                    ## reach the minimum values of the current file
                    filteredCounts = []
                    for letterNow in filteredBases:
                        kept = sum(1 for g in lettersCount[letterNow] if g[0] >= phred and g[1] >= mapq)
                        filteredCounts.append(str(kept))
                    outputF.write("\t".join(filteredCounts) + "\n")
    finally:
        ## Close all files
        for openedFile in openedFiles:
            openedFile.close()
if __name__ == "__main__":
    ## Read the command line; the usage message is shown by
    ## extractArguments() when the arguments are not coherent
    arguments = extractArguments()

    ## Parse the pileup file with (inputFile, outputPrefix, separatedFiles)
    parsePileup(*arguments)