-
Notifications
You must be signed in to change notification settings - Fork 15
/
recountReads.py
executable file
·177 lines (146 loc) · 6.37 KB
/
recountReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
'''
Created on 02.03.2015
This script recalculates the number of read per base and joins the .
If there is an
@author: david
'''
import argparse, os, re, sys
import pysam
from Helper import Helper
parser = argparse.ArgumentParser(description='Merges the GVF Files and recalculates the base Counts after RnaEditor is finished.')
parser.add_argument('-f', '--files', metavar='N', type=str, nargs='+', help='the list of files')
parser.add_argument('-b', '--bams', metavar='N', type=str, nargs='+', help='the list of bam files')
parser.add_argument('-t', '--top', metavar='N', type=str, nargs="+", help='list of header names (space separated)')
parser.add_argument('-o', '--outFile', metavar='output File', type=str,help='Output File', default="baseCounts_combined.txt")
parser.add_argument('-c', '--columns', metavar='N', type=int, nargs='+', help='columns to keep (space separated)',default=[2],)
parser.add_argument('-k', '--keys', metavar='N', nargs='+', type=int, help='columnnumber on which to join',default=[1])
parser.add_argument('-d', '--delimiter', metavar='N', type=str, help='delimiter', default="\t")
parser.add_argument('-e', '--empty', metavar='N', type=str, help='Sign for empty Values', default="--")
args = parser.parse_args()
startTime = Helper.getTime()
def fillDicts(files,columns,keys):
'''
creates the table and fills the set of keys
'''
fileNumber=len(files)
fileCounter=0
keySet=()
fileCounter=0
for file in files: #loop through all files
i=0
Helper.info("Get information from %s" % file)
file = open(file)
for line in file: #loop through current file
line = line.split()
keyTuple=()
for k in keys:
keyTuple=keyTuple+(line[k-1],)
value=[]
for column in columns: #get the needed values
try:
value.append(line[column-1])
except IndexError:
raise ValueError("Not enough rows in line: %s in file %s" % (" ".join(line),file.name))
if keyTuple in keySet:
#currentDefaultList=idDict[keyTuple]
#currentDefaultList[fileCounter]=value
#idDict[keyTuple]=currentDefaultList
idDict[keyTuple][fileCounter]=value #replace filecounter List with values from current File
else:
currentDefaultList=[["--"]*len(columns)]*len(files) #create default list, with all values empty
currentDefaultList[fileCounter]=value
idDict[keyTuple]=currentDefaultList
keySet=keySet+(keyTuple,)
i+=1
if i % 1000 == 0:
Helper.status("%s lines parsed" % i)
fileCounter+=1
return idDict,keySet
def getBaseCount(reads, varPos):
'''
:param reads:
:param varPos:
'''
'''
returns the baseCount for the
'''
baseCount = {'A':0,'C':0,'G':0,'T':0}
for read in reads:
readPos=0
mmReadPos=0
startPos = read.pos
try:
cigarNums=re.split("[MIDNSHP]", read.cigarstring)[:-1]
cigarLetters=re.split("[0-9]+",read.cigarstring)[1:]
except (TypeError):
continue #for unmapped reads the cigarstring is empty
#to avoid a query for unmapped reads all the
#time the error is catched and the read will be skipped
#raise TypeError("Invalid Cigar String %s" % read.cigarstring)
for i in range(len(cigarLetters)): #parse over single read
if cigarLetters[i] in {"I","S","H"}: #Insertion, Soft Clipping and Hard Clipping
readPos = readPos + int(cigarNums[i])
elif cigarLetters[i] in {"D","N"}: #Deletions and skipped Regions
startPos = startPos + int(cigarNums[i])
elif cigarLetters[i] in {"M"}: #Matches
for j in range(int(cigarNums[i])):
if startPos == varPos:
mmReadPos = readPos
mmReadBase= read.seq[mmReadPos]
try:
baseCount[mmReadBase]+=1 #increase number for the base at the mm pos
except (KeyError):
sys.stderr.write("unknown Base %s \n" % mmReadBase)
readPos += 1
startPos += 1
return map(str,[baseCount['A'],baseCount['C'],baseCount['G'],baseCount['T']])
'''check right order of bam and gtf files'''
if len(args.bams) != len(args.files):
raise ValueError("Number of GVF and Bam Files should be the same")
counter=0
for bam in args.bams:
bam=os.path.basename(bam)
gtf=os.path.basename(args.files[counter])
bamBase = bam.split(".")[0]
gtfBase = gtf.split(".")[0]
if bamBase != gtfBase:
raise ValueError("BAM and GVF Files have to be in the same order")
counter+=1
'''Fill the header'''
idDict = {}
if args.top==None:
header=[]
for file in args.files:
header.append(os.path.basename(file))
else:
header = args.top
'''fill table'''
idDict,keySet = fillDicts(args.files, args.columns,args.keys)
'''recount Reads'''
fileCounter=0
defaultList= ["--"]*len(args.columns)
for bamFile in args.bams:
i=0
#Helper.status("recounting Reads for %s" % bamFile)
Helper.info("recounting Reads from %s" % bamFile)
samfile = pysam.AlignmentFile(bamFile, "rb")
for keyTuple in keySet[1:]:
i+=1
'''check if basecount is unset for current condition'''
if idDict[keyTuple][fileCounter] == defaultList:
chr,startAnalysis = keyTuple[3],int(keyTuple[7])-1 #pysam is zero based
reads=samfile.fetch(chr, startAnalysis, startAnalysis+1)
baseCount = getBaseCount(reads,startAnalysis)
idDict[keyTuple][fileCounter] = baseCount
if counter % 1000 == 0:
Helper.status("%s out of %s editing sites finished" % (i,len(keySet)))
fileCounter+=1
'''write the results to the output file'''
outFile = open(args.outFile,"w")
deli="\t"*len(args.columns)
outFile.write("\t"*len(args.keys)+deli.join(header)+"\n")
for keyTuple in keySet:
output=list(keyTuple)
for v in idDict[keyTuple]:
output=output+v
outFile.write("\t".join(output)+"\n")