forked from ranjit58/NGS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter-pileup.py
executable file
·49 lines (37 loc) · 1.69 KB
/
filter-pileup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
#-------------------------------------------------------------------------
# Name - filter-pileup.py
# Desc - Filters insertions, deletions, and read-start markers ('^' plus the character that follows it) out of column 5 of a pileup file
# Author - Ranjit Kumar ([email protected])
# Source code - https://github.com/ranjit58/NGS/blob/master/filter-pileup.py
# Usage - filter-pileup.py input.pileup output.pileup
#-------------------------------------------------------------------------
import sys
import re
# Zero-based index of the read-bases column (column 5 of the pileup format).
BASES_COLUMN = 4

# Matches either a '^' followed by any one character, or an indel header:
# a sign and its decimal length (captured in group 1).
_TOKEN_RE = re.compile(r'\^.|[+-]([0-9]+)')
# Run of letters that may follow an indel header.
_LETTERS_RE = re.compile(r'[A-Za-z]*')


def filter_bases(bases):
    """Return *bases* with indel annotations and read-start markers removed.

    Removed tokens:
      * ``+N<seq>`` / ``-N<seq>`` where ``<seq>`` is exactly N letters;
      * ``^`` together with the single character that follows it.

    The string is scanned once from left to right, so every indel is cut
    using its own length.  The character after ``^`` is always consumed as
    part of the marker, even if it is ``+`` or ``-``.  An indel header
    followed by fewer than N letters is left in place unchanged rather
    than being retried forever.
    """
    kept = []
    pos = 0
    while True:
        token = _TOKEN_RE.search(bases, pos)
        if token is None:
            kept.append(bases[pos:])
            break
        kept.append(bases[pos:token.start()])
        length_text = token.group(1)
        if length_text is None:
            # '^' plus the following character: drop both.
            pos = token.end()
            continue
        size = int(length_text)
        letters = _LETTERS_RE.match(bases, token.end())
        if letters.end() - token.end() >= size:
            # Skip the header and exactly `size` letters; any further
            # letters are ordinary base calls and stay in the output.
            pos = token.end() + size
        else:
            # Malformed indel: keep the sign and resume right after it, so
            # the rest of the token is copied through verbatim.
            kept.append(bases[token.start()])
            pos = token.start() + 1
    return ''.join(kept)


def filter_line(line):
    """Return one pileup record with its read-bases column filtered.

    The record is split on tabs and only the column at BASES_COLUMN is
    rewritten; all other columns, including any beyond the sixth, are
    passed through unchanged.  Records with too few columns to contain a
    read-bases column (e.g. blank lines) are returned as they are.  The
    result always ends with a single newline.
    """
    fields = line.rstrip('\n').split('\t')
    if len(fields) > BASES_COLUMN:
        fields[BASES_COLUMN] = filter_bases(fields[BASES_COLUMN])
    return '\t'.join(fields) + '\n'


def main(argv):
    """Filter the pileup file argv[1] into argv[2]; return an exit status.

    Returns 2 (after printing a usage message to stderr) when fewer than
    two paths are given, otherwise 0.  Arguments beyond the second path
    are ignored.
    """
    if len(argv) < 3:
        sys.stderr.write(
            "Usage: filter-pileup.py input.pileup output.pileup\n")
        return 2
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("processing, please wait...")
    # Context managers guarantee both files are flushed and closed, even
    # if a record raises part-way through.
    with open(argv[1], "r") as source, open(argv[2], "w") as sink:
        for record in source:
            sink.write(filter_line(record))
    print("Completed !")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))