-
Notifications
You must be signed in to change notification settings - Fork 6
/
prune_aln_cols.py
executable file
·173 lines (133 loc) · 4.71 KB
/
prune_aln_cols.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python
"""Prune certain columns from alignment
"""
#--- standard library imports
#
import sys
import logging
# optparse deprecated from Python 2.7 on
from optparse import OptionParser, SUPPRESS_HELP
from collections import Counter
#--- third-party imports
#
import Bio
from Bio import AlignIO
#--- project specific imports
#
import bioutils
__author__ = "Andreas Wilm"
__version__ = "0.1"
__email__ = "[email protected]"
__license__ = "The MIT License (MIT)"
# Script-wide logger. This is the root logger; it is configured to emit
# warnings and above, each prefixed with its level name and a timestamp.
# http://docs.python.org/library/logging.html
logging.basicConfig(
    format='%(levelname)s [%(asctime)s]: %(message)s',
    level=logging.WARN)
LOG = logging.getLogger("")
def cmdline_parser():
    """Create the command-line option parser for this script.

    The parser knows the three pruning switches (--all-gap, --any-gap,
    --identical), the input alignment (-i/--in) and its format
    (-f/--infmt), plus the --verbose and --debug flags, which are
    hidden from the help output. No option declares a default, so
    options that are not given on the command line come back as None.

    Returns:
        A configured optparse.OptionParser instance.
    """
    # http://docs.python.org/library/optparse.html

    # The module docstring is None when docstrings are stripped
    # (python -OO); fall back to an empty description instead of
    # failing on str + None.
    description = __doc__ or ""
    usage = "%prog: " + description + "\n" \
            "usage: %prog [options]"
    parser = OptionParser(usage=usage)

    # Logging switches, hidden from --help.
    parser.add_option("--verbose",
                      dest="verbose",
                      action="store_true",
                      help=SUPPRESS_HELP)  # "be verbose"
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help=SUPPRESS_HELP)  # "debugging"

    # Pruning operations.
    parser.add_option("--all-gap",
                      action="store_true",
                      dest="all_gap",
                      help="Prune columns if all residues are gaps")
    parser.add_option("--any-gap",
                      action="store_true",
                      dest="any_gap",
                      help="Prune columns with at least one gap")
    parser.add_option("--identical",
                      action="store_true",
                      dest="identical",
                      help="Prune columns if all residues are identical")

    # Input alignment and its format.
    parser.add_option("-i", "--in",
                      dest="aln_in",
                      help="Input alignment ('-' for stdin)")
    parser.add_option("-f", "--infmt",
                      dest="informat",
                      help="Input format (must be supported by Biopython)")
    return parser
def prune_aln(aln, what, fh_out=sys.stdout):
    """Prune columns from an alignment and write the result as FASTA.

    Columns are compared case-insensitively (residues are upper-cased
    before comparison), but the residues written out keep their
    original case.

    Args:
        aln: the alignment; it is iterated for its records (each with
            `.id` and `.seq`) and asked for `get_alignment_length()`.
        what: which columns to drop, one of
            'any_gap'   - columns containing at least one gap,
            'all_gap'   - columns consisting only of gaps,
            'identical' - columns with a single distinct character
                          (this includes columns made up only of one
                          gap character).
        fh_out: writable file-like object receiving the output
            (defaults to the sys.stdout bound at definition time).

    Raises:
        ValueError: if `what` is not one of the supported operations.
    """
    # Explicit check instead of assert, which is stripped under -O.
    if what not in ('any_gap', 'all_gap', 'identical'):
        raise ValueError("Unknown pruning operation: %r" % (what,))

    # Convert each sequence to a plain string once rather than indexing
    # the sequence objects again for every column.
    records = list(aln)
    seqs = [str(rec.seq) for rec in records]

    keep_cols = []
    # range() rather than xrange() so this also runs on Python 3.
    for i in range(aln.get_alignment_length()):
        residues = set(seq[i].upper() for seq in seqs)

        # bioutils.isgap is defined outside this file; it is used here
        # as a per-character gap predicate.
        if what == 'any_gap':
            prune = any(bioutils.isgap(c) for c in residues)
        elif what == 'all_gap':
            prune = all(bioutils.isgap(c) for c in residues)
        else:  # 'identical'
            prune = len(residues) == 1

        if not prune:
            keep_cols.append(i)

    # FIXME add support for proper alignment output, not just
    # concatenated fasta
    LOG.info("Keeping the following columns: %s",
             ', '.join(str(col + 1) for col in keep_cols))  # 1-based

    for rec, seq in zip(records, seqs):
        fh_out.write(">%s\n" % rec.id)
        fh_out.write('%s\n' % ''.join(seq[i] for i in keep_cols))
def main():
    """Parse the command line, read the alignment and prune it.

    Exactly one of --any-gap, --all-gap and --identical must be given,
    together with an input alignment (-i, '-' meaning stdin). Usage
    problems are reported through parser.error(), which prints the
    message and terminates the program. The pruned alignment is
    written to stdout.
    """
    parser = cmdline_parser()
    opts, _ = parser.parse_args()  # positional arguments are ignored

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    # parser.error() does not return, so no explicit sys.exit() needed.
    if not opts.aln_in:
        parser.error("Missing input alignment argument")

    # Report conflicting or missing operations as usage errors rather
    # than with assert, which is stripped under -O and would silently
    # let the last selected operation win.
    selected = [name for name in ('any_gap', 'all_gap', 'identical')
                if getattr(opts, name)]
    if len(selected) > 1:
        parser.error("Can only do one operation at a time")
    if not selected:
        parser.error("No operation selected")
    what = selected[0]

    if opts.aln_in == "-":
        fh_in = sys.stdin
    else:
        # Python 2 needs "U" for universal newlines; Python 3 text mode
        # translates newlines by default and 3.11+ rejects the "U" flag.
        mode = "rU" if sys.version_info[0] < 3 else "r"
        fh_in = open(opts.aln_in, mode)

    try:
        fmt = opts.informat
        if not fmt:
            # NOTE(review): for stdin this hands "-" to
            # bioutils.guess_seqformat (defined outside this file);
            # confirm that it copes, or require --infmt with stdin.
            fmt = bioutils.guess_seqformat(opts.aln_in)
        aln = AlignIO.read(fh_in, fmt)
    finally:
        # Close the file even if format guessing or parsing fails, but
        # never close stdin.
        if fh_in is not sys.stdin:
            fh_in.close()

    prune_aln(aln, what, sys.stdout)
if __name__ == "__main__":
    main()

    # These notes are logged at INFO level. The logger starts out at
    # WARN and is only lowered by main() when --verbose/--debug is
    # given, so they are emitted after main() has run.
    if sys.version_info < (2, 7) or sys.version_info > (2, 8):
        LOG.info("only tested Python 2.7 so far")

    # Use only the leading numeric fields of the Biopython version, so
    # that a non-numeric suffix (e.g. a ".dev0" field) does not raise
    # ValueError after the actual work has already succeeded.
    version_fields = []
    for field in Bio.__version__.split('.'):
        if not field.isdigit():
            break
        version_fields.append(int(field))
    biopython_version = tuple(version_fields)
    if biopython_version < (1, 55) or biopython_version > (1, 57):
        LOG.info("using untested version of Biopython")

    LOG.info("Successful exit")