-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblobplot_stats_to_species.py
executable file
·101 lines (79 loc) · 3.45 KB
/
blobplot_stats_to_species.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
""" Use some dirty heuristics to look at the summary tables from
Blobtools and decide what species is in the sample.
"""
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging as L
from pprint import pprint
def main(args):
L.basicConfig(level=(L.DEBUG if args.verbose else L.WARNING))
# We could use Pandas here, but I'm not going to. Just load the TSV files
# naively.
stats_tables = []
for statfile in args.stats:
stats_tables.append(load_stat_file(statfile))
verdict = tables_to_verdict(stats_tables, args.cutoff, args.dominance)
print(";".join(verdict or [f"No hits >{args.cutoff}%"]))
def tables_to_verdict(stats_tables, cutoff, dominance):
ignore_names = set(["all", "no-hit", "other", "synthetic construct"])
verdict = []
for table in stats_tables:
lastperc = 0.0
for arow in table:
# Get the taxon name and the hit percentage
tname, tperc = arow['name'], arow['_sortkey']
if tperc < cutoff:
# We're not interested in this or anything below it.
# Go to next table.
break
elif tname in ignore_names:
# We don't care about this row but keep looking
continue
elif ( tperc >= cutoff
and tperc >= (lastperc - dominance) ):
verdict.append(f"{tname} ({tperc}%)")
lastperc = tperc
# If we already have a verdict, no need to go to the next table
if verdict:
break
return verdict
def load_stat_file(filename, sortby="cov0_read_map_p"):
"""Load the TSV file. We only need the first and last columns but may as well get
the lot.
"""
lines = []
with open(filename) as fh:
for aline in fh:
aline = aline.strip()
if aline == "No data":
# Special case
return []
if aline.startswith("# "):
headers = aline[2:].split("\t")
break
assert aline.startswith("##")
for aline in fh:
linedict = dict(zip( headers,
aline.strip().split("\t") ))
linedict['_sortkey'] = float(linedict[sortby].strip("% "))
lines.append(linedict)
lines.sort(key=lambda r: float(r['_sortkey']), reverse=True)
return lines
def parse_args(*args):
description = """ Takes a list of blobplot.stats.txt tables from blobtools
and tells you what the organism being sequenced is, using
dirty heuristics.
"""
argparser = ArgumentParser( description=description,
formatter_class = ArgumentDefaultsHelpFormatter )
argparser.add_argument("stats", nargs='+',
help="The blobplot.stats.txt files to read.")
argparser.add_argument("-c", "--cutoff", type=float, default=10.0,
help="Minimal percentage to consider.")
argparser.add_argument("-d", "--dominance", type=float, default=20.0,
help="Minimal percentage difference for secondary hit to be ignored.")
argparser.add_argument("-v", "--verbose", "--debug", action="store_true",
help="Print more verbose debugging messages.")
return argparser.parse_args(*args)
if __name__ == "__main__":
main(parse_args())