-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathunique-kmers.py
executable file
·96 lines (67 loc) · 2.99 KB
/
unique-kmers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: [email protected]
#
# pylint: disable=invalid-name,missing-docstring
"""
Estimate number of unique k-mers, with precision <= ERROR_RATE.
% python sandbox/unique-kmers.py [ -k <k size> ] [ -e <ERROR_RATE> ] [ -R <report file> ] <data1> <data2> ...
Use '-h' for parameter help.
"""
import argparse
import os
import sys
import textwrap
import khmer
from khmer.khmer_args import DEFAULT_K, info, ComboFormatter
from khmer import __version__
def get_parser():
    """Build the command-line parser for unique-kmers.py.

    Returns:
        argparse.ArgumentParser: parser accepting ``--version``,
        ``-q/--quiet``, ``-k/--ksize``, ``-e/--error-rate``,
        ``-R/--report`` and one or more input sequence filenames.
    """
    descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE."

    # The epilog is reStructuredText: paragraphs are separated by blank
    # lines and every ``Example::`` is followed by a blank line plus an
    # indented literal block.  textwrap.dedent() below strips the common
    # four-space margin.  The last example is a separate string literal so
    # that the over-long line can carry a ``# noqa`` marker.
    epilog = ("""
    A HyperLogLog counter is used to do cardinality estimation. Since this counter
    is based on a tradeoff between precision and memory consumption,
    :option:`-e`/:option:`--error-rate` can be used to control how much
    memory will be used. In practice the memory footprint is small even
    at low error rates (< 0.01).

    :option:`-k`/:option:`--ksize` should be set to the desired k-mer size.

    Output is sent to STDOUT, but a report file can be generated with
    :option:`-R`/:option:`--report`.

    Example::

        unique-kmers.py -k 17 tests/test-data/test-abund-read{,-2,-3}.fa

    Example::

""" "        unique-kmers.py -R unique_count -k 30 tests/test-data/test-abund-read-paired.fa")  # noqa

    parser = argparse.ArgumentParser(
        description=descr, epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)

    # KHMER_KSIZE, when set, arrives as a string; argparse runs string
    # defaults through ``type`` so it is converted to int like a CLI value.
    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)

    parser.add_argument('--version', action='version',
                        version='khmer {v}'.format(v=__version__))
    # NOTE(review): nothing visible in this file reads ``args.quiet``;
    # confirm whether the flag is still needed.
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('--ksize', '-k', type=int, default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--error-rate', '-e', type=float, default=0.01,
                        help='Acceptable error rate')
    # Mode 'a+' means the report file is opened for appending while the
    # arguments are parsed; the caller is responsible for closing it.
    parser.add_argument('-R', '--report',
                        metavar='filename', type=argparse.FileType('a+'),
                        help='Append the k-mer size and the estimated '
                        'cardinality to this file')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    return parser
def main():
    """Estimate and print the number of unique k-mers over all inputs.

    Parses the command line, feeds every input sequence file into a single
    ``khmer.HLLCounter`` and prints the resulting cardinality estimate to
    STDOUT.  When ``-R``/``--report`` is given, a line holding the k-mer
    size and the estimate is also written to the report file.
    """
    info('unique-kmers.py', ['SeqAn', 'hll'])
    args = get_parser().parse_args()

    hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)

    # argparse.FileType opened the report file (if one was requested) while
    # parsing, so this function owns the handle and has to close it.
    report_fp = args.report
    try:
        # A single counter consumes every file, so the estimate covers the
        # k-mers of all inputs combined.
        for input_filename in args.input_filenames:
            hllcpp.consume_fasta(input_filename)

        cardinality = hllcpp.estimate_cardinality()
        # A single parenthesised argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print('Estimated number of unique k-mers: {0}'.format(cardinality))

        if report_fp:
            report_fp.write('\n%s %s' % (args.ksize, cardinality))
    finally:
        # Close (and thereby flush) the report even if counting failed.
        if report_fp:
            report_fp.close()
# Run the estimator only when executed as a script, not when imported.
if '__main__' == __name__:
    main()