-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfasta-stats.py
105 lines (98 loc) · 5.05 KB
/
fasta-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
fasta-stats.py: Utility script to count the number of nucleotides/amino acids in FASTA files.
Version 1.0
Downloaded from https://techoverflow.net/blog/2013/10/24/a-simple-tool-for-fasta-statistics/
"""
from __future__ import with_statement
import sys
import argparse
import gzip
#Counter is used for per-character statistics
from collections import Counter
__author__ = "Uli Koehler & Anton Smirnov"
__copyright__ = "Copyright 2013 Uli Koehler"
__license__ = "Apache v2.0"
def printSequenceStats(fileName, sequenceName, charOccurrenceMap, totalCharCount):
    """
    Print details about a sequence to stdout.
    Called from inside parseFile().

    Keyword arguments:
        fileName: The name of the FASTA file the sequence was read from
                  (only used in the printed header line)
        sequenceName: The name of the sequence to print
        charOccurrenceMap: A dict-like object that contains a char -->
                           number of occurrences mapping
        totalCharCount: The overall character count of the sequence
    """
    #A single-argument print(...) produces the same output whether it is
    #parsed as a Python 2 print statement or as a Python 3 function call
    print("Sequence '%s' from FASTA file '%s' contains %d sequence characters:" % (sequenceName, fileName, totalCharCount))
    #Iterating a mapping yields its keys on Python 2 and 3 alike
    #(iterkeys() is not available on Python 3)
    sortedChars = sorted(charOccurrenceMap)
    for char in sortedChars:
        charCount = charOccurrenceMap[char]
        relativeFrequency = charCount * 100.0 / totalCharCount
        print("\t%s : %d = %f%%" % (char, charCount, relativeFrequency))
    #For nucleotide sequences (ATGC only), also print A+T vs G+C count
    if sortedChars == ["A", "C", "G", "T"]:
        #Print A+T count
        atCount = charOccurrenceMap["A"] + charOccurrenceMap["T"]
        atRelFrequency = atCount * 100.0 / totalCharCount
        print("\tA+T : %d = %f%%" % (atCount, atRelFrequency))
        #Print G+C count
        gcCount = charOccurrenceMap["G"] + charOccurrenceMap["C"]
        gcRelFrequency = gcCount * 100.0 / totalCharCount
        print("\tG+C : %d = %f%%" % (gcCount, gcRelFrequency))
def parseFile(filename, caseSensitive=False, charWhitelist=None):
    """
    Parse a FASTA file and call printSequenceStats() once for every
    sequence record found in it.

    Lines starting with ">" or ";" are header/comment lines. A run of
    consecutive header lines is joined (separated by single spaces) into
    one sequence name. All other lines are sequence data.

    Keyword arguments:
        filename: Path of the FASTA file; a name ending in ".gz" is read
                  through gzip
        caseSensitive: If False (default), characters are uppercased
                       before they are counted
        charWhitelist: If not None, only characters contained in it are
                       counted; everything else is ignored
    """
    #Set to the header line, with ">" removed from the beginning
    sequenceName = None
    #Key: character, value: Number of occurrences
    charOccurrenceMap = Counter()
    #The number of characters in the current sequence, not including \n
    charCount = 0
    #Keep track of consecutive comments, because they are appended
    previousLineWasComment = False
    #Open and iterate the file, auto-detect gzip by file extension
    openFunc = gzip.open if filename.endswith(".gz") else open
    with openFunc(filename, "r") as infile:
        for line in infile:
            #On Python 3, gzip.open(..., "r") yields bytes; decode them so the
            #string comparisons below work. Python 2 lines are already str.
            if not isinstance(line, str):
                line = line.decode("utf-8", "replace")
            line = line.strip()
            #Be super-compatible with the original specification
            if line.startswith((">", ";")):
                if previousLineWasComment:
                    #Continuation of a multi-line header: append with one space
                    #to normalize whitespace. No sequence data has been read
                    #for this record yet, so there is nothing to report.
                    sequenceName += " " + line[1:].strip()
                else:
                    #A new record starts: report the previous one, if any
                    if sequenceName is not None:
                        printSequenceStats(filename, sequenceName, charOccurrenceMap, charCount)
                    #Always start the record from empty counters, so stray
                    #data before the first header is not attributed to it
                    charOccurrenceMap = Counter()
                    charCount = 0
                    #Take the entire header line as sequence ID (marker stripped)
                    sequenceName = line[1:].strip()
                previousLineWasComment = True
            else: #Line belongs to the sequence
                previousLineWasComment = False
                #Skip any character not in the whitelist, if whitelist (--only)
                #is enabled. The whitelist is matched against the raw character,
                #i.e. before any uppercasing.
                if charWhitelist is None:
                    countedChars = line
                else:
                    countedChars = [char for char in line if char in charWhitelist]
                #Line has been stripped before, so we can count directly;
                #we can only count after the whitelist filter
                charCount += len(countedChars)
                #In case-insensitive mode (default) count uppercased chars only
                if caseSensitive:
                    charOccurrenceMap.update(countedChars)
                else:
                    charOccurrenceMap.update(char.upper() for char in countedChars)
    #The last line has been read, print the last record, if any
    if sequenceName is not None:
        printSequenceStats(filename, sequenceName, charOccurrenceMap, charCount)
if __name__ == "__main__":
    #Command line interface: one or more FASTA paths plus optional switches
    cliParser = argparse.ArgumentParser(
        description='Compute simple statistics for FASTA files.')
    cliParser.add_argument(
        'infiles',
        nargs='+',
        help='FASTA files (.fa, .fa.gz) to generate statistics for')
    cliParser.add_argument(
        '--case-sensitive',
        action='store_true',
        help='Count characters in a case-sensitive way. Disabled per default.')
    cliParser.add_argument(
        '-o', '--only',
        help='If this option is supplied (e.g. set to \'ATGC\'), characters not in the set will be ignored for all statistics')
    options = cliParser.parse_args()
    #Print statistics for every file given, in command line order
    for fastaPath in options.infiles:
        parseFile(fastaPath, options.case_sensitive, options.only)