-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathConsensus.py
executable file
·157 lines (140 loc) · 5.72 KB
/
Consensus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
import argparse
import re
from sys import argv
# Globals
# Symbols accepted in a nucleotide sequence: bases, one-letter ambiguity
# codes, and the '-' / '?' placeholders that make_Consensus skips.
NT = tuple('ACGTURYKMSWBDHVN-?')
# Symbols accepted in an amino-acid sequence, plus '-', '*' and '?'.
AA = tuple('ABCDEFGHIKLMNPQRSTUVWYZX-*?')
# Ambiguity dictionary: one-letter nucleotide code -> list of the plain bases
# (A, C, G, T) that the code stands for.
# NOTE: there is no entry for 'U', '-' or '?'.
Ambigs = {
    'A': list('A'),
    'G': list('G'),
    'C': list('C'),
    'T': list('T'),
    'M': list('AC'),
    'R': list('AG'),
    'W': list('AT'),
    'S': list('CG'),
    'Y': list('CT'),
    'K': list('GT'),
    'V': list('ACG'),
    'H': list('ACT'),
    'D': list('AGT'),
    'B': list('CGT'),
    'N': list('GATC'),
}
###############
def string_type(string):
    """Classify a sequence by the alphabet that contains all of its symbols.

    Returns 'NT' when every character of `string` is in the module-level NT
    tuple, otherwise 'AA' when every character is in AA, otherwise the literal
    text 'ERROR: NOT AA or NT'. NT is tested first, so a string that fits both
    alphabets (including the empty string) is reported as 'NT'.
    """
    symbols = set(string)
    if symbols.issubset(NT):
        return 'NT'
    if symbols.issubset(AA):
        return 'AA'
    return 'ERROR: NOT AA or NT'
def Is_NT_or_AA(Fasta_Dict):
    """Return 'NT' if all sequences are nucleotide, 'AA' if they are amino acid.

    Fasta_Dict maps sequence ids to sequence strings; each sequence is
    classified with string_type().

    * 'NT' is returned when every sequence is classified 'NT' (this includes
      an empty dictionary).
    * 'AA' is returned when every sequence is classified 'NT' or 'AA'. Every
      symbol in NT is also present in AA, so string_type() labels a protein
      sequence that happens to use only such letters as 'NT'; an alignment
      mixing both labels is therefore still an amino-acid alignment.
    * Otherwise at least one sequence holds a symbol outside AA. Each such
      symbol is printed, one per line, and None is returned.
    """
    Kinds = [string_type(Seq) for Seq in Fasta_Dict.values()]
    if all(Kind == 'NT' for Kind in Kinds):
        return 'NT'
    if all(Kind in ('NT', 'AA') for Kind in Kinds):
        return 'AA'
    # Report every offending symbol so the user can locate the bad input.
    for Seq in Fasta_Dict.values():
        for Symbol in Seq:
            if Symbol not in AA:
                print(Symbol)
    return None
def return_amb(list_of_nuc):
    """Return the one-letter ambiguity code for a list of nucleotide symbols.

    Each symbol in `list_of_nuc` is expanded through the Ambigs table into the
    plain bases it stands for; the code whose base set equals the union of
    those bases is returned (e.g. ['A', 'G'] -> 'R', ['R', 'Y'] -> 'N').

    'U' is listed in NT as a valid nucleotide but has no Ambigs entry, so it
    is treated as 'T' here instead of raising KeyError on RNA input.

    Returns None when no code matches, which only happens for an empty list.
    Raises KeyError for any other symbol that is missing from Ambigs.
    """
    Bases = set()
    for Symbol in list_of_nuc:
        Bases.update(Ambigs['T' if Symbol == 'U' else Symbol])
    for Code, Members in Ambigs.items():
        if set(Members) == Bases:
            return Code
    return None
def is_ID(Line):
    """Return True if `Line` is a FASTA identifier line.

    An identifier is broadly defined here as any string that starts with the
    '>' symbol; everything else (including the empty string) gives False.
    """
    return Line.startswith('>')
def Fasta_to_Dict(File):
    """Read the FASTA file at path `File` into a dict of id -> sequence.

    A line starting with '>' opens a new record; its id is the rest of that
    line with the leading '>' and the line ending removed. The following
    lines, up to the next '>' line, are concatenated and upper-cased to form
    the sequence. Lines that appear before the first '>' line are discarded.
    If an id occurs more than once, the last occurrence wins.

    Line endings are stripped as '\\r\\n' as well as '\\n', so files with
    Windows line endings do not leave '\\r' inside ids or sequences.
    """
    Chunks = {}      # id -> list of sequence line fragments
    Seqid = None     # None until the first header is seen
    with open(File, 'r') as F:
        for Line in F:
            Line = Line.rstrip('\r\n')
            if Line.startswith('>'):
                Seqid = Line.lstrip('>')
                Chunks[Seqid] = []
            elif Seqid is not None:
                Chunks[Seqid].append(Line)
    # Join once per record instead of re-building the string on every line.
    return {Name: ''.join(Parts).upper() for Name, Parts in Chunks.items()}
def make_Consensus(Dict, T):
    """Build a consensus string from an alignment at threshold `T`.

    Dict maps sequence ids to aligned sequence strings; T is a fraction
    (1.0 means every informative symbol in the column must agree).

    For each column, '-' and '?' symbols are ignored. Then:
      * if nothing is left, '-' is written;
      * if the most frequent symbol makes up at least T of what is left,
        that symbol is written;
      * otherwise a nucleotide alignment gets the ambiguity code returned by
        return_amb(), and any other alignment gets 'X'.

    The number of columns is taken from the first sequence in Dict. An empty
    Dict yields an empty consensus. When several symbols tie for most
    frequent, the alphabetically first one is used so that the result is
    reproducible.
    """
    Seqs = list(Dict.values())
    if not Seqs:
        return ''
    Type = Is_NT_or_AA(Dict)
    Ignore = ('-', '?')
    Sites = []
    for i in range(len(Seqs[0])):
        Compo = [Seq[i] for Seq in Seqs if Seq[i] not in Ignore]
        if not Compo:
            Sites.append('-')
            continue
        # sorted() fixes the candidate order, so max() breaks ties the same
        # way on every run.
        MFB = max(sorted(set(Compo)), key=Compo.count)
        if float(Compo.count(MFB)) / len(Compo) >= T:
            Sites.append(MFB)
        elif Type == 'NT':
            Sites.append(str(return_amb(Compo)))
        else:
            Sites.append('X')
    return ''.join(Sites)
def Good_Blocks(Consensus, M):
    """Mark runs of at least `M` contiguous conserved sites in a consensus.

    The consensus is split into blocks at every '-', 'N' or '?' site. A block
    of length >= M is written in upper case, a shorter block in lower case;
    the separator sites are copied through unchanged. The returned string has
    the same length as `Consensus`.

    The final block is held to the same length rule as every other block, so
    a conserved region that reaches the end of the consensus is upper-cased
    too.
    """
    Breakers = ('-', 'N', '?')

    def Cased(Block):
        # Upper case marks a block that is long enough, lower case one that is not.
        return Block.upper() if len(Block) >= M else Block.lower()

    Pieces = []
    Start = None     # index where the current block began, None outside a block
    for Pos, Site in enumerate(Consensus):
        if Site in Breakers:
            if Start is not None:
                Pieces.append(Cased(Consensus[Start:Pos]))
                Start = None
            Pieces.append(Site)
        elif Start is None:
            Start = Pos
    if Start is not None:
        Pieces.append(Cased(Consensus[Start:]))
    return ''.join(Pieces)
###MAIN###
if __name__ == '__main__':
    # Command-line entry point. For every input alignment, write its consensus
    # to "<stem>_consensus.fasta"; when -B is greater than 0, also collect the
    # case-marked consensus of every input that has conserved blocks into
    # Good_Blocks.fasta.
    parser = argparse.ArgumentParser(description='This is a program to write consensus sequences')
    # -i is required: without it there is nothing to iterate over.
    parser.add_argument('-i', dest='alignments', type=str, nargs='+', required=True, help='Input alignment(s) in FASTA format.')
    parser.add_argument('-t', action='store', dest='percentage', default=1.0, type=float, help='Specify percentage threshold to make consensus, default 1.0')
    parser.add_argument('-B', action='store', dest='blocks', default=0, type=int, help='look for conserved regions in the alignement (blocks) of the minimum size provided')
    parser.add_argument('-d', dest='delimiter', type=str, default='|', help='Specify custom field delimiter character separating species name from other sequence identifiers. Species name should be the first element for proper parsing. Default is: "|".')
    arguments = parser.parse_args()
    T = arguments.percentage
    M = arguments.blocks
    D = arguments.delimiter  # accepted on the command line but not used below

    # Open the blocks report once, so results from all inputs end up in it
    # instead of each input truncating the file again.
    BlocksOut = open('Good_Blocks.fasta', 'w') if M > 0 else None
    try:
        for File in arguments.alignments:
            # Text before the first '.' of the given path; used both for the
            # output file name and as the label in the blocks report.
            Stem = File.split('.')[0]
            F = Fasta_to_Dict(File)
            Con = make_Consensus(F, T)
            with open("%s_consensus.fasta" % Stem, 'w') as out:
                out.write('>%s consensus sequence\n%s\n' % (File, Con))
            if BlocksOut is None:
                continue
            Res = Good_Blocks(Con, M)
            # Good_Blocks upper-cases only blocks of at least M sites, so an
            # upper-case base means at least one conserved block exists.
            if re.search(r'[ACGT]+', Res):
                print('Consensus from orthogroup %s have conserevd regions' % Stem)
                BlocksOut.write('>' + Stem + '\n')
                BlocksOut.write(Res + '\n')
            else:
                print('Consensus from orthogroup %s does not look promissing' % Stem)
    finally:
        if BlocksOut is not None:
            BlocksOut.close()