-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathUPhO_wt.py
executable file
·303 lines (286 loc) · 13.1 KB
/
UPhO_wt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#! /usr/bin/env python
import re
import os
from sys import argv
from math import fsum
import argparse
parser = argparse.ArgumentParser(description='Script for finding orthologs from gene family trees using Unrooted Phylogenetic Orthology criterion. Input trees are provided as a newick file(s) with one or more trees.')
parser.add_argument('-in', dest = 'Trees', type = str, default= None, nargs= '+', help = 'Input file(s) to evaluate, with tree(s) in newick format.')
parser.add_argument('-iP', dest= 'inParalogs', action ='store_true', default= False, help ='Include inparalogs will in the orthogroups, default = False.')
parser.add_argument('-m', dest= 'minTaxa', type = int, default= '4', help ='Specify the minimum number of OTUs in an orthogroup.')
parser.add_argument('-ouT', dest='out_trees', action = 'store_true', default =False, help ='Write orthobranches to newick file.')
parser.add_argument('-R', dest= 'Reference', type = str, default= None, help ='Points to a fasta file to be used as the source of sequences to write individual multiple sequence fasta file for each of the orthogroups found. Requires Get_fasta_from_Ref.py and its dependencies.')
parser.add_argument('-S', dest= 'Support', type = float, default = 0.0, help='Minimum support value for the orthology evaluation.')
parser.add_argument('-d', dest = 'delimiter', type = str, default = '|', help = 'Specify custom field delimiter character separating species name from other sequence identifiers. Species name should be the first element for proper parsing. Default is: "|".')
parser.add_argument('-wP', dest= 'tolerance', type = int, default= 0, help ='Specify a maximum number (int) of paralogues to be tolertated, so that an almost ortholog group does not get discarded due to few potentially spurious sequence (feature suggested by Kevin M. Kocot).')
args = parser.parse_args()
#print args
#GLOBAL VARIABLES. MODIFY IF NEEDED
sep=args.delimiter
gsep=re.escape(sep)
#CLASS DEFINITIONS
class split():
def __init__(self):
self.vecs=None
self.branch_length=None
self.support=None
self.name=None
class myPhylo():
'''A class for newick trees'''
def __init__(self, N):
self.leaves = get_leaves(N)
self.splits = []
self.ortho=[]
self.paralogs=[]
self.costs={} # Dictionary of leaf cost for inparalog evaluation'
self.newick = N.strip('\n')
for leaf in self.leaves: #Cost initialized
self.costs[leaf] = 1.0
split_decomposition(self)
#FUNCTION DEFINITIONS
def get_leaves(String):
"""Find leaves names in newick files using regexp. Leaves names are composed of alpha numeric characters, underscore and a special field delimiter"""
pattern = "[^\(\),;:\[\]]+%s[^\(\),;:\[\]]+" % gsep
Leaves = re.findall(pattern, String)
return Leaves
def spp_in_list(alist):
'''Return a list species from a list of sequence identifiers'''
spp =[]
for i in alist:
spp.append(i.split(sep)[0])
return spp
def complement(Sub, Whole):
"""Return elements in Whole that are not present in Sub"""
complement=set(Whole) - set(Sub)
return list(complement)
def split_decomposition(Tree):
'''Add a list of splits class objects to myPhylo Class object'''
#Part I. Where we identify matching parenthesis in the newick.
newick = Tree.newick
leaves = Tree.leaves
P = {} #Empty dictionary where to store parenthesis identifiers and a list of string index.
ids = 0
idc =0
Pos =0
closed = []
for l in newick:
if l == '(':
ids +=1
P[ids]=[Pos]
elif l ==')':
idc = ids
while idc in closed and idc != 0:
idc = idc-1
P[idc].append(Pos)
closed.append(idc)
Pos+=1
#Part II: Where we use string operations to identify components parts of each split.
Inspected= []
missBval = 0
for Key in P.iterkeys(): # this extracs splits from the parenthethical notation using the parenthesis mapping dictionary.
r_vec=newick[P[Key][0]: P[Key][1]]
vec = sorted(get_leaves(r_vec))
covec = sorted(complement(vec, Tree.leaves)) #Complementary splits are inferred as the set of leaves not included in the parenthesis grouping.
if vec not in Inspected and covec not in Inspected:
mySplits = split()
mySplits.vecs = [vec, covec]
exp = re.escape(r_vec) + r'\)([0-9\.]*:[0-9\.]+)'
BranchVal=re.findall(exp, Tree.newick)
try:
mySplits.branch_length = BranchVal[0].split(':')[1]
mySplits.support = BranchVal[0].split(':')[0]
except:
missBval+=1
Tree.splits.append(mySplits)
Inspected.append(vec)
Inspected.append(covec)
for leaf in leaves: #Splits leading to each terminal are included.
vec=[leaf]
covec = sorted(complement(vec,leaves))
if leaf not in Inspected:
Inspected.append(leaf)
mySplits =split()
mySplits.vecs = [vec, covec]
exp = re.escape(leaf) + r'\:([0-9\.]+)'
BranchVal = re.findall(exp, Tree.newick)
try:
mySplits.branch_length = BranchVal[0]
except:
missBval+=1
Tree.splits.append(mySplits)
print '%d splits in the tree missed branch values.' %missBval
def LargestBox(LoL):
'''Takes a list of lists (lol) and returns a lol where no list is a subset of the others, retaining only the largest'''
NR =[]
for L in LoL:
score=0
for J in LoL:
if set(L).issubset(J):
score +=1
if score < 2:
NR.append(L)
return NR
def orthologs(Phylo, minTaxa, bsupport):
"""This function returns populates the list of orthologs in the PhyloClass object"""
OrthoBranch=[]
Tolerated=[]
#if in paralogs are to be included, update cost parameter per terminals.
if args.inParalogs:
for S in Phylo.splits:
if S.support in [None, ''] or float(S.support) >= bsupport:
for i_split in S.vecs:
Otus = spp_in_list(i_split)
if len(set(Otus)) == 1 and len(Otus) > 1: # find splits representing in-paralogs and update costs
for leaf in i_split:
ICost = 1.0/len(Otus)
if ICost < Phylo.costs[leaf]:
Phylo.costs[leaf] = ICost #Reduce cost value of inparlogue copies in poportion to the number of inparalogs inplied by this split.
for S in Phylo.splits:
if S.support in [None, ''] or float(S.support) >= bsupport:
for i_split in S.vecs:
Otus = spp_in_list(i_split)
cCount = fsum(Phylo.costs[i] for i in i_split)
if len(set(Otus)) == cCount and cCount >= minTaxa:
if i_split not in OrthoBranch:
OrthoBranch.append(i_split)
elif len(set(Otus)) >= (cCount -args.tolerance) and cCount >= minTaxa:
if i_split not in Tolerated:
Tolerated.append(i_split)
OrthoBranch = LargestBox(OrthoBranch)
Tolerated = LargestBox(Tolerated)
Phylo.ortho=OrthoBranch
Phylo.paralogs=Tolerated
def aggregate_splits(small,large):
"""Takes two newick like splits where small is a subset of large and returns partial newick incluiding the two input groupings"""
aggregate=large
contents = get_leaves(small)
placeholder= contents.pop()
for i in contents: #remove from aggregate all leaves in small except the placeholder
aggregate=aggregate.replace(i + ',' , "")
aggregate = aggregate.replace(placeholder, small)
return aggregate
def subNewick(alist, myPhylo):
"""This function takes a list of leaves forming a branch and a source tree, returning the newick subtree"""
relevant = []
seed =''
for split in myPhylo.splits:
for vec in split.vecs:
if set(vec).issubset(set(alist)) and len(vec) > 0:
if len (vec) == len(alist):
seed = "(%s)%s:%s;" %(','.join(vec), str(split.support), str(split.branch_length))
elif len (vec) == 1:
rep = '%s:%s' %(vec[0], str(split.branch_length))
relevant.append(rep)
else:
rep = "(%s)%s:%s" %(','.join(vec), str(split.support), str(split.branch_length))
relevant.append(rep)
partial = seed
relevant = sorted(relevant, key=len, reverse=True) # order is important
for e in relevant:
partial = aggregate_splits(e, partial)
partial = re.sub('None:', ':', partial)
partial = re.sub(':None', ':1', partial)
return partial
def main_wTrees ():
Total = 0
Tolerated=0
for tree in args.Trees:
name=tree.split('.')[0]
count = 0
countp = 0
with open(tree, 'r') as T:
for line in T:
P = myPhylo(line)
orthologs(P, args.minTaxa, args.Support)
parNum=0
ortNum=0
for group in P.ortho:
FName= '#%s_%d,' %(name,ortNum)
OrtBranch=open('%s_%s.tre' %(name,ortNum), 'w')
G = ','.join(group).strip(',')
OrtList.write(FName + G + '\n')
if set(group) == set(P.leaves): # if the whole input tree represents an orthobranch
branch = P.newick
else:
branch = subNewick(group, P)
OrtBranch.write(branch + '\n')
print "subtree written to: %s_%s.tre" %(name,ortNum)
count += 1
Total += 1
ortNum += 1
OrtBranch.close()
for group in P.paralogs:
FName= '#%s_p%d,' %(name,parNum)
ParBranch=open('%s_p%s.tre' %(name,parNum), 'w')
G = ','.join(group).strip(',')
ParList.write(FName + G + '\n')
if set(group) == set(P.leaves): # if the whole input tree represents an orthobranch with n paralogs
branch = P.newick
else:
branch = subNewick(group, P)
ParBranch.write(branch + '\n')
print "subtree written to: %s_p%s.tre" %(name,parNum)
countp += 1
Tolerated += 1
parNum += 1
ParBranch.close()
print "%d orthogroups were found in the tree file %s" % (count, tree)
print "%d paralogues were tolerated in the tree file %s" % (countp, tree)
print 'Total orthogroups found: %d' % Total
print 'Total orthogroups with paralgy tolerated: %d' % Tolerated
def main():
Total = 0
Tolerated =0
for tree in args.Trees:
name=tree.split('.')[0]
count = 0
countp = 0
with open(tree, 'r') as T:
for line in T:
P = myPhylo(line)
orthologs(P, args.minTaxa, args.Support)
ortNum=0
parNum=0
for group in P.ortho:
FName= '#%s_%d,' %(name,ortNum)
G = ','.join(group).strip(',')
OrtList.write(FName + G + '\n')
count += 1
Total += 1
ortNum += 1
for group in P.paralogs:
FName= '#%s_p%d,' %(name,parNum)
G = ','.join(group).strip(',')
ParList.write(FName + G + '\n')
countp += 1
Tolerated += 1
parNum += 1
print "%d orthogroups were found in the tree file %s" % (count, tree)
print "%d orthogroups with paralogy were tolerated in the tree file %s" % (countp, tree)
print 'Total orthogroups found: %d' % Total
print 'Total orthologs with paralogy tolerated %d' % Tolerated
#MAIN
if __name__ == "__main__":
print "Begining orthology assesment: Support threshold = %1.2f; inparalogs = %s" % (args.Support, args.inParalogs)
OrtList = open('UPhO_orthogroups.csv', 'w')
if args.tolerance > 0:
print "Orthogroups with including a max. of %d paralogs will be writted to UPhO_wp_orthogroups.csv"
ParList = open('UPhO_wp_orthogroups.csv', 'w')
if not args.out_trees:
main()
else:
main_wTrees()
OrtList.close()
try:
ParList.close()
except:
None
if args.Reference != None:
from Get_fasta_from_Ref import Retrieve_Fasta
print "Proceeding to create a fasta file for each ortholog"
Retrieve_Fasta('UPhO_orthogroups.csv','UPhO_Seqs','upho', args.Reference)
try:
Retrieve_Fasta('UPhO_wp_orthogroups.csv','UPhO_Seqs','uphowp', args.Reference)
except:
print "File with inapralogs not found."
print "Fasta files of each orthogroup written to UPhO_Seqs. Nothing else to do. Bye bye!"