-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfungal_guilds.py
185 lines (145 loc) · 5.53 KB
/
fungal_guilds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# This is an example command to run this script:
# python fungal_guilds.py -otu Bottleneck_otu_wTax.txt
#%% Command line parameters
import argparse
import os
import timeit
# Build the command-line interface. Both options are optional and fall back
# to default file names.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-otu",
    help="Path and file name of the OTU table.",
    default="otu_table.txt")  # file name of the OTU table
parser.add_argument(
    "-o",
    help="Path and file name of the output file."
         " Output file will be a new OTU table contains matched"
         " record and sorted by the number of sequences",
    default="otu_table_functions.txt")  # file name of the output file
args = parser.parse_args()

# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print("")
print("Reading in the OTU table: {}".format(args.otu))
print("")

# Reference point for the total runtime reported at the end of the script.
start = timeit.default_timer()
#%% Data import
# Input and output locations come straight from the command line.
otu_file = args.otu      # input file
output_file = args.o     # output file

# Open the OTU table and read in the header (first line only).
with open(otu_file) as otu:
    # next(f) works on both Python 2 and Python 3 file objects (f.next() is
    # Python 2 only). Stripping '\r' as well keeps a CRLF-terminated file
    # from leaving a stray carriage return on the last column name.
    header = next(otu).rstrip('\r\n').split('\t')

# Get the position of the taxonomy column; raises ValueError when the table
# has no column named exactly 'taxonomy'.
index_tax = header.index('taxonomy')

# Names of the five fields appended to every matched record.
header.extend(["function", "type", "likelihood", "notes", "level"])
#%% Search in function database
# Read the OTU table into memory, and separate taxonomy levels with '@'.
replace_list = ['_', ' ', ';', ',']  # characters treated as taxon separators
otu_tab = []
with open(otu_file) as otu:
    next(otu, None)  # skip the header line
    for record in otu:
        # Strip the line ending from the whole record (not only from the
        # taxonomy field) so the last column never carries a newline, even
        # when taxonomy is not the last column.
        record = record.rstrip('\r\n')
        if not record:
            continue  # a blank line has no taxonomy column to index
        otu_current = record.split('\t')
        otu_taxonomy = otu_current[index_tax]
        for symbol in replace_list:
            otu_taxonomy = otu_taxonomy.replace(symbol, '@')
        # The trailing '@' lets the final taxon in the string match a search
        # term of the form '@name@'.
        otu_current[index_tax] = otu_taxonomy + '@'
        otu_tab.append(otu_current)
#%%
# Import Function Database from GitHub
import urllib
try:
    # Python 3 keeps urlretrieve in the urllib.request submodule.
    from urllib.request import urlretrieve
except ImportError:
    # Python 2 exposes it directly on the urllib module.
    urlretrieve = urllib.urlretrieve

function_file = 'temp_db.txt'  # local temp copy, deleted after the search
url = "https://raw.githubusercontent.com/UMNFuN/fungal_function/master/fungal_guild_table.txt"
urlretrieve(url, function_file)  # Save the online database to a local temp file.
# Set the parameters for progress report
with open(function_file) as f1:
    i = sum(1 for _ in f1)  # number of lines in the database file

# The search loop skips the header line, so only the remaining lines count as
# records. Including the header would put the last way point one past the
# final record and 100% would never be reported.
total_length = float(max(i - 1, 0))  # number of records in the database
p = range(1, 11)
# Record numbers at which 10%, 20%, ..., 100% progress is reported.
way_points = [int(total_length * (x / 10.0)) for x in p]
#%%
# Start searching the database
count = 0  # count of matching records in the OTU table
percent = 0  # number of database records processed so far
otu_redundant = []  # one row per (OTU, database record) match
otu_new = []
# Database columns copied onto a matched OTU row, in this order, so that they
# line up with the five column names appended to the header.
field_order = [2, 3, 4, 5, 1]
min_fields = max(field_order) + 1

print("Searching the function database...")
# The with-block closes the temp database file even if the search fails.
with open(function_file, 'r') as f_database:
    next(f_database, None)  # Skip the header
    for record in f_database:
        # report the progress
        percent += 1
        if percent in way_points:
            progress = int(round(percent / total_length * 100.0))
            print('{}%'.format(progress))

        # Strip the line ending first: otherwise the last field keeps its
        # newline and, when copied into an OTU row, breaks the output line.
        function_tax = record.rstrip('\r\n').split('\t')
        # Skip blank or truncated records. An empty taxon name would yield
        # the search term '@@', and a short record cannot supply all fields.
        if len(function_tax) < min_fields or not function_tax[0]:
            continue

        # Compare database with the OTU table
        search_term = '@' + function_tax[0].replace(' ', '@') + '@'
        matched_fields = [function_tax[k] for k in field_order]
        for otu in otu_tab:
            # Keyword found in this OTU's taxonomy string?
            if search_term in otu[index_tax]:
                count += 1  # Count the matching record
                # Copy the row and attach the functional information.
                otu_new = otu + matched_fields
                otu_redundant.append(otu_new)
# Finish searching, delete the temp function database file
if os.path.isfile(function_file):
    os.remove(function_file)

# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print("")
print("Found {} matching taxonomy records from the OTU table.".format(count))
print("Dereplicating and sorting the result...")
#%% Dereplicate and write to output file
from operator import itemgetter


def _level_rank(record):
    """Return the taxonomic level of a matched record as a sortable number.

    The level is the last field appended to a matched record. A level that
    cannot be parsed as a number ranks below every numeric level.
    NOTE(review): assumes the database encodes levels numerically, with a
    larger number meaning a more specific rank -- confirm against the database.
    """
    try:
        return float(record[-1])
    except ValueError:
        return float('-inf')


# Sort by OTU ID and, within one OTU, by level from largest to smallest.
# The key must be the level field: the taxonomy column is identical for every
# match of the same OTU, so sorting on it cannot rank the matches.
otu_sort = sorted(otu_redundant, key=_level_rank, reverse=True)
# list.sort is stable, so the level order is preserved within each OTU ID.
otu_sort.sort(key=itemgetter(0))

# Dereplicate the OTU table: the first record of every OTU ID (the one with
# the highest taxonomic level) is kept.
otu_id_list = []
unique_list = []
seen_ids = set()  # set membership keeps this pass linear in the row count
for item in otu_sort:
    if item[0] not in seen_ids:
        seen_ids.add(item[0])
        otu_id_list.append(item[0])
        unique_list.append(item)
count = len(unique_list)  # number of OTUs that received a function
#Copy the original taxonomy string (without @) to the unique OTU table
otu_tax = []
with open(otu_file) as f_otu:
    next(f_otu, None)  # skip the header line
    for otu in f_otu:
        # Strip '\r' too, so a CRLF file does not put a carriage return in
        # the middle of an output row.
        otu_tax.append(otu.rstrip('\r\n').split('\t'))

# Index the original rows by OTU ID so each lookup is a single dict access
# instead of a scan over the whole table. When an ID occurs more than once,
# the last row wins.
original_rows = {rec[0]: rec for rec in otu_tax}
for new_rec in unique_list:
    rec = original_rows.get(new_rec[0])
    if rec is not None:
        new_rec[index_tax] = rec[index_tax]
#Sort the new otu table by the total sequence number of each OTU.
def _total_sequences(record):
    """Return the sum of the count columns between the OTU ID and taxonomy.

    Counts are parsed with float() so that tables storing them as '12.0'
    are accepted as well as plain integers.
    """
    return sum(float(value) for value in record[1:index_tax])


unique_list.sort(key=_total_sequences, reverse=True)

#Write the file
# Mode 'w' truncates an existing output file, so no separate delete is
# needed, and the with-block closes the file even if a write fails.
with open(output_file, 'w') as output:
    #Header
    output.write('%s\n' % ('\t'.join(header)))
    #Unique OTU table
    for item in unique_list:
        output.write('%s\n' % '\t'.join(item))

print("{} OTUs have been assigned functions and wrote to {}.".format(count, args.o))
#%%
# Finish the program: report the time elapsed since `start` was taken.
stop = timeit.default_timer()
runtime = round(stop - start, 2)  # seconds, two decimal places
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print("Total calculating time: {} seconds.".format(runtime))