-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinter-map.py
417 lines (371 loc) · 14.3 KB
/
inter-map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#inter-map.py
"""
# -----------------------------------------------------------------------
#
# inter-map.py
#
# by Hannah Catabia
# Last Modified: 2017-05-30
#
# This script creates GEXF files showing a user-input gene set
# mapped onto an interactome network.
#
#
# Inter-Tools: A Command-line Toolkit for Interactome Research
#
# by Hannah Catabia, Caren Smith, and Jose Ordovas
#
#
# -----------------------------------------------------------------------
#
#
# This program will produce a CSV or TSV file containing
# protein-protein interactions curated from user-input
# MITAB files.
#
#
# * Required input:
#
# -g A CSV or TSV file with a list of genes in the first
# column, identified with Entrez IDs or Uniprot KBs
#
# -e Indicates the the genes are identified with Entrez IDs
#
# -u Indicates the the genes are identified with UniProt KBs
#
# ENTREZ ID USAGE EXAMPLE:
# > python inter-map.py -g gene_set.csv -e
#
# UNIPROT KB USAGE EXAMPLE:
# > python inter-map.py -g gene_set.csv -u
#
# * Optional input:
#
# -i A CSV or TSV file containing an interactome dataset with
# the following column headers:
# entrez_A, entrez_B, uniprot_A, uniprot_B, symbol_A, symbol_B,
# interaction_detection_methods, publication_identifiers,
# taxid_A, taxid_B, interaction_type, source_database, from_files
# If none is given, the script will search for 'interactome.csv'
# in your directory.
# USAGE EXAMPLE:
# > python inter-map.py -g gene_set.csv -e -i mouse_interactome.tsv
#
# -s Include self-interactions in the interactome network.
# USAGE EXAMPLE:
# > python inter-map.py -g gene_set.csv -u -s
#
#
# -----------------------------------------------------------------------
"""
import sys
import argparse
import os.path
import csv
import pandas as pd
import networkx as nx
import datetime
from matplotlib import pyplot as plt
import random
import numpy as np
'''
# =============================================================================
S T A R T D E F I N I T I O N S
# =============================================================================
'''
# =============================================================================
# Error checks all user input from the command line
def command_line_error_check(args):
#Check to make sure the gene_file and interactome_files are a CSV or a TSV
file_error= '''
\n\n
ERROR: Both the interactome and the gene set must be either a CSV or TSV file.
%s or %s does not end with the file extension .csv or .tsv.
EXAMPLE:
./inter-map.py -g gene_set.tsv -i interactome.csv -e
\n\n
'''%(args.gene_file, args.interactome_file)
if args.gene_file[-4:].lower() not in ['.csv', '.tsv']:
print file_error
sys.exit(0)
if args.interactome_file[-4:].lower() not in ['.csv', '.tsv']:
print file_error
sys.exit(0)
#Make sure the gene_file and interactome_file are in the same directory
path_error = '''
\n\n
ERROR: Both the interactome and the gene set files must be in
the same directory as this program.
\n\n
'''
if not os.path.isfile(args.gene_file) or not os.path.isfile(args.interactome_file):
print path_error
sys.exit(0)
#Make sure we have an Entrez or UniProt identifier designation
id_error = '''
\n\n
ERROR: You must designate the gene set as a list of either Entrez IDs
or UniProt IDs. If no command is given, the program will assume
you have provided Entrez IDs.
EXAMPLE FOR ENTREZ IDs:
./inter-map.py -g gene_set.tsv -i interactome.csv -e
EXAMPLE FOR UNIPROT KBs:
./inter-map.py -g gene_set.tsv -i interactome.csv -u
\n\n
'''
if args.entrez==args.uniprot:
print id_error
sys.exit(0)
# =============================================================================
# Open the gene set file and read in the genes
def read_gene_set(gene_file):
#Is it a CSV or a TSV?
if gene_file[-4:].lower() == '.tsv':
delim = '\t'
else:
delim = ','
#open the file and put the genes into a list
with open(gene_file, 'rb') as gene_file:
gene_reader = csv.reader(gene_file, delimiter = delim)
genes = [str(row[0]) for row in gene_reader]
#check for a header that is not a number (entrez_id)
for item in ['entrez', 'gene', 'id', 'symbol']:
if item in genes[0].lower():
genes.pop(0)
return genes
# =============================================================================
# Open and read the interactome dataset file
def read_interactome(interactome_file):
#Is it a CSV or a TSV?
if interactome_file[-4:].lower() == '.tsv':
delim = '\t'
else:
delim = ','
#read it using pandas
interactome = pd.read_csv(interactome_file, sep = delim, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12])
#enforce that the interactome file have this specific header
header = ['entrez_A', 'entrez_B', 'uniprot_A', 'uniprot_B', 'symbol_A', 'symbol_B', 'interaction_detections_methods',
'publication_identifiers', 'taxid_A', 'taxid_B', 'interaction_type', 'source_database', 'from_files']
header_error = '''
\n
%s must have the following 13 column headers replicated EXACTLY:
entrez_A, entrez_B, uniprot_A, uniprot_B, symbol_A, symbol_B,
interaction_detection_methods, publication_identifiers,
taxid_A, taxid_B, interaction_type, source_database, from_files
\n
''' %(interactome_file,)
if list(interactome) != header:
print header_error
sys.exit(0)
return interactome
# =============================================================================
# Make an graph in NetworkX interactome dataset
def graph_interactome(interactome_df, entrez, uniprot, self):
G = nx.Graph()
#get rid of missing id entries
if entrez:
interactome_df.dropna(subset=['entrez_A', 'entrez_B'])
else:
interactome_df.dropna(subset=['uniprot_A', 'uniprot_B'])
#make a list
interactome_list = interactome_df.values.tolist()
#list the self-edges
self_edges = []
for row in interactome_list:
if entrez:
node1 = str(row[0])
node2 = str(row[1])
else:
node1 = str(row[2])
node2 = str(row[3])
#adding interactions while dealing with self-edges
if node1==node2 and not self:
self_edges.append(node1)
else:
G.add_node(node1, entrez=row[0], uniprot=row[2], symbol=row[4], taxid=row[8], gene_set=False, weight = 0, interactome_degree = 0, gene_set_degree = 0)
G.add_node(node2, entrez=row[1], uniprot=row[3], symbol=row[5], taxid=row[9], gene_set=False, weight = 0, interactome_degree = 0, gene_set_degree = 0)
G.add_edge(node1, node2, interaction_detection_method=row[6], publication_identifiers=row[7], interaction_type=row[10], source_database=row[11], from_files=row[12])
if node1==node2:
self_edges.append(node1)
#make the interactome into one connected component
components = sorted([list(x) for x in nx.connected_components(G)], key=len)
largest_component = components.pop()
G = nx.subgraph(G, largest_component)
#make the interactome degree an attribute of each gene
for gene in G.nodes():
G.node[gene]['interactome_degree'] = nx.degree(G, gene)
# a list of eliminated genes
eliminated = [item for sublist in components for item in sublist]
return G, self_edges, eliminated
# =============================================================================
# Map the gene set into the interactome network
def map_gene_set(G, genes):
#lists of found and not found genes
found = []
not_found = []
for gene in genes:
if gene in G:
G.node[gene]['gene_set'] = True
found.append(gene)
else:
not_found.append(gene)
return G, found, not_found
# =============================================================================
# Create GEXF and PDF files
def create_visual_files(G, found, entrez, uniprot):
#the gene set subgraph
J = G.subgraph(found)
#make an attribute for the gene set degree
#weight = gene_set_degree/interactome_degree
for gene in found:
G.node[gene]['gene_set_degree'] = nx.degree(J, gene)
G.node[gene]['weight'] = (float(G.node[gene]['gene_set_degree'])/float(G.node[gene]['interactome_degree']))*100
#connected and unconnected subsets of gene set subgraph
connected_nodes = []
unconnected_nodes = []
for node in found:
if J.degree(node) > 0:
connected_nodes.append(node)
else:
unconnected_nodes.append(node)
#create a PDF picture of the connected nodes
H = J.subgraph(connected_nodes)
#GEXF files
print '\nWriting the GEXF files...\n'
nx.write_gexf(J, 'gene_set_network.gexf')
nx.write_gexf(G, 'full_interactome_network.gexf')
#Done!
return H, connected_nodes, unconnected_nodes
# =============================================================================
# Find the largest connected component of the gene set in the interactome network
def get_lcc(G, H):
#get the connected components
connected_components = sorted([list(x) for x in nx.connected_components(H)], key=len)
lcc = connected_components.pop()
lcc_graph = H.subgraph(lcc)
return lcc
# =============================================================================
# Find the mean shortest distance of the gene set in the interactome network
def get_msd(G, H, connected_nodes, unconnected_nodes):
print '\nCalculating the mean shortest distance (MSD)...\n'
#connected nodes have distance of 1
shortest_distances = [1] * len(connected_nodes)
#find all distances between unconnected node pairs:
pair_lengths = dict((node, {}) for node in unconnected_nodes)
for nodeA in unconnected_nodes:
for nodeB in unconnected_nodes:
if nodeA != nodeB:
if nodeB not in pair_lengths[nodeA]:
path_len= nx.shortest_path_length(G, source=nodeA, target=nodeB)
pair_lengths[nodeA][nodeB] = path_len
pair_lengths[nodeB][nodeA] = path_len
#now add in distances for connected nodes
for nodeA in unconnected_nodes:
for nodeB in connected_nodes:
pair_lengths[nodeA][nodeB] = nx.shortest_path_length(G, source=nodeA, target=nodeB)
#now add shortest distances
for node in unconnected_nodes:
shortest_distances.append(min(pair_lengths[node].values()))
#take the mean
msd = float(sum(shortest_distances))/len(shortest_distances)
return msd
'''
# =============================================================================
# PROGRAM INITIALIZES HERE
# =============================================================================
'''
if __name__ == '__main__':
#A few blank lines to begin with
print '\n\n\n'
#-------------------------------------------------------------------------------------------------
# Parsing and error checking the command line
#-------------------------------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Map a gene set to the interactome')
parser.add_argument('-g', '--genes', type=str, dest='gene_file', action='store', default= '', help = 'List of genes by Entrez ID or UniProt KB (single column in a CSV or TSV file).')
parser.add_argument('-i', '--interactome', type=str, dest='interactome_file', action='store', default= 'interactome.csv', help='An interactome dataset created with inter-build.py')
parser.add_argument('-e', '--entrez', dest='entrez', action='store_true', default=False, help='Use if the gene set uses Entrez IDs')
parser.add_argument('-u', '--uniprot', dest='uniprot', action='store_true', default=False, help='Use if the gene set uses UniProt KBs')
parser.add_argument('-s', '--self', dest='self', action='store_true', default=False, help= 'Use if you wish to include self-interactions in the interactome')
args = parser.parse_args()
# Error-checking the command line
command_line_error_check(args)
#upack the variables
gene_file = args.gene_file
interactome_file = args.interactome_file
entrez = args.entrez
uniprot = args.uniprot
self = args.self
#-------------------------------------------------------------------------------------------------
# Create network and add to the results file
#-------------------------------------------------------------------------------------------------
results = '''
MAP-RESULTS.txt
----------------------------------------------------------------------------
%s
Gene set: %s
Interactome dataset: %s
''' % ('{:%Y-%b-%d_%H:%M:%S}'.format(datetime.datetime.now()), gene_file, interactome_file)
#read in genes and interactome interactome
genes = read_gene_set(gene_file)
interactome = read_interactome(interactome_file)
#graph the interactome
G, self_edges, eliminated = graph_interactome(interactome, entrez, uniprot, self)
#update results
results = results + '''
INTERACTOME INFO:
%s contains %i unique genes and %i unique interactions.
''' % (interactome_file, len(G.nodes()), len(G.edges()) )
if self:
results = results + '''
%i of interactions in the dataset are self-interactions. \n
''' % len(self_edges)
else:
results = results + '''
%i self-interactions were eliminated from the dataset. \n
''' % len(self_edges)
results = results + '''
The following %i genes were eliminated from the interactome because they
were not connected to the largest connected component of the interactome:
%s
'''%(len(eliminated), ', '.join(eliminated))
#map the gene set to the itneractome
G, found, not_found = map_gene_set(G, genes)
#update the results file
total = len(found) + len(not_found)
results = results + '''
GENE SET INFO:
%s contains %i unique genes.
%i genes (%f %%) were found in the interactome.
The folowing %i genes were not found in the interactome dataset:
%s
''' % (interactome_file, total, len(found), float(len(found))/float(total)*100, len(not_found), ', '.join(not_found) )
#create the visual files and the connected subgraph
H, connected_nodes, unconnected_nodes = create_visual_files(G, found, entrez, uniprot)
#update results
results = results + '''
Of the %i genes found, %i (%f %%) were connected to at least one other gene in the set.
The following %i (%f %%) genes were found in the interactome, but were not directly connected another gene from the gene set:
%s
''' % (len(found), len(connected_nodes), float(len(connected_nodes))/float(total)*100,
len(unconnected_nodes), float(len(unconnected_nodes))/float(total)*100, ', '.join(unconnected_nodes) )
#find the largest connected component of the gene set
lcc = get_lcc(G, H)
#update results
results = results + '''
LARGEST CONNECTED COMPONENT:
The LCC of the gene set contains %i genes:
%s
''' % (len(lcc), ', '.join(lcc))
#find the mean shortest distance of the gene set
msd = get_msd(G, H, connected_nodes, unconnected_nodes)
#update results
results = results + '''
MEAN SHORTEST DISTANCE (MSD):
The MSD of the gene set is %f.
''' % (msd,)
#Write the results file
with open('MAP-RESULTS.txt', 'w') as results_file:
results_file.write(results)
print results
#Finished
print '\n\nDone!\n\n'