-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEpimeter.py
140 lines (124 loc) · 5.86 KB
/
Epimeter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""
epimeter
@author: savsr
epimeter is a tool that determines how similar a given neo-epitope is to the
human proteome by calculating the distance between the peptide sequences.
epimeter has two functions: "index" and "query." The "index" function takes
a FASTA file of proteins as an input and indexes all of the protein k-mers
(user specifies k) into an Annoy Index. The "query" function then allows the
user to search for a specific neo-epitope candidate within the Annoy Index,
allowing for a much more streamlined and efficient search process than
current bioinformatics search tools such as BLAST.
"""
import PeptideIndex
import argparse
from collections import defaultdict
def build_index(protein_fasta, kmer_bounds, index_name):
    """Index every k-mer of every protein in a FASTA file and save the index.

    The FASTA file is read line by line. Lines starting with '>' are
    treated as record headers; every other line is stripped and appended
    to the protein currently being assembled. Each time a protein is
    complete, all of its k-mers for every k in [min_k, max_k] are added to
    a PeptideIndex. Items are numbered independently per k, starting at 1.

    protein_fasta: path to file of protein seqs
    kmer_bounds: two-element sequence (min_k, max_k), inclusive; each
        element is converted with int(), so strings such as "8" work
    index_name: name of dir where Peptide Index will be saved (passed
        unchanged to PeptideIndex.save)

    Return Value: the populated PeptideIndex

    NOTE(review): per the original author's note, PeptideIndex keeps a
    separate Annoy index per k-mer length; that is not visible from this
    function and should be confirmed against PeptideIndex itself.
    """
    print(kmer_bounds)
    min_k = int(kmer_bounds[0])
    max_k = int(kmer_bounds[1])
    peptide_index = PeptideIndex.PeptideIndex()
    # Running item number for each k-mer length.
    item_numbers = defaultdict(int)

    def add_kmers(protein):
        """Add every k-mer of *protein* (a str) to the peptide index."""
        for k in range(min_k, max_k + 1):
            # The range is empty when the protein is shorter than k.
            for start in range(len(protein) - k + 1):
                item_numbers[k] += 1
                peptide_index.add_item(item_numbers[k],
                                       protein[start:start + k])

    fragments = []
    # A distinct name for the handle keeps the path parameter intact.
    with open(protein_fasta) as fasta:
        # Skip the first header in the file. The default prevents a
        # StopIteration escaping when the file is empty.
        next(fasta, None)
        for line in fasta:
            if line.startswith('>'):
                # New protein! Index the one assembled so far, then reset.
                add_kmers(''.join(fragments))
                fragments = []
            else:
                # Continue assembling the current protein.
                fragments.append(line.strip())
    # The last protein in the file has no header after it to trigger
    # indexing, so handle it here.
    if fragments:
        add_kmers(''.join(fragments))
    peptide_index.save(index_name)
    return peptide_index
def query_epitope(epitopes, index_name, *,
                  output_path="nearest_neighbors.csv", num_neighbors=8):
    """Query the nearest neighbors of each epitope and write them as CSV.

    Each line of the epitope file is stripped and passed to
    PeptideIndex.get_nns_by_epitope. One output line is written per input
    line, in the form
    ``neighbor1,distance1,neighbor2,distance2,...``.

    epitopes: path to file of epitopes; assumes one epitope per line
    index_name: path to directory where PeptideIndex will be loaded from
    output_path: path of the CSV file to write (keyword-only; defaults to
        "nearest_neighbors.csv" in the current working directory)
    num_neighbors: number of neighbors requested per epitope
        (keyword-only; defaults to 8)

    Return Value: none
    """
    # Load the index before touching the output file, so a failed load
    # does not leave behind a truncated CSV.
    peptide_index = PeptideIndex.PeptideIndex()
    peptide_index.load(index_name)
    # Both files are context-managed so they are closed even if a query
    # raises part-way through.
    with open(epitopes) as epitope_file, open(output_path, "w") as out:
        for line in epitope_file:
            epitope = line.strip()
            # Returns a tuple with the neighbors and the distances.
            results = peptide_index.get_nns_by_epitope(
                epitope,
                num_neighbors=num_neighbors,
                search_k=-1,
                include_distances=True,
            )
            neighbors = results[0]
            distances = results[1]
            # Interleave each neighbor with its distance, comma-separated.
            row = ",".join(
                str(neighbor) + "," + str(distance)
                for neighbor, distance in zip(neighbors, distances)
            )
            out.write(row + "\n")
def main():
    """Parse the command line and run the requested subcommand.

    Subcommands:
      index  -- build and save a peptide index from a protein FASTA file
      query  -- look up nearest neighbors for a file of epitopes

    Exits with a usage error (status 2) when no subcommand is given,
    rather than silently doing nothing.
    """
    # Create the top-level parser.
    parser = argparse.ArgumentParser(
        description="Epimeter can index an input protein file "
                    "or query an epitope"
    )
    subparsers = parser.add_subparsers(dest='subparser_name')

    # Create the parser for the "index" subcommand.
    index_parser = subparsers.add_parser('index')
    index_parser.add_argument(
        '-p', '--protein-fasta', type=str, required=True,
        help="path to protein fasta",
    )
    # Bounds stay as strings here; build_index converts them with int().
    index_parser.add_argument(
        '-k', '--kmer-bounds', nargs=2,
        help="bounds on kmer sizes to index, e.g., "
             "\"8 11\"",
        default=["8", "11"],
    )
    index_parser.add_argument(
        '-i', '--index-name', required=True,
        help="path to output index",
    )

    # Create the parser for the "query" subcommand.
    query_parser = subparsers.add_parser('query')
    # For "query" the index is an input, so the help text says so.
    query_parser.add_argument(
        '-i', '--index-name', required=True,
        help="path to index to load",
    )
    query_parser.add_argument(
        '-e', '--epitopes', type=str, required=True,
        help="path to list of epitopes",
    )

    # Arguments the user typed on the command line.
    args = parser.parse_args()
    if args.subparser_name == 'index':
        build_index(args.protein_fasta,
                    args.kmer_bounds,
                    args.index_name)
    elif args.subparser_name == 'query':
        query_epitope(args.epitopes,
                      args.index_name)
    else:
        # Subparsers are optional by default in Python 3, so a bare
        # invocation reaches this point with subparser_name set to None.
        parser.error("a subcommand is required: choose 'index' or 'query'")
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == '__main__':
    main()