-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
234 lines (194 loc) · 7.53 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import math
def readFasta(fileName, min_seq_len):
    """
    Read a FASTA file into a dictionary of protein sequences.

    Header lines start with '>' and are split on '|'; the second field is used
    as the protein id. All following lines up to the next header are stripped
    and concatenated into one sequence. Every 'U' in a kept sequence is
    replaced with 'C' (presumably selenocysteine -> cysteine; confirm with the
    pipeline owner). The replacement is applied to every record, including the
    last one in the file.

    Args:
        fileName (string): is the path to the fasta file
        min_seq_len (int): is a cut value for the sequences; only sequences
            strictly longer than this value are kept
    Return:
        fastaDict (dict): It is a dictionary whose keys are UniProt protein ids and values are protein sequences
    Raises:
        IndexError: if a header line does not contain a '|' separator.
    """
    fastaDict = {}

    def _store(prot_id, chunks):
        # Single place that applies the length filter and the U -> C rule, so
        # the final record is treated exactly like the ones before it.
        sequence = "".join(chunks)
        if len(sequence) > min_seq_len:
            fastaDict[prot_id] = sequence.replace("U", "C")

    with open(fileName) as fp:
        protId = ''
        chunks = []
        for line in fp:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                _store(protId, chunks)
                chunks = []
                protId = line.split("|")[1]
                continue
            chunks.append(line.strip())
        # The last record has no following header to trigger the store.
        _store(protId, chunks)
    return fastaDict
def extractAllSubsequences(fastaDict, sub_seq_len):
    """
    Collect every contiguous window of length sub_seq_len from all sequences.

    Sequences are visited in the dictionary's iteration order and, within each
    sequence, windows are emitted from left to right. A sequence shorter than
    sub_seq_len contributes nothing.

    Args:
        fastaDict (dict): It is a dictionary whose keys are UniProt protein ids and values are protein sequences
        sub_seq_len (int): is an integer that indicates the length of subsequences
    Return:
        listSubsequence (list): is a list of subsequences
    """
    listSubsequence = []
    for protId in fastaDict:
        sequence = fastaDict[protId]
        window_count = len(sequence) - (sub_seq_len - 1)
        for start in range(window_count):
            listSubsequence.append(sequence[start:start + sub_seq_len])
    return listSubsequence
def extractSubsequences(sequence, sub_seq_len):
    """
    Collect every contiguous window of length sub_seq_len from one sequence.

    Args:
        sequence (string): is the sequence to slice
        sub_seq_len (int): is an integer that indicates the length of subsequences
    Return:
        listSubsequence (list): is the list of windows ordered by start
            position; empty when the sequence is shorter than sub_seq_len
    """
    listSubsequence = []
    window_count = len(sequence) - (sub_seq_len - 1)
    for start in range(window_count):
        stop = start + sub_seq_len
        listSubsequence.append(sequence[start:stop])
    return listSubsequence
def blosum62_reader(fileName, alphabet):
    """
    Read a blosum62 matrix from a comma-separated file into a dictionary.

    The n-th non-blank line of the file is taken as the row for alphabet[n],
    and the m-th comma-separated field on that line as the score against
    alphabet[m]. Blank lines (for example a trailing empty line) are skipped
    instead of being treated as a matrix row.

    Args:
        fileName (string): is the path to the csv file of blosum62 matrix
        alphabet (string): is the string of combined letters of valid amino acids (20 letters: ARNDCQEGHILKMFPSTWYV)
    Return:
        blosum62Dict (dict): is the dictionary whose keys are amino acid pairs and values are blosum62 values.
    Raises:
        IndexError: if the file has more data rows than alphabet has letters,
            or a row has fewer fields than alphabet has letters.
        ValueError: if a field cannot be converted to int.
    """
    blosum62Dict = {}
    rowNo = 0
    with open(fileName) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Not a matrix row; must not consume a row index.
                continue
            rowLetter = alphabet[rowNo]
            scores = line.split(",")
            # Index by column so that a short row still fails loudly.
            for colNo, colLetter in enumerate(alphabet):
                blosum62Dict[(rowLetter, colLetter)] = int(scores[colNo])
            rowNo += 1
    return blosum62Dict
def PSSM_initializer(alphabet, sub_seq_len):
    """
    Build an all-zero Position Specific Scoring Matrix (PSSM) for one cluster.

    Args:
        alphabet (string): is the string of combined letters of valid amino acids (20 letters: ARNDCQEGHILKMFPSTWYV).
        sub_seq_len (int): is an integer that indicates the length of subsequences.
    Return:
        PSSM (dict): is a dictionary keyed by (letter, position) for every
            letter of alphabet and every position in range(sub_seq_len), with
            each count set to 0.
    """
    return {
        (letter, position): 0
        for letter in alphabet
        for position in range(sub_seq_len)
    }
def PSSM_updater(subsequence, PSSM):
    """
    Count one subsequence into a cluster's PSSM.

    For each position of the subsequence, the count stored under
    (letter, position) is incremented by one. The dictionary is modified in
    place and the same object is returned.

    Args:
        subsequence (string): a subsequence of the protein sequence
        PSSM (dict): is a dictionary whose keys are the pairs of amino acid letters and their position in the
            subsequence, values are the counts of amino acids in that position.
    Return:
        PSSM (dict): the updated input dictionary.
    Raises:
        KeyError: if a (letter, position) pair is not already a key of PSSM.
    """
    cells = ((residue, position) for position, residue in enumerate(subsequence))
    for cell in cells:
        PSSM[cell] += 1
    return PSSM
def calculateMaxSimilarity(blosum62Dict, listCenters, subsequence):
    """
    Pick the cluster center that scores highest against the subsequence.

    The score of a center is the sum, over every position of the subsequence,
    of the blosum62 value for the pair (center letter, subsequence letter).
    On ties the center that appears first in listCenters is kept.

    Args:
        blosum62Dict (dict): is the dictionary whose keys are amino acid pairs and values are blosum62 values.
        listCenters (list): is the list of the center subsequences of all clusters.
        subsequence (string): is a subsequence of protein sequence.
    Return:
        maxSimilarity (int): is the highest score found; stays at the initial
            floor of -9999999 if no center scores above it.
        possibleCluster (string): is the center subsequence that achieved
            maxSimilarity (the first center if none scored above the floor).
    """
    possibleCluster = listCenters[0]
    maxSimilarity = -9999999
    positions = range(len(subsequence))
    for candidate in listCenters:
        score = sum(
            blosum62Dict[(candidate[pos], subsequence[pos])] for pos in positions
        )
        if score > maxSimilarity:
            maxSimilarity, possibleCluster = score, candidate
    return maxSimilarity, possibleCluster
def PSSM2profile(PSSM, si, sub_seq_len, alphabet):
    """
    Convert a cluster's PSSM counts into a log-scaled profile.

    Each entry is log((count + 0.01) / (si + 0.2)), where count is the PSSM
    value for (letter, position).

    Args:
        PSSM (dict): is a dictionary keyed by (letter, position) holding counts.
        si: is the value added to 0.2 in the denominator (presumably the number
            of subsequences in the cluster; confirm with the caller).
        sub_seq_len (int): is the number of positions in each profile row.
        alphabet (string): is the string of letters; one profile row is
            produced per letter, in this order.
    Return:
        profile (list): is a list of rows (one per letter of alphabet), each a
            list of sub_seq_len floats.
    """
    profile = []
    for letter in alphabet:
        row = []
        for position in range(sub_seq_len):
            ratio = (PSSM[(letter, position)] + 0.01) / (si + 0.2)
            row.append(math.log(ratio))
        profile.append(row)
    return profile
def writeProfiles2File(profileDict, fileName, alphabet):
    """
    Write every cluster profile to a text file.

    For each entry of profileDict (in iteration order) the file receives a
    'NUMBER OF CLUSTER' line with a 1-based counter, a 'CLUSTER CENTER' line
    with the key, one line per profile row labelled with the letter of
    alphabet at the same index, and then an empty line.

    Args:
        profileDict (dict): maps a cluster center to its profile (a sequence
            of rows).
        fileName (string): is the path of the file to write; it is overwritten.
        alphabet (string): supplies the label for the row at each index.
    Return:
        None
    """
    with open(fileName, 'w') as out:
        for cluster_no, center in enumerate(profileDict, start=1):
            out.write('NUMBER OF CLUSTER : %s\n' % cluster_no)
            out.write('CLUSTER CENTER : %s\n' % center)
            for row_no, row in enumerate(profileDict[center]):
                out.write("%s : " % alphabet[row_no])
                out.write(" %s\n" % row)
            out.write('\n')
def read_profiles(profile_file, alphabet):
    """
    Read cluster profiles from a text file into a list of dictionaries.

    Each non-blank line is split on ':'; the text before the first colon is
    the label and the text after it is the value. 'NUMBER OF CLUSTER' lines
    are skipped. A line whose label is found in alphabet is parsed as a
    comma-separated list of floats (surrounding brackets are stripped) and
    stored under that label. A line labelled 'V' closes the current cluster:
    the dictionary collected so far is appended to the result and a new one is
    started.

    :param profile_file: path to the profile file
    :param alphabet: string of letters accepted as row labels
    :return: list of dicts, one per cluster, mapping a letter to its list of
        float values
    :raises IndexError: if a non-blank line contains no ':' separator
    :raises ValueError: if a row value cannot be converted to float
    """
    list_profiles = []
    profile_each_cluster = {}
    # The context manager closes the file even if parsing raises.
    with open(profile_file, "r") as file_profiles:
        for line in file_profiles:
            # Skip empty and whitespace-only lines; they carry no label.
            if not line.strip():
                continue
            parts = line.split(":")
            part0 = parts[0].strip()
            part1 = parts[1].strip()
            if part0 == "NUMBER OF CLUSTER":
                continue
            if part0 in alphabet:
                profile_each_cluster[part0] = [
                    float(value.strip().strip("[").strip("]"))
                    for value in part1.split(",")
                ]
            # NOTE: 'V' is hard-coded as the last row of a cluster, so the
            # file must list rows in an order that ends with V.
            if part0 == "V":
                list_profiles.append(profile_each_cluster)
                profile_each_cluster = {}
    return list_profiles
def write_feature_vector(filename_fv, fv_dict):
    """
    Write feature vectors to a text file, one protein per line.

    Each line has the form '<prot_id>: v1,v2,...,' where every value is
    followed by a comma, including the last one.

    :param filename_fv: path of the file to write; it is overwritten
    :param fv_dict: dict mapping a protein id to an iterable of values
    :return: None
    """
    # The context manager closes the file even if a write raises part-way.
    with open(filename_fv, "w") as fv_file:
        for prot_id in fv_dict:
            values = "".join(str(value) + "," for value in fv_dict[prot_id])
            fv_file.write(f"{prot_id}: {values}\n")