-
Notifications
You must be signed in to change notification settings - Fork 0
/
getData.py
173 lines (157 loc) · 8.34 KB
/
getData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import dgl
import torch
import pandas as pd
import torch.nn.functional as F
class getData:
    """Builds a DGL interaction graph from the project's protein-protein CSVs.

    Nodes are proteins (features = pre-computed sequence encodings), edges
    are measured interactions weighted by the dataset's score column.
    """

    # Preferred torch device (CUDA when available). Not used by the visible
    # code in this class — presumably consumed by callers; verify.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def encoding(self, method, database):
        """Build a DGL graph for *database* using *method* node encodings.

        Args:
            method: name of an encoding directory under
                'Protein-Protein/encoded_sequences/'. Encodings whose name
                starts with 'phy' store the sequence id in the FIRST CSV
                column; every other encoding stores it in the LAST column.
            database: one of the dataset filenames keyed in
                ``databases_headers`` below.

        Returns:
            dgl.DGLGraph with ndata['feat'] (per-node feature vectors),
            edata['weight'] (float interaction scores) and edata['label']
            (constant 1 per edge).
        """
        # --- Unique sequences: sequence string -> canonical sequence id ---
        df = pd.read_csv('Protein-Protein/sequences/unique_sequences.csv',
                         na_filter=False, sep=',',
                         dtype={'sequence': str, 'id': int})
        uniq_sequences = {}
        for seq, seq_id in zip(df['sequence'].values, df['id'].values):
            if seq not in uniq_sequences:  # keep the first id seen per sequence
                uniq_sequences[seq] = seq_id

        # --- Node features: sequence id -> rounded feature vector ----------
        df_2 = pd.read_csv('Protein-Protein/encoded_sequences/' + method + '/dataset_encoding.csv',
                           na_filter=False, sep='|')
        # BUG FIX: the original condition was
        # `method[:3] == 'phy' or method[:3] == 'phy'` — the same test twice.
        # Collapsed to a single test; if a second prefix was intended,
        # add it here.
        id_in_first_column = method[:3] == 'phy'
        old_id_seq_features_dic = {}
        for row in df_2.values:
            # Each row is one '|'-separated cell holding a comma-joined record.
            parts = row[0].split(',')
            if id_in_first_column:
                seq_id = int(float(parts[0]))
                raw_feats = parts[1:]
            else:
                seq_id = int(float(parts[-1]))
                raw_feats = parts[:-1]
            old_id_seq_features_dic[seq_id] = [round(float(x), 3) for x in raw_feats]

        # --- Per-database column names: [id_0, id_1, score, seq_0, seq_1] --
        databases_headers = {'kd_pdbbind_database.csv': ['id_0', 'id_1', 'target', 'seq_0', 'seq_1'],
                             'pip_database.csv': ['id_0', 'id_1', 'target', 'seq_0', 'seq_1'],
                             'proximate_dg.csv': ['id_0', 'id_1', 'dg', 'sequence_0', 'sequence_1'],
                             'proximate_kd.csv': ['id_0', 'id_1', 'kd', 'sequence_0', 'sequence_1'],
                             'proximate_kon.csv': ['id_0', 'id_1', 'kon', 'sequence_0', 'sequence_1'],
                             'skempi_affinity.csv': ['id_0', 'id_1', 'affinity', 'sequence_0', 'sequence_1'],
                             'skempi_koff.csv': ['id_0', 'id_1', 'koff', 'sequence_0', 'sequence_1'],
                             'skempi_kon.csv': ['id_0', 'id_1', 'kon', 'sequence_0', 'sequence_1']}
        headers = databases_headers[database]
        df = pd.read_csv('Protein-Protein/datasets/' + database,
                         na_filter=False, sep=',',
                         dtype={headers[0]: str, headers[1]: str, headers[2]: float,
                                headers[3]: str, headers[4]: str})
        id1 = df[headers[0]].values
        seq1 = df[headers[3]].values
        id2 = df[headers[1]].values
        seq2 = df[headers[4]].values
        sc = df[headers[2]].values

        # --- Assign a dense node index to every id with a known sequence ---
        count = 0
        id_dic = {}       # protein id -> dense node index
        id_dic_seq = {}   # protein id -> its sequence string
        for i in range(len(id1)):
            if id1[i] not in id_dic and seq1[i] in uniq_sequences:
                id_dic[id1[i]] = count
                id_dic_seq[id1[i]] = seq1[i]
                count += 1
            if id2[i] not in id_dic and seq2[i] in uniq_sequences:
                id_dic[id2[i]] = count
                id_dic_seq[id2[i]] = seq2[i]
                count += 1

        # --- Edges: keep pairs where both endpoints got a node index -------
        # NOTE(review): ids are matched exactly; matching on sequences
        # instead could likely produce more edges.
        index_1 = []
        index_2 = []
        new_score = []
        label = []
        for i in range(len(id1)):
            if id1[i] in id_dic and id2[i] in id_dic:
                index_1.append(id_dic[id1[i]])
                index_2.append(id_dic[id2[i]])
                new_score.append(sc[i])
                label.append(1)
        label = torch.tensor(label)
        edge_label = torch.tensor(new_score)

        # --- Node features in node-index order ------------------------------
        # dict insertion order matches the dense indices assigned above.
        nodes_feat = []
        for prot_id in id_dic:
            node_seq = id_dic_seq[prot_id]            # node's sequence
            uniq_id = uniq_sequences[node_seq]        # canonical sequence id
            nodes_feat.append(old_id_seq_features_dic[uniq_id])
        nodes_features = torch.tensor(nodes_feat)

        g = dgl.graph((index_1, index_2))
        g.ndata['feat'] = nodes_features
        g.edata['weight'] = edge_label.float()
        g.edata['label'] = label
        return g