# file from https://github.com/tkipf/gcn/blob/master/gcn/utils.py
import sqlite3
import numpy as np
import random
from scipy import sparse as sp

# get n random edges that have value 0 in adj (negative samples)
def get_false_edges(adj, n):
    print("getting false edges")
    # use a set so the same false edge cannot be sampled twice
    # (otherwise duplicates could leak between train and test)
    false_edges = set()
    while len(false_edges) < n:
        r1 = random.randint(0, adj.shape[0] - 1)
        r2 = random.randint(0, adj.shape[0] - 1)
        # keep the pair only if the edge is absent from adj
        # and lies strictly in the upper-triangular part
        if adj[r1, r2] == 0 and r1 < r2:
            false_edges.add((r1, r2))
    return [list(edge) for edge in false_edges]
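
# Behaviour sketch (the 3x3 matrix is a made-up example): for a 3x3 adjacency
# with a single undirected edge (0, 1), get_false_edges(adj, 2) can only
# return the upper-triangular non-edges (0, 2) and (1, 2).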

# convert a sparse matrix to (coords of the edges, values of the edges, shape of the matrix)
def sparse_to_tuple(sparse_mx):
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape
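
# Example on a toy matrix (values are illustrative):
#   m = sp.coo_matrix(([1.0], ([0], [1])), shape=(2, 2))
#   sparse_to_tuple(m)  ->  (array([[0, 1]]), array([1.]), (2, 2))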

# split adj into adj_train plus positive/negative train and test edge sets
def get_test_edges(adj, test_size=0.1, train_size=0.1):
    print("getting test edges")
    # work on the upper-triangular part so each undirected edge appears once
    adj_ = sp.triu(adj)
    coords, _, shape = sparse_to_tuple(adj_)
    all_edges = coords
    # define the number of edges for train and test
    num_train = int(train_size * all_edges.shape[0])
    num_test = int(test_size * all_edges.shape[0])
    # shuffle the edges
    np.random.shuffle(all_edges)
    # the first num_test edges (after shuffling) are the positive test edges
    test_edges = all_edges[:num_test]
    # the next num_train edges are the positive train edges
    train_edges = all_edges[num_test:num_test + num_train]
    res_edges = all_edges[num_test + num_train:]
    # sample as many random false edges as positive train + test edges
    false_edges = get_false_edges(adj, test_edges.shape[0] + train_edges.shape[0])
    # split them into test and train
    test_false_edges = false_edges[:test_edges.shape[0]]
    train_false_edges = false_edges[test_edges.shape[0]:]
    print("got false edges")
    # turn the remaining edges into a sparse matrix
    # (upper-triangular only; symmetrize downstream if a full matrix is needed)
    adj_train = sp.csr_matrix(
        (np.ones(res_edges.shape[0]), (res_edges[:, 0], res_edges[:, 1])),
        shape=adj.shape,
    )
    return adj_train, train_edges, np.array(train_false_edges), test_edges, np.array(test_false_edges)
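
# Usage sketch for get_test_edges (variable names here are illustrative):
#   adj, features, labels, n_classes = load_data("cora")
#   adj_train, train_e, train_f, test_e, test_f = get_test_edges(adj)
# With the default sizes, 10% of the positive edges go to the test set,
# 10% to the train set, and the remaining 80% stay in adj_train.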

# load data: adj_complete, features, labels, number of classes
def load_data(dataset_name):
    if dataset_name == "cora":
        return load_cora_data()
    elif dataset_name == "citeseer":
        return load_citeseer_data()
    else:
        raise ValueError("unknown dataset: " + dataset_name)

def load_citeseer_data():
    content_d = {}
    with open("citeseer/citeseer.content", "r") as fin_c:
        classes = set()
        docs = set()
        for line in fin_c:
            splitted = line.strip().split('\t')
            if len(splitted) > 2:
                content_d[splitted[0]] = {"features": splitted[1:-1], "class": splitted[-1]}
                classes.add(splitted[-1])
                docs.add(splitted[0])
    # map class names and document ids to integer indices
    class_to_id = {}
    for i, class_ in enumerate(list(classes)):
        class_to_id[class_] = i
    for i, doc in enumerate(list(docs)):
        content_d[doc]["id"] = i
    # collect citation edges in both directions (undirected graph)
    cites = []
    with open("citeseer/citeseer.cites", "r") as fin_c:
        for line in fin_c:
            splitted = line.strip().split("\t")
            if len(splitted) > 1 and splitted[0] in content_d and splitted[1] in content_d:
                cites.append([content_d[splitted[0]]["id"], content_d[splitted[1]]["id"]])
                cites.append([content_d[splitted[1]]["id"], content_d[splitted[0]]["id"]])
    cited = np.array(cites).T
    # pass the shape explicitly so isolated high-id docs do not shrink the matrix
    n_docs = len(docs)
    adj_complete = sp.csr_matrix(
        (np.ones(cited.shape[1]), (cited[0], cited[1])), shape=(n_docs, n_docs)
    )
    labels = []
    for key in content_d.keys():
        labels.append([content_d[key]["id"], class_to_id[content_d[key]["class"]]])
    labels = sorted(labels, key=lambda y: y[0])
    # build a sparse binary bag-of-words feature matrix
    features_ = []
    for key in content_d.keys():
        ws = content_d[key]["features"]
        for i, w in enumerate(ws):
            if w == "1":
                features_.append([content_d[key]["id"], i])
    features_ = np.array(features_).T
    # every content row has the same number of feature columns
    n_words = len(next(iter(content_d.values()))["features"])
    features = sp.csr_matrix(
        (np.ones(features_.shape[1]), (features_[0], features_[1])),
        shape=(n_docs, n_words),
    )
    return adj_complete, features, labels, len(class_to_id.keys())
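
# Note on the returned values (shapes follow directly from the data files):
# adj_complete is an n_docs x n_docs symmetric csr matrix, features is a
# binary n_docs x n_words csr matrix, and labels is a list of
# [doc_id, class_id] pairs sorted by doc_id.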

def load_cora_data():
    # get a cursor from the sqlite db
    conn = sqlite3.connect('CORA/cora.db')
    cur = conn.cursor()
    # load data from the db, then release the connection
    adj_sparse, features_sparse, labels, n_clusters = load_cora_adj_features(cur)
    conn.close()
    return adj_sparse, features_sparse, labels, n_clusters

def load_cora_adj_features(cur):
    cur.execute("SELECT * FROM paper")
    papers = cur.fetchall()
    cur.execute("SELECT * FROM cites")
    cited = cur.fetchall()
    cur.execute("SELECT * FROM content")
    features = cur.fetchall()
    paper_id_to_int = {}
    set_labels = set()
    # associate each paper_id with an integer index
    for i, paper in enumerate(papers):
        paper_id_to_int[paper[0]] = i
        set_labels.add(paper[1])
    topic_to_label_index = {}
    for i, label in enumerate(list(set_labels)):
        topic_to_label_index[label] = i
    labels = [[paper_id_to_int[x[0]], topic_to_label_index[x[1]]] for x in papers]
    cited = np.array([[paper_id_to_int[x[0]], paper_id_to_int[x[1]]] for x in cited]).T
    # build a symmetric adjacency matrix by adding each edge in both directions
    n_papers = len(papers)
    adj_s = sp.csr_matrix(
        (np.ones(cited.shape[1] * 2),
         (np.append(cited[0], cited[1]), np.append(cited[1], cited[0]))),
        shape=(n_papers, n_papers),
    )
    # get the set of words
    words = set()
    for feature in features:
        words.add(feature[1])
    # associate each word with an integer index
    word_to_int = {}
    for i, word in enumerate(list(words)):
        word_to_int[word] = i
    features = np.array([[paper_id_to_int[x[0]], word_to_int[x[1]]] for x in features]).T
    # build the sparse binary bag-of-words feature matrix
    features_s = sp.csr_matrix(
        (np.ones(features.shape[1]), (features[0], features[1])),
        shape=(n_papers, len(words)),
    )
    return adj_s, features_s, labels, len(set_labels)
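

# A minimal smoke test, assuming the citeseer/ files (or CORA/cora.db)
# exist at the paths hard-coded above; the dataset name is illustrative.
if __name__ == "__main__":
    adj, features, labels, n_classes = load_data("citeseer")
    adj_train, train_edges, train_false, test_edges, test_false = get_test_edges(adj)
    print("adj:", adj.shape, "features:", features.shape, "classes:", n_classes)
    print("train edges:", train_edges.shape[0], "test edges:", test_edges.shape[0])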