#!/usr/bin/env python
# coding: utf-8
# Table of Contents
# 1  GCN Node Classification on the Cora Dataset
#   1.1  Setup
#   1.2  Data Preparation
#   1.3  Graph Convolution Layer
#   1.4  Model Definition
#   1.5  Model Training
# # GCN Node Classification on the Cora Dataset
# <table align="left">
# <td>
# <a target="_blank" href="https://colab.research.google.com/github/FighterLYL/GraphNeuralNetwork/blob/master/chapter5/GCN_Cora.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
# </td>
# </table>
# When running in Colab, you can switch to a `GPU` runtime via `Runtime -> Change runtime type`.
# ## Setup
# In[1]:
import itertools
import os
import os.path as osp
import pickle
import urllib
from collections import namedtuple
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
import matplotlib.pyplot as plt
# ## Data Preparation
# In[2]:
Data = namedtuple('Data', ['x', 'y', 'adjacency',
'train_mask', 'val_mask', 'test_mask'])
def tensor_from_numpy(x, device):
return torch.from_numpy(x).to(device)
class CoraData(object):
filenames = ["ind.cora.{}".format(name) for name in
['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']]
def __init__(self, data_root="../data/cora", rebuild=False):
"""Cora数据,包括数据下载,处理,加载等功能
当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘
处理之后的数据可以通过属性 .data 获得,它将返回一个数据对象,包括如下几部分:
* x: 节点的特征,维度为 2708 * 1433,类型为 np.ndarray
* y: 节点的标签,总共包括7个类别,类型为 np.ndarray
* adjacency: 邻接矩阵,维度为 2708 * 2708,类型为 scipy.sparse.coo.coo_matrix
* train_mask: 训练集掩码向量,维度为 2708,当节点属于训练集时,相应位置为True,否则False
* val_mask: 验证集掩码向量,维度为 2708,当节点属于验证集时,相应位置为True,否则False
* test_mask: 测试集掩码向量,维度为 2708,当节点属于测试集时,相应位置为True,否则False
Args:
-------
data_root: string, optional
存放数据的目录,原始数据路径: ../data/cora
缓存数据路径: {data_root}/ch5_cached.pkl
rebuild: boolean, optional
是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据
"""
self.data_root = data_root
save_file = osp.join(self.data_root, "ch5_cached.pkl")
if osp.exists(save_file) and not rebuild:
print("Using Cached file: {}".format(save_file))
self._data = pickle.load(open(save_file, "rb"))
else:
self._data = self.process_data()
with open(save_file, "wb") as f:
pickle.dump(self.data, f)
print("Cached file: {}".format(save_file))
@property
def data(self):
"""返回Data数据对象,包括x, y, adjacency, train_mask, val_mask, test_mask"""
return self._data
def process_data(self):
"""
处理数据,得到节点特征和标签,邻接矩阵,训练集、验证集以及测试集
引用自:https://github.com/rusty1s/pytorch_geometric
"""
print("Process data ...")
_, tx, allx, y, ty, ally, graph, test_index = [self.read_data(
osp.join(self.data_root, name)) for name in self.filenames]
train_index = np.arange(y.shape[0])
val_index = np.arange(y.shape[0], y.shape[0] + 500)
sorted_test_index = sorted(test_index)
x = np.concatenate((allx, tx), axis=0)
y = np.concatenate((ally, ty), axis=0).argmax(axis=1)
x[test_index] = x[sorted_test_index]
y[test_index] = y[sorted_test_index]
num_nodes = x.shape[0]
        train_mask = np.zeros(num_nodes, dtype=bool)
        val_mask = np.zeros(num_nodes, dtype=bool)
        test_mask = np.zeros(num_nodes, dtype=bool)
train_mask[train_index] = True
val_mask[val_index] = True
test_mask[test_index] = True
adjacency = self.build_adjacency(graph)
print("Node's feature shape: ", x.shape)
print("Node's label shape: ", y.shape)
print("Adjacency's shape: ", adjacency.shape)
print("Number of training nodes: ", train_mask.sum())
print("Number of validation nodes: ", val_mask.sum())
print("Number of test nodes: ", test_mask.sum())
return Data(x=x, y=y, adjacency=adjacency,
train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
@staticmethod
def build_adjacency(adj_dict):
"""根据邻接表创建邻接矩阵"""
edge_index = []
num_nodes = len(adj_dict)
for src, dst in adj_dict.items():
edge_index.extend([src, v] for v in dst)
edge_index.extend([v, src] for v in dst)
        # remove duplicate edges
edge_index = list(k for k, _ in itertools.groupby(sorted(edge_index)))
edge_index = np.asarray(edge_index)
adjacency = sp.coo_matrix((np.ones(len(edge_index)),
(edge_index[:, 0], edge_index[:, 1])),
shape=(num_nodes, num_nodes), dtype="float32")
return adjacency
@staticmethod
def read_data(path):
"""使用不同的方式读取原始数据以进一步处理"""
name = osp.basename(path)
if name == "ind.cora.test.index":
out = np.genfromtxt(path, dtype="int64")
return out
else:
out = pickle.load(open(path, "rb"), encoding="latin1")
out = out.toarray() if hasattr(out, "toarray") else out
return out
@staticmethod
def normalization(adjacency):
"""计算 L=D^-0.5 * (A+I) * D^-0.5"""
adjacency += sp.eye(adjacency.shape[0]) # 增加自连接
degree = np.array(adjacency.sum(1))
d_hat = sp.diags(np.power(degree, -0.5).flatten())
return d_hat.dot(adjacency).dot(d_hat).tocoo()
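
# A quick worked check of the formula above (an illustrative toy graph,
# not part of the original notebook): for a 2-node graph with a single
# edge, every row of A + I sums to 2, so every entry of
# D^-0.5 * (A + I) * D^-0.5 should equal 0.5.

# In[ ]:

toy_adjacency = sp.coo_matrix(np.array([[0., 1.], [1., 0.]]))
print(CoraData.normalization(toy_adjacency).toarray())
# expected output: [[0.5 0.5]
#                   [0.5 0.5]]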
# ## Graph Convolution Layer
# In[3]:
class GraphConvolution(nn.Module):
def __init__(self, input_dim, output_dim, use_bias=True):
"""图卷积:L*X*\theta
Args:
----------
input_dim: int
节点输入特征的维度
output_dim: int
输出特征维度
use_bias : bool, optional
是否使用偏置
"""
super(GraphConvolution, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.use_bias = use_bias
self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim))
if self.use_bias:
self.bias = nn.Parameter(torch.Tensor(output_dim))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight)
if self.use_bias:
init.zeros_(self.bias)
def forward(self, adjacency, input_feature):
"""邻接矩阵是稀疏矩阵,因此在计算时使用稀疏矩阵乘法
Args:
-------
adjacency: torch.sparse.FloatTensor
邻接矩阵
input_feature: torch.Tensor
输入特征
"""
support = torch.mm(input_feature, self.weight)
output = torch.sparse.mm(adjacency, support)
if self.use_bias:
output += self.bias
return output
def __repr__(self):
return self.__class__.__name__ + ' (' + str(self.input_dim) + ' -> ' + str(self.output_dim) + ')'
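
# A minimal shape check of the layer (illustrative only; the 3-node graph
# and feature sizes below are arbitrary and not part of the original notebook).

# In[ ]:

toy_indices = torch.tensor([[0, 1, 2], [0, 1, 2]])  # self-loops only
toy_adj = torch.sparse_coo_tensor(toy_indices, torch.ones(3), (3, 3))
toy_features = torch.randn(3, 8)
toy_layer = GraphConvolution(8, 4)
print(toy_layer(toy_adj, toy_features).shape)  # expected: torch.Size([3, 4])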
# ## Model Definition
#
# Readers are encouraged to modify the GCN model structure and experiment
# with it; one possible variant is sketched after the class below.
# In[4]:
class GcnNet(nn.Module):
"""
定义一个包含两层GraphConvolution的模型
"""
def __init__(self, input_dim=1433):
super(GcnNet, self).__init__()
self.gcn1 = GraphConvolution(input_dim, 16)
self.gcn2 = GraphConvolution(16, 7)
def forward(self, adjacency, feature):
h = F.relu(self.gcn1(adjacency, feature))
logits = self.gcn2(adjacency, h)
return logits
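
# One possible variant to experiment with (a sketch only, not part of the
# original notebook): Kipf & Welling's GCN applies dropout to the input of
# each layer. The hidden size and dropout rate below are arbitrary choices.

# In[ ]:

class GcnNetWithDropout(nn.Module):
    """Two GraphConvolution layers, with dropout on each layer's input."""
    def __init__(self, input_dim=1433, hidden_dim=16, num_classes=7, dropout=0.5):
        super(GcnNetWithDropout, self).__init__()
        self.gcn1 = GraphConvolution(input_dim, hidden_dim)
        self.gcn2 = GraphConvolution(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, adjacency, feature):
        h = self.dropout(feature)            # dropout on the input features
        h = F.relu(self.gcn1(adjacency, h))
        h = self.dropout(h)                  # dropout on the hidden representation
        return self.gcn2(adjacency, h)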
# ## Model Training
# In[5]:
# Hyperparameters
LEARNING_RATE = 0.1
WEIGHT_DECAY = 5e-4
EPOCHS = 200
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# In[7]:
# Load the data and convert it to torch.Tensor
dataset = CoraData().data
node_feature = dataset.x / dataset.x.sum(1, keepdims=True)  # normalize features so that each row sums to 1
tensor_x = tensor_from_numpy(node_feature, DEVICE)
tensor_y = tensor_from_numpy(dataset.y, DEVICE)
tensor_train_mask = tensor_from_numpy(dataset.train_mask, DEVICE)
tensor_val_mask = tensor_from_numpy(dataset.val_mask, DEVICE)
tensor_test_mask = tensor_from_numpy(dataset.test_mask, DEVICE)
normalize_adjacency = CoraData.normalization(dataset.adjacency)  # normalize the adjacency matrix
num_nodes, input_dim = node_feature.shape
indices = torch.from_numpy(np.asarray([normalize_adjacency.row,
normalize_adjacency.col]).astype('int64')).long()
values = torch.from_numpy(normalize_adjacency.data.astype(np.float32))
tensor_adjacency = torch.sparse_coo_tensor(indices, values,
                                           (num_nodes, num_nodes)).to(DEVICE)
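
# Optional sanity checks (illustrative additions, not in the original
# notebook): each feature row should sum to 1 (assuming every node has at
# least one nonzero feature), and the symmetrically normalized adjacency
# should equal its own transpose.

# In[ ]:

print(node_feature.sum(1)[:5])  # each entry should be 1.0
dense_adjacency = tensor_adjacency.to_dense()
print(torch.allclose(dense_adjacency, dense_adjacency.t()))  # expected: True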
# In[ ]:
# Model, loss, and optimizer
model = GcnNet(input_dim).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)
optimizer = optim.Adam(model.parameters(),
lr=LEARNING_RATE,
                       weight_decay=WEIGHT_DECAY)
# In[8]:
# Main training loop
def train():
    loss_history = []
    val_acc_history = []
    train_y = tensor_y[tensor_train_mask]
    for epoch in range(EPOCHS):
        model.train()  # test() switches to eval mode, so reset it every epoch
        logits = model(tensor_adjacency, tensor_x)  # forward pass
        train_mask_logits = logits[tensor_train_mask]  # supervise only the training nodes
        loss = criterion(train_mask_logits, train_y)  # compute the loss
        optimizer.zero_grad()
        loss.backward()  # backpropagate to compute gradients
        optimizer.step()  # update the parameters
        train_acc, _, _ = test(tensor_train_mask)  # current accuracy on the training set
        val_acc, _, _ = test(tensor_val_mask)  # current accuracy on the validation set
        # record the loss and accuracy over training, for plotting
        loss_history.append(loss.item())
        val_acc_history.append(val_acc.item())
        print("Epoch {:03d}: Loss {:.4f}, TrainAcc {:.4f}, ValAcc {:.4f}".format(
            epoch, loss.item(), train_acc.item(), val_acc.item()))
return loss_history, val_acc_history
# In[9]:
# Evaluation function
def test(mask):
    model.eval()
    with torch.no_grad():
        logits = model(tensor_adjacency, tensor_x)
        test_mask_logits = logits[mask]
        predict_y = test_mask_logits.max(1)[1]
        accuracy = torch.eq(predict_y, tensor_y[mask]).float().mean()
    return accuracy, test_mask_logits.cpu().numpy(), tensor_y[mask].cpu().numpy()
# In[13]:
def plot_loss_with_acc(loss_history, val_acc_history):
    fig, ax1 = plt.subplots()
    ax1.plot(range(len(loss_history)), loss_history,
             c=np.array([255, 71, 90]) / 255.)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax2 = ax1.twinx()  # second y-axis on the right, sharing the same x-axis
    ax2.plot(range(len(val_acc_history)), val_acc_history,
             c=np.array([79, 179, 255]) / 255.)
    ax2.set_ylabel('ValAcc')
    plt.title('Training Loss & Validation Accuracy')
    plt.show()
# In[ ]:
loss, val_acc = train()
test_acc, test_logits, test_label = test(tensor_test_mask)
print("Test accuarcy: ", test_acc.item())
# In[14]:
plot_loss_with_acc(loss, val_acc)
# In[ ]:
# Plot a t-SNE projection of the test-node logits
from sklearn.manifold import TSNE
tsne = TSNE()
out = tsne.fit_transform(test_logits)
fig = plt.figure()
for i in range(7):
indices = test_label == i
x, y = out[indices].T
plt.scatter(x, y, label=str(i))
plt.legend()