-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_utils.py
81 lines (62 loc) · 2.23 KB
/
graph_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import scipy.sparse as sp
from collections import Counter
def encode_onehot(labels, num_labels=42):
    """
    Input:  labels        | sequence of integer class indices
            num_labels    | width of the one-hot encoding (defaults to 42,
                            the value that used to be hard-coded here)
    Return: labels_onehot | m x n numpy int32 array
            m = number of data points
            n = num_labels

    Every label is used directly as a row index into an identity matrix, so
    it has to be a valid index for an axis of length num_labels. NumPy raises
    IndexError for anything outside that range.
    """
    # Build the identity matrix once; the row at index c is the one-hot
    # vector for class c.
    identity = np.identity(num_labels, dtype=np.int32)

    # Fancy indexing selects one row per label in a single step and keeps the
    # result two-dimensional even when `labels` is empty.
    indices = np.asarray(labels, dtype=np.intp)
    return identity[indices]
def nearest_neighbor(node, labels, graph):
    """
    Input:  node   | index of the node whose neighborhood is inspected
            labels | K x 2 integer array; column 0 holds node ids and
                     column 1 holds the class label of that node
            graph  | adjacency matrix; graph[node][j] == 1 marks j as a
                     neighbor of node
    Return: label  | int, the most frequent class label among the neighbors
                     of node (on a tie, the label encountered first while
                     visiting neighbors in increasing index order wins)
    Raises: IndexError | node has no neighbors in graph
            ValueError | a neighbor does not have exactly one row in labels
    """
    neighbors = np.flatnonzero(np.asarray(graph[node]) == 1)
    if neighbors.size == 0:
        # Without this guard the failure surfaces as an opaque
        # "list index out of range" from most_common(1)[0].
        raise IndexError(
            "node {} has no neighbors to take a label from".format(node)
        )

    labels = np.asarray(labels)
    node_ids = labels[:, 0]
    node_labels = labels[:, 1]

    preds = []
    for n in neighbors:
        match = node_labels[node_ids == n]
        if match.size != 1:
            raise ValueError(
                "expected exactly one label row for node {}, found {}".format(
                    int(n), match.size
                )
            )
        # Index the single element before converting: calling int() on a
        # 1-D array is deprecated in recent NumPy releases.
        preds.append(int(match[0]))

    return Counter(preds).most_common(1)[0][0]
def load_email_data(path="email_data/", dataset="email", sparse=True):
    """
    Input:  path     | dataset path; it is prepended verbatim to the file
                       names, so it must end with the path separator
            dataset  | dataset name
            sparse   | return sparse matrices
    Return: features | N x D feature matrix
            adj      | N x N adjacency matrix
            labels   | N x E label matrix
    Other:  N -> Number of nodes
            D -> Number of features
            E -> Number of classes

    Two whitespace-separated integer files are read:
        <path><dataset>_labels.txt  -- column 1 is the class of each node
        <path><dataset>_core.txt    -- one edge (node1 node2) per row
    The adjacency matrix is symmetric. The feature of a node is the one-hot
    encoding of the most common class among its neighbors. When sparse is
    true, features is returned as a CSR matrix and adj as a COO matrix.
    """
    print('Loading {} dataset...'.format(dataset))

    # Labels
    # genfromtxt collapses a single-row file to a 1-D array; force 2-D so the
    # column slice below works for any number of rows.
    labels_data = np.atleast_2d(
        np.genfromtxt("{}{}_labels.txt".format(path, dataset), dtype=np.int32)
    )
    labels = encode_onehot(labels_data[:, 1])
    num_nodes = labels.shape[0]

    # Graph
    edges = np.atleast_2d(
        np.genfromtxt("{}{}_core.txt".format(path, dataset), dtype=np.int32)
    )
    if edges.size == 0:
        # An empty edge file yields an empty array; give it the (0, 2) shape
        # the indexing below expects.
        edges = edges.reshape(0, 2)
    if edges.shape[1] != 2:
        raise ValueError(
            "expected two columns per edge, found {}".format(edges.shape[1])
        )

    adj = np.zeros((num_nodes, num_nodes))
    # Mark every edge in both directions so the matrix is symmetric.
    adj[edges[:, 0], edges[:, 1]] = 1
    adj[edges[:, 1], edges[:, 0]] = 1

    # Features
    neighbor_labels = np.array(
        [nearest_neighbor(node, labels_data, adj) for node in range(num_nodes)],
        dtype=np.int32,
    )
    features = encode_onehot(neighbor_labels)

    print('Dataset has {} nodes, {} edges, {} features.'.format(adj.shape[0], edges.shape[0], features.shape[1]))

    if sparse:
        features = sp.csr_matrix(features)
        adj = sp.coo_matrix(adj)

    return features, adj, labels