-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathCoraLuDataset.py
103 lines (83 loc) · 4.66 KB
/
CoraLuDataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import numpy as np
from kgcnn.data.base import MemoryGraphDataset
from kgcnn.data.download import DownloadDataset
class CoraLuDataset(DownloadDataset, MemoryGraphDataset):
    r"""Store and process Cora dataset after `Lu et al. 2003 <https://www.aaai.org/Papers/ICML/2003/ICML03-066.pdf>`_ .

    Information in `Papers with code <https://paperswithcode.com/dataset/cora>`_ read:
    The Cora dataset consists of 2708 scientific publications classified into one of seven classes.
    The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word
    vector indicating the absence/presence of the corresponding word from the dictionary.
    The dictionary consists of 1433 unique words.

    Downloaded from: `<https://linqs-data.soe.ucsc.edu/public/lbc/>`_ .

    The whole citation network is stored as a single graph, i.e. every property assigned by
    :obj:`read_in_memory` is a list with exactly one array.

    References:

        (1) McCallum, A.K., Nigam, K., Rennie, J. et al. Automating the Construction of Internet Portals with Machine
            Learning. Information Retrieval 3, 127–163 (2000). https://doi.org/10.1023/A:1009953814988
        (2) Lu, Qing and Lise Getoor. “Link-Based Classification.” Encyclopedia of Machine Learning and Data
            Mining (2003).
    """

    # Keyword arguments forwarded unchanged to ``DownloadDataset.__init__``.
    download_info = {
        "dataset_name": "cora_lu",
        "data_directory_name": "cora_lu",
        "download_url": "https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",
        # download_url = "https://linqs-data.soe.ucsc.edu/public/arxiv-mrdm05/arxiv.tar.gz"
        "download_file_name": 'cora.tgz',
        "unpack_tar": True,
        "unpack_zip": False,
        "unpack_directory_name": "cora_lu"
    }

    # Make cora graph that was published by Qing Lu, and Lise Getoor. "Link-based classification." ICML, 2003.
    # https://www.aaai.org/Papers/ICML/2003/ICML03-066.pdf

    def __init__(self, reload=False, verbose: int = 10):
        """Initialize Cora dataset after Lu et al. 2003.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        # Filled with the class-name -> class-index dictionary by `read_in_memory`.
        self.class_label_mapping = None
        # Use default base class init.
        MemoryGraphDataset.__init__(self, dataset_name="cora_lu", verbose=verbose)
        DownloadDataset.__init__(self, **self.download_info, reload=reload, verbose=verbose)

        if self.fits_in_memory:
            self.read_in_memory()

    def read_in_memory(self):
        """Load Cora data into memory and already split into items.

        Reads ``cora.cites`` (pairs of paper ids) and ``cora.content`` (tab-separated rows made of a paper id,
        integer word features and a class name) from the unpacked ``cora`` directory. Paper ids are replaced by
        the row position of the paper in ``cora.content`` and the resulting edge list is sorted by first index,
        then by second index. The following properties are assigned, each as a list holding a single array:
        ``node_attributes``, ``edge_indices``, ``edge_attributes``, ``node_labels`` (one-hot), ``node_number``
        and ``edge_weights``. Also sets :obj:`class_label_mapping`.

        Returns:
            self
        """
        filepath = os.path.join(self.data_main_dir, self.data_directory_name, self.unpack_directory_name, "cora")

        # Citation pairs given by paper id, one pair per row.
        ids = np.loadtxt(os.path.join(filepath, "cora.cites"))
        ids = np.array(ids, dtype="int64")

        # Context manager guarantees the handle is released even if parsing below fails.
        with open(os.path.join(filepath, "cora.content"), "r") as content_file:
            lines = content_file.readlines()

        # Split every row once; blank lines (e.g. a trailing newline) carry no record and are skipped.
        stripped_lines = (line.strip() for line in lines)
        rows = [line.split('\t') for line in stripped_lines if line]
        labels = [row[-1] for row in rows]
        # Column 0 is the paper id, the remaining columns are the word features.
        nodes = np.array([[int(value) for value in row[:-1]] for row in rows], dtype="int64")

        # Match edge_indices not with ids but with edge_indices in nodes
        # NOTE(review): ids that occur in cora.cites but not in cora.content would silently map to index 0
        # (or raise an IndexError when larger than every known id) -- assumed not to happen for this dataset.
        node_map = np.zeros(np.max(nodes[:, 0]) + 1, dtype="int64")
        idx_new = np.arange(len(nodes))
        node_map[nodes[:, 0]] = idx_new
        indexlist = node_map[ids]

        # Two stable sorts: first by the second column, then by the first column, so that the final order
        # is by first index with ties ordered by second index.
        order1 = np.argsort(indexlist[:, 1], axis=0, kind='mergesort')  # stable!
        ind1 = indexlist[order1]
        order2 = np.argsort(ind1[:, 0], axis=0, kind='mergesort')
        indices = ind1[order2]

        # Class mappings
        class_label_mapping = {'Genetic_Algorithms': 0,
                               'Reinforcement_Learning': 1,
                               'Theory': 2,
                               'Rule_Learning': 3,
                               'Case_Based': 4,
                               'Probabilistic_Methods': 5,
                               'Neural_Networks': 6}
        self.class_label_mapping = class_label_mapping
        label_id = np.array([class_label_mapping[x] for x in labels], dtype="int")

        # One-hot encode by broadcasting the (N, 1) class indices against all class indices.
        labels = np.expand_dims(label_id, axis=-1)
        labels = np.array(labels == np.arange(len(class_label_mapping)), dtype="float")

        self.assign_property("node_attributes", [nodes[:, 1:]])
        self.assign_property("edge_indices", [indices])
        # Edge attributes and weights are both a single column of ones, one entry per edge.
        self.assign_property("edge_attributes", [np.ones_like(indices)[:, :1]])
        self.assign_property("node_labels", [labels])
        # NOTE(review): despite its name, "node_number" is filled with the integer class index of each node.
        self.assign_property("node_number", [label_id])
        self.assign_property("edge_weights", [np.ones_like(indices)[:, :1]])
        # self.graph_adjacency = make_adjacency_from_edge_indices(indices, np.ones_like(indices)[:, 0])

        return self
# ds = CoraLuDataset()