ann_twitter.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix
import csv
dataset_edges = pd.read_csv('enrondatasetfinal.csv', sep=',')
# dataset_nodes = pd.read_csv('bet_cen_enron_data.csv', sep = ' ', header = None, quoting = csv.QUOTE_NONE, error_bad_lines = False)
dataset_true = pd.read_csv('Enron.true', header = None, sep = ';')
node_from = dataset_edges.iloc[:,0].values
node_to = dataset_edges.iloc[:,1].values
num_nodes = len(dataset_true)
print "Number of nodes: " + str(76841)
# Stores each node as a key and its neighbors as the value list
d = dict()
for i in range(0, num_nodes):
    d[i] = []
for (i, j) in zip(node_from, node_to):
    if j not in d[i]:
        d[i].append(j)
    if i not in d[j]:
        d[j].append(i)
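# Quick sanity check (illustrative, in the spirit of the commented debug
# prints below): each undirected edge lands in both endpoints' lists, so the
# summed list lengths should equal twice the number of unique edges.
# print("Average degree: " + str(sum(len(v) for v in d.values()) / float(num_nodes)))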
train_nodes = int(0.8 * 13533)  # 10826
test_nodes = int(0.2 * 13533)   # 2706
print('Train nodes: ' + str(train_nodes) + ", Test nodes: " + str(test_nodes))
# Full dataset graph
H = nx.Graph()
for k, v in d.items():
    for j in v:
        H.add_edge(k, j)
# Training graph
G = nx.Graph()
for i in range(train_nodes):
    for j in d[i]:
        G.add_edge(i, j)
# Testing graph
test_id = [i for i in H.nodes() if i not in G.nodes()]
P = nx.Graph()
for i in test_id:
    for j in d[i]:
        P.add_edge(i, j)
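# Note: the 80/20 split is by node id, but edges cross it, so G also picks up
# neighbors with id >= train_nodes; test_id therefore collects only the nodes
# that share no edge with any training node.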
# print(len(H.nodes()))
# print(len(H.edges()))
# print(len(G.nodes()))
# print(len(P.nodes()))
train_adjacency_matrix = nx.adjacency_matrix(G).todense()
# print(train_adjacency_matrix.shape[0], train_adjacency_matrix.shape[1])
# Zero-pad the training matrix so each row has one column per node in the full graph H
B = np.zeros((len(G.nodes()), (len(H.nodes()) - len(G.nodes()))))
# print(B.shape[0], B.shape[1])
train_adjacency_matrix = np.concatenate((train_adjacency_matrix, B), axis = 1)
# print(train_adjacency_matrix.shape[0], train_adjacency_matrix.shape[1])
test_adjacency_matrix = nx.adjacency_matrix(P).todense()
# print(test_adjacency_matrix.shape[0])
# Zero-pad the test matrix the same way
B = np.zeros((len(P.nodes()), (len(H.nodes()) - len(P.nodes()))))
# print(B.shape[0], B.shape[1])
test_adjacency_matrix = np.concatenate((test_adjacency_matrix, B), axis = 1)
# print(test_adjacency_matrix.shape[0], test_adjacency_matrix.shape[1])
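# Caveat: nx.adjacency_matrix orders columns by each graph's own node order,
# so column k of the train and test matrices need not refer to the same node.
# One way to keep feature columns aligned (a sketch, not the original setup):
# node_order = sorted(H.nodes())
# aligned = nx.adjacency_matrix(H, nodelist = node_order).todense()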
d_labels = dict()
for i in range(0, num_nodes):
    d_labels[i] = []
with open('bet_cen_enron_data.csv', 'r') as infile:
    for line in infile.readlines():
        token = line.strip().split(" ")
        d_labels[int(token[0])].append(int(token[1]))
# print(d_labels)
y_train = [v for k, v in d_labels.items() if k in G.nodes()]
y_test = [d_labels[i] for i in P.nodes()]
# print(y_train[10])
y_train = np.array(y_train)
y_test = np.array(y_test)
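# Illustrative pre-training checks: the network below expects one binary label
# per node, so each row of y_train/y_test should be a single 0 or 1.
# print(y_train.shape, y_test.shape)
# print(np.unique(y_train))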
classifier = Sequential()
# First hidden layer; input width matches the padded adjacency matrix
classifier.add(Dense(units = 5711, kernel_initializer = 'uniform', activation = 'relu', input_dim = train_adjacency_matrix.shape[1]))
# Second hidden layer
# classifier.add(Dense(units = 11703, kernel_initializer = 'uniform', activation = 'relu'))
# Output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
# Compiling the model
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Training
classifier.fit(train_adjacency_matrix, y_train, batch_size = 10, epochs = 100)
scores = classifier.evaluate(train_adjacency_matrix, y_train)
print("Train scores: " + classifier.metrics_names[1] + " = " + str(scores[1] * 100))
# Prediction
y_pred = classifier.predict(test_adjacency_matrix)
y_pred = (y_pred > 0.5)
scores = classifier.evaluate(test_adjacency_matrix, y_test)
print("Test scores: " + classifier.metrics_names[1] + " = " + str(scores[1] * 100))