NLP Convolutional Neural Network
from __future__ import print_function  # must come before any other statement in the file
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy as np
from random import shuffle
class LabeledLineSentence(object):
    """Stream LabeledSentence objects from a dict of {file_path: tag_prefix}."""
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        # make sure that prefixes are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        # materialize every sentence in memory so the corpus can be reshuffled between epochs
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        # return the corpus in a fresh random order
        shuffle(self.sentences)
        return self.sentences
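# Each entry maps a one-review-per-line text file (the author's local files)
# to a unique tag prefix used to label its documents.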
sources = {'train-neg.txt': 'TRAIN_NEG', 'train-pos.txt': 'TRAIN_POS',
           'train-unsup.txt': 'TRAIN_UNS', 'test-pos.txt': 'TEST_POS',
           'test-neg2.txt': 'TEST_NEG'}
sentences = LabeledLineSentence(sources)
sentences
model = Doc2Vec(min_count=1, window=5, size=16, sample=1e-4, negative=5, workers=8)  # size=16 so each vector reshapes to a 4x4 grid below
model.build_vocab(sentences.to_array())
for epoch in range(16):
    # reshuffle the documents on every pass; this positional-argument form of
    # train() matches the gensim 0.x API used throughout this script
    model.train(sentences.sentences_perm())
model.save('./imdb.d2v')
model = Doc2Vec.load('./imdb.d2v')
model
model.most_similar('good')
#### ERROR HERE
sentences.to_array()
model.docvecs['TRAIN_POS_0']
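# A minimal sketch (not in the original): the trained model can also embed
# unseen text via infer_vector; 'new_review' is a hypothetical example.
new_review = utils.to_unicode('a surprisingly good movie').split()
new_review_vector = model.infer_vector(new_review)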
train_arrays = np.zeros((8, 16))
train_labels = np.zeros(8)
# assuming 4 documents per class, as the 8-row arrays suggest:
# positive vectors fill rows 0-3, negative vectors rows 4-7
# (the original loop indexed rows i and 1 + i, overwriting each other)
for i in range(4):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[4 + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[4 + i] = 0
test_arrays = np.zeros((8, 16))
test_labels = np.zeros(8)
# same layout as the training set; the original loop never filled the
# TEST_NEG rows at all
for i in range(4):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model.docvecs[prefix_test_pos]
    test_arrays[4 + i] = model.docvecs[prefix_test_neg]
    test_labels[i] = 1
    test_labels[4 + i] = 0
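# At this point the feature matrices are (8, 16): rows 0-3 hold positive
# examples (label 1) and rows 4-7 hold negative examples (label 0).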
np.random.seed(1337) # for reproducibility
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils.np_utils import to_categorical
from keras import backend as K
batch_size = 1
nb_classes = 2        # positive vs. negative
nb_epoch = 100
nb_filters = 4        # convolutional feature maps
pool_size = (2, 2)    # max-pooling window
kernel_size = (2, 2)  # convolution kernel
# each 16-dimensional document vector becomes a single-channel 4x4 "image"
X_train = train_arrays.astype('float32')
X_train = X_train.reshape(8, 1, 4, 4)
X_test = test_arrays.astype('float32')
X_test = X_test.reshape(8, 1, 4, 4)
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
Y_train = to_categorical(train_labels)
Y_test = to_categorical(test_labels)
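# to_categorical one-hot encodes the labels into (8, 2) matrices, the
# target format expected by categorical_crossentropy.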
K.set_image_dim_ordering('th')  # Theano-style ordering: (channels, rows, cols)
model = Sequential()
model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                        border_mode='valid',
                        input_shape=(1, 4, 4)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=pool_size))
model.add(Flatten())
model.add(Dense(nb_classes))
model.add(Activation('softmax'))  # softmax pairs with categorical_crossentropy below
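# Optional sanity check (not in the original): print layer output shapes.
model.summary()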
from keras.optimizers import SGD
sgd = SGD(lr=0.01, decay=1e-4, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,  # pass the configured instance; the string 'sgd' would ignore it
              metrics=['accuracy'])
model.fit(X_train, Y_train,
          batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=1)
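# Alternative check (a sketch, not in the original script): let Keras compute
# test loss and accuracy directly on the held-out one-hot targets.
loss, acc = model.evaluate(X_test, Y_test, verbose=0)
print('evaluate() accuracy:', acc)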
print('Test accuracy:', np.mean(model.predict_classes(X_test) == test_labels))
print(model.predict_classes(X_test))
print(test_labels)