# encoding:utf-8

import numpy as np

# Load the vocabulary list and the pre-trained word vectors.
wordsList = np.load('wordsList.npy')
print('Loaded word list')
wordsList = wordsList.tolist()
wordVectors = np.load('wordVectors.npy')
print('Loaded word vectors')

print(len(wordsList))
print(wordVectors.shape)
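
# Sanity-check sketch (the word 'good' is just an assumed example token):
# each row of wordVectors is the embedding of the word at the same index
# in wordsList.
if 'good' in wordsList:
    print(wordVectors[wordsList.index('good')].shape)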

import os
from os.path import isfile, join

# Collect the positive and negative review files and count the words in each.
pos_files = ['pos/' + f for f in os.listdir('pos/') if isfile(join('pos/', f))]
neg_files = ['neg/' + f for f in os.listdir('neg/') if isfile(join('neg/', f))]
num_words = []
for pf in pos_files:
    with open(pf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished positive reviews')

for nf in neg_files:
    with open(nf, "r", encoding='utf-8') as f:
        line = f.readline()
        counter = len(line.split())
        num_words.append(counter)
print('Finished negative reviews')

num_files = len(num_words)
print('Total number of files', num_files)
print('Total number of words', sum(num_words))
print('Average words per file', sum(num_words) / len(num_words))

import re

strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
num_dimensions = 300  # Dimensions for each word vector


def cleanSentences(string):
    # Replace HTML line breaks with spaces, then drop everything that is
    # not a letter, digit, or space.
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())
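
# Example (sketch): cleanSentences("Great movie!<br />10/10")
# returns "great movie 1010".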


max_seq_num = 250

# ... (intermediate section not shown in this diff) ...

batch_size = 24
lstm_units = 64
num_labels = 2
iterations = 100
lr = 0.001
ids = np.load('idsMatrix.npy')

from random import randint  # required by the batch helpers below


def get_train_batch():
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        # Even slots draw a positive review, odd slots a negative one,
        # keeping each training batch balanced.
        if i % 2 == 0:
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels


def get_test_batch():
    # Rows 11499-13499 of the ids matrix are held out for testing; the
    # first half are positive reviews, the rest negative.
    labels = []
    arr = np.zeros([batch_size, max_seq_num])
    for i in range(batch_size):
        num = randint(11499, 13499)
        if num <= 12499:
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num - 1:num]
    return arr, labels
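
# Shape note (sketch): both helpers return (arr, labels), where arr has
# shape (batch_size, max_seq_num) of word ids and labels is a list of
# batch_size one-hot [pos, neg] pairs.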


import tensorflow as tf

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, num_labels])
# The next two lines fall in a gap between diff hunks and are reconstructed
# from context (assumed): the int32 token-index placeholder and the
# `data = tf.Variable(` opener for the dangling tf.zeros() continuation.
input_data = tf.placeholder(tf.int32, [batch_size, max_seq_num])
data = tf.Variable(
    tf.zeros([batch_size, max_seq_num, num_dimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.5)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Classify from the final hidden state of the sequence.
weight = tf.Variable(tf.truncated_normal([lstm_units, num_labels]))
bias = tf.Variable(tf.constant(0.1, shape=[num_labels]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

saver = tf.train.Saver()

with tf.Session() as sess:
    # Restore a saved model if one exists; otherwise initialize from scratch.
    if os.path.exists("models") and os.path.exists("models/checkpoint"):
        saver.restore(sess, tf.train.latest_checkpoint('models'))
    else:
        # tf.initialize_all_variables() is only needed on builds older than 0.12.
        if int(tf.__version__.split('.')[0]) < 1 and int(tf.__version__.split('.')[1]) < 12:
            init = tf.initialize_all_variables()
        else:
            init = tf.global_variables_initializer()
        sess.run(init)

    iterations = 100
    for step in range(iterations):
        next_batch, next_batch_labels = get_test_batch()
        if step % 20 == 0:
            print("step:", step, " accuracy:", (sess.run(
                accuracy, {input_data: next_batch, labels: next_batch_labels})) * 100)

    if not os.path.exists("models"):
        os.mkdir("models")
    save_path = saver.save(sess, "models/model.ckpt")
    print("Model saved in path: %s" % save_path)