#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# This code is basically a supplementary material of the book
# "深層学習による自然言語処理 (機械学習プロフェッショナルシリーズ)
# ISBN-10: 4061529242".
# This is why the comments in this code were originally written in Japanese,
# and why everything is kept in a single file.
# Based on seq2seq-attn (and OpenNMT)
# Tested with:
# * Python 2.7 and 3.5/3.6
# * Chainer v1.24 and v2.0
# * Installing bottleneck is recommended:
#   https://pypi.python.org/pypi/Bottleneck
#
# * How to use the samples
# The sample data can be obtained from the WMT16 page:
#   http://www.statmt.org/wmt16/translation-task.html
# Example of preparing the vocab files:
for f in sample_data/newstest2012-4p.{en,de} ;do \
echo ${f} ; \
cat ${f} | sed '/^$/d' | perl -pe 's/^\s+//; s/\s+\n$/\n/; s/ +/\n/g' | \
LC_ALL=C sort | LC_ALL=C uniq -c | LC_ALL=C sort -r -g -k1 | \
perl -pe 's/^\s+//; ($a1,$a2)=split;
if( $a1 >= 3 ){ $_="$a2\t$a1\n" }else{ $_="" } ' > ${f}.vocab_t3_tab ;\
done
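# (The pipeline above counts whitespace-separated tokens and keeps only
#  those occurring at least 3 times, writing one "word<TAB>count" line each.)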
# Training (in an environment without a GPU, run with GPU=-1 below)
SLAN=de; TLAN=en; GPU=0; EP=13 ; \
MODEL=filename_of_sample_model.model ; \
python -u ./LSTMEncDecAttn.py -V2 \
-T train \
--gpu-enc ${GPU} \
--gpu-dec ${GPU} \
--enc-vocab-file sample_data/newstest2012-4p.${SLAN}.vocab_t3_tab \
--dec-vocab-file sample_data/newstest2012-4p.${TLAN}.vocab_t3_tab \
--enc-data-file sample_data/newstest2012-4p.${SLAN} \
--dec-data-file sample_data/newstest2012-4p.${TLAN} \
--enc-devel-data-file sample_data/newstest2015.h100.${SLAN} \
--dec-devel-data-file sample_data/newstest2015.h100.${TLAN} \
-D 512 \
-H 512 \
-N 2 \
--optimizer SGD \
--lrate 1.0 \
--batch-size 32 \
--out-each 0 \
--epoch ${EP} \
--eval-accuracy 0 \
--dropout-rate 0.3 \
--attention-mode 1 \
--gradient-clipping 5 \
--initializer-scale 0.1 \
--initializer-type uniform \
--merge-encoder-fwbw 0 \
--use-encoder-bos-eos 0 \
--use-decoder-inputfeed 1 \
-O ${MODEL}
# Evaluation (decoding)
SLAN=de; GPU=0; EP=13 ; BEAM=5 ; MAXLEN=150 ; \
MODEL=filename_of_sample_model.model ; \
python -u ./LSTMEncDecAttn.py \
-T test \
--gpu-enc ${GPU} \
--gpu-dec ${GPU} \
--enc-data-file sample_data/newstest2015.h101-200.${SLAN} \
--init-model ${MODEL}.epoch${EP} \
--setting ${MODEL}.setting \
--beam-size ${BEAM} \
--max-length ${MAXLEN} \
> ${MODEL}.epoch${EP}.decode_MAX${MAXLEN}_BEAM${BEAM}.txt
'''
import sys
import collections
import six.moves.cPickle as pickle
import copy
import numpy as np
import bottleneck as bn
import argparse
import time
import random
import math
import six
import io
# import codecs
import chainer
import chainer.functions as chaFunc
import chainer.optimizers as chaOpt
import chainer.links as chaLink
import chainer.serializers as chaSerial
from chainer import cuda
# Helper for efficiently obtaining the gradient norm and the like.
# Only needed for log output; otherwise this class is unnecessary.
class Chainer_GradientClipping_rmk_v1(chainer.optimizer.GradientClipping):
name = 'GradientClipping'
def __init__(self, threshold):
self.threshold = threshold
self.rate = 1
self.norm = 1
self.norm_orig = 1
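    # Standard norm-based clipping: when the global L2 norm of all gradients
    # exceeds `threshold`, every gradient is rescaled by threshold / norm.
    # norm_orig, norm and rate are kept as attributes purely for logging.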
def __call__(self, opt):
self.norm_orig = np.sqrt(chainer.optimizer._sum_sqnorm(
[p.grad for p in opt.target.params()]))
self.norm = self.norm_orig
self.rate = self.threshold / self.norm_orig
if self.rate < 1:
for param in opt.target.params():
grad = param.grad
with cuda.get_device(grad):
grad *= self.rate
self.norm = self.threshold
# The number of LSTM layers should be configurable, so this class absorbs
# the variable layer count.
# Used here mainly to build the decoder-side LSTM.
class NLayerLSTM(chainer.ChainList):
def __init__(self, n_layers=2, eDim=512, hDim=512, name=""):
        layers = [0] * n_layers  # reserve one slot per layer
        for z in six.moves.range(n_layers):
            if z == 0:  # the first layer's input dimension is eDim
                tDim = eDim
            else:  # later layers take the previous layer's output, so hDim
                tDim = hDim
            layers[z] = chaLink.LSTM(tDim, hDim)
            # name the parameters so they are easy to identify in the logs
            layers[z].lateral.W.name = name + "_L%d_la_W" % (z + 1)
            layers[z].upward.W.name = name + "_L%d_up_W" % (z + 1)
            layers[z].upward.b.name = name + "_L%d_up_b" % (z + 1)
super(NLayerLSTM, self).__init__(*layers)
    # run every layer's LSTM for exactly one step
def processOneStepForward(self, input_states, args, dropout_rate):
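        # Note: Chainer v2 removed the `train` argument from F.dropout()
        # (train/test behaviour is read from chainer.global_config instead),
        # hence the version switch below.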
hout = None
        for c, layer in enumerate(self):
            if c > 0:  # inputs to the second and later layers get dropout
                if args.chainer_version_check[0] == 2:
                    hin = chaFunc.dropout(hout, ratio=dropout_rate)
                else:
                    hin = chaFunc.dropout(
                        hout, train=args.dropout_mode, ratio=dropout_rate)
            else:  # the first layer's (embedding) input gets no dropout
                hin = input_states
hout = layer(hin)
return hout
    # reset all layers at once
def reset_state(self):
for layer in self:
layer.reset_state()
    # Used mainly to hand information from the encoder to the decoder, and
    # to stash the states when beam search cannot run the LSTM consecutively
def getAllLSTMStates(self):
lstm_state_list_out = [0] * len(self) * 2
for z in six.moves.range(len(self)):
lstm_state_list_out[2 * z] = self[z].c
lstm_state_list_out[2 * z + 1] = self[z].h
        # stack everything into a single chainer Variable for easier handling
return chaFunc.stack(lstm_state_list_out)
    # counterpart to getAllLSTMStates above: restores the saved states
def setAllLSTMStates(self, lstm_state_list_in):
for z in six.moves.range(len(self)):
self[z].c = lstm_state_list_in[2 * z]
self[z].h = lstm_state_list_in[2 * z + 1]
# Built-in NStepLSTM adapted to what we need (uses cuDNN for speed)
class NStepLSTMpp(chainer.ChainList):
    def __init__(self, n_layers,  # number of layers
                 in_size,   # input dimension of the first layer
                 out_size,  # output dimension (also the input dimension
                            # of the second and later layers)
                 dropout_rate,
                 name="",
                 use_cudnn=True):
        weights = []
        direction = 1  # always built one direction at a time here, so 1
        t_name = name
        if name != "":
            t_name = '%s_' % (name)
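        # Each layer holds 8 weight matrices and 8 biases, following the
        # convention of chainer.functions.n_step_lstm(): w0..w3 act on the
        # layer input and w4..w7 on the previous hidden state, one pair per
        # LSTM gate (input, forget, cell update, output).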
for i in six.moves.range(n_layers):
for di in six.moves.range(direction):
weight = chainer.Link()
for j in six.moves.range(8):
if i == 0 and j < 4:
w_in = in_size
elif i > 0 and j < 4:
w_in = out_size * direction
else:
w_in = out_size
weight.add_param('%sw%d' % (t_name, j), (out_size, w_in))
weight.add_param('%sb%d' % (t_name, j), (out_size,))
getattr(weight, '%sw%d' %
(t_name, j)).data[...] = np.random.normal(
0, np.sqrt(1. / w_in), (out_size, w_in))
getattr(weight, '%sb%d' % (t_name, j)).data[...] = 0
weights.append(weight)
super(NStepLSTMpp, self).__init__(*weights)
self.n_layers = n_layers
self.dropout_rate = dropout_rate
self.use_cudnn = use_cudnn
self.out_size = out_size
self.direction = direction
self.ws = [[getattr(w, '%sw0' % (t_name)),
getattr(w, '%sw1' % (t_name)),
getattr(w, '%sw2' % (t_name)),
getattr(w, '%sw3' % (t_name)),
getattr(w, '%sw4' % (t_name)),
getattr(w, '%sw5' % (t_name)),
getattr(w, '%sw6' % (t_name)),
getattr(w, '%sw7' % (t_name))] for w in self]
self.bs = [[getattr(w, '%sb0' % (t_name)),
getattr(w, '%sb1' % (t_name)),
getattr(w, '%sb2' % (t_name)),
getattr(w, '%sb3' % (t_name)),
getattr(w, '%sb4' % (t_name)),
getattr(w, '%sb5' % (t_name)),
getattr(w, '%sb6' % (t_name)),
getattr(w, '%sb7' % (t_name))] for w in self]
def init_hx(self, xs):
hx_shape = self.n_layers * self.direction
with cuda.get_device_from_id(self._device_id):
if args.chainer_version_check[0] == 2:
hx = chainer.Variable(
self.xp.zeros((hx_shape, xs.data.shape[1], self.out_size),
dtype=xs.dtype))
else:
hx = chainer.Variable(
self.xp.zeros((hx_shape, xs.data.shape[1], self.out_size),
dtype=xs.dtype),
volatile='auto')
return hx
def __call__(self, hx, cx, xs, flag_train, args):
if hx is None:
hx = self.init_hx(xs)
if cx is None:
cx = self.init_hx(xs)
        # hx, cx: tensors of shape (n_layers, minibatch size, output dim)
        # xs: tensor of shape (sequence length, minibatch size, output dim)
        # Note: chaFunc.n_step_lstm() applies no dropout to the first input
if args.chainer_version_check[0] == 2:
hy, cy, ys = chaFunc.n_step_lstm(
self.n_layers, self.dropout_rate, hx, cx, self.ws, self.bs, xs)
else:
hy, cy, ys = chaFunc.n_step_lstm(
self.n_layers, self.dropout_rate, hx, cx, self.ws, self.bs, xs,
train=flag_train, use_cudnn=self.use_cudnn)
        # hy, cy come out as (n_layers, minibatch size, output dim)
        # ys holds only the top hidden layer: a tuple of length seq_len
        # whose elements have shape (minibatch size, output dim)
        # stack the tuple into one chainer.Variable for easier handling:
        # a tensor of shape (sequence length, minibatch size, output dim)
hlist = chaFunc.stack(ys)
return hy, cy, hlist
# The number of LSTM layers should be configurable, so this class absorbs
# the variable layer count.
class NLayerCuLSTM(chainer.ChainList):
    def __init__(self, n_layers, eDim, hDim, name=""):
        layers = [0] * n_layers
        for z in six.moves.range(n_layers):
            if name != "":  # assign a per-layer name
                t_name = '%s_L%d' % (name, z + 1)
            else:
                t_name = ""
            # build one layer's worth of NStepLSTM each iteration
            if z == 0:
                tDim = eDim
            else:
                tDim = hDim
            # dropout is applied manually outside, so fix the rate to 0 here
            layers[z] = NStepLSTMpp(1, tDim, hDim, dropout_rate=0.0,
                                    name=t_name)
super(NLayerCuLSTM, self).__init__(*layers)
    # run the layer specified by layer_num over the whole input_state_list
def __call__(self, layer_num, input_state_list, flag_train,
dropout_rate, args):
        # Note: chaFunc.n_step_lstm() applies no dropout to the first input,
        # so when layers are built one by one, dropout must be added manually
if layer_num > 0:
if args.chainer_version_check[0] == 2:
hin = chaFunc.dropout(input_state_list, ratio=dropout_rate)
else:
hin = chaFunc.dropout(input_state_list,
train=args.dropout_mode,
ratio=dropout_rate)
else:
hin = input_state_list
        # process layer layer_num over all time steps at once
hy, cy, hout = self[layer_num](None, None, hin, flag_train, args)
return hy, cy, hout
# Main body of the encoder-decoder
class EncoderDecoderAttention:
def __init__(self, encoderVocab, decoderVocab, setting):
        self.encoderVocab = encoderVocab  # encoder-side vocabulary
        self.decoderVocab = decoderVocab  # decoder-side vocabulary
        # reverse dictionaries mapping word IDs back to words
        self.index2encoderWord = {
            v: k for k, v in six.iteritems(
                self.encoderVocab)}  # not strictly necessary
        self.index2decoderWord = {
            v: k for k, v in six.iteritems(
                self.decoderVocab)}  # used by the decoder
self.eDim = setting.eDim
self.hDim = setting.hDim
self.flag_dec_ifeed = setting.flag_dec_ifeed
self.flag_enc_boseos = setting.flag_enc_boseos
self.attn_mode = setting.attn_mode
self.flag_merge_encfwbw = setting.flag_merge_encfwbw
self.encVocabSize = len(encoderVocab)
self.decVocabSize = len(decoderVocab)
self.n_layers = setting.n_layers
    # builds the encoder-decoder network
def initModel(self):
sys.stderr.write(
('Vocab: enc=%d dec=%d embedDim: %d, hiddenDim: %d, '
'n_layers: %d # [Params] dec inputfeed [%d] '
'| use Enc BOS/EOS [%d] | attn mode [%d] '
'| merge Enc FWBW [%d]\n'
% (self.encVocabSize, self.decVocabSize, self.eDim, self.hDim,
self.n_layers, self.flag_dec_ifeed,
self.flag_enc_boseos, self.attn_mode,
self.flag_merge_encfwbw)))
        self.model = chainer.Chain(
            # encoder embedding layer
            encoderEmbed=chaLink.EmbedID(self.encVocabSize, self.eDim),
            # decoder embedding layer
            decoderEmbed=chaLink.EmbedID(self.decVocabSize, self.eDim,
                                         ignore_label=-1),
            # output layer
            decOutputL=chaLink.Linear(self.hDim, self.decVocabSize),
        )
        # naming for easier identification in the logs; optional
self.model.encoderEmbed.W.name = "encoderEmbed_W"
self.model.decoderEmbed.W.name = "decoderEmbed_W"
self.model.decOutputL.W.name = "decoderOutput_W"
self.model.decOutputL.b.name = "decoderOutput_b"
if self.flag_merge_encfwbw == 0: # default
self.model.add_link(
"encLSTM_f",
NStepLSTMpp(self.n_layers, self.eDim, self.hDim,
args.dropout_rate, name="encBiLSTMpp_fw"))
self.model.add_link(
"encLSTM_b",
NStepLSTMpp(self.n_layers, self.eDim, self.hDim,
args.dropout_rate, name="encBiLSTMpp_bk"))
elif self.flag_merge_encfwbw == 1:
self.model.add_link(
"encLSTM_f",
NLayerCuLSTM(self.n_layers, self.eDim, self.hDim,
"encBiLSTM_fw"))
self.model.add_link(
"encLSTM_b",
NLayerCuLSTM(self.n_layers, self.eDim, self.hDim,
"encBiLSTM_bk"))
else:
assert 0, "ERROR"
        # the input dimension differs depending on the input-feeding mode
        if self.flag_dec_ifeed == 0:  # no input feeding
            decLSTM_indim = self.eDim
        elif self.flag_dec_ifeed == 1:  # use input feeding (default)
            decLSTM_indim = self.eDim + self.hDim
        # elif self.flag_dec_ifeed == 2:  # no input embedding (for debugging)
        #     decLSTM_indim = self.hDim
        else:
            assert 0, "ERROR"
self.model.add_link(
"decLSTM",
NLayerLSTM(self.n_layers, decLSTM_indim, self.hDim, "decLSTM_fw"))
        # the model structure differs depending on the attention mode
if self.attn_mode > 0: # attn_mode == 1 or 2
self.model.add_link(
"attnIn_L1",
chaLink.Linear(self.hDim, self.hDim, nobias=True))
self.model.add_link(
"attnOut_L2",
chaLink.Linear(self.hDim + self.hDim, self.hDim, nobias=True))
self.model.attnIn_L1.W.name = "attnIn_W"
self.model.attnOut_L2.W.name = "attnOut_W"
#
if self.attn_mode == 2: # attention == MLP
self.model.add_link(
"attnM",
chaLink.Linear(self.hDim, self.hDim, nobias=True))
self.model.add_link(
"attnSum",
chaLink.Linear(self.hDim, 1, nobias=True))
self.model.attnM.W.name = "attnM_W"
self.model.attnSum.W.name = "attnSum_W"
#######################################
    # place each network component on GPU memory
def setToGPUs(self, args):
if args.gpu_enc >= 0 and args.gpu_dec >= 0:
sys.stderr.write(
'# Working on GPUs [gpu_enc=%d][gpu_dec=%d]\n' %
(args.gpu_enc, args.gpu_dec))
            if not args.flag_emb_cpu:  # keep on CPU memory if so requested
                self.model.encoderEmbed.to_gpu(args.gpu_enc)
            self.model.encLSTM_f.to_gpu(args.gpu_enc)
            self.model.encLSTM_b.to_gpu(args.gpu_enc)
            if not args.flag_emb_cpu:  # keep on CPU memory if so requested
                self.model.decoderEmbed.to_gpu(args.gpu_dec)
self.model.decLSTM.to_gpu(args.gpu_dec)
self.model.decOutputL.to_gpu(args.gpu_dec)
if self.attn_mode > 0:
self.model.attnIn_L1.to_gpu(args.gpu_dec)
self.model.attnOut_L2.to_gpu(args.gpu_dec)
if self.attn_mode == 2:
self.model.attnSum.to_gpu(args.gpu_dec)
self.model.attnM.to_gpu(args.gpu_dec)
else:
sys.stderr.write(
'# NO GPUs [gpu_enc=%d][gpu_dec=%d]\n' %
(args.gpu_enc, args.gpu_dec))
#######################################
def setInitAllParameters(self, optimizer, init_type="default",
init_scale=0.1):
sys.stdout.write("############ Current Parameters BEGIN\n")
self.printAllParameters(optimizer)
sys.stdout.write("############ Current Parameters END\n")
if init_type == "uniform":
sys.stdout.write(
"# initializer is [uniform] [%f]\n" %
(init_scale))
t_initializer = chainer.initializers.Uniform(init_scale)
named_params = sorted(
optimizer.target.namedparams(),
key=lambda x: x[0])
for n, p in named_params:
with cuda.get_device(p.data):
if args.chainer_version_check[0] == 2:
p.copydata(chainer.Parameter(
t_initializer, p.data.shape))
else:
chainer.initializers.init_weight(p.data, t_initializer)
elif init_type == "normal":
sys.stdout.write("# initializer is [normal] [%f]\n" % (init_scale))
t_initializer = chainer.initializers.Normal(init_scale)
named_params = sorted(
optimizer.target.namedparams(),
key=lambda x: x[0])
for n, p in named_params:
with cuda.get_device(p.data):
if args.chainer_version_check[0] == 2:
p.copydata(chainer.Parameter(
t_initializer, p.data.shape))
else:
chainer.initializers.init_weight(p.data, t_initializer)
else: # "default"
            sys.stdout.write(
                "# initializer is [default] [%f]\n" %
                (init_scale))
named_params = sorted(
optimizer.target.namedparams(),
key=lambda x: x[0])
for n, p in named_params:
with cuda.get_device(p.data):
p.data *= args.init_scale
self.printAllParameters(optimizer, init_type, init_scale)
return 0
def printAllParameters(self, optimizer, init_type="***", init_scale=1.0):
total_norm = 0
total_param = 0
named_params = sorted(
optimizer.target.namedparams(),
key=lambda x: x[0])
for n, p in named_params:
t_norm = chainer.optimizer._sum_sqnorm(p.data)
sys.stdout.write(
'### {} {} {} {} {}\n'.format(
p.name, p.data.ndim, p.data.shape, p.data.size, t_norm))
total_norm += t_norm
total_param += p.data.size
with cuda.get_device(total_norm):
sys.stdout.write(
'# param size= [{}] norm = [{}] scale=[{}, {}]\n'.format(
total_param, self.model.xp.sqrt(total_norm),
init_type, init_scale))
###############################################
    # class that merely holds encoder information, mainly so that
    # backward can be run piecewise
class encInfoObject:
def __init__(self, finalHiddenVars, finalLSTMVars, encLen, cMBSize):
self.attnList = finalHiddenVars
self.lstmVars = finalLSTMVars
self.encLen = encLen
self.cMBSize = cMBSize
###############################################
    # obtain the encoder-side input embeddings
    def getEncoderInputEmbeddings(self, input_idx_list, args):
        # look up embeddings for the whole sentence at once
        # (presumably more efficient)
if args.flag_emb_cpu and args.gpu_enc >= 0:
encEmbList = chaFunc.copy(
self.model.encoderEmbed(chainer.Variable(input_idx_list)),
args.gpu_enc)
else:
xp = cuda.get_array_module(self.model.encoderEmbed.W.data)
encEmbList = self.model.encoderEmbed(
chainer.Variable(xp.array(input_idx_list)))
return encEmbList
    # obtain the decoder-side embeddings;
    # nearly identical to getEncoderInputEmbeddings above
def getDecoderInputEmbeddings(self, input_idx_list, args):
if args.flag_emb_cpu and args.gpu_dec >= 0:
decEmbList = chaFunc.copy(
self.model.decoderEmbed(chainer.Variable(input_idx_list)),
args.gpu_dec)
else:
xp = cuda.get_array_module(self.model.decoderEmbed.W.data)
decEmbList = self.model.decoderEmbed(
chainer.Variable(xp.array(input_idx_list)))
return decEmbList
    # process the encoder-side input
    def encodeSentenceFWD(self, train_mode, sentence, args, dropout_rate):
        if args.gpu_enc != args.gpu_dec:  # enc and dec on different GPUs
            chainer.cuda.get_device(args.gpu_enc).use()
        encLen = len(sentence)  # sentence length
        cMBSize = len(sentence[0])  # minibatch size
        # look up embeddings for the whole sentence at once
        # (presumably more efficient)
        encEmbList = self.getEncoderInputEmbeddings(sentence, args)
flag_train = (train_mode > 0)
lstmVars = [0] * self.n_layers * 2
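        # lstmVars interleaves cell and hidden state per layer,
        # [c_1, h_1, c_2, h_2, ...]; the decoder side assumes the same
        # layout in NLayerLSTM.setAllLSTMStates()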
        if self.flag_merge_encfwbw == 0:  # merge fw/bw only at the end
            hyf, cyf, fwHout = self.model.encLSTM_f(
                None, None, encEmbList, flag_train, args)  # forward
            hyb, cyb, bkHout = self.model.encLSTM_b(
                None, None, encEmbList[::-1], flag_train, args)  # backward
            for z in six.moves.range(self.n_layers):
                lstmVars[2 * z] = cyf[z] + cyb[z]
                lstmVars[2 * z + 1] = hyf[z] + hyb[z]
        elif self.flag_merge_encfwbw == 1:  # merge fw/bw at every layer
            sp = (cMBSize, self.hDim)
            for z in six.moves.range(self.n_layers):
                if z == 0:  # first layer: use the embeddings
                    biH = encEmbList
                else:  # later layers: use the previous layer's output
                    # restore bkHout to the original order before adding
                    biH = fwHout + bkHout[::-1]
                # layer z, forward direction
                hyf, cyf, fwHout = self.model.encLSTM_f(
                    z, biH, flag_train, dropout_rate, args)
                # layer z, backward direction
                hyb, cyb, bkHout = self.model.encLSTM_b(
                    z, biH[::-1], flag_train, dropout_rate, args)
                # keep each layer's hidden state and memory cell so they
                # can be handed to the decoder
                lstmVars[2 * z] = chaFunc.reshape(cyf + cyb, sp)
                lstmVars[2 * z + 1] = chaFunc.reshape(hyf + hyb, sp)
else:
assert 0, "ERROR"
        # top hidden layer
        if self.flag_enc_boseos == 0:  # default
            # slicing fwHout with [:, ] appears necessary to avoid an error
            biHiddenStack = fwHout[:, ] + bkHout[::-1]
        elif self.flag_enc_boseos == 1:
            bkHout2 = bkHout[::-1]  # restore the original order
            biHiddenStack = fwHout[1:encLen - 1, ] + bkHout2[1:encLen - 1, ]
            # shorten by the BOS/EOS tokens
            # TODO: zero-length input here probably raises an error
            encLen -= 2
        else:
            assert 0, "ERROR"
        # (enc length, minibatch size, hidden dim)
        # => (minibatch size, enc length, hidden dim)
        biHiddenStackSW01 = chaFunc.swapaxes(biHiddenStack, 0, 1)
        # collect each LSTM's final state as the decoder LSTM's initial state
        lstmVars = chaFunc.stack(lstmVars)
        # bundle all encoder information into an encInfoObject and return it
        retO = self.encInfoObject(biHiddenStackSW01, lstmVars, encLen, cMBSize)
return retO
def prepareDecoder(self, encInfo):
self.model.decLSTM.reset_state()
if self.attn_mode == 0:
aList = None
elif self.attn_mode == 1:
aList = encInfo.attnList
        elif self.attn_mode == 2:
            aList = self.model.attnM(
                chaFunc.reshape(encInfo.attnList,
                                (encInfo.cMBSize * encInfo.encLen, self.hDim)))
            # TODO: inefficient here; better moved to the encoder side
        else:
            assert 0, "ERROR"
        xp = cuda.get_array_module(encInfo.lstmVars[0].data)
        finalHS = chainer.Variable(
            xp.zeros(
                encInfo.lstmVars[0].data.shape,
                dtype=xp.float32))  # the first input feed is initialized to 0
return aList, finalHS
############################
    def trainOneMiniBatch(self, train_mode, decSent, encInfo,
                          args, dropout_rate):
        if args.gpu_enc != args.gpu_dec:  # enc and dec on different GPUs
            chainer.cuda.get_device(args.gpu_dec).use()
        cMBSize = encInfo.cMBSize
        aList, finalHS = self.prepareDecoder(encInfo)
        xp = cuda.get_array_module(encInfo.lstmVars[0].data)
        total_loss = chainer.Variable(xp.zeros((), dtype=xp.float32))  # init
        total_loss_val = 0  # float
        correct = 0
        incorrect = 0
        proc = 0
        decoder_proc = len(decSent) - 1  # decoder-side tokens processed here
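        # The decoder runs in three phases: (1) look up all target-side input
        # embeddings at once, (2) run the decoder RNN over every time step
        # while caching states, (3) apply the output/softmax layer to the
        # cached states and accumulate the loss (iterating in reverse, which
        # leaves the summed loss unchanged).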
#######################################################################
        # 1. fetch all decoder-side input word embeddings at once
        decEmbListCopy = self.getDecoderInputEmbeddings(
            decSent[:decoder_proc], args)
        decSent = xp.array(decSent)  # move onto the GPU
        #######################################################################
        # 2. run the decoder-side RNN over all time steps
        h4_list_copy = [0] * decoder_proc
        lstm_states_list_copy = [0] * decoder_proc
        for index in six.moves.range(decoder_proc):  # decoder_len - 1 steps
if index == 0:
t_lstm_states = encInfo.lstmVars
t_finalHS = finalHS
else:
t_lstm_states = lstm_states_list_copy[index - 1]
t_finalHS = h4_list_copy[index - 1]
            # run the decoder LSTM for one step
            hOut, lstm_states = self.processDecLSTMOneStep(
                decEmbListCopy[index], t_lstm_states,
                t_finalHS, args, dropout_rate)
            # cache lstm_states
            lstm_states_list_copy[index] = lstm_states
            # with attention enabled, compute the context vector
            finalHS = self.calcAttention(hOut, encInfo.attnList, aList,
                                         encInfo.encLen, cMBSize, args)
            # cache finalHS
            h4_list_copy[index] = finalHS
#######################################################################
        # 3. compute the output (softmax) layer
        for index in reversed(six.moves.range(decoder_proc)):
            # compute the final output layer from the states cached in step 2
            oVector = self.generateWord(h4_list_copy[index], encInfo.encLen,
                                        cMBSize, args, dropout_rate)
            # gold labels
            correctLabel = decSent[index + 1]  # xp array
            proc += (xp.count_nonzero(correctLabel + 1))
            # always divided by the minibatch size (normalize=False)
            closs = chaFunc.softmax_cross_entropy(
                oVector, correctLabel, normalize=False)
            # multiplying back by cMBSize gives the unnormalized loss,
            # cf. the seq2seq-attn code
            total_loss_val += closs.data * cMBSize
            if train_mode > 0:  # accumulate for backward only when training
                total_loss += closs
            # obtain the actual number of correct predictions
            t_correct = 0
            t_incorrect = 0
            # always evaluate on dev data; for training data follow the option
            if train_mode == 0 or args.doEvalAcc > 0:
                # array of predicted word IDs (CuPy)
                pred_arr = oVector.data.argmax(axis=1)
                # gold minus prediction is 0 wherever the prediction is
                # correct, so subtract the non-zero count from the total
                t_correct = (correctLabel.size -
                             xp.count_nonzero(correctLabel - pred_arr))
                # tokens needing prediction minus correct ones
                # (+1 is broadcast; labels of -1 are padding)
                t_incorrect = xp.count_nonzero(correctLabel + 1) - t_correct
            correct += t_correct
            incorrect += t_incorrect
        ####
        if train_mode > 0:  # backward only during training
            total_loss.backward()
return total_loss_val, (correct, incorrect, decoder_proc, proc)
    # one step of the decoder LSTM
    def processDecLSTMOneStep(self, decInputEmb, lstm_states_in,
                              finalHS, args, dropout_rate):
        # 1. set the RNN layers' hidden states
        #    (always set explicitly, to support beam search)
        self.model.decLSTM.setAllLSTMStates(lstm_states_in)
        # 2. fetch the word embedding and handle input feeding
        if self.flag_dec_ifeed == 0:  # no input feeding
            wembed = decInputEmb
        elif self.flag_dec_ifeed == 1:  # use input feeding (default)
            wembed = chaFunc.concat((finalHS, decInputEmb))
        # elif self.flag_dec_ifeed == 2:  # no decInputEmb (for debugging)
        #     wembed = finalHS
        else:
            assert 0, "ERROR"
        # 3. run all N RNN layers in one go
        h1 = self.model.decLSTM.processOneStepForward(
            wembed, args, dropout_rate)
        # 4. grab the LSTM states for the next time step
        lstm_states_out = self.model.decLSTM.getAllLSTMStates()
        return h1, lstm_states_out
    # attention computation
    def calcAttention(self, h1, hList, aList, encLen, cMBSize, args):
        # without attention, simply return the decoder hidden state h1
        if self.attn_mode == 0:
            return h1
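        # In Luong-style notation (attn_mode==1 appears to be the
        # bilinear/"general" score, attn_mode==2 an MLP score):
        #   score(h_t, h_s) = (W_a h_t)^T h_s                (mode 1)
        #   score(h_t, h_s) = v_a^T tanh(W_a h_t + U_a h_s)  (mode 2)
        #   a_t = softmax(score),  context = sum_s a_t(s) * h_s
        #   finalH = tanh(W_c [h_t; context])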
        # 1. preparation for the attention computation
        target1 = self.model.attnIn_L1(h1)  # first apply one linear transform
        # (cMBSize, self.hDim) => (cMBSize, 1, self.hDim)
        target2 = chaFunc.expand_dims(target1, axis=1)
        # (cMBSize, 1, self.hDim) => (cMBSize, encLen, self.hDim)
        target3 = chaFunc.broadcast_to(target2, (cMBSize, encLen, self.hDim))
        # target3 = chaFunc.broadcast_to(chaFunc.reshape(
        #    target1, (cMBSize, 1, self.hDim)), (cMBSize, encLen, self.hDim))
        # 2. compute scores according to the attention type
        if self.attn_mode == 1:  # bilinear
            # for bilinear attention, hList and aList are identical
            # shape: (cMBSize, encLen)
            aval = chaFunc.sum(target3 * aList, axis=2)
        elif self.attn_mode == 2:  # MLP
            # reshape so it can be fed to attnSum
            t1 = chaFunc.reshape(target3, (cMBSize * encLen, self.hDim))
            # (cMBSize*encLen, self.hDim) => (cMBSize*encLen, 1)
            t2 = self.model.attnSum(chaFunc.tanh(t1 + aList))
            # shape: (cMBSize, encLen)
            aval = chaFunc.reshape(t2, (cMBSize, encLen))
            # aval = chaFunc.reshape(self.model.attnSum(
            #    chaFunc.tanh(t1 + aList)), (cMBSize, encLen))
else:
assert 0, "ERROR"
        # 3. take the softmax
        cAttn1 = chaFunc.softmax(aval)  # (cMBSize, encLen)
        # 4. build the context vector from the attention weights
        # (cMBSize, encLen) => (cMBSize, 1, encLen)
        cAttn2 = chaFunc.expand_dims(cAttn1, axis=1)
        # a (1, encLen) x (encLen, hDim) matmul, repeated cMBSize times
        # => (cMBSize, 1, hDim)
        cAttn3 = chaFunc.batch_matmul(cAttn2, hList)
        # cAttn3 = chaFunc.batch_matmul(chaFunc.reshape(
        #    cAttn1, (cMBSize, 1, encLen)), hList)
        # squeeze out the size-1 dimension on axis=1
        context = chaFunc.reshape(cAttn3, (cMBSize, self.hDim))
        # 4'. the context vector can also be built this way:
        # (cMBSize, encLen) => (cMBSize, encLen, 1)
        # cAttn2 = chaFunc.reshape(cAttn1, (cMBSize, encLen, 1))
        # (cMBSize, encLen, 1) => (cMBSize, encLen, hDim)
        # cAttn3 = chaFunc.broadcast_to(cAttn2, (cMBSize, encLen, self.hDim))
        # weighted sum over axis=1: (cMBSize, encLen, hDim) => (cMBSize, hDim)
        # context = chaFunc.sum(aList * cAttn3, axis=1)
        # 5. compute the final hidden state under attention
        c1 = chaFunc.concat((h1, context))
        c2 = self.model.attnOut_L2(c1)
        finalH = chaFunc.tanh(c2)
        # finalH = chaFunc.tanh(self.model.attnOut_L2(
        #    chaFunc.concat((h1, context))))
        return finalH  # context
    # output layer computation
def generateWord(self, h4, encLen, cMBSize, args, dropout_rate):
if args.chainer_version_check[0] == 2:
oVector = self.model.decOutputL(
chaFunc.dropout(h4, ratio=dropout_rate))
else:
oVector = self.model.decOutputL(chaFunc.dropout(
h4, train=args.dropout_mode, ratio=dropout_rate))
return oVector
########################################################
# collection of helper functions for loading data
class PrepareData:
def __init__(self, setting):
self.flag_enc_boseos = setting.flag_enc_boseos
################################################
    def readVocab(self, vocabFile):  # expected to be called only for training
        d = {}
        d.setdefault('<unk>', len(d))  # fixed at index 0
        sys.stdout.write('# Vocab: add <unk> | id={}\n'.format(d['<unk>']))
        d.setdefault('<s>', len(d))  # fixed at index 1
        sys.stdout.write('# Vocab: add <s> | id={}\n'.format(d['<s>']))
        d.setdefault('</s>', len(d))  # fixed at index 2
        sys.stdout.write('# Vocab: add </s> | id={}\n'.format(d['</s>']))
        # TODO: some environments reportedly fail unless codecs is used;
        # needs investigation -- prefer io if codecs turns out unnecessary
        with io.open(vocabFile, encoding='utf-8') as f:
            # with codecs.open(vocabFile, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                word, freq = line.split('\t')  # tab-separated by default
                # word, freq = line.split(' ')  # use for space-separated files
if word == "<unk>":
continue
elif word == "<s>":
continue
elif word == "</s>":
continue
d.setdefault(word, len(d))
return d
def sentence2index(self, sentence, word2indexDict, input_side=False):
indexList = [word2indexDict[word] if word in word2indexDict
else word2indexDict['<unk>']
for word in sentence.split(' ')]
        # encoder side, when configured not to use <s> and </s>
        if input_side and self.flag_enc_boseos == 0:
            return indexList
        else:  # the usual case
            return ([word2indexDict['<s>']] +
                    indexList + [word2indexDict['</s>']])
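    # e.g. "the cat" on the decoder side becomes
    # [<s>, the, cat, </s>] -> [1, 17, 203, 2] (word IDs are illustrative)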
def makeSentenceLenDict(self, fileName, word2indexDict, input_side=False):
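        # Returns a dict whose shape depends on the side:
        #   encoder (input_side=True):  {length: [(line_no, id_list), ...]}
        #   decoder (input_side=False): {line_no: id_list}
        # The encoder layout feeds the length bucketing in makeBatch4Train()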
if input_side:
d = collections.defaultdict(list)
else:
d = {}
sentenceNum = 0
sampleNum = 0
maxLen = 0
        # read all of the data here
        # TODO: some environments reportedly fail unless codecs is used;
        # needs investigation -- prefer io if codecs turns out unnecessary
        with io.open(fileName, encoding='utf-8') as f:
            # with codecs.open(fileName, encoding='utf-8') as f:
            for sntNum, snt in enumerate(f):
                snt = snt.strip()
                indexList = self.sentence2index(
                    snt, word2indexDict, input_side=input_side)
                sampleNum += len(indexList)
                if input_side:
                    # input side: group sentences by length;
                    # each value is a (line number, id list) pair
                    d[len(indexList)].append((sntNum, indexList))
                else:
                    d[sntNum] = indexList  # decoder side: keyed by line number
sentenceNum += 1
maxLen = max(maxLen, len(indexList))
sys.stdout.write('# data sent: %10d sample: %10d maxlen: %10d\n' % (
sentenceNum, sampleNum, maxLen))
return d
def makeBatch4Train(self, encSentLenDict, decSentLenDict,
batch_size=1, shuffle_flag=True):
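        # Length bucketing: source sentences are grouped by identical length,
        # so encoder-side batches need no padding; decoder-side targets are
        # padded with -1 (the ignore_label of softmax_cross_entropy) up to
        # the longest target in the batch.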
encSentDividedBatch = []
for length, encSentList in six.iteritems(encSentLenDict):
            random.shuffle(encSentList)  # shuffle sentences of the same encLen
iter2 = six.moves.range(0, len(encSentList), batch_size)
encSentDividedBatch.extend(
[encSentList[_:_ + batch_size] for _ in iter2])
        if shuffle_flag:
            # shuffle the batches that were grouped by encoder length
            random.shuffle(encSentDividedBatch)
else:
sys.stderr.write(
('# NO shuffle: descending order based on '
'encoder sentence length\n'))
encSentBatch = []
decSentBatch = []
        # without shuffling, emit batches from the longest encoder
        # sentences first
        for batch in encSentDividedBatch[::-1]:
encSentBatch.append(
np.array([encSent for sntNum, encSent in batch],
dtype=np.int32).T)
maxDecoderLength = max([len(decSentLenDict[sntNum])
for sntNum, encSent in batch])
decSentBatch.append(
np.array([decSentLenDict[sntNum] + [-1] *
(maxDecoderLength - len(decSentLenDict[sntNum]))
for sntNum, encSent in batch], dtype=np.int32).T)
######
return list(six.moves.zip(encSentBatch, decSentBatch))
# holds statistics, mainly for displaying training progress
class TrainProcInfo:
def __init__(self):
self.lossVal = 0
self.instanceNum = 0
self.corTot = 0
self.incorTot = 0
self.batchCount = 0
self.trainsizeTot = 0
self.procTot = 0
self.gnorm = 0
self.gnormLimit = 0
self.pnorm = 0
self.encMaxLen = 0
self.decMaxLen = 0
def update(self, loss_stat, mbs, bc, cor, incor, tsize, proc,
encLen, decLen):
        self.instanceNum += mbs  # count of sentences
        self.batchCount += bc  # number of minibatches processed
self.corTot += cor
self.incorTot += incor
self.trainsizeTot += tsize
self.procTot += proc
        # use float() to force the value from GPU memory to the CPU
        self.lossVal += float(loss_stat)
self.encMaxLen = max(encLen * mbs, self.encMaxLen)
self.decMaxLen = max(decLen * mbs, self.decMaxLen)
    # routine that builds the strings reporting intermediate progress
def print_strings(self, train_mode, epoch, cMBSize, encLen, decLen,
start_time, args):
with cuda.get_device(self.lossVal):
msg0 = 'Epoch: %3d | LL: %9.6f PPL: %10.4f' % (
epoch, float(self.lossVal / max(1, self.procTot)),
math.exp(min(10, float(self.lossVal / max(1, self.procTot)))))
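            # PPL = exp(average NLL per processed token); min(10, ...) caps
            # the exponent so early-training output stays readable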
msg1 = '| gN: %8.4f %8.4f %8.4f' % (
self.gnorm, self.gnormLimit, self.pnorm)
dt = self.corTot + self.incorTot
msg2 = '| acc: %6.2f %8d %8d ' % (
float(100.0 * self.corTot / max(1, dt)),
self.corTot, self.incorTot)
msg3 = '| tot: %8d proc: %8d | num: %8d %6d %6d ' % (
self.trainsizeTot, self.procTot, self.instanceNum,
self.encMaxLen, self.decMaxLen)
msg4 = '| MB: %4d %6d %4d %4d | Time: %10.4f' % (
cMBSize, self.batchCount,
encLen, decLen, time.time() - start_time)
            # always evaluate on dev data; for training data follow the option
if train_mode == 0:
msgA = '%s %s %s %s' % (msg0, msg2, msg3, msg4)
elif args.doEvalAcc > 0:
msgA = '%s %s %s %s %s' % (msg0, msg1, msg2, msg3, msg4)
else:
msgA = '%s %s %s %s' % (msg0, msg1, msg3, msg4)
return msgA
# training subroutine
def train_model_sub(train_mode, epoch, tData, EncDecAtt, optimizer,
                    clip_obj, start_time, args):
    if 1:  # kept so the indentation matches the parallel-processing code...
#####################
tInfo = TrainProcInfo()
prnCnt = 0
#####################
if train_mode > 0: # train
dropout_rate = args.dropout_rate
else: # dev
dropout_rate = 0
#####################
if args.chainer_version_check[0] == 2:
if train_mode > 0: # train
chainer.global_config.train = True
chainer.global_config.enable_backprop = True
sys.stderr.write(
('# TRAIN epoch {} drop rate={} | CHAINER CONFIG [{}] \n'
.format(epoch, dropout_rate,
chainer.global_config.__dict__)))
else: # dev
chainer.global_config.train = False
chainer.global_config.enable_backprop = False
sys.stderr.write(
('# DEV. epoch {} drop rate={} | CHAINER CONFIG [{}] \n'
.format(epoch, dropout_rate,
chainer.global_config.__dict__)))
else:
if train_mode > 0: # train
args.dropout_mode = args.dropout_mode_orig
else: # dev
args.dropout_mode = False
#####################