forked from tmbdev/clstm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclstm.h
492 lines (441 loc) · 14.4 KB
/
clstm.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
// -*- C++ -*-
// A basic LSTM implementation in C++. All you should need is clstm.cc and
// clstm.h. Library dependencies are limited to a small subset of STL and
// Eigen/Dense
#ifndef ocropus_lstm_
#define ocropus_lstm_
#include <cmath>
#include <cstdio>
#include <iostream>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <typeinfo>
#include <vector>

#include <Eigen/Dense>
namespace ocropus {
using std::string;
using std::vector;
using std::map;
using std::shared_ptr;
using std::unique_ptr;
using std::function;
void throwf(const char *format, ...);
extern char exception_message[256];
#ifdef LSTM_DOUBLE
typedef double Float;
typedef Eigen::VectorXi iVec;
typedef Eigen::VectorXd Vec;
typedef Eigen::MatrixXd Mat;
#else
typedef float Float;
typedef Eigen::VectorXi iVec;
typedef Eigen::VectorXf Vec;
typedef Eigen::MatrixXf Mat;
#endif
// These macros define the major matrix operations used
// in CLSTM. They are here for eventually converting the
// inner loops of CLSTM from Eigen::Matrix to Eigen::Tensor
// (which uses different and incompatible notation)
#define DOT(M, V) ((M) *(V))
#define MATMUL(A, B) ((A) *(B))
#define MATMUL_TR(A, B) ((A).transpose() * (B))
#define MATMUL_RT(A, B) ((A) *(B).transpose())
#define EMUL(U, V) ((U).array() * (V).array()).matrix()
#define EMULV(U, V) ((U).array() * (V).array()).matrix()
#define TRANPOSE(U) ((U).transpose())
#define ROWS(A) (A).rows()
#define COLS(A) (A).cols()
#define COL(A, b) (A).col(b)
#define MAPFUN(M, F) ((M).unaryExpr(ptr_fun(F)))
#define MAPFUNC(M, F) ((M).unaryExpr(F))
#define SUMREDUCE(M) float(M.sum())
#define BLOCK(A, i, j, n, m) (A).block(i, j, n, m)
inline void ADDCOLS(Mat &m, Vec &v) {
for (int i = 0; i < COLS(m); i++)
for (int j = 0; j < ROWS(m); j++)
m(i, j) += v(j);
}
inline void randgauss(Mat &m) {
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<double> randn;
for (int i = 0; i < ROWS(m); i++)
for (int j = 0; j < COLS(m); j++)
m(i, j) = randn(gen);
}
inline void randgauss(Vec &v) {
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<double> randn;
for (int i = 0; i < ROWS(v); i++)
v(i) = randn(gen);
}
inline void randinit(Mat &m, float s, const string mode="unif") {
if (mode == "unif") {
m.setRandom();
m = (2*s*m).array()-s;
} else if (mode == "pos") {
m.setRandom();
m = m*s;
} else if (mode == "normal") {
randgauss(m);
m = m*s;
}
}
inline void randinit(Vec &m, float s, const string mode="unif") {
if (mode == "unif") {
m.setRandom();
m = (2*s*m).array()-s;
} else if (mode == "pos") {
m.setRandom();
m = m*s;
} else if (mode == "normal") {
randgauss(m);
m = m*s;
}
}
inline void randinit(Mat &m, int no, int ni, float s, const string mode="unif") {
m.resize(no, ni);
randinit(m, s, mode);
}
inline void randinit(Vec &m, int no, float s, const string mode="unif") {
m.resize(no);
randinit(m, s, mode);
}
inline void zeroinit(Mat &m, int no, int ni) {
m.resize(no, ni);
m.setZero();
}
inline void zeroinit(Vec &m, int no) {
m.resize(no);
m.setZero();
}
typedef vector<Mat> Sequence;
inline void resize(Sequence &seq, int nsteps, int dims, int bs) {
seq.resize(nsteps);
for (int i=0; i<nsteps; i++) seq[i].resize(dims,bs);
}
inline int size(Sequence &seq, int dim) {
if (dim==0) return seq.size();
if (dim==1) return seq[0].rows();
if (dim==2) return seq[0].cols();
THROW("bad dim ins size");
}
typedef vector<int> Classes;
typedef vector<Classes> BatchClasses;
inline Vec timeslice(const Sequence &s, int i, int b=0) {
Vec result(s.size());
for (int t = 0; t < s.size(); t++)
result[t] = s[t](i, b);
return result;
}
struct VecMat {
Vec *vec = 0;
Mat *mat = 0;
VecMat() {
}
VecMat(Vec *vec) {
this->vec = vec;
}
VecMat(Mat *mat) {
this->mat = mat;
}
};
struct ITrainable {
virtual ~ITrainable() {
}
string name = "";
virtual const char *kind() = 0;
// Learning rate and momentum used for training.
Float learning_rate = 1e-4;
Float momentum = 0.9;
enum Normalization : int {
NORM_NONE, NORM_LEN, NORM_BATCH, NORM_DFLT = NORM_NONE,
} normalization = NORM_DFLT;
// The attributes array contains parameters for constructing the
// network, as well as information necessary for loading and saving
// networks.
map<string, string> attributes;
string attr(string key, string dflt="") {
auto it = attributes.find(key);
if (it == attributes.end()) return dflt;
return it->second;
}
int iattr(string key, int dflt=-1) {
auto it = attributes.find(key);
if (it == attributes.end()) return dflt;
return std::stoi(it->second);
}
double dattr(string key, double dflt=0.0) {
auto it = attributes.find(key);
if (it == attributes.end()) return dflt;
return std::stof(it->second);
}
int irequire(string key) {
auto it = attributes.find(key);
if (it == attributes.end()) {
sprintf(exception_message, "missing parameter: %s", key.c_str());
THROW(exception_message);
}
return std::stoi(it->second);
}
void set(string key, string value) {
attributes[key] = value;
}
void set(string key, int value) {
attributes[key] = std::to_string(value);
}
void set(string key, double value) {
attributes[key] = std::to_string(value);
}
// Learning rates
virtual void setLearningRate(Float lr, Float momentum) = 0;
// Main methods for forward and backward propagation
// of activations.
virtual void forward() = 0;
virtual void backward() = 0;
virtual void update() = 0;
virtual int idepth() {
return -9999;
}
virtual int odepth() {
return -9999;
}
virtual void initialize() {
// this gets initialization parameters
// out of the attributes array
}
// These are convenience functions for initialization
virtual void init(int no, int ni) final {
set("ninput", ni);
set("noutput", no);
initialize();
}
virtual void init(int no, int nh, int ni) final {
set("ninput", ni);
set("nhidden", nh);
set("noutput", no);
initialize();
}
virtual void init(int no, int nh2, int nh, int ni) final {
set("ninput", ni);
set("nhidden", nh);
set("nhidden2", nh2);
set("noutput", no);
initialize();
}
};
struct INetwork;
typedef shared_ptr<INetwork> Network;
struct INetwork : virtual ITrainable {
// Networks have input and output "ports" for sequences
// and derivatives. These are propagated in forward()
// and backward() methods.
Sequence inputs, d_inputs;
Sequence outputs, d_outputs;
// Some networks have subnetworks. They should be
// stored in the `sub` vector. That way, functions
// like `save` can automatically traverse the tree
// of networks. Together with the `name` field,
// this forms a hierarchical namespace of networks.
vector<Network > sub;
// Data for encoding/decoding input/output strings.
vector<int> codec;
vector<int> icodec;
unique_ptr<map<int, int> > encoder; // cached
unique_ptr<map<int, int> > iencoder; // cached
void makeEncoders();
std::wstring decode(Classes &cs);
std::wstring idecode(Classes &cs);
void encode(Classes &cs, const std::wstring &s);
void iencode(Classes &cs, const std::wstring &s);
// Parameters specific to softmax.
Float softmax_floor = 1e-5;
bool softmax_accel = false;
virtual ~INetwork() {
}
std::function<void(INetwork*)> initializer = [] (INetwork*){};
virtual void initialize() {
// this gets initialization parameters
// out of the attributes array
initializer(this);
}
// Expected number of input/output features.
virtual int ninput() {
return -999999;
}
virtual int noutput() {
return -999999;
}
// Add a network as a subnetwork.
virtual void add(Network net) {
sub.push_back(net);
}
// Hooks to iterate over the weights and states of this network.
typedef function<void (const string &, VecMat, VecMat)> WeightFun;
typedef function<void (const string &, Sequence *)> StateFun;
virtual void myweights(const string &prefix, WeightFun f) {
}
virtual void mystates(const string &prefix, StateFun f) {
}
// Hooks executed prior to saving and after loading.
// Loading iterates over the weights with the `weights`
// methods and restores only the weights. `postLoad`
// allows classes to update other internal state that
// depends on matrix size.
virtual void preSave() {
}
virtual void postLoad() {
}
// Set the learning rate for this network and all subnetworks.
virtual void setLearningRate(Float lr, Float momentum) {
this->learning_rate = lr;
this->momentum = momentum;
for (int i = 0; i < sub.size(); i++)
sub[i]->setLearningRate(lr, momentum);
}
void info(string prefix);
void weights(const string &prefix, WeightFun f);
void states(const string &prefix, StateFun f);
void networks(const string &prefix, function<void (string, INetwork*)>);
Sequence *getState(string name);
// special method for LSTM and similar networks, returning the
// primary internal state sequence
Sequence *getState() {
THROW("unimplemented");
};
void save(const char *fname);
void load(const char *fname);
};
// standard layer types
INetwork *make_SigmoidLayer();
INetwork *make_SoftmaxLayer();
INetwork *make_ReluLayer();
INetwork *make_Stacked();
INetwork *make_Reversed();
INetwork *make_Parallel();
INetwork *make_LSTM();
INetwork *make_NPLSTM();
INetwork *make_BidiLayer();
// setting inputs and outputs
void set_inputs(INetwork *net, Sequence &inputs);
void set_targets(INetwork *net, Sequence &targets);
void set_targets_accelerated(INetwork *net, Sequence &targets);
void set_classes(INetwork *net, Classes &classes);
void set_classes(INetwork *net, BatchClasses &classes);
// single sequence training functions
void train(INetwork *net, Sequence &xs, Sequence &targets);
void ctrain(INetwork *net, Sequence &xs, Classes &cs);
void ctrain_accelerated(INetwork *net, Sequence &xs, Classes &cs, Float lo=1e-5);
void cpred(INetwork *net, Classes &preds, Sequence &xs);
void mktargets(Sequence &seq, Classes &targets, int ndim);
// batch training functions
void ctrain(INetwork *net, Sequence &xs, BatchClasses &cs);
void ctrain_accelerated(INetwork *net, Sequence &xs, BatchClasses &cs, Float lo=1e-5);
void cpred(INetwork *net, BatchClasses &preds, Sequence &xs);
void mktargets(Sequence &seq, BatchClasses &targets, int ndim);
// instantiating layers and networks
typedef std::function<INetwork*(void)> ILayerFactory;
extern map<string, ILayerFactory> layer_factories;
Network make_layer(const string &kind);
struct String : public std::string {
String() {
}
String(const char *s) : std::string(s) {
}
String(const std::string &s) : std::string(s) {
}
String(int x) : std::string(std::to_string(x)) {
}
String(double x) : std::string(std::to_string(x)) {
}
double operator+() { return atof(this->c_str()); }
operator int() {
return atoi(this->c_str());
}
operator double() {
return atof(this->c_str());
}
};
struct Assoc : std::map<std::string, String> {
using std::map<std::string, String>::map;
Assoc(const string &s);
String at(const std::string &key) const {
auto it = this->find(key);
if (it == this->end()) throwf("%s: key not found", key.c_str());
return it->second;
}
};
typedef std::vector<Network> Networks;
Network layer(
const string &kind,
int ninput, int noutput,
const Assoc &args,
const Networks &subs
);
typedef std::function<Network(const Assoc &)> INetworkFactory;
extern map<string, INetworkFactory> network_factories;
Network make_net(const string &kind, const Assoc ¶ms);
Network make_net_init(const string &kind, const std::string ¶ms);
// new, proto-based I/O
Network proto_clone_net(INetwork *net);
void debug_as_proto(INetwork *net, bool do_weights=false);
void write_as_proto(std::ostream &output, INetwork *net);
void save_as_proto(const string &fname, INetwork *net);
Network load_as_proto(const string &fname);
inline void save_net(const string &file, Network net) {
save_as_proto(file, net.get());
}
inline Network load_net(const string &file) {
return load_as_proto(file);
}
// training with CTC
void forward_algorithm(Mat &lr, Mat &lmatch, double skip=-5.0);
void forwardbackward(Mat &both, Mat &lmatch);
void ctc_align_targets(Sequence &posteriors, Sequence &outputs, Sequence &targets);
void ctc_align_targets(Sequence &posteriors, Sequence &outputs, Classes &targets);
void trivial_decode(Classes &cs, Sequence &outputs, int batch=0);
void ctc_train(INetwork *net, Sequence &xs, Sequence &targets);
void ctc_train(INetwork *net, Sequence &xs, Classes &targets);
void ctc_train(INetwork *net, Sequence &xs, BatchClasses &targets);
// DEPRECATED
extern Mat debugmat;
// loading and saving networks (using HDF5)
void load_attributes(map<string, string> &attrs, const string &file);
}
namespace {
inline bool anynan(ocropus::Sequence &a) {
for (int i = 0; i < a.size(); i++) {
for (int j = 0; j < ROWS(a[i]); j++) {
for (int k = 0; k < COLS(a[i]); k++) {
if (isnan(a[i](j, k))) return true;
}
}
}
return false;
}
template <class A, class B>
double levenshtein(A &a, B &b) {
using std::vector;
int n = a.size();
int m = b.size();
if (n > m) return levenshtein(b, a);
vector<double> current(n+1);
vector<double> previous(n+1);
for (int k = 0; k < current.size(); k++) current[k] = k;
for (int i = 1; i <= m; i++) {
previous = current;
for (int k = 0; k < current.size(); k++) current[k] = 0;
current[0] = i;
for (int j = 1; j <= n; j++) {
double add = previous[j]+1;
double del = current[j-1]+1;
double change = previous[j-1];
if (a[j-1] != b[i-1]) change = change+1;
current[j] = fmin(fmin(add, del), change);
}
}
return current[n];
}
}
#endif