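# main.py -- training entry point for a BiLSTM-CRF sequence tagger (LstmCrfModel.BiLSTM_CRF)
# on CoNLL-2000 style data (see the --train/--dev defaults below). The script parses
# command-line options, builds the word/tag dictionaries, optionally initializes the
# embedding layer from pretrained vectors, trains the model, and saves the model and
# dictionaries.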
import optparse
import os
from collections import OrderedDict
from loader import prepare_dictionaries, get_word_embedding_matrix
import LstmCrfModel
import torch
import numpy as np
from utils import save_model_dictionaries, load_parameters
from train import train
import cPickle
optparser = optparse.OptionParser()
optparser.add_option(
    "-T", "--train", default="data/conll2000.train.txt",
    help="Train set location"
)
optparser.add_option(
    "-D", "--dev", default="data/conll2000.test.txt",
    help="Development set location"
)
optparser.add_option(
    "-l", "--lower", default="1",
    type='int', help="Lowercase words (this will not affect character inputs)"
)
optparser.add_option(
    "-z", "--zeros", default="1",
    type='int', help="Replace digits with 0"
)
optparser.add_option(
    "-p", "--pre_emb", default=None,  # e.g. 'embedding/glove.6B.100d.txt'
    help="Location of pretrained embeddings"
)
optparser.add_option(
    "-v", "--vocab_size", default="8000",
    type='int', help="Vocabulary size"
)
optparser.add_option(
    "-e", "--embedding_dim", default="100",
    type='int', help="Word embedding dimension"
)
optparser.add_option(
    "-d", "--hidden_dim", default="200",
    type='int', help="LSTM hidden dimension"
)
optparser.add_option(
    "-t", "--decode_method", default="viterbi",
    help="Choose viterbi or marginal to decode the output tags"
)
optparser.add_option(
    "-o", "--loss_function", default="likelihood",
    help="Choose likelihood or labelwise as the loss function"
)
optparser.add_option(
    "-c", "--clip", default=5.0,
    type='float', help="Gradient clipping L2 norm"
)
optparser.add_option(
    "-f", "--freeze", default=False,
    help="Whether to freeze the embedding layer or not"
)
optparser.add_option(
    "-s", "--save", default='model',
    help="Location to store the model and dictionaries"
)
optparser.add_option(
    "--load", default=None,
    help="Load a pre-trained model and dictionaries"
)
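# Example invocations (a sketch only -- the data and embedding paths come from the
# defaults above and must exist on disk):
#   python main.py -T data/conll2000.train.txt -D data/conll2000.test.txt
#   python main.py -p embedding/glove.6B.100d.txt -e 100
#   python main.py --load model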


def main():
    np.random.seed(15213)
    torch.manual_seed(15213)
    opts = optparser.parse_args()[0]

    # Parse parameters
    Parse_parameters = OrderedDict()
    Parse_parameters['lower'] = opts.lower == 1
    Parse_parameters['zeros'] = opts.zeros == 1
    Parse_parameters['pre_emb'] = opts.pre_emb
    Parse_parameters['train'] = opts.train
    Parse_parameters['development'] = opts.dev
    Parse_parameters['vocab_size'] = opts.vocab_size

    # Check parameter validity
    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    if opts.pre_emb:
        assert opts.embedding_dim in [50, 100, 200, 300]
        assert opts.lower == 1

    # Load datasets
    if not opts.load:
        dictionaries = prepare_dictionaries(Parse_parameters)
    else:
        # Load dictionaries
        with open(opts.load + '/dictionaries.dic', 'rb') as f:
            dictionaries = cPickle.load(f)
        # Load parameters
        opts = load_parameters(opts.load, opts)

    # Model parameters
    Model_parameters = OrderedDict()
    Model_parameters['vocab_size'] = opts.vocab_size
    Model_parameters['embedding_dim'] = opts.embedding_dim
    Model_parameters['hidden_dim'] = opts.hidden_dim
    Model_parameters['tagset_size'] = len(dictionaries['tag_to_id'])
    Model_parameters['lower'] = opts.lower == 1
    Model_parameters['decode_method'] = opts.decode_method
    Model_parameters['loss_function'] = opts.loss_function
    Model_parameters['freeze'] = opts.freeze

    # model = LstmModel.LSTMTagger(Model_parameters)
    model = LstmCrfModel.BiLSTM_CRF(Model_parameters)
    # Gradients are allocated lazily, so they are not shared here
    model.share_memory()

    # If using pretrained embeddings, initialize the word-embedding layer
    if opts.pre_emb and not opts.load:
        print("Initialize the word-embedding layer")
        initial_matrix = get_word_embedding_matrix(dictionaries['word_to_id'],
                                                   opts.pre_emb, opts.embedding_dim)
        model.init_word_embedding(initial_matrix)

    # Load a pre-trained model
    if opts.load:
        model.load_state_dict(torch.load(opts.load + '/model.mdl'))

    train(model, Parse_parameters, opts, dictionaries)
    # Save the model and dictionaries to the location given by --save
    save_model_dictionaries(opts.save, model, dictionaries, opts)


if __name__ == '__main__':
    main()