-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_bioner.py
98 lines (90 loc) · 4.82 KB
/
train_bioner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import argparse
import random
import torch
from torch import optim
from bioner.model.annotator import Annotator, TrainingParameters
from bioner.model.bioner_model import BioNER
from bioner.model.encoder.fasttext_encoder import FasttextEncoder, FastTextEmbedding
if __name__ == '__main__':
    # CLI entry point: parse training configuration, build the BioNER model,
    # and hand everything to Annotator.train().
    # NOTE: 'spawn' start method set before any worker processes are created;
    # presumably required for CUDA tensors in DataLoader workers — confirm.
    torch.multiprocessing.set_start_method('spawn')

    parser = argparse.ArgumentParser(description='Train Annotator')

    # Truly required arguments live in this group so --help labels them correctly.
    required_named = parser.add_argument_group('required named arguments')
    required_named.add_argument('--training',
                                type=str,
                                help='Path to the training dataset file',
                                required=True)
    required_named.add_argument('--validation',
                                type=str,
                                help='Path to the validation dataset file',
                                required=True)
    required_named.add_argument('--batchSize',
                                type=int,
                                help='Batch size',
                                required=True)
    required_named.add_argument('--learningRate',
                                type=float,
                                help='Learning rate',
                                required=True)
    required_named.add_argument('--modelOutputFolder',
                                type=str,
                                help='The folder where the best model should be saved',
                                required=True)
    required_named.add_argument('--maxEpochs',
                                type=int,
                                help='Maximum training epochs',
                                required=True)

    # Optional arguments: previously mis-registered under the
    # "required named arguments" group, which made --help misleading.
    # Exactly one of --embeddings / --embeddingsRoot must be given (checked below).
    parser.add_argument('--embeddings',
                        type=str,
                        help='Path to the embeddings file')
    parser.add_argument('--embeddingsRoot',
                        type=str,
                        help='Path where the embeddings can be downloaded to')
    # NOTE(review): --test is parsed but not referenced anywhere in this script;
    # kept for CLI compatibility — verify whether it should feed TrainingParameters.
    parser.add_argument('--test',
                        type=str,
                        help='Path to the test dataset file')
    parser.add_argument('--numWorkers',
                        type=int,
                        default=0,
                        help='Number of workers (defaults to 0)')
    parser.add_argument('--tensorboardLogDirectory',
                        type=str,
                        help='The directory where to log the tensorboard data')
    parser.add_argument('--trainingsLogFile',
                        type=str,
                        help='The file path where to log the PyTorch Ignite training and validation')
    parser.add_argument('--enableFasterTraining',
                        action='store_true',
                        help='Enable faster training by compute metrics only every 10th epoch')
    args = parser.parse_args()

    # Reproducibility: single seed constant shared by all RNGs, plus
    # deterministic torch kernels (may raise on ops without a deterministic impl).
    SEED = 1632737901
    torch.use_deterministic_algorithms(True)
    torch.manual_seed(SEED)
    random.seed(SEED)

    if args.embeddings is None and args.embeddingsRoot is None:
        parser.error("You need to set either --embeddings or --embeddingsRoot")

    # Resolve the embeddings file: an explicit path wins; otherwise download
    # the FastText embedding into the given root directory.
    if args.embeddings is None:
        fasttext_embedding = FastTextEmbedding(embeddings_root=args.embeddingsRoot, ngram_range="3-4")
        embeddings_file_path = fasttext_embedding.filepath
    else:
        embeddings_file_path = args.embeddings

    encoder = FasttextEncoder(embeddings_file_path=embeddings_file_path)
    # Model input size is dictated by the embedding dimensionality.
    model = BioNER(input_vector_size=encoder.get_embeddings_vector_size())
    parameters = TrainingParameters(encoder=encoder,
                                    batch_size=args.batchSize,
                                    training_dataset_path=args.training,
                                    validation_dataset_path=args.validation,
                                    model_save_path=args.modelOutputFolder,
                                    max_epochs=args.maxEpochs,
                                    num_workers=args.numWorkers,
                                    tensorboard_log_directory_path=args.tensorboardLogDirectory,
                                    training_log_file_path=args.trainingsLogFile,
                                    optimizer=optim.Adam(model.parameters(), lr=args.learningRate),
                                    model=model,
                                    faster_training_evaluation=args.enableFasterTraining)
    Annotator.train(parameters)