-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.Dockerfile
87 lines (72 loc) · 3.72 KB
/
train.Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Build arguments declared before FROM are only in scope for the FROM line;
# together they select the pre-built base image that this image extends.
# NOTE(review): the default "latest" is unlikely to be a valid PROJECT_ID, so
# PROJECT_ID presumably always has to be supplied via --build-arg -- confirm.
ARG PROJECT_ID=latest
ARG BASE_VERSION=latest
FROM gcr.io/${PROJECT_ID}/bluebert-base:${BASE_VERSION}
# Note: TUNED_MODEL_VERSION is the version of the model being trained.
# This version will be used in the exported model file name.
ARG TUNED_MODEL_VERSION=latest
# Note: TASK_NAME must align with names used by the prob2label.py script
# and also must align with the data directory structure, e.g. bl_chemical_to_gene
# NOTE(review): with the default "latest" the data COPY below would look for
# data/latest/data.tsv, so TASK_NAME presumably must also be supplied -- confirm.
ARG TASK_NAME=latest
# Download the base BlueBERT model and unpack it into the baseline model directory.
# The archive is deleted in the same RUN instruction: removing it in a later
# instruction would not shrink the image, because the earlier layer would still
# contain it.
WORKDIR /home/dev/models/baseline
RUN wget https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/NCBI-BERT/NCBI_BERT_pubmed_uncased_L-12_H-768_A-12.zip && \
    unzip NCBI_BERT_pubmed_uncased_L-12_H-768_A-12.zip && \
    rm NCBI_BERT_pubmed_uncased_L-12_H-768_A-12.zip
# Task-specific training/evaluation data, selected by the TASK_NAME build argument
COPY data/$TASK_NAME/data.tsv /home/dev/data/data.tsv
# Training entrypoint script, invoked by the ENTRYPOINT instruction of this image
COPY scripts/train.entrypoint.sh /home/dev/entrypoint.sh
# Resources used to evaluate the model
COPY scripts/prob2label.py /home/dev/prob2label.py
WORKDIR /home/dev
# Install the BLUE benchmark repository and the Python packages used alongside it.
# Note: it's possible some of the extra packages listed below are not needed.
# scikit-learn is installed under its real distribution name: the "sklearn"
# alias on PyPI is deprecated and installing it now fails.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/ncbi-nlp/BLUE_Benchmark.git ./blue_benchmark.git && \
    cd blue_benchmark.git && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir \
        fire \
        jsonlines \
        pandas \
        scikit-learn \
        tabulate
# Directory into which the baseline BlueBERT model archive is unpacked.
# Written in key=value form; the space-separated ENV form is a legacy syntax.
ENV BlueBERT_DIR=/home/dev/models/baseline
WORKDIR /home/dev/data
# Split data.tsv randomly into train/dev/test = 60%/20%/20%.
# The shuffle is random, so every (uncached) build produces a different split.
# POSIX $(( )) arithmetic is used rather than the obsolete bash-only $[ ] form,
# which /bin/sh implementations such as dash do not understand.
# When the remainder has an odd number of lines, the second split leaves an
# extra one-line xac file, so all remaining xa* files are concatenated into dev.tsv.
RUN sort -R data.tsv > data.random.tsv && \
    split -l $(( $(wc -l < data.random.tsv) * 60 / 100 )) data.random.tsv && \
    mv xaa train.tsv && \
    mv xab rest.tsv && \
    split -l $(( $(wc -l < rest.tsv) * 50 / 100 )) rest.tsv && \
    mv xaa test.tsv && \
    cat xa* > dev.tsv && \
    rm rest.tsv xa*
# Create the test.ids file that is used by prob2label.py: every id taken from
# column 1 of test.tsv becomes "id<TAB>id<TAB>T1<TAB>T2<TAB>" (the last column is
# left empty). The whole line is referenced with & instead of the redundant
# nested quantifier (.*)+ used previously.
RUN cut -f 1 test.tsv | sed 's/.*/&\t&\tT1\tT2\t/' > test.ids
# Convert test.tsv to the format expected by the BLUE_Benchmark code: keep
# columns 1 and 3, then expand the leading column into
# "id<TAB>id<TAB>T1<TAB>T2<TAB>" in front of the remaining column.
RUN cut -f 1,3 test.tsv > test.blue.gs && \
    sed -E -i 's/^(.*)\t/\1\t\1\tT1\tT2\t/' test.blue.gs
# Prepend a header row to each generated file. With -i, GNU sed processes every
# listed file separately, so the address 1 is the first line of each file.
# NOTE(review): the header fields are space-separated while the data rows are
# written with tabs -- confirm this is what the downstream scripts expect.
RUN sed -i '1s/^/id docid arg1 arg2 label\n/' test.blue.gs test.ids && \
    sed -i '1s/^/id sentence label\n/' train.tsv dev.tsv test.tsv
# Persist the build arguments as environment variables so they are still
# available when the container runs (ARG values exist only at build time).
ENV TUNED_MODEL_VERSION_ENV=$TUNED_MODEL_VERSION
ENV TASK_NAME_ENV=$TASK_NAME
# Exec-form entrypoint that still goes through a shell so the variables expand.
# With the shell form, Docker appends the `docker run` arguments after
# `/bin/sh -c "<command>"`, which makes the first argument $0 rather than $1,
# so "$@" never contained it. The trailing "--" fills $0 explicitly, letting
# every run-time argument reach the script, and `exec` replaces the shell so
# the script receives signals directly.
ENTRYPOINT ["/bin/sh", "-c", "exec /home/dev/entrypoint.sh \"$TASK_NAME_ENV\" \"$TUNED_MODEL_VERSION_ENV\" \"$@\"", "--"]
# To build:
# docker build --build-arg "PROJECT_ID=[PROJECT_ID]" \
# --build-arg "TASK_NAME=[TASK_NAME]" \
# --build-arg "BASE_VERSION=[BASE_VERSION]" \
# --build-arg "TUNED_MODEL_VERSION=0.1" \
# -t task_name-train:0.1 -f train.Dockerfile .
#
# where:
# [PROJECT_ID] = the id for this project - it is used to retrieve the already built base image (from base.Dockerfile)
# [BASE_VERSION] = the version of the base container to use
# [TASK_NAME] = the name of the task that the model is being trained for. TASK_NAME must align with names used by the prob2label.py script
# and also must align with the data directory structure, e.g. bl_chemical_to_gene
# [TUNED_MODEL_VERSION] = the version of the model being trained. This version will be used in the exported model file name.
# To run:
# docker run --rm task_name-train:0.1 [MODEL_STORAGE_BUCKET]
#
# where:
# [MODEL_STORAGE_BUCKET] = the bucket where the trained model will be stored as a tarball, e.g. gs://xyz