forked from HabanaAI/Model-References
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: run_bert_1.5b_32x.sh
executable file
·68 lines (61 loc) · 2.2 KB
/
run_bert_1.5b_32x.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
##########################################################################################
# Example: Pretraining phase 1 of BERT with 1.5B parameters on multinode with 8 cards each
##########################################################################################
# Params: run_pretraining
# All HL_* environment variables are optional overrides with defaults below,
# except HL_DATA_DIR_ROOT which has no default.
# NOTE(review): HL_DATA_DIR_ROOT is not validated; if unset, DATA_DIR silently
# becomes an absolute path starting at /data — consider : "${HL_DATA_DIR_ROOT:?}".
DATA_DIR=$HL_DATA_DIR_ROOT/data/pytorch/bert/pretraining/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus
# Model architecture config and DeepSpeed runtime config (JSON files).
MODEL_CONFIG=${HL_MODEL_CONFIG:-./scripts/bert_1.5b_config.json}
DS_CONFIG=${HL_DS_CONFIG:-./scripts/deepspeed_config_bert_1.5b.json}
# DeepSpeed hostsfile: one "hostname slots=N" entry per worker node.
HOSTSFILE=${HL_HOSTSFILE:-./scripts/hostsfile}
# Output locations for logs and checkpoints.
RESULTS_DIR=${HL_RESULTS_DIR:-./results/bert_1.5b}
CHECKPOINTS_DIR=${HL_CHECKPOINTS_DIR:-$RESULTS_DIR/checkpoints}
# Phase-1 hyper-parameters (sequence length 128).
MAX_SEQ_LENGTH=128
# Checkpoint every N steps.
NUM_STEPS_PER_CP=${HL_NUM_STEPS_PER_CP:-200}
MAX_STEPS=155000
# NOTE(review): default -1 presumably means "run through MAX_STEPS" — confirm
# against run_pretraining.py's --steps_this_run handling.
RUN_STEPS=${HL_RUN_STEPS:--1}
LR=0.0015
# Fractions of the schedule spent in LR warmup and in the constant-LR plateau
# (passed to --warmup_proportion / --constant_proportion below).
WARMUP=0.05
CONST=0.25
LOG_FREQ=10
# Max masked-LM predictions per sequence (matches dataset's max_pred_20 naming).
MAX_PRED=20
# Params: DeepSpeed
NUM_NODES=${HL_NUM_NODES:-4}
NGPU_PER_NODE=8
# Absolute directory of this script, with symlinks resolved (-P).
# NOTE(review): DIR is not referenced anywhere else in this script — it may be
# dead code or consumed by a child/sourced process; confirm before removing.
DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
# Training command, built as one flat string that the deepspeed launcher
# word-splits back into arguments (see the unquoted $CMD at the bottom).
# NOTE(review): because of that word splitting, none of the interpolated
# paths (configs, data dir, results dir) may contain whitespace.
CMD="python -u ./run_pretraining.py \
--disable_progress_bar \
--optimizer=lans \
--use_lr_scheduler \
--resume_from_checkpoint \
--do_train \
--bert_model=bert-base-uncased \
--config_file=$MODEL_CONFIG \
--json-summary=$RESULTS_DIR/dllogger.json \
--output_dir=$CHECKPOINTS_DIR \
--seed=12439 \
--input_dir=$DATA_DIR \
--max_seq_length $MAX_SEQ_LENGTH \
--max_predictions_per_seq=$MAX_PRED \
--max_steps=$MAX_STEPS \
--steps_this_run=$RUN_STEPS \
--num_steps_per_checkpoint=$NUM_STEPS_PER_CP \
--learning_rate=$LR \
--warmup_proportion=$WARMUP \
--constant_proportion=$CONST \
--scheduler_degree=1.0 \
--log_freq=$LOG_FREQ \
--deepspeed \
--deepspeed_config=$DS_CONFIG"
# Configure multinode: when running on more than one node and the hostsfile
# exists, pass the hostsfile plus the master address to the deepspeed launcher.
# Fixes vs. original: deprecated/ambiguous `-a` inside `[ ]` replaced with
# `[[ … && … ]]` (ShellCheck SC2166); $HOSTSFILE quoted; the sed script is now
# quoted so the shell cannot glob-expand `[[:space:]]` against files in cwd.
if [[ "$NUM_NODES" -ne 1 && -f "$HOSTSFILE" ]]
then
# Master address = hostname on the first hostsfile line, with the trailing
# " slots=N" column stripped.
MULTINODE_CMD="--hostfile=$HOSTSFILE \
--master_addr $(head -n 1 "$HOSTSFILE" | sed -n 's/[[:space:]]slots.*//p') "
fi
# Create the results directory (dllogger output and checkpoints land here).
# Fix vs. original: $RESULTS_DIR and the numeric launcher args are quoted
# (ShellCheck SC2086) so an overridden path with spaces cannot word-split.
mkdir -p "$RESULTS_DIR"
# Launch training through the deepspeed runner.  $MULTINODE_CMD and $CMD are
# intentionally left unquoted: they are flat strings that must word-split into
# separate launcher/program arguments (quoting them would pass one giant arg).
# shellcheck disable=SC2086
deepspeed --num_nodes "${NUM_NODES}" \
--num_gpus "${NGPU_PER_NODE}" \
--no_local_rank \
--no_python \
$MULTINODE_CMD \
$CMD