#!/bin/bash
# train.sh — single-node, 8-GPU launcher for run_clm_llms.py.

# NCCL settings: verbose logging, the NIC to bind, InfiniBand GID index
# and service level, and GPUDirect RDMA reads. Adjust NCCL_SOCKET_IFNAME
# to match this host's network interface.
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=eth1
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_NET_GDR_READ=1

# Rendezvous endpoint for torchrun. CHIEF_IP defaults to the local host
# but can be overridden from the environment (the original script set it
# unconditionally, so its fallback in MASTER_ADDR could never fire).
export CHIEF_IP="${CHIEF_IP:-127.0.0.1}"
export MASTER_ADDR="${CHIEF_IP}"
export MASTER_PORT="${MASTER_PORT:-29500}"
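
# Illustrative multi-node sketch (not part of the original script): on
# each of two nodes, point CHIEF_IP at the rank-0 node and pass the node
# count and rank to torchrun. The IP below is a placeholder assumption.
#
#   CHIEF_IP=10.0.0.1 torchrun --nnodes 2 --node_rank 0 --nproc_per_node 8 ...
#   CHIEF_IP=10.0.0.1 torchrun --nnodes 2 --node_rank 1 --nproc_per_node 8 ...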
# Paths: repo root (also passed as --model_name_or_path) and the
# training entry point.
path=./
train_path=$path/run_clm_llms.py

# Launch 8 workers on one node. Training streams the cached dataset for
# 3 epochs at an effective batch size of 8 GPUs x 4 per device x 3
# accumulation steps = 96 sequences of 512 tokens, with a cosine LR
# schedule warmed up over 3% of steps, fp16, and DeepSpeed configured
# from configs/deepspeed_config.json.
torchrun --nnodes 1 --nproc_per_node 8 \
    ${train_path} \
    --deepspeed $path/configs/deepspeed_config.json \
    --model_name_or_path ${path} \
    --train_file $path/data/train_total_new_name.cache \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 8 \
    --gradient_accumulation_steps 3 \
    --num_train_epochs 3 \
    --save_strategy "steps" \
    --save_steps 5000 \
    --save_total_limit 1 \
    --learning_rate 3e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 10 \
    --block_size 512 \
    --do_train \
    --evaluation_strategy "no" \
    --validation_split_percentage 0 \
    --fp16 True \
    --fp16_full_eval True \
    --streaming \
    --ddp_timeout 3600 \
    --seed 1 \
    --gradient_checkpointing False \
    --output_dir $path/trained_models/MM-LLMs/mm_llms_trainer/
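
# The DeepSpeed config referenced above is not shown in this file. A
# minimal sketch of what configs/deepspeed_config.json might contain,
# assuming ZeRO stage 2 with "auto" values deferred to the HF Trainer
# flags (an illustrative assumption, not the repo's actual config):
#
#   cat > $path/configs/deepspeed_config.json <<'EOF'
#   {
#     "train_micro_batch_size_per_gpu": "auto",
#     "gradient_accumulation_steps": "auto",
#     "fp16": { "enabled": "auto" },
#     "zero_optimization": { "stage": 2 }
#   }
#   EOF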