job.yaml
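# Kubernetes Job that runs full supervised fine-tuning (SFT) of google/gemma-2b
# on timdettmers/openassistant-guanaco with TRL and DeepSpeed ZeRO-3 on a GKE
# node with 4x NVIDIA A100 GPUs, writing checkpoints to a GCS bucket mounted
# through the GCS FUSE CSI driver.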
apiVersion: batch/v1
kind: Job
metadata:
  name: trl-full-sft
  namespace: hf-gke-namespace
spec:
  template:
    metadata:
      name: trl
      labels:
        app: trl
        hf.co/model: google--gemma-2b
        hf.co/dataset: timdettmers--openassistant-guanaco
      annotations:
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/ephemeral-storage-request: 200Gi
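    # The gke-gcsfuse annotations above inject the GCS FUSE sidecar that backs
    # the /data volume defined below.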
    spec:
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
        cloud.google.com/compute-class: Accelerator
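      # Hugging Face PyTorch training DLC; the image tag encodes CUDA 12.1,
      # PyTorch 2.3, Transformers 4.42, Ubuntu 22.04, and Python 3.10.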
      containers:
        - name: trl-container
          image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-training-cu121.2-3.transformers.4-42.ubuntu2204.py310:latest
          command:
            - "/bin/bash"
            - "-c"
            - |
              mkdir -p $HF_HOME/accelerate
              # Dump the Accelerate + DeepSpeed ZeRO-3 config (bf16, 4 processes)
              # into $HF_HOME/accelerate/default_config.yaml; `echo -e` expands
              # the \n escapes into real newlines.
              echo -e "compute_environment: LOCAL_MACHINE\ndebug: false\ndeepspeed_config:\n deepspeed_multinode_launcher: standard\n offload_optimizer_device: none\n offload_param_device: none\n zero3_init_flag: true\n zero3_save_16bit_model: true\n zero_stage: 3\ndistributed_type: DEEPSPEED\ndowncast_bf16: 'no'\nmachine_rank: 0\nmain_training_function: main\nmixed_precision: bf16\nnum_machines: 1\nnum_processes: 4\nrdzv_backend: static\nsame_network: true\ntpu_env: []\ntpu_use_cluster: false\ntpu_use_sudo: false\nuse_cpu: false" > $HF_HOME/accelerate/default_config.yaml
              exec trl sft "$@"
            - "--"
          args:
            # MODEL
            - "--model_name_or_path=google/gemma-2b"
            - "--torch_dtype=bfloat16"
            - "--attn_implementation=flash_attention_2"
            # DATASET
            - "--dataset_name=timdettmers/openassistant-guanaco"
            - "--dataset_text_field=text"
            # TRAINER
            - "--bf16"
            - "--max_seq_length=1024"
            - "--per_device_train_batch_size=2"
            - "--gradient_accumulation_steps=4"
            - "--gradient_checkpointing"
            - "--learning_rate=0.0002"
            - "--lr_scheduler_type=cosine"
            - "--optim=adamw_bnb_8bit"
            - "--num_train_epochs=3"
            - "--logging_steps=10"
            - "--do_eval"
            - "--eval_steps=100"
            - "--report_to=none"
            - "--save_strategy=epoch"
            - "--output_dir=/data/gemma-2b-SFT"
            - "--overwrite_output_dir"
            - "--seed=42"
            - "--log_level=info"
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_token
            - name: ACCELERATE_LOG_LEVEL
              value: "INFO"
            - name: TRANSFORMERS_LOG_LEVEL
              value: "INFO"
            - name: TRL_USE_RICH
              value: "0"
            - name: TQDM_POSITION
              value: "-1"
          resources:
            requests:
              nvidia.com/gpu: 4
            # GPUs are extended resources: limits must be set and equal to requests.
            limits:
              nvidia.com/gpu: 4
          volumeMounts:
            - name: gcs-fuse-csi-vol
              mountPath: /data
              readOnly: false
      serviceAccountName: hf-gke-service-account
      volumes:
        - name: gcs-fuse-csi-vol
          csi:
            driver: gcsfuse.csi.storage.gke.io
            readOnly: false
            volumeAttributes:
              bucketName: hf-train-gke-bucket
              mountOptions: "implicit-dirs"
      restartPolicy: "Never"
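# One way to submit and follow the Job, assuming the namespace, service
# account, bucket, and Secret referenced above already exist:
#   kubectl apply -f job.yaml
#   kubectl logs -f job/trl-full-sft -n hf-gke-namespace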