Merge pull request #7 from valence-labs/slurm
Slurm
maclandrol authored Aug 15, 2023
2 parents 096780d + 1754ba0 commit 2ad3241
Showing 4 changed files with 24 additions and 6 deletions.
8 changes: 8 additions & 0 deletions expts/notebook/1.7-final-touch.ipynb
@@ -324,6 +324,14 @@
" eval_steps_per_second = 12.76\n",
" perplexity = 2.4522313125199866e+43\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
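Side note on the recorded output above: the evaluation reports a perplexity of roughly 2.45e+43 and the cell ends with a kernel crash. One common cause of such crashes on a shared training box is the system OOM killer; the check below is only an assumption about the cause (the notebook output does not record it) and presumes a Linux host with systemd:

# Look for out-of-memory kills in kernel messages around the crash time.
# This is one plausible diagnostic, not a confirmed root cause.
journalctl -k --since "1 hour ago" | grep -i -E "out of memory|oom"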
10 changes: 10 additions & 0 deletions expts/train-small.sh
@@ -0,0 +1,10 @@
+accelerate launch --config_file config/accelerate.yaml \
+    scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
+    --dataset data/ --text_column "input" \
+    --is_tokenized False --streaming True \
+    --num_labels 1 --include_descriptors False \
+    --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
+    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/test/ \
+    --learning_rate 5e-4 --warmup_steps 1000 --gradient_checkpointing True --max_steps 15_000
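Since this PR is titled Slurm, the new expts/train-small.sh is presumably meant to run inside a Slurm allocation. A minimal submission sketch follows; the job name, GPU count, time limit, and log path are illustrative assumptions, not part of this commit:

#!/bin/bash
#SBATCH --job-name=safe-train-small   # assumed job name
#SBATCH --gres=gpu:2                  # assumed GPU count; keep consistent with config/accelerate.yaml
#SBATCH --time=12:00:00               # assumed time limit
#SBATCH --output=logs/%x-%j.out       # assumed log path (%x = job name, %j = job id)

# Run from the submission directory so the script's relative paths
# (config/, tokenizer/, data/, output/) resolve correctly.
cd "$SLURM_SUBMIT_DIR"
bash expts/train-small.sh

Submitting is then a plain sbatch call on this wrapper from the repository root.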
10 changes: 5 additions & 5 deletions expts/train.sh
100644 → 100755
@@ -1,10 +1,10 @@
 accelerate launch --config_file config/accelerate.yaml \
     scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
-    --dataset data/ --text_column "input" \
+    --dataset ~/data/ --text_column "input" \
     --is_tokenized False --streaming True \
     --num_labels 1 --include_descriptors False \
     --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
-    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
-    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
-    --save_safetensors True --do_train True --output_dir output/test/ \
-    --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000
+    --per_device_train_batch_size 64 --num_train_epochs 2 --save_steps 5000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 500 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/safe/ \
+    --learning_rate 5e-5 --warmup_steps 2500 --gradient_checkpointing True --max_steps 30_000_000
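For context on the revised settings: with per_device_train_batch_size 64 and gradient_accumulation_steps 2, the effective batch per optimizer step is 64 × 2 × (number of GPUs). A quick shell sanity check, where the 8-GPU world size is an assumption to be replaced with the actual value from config/accelerate.yaml:

# effective batch = per-device batch x grad-accumulation steps x GPU count
PER_DEVICE=64 ACCUM=2 GPUS=8   # GPUS=8 is an assumption, not from this PR
echo $(( PER_DEVICE * ACCUM * GPUS ))   # prints 1024 sequences per optimizer step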
2 changes: 1 addition & 1 deletion safe/trainer/cli.py
@@ -339,7 +339,7 @@ def compute_metrics(eval_preds):
     trainer = SAFETrainer(
         model=model,
         tokenizer=None,  # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_-
-        dispatch_batches=(data_args.streaming is True),
+        dispatch_batches=(data_args.streaming is not True),
         train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)),
         eval_dataset=dataset.get(eval_dataset_key_name, None),
         args=training_args,
