diff --git a/expts/notebook/1.7-final-touch.ipynb b/expts/notebook/1.7-final-touch.ipynb index 850d7b0..f78c623 100644 --- a/expts/notebook/1.7-final-touch.ipynb +++ b/expts/notebook/1.7-final-touch.ipynb @@ -324,6 +324,14 @@ " eval_steps_per_second = 12.76\n", " perplexity = 2.4522313125199866e+43\n" ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." + ] } ], "source": [ diff --git a/expts/train-small.sh b/expts/train-small.sh new file mode 100755 index 0000000..255b63e --- /dev/null +++ b/expts/train-small.sh @@ -0,0 +1,10 @@ +accelerate launch --config_file config/accelerate.yaml \ + scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \ + --dataset data/ --text_column "input" \ + --is_tokenized False --streaming True \ + --num_labels 1 --include_descriptors False \ + --gradient_accumulation_steps 2 --wandb_watch 'gradients' \ + --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \ + --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \ + --save_safetensors True --do_train True --output_dir output/test/ \ + --learning_rate 5e-4 --warmup_steps 1000 --gradient_checkpointing True --max_steps 15_000 diff --git a/expts/train.sh b/expts/train.sh old mode 100644 new mode 100755 index 1bde9e5..1e1b8cc --- a/expts/train.sh +++ b/expts/train.sh @@ -1,10 +1,10 @@ accelerate launch --config_file config/accelerate.yaml \ scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \ - --dataset data/ --text_column "input" \ + --dataset ~/data/ --text_column "input" \ --is_tokenized False --streaming True \ --num_labels 1 --include_descriptors False \ --gradient_accumulation_steps 2 --wandb_watch 'gradients' \ - --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \ - --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \ - --save_safetensors True --do_train True --output_dir output/test/ \ - --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000 + --per_device_train_batch_size 64 --num_train_epochs 2 --save_steps 5000 --save_total_limit 10 \ + --eval_accumulation_steps 100 --logging_steps 500 --logging_first_step True \ + --save_safetensors True --do_train True --output_dir output/safe/ \ + --learning_rate 5e-5 --warmup_steps 2500 --gradient_checkpointing True --max_steps 30_000_000 \ No newline at end of file diff --git a/safe/trainer/cli.py b/safe/trainer/cli.py index fc568ba..1cb7e5d 100644 --- a/safe/trainer/cli.py +++ b/safe/trainer/cli.py @@ -339,7 +339,7 @@ def compute_metrics(eval_preds): trainer = SAFETrainer( model=model, tokenizer=None, # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_- - dispatch_batches=(data_args.streaming is True), + dispatch_batches=(data_args.streaming is not True), train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)), eval_dataset=dataset.get(eval_dataset_key_name, None), args=training_args,