diff --git a/expts/notebook/1.7-final-touch.ipynb b/expts/notebook/1.7-final-touch.ipynb
index 850d7b0..f78c623 100644
--- a/expts/notebook/1.7-final-touch.ipynb
+++ b/expts/notebook/1.7-final-touch.ipynb
@@ -324,6 +324,14 @@
" eval_steps_per_second = 12.76\n",
" perplexity = 2.4522313125199866e+43\n"
]
+ },
+ {
+ "ename": "",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details."
+ ]
}
],
"source": [
diff --git a/expts/train-small.sh b/expts/train-small.sh
new file mode 100755
index 0000000..255b63e
--- /dev/null
+++ b/expts/train-small.sh
@@ -0,0 +1,10 @@
+accelerate launch --config_file config/accelerate.yaml \
+ scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
+ --dataset data/ --text_column "input" \
+ --is_tokenized False --streaming True \
+ --num_labels 1 --include_descriptors False \
+ --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
+ --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
+ --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
+ --save_safetensors True --do_train True --output_dir output/test/ \
+ --learning_rate 5e-4 --warmup_steps 1000 --gradient_checkpointing True --max_steps 15_000
diff --git a/expts/train.sh b/expts/train.sh
old mode 100644
new mode 100755
index 1bde9e5..1e1b8cc
--- a/expts/train.sh
+++ b/expts/train.sh
@@ -1,10 +1,10 @@
accelerate launch --config_file config/accelerate.yaml \
scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
- --dataset data/ --text_column "input" \
+ --dataset ~/data/ --text_column "input" \
--is_tokenized False --streaming True \
--num_labels 1 --include_descriptors False \
--gradient_accumulation_steps 2 --wandb_watch 'gradients' \
- --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
- --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
- --save_safetensors True --do_train True --output_dir output/test/ \
- --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000
+ --per_device_train_batch_size 64 --num_train_epochs 2 --save_steps 5000 --save_total_limit 10 \
+ --eval_accumulation_steps 100 --logging_steps 500 --logging_first_step True \
+ --save_safetensors True --do_train True --output_dir output/safe/ \
+ --learning_rate 5e-5 --warmup_steps 2500 --gradient_checkpointing True --max_steps 30_000_000
\ No newline at end of file
diff --git a/safe/trainer/cli.py b/safe/trainer/cli.py
index fc568ba..1cb7e5d 100644
--- a/safe/trainer/cli.py
+++ b/safe/trainer/cli.py
@@ -339,7 +339,7 @@ def compute_metrics(eval_preds):
trainer = SAFETrainer(
model=model,
tokenizer=None, # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_-
- dispatch_batches=(data_args.streaming is True),
+ dispatch_batches=(data_args.streaming is not True),
train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)),
eval_dataset=dataset.get(eval_dataset_key_name, None),
args=training_args,