Merge pull request #7 from valence-labs/slurm
Slurm
maclandrol authored Aug 15, 2023
2 parents 096780d + 1754ba0 commit 2ad3241
Showing 4 changed files with 24 additions and 6 deletions.
8 changes: 8 additions & 0 deletions expts/notebook/1.7-final-touch.ipynb
@@ -324,6 +324,14 @@
" eval_steps_per_second = 12.76\n",
" perplexity = 2.4522313125199866e+43\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
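Side note on the recorded output above: the evaluation reports a perplexity of roughly 2.45e+43 and the cell ends with a kernel crash. One common cause of such crashes on a shared training box is the system OOM killer; the check below is only an assumption about the cause (the notebook output does not record it) and presumes a Linux host with systemd:

# Look for out-of-memory kills in kernel messages around the crash time.
# This is one plausible diagnostic, not a confirmed root cause.
journalctl -k --since "1 hour ago" | grep -i -E "out of memory|oom"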
10 changes: 10 additions & 0 deletions expts/train-small.sh
@@ -0,0 +1,10 @@
+accelerate launch --config_file config/accelerate.yaml \
+    scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
+    --dataset data/ --text_column "input" \
+    --is_tokenized False --streaming True \
+    --num_labels 1 --include_descriptors False \
+    --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
+    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/test/ \
+    --learning_rate 5e-4 --warmup_steps 1000 --gradient_checkpointing True --max_steps 15_000
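Since this PR is titled Slurm, the new expts/train-small.sh is presumably meant to run inside a Slurm allocation. A minimal submission sketch follows; the job name, GPU count, time limit, and log path are illustrative assumptions, not part of this commit:

#!/bin/bash
#SBATCH --job-name=safe-train-small   # assumed job name
#SBATCH --gres=gpu:2                  # assumed GPU count; keep consistent with config/accelerate.yaml
#SBATCH --time=12:00:00               # assumed time limit
#SBATCH --output=logs/%x-%j.out       # assumed log path (%x = job name, %j = job id)

# Run from the submission directory so the script's relative paths
# (config/, tokenizer/, data/, output/) resolve correctly.
cd "$SLURM_SUBMIT_DIR"
bash expts/train-small.sh

Submitting is then a plain sbatch call on this wrapper from the repository root.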
10 changes: 5 additions & 5 deletions expts/train.sh
100644 → 100755
@@ -1,10 +1,10 @@
 accelerate launch --config_file config/accelerate.yaml \
     scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
-    --dataset data/ --text_column "input" \
+    --dataset ~/data/ --text_column "input" \
     --is_tokenized False --streaming True \
     --num_labels 1 --include_descriptors False \
     --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
-    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
-    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
-    --save_safetensors True --do_train True --output_dir output/test/ \
-    --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000
+    --per_device_train_batch_size 64 --num_train_epochs 2 --save_steps 5000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 500 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/safe/ \
+    --learning_rate 5e-5 --warmup_steps 2500 --gradient_checkpointing True --max_steps 30_000_000
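For context on the revised settings: with per_device_train_batch_size 64 and gradient_accumulation_steps 2, the effective batch per optimizer step is 64 × 2 × (number of GPUs). A quick shell sanity check, where the 8-GPU world size is an assumption to be replaced with the actual value from config/accelerate.yaml:

# effective batch = per-device batch x grad-accumulation steps x GPU count
PER_DEVICE=64 ACCUM=2 GPUS=8   # GPUS=8 is an assumption, not from this PR
echo $(( PER_DEVICE * ACCUM * GPUS ))   # prints 1024 sequences per optimizer step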
2 changes: 1 addition & 1 deletion safe/trainer/cli.py
@@ -339,7 +339,7 @@ def compute_metrics(eval_preds):
     trainer = SAFETrainer(
         model=model,
         tokenizer=None,  # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_-
-        dispatch_batches=(data_args.streaming is True),
+        dispatch_batches=(data_args.streaming is not True),
         train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)),
         eval_dataset=dataset.get(eval_dataset_key_name, None),
         args=training_args,
