Merge pull request #6 from valence-labs/slurm

Improve docs and launcher
datamol-io · Aug 15, 2023 · 096780d · 096780d
2 parents e1cf855 + bc92c8a
commit 096780d
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 4 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -1,5 +1,5 @@
 
-<h1 align="center">  :safety_vest: SAFE </h1>
+<h1 align="center">  🦺 SAFE </h1>
 <h4 align="center"><b>S</b>equential <b>A</b>ttachment-based <b>F</b>ragment <b>E</b>mbedding (SAFE) is a novel molecular line notation that represents molecules as an unordered sequence of fragment blocks to improve molecule design using generative models.</h4>
 
 <br>

diff --git a/expts/notebook/1.7-final-touch.ipynb b/expts/notebook/1.7-final-touch.ipynb
@@ -86,14 +86,12 @@
     "# params\n",
     "ddp = False\n",
     "gradient_accumulation_steps = 2\n",
-    "wandb_watch = None\n",
     "\n",
     "batch_size = 32\n",
     "warmup_steps = 10\n",
     "num_epochs = 1\n",
     "learning_rate = 1e-5\n",
     "\n",
-    "num_labels = 9\n",
     "logging_steps = 10\n",
     "output_dir = \"../output/\"\n",
     "max_steps = 50\n",

diff --git a/expts/scripts/model_trainer.py b/expts/scripts/model_trainer.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import transformers
 from safe.trainer.cli import ModelArguments
 from safe.trainer.cli import DataArguments

diff --git a/expts/train.sh b/expts/train.sh
@@ -0,0 +1,10 @@
+accelerate launch --config_file config/accelerate.yaml \
+    scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
+    --dataset data/ --text_column "input" \
+    --is_tokenized False --streaming True \
+    --num_labels 1 --include_descriptors False \
+    --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
+    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/test/ \
+    --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000
diff --git a/safe/trainer/cli.py b/safe/trainer/cli.py
@@ -333,9 +333,13 @@ def compute_metrics(eval_preds):
             results.update(results_mse)
         return results
 
+    if model_args.include_descriptors:
+        training_args.label_names = ["labels", "mc_labels"]
+
     trainer = SAFETrainer(
         model=model,
         tokenizer=None,  # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_-
+        dispatch_batches=(data_args.streaming is True),
         train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)),
         eval_dataset=dataset.get(eval_dataset_key_name, None),
         args=training_args,

diff --git a/safe/trainer/trainer_utils.py b/safe/trainer/trainer_utils.py
@@ -13,9 +13,12 @@ class SAFETrainer(Trainer):
 
     """
 
-    def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):
+    def __init__(
+        self, *args, prop_loss_coeff: float = 1e-3, dispatch_batches: bool = False, **kwargs
+    ):
         super().__init__(*args, **kwargs)
         self.prop_loss_coeff = prop_loss_coeff
+        self.accelerator.dispatch_batches = dispatch_batches
 
     def compute_loss(self, model, inputs, return_outputs=False):
         """