
Merge pull request #6 from valence-labs/slurm
Improve docs and launcher
maclandrol authored Aug 15, 2023
2 parents e1cf855 + bc92c8a commit 096780d
Showing 6 changed files with 20 additions and 4 deletions.
2 changes: 1 addition & 1 deletion docs/index.md
@@ -1,5 +1,5 @@
 
-<h1 align="center"> :safety_vest: SAFE </h1>
+<h1 align="center"> 🦺 SAFE </h1>
 <h4 align="center"><b>S</b>equential <b>A</b>ttachment-based <b>F</b>ragment <b>E</b>mbedding (SAFE) is a novel molecular line notation that represents molecules as an unordered sequence of fragment blocks to improve molecule design using generative models.</h4>
 
 </br>
2 changes: 0 additions & 2 deletions expts/notebook/1.7-final-touch.ipynb
@@ -86,14 +86,12 @@
 "# params\n",
 "ddp = False\n",
 "gradient_accumulation_steps = 2\n",
-"wandb_watch = None\n",
 "\n",
 "batch_size = 32\n",
 "warmup_steps = 10\n",
 "num_epochs = 1\n",
 "learning_rate = 1e-5\n",
 "\n",
-"num_labels = 9\n",
 "logging_steps = 10\n",
 "output_dir = \"../output/\"\n",
 "max_steps = 50\n",
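For reference, these notebook parameters map directly onto standard transformers.TrainingArguments fields. A minimal sketch of that mapping (the notebook's exact wiring is not shown in this diff):

from transformers import TrainingArguments

# Sketch only: each keyword below is a standard TrainingArguments field,
# filled with the values from the notebook cell above.
args = TrainingArguments(
    output_dir="../output/",
    per_device_train_batch_size=32,  # batch_size
    gradient_accumulation_steps=2,
    warmup_steps=10,
    num_train_epochs=1,
    learning_rate=1e-5,
    logging_steps=10,
    max_steps=50,
)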
1 change: 1 addition & 0 deletions expts/scripts/model_trainer.py
file mode changed 100644 → 100755 (now executable)
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import transformers
 from safe.trainer.cli import ModelArguments
 from safe.trainer.cli import DataArguments
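With the new shebang and the 100644 → 100755 mode change, the script can now be executed directly (e.g. ./expts/scripts/model_trainer.py) instead of only through the python interpreter.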
10 changes: 10 additions & 0 deletions expts/train.sh
@@ -0,0 +1,10 @@
+accelerate launch --config_file config/accelerate.yaml \
+    scripts/model_trainer.py --tokenizer "tokenizer/tokenizer-custom.json" \
+    --dataset data/ --text_column "input" \
+    --is_tokenized False --streaming True \
+    --num_labels 1 --include_descriptors False \
+    --gradient_accumulation_steps 2 --wandb_watch 'gradients' \
+    --per_device_train_batch_size 32 --num_train_epochs 5 --save_steps 2000 --save_total_limit 10 \
+    --eval_accumulation_steps 100 --logging_steps 200 --logging_first_step True \
+    --save_safetensors True --do_train True --output_dir output/test/ \
+    --learning_rate 5e-4 --warmup_steps 500 --gradient_checkpointing True --max_steps 15000
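This diff does not show how these flags are consumed on the Python side. A minimal sketch, assuming the usual HfArgumentParser pattern (model_trainer.py does import the argument dataclasses from safe.trainer.cli):

import transformers
from safe.trainer.cli import DataArguments, ModelArguments

# Sketch only: parse the train.sh flags into dataclasses. The exact split of
# custom flags across ModelArguments/DataArguments is an assumption.
parser = transformers.HfArgumentParser(
    (ModelArguments, DataArguments, transformers.TrainingArguments)
)
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Standard flags such as --learning_rate, --warmup_steps and --save_steps land
# in training_args; --tokenizer, --dataset, --streaming and similar custom
# flags land in the model/data dataclasses.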
4 changes: 4 additions & 0 deletions safe/trainer/cli.py
@@ -333,9 +333,13 @@ def compute_metrics(eval_preds):
         results.update(results_mse)
         return results
 
+    if model_args.include_descriptors:
+        training_args.label_names = ["labels", "mc_labels"]
+
     trainer = SAFETrainer(
         model=model,
         tokenizer=None,  # we don't deal with the tokenizer at all, https://github.com/huggingface/tokenizers/issues/581 -_-
+        dispatch_batches=(data_args.streaming is True),
         train_dataset=train_dataset.shuffle(seed=(training_args.seed or 42)),
         eval_dataset=dataset.get(eval_dataset_key_name, None),
         args=training_args,
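The new dispatch_batches flag matters because --streaming True yields an IterableDataset: without it, every distributed rank would iterate the stream independently and train on duplicate batches. A hedged, standalone sketch of the underlying Accelerate behaviour (not the repository's code; Accelerator(dispatch_batches=...) is the accelerate 0.2x-era signature):

import torch
from torch.utils.data import DataLoader, IterableDataset
from accelerate import Accelerator

class TokenStream(IterableDataset):
    # Hypothetical stand-in for a streamed (non-indexable) dataset.
    def __iter__(self):
        for i in range(64):
            yield {"input_ids": torch.full((8,), i)}

accelerator = Accelerator(dispatch_batches=True)  # later moved to DataLoaderConfiguration
loader = accelerator.prepare(DataLoader(TokenStream(), batch_size=4))
for batch in loader:
    pass  # only the main process reads the stream; other ranks receive shards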
5 changes: 4 additions & 1 deletion safe/trainer/trainer_utils.py
@@ -13,9 +13,12 @@ class SAFETrainer(Trainer):
     """
 
-    def __init__(self, *args, prop_loss_coeff: float = 1e-3, **kwargs):
+    def __init__(
+        self, *args, prop_loss_coeff: float = 1e-3, dispatch_batches: bool = False, **kwargs
+    ):
         super().__init__(*args, **kwargs)
         self.prop_loss_coeff = prop_loss_coeff
+        self.accelerator.dispatch_batches = dispatch_batches
 
     def compute_loss(self, model, inputs, return_outputs=False):
         """
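The prop_loss_coeff here weights an auxiliary property-prediction loss against the language-modeling loss inside compute_loss. A hedged sketch of that pattern; mc_logits is an assumed name for the descriptor head output, not the repository's confirmed API (mc_labels matches the label_names set in cli.py above):

import torch.nn.functional as F

def compute_loss(self, model, inputs, return_outputs=False):
    # Sketch: pop descriptor targets so the forward pass sees only LM inputs.
    mc_labels = inputs.pop("mc_labels", None)
    outputs = model(**inputs)
    loss = outputs.loss  # standard causal-LM loss
    if mc_labels is not None:
        # Assumed descriptor head output; weight its MSE by prop_loss_coeff.
        prop_loss = F.mse_loss(outputs.mc_logits, mc_labels)
        loss = loss + self.prop_loss_coeff * prop_loss
    return (loss, outputs) if return_outputs else loss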
