From 23ff3fea226c545d1f1efd294e097f0b52e39748 Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 29 Aug 2024 18:31:47 +0000 Subject: [PATCH 01/58] Bump NeMo/Mcore update stop and go to include learning rate check. --- 3rdparty/Megatron-LM | 2 +- 3rdparty/NeMo | 2 +- .../src/bionemo/testing/harnesses/stop_and_go.py | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM index ef85bc94fc..34e607ef41 160000 --- a/3rdparty/Megatron-LM +++ b/3rdparty/Megatron-LM @@ -1 +1 @@ -Subproject commit ef85bc94fc744aa5d398d12140f808023afbf78d +Subproject commit 34e607ef41cf1c0ed481a678df9c76952d0ec00c diff --git a/3rdparty/NeMo b/3rdparty/NeMo index ff7c614ab8..006d65fb83 160000 --- a/3rdparty/NeMo +++ b/3rdparty/NeMo @@ -1 +1 @@ -Subproject commit ff7c614ab8226c2038b268d4575015e5871e17ec +Subproject commit 006d65fb83ece4766e3d1e9d25c536e86172de6f diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py index 6e0de681fb..bbe38c55d2 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/harnesses/stop_and_go.py @@ -172,11 +172,7 @@ def get_default_metrics_dict(self) -> dict[str, Callable[[pl.Trainer, pl.Lightni Returns: dict: A dictionary of default metrics that can be used in the StopAndGoHarness. """ - return { - "global_step": get_global_step, - # TODO (Update when we are ToT.) - # "learning_rate": get_learning_rate - } + return {"global_step": get_global_step, "learning_rate": get_learning_rate} def get_callbacks(self, mode: Literal["stop", "go"]) -> list[pl.Callback]: """Returns a list of callbacks based on the specified mode. Base implemention provides reasonable defaults. From ef671c86e3b8b29b60fe1aae99364be843a2d814 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 10 Sep 2024 15:36:51 -0700 Subject: [PATCH 02/58] Working nemo run example for geneformer pretraining. 
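
Example invocation, using the factory names defined in factories.py in this patch
(the data paths baked into the recipes are environment-specific defaults and may
need adjusting):

    python sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py \
        geneformer_config=basic_geneformer_config_recipe \
        data_config=small_data_config \
        parallel_config=simple_parallel_recipe \
        training_config=default_trainer_config \
        optim_config=default_adam_optimizer_with_cosine_annealing_recipe \
        experiment_config=experiment_config_recipe \
        resume_if_exists=False \
        data_config.seq_length=128 \
        parallel_config.num_devices=1 \
        data_config.micro_batch_size=2 \
        training_config.precision=bf16-mixed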
--- .../src/bionemo/geneformer/run/__init__.py | 0 .../src/bionemo/geneformer/run/factories.py | 385 ++++++++++++++++++ .../src/bionemo/geneformer/run/main.py | 6 + 3 files changed, 391 insertions(+) create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py new file mode 100644 index 0000000000..da106c51a9 --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -0,0 +1,385 @@ +from dataclasses import dataclass +from typing import List +import pathlib +import nemo_run as run +from typing import Sequence, Literal +from nemo import lightning as nl +import math +from pathlib import Path +from typing import Optional, Sequence, get_args +from tokenizers import Tokenizer + +from megatron.core.optimizer import OptimizerConfig +from nemo import lightning as nl +from nemo.collections import llm +from nemo.lightning import io, resume +from nemo.lightning.pytorch import callbacks as nl_callbacks +from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler +from nemo.utils import logging +from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from torch.nn import functional as F + +from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype +from bionemo.geneformer.api import GeneformerConfig +from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule +from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess +from bionemo.llm.lightning import LossLoggingCallback +from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.model import BiobertSpecOption +from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size +from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger +@dataclass +class DataConfig: + data_dir: str + result_dir: str = './results' + seq_length: int = 2048 + num_dataset_workers: int = 0 + micro_batch_size: int = 8 + + @property + def train_data_path(self) -> str: + return self.data_dir + "/train" + + @property + def val_data_path(self) -> str: + return self.data_dir + "/val" + + @property + def test_data_path(self) -> str: + return self.data_dir + "/test" + +@run.cli.factory +@run.autoconvert +def small_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data") -> DataConfig: + # NOTE theoretically we could validate that this stuff exists. + return DataConfig(data_dir=data_dir) + +@run.cli.factory +@run.autoconvert +def full_geneformer_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data") -> DataConfig: + # NOTE theoretically we could validate that this stuff exists. 
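+    # e.g. a lightweight guard such as `assert pathlib.Path(data_dir).is_dir()` would catch a bad path here.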
+ return DataConfig(data_dir=data_dir) + +@dataclass +class GeneformerDataArtifacts: + tokenizer: Tokenizer # TODO(SKH) typing isnt right + median_dict: dict + +def geneformer_preprocess_recipe(data_config: DataConfig) -> GeneformerDataArtifacts: + preprocessor = GeneformerPreprocess( + download_directory=pathlib.Path(data_config.train_data_path), + medians_file_path=pathlib.Path(data_config.train_data_path + "/medians.json"), + tokenizer_vocab_path=pathlib.Path(data_config.train_data_path + "/geneformer.vocab"), + ) + match preprocessor.preprocess(): + case {"tokenizer": tokenizer, "median_dict": median_dict}: + logging.info("*************** Preprocessing Finished ************") + case _: + logging.error("Preprocessing failed.") + raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") + return GeneformerDataArtifacts(tokenizer=tokenizer, median_dict=median_dict) + + +def singlecell_data_module(data_config: DataConfig, global_batch_size: int) -> SingleCellDataModule: + geneformer_data_artifacts: GeneformerDataArtifacts = geneformer_preprocess_recipe(data_config) + data = SingleCellDataModule( + seq_length=data_config.seq_length, + tokenizer=geneformer_data_artifacts.tokenizer, + train_dataset_path=data_config.train_data_path, + val_dataset_path=data_config.val_data_path, + test_dataset_path=data_config.test_data_path, + random_token_prob=0.02, # changed to represent the incorrect setting we originally used. + median_dict=geneformer_data_artifacts.median_dict, + micro_batch_size=data_config.micro_batch_size, + global_batch_size=global_batch_size, + # persistent workers is supported when num_dataset_workers > 0 + persistent_workers=data_config.num_dataset_workers > 0, + pin_memory=False, + num_workers=data_config.num_dataset_workers, + ) + return data + +@dataclass +class ParallelConfig: + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + accumulate_grad_batches: int = 1 + ddp: Literal["megatron"] = "megatron" + remove_unused_parameters: bool = True + num_devices: int = 1 + num_nodes: int = 1 + + +@run.cli.factory +@run.autoconvert +def simple_parallel_recipe(tensor_model_parallel_size: int =1, pipeline_model_parallel_size: int = 1, num_devices: int = 1) -> ParallelConfig: + # TODO validatorssssssss, make sure we get everythign right here. + assert num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size, "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + return ParallelConfig(tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size, num_devices=num_devices) + +@dataclass +class TrainingConfig: + max_steps: int + limit_val_batches: int + val_check_interval: int + precision: PrecisionTypes = 'bf16-mixed' + accelerator: str = 'gpu' + +@run.cli.factory +@run.autoconvert +def default_trainer_config() -> TrainingConfig: + return TrainingConfig( + max_steps=55000, + limit_val_batches=2, + val_check_interval=100 + ) + +def setup_trainer_from_configs(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: + # Because this returns a trainer, and trainer is not an argument to the entrypoint, this is not a factory. 
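+    # The strategy below wires the tensor/pipeline parallel sizes from ParallelConfig into NeMo's Megatron DDP setup;
+    # the Trainer then combines the two configs: devices/nodes from ParallelConfig, step limit, validation cadence,
+    # and mixed precision from TrainingConfig.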
+ strategy = nl.MegatronStrategy( + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ddp="megatron", + find_unused_parameters=True, + ckpt_include_optimizer=True, + ) + + trainer = nl.Trainer( + devices=parallel_config.num_devices, + max_steps=training_config.max_steps, + accelerator=training_config.accelerator, + strategy=strategy, + limit_val_batches=training_config.limit_val_batches, # This controls upsampling and downsampling + val_check_interval=training_config.val_check_interval, # TODO(@jstjohn) Checkpoint saving is currently broken, fix and change this. + num_nodes=parallel_config.num_nodes, + callbacks=[ + RichModelSummary(max_depth=4), + LearningRateMonitor(), + ], + plugins=nl.MegatronMixedPrecision(precision=training_config.precision), + ) + return trainer + + +@run.cli.factory +@run.autoconvert +def basic_geneformer_config_recipe(seq_length: int = 128, precision: PrecisionTypes='bf16-mixed', nemo1_init_path: Optional[str]=None, biobert_spec_option: BiobertSpecOption=BiobertSpecOption.bert_layer_local_spec.value) -> GeneformerConfig: + # TODO seq_length must match the datamodule. We can pass in the DataConfig but I dont know how to enforce that its the same everywhere. + # another option is to construct this ad-hoc like we do with wandb + ''' Sets up the base GeneformerConfig. Recipes on geneformer configs should choose what to expose and come with sensible defaults. ''' + geneformer_config = GeneformerConfig( + num_layers=6, + hidden_size=256, + ffn_hidden_size=512, + num_attention_heads=4, + seq_length=seq_length, + fp32_residual_connection=False, # TODO(@jstjohn) check this + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, # TODO(@jstjohn) check this + fp16_lm_cross_entropy=False, + params_dtype=get_autocast_dtype(precision), + pipeline_dtype=get_autocast_dtype(precision), + autocast_dtype=get_autocast_dtype(precision), # setting this speeds things up a lot + gradient_accumulation_fusion=False, # THIS BREAKS STUFF, leave False + layernorm_zero_centered_gamma=False, # TODO(@jstjohn) check this + layernorm_epsilon=1.0e-12, + activation_func=F.gelu, # TODO(@jstjohn) check this + qk_layernorm=False, # TODO(@jstjohn) check this + apply_residual_connection_post_layernorm=False, # False is new default, True was BERT pub. + bias_activation_fusion=True, # TODO(@jstjohn) check this + bias_dropout_fusion=True, # TODO(@jstjohn) check this + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, # This has to be set to True if we use the mixed precision plugin + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + ) + return geneformer_config + +@dataclass +class OptimizerSchedulerConfig: + lr: float = 1e-4 + optimizer: str = 'adam' # TODO Literal + cosine_rampup_frac: float = .01 + cosine_hold_frac: float = .05 + interval: str = 'step' # TODO Literal + monitor: str = 'val_loss' + +@run.cli.factory +@run.autoconvert +def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + ''' Prefers the default parameters for the Optimizer and Scheduler. 
''' + return OptimizerSchedulerConfig() + +@run.cli.factory +@run.autoconvert +def exposed_optimizer_recipe(lr: float, optimizer: str, cosine_rampup_frac: float, cosine_hold_frac: float, interval: str, monitor: str) -> OptimizerSchedulerConfig: + ''' This recipe exposes all parameters to the underlying OptimizerSchedulerConfig. ''' + return OptimizerSchedulerConfig(lr=lr, optimizer=optimizer, cosine_rampup_frac=cosine_rampup_frac, cosine_hold_frac=cosine_hold_frac, interval=interval, monitor=monitor) + +@run.cli.factory +@run.autoconvert +def optimizer_recipe_with_kwarg_defaults(lr: float = 1e-4, optimizer: str = 'adam', cosine_rampup_frac: float = .01, cosine_hold_frac: float = .05, interval: str = 'step', monitor: str = 'val_loss') -> OptimizerSchedulerConfig: + ''' This recipe exposes all parameters to the underlying OptimizerSchedulerConfig and provides defaults as kwargs. ''' + return OptimizerSchedulerConfig(lr=lr, optimizer=optimizer, cosine_rampup_frac=cosine_rampup_frac, cosine_hold_frac=cosine_hold_frac, interval=interval, monitor=monitor) + +def biobert_lightning_module(geneformer_config: GeneformerConfig, tokenizer: Tokenizer, optim_config: OptimizerSchedulerConfig, num_steps: int) -> BioBertLightningModule: + ''' Function that constructs a lightning module from the requisite configs. + + tokenizer: Tokenizer - must be the same tokenizer used by the DataModule. + num_steps: int - must match the number of steps in the DataConfig. + ''' + model = BioBertLightningModule( + geneformer_config, + tokenizer=tokenizer, + optimizer=MegatronOptimizerModule( + config=OptimizerConfig( + lr=optim_config.lr, + optimizer=optim_config.optimizer, + use_distributed_optimizer=True, + # Pass through fp16/bf16 settings to avoid errors around model having bf16 enabled but optimizer not. + # implies these configs must be coupled. + fp16=geneformer_config.fp16, + bf16=geneformer_config.bf16, + ), + lr_scheduler=CosineAnnealingScheduler( + max_steps=num_steps, + # minimum learning rate is 1/100th of the initial learning rate, so eg lr=1e-3 -> min_lr=1e-5 + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), + ), + ), + ) + return model + +@dataclass +class ExperimentConfig: + save_every_n_steps: int + result_dir: str + experiment_name: str + restore_from_checkpoint_path: Optional[str] + resume_if_exists: bool + wandb_options: WandbLoggerOptions = None # TODO(SKH) if we are passing a type in here its gonna blow up. + save_best_checkpoint: bool = False + save_last_checkpoint: bool = True + metric_to_monitor_for_checkpoints: str = 'reduced_train_loss' # TODO literal? 
+ save_top_k: int = 2 + create_tensorboard_logger: bool = False + +@run.cli.factory +@run.autoconvert +def experiment_config_recipe() -> ExperimentConfig: + return ExperimentConfig( + save_every_n_steps=100, + result_dir='./results', + experiment_name='default_experiment', + restore_from_checkpoint_path=None, + resume_if_exists=True, + save_best_checkpoint=False, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints='reduced_train_loss', + save_top_k=2, + create_tensorboard_logger=False, + ) + +@dataclass +class WandbConfig: + # NOTE(SKH) there is some duplication with WandbLoggerOptions + project: str # Must be set to log to wandb, this is the 'project' directory under your 'entity' + entity: str # Sometimes refers to team, sometimes username + offline: bool # If set does not log to wandb + +def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + checkpoint_callback = nl_callbacks.ModelCheckpoint( + save_best_model=experiment_config.save_best_checkpoint, + save_last=experiment_config.save_last_checkpoint, + monitor=experiment_config.metric_to_monitor_for_checkpoints, + save_top_k=experiment_config.save_top_k, + every_n_train_steps=experiment_config.save_every_n_steps, + enable_nemo_ckpt_io=True, + ) + + wandb_options: Optional[WandbLoggerOptions] = ( + None + if wandb_config is None + else WandbLoggerOptions( + offline=wandb_config.offline, + project=wandb_config.project, + entity=wandb_config.entity, + log_model=False, + ) + ) + + # Setup the logger and train the model + nemo_logger = setup_nemo_lightning_logger( + root_dir=experiment_config.result_dir, + name=experiment_config.experiment_name, + initialize_tensorboard_logger=experiment_config.create_tensorboard_logger, + wandb_kwargs=wandb_options, + ckpt_callback=checkpoint_callback, + ) + return nemo_logger + +@run.cli.entrypoint +def pretrain( + geneformer_config: GeneformerConfig, + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + # Remaining are things that live outside a config + resume_if_exists: bool = True, + wandb_entity: Optional[str] = None, + wandb_project: Optional[str] = None, + wandb_offline: bool = True, + copy_val_check_interval_for_save_every_n_steps: bool = True + ): + # NOTE: any config passed into the entrypoint can be MUTATED by the CLI. + + # Setup. + # Create requisite directory. + pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) + + if copy_val_check_interval_for_save_every_n_steps and experiment_config.save_every_n_steps != training_config.val_check_interval: + logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") + experiment_config.save_every_n_steps = training_config.val_check_interval + + + global_batch_size = infer_global_batch_size(micro_batch_size=data_config.micro_batch_size, + num_nodes=parallel_config.num_nodes, + devices=parallel_config.num_devices, + accumulate_grad_batches=parallel_config.accumulate_grad_batches, + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size) + + data: SingleCellDataModule = singlecell_data_module(data_config, global_batch_size) + # TODO there must be a way to do this automatically. 
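+    # Until then, the coupling is manual: the model's seq_length must match the datamodule, and the bf16/fp16
+    # flags must agree with the precision the trainer plugin is configured with.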
+ geneformer_config.seq_length = data_config.seq_length + geneformer_config.bf16 = geneformer_config.params_dtype == 'bf16-mixed' + geneformer_config.fp16 = geneformer_config.params_dtype == '16-mixed' + + model: BioBertLightningModule = biobert_lightning_module(geneformer_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps) + trainer: nl.Trainer = setup_trainer_from_configs(parallel_config, training_config) + nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=WandbConfig(project=wandb_project, entity=wandb_entity, offline=wandb_offline)) + + llm.train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + resume=resume.AutoResume( + path=None, + resume_if_exists=resume_if_exists, # Looks for the -last checkpoint to continue training. + resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. + ), + ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py new file mode 100644 index 0000000000..148aaf352f --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -0,0 +1,6 @@ +import nemo_run as run +from factories import pretrain + +if __name__ == "__main__": + # TODO see if we can setup the experiment management thingy too. + run.cli.main(pretrain) \ No newline at end of file From ef9f4ea51bbc3425fbe88642b43a640d66a419cc Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 10 Sep 2024 22:44:08 +0000 Subject: [PATCH 03/58] added readme --- README.md | 130 ++++++++++++------------------------------------------ 1 file changed, 28 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 4be95117e8..b648565593 100644 --- a/README.md +++ b/README.md @@ -1,102 +1,28 @@ -# BioNeMo2 Repo -To get started, please build the docker container using -```bash -./launch.sh build -``` - -All `bionemo2` code is partitioned into independently installable namespace packages. These live under the `sub-packages/` directory. - - -# TODO: Finish this. - -## Downloading artifacts -Set the AWS access info in your `.env` in the host container prior to running docker: - -```bash -AWS_ACCESS_KEY_ID="team-bionemo" -AWS_SECRET_ACCESS_KEY=$(grep aws_secret_access_key ~/.aws/config | cut -d' ' -f 3) -AWS_REGION="us-east-1" -AWS_ENDPOINT_URL="https://pbss.s8k.io" -``` -then -```bash -python scripts/download_artifacts.py --models all --model_dir ./models --data all --data_dir ./ --verbose --source pbss -``` - -## Initializing 3rd-party dependencies as git submodules - -For development, the NeMo and Megatron-LM dependencies are vendored in the bionemo-2 repository workspace as git -submodules. The pinned commits for these submodules represent the "last-known-good" versions of these packages that are -confirmed to be working with bionemo2 (and those that are tested in CI). - -To initialize these sub-modules when cloning the repo, add the `--recursive` flag to the git clone command: - -```bash -git clone --recursive git@github.com:NVIDIA/bionemo-fw-ea.git -``` - -To download the pinned versions of these submodules within an existing git repository, run - -```bash -git submodule update --init --recursive -``` - -### Updating pinned versions of NeMo / Megatron-LM - -To update the pinned commits of NeMo or Megatron-LM, checkout that commit in the submodule folder, and then commit the -result in the top-level bionemo repository. 
- -```bash -cd 3rdparty/NeMo/ -git fetch -git checkout -cd ../.. -git add '3rdparty/NeMo/' -git commit -m "updating NeMo commit" -``` - -### Devloping with nemo+megatron+bionemo (deprecated) -``` -export NEMO_HOME=path/to/local/nemo -export MEGATRON_HOME=path/to/local/megatron -./launch.sh dev -``` -The above will make a `.env` file that you can edit as needed to get more variables into the container. - -## Models -### Geneformer -#### Get test data for geneformer -```bash -mkdir -p /workspace/bionemo2/data -aws s3 cp \ - s3://general-purpose/cellxgene_2023-12-15_small \ - /workspace/bionemo2/data/cellxgene_2023-12-15_small \ - --recursive \ - --endpoint-url https://pbss.s8k.io -``` -#### Running - -The following command runs a very small example of geneformer. - -```bash -python \ - scripts/singlecell/geneformer/pretrain.py \ - --data-dir data/cellxgene_2023-12-15_small/processed_data \ - --result-dir ./results \ - --experiment-name test_experiment \ - --num-gpus 1 \ - --num-nodes 1 \ - --val-check-interval 10 \ - --num-dataset-workers 0 \ - --num-steps 55 \ - --seq-length 128 \ - --limit-val-batches 2 \ - --micro-batch-size 2 -``` - -#### Updating License Header on Python Files -Make sure you have installed [`license-check`](https://gitlab-master.nvidia.com/clara-discovery/infra-bionemo), -which is defined in the development dependencies. If you add new Python (`.py`) files, be sure to run as: -```bash -license-check --license-header ./license_header --check . --modify --replace -``` +# NeMo-Run entrypoint + +`main.py` acts as a simple entrypoint to pretraining geneformer via use of configs and factories (`factories.py`). The command below will execute the equivalent of what we have under scripts/pretrain.py + +This module is a work in progress. + +``` bash +python sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py \ + geneformer_config=basic_geneformer_config_recipe \ + data_config=small_data_config \ + parallel_config=simple_parallel_recipe \ + training_config=default_trainer_config \ + optim_config=default_adam_optimizer_with_cosine_annealing_recipe \ + experiment_config=experiment_config_recipe \ + resume_if_exists=False \ + data_config.seq_length=128 \ + parallel_config.num_devices=1 \ + data_config.micro_batch_size=2 \ + training_config.precision=bf16-mixed +``` + +## Concepts and things to keep in mind + +Plain Function - A function that does literally anything and produces something else. In somecases, we have functions that take configs and produce an object. In these scenarios we are often composing an object with pieces of various configs. +Factory - A method that constructs a config and is decorated with run.cli.factory. These act as configs presentable to the command line. +Recipe - A specific factory with a distinct purpose. E.g. BERT XL vs BERT Small +Config - A fiddle dataclass presentable and mutatable via nemo run. These are also serialized and used for restoring previous configuations. +Entrypoint - A method that takes a mixture of plain arguments and configs. These are exposed to the command line. The body of the function represents the execution occuring in the program. 
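
To make the vocabulary above concrete, here is a minimal sketch of the pattern. The `Demo*` names are invented for illustration; only the `nemo_run` decorators and the `run.cli.main` call mirror how `factories.py` and `main.py` actually use them, and the CLI line in the final comment assumes the same name-based factory selection shown in the command above.

```python
from dataclasses import dataclass

import nemo_run as run


@dataclass
class DemoConfig:
    # "Config": a plain dataclass that nemo-run can serialize, present on the CLI, and mutate field-by-field.
    hidden_size: int = 256
    lr: float = 1e-4


@run.cli.factory
@run.autoconvert
def demo_small_recipe() -> DemoConfig:
    # "Recipe": a factory with one specific purpose (here, a small preset selectable by name on the CLI).
    return DemoConfig(hidden_size=128, lr=1e-3)


@run.cli.entrypoint
def demo_train(config: DemoConfig, dry_run: bool = True) -> None:
    # "Entrypoint": mixes configs with plain arguments; the body is what actually executes.
    print(f"hidden_size={config.hidden_size}, lr={config.lr}, dry_run={dry_run}")


if __name__ == "__main__":
    # e.g. `python demo.py config=demo_small_recipe config.lr=5e-4 dry_run=False`
    run.cli.main(demo_train)
```

`main.py` in this series is exactly the last step, with `pretrain` as the entrypoint and the recipes in `factories.py` as the selectable configs.
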
From 14e4bbd547d092f33eaf01a078507c1a81ca35db Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 11 Sep 2024 18:33:36 +0000 Subject: [PATCH 04/58] adding files for nemo run --- README.md | 130 ++++++++++++++---- .../src/bionemo/geneformer/run/README.md | 28 ++++ .../src/bionemo/geneformer/run/__init__.py | 16 +++ .../src/bionemo/geneformer/run/factories.py | 20 ++- .../src/bionemo/geneformer/run/main.py | 16 +++ 5 files changed, 180 insertions(+), 30 deletions(-) create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md diff --git a/README.md b/README.md index b648565593..4be95117e8 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,102 @@ -# NeMo-Run entrypoint - -`main.py` acts as a simple entrypoint to pretraining geneformer via use of configs and factories (`factories.py`). The command below will execute the equivalent of what we have under scripts/pretrain.py - -This module is a work in progress. - -``` bash -python sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py \ - geneformer_config=basic_geneformer_config_recipe \ - data_config=small_data_config \ - parallel_config=simple_parallel_recipe \ - training_config=default_trainer_config \ - optim_config=default_adam_optimizer_with_cosine_annealing_recipe \ - experiment_config=experiment_config_recipe \ - resume_if_exists=False \ - data_config.seq_length=128 \ - parallel_config.num_devices=1 \ - data_config.micro_batch_size=2 \ - training_config.precision=bf16-mixed -``` - -## Concepts and things to keep in mind - -Plain Function - A function that does literally anything and produces something else. In somecases, we have functions that take configs and produce an object. In these scenarios we are often composing an object with pieces of various configs. -Factory - A method that constructs a config and is decorated with run.cli.factory. These act as configs presentable to the command line. -Recipe - A specific factory with a distinct purpose. E.g. BERT XL vs BERT Small -Config - A fiddle dataclass presentable and mutatable via nemo run. These are also serialized and used for restoring previous configuations. -Entrypoint - A method that takes a mixture of plain arguments and configs. These are exposed to the command line. The body of the function represents the execution occuring in the program. +# BioNeMo2 Repo +To get started, please build the docker container using +```bash +./launch.sh build +``` + +All `bionemo2` code is partitioned into independently installable namespace packages. These live under the `sub-packages/` directory. + + +# TODO: Finish this. + +## Downloading artifacts +Set the AWS access info in your `.env` in the host container prior to running docker: + +```bash +AWS_ACCESS_KEY_ID="team-bionemo" +AWS_SECRET_ACCESS_KEY=$(grep aws_secret_access_key ~/.aws/config | cut -d' ' -f 3) +AWS_REGION="us-east-1" +AWS_ENDPOINT_URL="https://pbss.s8k.io" +``` +then +```bash +python scripts/download_artifacts.py --models all --model_dir ./models --data all --data_dir ./ --verbose --source pbss +``` + +## Initializing 3rd-party dependencies as git submodules + +For development, the NeMo and Megatron-LM dependencies are vendored in the bionemo-2 repository workspace as git +submodules. The pinned commits for these submodules represent the "last-known-good" versions of these packages that are +confirmed to be working with bionemo2 (and those that are tested in CI). 
+ +To initialize these sub-modules when cloning the repo, add the `--recursive` flag to the git clone command: + +```bash +git clone --recursive git@github.com:NVIDIA/bionemo-fw-ea.git +``` + +To download the pinned versions of these submodules within an existing git repository, run + +```bash +git submodule update --init --recursive +``` + +### Updating pinned versions of NeMo / Megatron-LM + +To update the pinned commits of NeMo or Megatron-LM, checkout that commit in the submodule folder, and then commit the +result in the top-level bionemo repository. + +```bash +cd 3rdparty/NeMo/ +git fetch +git checkout +cd ../.. +git add '3rdparty/NeMo/' +git commit -m "updating NeMo commit" +``` + +### Devloping with nemo+megatron+bionemo (deprecated) +``` +export NEMO_HOME=path/to/local/nemo +export MEGATRON_HOME=path/to/local/megatron +./launch.sh dev +``` +The above will make a `.env` file that you can edit as needed to get more variables into the container. + +## Models +### Geneformer +#### Get test data for geneformer +```bash +mkdir -p /workspace/bionemo2/data +aws s3 cp \ + s3://general-purpose/cellxgene_2023-12-15_small \ + /workspace/bionemo2/data/cellxgene_2023-12-15_small \ + --recursive \ + --endpoint-url https://pbss.s8k.io +``` +#### Running + +The following command runs a very small example of geneformer. + +```bash +python \ + scripts/singlecell/geneformer/pretrain.py \ + --data-dir data/cellxgene_2023-12-15_small/processed_data \ + --result-dir ./results \ + --experiment-name test_experiment \ + --num-gpus 1 \ + --num-nodes 1 \ + --val-check-interval 10 \ + --num-dataset-workers 0 \ + --num-steps 55 \ + --seq-length 128 \ + --limit-val-batches 2 \ + --micro-batch-size 2 +``` + +#### Updating License Header on Python Files +Make sure you have installed [`license-check`](https://gitlab-master.nvidia.com/clara-discovery/infra-bionemo), +which is defined in the development dependencies. If you add new Python (`.py`) files, be sure to run as: +```bash +license-check --license-header ./license_header --check . --modify --replace +``` diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md new file mode 100644 index 0000000000..427b78a85c --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md @@ -0,0 +1,28 @@ +# NeMo-Run entrypoint + +`main.py` acts as a simple entrypoint to pretraining geneformer via use of configs and factories (`factories.py`). The command below will execute the equivalent of what we have under scripts/pretrain.py + +This module is a work in progress. + +``` bash +python sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py \ + geneformer_config=basic_geneformer_config_recipe \ + data_config=small_data_config \ + parallel_config=simple_parallel_recipe \ + training_config=default_trainer_config \ + optim_config=default_adam_optimizer_with_cosine_annealing_recipe \ + experiment_config=experiment_config_recipe \ + resume_if_exists=False \ + data_config.seq_length=128 \ + parallel_config.num_devices=1 \ + data_config.micro_batch_size=2 \ + training_config.precision=bf16-mixed +``` + +## Concepts and things to keep in mind + +Plain Function - A function that does literally anything and produces something else. In somecases, we have functions that take configs and produce an object. In these scenarios we are often composing an object with pieces of various configs. 
+Factory - A method that constructs a config and is decorated with run.cli.factory. These act as configs presentable to the command line. +Recipe - A specific factory with a distinct purpose. E.g. BERT XL vs BERT Small +Config - A fiddle dataclass presentable and mutatable via nemo run. These are also serialized and used for restoring previous configuations. +Entrypoint - A method that takes a mixture of plain arguments and configs. These are exposed to the command line. The body of the function represents the execution occuring in the program. diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py index e69de29bb2..79672139c9 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py index da106c51a9..5cb3c4ddb5 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from dataclasses import dataclass from typing import List import pathlib @@ -365,8 +381,8 @@ def pretrain( data: SingleCellDataModule = singlecell_data_module(data_config, global_batch_size) # TODO there must be a way to do this automatically. 
geneformer_config.seq_length = data_config.seq_length - geneformer_config.bf16 = geneformer_config.params_dtype == 'bf16-mixed' - geneformer_config.fp16 = geneformer_config.params_dtype == '16-mixed' + geneformer_config.bf16 = training_config.precision == 'bf16-mixed' + geneformer_config.fp16 = training_config.precision == '16-mixed' model: BioBertLightningModule = biobert_lightning_module(geneformer_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps) trainer: nl.Trainer = setup_trainer_from_configs(parallel_config, training_config) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 148aaf352f..e17ca04b36 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import nemo_run as run from factories import pretrain From 28ddc6bb80dca6844ea9cd59ab9621e629eb2ef1 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 11 Sep 2024 15:55:11 -0700 Subject: [PATCH 05/58] Added entrypoint for launching experiment with a local executor and entrypoint for rerunning an experiment. Created an 'ExposedGeneformerConfig' to get around serialization errors with nemo run. --- .../src/bionemo/geneformer/run/factories.py | 96 +++++++++++++++++-- .../src/bionemo/geneformer/run/main.py | 71 +++++++++++++- 2 files changed, 154 insertions(+), 13 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py index 5cb3c4ddb5..af02f6f001 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -14,8 +14,8 @@ # limitations under the License. -from dataclasses import dataclass -from typing import List +from dataclasses import dataclass, asdict +from typing import Any, Callable, List import pathlib import nemo_run as run from typing import Sequence, Literal @@ -45,6 +45,8 @@ from bionemo.llm.model.biobert.model import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger + + @dataclass class DataConfig: data_dir: str @@ -177,14 +179,56 @@ def setup_trainer_from_configs(parallel_config: ParallelConfig, training_config: ) return trainer +@dataclass +class ExposedGeneformerConfig: + ''' NeMo run does not like GeneformerConfig due to use its use of lambdas. + + So I basicaly need a method that does This -> GeneformerConfig + then use regular recipes/factories on the parent and do this transform at the last step. 
+ ''' + params_dtype: PrecisionTypes + pipeline_dtype: PrecisionTypes + autocast_dtype: PrecisionTypes + num_layers: int =6 + hidden_size: int =256 + ffn_hidden_size: int =512 + num_attention_heads: int =4 + seq_length: int = 512 + fp32_residual_connection: bool =False + hidden_dropout: float =0.02 + init_method_std: float =0.02 + kv_channels: Optional[int]=None + apply_query_key_layer_scaling: bool =False + make_vocab_size_divisible_by: int =128 + masked_softmax_fusion: bool =True + fp16_lm_cross_entropy: bool =False + gradient_accumulation_fusion: bool =False + layernorm_zero_centered_gamma: bool =False + layernorm_epsilon: float=1.0e-12 + activation_func: Callable =F.gelu + qk_layernorm: bool=False + apply_residual_connection_post_layernorm: bool=False + bias_activation_fusion: bool =True + bias_dropout_fusion: bool =True + get_attention_mask_from_fusion: bool =False + attention_dropout: float =0.1 + share_embeddings_and_output_weights: bool =True + enable_autocast: bool = False + nemo1_ckpt_path: Optional[str] = None + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value + +def exposed_to_internal_geneformer_config(arg: ExposedGeneformerConfig) -> GeneformerConfig: + return GeneformerConfig(**asdict(arg)) @run.cli.factory @run.autoconvert -def basic_geneformer_config_recipe(seq_length: int = 128, precision: PrecisionTypes='bf16-mixed', nemo1_init_path: Optional[str]=None, biobert_spec_option: BiobertSpecOption=BiobertSpecOption.bert_layer_local_spec.value) -> GeneformerConfig: - # TODO seq_length must match the datamodule. We can pass in the DataConfig but I dont know how to enforce that its the same everywhere. - # another option is to construct this ad-hoc like we do with wandb +def basic_wrapped_geneformer_config_recipe(seq_length: int = 128, + precision: PrecisionTypes='bf16-mixed', + nemo1_init_path: Optional[str]=None, + biobert_spec_option: BiobertSpecOption=BiobertSpecOption.bert_layer_local_spec.value + ) -> ExposedGeneformerConfig: ''' Sets up the base GeneformerConfig. Recipes on geneformer configs should choose what to expose and come with sensible defaults. ''' - geneformer_config = GeneformerConfig( + geneformer_config = ExposedGeneformerConfig( num_layers=6, hidden_size=256, ffn_hidden_size=512, @@ -345,9 +389,38 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio ) return nemo_logger +def pretrain_partial( + geneformer_config: ExposedGeneformerConfig, + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + resume_if_exists: bool = True, + wandb_entity: Optional[str] = None, + wandb_project: Optional[str] = None, + wandb_offline: bool = True, +) -> run.Partial: + ''' Same as pretrain but in partial form instead of an entrypoint. ''' + + return run.Partial(pretrain, + geneformer_config=geneformer_config, + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + optim_config=optim_config, + experiment_config=experiment_config, + # Remaining are things that live outside a config + resume_if_exists=resume_if_exists, + # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. 
+ wandb_entity=wandb_entity, + wandb_project=wandb_project, + wandb_offline=wandb_offline + ) + @run.cli.entrypoint def pretrain( - geneformer_config: GeneformerConfig, + geneformer_config: ExposedGeneformerConfig, #noqa data_config: DataConfig, parallel_config: ParallelConfig, training_config: TrainingConfig, @@ -355,18 +428,21 @@ def pretrain( experiment_config: ExperimentConfig, # Remaining are things that live outside a config resume_if_exists: bool = True, + # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. wandb_entity: Optional[str] = None, wandb_project: Optional[str] = None, wandb_offline: bool = True, - copy_val_check_interval_for_save_every_n_steps: bool = True + new_experiment_title = 'asdf' ): - # NOTE: any config passed into the entrypoint can be MUTATED by the CLI. + + # To make this work correctly as an entrypoint we must actually wrap it in something else due how how certain local variables are used as defaults. + geneformer_config: GeneformerConfig = exposed_to_internal_geneformer_config(geneformer_config) # Setup. # Create requisite directory. pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) - if copy_val_check_interval_for_save_every_n_steps and experiment_config.save_every_n_steps != training_config.val_check_interval: + if experiment_config.save_every_n_steps != training_config.val_check_interval: logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") experiment_config.save_every_n_steps = training_config.val_check_interval diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index e17ca04b36..7ecc48d36c 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -15,8 +15,73 @@ import nemo_run as run -from factories import pretrain +from bionemo.geneformer.run.factories import ExposedGeneformerConfig, DataConfig, ParallelConfig, TrainingConfig, OptimizerSchedulerConfig, ExperimentConfig, pretrain_partial, pretrain +from dataclasses import dataclass +import pydantic +from typing import Optional + + +class NeMoRunConfig(pydantic.BaseModel): + # These are all mutually exclusive, I think thats important to capture. + new_experiment_title: Optional[str] + resume_from_id: Optional[str] + resume_from_title: Optional[str] + + def __post_init__(self): + if not any([self.new_experiment_title, self.resume_from_id, self.resume_from_title]): + raise ValueError("Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. None are set.") + + if sum([bool(self.new_experiment_title), bool(self.resume_from_id), bool(self.resume_from_title)]) > 1: + raise ValueError("Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. More than one field was set.") + +@run.cli.entrypoint +def run_again( + resume_from_id: Optional[str], # Note, in these cases we dont actually need the rest of the configs. Maybe these deserve distinct entrypoints. + resume_from_title: Optional[str], + # NOTE could optionall support execution kwargs and mutate those. +): + assert resume_from_id or resume_from_title, "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." 
+ assert not (resume_from_id and resume_from_title), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." + + # Setup the context manager with the correct entrypoint, expect these to be mutually exclusive + with run.Experiment.from_title(resume_from_title) \ + if resume_from_title is not None else \ + run.Experiment.from_id(resume_from_id) \ + as exp: + exp.executor = run.LocalExecutor() # Can we mutate? + print(exp) + exp.reset() + exp.run(sequential=True) + +@run.cli.entrypoint +def run_firsttime( + # NeMo Run controls. + experiment_title: str, + + # Pretrain configuration requirements. + geneformer_config: ExposedGeneformerConfig, + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + # Remaining are things that live outside a config + resume_if_exists: bool = True, + # WANDB + wandb_entity: Optional[str] = None, + wandb_project: Optional[str] = None, + wandb_offline: bool = True, +): + + + # TODO execution conditionals + local_executor = run.LocalExecutor() + with run.Experiment(title=experiment_title, executor=local_executor) as e: + # Input has to be a partial wrapper of pretrain? + e.add(pretrain_partial(geneformer_config, data_config, parallel_config, training_config, optim_config, experiment_config, resume_if_exists, wandb_entity, wandb_project, wandb_offline)) + e.run() + if __name__ == "__main__": - # TODO see if we can setup the experiment management thingy too. - run.cli.main(pretrain) \ No newline at end of file + run.cli.main(run_firsttime) + # run.cli.main(pretrain) \ No newline at end of file From 1be86782152125ad32cf63473516d207f4c00e62 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 23 Sep 2024 20:05:00 +0000 Subject: [PATCH 06/58] updating repo --- .../src/bionemo/geneformer/run/factories.py | 434 ++++++++++++------ .../src/bionemo/geneformer/run/main.py | 88 ++-- .../src/bionemo/llm/model/biobert/model.py | 1 + 3 files changed, 358 insertions(+), 165 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py index af02f6f001..aa0b698291 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -13,44 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -from dataclasses import dataclass, asdict -from typing import Any, Callable, List -import pathlib -import nemo_run as run -from typing import Sequence, Literal -from nemo import lightning as nl import math -from pathlib import Path -from typing import Optional, Sequence, get_args -from tokenizers import Tokenizer +import pathlib +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from typing import Callable, Generic, List, Literal, Optional, Type, TypeVar +import nemo_run as run from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm -from nemo.lightning import io, resume +from nemo.lightning import resume from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.optim import MegatronOptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.utils import logging from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from tokenizers import Tokenizer from torch.nn import functional as F from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess -from bionemo.llm.lightning import LossLoggingCallback +from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BiobertSpecOption -from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size +from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption +from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger +run.Config + + @dataclass class DataConfig: data_dir: str - result_dir: str = './results' + result_dir: str = "./results" seq_length: int = 2048 num_dataset_workers: int = 0 micro_batch_size: int = 8 @@ -67,23 +66,31 @@ def val_data_path(self) -> str: def test_data_path(self) -> str: return self.data_dir + "/test" + @run.cli.factory @run.autoconvert -def small_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data") -> DataConfig: +def small_data_config( + data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data", +) -> DataConfig: # NOTE theoretically we could validate that this stuff exists. return DataConfig(data_dir=data_dir) + @run.cli.factory @run.autoconvert -def full_geneformer_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data") -> DataConfig: +def full_geneformer_data_config( + data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data", +) -> DataConfig: # NOTE theoretically we could validate that this stuff exists. 
return DataConfig(data_dir=data_dir) + @dataclass class GeneformerDataArtifacts: - tokenizer: Tokenizer # TODO(SKH) typing isnt right + tokenizer: Tokenizer # TODO(SKH) typing isnt right median_dict: dict + def geneformer_preprocess_recipe(data_config: DataConfig) -> GeneformerDataArtifacts: preprocessor = GeneformerPreprocess( download_directory=pathlib.Path(data_config.train_data_path), @@ -110,14 +117,15 @@ def singlecell_data_module(data_config: DataConfig, global_batch_size: int) -> S random_token_prob=0.02, # changed to represent the incorrect setting we originally used. median_dict=geneformer_data_artifacts.median_dict, micro_batch_size=data_config.micro_batch_size, - global_batch_size=global_batch_size, + global_batch_size=global_batch_size, # persistent workers is supported when num_dataset_workers > 0 persistent_workers=data_config.num_dataset_workers > 0, pin_memory=False, num_workers=data_config.num_dataset_workers, ) return data - + + @dataclass class ParallelConfig: tensor_model_parallel_size: int = 1 @@ -131,27 +139,34 @@ class ParallelConfig: @run.cli.factory @run.autoconvert -def simple_parallel_recipe(tensor_model_parallel_size: int =1, pipeline_model_parallel_size: int = 1, num_devices: int = 1) -> ParallelConfig: +def simple_parallel_recipe( + tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 +) -> ParallelConfig: # TODO validatorssssssss, make sure we get everythign right here. - assert num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size, "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - return ParallelConfig(tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size, num_devices=num_devices) + assert ( + num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + return ParallelConfig( + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + num_devices=num_devices, + ) + @dataclass class TrainingConfig: max_steps: int limit_val_batches: int val_check_interval: int - precision: PrecisionTypes = 'bf16-mixed' - accelerator: str = 'gpu' + precision: PrecisionTypes = "bf16-mixed" + accelerator: str = "gpu" + @run.cli.factory @run.autoconvert def default_trainer_config() -> TrainingConfig: - return TrainingConfig( - max_steps=55000, - limit_val_batches=2, - val_check_interval=100 - ) + return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) + def setup_trainer_from_configs(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: # Because this returns a trainer, and trainer is not an argument to the entrypoint, this is not a factory. @@ -179,55 +194,158 @@ def setup_trainer_from_configs(parallel_config: ParallelConfig, training_config: ) return trainer + +ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) + + +@dataclass +class ExposedModelConfig(Generic[ModelConfigT], ABC): + """ExposedConfigs are meant to be used as a way to expose a subset of the underlying model config. + + Due to the fact that some fields in the underlying TransformerConfig are not serializable, it must be wrapped. + We tie each concrete ExposedModelConfig to a specific ModelConfigT, which is a subclass of BioBertGenericConfig. 
+ Then, we expect implementors to implement a method using the same type called `model_class`, this returns the literal + type ModelConfigT. + + exposed_to_internal_model_config is then a universal method that unpacks the exposed config and returns the underlying model config. + + Users are expected to choose a recipe that returns the ExposedModelConfig of interest and parameterize it accordingly. + Developers should carefully create recipes and factories that reflect common usescases, and these will be specified on the CLI. + """ + + @abstractmethod + def model_class(self) -> Type[ModelConfigT]: ... + + def exposed_to_internal_model_config(self) -> ModelConfigT: + # This is bad because it doesnt actually leverage any generics + cls: Type[ModelConfigT] = self.model_class() + return cls(**asdict(self)) + + @dataclass -class ExposedGeneformerConfig: - ''' NeMo run does not like GeneformerConfig due to use its use of lambdas. - +class ExposedGeneformerConfig(ExposedModelConfig[GeneformerConfig]): + """NeMo run does not like GeneformerConfig due to use its use of lambdas. + So I basicaly need a method that does This -> GeneformerConfig - then use regular recipes/factories on the parent and do this transform at the last step. - ''' + then use regular recipes/factories on the parent and do this transform at the last step. + """ + + params_dtype: PrecisionTypes + pipeline_dtype: PrecisionTypes + autocast_dtype: PrecisionTypes + num_layers: int = 6 + hidden_size: int = 256 + ffn_hidden_size: int = 512 + num_attention_heads: int = 4 + seq_length: int = 512 + fp32_residual_connection: bool = False + hidden_dropout: float = 0.02 + init_method_std: float = 0.02 + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: bool = False + make_vocab_size_divisible_by: int = 128 + masked_softmax_fusion: bool = True + fp16_lm_cross_entropy: bool = False + gradient_accumulation_fusion: bool = False + layernorm_zero_centered_gamma: bool = False + layernorm_epsilon: float = 1.0e-12 + activation_func: Callable = F.gelu + qk_layernorm: bool = False + apply_residual_connection_post_layernorm: bool = False + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + get_attention_mask_from_fusion: bool = False + attention_dropout: float = 0.1 + share_embeddings_and_output_weights: bool = True + enable_autocast: bool = False + nemo1_ckpt_path: Optional[str] = None + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value + nemo1_ckpt_path: Optional[str] = None + # NOTE: handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities + initial_ckpt_path: Optional[str] = None + + def model_class(self) -> Type[GeneformerConfig]: + return GeneformerConfig + + +@dataclass +class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): + """NOTE could use inheritence here, but the typing gets really weird and we'd rather have no red squiggles.""" + params_dtype: PrecisionTypes pipeline_dtype: PrecisionTypes autocast_dtype: PrecisionTypes - num_layers: int =6 - hidden_size: int =256 - ffn_hidden_size: int =512 - num_attention_heads: int =4 + num_layers: int = 6 + hidden_size: int = 256 + ffn_hidden_size: int = 512 + num_attention_heads: int = 4 seq_length: int = 512 - fp32_residual_connection: bool =False - hidden_dropout: float =0.02 - init_method_std: float =0.02 - kv_channels: Optional[int]=None - apply_query_key_layer_scaling: bool =False - make_vocab_size_divisible_by: int =128 - 
masked_softmax_fusion: bool =True - fp16_lm_cross_entropy: bool =False - gradient_accumulation_fusion: bool =False - layernorm_zero_centered_gamma: bool =False - layernorm_epsilon: float=1.0e-12 - activation_func: Callable =F.gelu - qk_layernorm: bool=False - apply_residual_connection_post_layernorm: bool=False - bias_activation_fusion: bool =True - bias_dropout_fusion: bool =True - get_attention_mask_from_fusion: bool =False - attention_dropout: float =0.1 - share_embeddings_and_output_weights: bool =True + fp32_residual_connection: bool = False + hidden_dropout: float = 0.02 + init_method_std: float = 0.02 + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: bool = False + make_vocab_size_divisible_by: int = 128 + masked_softmax_fusion: bool = True + fp16_lm_cross_entropy: bool = False + gradient_accumulation_fusion: bool = False + layernorm_zero_centered_gamma: bool = False + layernorm_epsilon: float = 1.0e-12 + activation_func: Callable = F.gelu + qk_layernorm: bool = False + apply_residual_connection_post_layernorm: bool = False + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + get_attention_mask_from_fusion: bool = False + attention_dropout: float = 0.1 + share_embeddings_and_output_weights: bool = True enable_autocast: bool = False nemo1_ckpt_path: Optional[str] = None biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value + nemo1_ckpt_path: Optional[str] = None + # NOTE: handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities + initial_ckpt_path: Optional[str] = None + # NOTE only new attribute between this config and the geneformer config. + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None + + def __post_init__(self): + if not self.initial_ckpt_skip_keys_with_these_prefixes: + self.initial_ckpt_skip_keys_with_these_prefixes = ["regression_head"] + + def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: + return FineTuneSeqLenBioBertConfig -def exposed_to_internal_geneformer_config(arg: ExposedGeneformerConfig) -> GeneformerConfig: - return GeneformerConfig(**asdict(arg)) @run.cli.factory @run.autoconvert -def basic_wrapped_geneformer_config_recipe(seq_length: int = 128, - precision: PrecisionTypes='bf16-mixed', - nemo1_init_path: Optional[str]=None, - biobert_spec_option: BiobertSpecOption=BiobertSpecOption.bert_layer_local_spec.value - ) -> ExposedGeneformerConfig: - ''' Sets up the base GeneformerConfig. Recipes on geneformer configs should choose what to expose and come with sensible defaults. ''' +def geneformer_finetuning_regression_head_recipe( + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, +) -> ExposedModelConfig[FineTuneSeqLenBioBertConfig]: + finetuning_config = ExposedFineTuneSeqLenBioBertConfig( + params_dtype=get_autocast_dtype(precision), + pipeline_dtype=get_autocast_dtype(precision), + autocast_dtype=get_autocast_dtype(precision), # setting this speeds things up a lot + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, + ) + return finetuning_config + + +# TODO(SKH) rename this recipe to something more understandable. 
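# Illustrative sketch of how the exposed/internal split is meant to be used; the helper
# name below is hypothetical and only demonstrates the call described in the
# ExposedModelConfig docstring.
def _example_unwrap_exposed_config(
    exposed: ExposedFineTuneSeqLenBioBertConfig,
) -> FineTuneSeqLenBioBertConfig:
    # Recipes hand back the serializable exposed wrapper; the object returned here is
    # the config the underlying BioBert model code actually consumes.
    return exposed.exposed_to_internal_model_config()


# Next up: the geneformer10M pretraining recipe.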
+@run.cli.factory +@run.autoconvert +def geneformer10M_pretraining_recipe( + seq_length: int = 128, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value, +) -> ExposedModelConfig[GeneformerConfig]: + """Sets up the base GeneformerConfig. Recipes on geneformer configs should choose what to expose and come with sensible defaults.""" geneformer_config = ExposedGeneformerConfig( num_layers=6, hidden_size=256, @@ -259,44 +377,75 @@ def basic_wrapped_geneformer_config_recipe(seq_length: int = 128, enable_autocast=False, # This has to be set to True if we use the mixed precision plugin biobert_spec_option=biobert_spec_option, nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, ) return geneformer_config + @dataclass class OptimizerSchedulerConfig: lr: float = 1e-4 - optimizer: str = 'adam' # TODO Literal - cosine_rampup_frac: float = .01 - cosine_hold_frac: float = .05 - interval: str = 'step' # TODO Literal - monitor: str = 'val_loss' + optimizer: str = "adam" # TODO Literal + cosine_rampup_frac: float = 0.01 + cosine_hold_frac: float = 0.05 + interval: str = "step" # TODO Literal + monitor: str = "val_loss" + @run.cli.factory @run.autoconvert def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - ''' Prefers the default parameters for the Optimizer and Scheduler. ''' + """Prefers the default parameters for the Optimizer and Scheduler.""" return OptimizerSchedulerConfig() + @run.cli.factory @run.autoconvert -def exposed_optimizer_recipe(lr: float, optimizer: str, cosine_rampup_frac: float, cosine_hold_frac: float, interval: str, monitor: str) -> OptimizerSchedulerConfig: - ''' This recipe exposes all parameters to the underlying OptimizerSchedulerConfig. ''' - return OptimizerSchedulerConfig(lr=lr, optimizer=optimizer, cosine_rampup_frac=cosine_rampup_frac, cosine_hold_frac=cosine_hold_frac, interval=interval, monitor=monitor) +def exposed_optimizer_recipe( + lr: float, optimizer: str, cosine_rampup_frac: float, cosine_hold_frac: float, interval: str, monitor: str +) -> OptimizerSchedulerConfig: + """This recipe exposes all parameters to the underlying OptimizerSchedulerConfig.""" + return OptimizerSchedulerConfig( + lr=lr, + optimizer=optimizer, + cosine_rampup_frac=cosine_rampup_frac, + cosine_hold_frac=cosine_hold_frac, + interval=interval, + monitor=monitor, + ) + @run.cli.factory @run.autoconvert -def optimizer_recipe_with_kwarg_defaults(lr: float = 1e-4, optimizer: str = 'adam', cosine_rampup_frac: float = .01, cosine_hold_frac: float = .05, interval: str = 'step', monitor: str = 'val_loss') -> OptimizerSchedulerConfig: - ''' This recipe exposes all parameters to the underlying OptimizerSchedulerConfig and provides defaults as kwargs. 
''' - return OptimizerSchedulerConfig(lr=lr, optimizer=optimizer, cosine_rampup_frac=cosine_rampup_frac, cosine_hold_frac=cosine_hold_frac, interval=interval, monitor=monitor) +def optimizer_recipe_with_kwarg_defaults( + lr: float = 1e-4, + optimizer: str = "adam", + cosine_rampup_frac: float = 0.01, + cosine_hold_frac: float = 0.05, + interval: str = "step", + monitor: str = "val_loss", +) -> OptimizerSchedulerConfig: + """This recipe exposes all parameters to the underlying OptimizerSchedulerConfig and provides defaults as kwargs.""" + return OptimizerSchedulerConfig( + lr=lr, + optimizer=optimizer, + cosine_rampup_frac=cosine_rampup_frac, + cosine_hold_frac=cosine_hold_frac, + interval=interval, + monitor=monitor, + ) + -def biobert_lightning_module(geneformer_config: GeneformerConfig, tokenizer: Tokenizer, optim_config: OptimizerSchedulerConfig, num_steps: int) -> BioBertLightningModule: - ''' Function that constructs a lightning module from the requisite configs. +def biobert_lightning_module( + model_config: BioBertGenericConfig, tokenizer: Tokenizer, optim_config: OptimizerSchedulerConfig, num_steps: int +) -> BioBertLightningModule: + """Function that constructs a lightning module from the requisite configs. tokenizer: Tokenizer - must be the same tokenizer used by the DataModule. - num_steps: int - must match the number of steps in the DataConfig. - ''' + num_steps: int - must match the number of steps in the DataConfig. + """ model = BioBertLightningModule( - geneformer_config, + model_config, tokenizer=tokenizer, optimizer=MegatronOptimizerModule( config=OptimizerConfig( @@ -305,8 +454,8 @@ def biobert_lightning_module(geneformer_config: GeneformerConfig, tokenizer: Tok use_distributed_optimizer=True, # Pass through fp16/bf16 settings to avoid errors around model having bf16 enabled but optimizer not. # implies these configs must be coupled. - fp16=geneformer_config.fp16, - bf16=geneformer_config.bf16, + fp16=model_config.fp16, + bf16=model_config.bf16, ), lr_scheduler=CosineAnnealingScheduler( max_steps=num_steps, @@ -321,6 +470,7 @@ def biobert_lightning_module(geneformer_config: GeneformerConfig, tokenizer: Tok ) return model + @dataclass class ExperimentConfig: save_every_n_steps: int @@ -328,35 +478,38 @@ class ExperimentConfig: experiment_name: str restore_from_checkpoint_path: Optional[str] resume_if_exists: bool - wandb_options: WandbLoggerOptions = None # TODO(SKH) if we are passing a type in here its gonna blow up. + wandb_options: WandbLoggerOptions = None # TODO(SKH) if we are passing a type in here its gonna blow up. save_best_checkpoint: bool = False save_last_checkpoint: bool = True - metric_to_monitor_for_checkpoints: str = 'reduced_train_loss' # TODO literal? + metric_to_monitor_for_checkpoints: str = "reduced_train_loss" # TODO literal? 
save_top_k: int = 2 create_tensorboard_logger: bool = False + @run.cli.factory @run.autoconvert def experiment_config_recipe() -> ExperimentConfig: return ExperimentConfig( save_every_n_steps=100, - result_dir='./results', - experiment_name='default_experiment', + result_dir="./results", + experiment_name="default_experiment", restore_from_checkpoint_path=None, resume_if_exists=True, save_best_checkpoint=False, save_last_checkpoint=True, - metric_to_monitor_for_checkpoints='reduced_train_loss', + metric_to_monitor_for_checkpoints="reduced_train_loss", save_top_k=2, create_tensorboard_logger=False, ) + @dataclass class WandbConfig: # NOTE(SKH) there is some duplication with WandbLoggerOptions - project: str # Must be set to log to wandb, this is the 'project' directory under your 'entity' - entity: str # Sometimes refers to team, sometimes username - offline: bool # If set does not log to wandb + project: str # Must be set to log to wandb, this is the 'project' directory under your 'entity' + entity: str # Sometimes refers to team, sometimes username + offline: bool # If set does not log to wandb + def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: checkpoint_callback = nl_callbacks.ModelCheckpoint( @@ -389,22 +542,24 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio ) return nemo_logger + def pretrain_partial( - geneformer_config: ExposedGeneformerConfig, - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - resume_if_exists: bool = True, - wandb_entity: Optional[str] = None, - wandb_project: Optional[str] = None, - wandb_offline: bool = True, + model_config: ExposedModelConfig[ModelConfigT], + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + resume_if_exists: bool = True, + wandb_entity: Optional[str] = None, + wandb_project: Optional[str] = None, + wandb_offline: bool = True, ) -> run.Partial: - ''' Same as pretrain but in partial form instead of an entrypoint. ''' + """Same as pretrain but in partial form instead of an entrypoint.""" - return run.Partial(pretrain, - geneformer_config=geneformer_config, + return run.Partial( + pretrain, + model_config=model_config, data_config=data_config, parallel_config=parallel_config, training_config=training_config, @@ -415,28 +570,28 @@ def pretrain_partial( # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. wandb_entity=wandb_entity, wandb_project=wandb_project, - wandb_offline=wandb_offline + wandb_offline=wandb_offline, ) + @run.cli.entrypoint def pretrain( - geneformer_config: ExposedGeneformerConfig, #noqa - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - # Remaining are things that live outside a config - resume_if_exists: bool = True, - # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. 
- wandb_entity: Optional[str] = None, - wandb_project: Optional[str] = None, - wandb_offline: bool = True, - new_experiment_title = 'asdf' - ): - - # To make this work correctly as an entrypoint we must actually wrap it in something else due how how certain local variables are used as defaults. - geneformer_config: GeneformerConfig = exposed_to_internal_geneformer_config(geneformer_config) + model_config: ExposedModelConfig[ModelConfigT], # noqa + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + # Remaining are things that live outside a config + resume_if_exists: bool = True, + # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. + wandb_entity: Optional[str] = None, + wandb_project: Optional[str] = None, + wandb_offline: bool = True, + # ??? what was I doing with new_experiment title? + new_experiment_title="asdf", +): + model_config: ModelConfigT = model_config.exposed_to_internal_model_config() # Setup. # Create requisite directory. @@ -446,23 +601,28 @@ def pretrain( logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") experiment_config.save_every_n_steps = training_config.val_check_interval - - global_batch_size = infer_global_batch_size(micro_batch_size=data_config.micro_batch_size, - num_nodes=parallel_config.num_nodes, - devices=parallel_config.num_devices, - accumulate_grad_batches=parallel_config.accumulate_grad_batches, - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size) + global_batch_size = infer_global_batch_size( + micro_batch_size=data_config.micro_batch_size, + num_nodes=parallel_config.num_nodes, + devices=parallel_config.num_devices, + accumulate_grad_batches=parallel_config.accumulate_grad_batches, + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ) data: SingleCellDataModule = singlecell_data_module(data_config, global_batch_size) # TODO there must be a way to do this automatically. 
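    # Worked example of the batch-size bookkeeping above, assuming infer_global_batch_size
    # follows the usual Megatron accounting: with micro_batch_size=8, 1 node x 8 devices,
    # accumulate_grad_batches=1, and tensor/pipeline parallel sizes of 1, the data-parallel
    # size is 8 and the global batch size is 8 * 8 * 1 = 64 sequences per optimizer step.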
- geneformer_config.seq_length = data_config.seq_length - geneformer_config.bf16 = training_config.precision == 'bf16-mixed' - geneformer_config.fp16 = training_config.precision == '16-mixed' + model_config.seq_length = data_config.seq_length + model_config.bf16 = training_config.precision == "bf16-mixed" + model_config.fp16 = training_config.precision == "16-mixed" - model: BioBertLightningModule = biobert_lightning_module(geneformer_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps) + model: BioBertLightningModule = biobert_lightning_module( + model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps + ) trainer: nl.Trainer = setup_trainer_from_configs(parallel_config, training_config) - nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=WandbConfig(project=wandb_project, entity=wandb_entity, offline=wandb_offline)) + nemo_logger: nl.NeMoLogger = nemo_logger_factory( + experiment_config, wandb_config=WandbConfig(project=wandb_project, entity=wandb_entity, offline=wandb_offline) + ) llm.train( model=model, @@ -471,7 +631,7 @@ def pretrain( log=nemo_logger, resume=resume.AutoResume( path=None, - resume_if_exists=resume_if_exists, # Looks for the -last checkpoint to continue training. + resume_if_exists=False, # To resume training a specific checkpoint simply set initial_ckpt_path in the ModelConfig. resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. ), ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 7ecc48d36c..2ae0d56e99 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -14,57 +14,78 @@ # limitations under the License. +from typing import Optional + import nemo_run as run -from bionemo.geneformer.run.factories import ExposedGeneformerConfig, DataConfig, ParallelConfig, TrainingConfig, OptimizerSchedulerConfig, ExperimentConfig, pretrain_partial, pretrain -from dataclasses import dataclass import pydantic -from typing import Optional + +from bionemo.geneformer.run.factories import ( + DataConfig, + ExperimentConfig, + ExposedModelConfig, + ModelConfigT, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, + pretrain_partial, +) class NeMoRunConfig(pydantic.BaseModel): # These are all mutually exclusive, I think thats important to capture. + # NOTE havent figured out how to use this config yet. + # could pass this into the entrypoint and do a branch based on the config new_experiment_title: Optional[str] resume_from_id: Optional[str] resume_from_title: Optional[str] def __post_init__(self): if not any([self.new_experiment_title, self.resume_from_id, self.resume_from_title]): - raise ValueError("Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. None are set.") + raise ValueError( + "Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. None are set." + ) + + if sum([bool(self.new_experiment_title), bool(self.resume_from_id), bool(self.resume_from_title)]) > 1: + raise ValueError( + "Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. More than one field was set." 
+ ) - if sum([bool(self.new_experiment_title), bool(self.resume_from_id), bool(self.resume_from_title)]) > 1: - raise ValueError("Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. More than one field was set.") @run.cli.entrypoint def run_again( - resume_from_id: Optional[str], # Note, in these cases we dont actually need the rest of the configs. Maybe these deserve distinct entrypoints. + resume_from_id: Optional[ + str + ], # Note, in these cases we dont actually need the rest of the configs. Maybe these deserve distinct entrypoints. resume_from_title: Optional[str], - # NOTE could optionall support execution kwargs and mutate those. ): - assert resume_from_id or resume_from_title, "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." - assert not (resume_from_id and resume_from_title), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." + """Example entrypoint of how to re-run an existing job.""" + assert ( + resume_from_id or resume_from_title + ), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." + assert not ( + resume_from_id and resume_from_title + ), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." # Setup the context manager with the correct entrypoint, expect these to be mutually exclusive - with run.Experiment.from_title(resume_from_title) \ - if resume_from_title is not None else \ - run.Experiment.from_id(resume_from_id) \ - as exp: - exp.executor = run.LocalExecutor() # Can we mutate? - print(exp) + with run.Experiment.from_title(resume_from_title) if resume_from_title is not None else run.Experiment.from_id( + resume_from_id + ) as exp: + exp.executor = run.LocalExecutor() # Can we mutate? exp.reset() exp.run(sequential=True) + @run.cli.entrypoint def run_firsttime( # NeMo Run controls. experiment_title: str, - # Pretrain configuration requirements. - geneformer_config: ExposedGeneformerConfig, - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, + model_config: ExposedModelConfig[ModelConfigT], + data_config: DataConfig, + parallel_config: ParallelConfig, + training_config: TrainingConfig, optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, + experiment_config: ExperimentConfig, # Remaining are things that live outside a config resume_if_exists: bool = True, # WANDB @@ -72,16 +93,27 @@ def run_firsttime( wandb_project: Optional[str] = None, wandb_offline: bool = True, ): - - - # TODO execution conditionals local_executor = run.LocalExecutor() with run.Experiment(title=experiment_title, executor=local_executor) as e: # Input has to be a partial wrapper of pretrain? 
- e.add(pretrain_partial(geneformer_config, data_config, parallel_config, training_config, optim_config, experiment_config, resume_if_exists, wandb_entity, wandb_project, wandb_offline)) - e.run() + e.add( + pretrain_partial( + model_config, + data_config, + parallel_config, + training_config, + optim_config, + experiment_config, + resume_if_exists, + wandb_entity, + wandb_project, + wandb_offline, + ) + ) + # TODO direct + e.run(direct=True) if __name__ == "__main__": run.cli.main(run_firsttime) - # run.cli.main(pretrain) \ No newline at end of file + # run.cli.main(pretrain) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py index 1bf2b8fcea..afb2ee8c61 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py @@ -421,6 +421,7 @@ class BioBertGenericConfig( nemo1_ckpt_path: Optional[str] = None initial_ckpt_path: Optional[str] = None + # TODO(@jstjohn, @skothenhill) Was this supposed to be only on the child? initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) # Used if initializing from a checkpoint, set this to any fields you want to override rather than re-set. # by default all fields will be overridden. From a3ce991d09bf11f8bb1cd44853da75604594fc11 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 09:37:43 -0700 Subject: [PATCH 07/58] updating --- .gitmodules | 3 +++ Dockerfile | 4 ++-- .../src/bionemo/geneformer/run/factories.py | 5 ----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0b6458ab20..035642e72d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "3rdparty/NeMo"] path = 3rdparty/NeMo url = https://github.com/NVIDIA/NeMo.git +[submodule "3rdparty/NeMo-Run"] + path = 3rdparty/NeMo-Run + url = git@github.com:NVIDIA/NeMo-Run.git diff --git a/Dockerfile b/Dockerfile index a0024292c7..4aae8c336d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,8 +48,8 @@ RUN pip --disable-pip-version-check --no-cache-dir install \ git+https://github.com/state-spaces/mamba.git@v2.0.3 RUN pip install hatchling # needed to install nemo-run -ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 -RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} +ARG NEMO_RUN_TAG=8701f5f2c6c3a4a72bd2a435c872d7dcc4560527 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_TAG} FROM bionemo2-base AS pip-requirements diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py index aa0b698291..bcefef373e 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -479,7 +479,6 @@ class ExperimentConfig: restore_from_checkpoint_path: Optional[str] resume_if_exists: bool wandb_options: WandbLoggerOptions = None # TODO(SKH) if we are passing a type in here its gonna blow up. - save_best_checkpoint: bool = False save_last_checkpoint: bool = True metric_to_monitor_for_checkpoints: str = "reduced_train_loss" # TODO literal? 
save_top_k: int = 2 @@ -495,7 +494,6 @@ def experiment_config_recipe() -> ExperimentConfig: experiment_name="default_experiment", restore_from_checkpoint_path=None, resume_if_exists=True, - save_best_checkpoint=False, save_last_checkpoint=True, metric_to_monitor_for_checkpoints="reduced_train_loss", save_top_k=2, @@ -513,12 +511,10 @@ class WandbConfig: def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: checkpoint_callback = nl_callbacks.ModelCheckpoint( - save_best_model=experiment_config.save_best_checkpoint, save_last=experiment_config.save_last_checkpoint, monitor=experiment_config.metric_to_monitor_for_checkpoints, save_top_k=experiment_config.save_top_k, every_n_train_steps=experiment_config.save_every_n_steps, - enable_nemo_ckpt_io=True, ) wandb_options: Optional[WandbLoggerOptions] = ( @@ -630,7 +626,6 @@ def pretrain( trainer=trainer, log=nemo_logger, resume=resume.AutoResume( - path=None, resume_if_exists=False, # To resume training a specific checkpoint simply set initial_ckpt_path in the ModelConfig. resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. ), From bd852533508d529f37f03ae16d844ddb61e3df83 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 12:20:35 -0700 Subject: [PATCH 08/58] expose launcher --- .../src/bionemo/geneformer/run/main.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 2ae0d56e99..49359f38f1 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -72,7 +72,8 @@ def run_again( ) as exp: exp.executor = run.LocalExecutor() # Can we mutate? exp.reset() - exp.run(sequential=True) + exp.run(direct=True, sequential=True) + # exp.run(direct=True, tail_logs=True, sequential=True) @run.cli.entrypoint @@ -92,8 +93,10 @@ def run_firsttime( wandb_entity: Optional[str] = None, wandb_project: Optional[str] = None, wandb_offline: bool = True, + launcher: str | None = None ): - local_executor = run.LocalExecutor() + # Set launcher='torchrun' to execute on the cluster + local_executor = run.LocalExecutor(launcher=launcher) with run.Experiment(title=experiment_title, executor=local_executor) as e: # Input has to be a partial wrapper of pretrain? 
e.add( @@ -116,4 +119,4 @@ def run_firsttime( if __name__ == "__main__": run.cli.main(run_firsttime) - # run.cli.main(pretrain) + # run.cli.main(run_again) From a1e94581832d2b4cc5251ac9b50d21142b70a8bf Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 19:49:57 +0000 Subject: [PATCH 09/58] update type signature on entrypoint to avoid the | operator --- .../bionemo-geneformer/src/bionemo/geneformer/run/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 49359f38f1..0c870d3dea 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -93,7 +93,7 @@ def run_firsttime( wandb_entity: Optional[str] = None, wandb_project: Optional[str] = None, wandb_offline: bool = True, - launcher: str | None = None + launcher: Optional[str] = None, ): # Set launcher='torchrun' to execute on the cluster local_executor = run.LocalExecutor(launcher=launcher) From 8924a847f9e56ef5f5f199e181d8e1ea2b5470ee Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 21:35:00 +0000 Subject: [PATCH 10/58] wrapping in distributed guards --- .../src/bionemo/geneformer/run/factories.py | 1 + .../src/bionemo/geneformer/run/main.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py index bcefef373e..a866cb5bdc 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py @@ -515,6 +515,7 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio monitor=experiment_config.metric_to_monitor_for_checkpoints, save_top_k=experiment_config.save_top_k, every_n_train_steps=experiment_config.save_every_n_steps, + always_save_context=True, ) wandb_options: Optional[WandbLoggerOptions] = ( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 0c870d3dea..3918b3270d 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -14,10 +14,11 @@ # limitations under the License. -from typing import Optional +from typing import Optional, Union import nemo_run as run import pydantic +import torch from bionemo.geneformer.run.factories import ( DataConfig, @@ -72,8 +73,12 @@ def run_again( ) as exp: exp.executor = run.LocalExecutor() # Can we mutate? 
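        # For context: `run_again` rebuilds an existing experiment from its title or id,
        # swaps in a fresh LocalExecutor (the "can we mutate?" question above), then
        # resets and re-runs it with the calls that follow.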
exp.reset() - exp.run(direct=True, sequential=True) - # exp.run(direct=True, tail_logs=True, sequential=True) + exp.run(direct=True, tail_logs=True, sequential=True) + + +@run.cli.entrypoint +def simple_example(this_or_that: Union[str, int]): + print(this_or_that) @run.cli.entrypoint @@ -118,5 +123,7 @@ def run_firsttime( if __name__ == "__main__": - run.cli.main(run_firsttime) + if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: + run.cli.main(run_firsttime) + # run.cli.main(simple_example) # run.cli.main(run_again) From 748cc228d708d84948ad7f517e968a38aed9ce96 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 21:52:09 +0000 Subject: [PATCH 11/58] fixed --- .../bionemo-geneformer/src/bionemo/geneformer/run/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 3918b3270d..d23ee63fcc 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -101,7 +101,9 @@ def run_firsttime( launcher: Optional[str] = None, ): # Set launcher='torchrun' to execute on the cluster - local_executor = run.LocalExecutor(launcher=launcher) + # local_executor = run.LocalExecutor(launcher=launcher) + + local_executor = run.LocalExecutor(ntasks_per_node=parallel_config.num_devices, launcher=launcher) with run.Experiment(title=experiment_title, executor=local_executor) as e: # Input has to be a partial wrapper of pretrain? e.add( @@ -125,5 +127,7 @@ def run_firsttime( if __name__ == "__main__": if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: run.cli.main(run_firsttime) + else: + run.cli.main(run_firsttime) # run.cli.main(simple_example) # run.cli.main(run_again) From 758aaecc65031530751c095c727eac58ffd5188b Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 22:08:33 +0000 Subject: [PATCH 12/58] update --- .../bionemo-geneformer/src/bionemo/geneformer/run/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index d23ee63fcc..669a0b3f04 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -120,8 +120,7 @@ def run_firsttime( wandb_offline, ) ) - # TODO direct - e.run(direct=True) + e.run() if __name__ == "__main__": From 254e46f458b764fabe7b6c4cc541ccfc5f383a36 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 24 Sep 2024 22:51:33 +0000 Subject: [PATCH 13/58] updates --- .../bionemo-geneformer/src/bionemo/geneformer/run/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 669a0b3f04..7d47f45267 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,7 +18,6 @@ import nemo_run as run import pydantic -import torch from bionemo.geneformer.run.factories import ( DataConfig, @@ -124,9 +123,12 @@ def run_firsttime( if __name__ == "__main__": + run.cli.main(run_firsttime) + + """ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: 
run.cli.main(run_firsttime) else: - run.cli.main(run_firsttime) + """ # run.cli.main(simple_example) # run.cli.main(run_again) From 07e2963a109db463f03f6889c4ba93f570063183 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 25 Sep 2024 17:35:51 +0000 Subject: [PATCH 14/58] adding slurm run example --- scripts/singlecell/geneformer/slurm-run.py | 131 +++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 scripts/singlecell/geneformer/slurm-run.py diff --git a/scripts/singlecell/geneformer/slurm-run.py b/scripts/singlecell/geneformer/slurm-run.py new file mode 100644 index 0000000000..233315668a --- /dev/null +++ b/scripts/singlecell/geneformer/slurm-run.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nemo_run as run +from typing import Optional + +from bionemo.geneformer.run.factories import default_adam_optimizer_with_cosine_annealing_recipe, default_trainer_config, experiment_config_recipe, geneformer10M_pretraining_recipe, pretrain_partial, simple_parallel_recipe, small_data_config + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + identity: str, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." + ) + + mounts = [] + # Custom mounts are defined here. + if custom_mounts: + mounts.extend(custom_mounts) + + # Env vars for jobs are configured here + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + # This defines the slurm executor. + # We connect to the executor via the tunnel defined by user, host and remote_job_dir. + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, # This is where the results of the run will be stored by default. 
+ identity=identity + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + ) + + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + +def main(): + identity="/home/bionemo/.ssh/id_ed25519" + # OPTIONAL: Provide path to the private key that can be used to establish the SSH connection without entering your password. + DRACO="cs-oci-ord-login-03" + CUSTOM_MOUNTS = [ + "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/results/bionemo2_geneformer_pretraining/bionemo2_geneformer_pretraining:/results", + "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data:/workspaces/bionemo-fw-ea/data", + ] + executor = slurm_executor( + user='skothenhill', + identity=identity, + host=DRACO, + remote_job_dir='/home/skothenhill/20240924-bionemo2/nemorun', + account='healthcareeng_bionemo', + partition='polar', + nodes=1, + devices=8, + custom_mounts = CUSTOM_MOUNTS, + container_image="nvcr.io/nvidian/cvai_bnmo_trng/bionemo:bionemo2-758aaecc65031530751c095c727eac58ffd5188b", + ) + + model_config = geneformer10M_pretraining_recipe() + data_config = small_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data") + parallel_config=simple_parallel_recipe() + training_config = default_trainer_config() + optim_config=default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config=experiment_config_recipe() + data_config.seq_length=128 + data_config.micro_batch_size=8 + parallel_config.num_devices=8 + training_config.precision='bf16-mixed' + training_config.max_steps=1000 + recipe = pretrain_partial( + model_config=model_config, + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + optim_config=optim_config, + experiment_config=experiment_config, + resume_if_exists=False, + ) + # Submit a partial object + # There is a way to do this with explicit experiment management but idk how. + run.run(recipe, executor=executor, detach=True, dryrun=False) + +main() \ No newline at end of file From 3c5fba51b723111a1afcc044b2f5e789335804d9 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 2 Oct 2024 17:01:44 +0000 Subject: [PATCH 15/58] Intermediate convert that moves toward a pydantic based validation --- .../src/bionemo/core/utils/dtypes.py | 18 +- .../bionemo/geneformer/run/config_models.py | 748 ++++++++++++++++++ .../src/bionemo/llm/utils/logger_utils.py | 17 +- 3 files changed, 773 insertions(+), 10 deletions(-) create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py diff --git a/sub-packages/bionemo-core/src/bionemo/core/utils/dtypes.py b/sub-packages/bionemo-core/src/bionemo/core/utils/dtypes.py index 9520e62f3b..31f213633a 100644 --- a/sub-packages/bionemo-core/src/bionemo/core/utils/dtypes.py +++ b/sub-packages/bionemo-core/src/bionemo/core/utils/dtypes.py @@ -14,7 +14,7 @@ # limitations under the License. -from typing import Literal, Sequence +from typing import Dict, Literal, Sequence import torch @@ -24,7 +24,23 @@ "PrecisionTypes", ) + +# NOTE(SKH) our precision types are a mess, but we inherit this problem from NeMo and Megatron. 
PrecisionTypes = Literal["fp16", "bf16", "fp32", "bf16-mixed", "fp32-mixed", "16-mixed", "fp16-mixed", 16, 32] +precision_to_dtype: Dict[PrecisionTypes, torch.dtype] = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + "16-mixed": torch.float16, + "fp16-mixed": torch.float16, + "bf16-mixed": torch.bfloat16, + "fp32-mixed": torch.float32, + 16: torch.float16, + 32: torch.float32, +} + +# NOTE(SKH) these do not have a perfect 1-1 relationship, but we can use this to serialize/deserialize dtypes in ModelConfigs since its ultimately converted with precision_to_dtype. +dtype_to_precision: Dict[torch.dtype, PrecisionTypes] = {v: k for k, v in precision_to_dtype.items()} def get_autocast_dtype(precision: PrecisionTypes) -> torch.dtype: diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py new file mode 100644 index 0000000000..f2d65e6e95 --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -0,0 +1,748 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import json +import math +import pathlib +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar, Union + +import nemo_run as run +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from megatron.core.optimizer import OptimizerConfig +from nemo import lightning as nl +from nemo.collections import llm +from nemo.lightning import resume +from nemo.lightning.pytorch import callbacks as nl_callbacks +from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler +from nemo.utils import logging +from pydantic import BaseModel, Field, ValidationError, field_serializer, field_validator, model_validator +from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from tokenizers import Tokenizer + +from bionemo.core.utils import dtypes +from bionemo.core.utils.dtypes import PrecisionTypes +from bionemo.geneformer.api import GeneformerConfig +from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule +from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess +from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig +from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption +from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger + + +# If you'd like to register a custom activation function, you can add it to this dictionary to pass validation and allow serialization. +CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} + +# NOTE(SKH): DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. +for key in CUSTOM_ACTIVATION_FNS: + assert key not in dir(torch.nn.functional), f"Key {key} already exists in torch.nn.functional" + +# NOTE(SKH): it does not matter if values are duplicated as the key=>value mapping still does the right thing. Repeat values should be considered aliases. +REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { + v: k for k, v in CUSTOM_ACTIVATION_FNS.items() +} + + +ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) +DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) + + +class DataConfig(BaseModel, Generic[DataModuleT]): + """Base class for all data configurations. + + This class is used to define the interface for all data configurations. It is used to define the data module that + will be used in the training loop. + + !! note Children **MUST** include the field `data_config_type` to discriminate between available + data modules in the MasterConfig. Additionally, add the concrete type to the Union type annotation in MasterConfig. + """ + + micro_batch_size: int = 8 + results_dir: str = "./results" + + @abstractmethod + def construct_data_module(self, global_batch_size: int) -> DataModuleT: + """Construct the data module from the configuration. Cannot be defined generically.""" + ... + + +# TODO do we need this? 
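# Illustrative sketch of the contract spelled out in DataConfig's docstring; the names
# MyDataModule and "my_data_config" are hypothetical placeholders, so the example is
# left commented out rather than defining unused symbols.
#
# class MyDataConfig(DataConfig[MyDataModule]):
#     data_config_type: Literal["my_data_config"] = "my_data_config"
#     data_dir: str
#
#     def construct_data_module(self, global_batch_size: int) -> MyDataModule:
#         return MyDataModule(self.data_dir, self.micro_batch_size, global_batch_size)


# The dataclass below carries the artifacts produced by geneformer preprocessing.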
+@dataclass +class GeneformerDataArtifacts: + """Data artifacts produced by the geneformer preprocess.""" + + tokenizer: Tokenizer + median_dict: dict + + +class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): + """Configuration for the geneformer pre-training data module.""" + + # Shadow two attributes from the parent for visibility. + result_dir: str = "./results" + micro_batch_size: int = 8 + + data_config_type: Literal["geneformer_pretraining_data_config"] = "geneformer_pretraining_data_config" + data_dir: str + seq_length: int = 2048 + num_dataset_workers: int = 0 + + @property + def train_data_path(self) -> str: + return self.data_dir + "/train" + + @property + def val_data_path(self) -> str: + return self.data_dir + "/val" + + @property + def test_data_path(self) -> str: + return self.data_dir + "/test" + + def geneformer_preprocess(self) -> GeneformerDataArtifacts: + """Geneformer datamodule expects certain artifacts to be present in the data directory. + + This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. + """ + preprocessor = GeneformerPreprocess( + download_directory=pathlib.Path(self.train_data_path), + medians_file_path=pathlib.Path(self.train_data_path + "/medians.json"), + tokenizer_vocab_path=pathlib.Path(self.train_data_path + "/geneformer.vocab"), + ) + result = preprocessor.preprocess() + if "tokenizer" in result and "median_dict" in result: + logging.info("*************** Preprocessing Finished ************") + return GeneformerDataArtifacts(tokenizer=result["tokenizer"], median_dict=result["median_dict"]) + else: + logging.error("Preprocessing failed.") + raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") + + def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: + geneformer_data_artifacts: GeneformerDataArtifacts = geneformer_preprocess(self) + data = SingleCellDataModule( + seq_length=self.seq_length, + tokenizer=geneformer_data_artifacts.tokenizer, + train_dataset_path=self.train_data_path, + val_dataset_path=self.val_data_path, + test_dataset_path=self.test_data_path, + random_token_prob=0.02, + median_dict=geneformer_data_artifacts.median_dict, + micro_batch_size=self.micro_batch_size, + global_batch_size=global_batch_size, + persistent_workers=self.num_dataset_workers > 0, + pin_memory=False, + num_workers=self.num_dataset_workers, + ) + return data + + +def geneformer_small_data_recipe( + data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data", +) -> GeneformerPretrainingDataConfig: + """Recipe that produces the base geneformer small data configuration.""" + return GeneformerPretrainingDataConfig(data_dir=data_dir) + + +def full_geneformer_data_recipe( + data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data", +) -> GeneformerPretrainingDataConfig: + return GeneformerPretrainingDataConfig(data_dir=data_dir) + + +def geneformer_preprocess(data_config: GeneformerPretrainingDataConfig) -> GeneformerDataArtifacts: + """Geneformer datamodule expects certain artifacts to be present in the data directory. + + This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. 
+ """ + preprocessor = GeneformerPreprocess( + download_directory=pathlib.Path(data_config.train_data_path), + medians_file_path=pathlib.Path(data_config.train_data_path + "/medians.json"), + tokenizer_vocab_path=pathlib.Path(data_config.train_data_path + "/geneformer.vocab"), + ) + result = preprocessor.preprocess() + if "tokenizer" in result and "median_dict" in result: + logging.info("*************** Preprocessing Finished ************") + return GeneformerDataArtifacts(tokenizer=result["tokenizer"], median_dict=result["median_dict"]) + else: + logging.error("Preprocessing failed.") + raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") + + +class ParallelConfig(BaseModel): + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + accumulate_grad_batches: int = 1 + ddp: Literal["megatron"] = "megatron" + remove_unused_parameters: bool = True + num_devices: int = 1 + num_nodes: int = 1 + + @model_validator(mode="after") + def validate_devices(self): + # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel + if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: + raise ValidationError( + "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + ) + return self + + +def simple_parallel_recipe( + tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 +) -> ParallelConfig: + assert ( + num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + return ParallelConfig( + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + num_devices=num_devices, + ) + + +class TrainingConfig(BaseModel): + max_steps: int + limit_val_batches: int + val_check_interval: int + # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. + precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" + accelerator: str = "gpu" + + +def default_trainer_config_recipe() -> TrainingConfig: + return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) + + +def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ddp="megatron", + find_unused_parameters=True, + ckpt_include_optimizer=True, + ) + + trainer = nl.Trainer( + devices=parallel_config.num_devices, + max_steps=training_config.max_steps, + accelerator=training_config.accelerator, + strategy=strategy, + limit_val_batches=training_config.limit_val_batches, + val_check_interval=training_config.val_check_interval, + num_nodes=parallel_config.num_nodes, + callbacks=[ + RichModelSummary(max_depth=4), + LearningRateMonitor(), + ], + plugins=nl.MegatronMixedPrecision(precision=training_config.precision), + ) + return trainer + + +class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): + """BioNeMo model configuration class, wraps TransformerConfig and friends. + + This class is used to define the interface for all model configurations. It is **Exposed** to guard against ill-typed + or poorly defined fields in the underlying configuration objects. 
`ModelConfigT` declares the associated type of the + underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). + Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping + the more esoteric configuration private to the underlying ModelConfigT. + + + !! note Children **MUST** include the field to discriminate between available + bionemo_model_config_type: Literal["finetuning_seqlen_biobert"] = "finetuning_seqlen_biobert" # Immutable, declares how to discriminate between model types for pydantic + data modules in the MasterConfig. Additionally, add the concrete type to the Union type annotation in MasterConfig. + """ + + # Pydantic stuff to allow arbitrary types + validators + serializers + class Config: + arbitrary_types_allowed = True + + """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ + + @abstractmethod + def model_class(self) -> Type[ModelConfigT]: ... + + def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: + """Converts the exposed dataclass to the underlying Transformer config. + + The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to + hide fields that are either not serializable by Pydantic or that we do not want to expose. + + This is a good candidate for refactoring. + """ + + cls: Type[ModelConfigT] = self.model_class() + model_dict = {} + for attr in self.model_fields: + if attr not in model_dict and attr in cls.__dataclass_fields__: + model_dict[attr] = getattr(self, attr) + # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig + # the only constraint is that both must not be true. + model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] + model_dict["fp16"] = self.pipeline_dtype == dtypes.precision_to_dtype["16-mixed"] + result = cls(**model_dict) + + return result + + # NOTE: See PrecisionTypes for a list of valid literals that may be deserialized. 
+ params_dtype: torch.dtype + pipeline_dtype: torch.dtype + autocast_dtype: torch.dtype + + num_layers: int = 6 + hidden_size: int = 256 + ffn_hidden_size: int = 512 + num_attention_heads: int = 4 + seq_length: int = 512 + fp32_residual_connection: bool = False + hidden_dropout: float = 0.02 + init_method_std: float = 0.02 + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: bool = False + make_vocab_size_divisible_by: int = 128 + masked_softmax_fusion: bool = True + fp16_lm_cross_entropy: bool = False + gradient_accumulation_fusion: bool = False + layernorm_zero_centered_gamma: bool = False + layernorm_epsilon: float = 1.0e-12 + activation_func: Callable[[torch.Tensor, Any], torch.Tensor] = F.gelu + qk_layernorm: bool = False + apply_residual_connection_post_layernorm: bool = False + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + get_attention_mask_from_fusion: bool = False + attention_dropout: float = 0.1 + share_embeddings_and_output_weights: bool = True + enable_autocast: bool = False + nemo1_ckpt_path: Optional[str] = None + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec + + @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") + def serialize_dtypes(self, v: torch.dtype) -> PrecisionTypes: + return dtypes.dtype_to_precision[v] + + @field_validator("activation_func", mode="before") + @classmethod + def validate_activation_func(cls, activation_func: str) -> Callable: + """ + Validates the activation function, assumes this function exists in torch.nn.functional. For custom + activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. + + This method validates the provided activation function string and returns + a callable function based on the validation context using the provided validator in the base class. + Args: + activation_func (str): The activation function to be validated. + context (ValidationInfo): The context for validation. + Returns: + Callable: A callable function after validation. 
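        Example:
            Passing "gelu" resolves to torch.nn.functional.gelu. A name not found in
            torch.nn.functional falls back to CUSTOM_ACTIVATION_FNS; anything else raises
            a validation error.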
+ + See Also: + CUSTOM_ACTIVATION_FNS + """ + func = getattr(torch.nn.functional, activation_func.lower(), None) + if func is None and activation_func in CUSTOM_ACTIVATION_FNS: + func = CUSTOM_ACTIVATION_FNS[activation_func] + return func + elif func is None: + raise ValidationError( + f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" + ) + else: + return func + + @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") + @classmethod + def precision_validator(cls, v: PrecisionTypes) -> torch.dtype: + return dtypes.get_autocast_dtype(v) + + @field_serializer("activation_func") + def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: + func_name = v.__name__ + func = getattr(torch.nn.functional, func_name, None) + if func is not None: + return func_name + elif func in REVERSE_CUSTOM_ACTIVATION_FNS: + return REVERSE_CUSTOM_ACTIVATION_FNS[func] # Get the serialization key + else: + raise ValueError(f"Unsupported activation function: {v}") + + +class ExposedGeneformerConfig(ExposedModelConfig[GeneformerConfig]): + """There are no additional arguments for Geneformer, so we simply plugin the associated types and move on.""" + + bionemo_model_config_type: Literal["geneformer"] = ( + "geneformer" # Immutable, declares how to discriminate between model types for pydantic + ) + + def model_class(self) -> Type[GeneformerConfig]: + return GeneformerConfig + + +class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): + """Config for models that fine-tune a BioBERT model from a pre-trained checkpoint. + + Parameters: + initial_ckpt_path - path to a directory containing checkpoint files for initializing the model. This is only + required on the first execution of the model, any restored checkpoints should skip this step. + initial_ckpt_skip_keys_with_these_prefixes - skip any layer that contains this key during restoration. Useful + for ignoring extra additional layers used for finetuning. Layers with these keys are then randomly initialized. + """ + + # Used by discriminators + bionemo_model_config_type: Literal["finetuning_seqlen_biobert"] = ( + "finetuning_seqlen_biobert" # Immutable, declares how to discriminate between model types for pydantic + ) + + # Custom parameters for FineTuning + initial_ckpt_path: Optional[str] = None + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None + + def __post_init__(self): + if not self.initial_ckpt_skip_keys_with_these_prefixes: + self.initial_ckpt_skip_keys_with_these_prefixes = ["regression_head"] + + def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: + return FineTuneSeqLenBioBertConfig + + +def geneformer_finetuning_regression_head_recipe( + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, +) -> ExposedFineTuneSeqLenBioBertConfig: + # NOTE (SKH): this recipe is sad because it isnt smart enough to know our validator is returning a dtype. 
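    # Concretely: the params/pipeline/autocast dtype fields are annotated as torch.dtype,
    # but the "before"-mode precision_validator on ExposedModelConfig coerces PrecisionTypes
    # strings such as "bf16-mixed" into torch.bfloat16 at construction time, so passing
    # `precision` straight through is valid at runtime even though static type checkers flag it.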
+ finetuning_config = ExposedFineTuneSeqLenBioBertConfig( + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, + ) + return finetuning_config + + +def geneformer10M_pretraining_recipe( + seq_length: int = 128, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, +) -> ExposedGeneformerConfig: + geneformer_config = ExposedGeneformerConfig( + num_layers=6, + hidden_size=256, + ffn_hidden_size=512, + num_attention_heads=4, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config + + +class OptimizerSchedulerConfig(BaseModel): + # TODO could use validators on optimizer, interval, and monitor. 
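The TODO above notes that `optimizer`, `interval`, and `monitor` could carry validators. One way to do that is with `Literal` fields plus a cross-field check; the class below is a standalone sketch with an assumed set of allowed optimizer names, not the config shipped in this patch.

```python
from typing import Literal

from pydantic import BaseModel, model_validator


class ValidatedOptimizerSchedulerConfig(BaseModel):
    """Hypothetical variant of OptimizerSchedulerConfig with the suggested validators."""

    lr: float = 1e-4
    optimizer: Literal["adam", "adamw", "sgd"] = "adam"  # assumed allowed set
    interval: Literal["step", "epoch"] = "step"
    monitor: str = "val_loss"
    cosine_rampup_frac: float = 0.01
    cosine_hold_frac: float = 0.05

    @model_validator(mode="after")
    def _check_values(self) -> "ValidatedOptimizerSchedulerConfig":
        # Warmup plus hold must leave room for the annealing phase itself.
        if not 0.0 <= self.cosine_rampup_frac + self.cosine_hold_frac < 1.0:
            raise ValueError("cosine_rampup_frac + cosine_hold_frac must be in [0, 1).")
        if self.lr <= 0:
            raise ValueError("lr must be positive.")
        return self


ValidatedOptimizerSchedulerConfig(optimizer="adam")    # ok
# ValidatedOptimizerSchedulerConfig(optimizer="lion")  # would raise a ValidationError
```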
+ + lr: float = 1e-4 + optimizer: str = "adam" + cosine_rampup_frac: float = 0.01 + cosine_hold_frac: float = 0.05 + interval: str = "step" + monitor: str = "val_loss" + + +def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + return OptimizerSchedulerConfig() + + +def biobert_lightning_module( + bionemo_model_config: BioBertGenericConfig, + tokenizer: Tokenizer, + optim_config: OptimizerSchedulerConfig, + num_steps: int, +) -> BioBertLightningModule: + model = BioBertLightningModule( + bionemo_model_config, + tokenizer=tokenizer, + optimizer=MegatronOptimizerModule( + config=OptimizerConfig( + lr=optim_config.lr, + optimizer=optim_config.optimizer, + use_distributed_optimizer=True, + fp16=bionemo_model_config.fp16, + bf16=bionemo_model_config.bf16, + ), + lr_scheduler=CosineAnnealingScheduler( + max_steps=num_steps, + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), + ), + ), + ) + return model + + +class ExperimentConfig(BaseModel): + save_every_n_steps: int + result_dir: str + experiment_name: str + restore_from_checkpoint_path: Optional[str] + resume_if_exists: bool + wandb_config: Optional[WandbConfig] = None + save_last_checkpoint: bool = True + metric_to_monitor_for_checkpoints: str = "reduced_train_loss" + save_top_k: int = 2 + create_tensorboard_logger: bool = False + + +def experiment_config_recipe() -> ExperimentConfig: + return ExperimentConfig( + save_every_n_steps=100, + result_dir="./results", + experiment_name="default_experiment", + restore_from_checkpoint_path=None, + resume_if_exists=True, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints="reduced_train_loss", + save_top_k=2, + create_tensorboard_logger=False, + ) + + +def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + checkpoint_callback = nl_callbacks.ModelCheckpoint( + save_last=experiment_config.save_last_checkpoint, + monitor=experiment_config.metric_to_monitor_for_checkpoints, + save_top_k=experiment_config.save_top_k, + every_n_train_steps=experiment_config.save_every_n_steps, + always_save_context=True, + ) + + wandb_config: Optional[WandbConfig] = ( + None + if wandb_config is None + else WandbConfig( + offline=wandb_config.offline, + project=wandb_config.project, + entity=wandb_config.entity, + log_model=False, + ) + ) + + nemo_logger = setup_nemo_lightning_logger( + root_dir=experiment_config.result_dir, + name=experiment_config.experiment_name, + initialize_tensorboard_logger=experiment_config.create_tensorboard_logger, + wandb_config=wandb_config, + ckpt_callback=checkpoint_callback, + ) + return nemo_logger + + +@run.cli.entrypoint +def pretrain( + bionemo_exposed_model_config: ExposedModelConfig, + data_config: DataConfig[DataModuleT], + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + wandb_config: Optional[WandbConfig], + resume_if_exists: bool = True, +): + bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() + pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) + + if experiment_config.save_every_n_steps != training_config.val_check_interval: + logging.warning("Mutating training_config.save_every_n_steps to be equal to 
val_check_interval.") + experiment_config.save_every_n_steps = training_config.val_check_interval + + global_batch_size = infer_global_batch_size( + micro_batch_size=data_config.micro_batch_size, + num_nodes=parallel_config.num_nodes, + devices=parallel_config.num_devices, + accumulate_grad_batches=parallel_config.accumulate_grad_batches, + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ) + + data: SingleCellDataModule = data_config.construct_data_module(global_batch_size) + + model: BioBertLightningModule = biobert_lightning_module( + bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps + ) + trainer: nl.Trainer = setup_trainer(parallel_config, training_config) + nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config) + + llm.train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + resume=resume.AutoResume( + resume_if_exists=resume_if_exists, + resume_ignore_no_checkpoint=True, + ), + ) + + +class MasterConfig(BaseModel): + """Mulling ways to make this generic over data modules: + + 1) ABC in our DataModule that supports DataConfig -> DataModule + pros: + cons: + 2) Discriminated union on data_config, additionally needs a method that also takes this union and produces the correct data module. + 3) Pick one and highlight the other approach in either the SDD, PR, or both. + + """ + + data_config: Union[GeneformerPretrainingDataConfig] = Field(..., discriminator="data_config_type") + parallel_config: ParallelConfig + training_config: TrainingConfig + # TODO expand this for all other relevant models here. + bionemo_model_config: Union[ExposedGeneformerConfig, ExposedFineTuneSeqLenBioBertConfig] = Field( + ..., discriminator="bionemo_model_config_type" + ) + optim_config: OptimizerSchedulerConfig + experiment_config: ExperimentConfig + wandb_config: Optional[WandbConfig] = None + + @model_validator(mode="after") + def validate_master_config(self) -> "MasterConfig": + self.bionemo_model_config.seq_length = self.data_config.seq_length + # What other global validators should we set here? 
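The comment above asks which other global validators belong on the master config. Two purely illustrative candidates are sketched below on stripped-down stand-in classes (`TinyDataConfig` and friends are hypothetical, not the configs defined in this patch): keeping the checkpoint cadence aligned with validation, and rejecting a cadence longer than the run.

```python
from pydantic import BaseModel, Field, model_validator


class TinyDataConfig(BaseModel):
    seq_length: int = 2048
    micro_batch_size: int = 8


class TinyTrainingConfig(BaseModel):
    val_check_interval: int = 100
    max_steps: int = 55000


class TinyExperimentConfig(BaseModel):
    save_every_n_steps: int = 100


class TinyMasterConfig(BaseModel):
    data_config: TinyDataConfig = Field(default_factory=TinyDataConfig)
    training_config: TinyTrainingConfig = Field(default_factory=TinyTrainingConfig)
    experiment_config: TinyExperimentConfig = Field(default_factory=TinyExperimentConfig)

    @model_validator(mode="after")
    def _cross_checks(self) -> "TinyMasterConfig":
        # Mirrors the warning emitted in pretrain(): checkpointing follows validation.
        if self.experiment_config.save_every_n_steps != self.training_config.val_check_interval:
            raise ValueError("save_every_n_steps should equal val_check_interval.")
        # A checkpoint cadence longer than the run would never save anything.
        if self.experiment_config.save_every_n_steps > self.training_config.max_steps:
            raise ValueError("save_every_n_steps must not exceed max_steps.")
        return self


TinyMasterConfig()  # passes with the defaults above
```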
+ return self + + +def recipes_to_config_json(model_cfg_type="geneformer"): + """Simple example for creating a JSON from recipes.""" + + data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe() + parallel_config = simple_parallel_recipe() + training_config = default_trainer_config_recipe() + if model_cfg_type == "geneformer": + bionemo_model_config = geneformer10M_pretraining_recipe() + else: + bionemo_model_config = geneformer_finetuning_regression_head_recipe() + + optim_config = default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config = experiment_config_recipe() + wandb_config = WandbConfig(project="bionemo2-demo", entity="nvidia", offline=True) + + # Create the master config + master_config = MasterConfig( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, + ) + + # Serialize to JSON + json_str = master_config.model_dump_json(indent=2) + + # Save to file + with open( + "/workspaces/bionemo-fw-ea/sub-packages/bionemo-geneformer/src/bionemo/geneformer/conf/default-geneformer-config.json", + "w", + ) as f: + f.write(json_str) + + print("Configuration saved to config.json") + + +if __name__ == "__main__": + recipes_to_config_json("geneformer") + # recipes_to_config_json('finetune') + + def parse_args(): + parser = argparse.ArgumentParser(description="Run Geneformer pretraining") + parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") + return parser.parse_args() + + def load_config(config_path: str) -> MasterConfig: + with open(config_path, "r") as f: + config_dict = json.load(f) + return MasterConfig(**config_dict) + + args = parse_args() + config = load_config(args.config) + + pretrain( + bionemo_exposed_model_config=config.bionemo_model_config, + data_config=config.data_config, + parallel_config=config.parallel_config, + training_config=config.training_config, + optim_config=config.optim_config, + experiment_config=config.experiment_config, + wandb_config=config.wandb_config, + resume_if_exists=False, + ) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py index b6b1e4079f..2f9efdf9c7 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py @@ -13,21 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. import pathlib -from typing import Any, Dict, Optional, Sequence, TypedDict +from typing import Any, Dict, Optional, Sequence from nemo.lightning.nemo_logger import NeMoLogger from nemo.lightning.pytorch import callbacks as nemo_callbacks from nemo.utils import logging +from pydantic import BaseModel from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger __all__: Sequence[str] = ( - "WandbLoggerOptions", + "WandbConfig", "setup_nemo_lightning_logger", ) -class WandbLoggerOptions(TypedDict): +class WandbConfig(BaseModel): """Note: `name` controls the exp name is handled by the NeMoLogger so it is ommitted here. `directory` is also omitted since it is set by the NeMoLogger. 
""" # noqa: D205 @@ -35,16 +36,14 @@ class WandbLoggerOptions(TypedDict): offline: bool # offline mode project: str # project name entity: str # group name or user name - # name: str # experiment name, this is handled by NeMoLogger - # the directory is also set by NeMoLogger - log_model: bool # log model + log_model: bool = False # log model def setup_nemo_lightning_logger( name: str = "default-name", root_dir: str | pathlib.Path = "./results", initialize_tensorboard_logger: bool = False, - wandb_kwargs: Optional[WandbLoggerOptions] = None, + wandb_config: Optional[WandbConfig] = None, ckpt_callback: Optional[nemo_callbacks.ModelCheckpoint] = None, **kwargs: Dict[str, Any], ) -> NeMoLogger: @@ -64,8 +63,8 @@ def setup_nemo_lightning_logger( """ # The directory that the logger will save to save_dir = pathlib.Path(root_dir) / name - if wandb_kwargs is not None: - wandb_logger = WandbLogger(save_dir=save_dir, name=name, **wandb_kwargs) + if wandb_config is not None: + wandb_logger = WandbLogger(save_dir=save_dir, name=name, **wandb_config.model_dump()) else: wandb_logger = None logging.warning("WandB is currently turned off.") From f61911b00e36cfabb2b5f5c55787c73b6bddcf61 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 2 Oct 2024 17:12:00 +0000 Subject: [PATCH 16/58] cleansup modified files --- .gitmodules | 3 - Dockerfile | 167 +++-- scripts/singlecell/geneformer/slurm-run.py | 131 ---- .../src/bionemo/geneformer/run/README.md | 28 - .../src/bionemo/geneformer/run/factories.py | 633 ------------------ .../src/bionemo/geneformer/run/main.py | 134 ---- 6 files changed, 97 insertions(+), 999 deletions(-) delete mode 100644 scripts/singlecell/geneformer/slurm-run.py delete mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md delete mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py delete mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py diff --git a/.gitmodules b/.gitmodules index 035642e72d..0b6458ab20 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,6 +4,3 @@ [submodule "3rdparty/NeMo"] path = 3rdparty/NeMo url = https://github.com/NVIDIA/NeMo.git -[submodule "3rdparty/NeMo-Run"] - path = 3rdparty/NeMo-Run - url = git@github.com:NVIDIA/NeMo-Run.git diff --git a/Dockerfile b/Dockerfile index 4aae8c336d..f8ceffa650 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,46 +48,10 @@ RUN pip --disable-pip-version-check --no-cache-dir install \ git+https://github.com/state-spaces/mamba.git@v2.0.3 RUN pip install hatchling # needed to install nemo-run -ARG NEMO_RUN_TAG=8701f5f2c6c3a4a72bd2a435c872d7dcc4560527 -RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_TAG} - -FROM bionemo2-base AS pip-requirements - -# Copy and install pypi depedencies. -RUN mkdir /tmp/pip-tmp -WORKDIR /tmp/pip-tmp - -COPY requirements-dev.txt requirements-test.txt requirements-cve.txt /tmp/pip-tmp/ - -# We want to only copy the requirements.txt, setup.py, and pyproject.toml files for *ALL* sub-packages -# but we **can't** do COPY sub-packages/**/{requirements.txt,...} / because this will overwrite! -# So....we copy everything into a temporary image and remove everything else! -# Later, we can copy the result from the temporary image and get what we want -# **WITHOUT** invalidating the cache for successive layers! 
-COPY sub-packages/ /tmp/pip-tmp/sub-packages -# remove all directories that aren't the top-level sub-packages/bionemo-{xyz} -RUN find sub-packages/ -type d | grep "bionemo-[a-zA-Z0-9\-]*/" | xargs rm -rf && \ - # only keep the requirements-related files - find sub-packages/ -type f | grep -v -E "requirements.txt|pyproject.toml|setup.py" | xargs rm - -FROM bionemo2-base AS dev +ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} RUN mkdir -p /workspace/bionemo2/ -WORKDIR /workspace/bionemo2 - -# We get the sub-packcages/ top-level structure + requirements.txt files -COPY --from=pip-requirements /tmp/pip-tmp/ /workspace/bionemo2/ - -RUN pip install -r requirements-dev.txt -r requirements-test.txt -r requirements-cve.txt - -# We calculate paths to each requirements.txt file and dynamically construct the pip install command. -# This command will expand to something like: -# pip install --disable-pip-version-check --no-cache-dir \ -# -r bionemo-core/requirements.txt \ -# -r bionemo-pytorch/requirements.txt \ -# -r bionemo-lmm/requirements.txt \ -# (etc.) -RUN X=""; for sub in $(echo sub-packages/bionemo-*); do X="-r ${sub}/requirements.txt ${X}"; done; eval "pip install --disable-pip-version-check --no-cache-dir ${X}" # Delete the temporary /build directory. WORKDIR /workspace @@ -96,16 +60,66 @@ RUN rm -rf /build # Addressing Security Scan Vulnerabilities RUN rm -rf /opt/pytorch/pytorch/third_party/onnx RUN apt-get update && \ - apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ - rm -rf /var/lib/apt/lists/* + apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \ + rm -rf /var/lib/apt/lists/* RUN apt purge -y libslurm37 libpmi2-0 && \ - apt autoremove -y + apt autoremove -y RUN source /usr/local/nvm/nvm.sh && \ - NODE_VER=$(nvm current) && \ - nvm deactivate && \ - nvm uninstall $NODE_VER && \ - sed -i "/NVM/d" /root/.bashrc && \ - sed -i "/nvm.sh/d" /etc/bash.bashrc + NODE_VER=$(nvm current) && \ + nvm deactivate && \ + nvm uninstall $NODE_VER && \ + sed -i "/NVM/d" /root/.bashrc && \ + sed -i "/nvm.sh/d" /etc/bash.bashrc + +# Use UV to install python packages from the workspace. This just installs packages into the system's python +# environment, and does not use the current uv.lock file. +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv +ENV UV_LINK_MODE=copy \ + UV_COMPILE_BYTECODE=1 \ + UV_PYTHON_DOWNLOADS=never \ + UV_SYSTEM_PYTHON=true + +# Install the bionemo-geomtric requirements ahead of copying over the rest of the repo, so that we can cache their +# installation. These involve building some torch extensions, so they can take a while to install. +RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \ + --mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \ + uv pip install --no-build-isolation -r /requirements-pyg.txt + +WORKDIR /workspace/bionemo2 + +# Install 3rd-party deps and bionemo submodules. +COPY ./3rdparty /workspace/bionemo2/3rdparty +COPY ./sub-packages /workspace/bionemo2/sub-packages + +# Note, we need to mount the .git folder here so that setuptools-scm is able to fetch git tag for version. 
+RUN --mount=type=bind,source=./.git,target=./.git \ + --mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \ + --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \ + --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \ + < /etc/sudoers.d/$USERNAME \ && chmod 0440 /etc/sudoers.d/$USERNAME -RUN find /usr/local/lib/python3.10/dist-packages/ -type f -print0 | xargs -0 -P 0 -n 10000 chown $USERNAME:$USER_GID - -ENV PATH="/home/bionemo/.local/bin:${PATH}" - - -# Create a release image with bionemo2 installed. -FROM dev AS release - -# Install 3rd-party deps -COPY ./3rdparty /build -WORKDIR /build/Megatron-LM -RUN pip install --disable-pip-version-check --no-cache-dir . +# Here we delete the dist-packages directory from the pytorch base image, and copy over the dist-packages directory from +# the build image. This ensures we have all the necessary dependencies installed (megatron, nemo, etc.). +RUN < run.SlurmExecutor: - if not (user and host and remote_job_dir and account and partition and nodes and devices): - raise RuntimeError( - "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." - ) - - mounts = [] - # Custom mounts are defined here. - if custom_mounts: - mounts.extend(custom_mounts) - - # Env vars for jobs are configured here - env_vars = { - "TRANSFORMERS_OFFLINE": "1", - "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", - "NCCL_NVLS_ENABLE": "0", - "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", - "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", - } - if custom_env_vars: - env_vars |= custom_env_vars - - # This defines the slurm executor. - # We connect to the executor via the tunnel defined by user, host and remote_job_dir. - executor = run.SlurmExecutor( - account=account, - partition=partition, - tunnel=run.SSHTunnel( - user=user, - host=host, - job_dir=remote_job_dir, # This is where the results of the run will be stored by default. - identity=identity - ), - nodes=nodes, - ntasks_per_node=devices, - gpus_per_node=devices, - mem="0", - exclusive=True, - gres="gpu:8", - ) - - executor.container_image = container_image - executor.container_mounts = mounts - executor.env_vars = env_vars - executor.retries = retries - executor.time = time - - return executor - -def main(): - identity="/home/bionemo/.ssh/id_ed25519" - # OPTIONAL: Provide path to the private key that can be used to establish the SSH connection without entering your password. 
- DRACO="cs-oci-ord-login-03" - CUSTOM_MOUNTS = [ - "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/results/bionemo2_geneformer_pretraining/bionemo2_geneformer_pretraining:/results", - "/lustre/fsw/portfolios/healthcareeng/projects/healthcareeng_bionemo/data:/workspaces/bionemo-fw-ea/data", - ] - executor = slurm_executor( - user='skothenhill', - identity=identity, - host=DRACO, - remote_job_dir='/home/skothenhill/20240924-bionemo2/nemorun', - account='healthcareeng_bionemo', - partition='polar', - nodes=1, - devices=8, - custom_mounts = CUSTOM_MOUNTS, - container_image="nvcr.io/nvidian/cvai_bnmo_trng/bionemo:bionemo2-758aaecc65031530751c095c727eac58ffd5188b", - ) - - model_config = geneformer10M_pretraining_recipe() - data_config = small_data_config(data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data") - parallel_config=simple_parallel_recipe() - training_config = default_trainer_config() - optim_config=default_adam_optimizer_with_cosine_annealing_recipe() - experiment_config=experiment_config_recipe() - data_config.seq_length=128 - data_config.micro_batch_size=8 - parallel_config.num_devices=8 - training_config.precision='bf16-mixed' - training_config.max_steps=1000 - recipe = pretrain_partial( - model_config=model_config, - data_config=data_config, - parallel_config=parallel_config, - training_config=training_config, - optim_config=optim_config, - experiment_config=experiment_config, - resume_if_exists=False, - ) - # Submit a partial object - # There is a way to do this with explicit experiment management but idk how. - run.run(recipe, executor=executor, detach=True, dryrun=False) - -main() \ No newline at end of file diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md deleted file mode 100644 index 427b78a85c..0000000000 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# NeMo-Run entrypoint - -`main.py` acts as a simple entrypoint to pretraining geneformer via use of configs and factories (`factories.py`). The command below will execute the equivalent of what we have under scripts/pretrain.py - -This module is a work in progress. - -``` bash -python sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py \ - geneformer_config=basic_geneformer_config_recipe \ - data_config=small_data_config \ - parallel_config=simple_parallel_recipe \ - training_config=default_trainer_config \ - optim_config=default_adam_optimizer_with_cosine_annealing_recipe \ - experiment_config=experiment_config_recipe \ - resume_if_exists=False \ - data_config.seq_length=128 \ - parallel_config.num_devices=1 \ - data_config.micro_batch_size=2 \ - training_config.precision=bf16-mixed -``` - -## Concepts and things to keep in mind - -Plain Function - A function that does literally anything and produces something else. In somecases, we have functions that take configs and produce an object. In these scenarios we are often composing an object with pieces of various configs. -Factory - A method that constructs a config and is decorated with run.cli.factory. These act as configs presentable to the command line. -Recipe - A specific factory with a distinct purpose. E.g. BERT XL vs BERT Small -Config - A fiddle dataclass presentable and mutatable via nemo run. These are also serialized and used for restoring previous configuations. -Entrypoint - A method that takes a mixture of plain arguments and configs. 
These are exposed to the command line. The body of the function represents the execution occuring in the program. diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py deleted file mode 100644 index a866cb5bdc..0000000000 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/factories.py +++ /dev/null @@ -1,633 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import pathlib -from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass -from typing import Callable, Generic, List, Literal, Optional, Type, TypeVar - -import nemo_run as run -from megatron.core.optimizer import OptimizerConfig -from nemo import lightning as nl -from nemo.collections import llm -from nemo.lightning import resume -from nemo.lightning.pytorch import callbacks as nl_callbacks -from nemo.lightning.pytorch.optim import MegatronOptimizerModule -from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler -from nemo.utils import logging -from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from tokenizers import Tokenizer -from torch.nn import functional as F - -from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype -from bionemo.geneformer.api import GeneformerConfig -from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule -from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess -from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig -from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption -from bionemo.llm.utils.datamodule_utils import infer_global_batch_size -from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger - - -run.Config - - -@dataclass -class DataConfig: - data_dir: str - result_dir: str = "./results" - seq_length: int = 2048 - num_dataset_workers: int = 0 - micro_batch_size: int = 8 - - @property - def train_data_path(self) -> str: - return self.data_dir + "/train" - - @property - def val_data_path(self) -> str: - return self.data_dir + "/val" - - @property - def test_data_path(self) -> str: - return self.data_dir + "/test" - - -@run.cli.factory -@run.autoconvert -def small_data_config( - data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data", -) -> DataConfig: - # NOTE theoretically we could validate that this stuff exists. - return DataConfig(data_dir=data_dir) - - -@run.cli.factory -@run.autoconvert -def full_geneformer_data_config( - data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data", -) -> DataConfig: - # NOTE theoretically we could validate that this stuff exists. 
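The README above distinguishes plain functions, factories, recipes, configs, and entrypoints. The following compact sketch shows how those pieces fit together with NeMo-Run, using a made-up `ToyDataConfig` rather than the real geneformer configs; the decorators are used exactly as in the factories in this patch, and the CLI invocation in the trailing comment is only an assumed example.

```python
from dataclasses import dataclass

import nemo_run as run


@dataclass
class ToyDataConfig:
    """Hypothetical config: a 'Config' in the README's terminology."""

    data_dir: str
    seq_length: int = 2048
    micro_batch_size: int = 8


@run.cli.factory
@run.autoconvert
def toy_small_data_recipe(data_dir: str = "/data/toy_small") -> ToyDataConfig:
    """A 'Recipe': a factory with a specific purpose and sensible defaults."""
    return ToyDataConfig(data_dir=data_dir, seq_length=128, micro_batch_size=2)


@run.cli.entrypoint
def toy_train(data_config: ToyDataConfig, max_steps: int = 100):
    """An 'Entrypoint': mixes configs and plain arguments; the body is the actual execution."""
    print(f"Training for {max_steps} steps on {data_config.data_dir} (seq_len={data_config.seq_length})")


if __name__ == "__main__":
    # CLI usage would mirror the README above, e.g.:
    #   python toy_main.py data_config=toy_small_data_recipe data_config.seq_length=256 max_steps=10
    run.cli.main(toy_train)
```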
- return DataConfig(data_dir=data_dir) - - -@dataclass -class GeneformerDataArtifacts: - tokenizer: Tokenizer # TODO(SKH) typing isnt right - median_dict: dict - - -def geneformer_preprocess_recipe(data_config: DataConfig) -> GeneformerDataArtifacts: - preprocessor = GeneformerPreprocess( - download_directory=pathlib.Path(data_config.train_data_path), - medians_file_path=pathlib.Path(data_config.train_data_path + "/medians.json"), - tokenizer_vocab_path=pathlib.Path(data_config.train_data_path + "/geneformer.vocab"), - ) - match preprocessor.preprocess(): - case {"tokenizer": tokenizer, "median_dict": median_dict}: - logging.info("*************** Preprocessing Finished ************") - case _: - logging.error("Preprocessing failed.") - raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") - return GeneformerDataArtifacts(tokenizer=tokenizer, median_dict=median_dict) - - -def singlecell_data_module(data_config: DataConfig, global_batch_size: int) -> SingleCellDataModule: - geneformer_data_artifacts: GeneformerDataArtifacts = geneformer_preprocess_recipe(data_config) - data = SingleCellDataModule( - seq_length=data_config.seq_length, - tokenizer=geneformer_data_artifacts.tokenizer, - train_dataset_path=data_config.train_data_path, - val_dataset_path=data_config.val_data_path, - test_dataset_path=data_config.test_data_path, - random_token_prob=0.02, # changed to represent the incorrect setting we originally used. - median_dict=geneformer_data_artifacts.median_dict, - micro_batch_size=data_config.micro_batch_size, - global_batch_size=global_batch_size, - # persistent workers is supported when num_dataset_workers > 0 - persistent_workers=data_config.num_dataset_workers > 0, - pin_memory=False, - num_workers=data_config.num_dataset_workers, - ) - return data - - -@dataclass -class ParallelConfig: - tensor_model_parallel_size: int = 1 - pipeline_model_parallel_size: int = 1 - accumulate_grad_batches: int = 1 - ddp: Literal["megatron"] = "megatron" - remove_unused_parameters: bool = True - num_devices: int = 1 - num_nodes: int = 1 - - -@run.cli.factory -@run.autoconvert -def simple_parallel_recipe( - tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 -) -> ParallelConfig: - # TODO validatorssssssss, make sure we get everythign right here. - assert ( - num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size - ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - return ParallelConfig( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - num_devices=num_devices, - ) - - -@dataclass -class TrainingConfig: - max_steps: int - limit_val_batches: int - val_check_interval: int - precision: PrecisionTypes = "bf16-mixed" - accelerator: str = "gpu" - - -@run.cli.factory -@run.autoconvert -def default_trainer_config() -> TrainingConfig: - return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) - - -def setup_trainer_from_configs(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: - # Because this returns a trainer, and trainer is not an argument to the entrypoint, this is not a factory. 
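The data module above is constructed from a `global_batch_size` that `pretrain` derives from the parallel config via `infer_global_batch_size`. The helper below sketches the typical Megatron-style arithmetic behind that number; it is an illustration under that assumption, and the real `infer_global_batch_size` may differ in details.

```python
def sketch_global_batch_size(
    micro_batch_size: int,
    num_nodes: int,
    devices: int,
    accumulate_grad_batches: int,
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
) -> int:
    """Typical Megatron-style arithmetic (a sketch, not the BioNeMo helper itself)."""
    world_size = num_nodes * devices
    model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size
    assert world_size % model_parallel_size == 0, "world size must be divisible by TP * PP"
    data_parallel_size = world_size // model_parallel_size
    return micro_batch_size * data_parallel_size * accumulate_grad_batches


# 1 node x 8 GPUs, micro batch 8, no gradient accumulation, TP=PP=1 -> global batch 64.
assert sketch_global_batch_size(8, 1, 8, 1, 1, 1) == 64
```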
- strategy = nl.MegatronStrategy( - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - ddp="megatron", - find_unused_parameters=True, - ckpt_include_optimizer=True, - ) - - trainer = nl.Trainer( - devices=parallel_config.num_devices, - max_steps=training_config.max_steps, - accelerator=training_config.accelerator, - strategy=strategy, - limit_val_batches=training_config.limit_val_batches, # This controls upsampling and downsampling - val_check_interval=training_config.val_check_interval, # TODO(@jstjohn) Checkpoint saving is currently broken, fix and change this. - num_nodes=parallel_config.num_nodes, - callbacks=[ - RichModelSummary(max_depth=4), - LearningRateMonitor(), - ], - plugins=nl.MegatronMixedPrecision(precision=training_config.precision), - ) - return trainer - - -ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) - - -@dataclass -class ExposedModelConfig(Generic[ModelConfigT], ABC): - """ExposedConfigs are meant to be used as a way to expose a subset of the underlying model config. - - Due to the fact that some fields in the underlying TransformerConfig are not serializable, it must be wrapped. - We tie each concrete ExposedModelConfig to a specific ModelConfigT, which is a subclass of BioBertGenericConfig. - Then, we expect implementors to implement a method using the same type called `model_class`, this returns the literal - type ModelConfigT. - - exposed_to_internal_model_config is then a universal method that unpacks the exposed config and returns the underlying model config. - - Users are expected to choose a recipe that returns the ExposedModelConfig of interest and parameterize it accordingly. - Developers should carefully create recipes and factories that reflect common usescases, and these will be specified on the CLI. - """ - - @abstractmethod - def model_class(self) -> Type[ModelConfigT]: ... - - def exposed_to_internal_model_config(self) -> ModelConfigT: - # This is bad because it doesnt actually leverage any generics - cls: Type[ModelConfigT] = self.model_class() - return cls(**asdict(self)) - - -@dataclass -class ExposedGeneformerConfig(ExposedModelConfig[GeneformerConfig]): - """NeMo run does not like GeneformerConfig due to use its use of lambdas. - - So I basicaly need a method that does This -> GeneformerConfig - then use regular recipes/factories on the parent and do this transform at the last step. 
- """ - - params_dtype: PrecisionTypes - pipeline_dtype: PrecisionTypes - autocast_dtype: PrecisionTypes - num_layers: int = 6 - hidden_size: int = 256 - ffn_hidden_size: int = 512 - num_attention_heads: int = 4 - seq_length: int = 512 - fp32_residual_connection: bool = False - hidden_dropout: float = 0.02 - init_method_std: float = 0.02 - kv_channels: Optional[int] = None - apply_query_key_layer_scaling: bool = False - make_vocab_size_divisible_by: int = 128 - masked_softmax_fusion: bool = True - fp16_lm_cross_entropy: bool = False - gradient_accumulation_fusion: bool = False - layernorm_zero_centered_gamma: bool = False - layernorm_epsilon: float = 1.0e-12 - activation_func: Callable = F.gelu - qk_layernorm: bool = False - apply_residual_connection_post_layernorm: bool = False - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - get_attention_mask_from_fusion: bool = False - attention_dropout: float = 0.1 - share_embeddings_and_output_weights: bool = True - enable_autocast: bool = False - nemo1_ckpt_path: Optional[str] = None - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value - nemo1_ckpt_path: Optional[str] = None - # NOTE: handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities - initial_ckpt_path: Optional[str] = None - - def model_class(self) -> Type[GeneformerConfig]: - return GeneformerConfig - - -@dataclass -class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): - """NOTE could use inheritence here, but the typing gets really weird and we'd rather have no red squiggles.""" - - params_dtype: PrecisionTypes - pipeline_dtype: PrecisionTypes - autocast_dtype: PrecisionTypes - num_layers: int = 6 - hidden_size: int = 256 - ffn_hidden_size: int = 512 - num_attention_heads: int = 4 - seq_length: int = 512 - fp32_residual_connection: bool = False - hidden_dropout: float = 0.02 - init_method_std: float = 0.02 - kv_channels: Optional[int] = None - apply_query_key_layer_scaling: bool = False - make_vocab_size_divisible_by: int = 128 - masked_softmax_fusion: bool = True - fp16_lm_cross_entropy: bool = False - gradient_accumulation_fusion: bool = False - layernorm_zero_centered_gamma: bool = False - layernorm_epsilon: float = 1.0e-12 - activation_func: Callable = F.gelu - qk_layernorm: bool = False - apply_residual_connection_post_layernorm: bool = False - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - get_attention_mask_from_fusion: bool = False - attention_dropout: float = 0.1 - share_embeddings_and_output_weights: bool = True - enable_autocast: bool = False - nemo1_ckpt_path: Optional[str] = None - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value - nemo1_ckpt_path: Optional[str] = None - # NOTE: handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities - initial_ckpt_path: Optional[str] = None - # NOTE only new attribute between this config and the geneformer config. 
- initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None - - def __post_init__(self): - if not self.initial_ckpt_skip_keys_with_these_prefixes: - self.initial_ckpt_skip_keys_with_these_prefixes = ["regression_head"] - - def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: - return FineTuneSeqLenBioBertConfig - - -@run.cli.factory -@run.autoconvert -def geneformer_finetuning_regression_head_recipe( - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, -) -> ExposedModelConfig[FineTuneSeqLenBioBertConfig]: - finetuning_config = ExposedFineTuneSeqLenBioBertConfig( - params_dtype=get_autocast_dtype(precision), - pipeline_dtype=get_autocast_dtype(precision), - autocast_dtype=get_autocast_dtype(precision), # setting this speeds things up a lot - nemo1_ckpt_path=nemo1_init_path, - initial_ckpt_path=initial_ckpt_path, - initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, - ) - return finetuning_config - - -# TODO(SKH) rename this recipe to something more understandable. -@run.cli.factory -@run.autoconvert -def geneformer10M_pretraining_recipe( - seq_length: int = 128, - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec.value, -) -> ExposedModelConfig[GeneformerConfig]: - """Sets up the base GeneformerConfig. Recipes on geneformer configs should choose what to expose and come with sensible defaults.""" - geneformer_config = ExposedGeneformerConfig( - num_layers=6, - hidden_size=256, - ffn_hidden_size=512, - num_attention_heads=4, - seq_length=seq_length, - fp32_residual_connection=False, # TODO(@jstjohn) check this - hidden_dropout=0.02, - init_method_std=0.02, - kv_channels=None, - apply_query_key_layer_scaling=False, - make_vocab_size_divisible_by=128, - masked_softmax_fusion=True, # TODO(@jstjohn) check this - fp16_lm_cross_entropy=False, - params_dtype=get_autocast_dtype(precision), - pipeline_dtype=get_autocast_dtype(precision), - autocast_dtype=get_autocast_dtype(precision), # setting this speeds things up a lot - gradient_accumulation_fusion=False, # THIS BREAKS STUFF, leave False - layernorm_zero_centered_gamma=False, # TODO(@jstjohn) check this - layernorm_epsilon=1.0e-12, - activation_func=F.gelu, # TODO(@jstjohn) check this - qk_layernorm=False, # TODO(@jstjohn) check this - apply_residual_connection_post_layernorm=False, # False is new default, True was BERT pub. 
- bias_activation_fusion=True, # TODO(@jstjohn) check this - bias_dropout_fusion=True, # TODO(@jstjohn) check this - get_attention_mask_from_fusion=False, - attention_dropout=0.1, - share_embeddings_and_output_weights=True, - enable_autocast=False, # This has to be set to True if we use the mixed precision plugin - biobert_spec_option=biobert_spec_option, - nemo1_ckpt_path=nemo1_init_path, - initial_ckpt_path=initial_ckpt_path, - ) - return geneformer_config - - -@dataclass -class OptimizerSchedulerConfig: - lr: float = 1e-4 - optimizer: str = "adam" # TODO Literal - cosine_rampup_frac: float = 0.01 - cosine_hold_frac: float = 0.05 - interval: str = "step" # TODO Literal - monitor: str = "val_loss" - - -@run.cli.factory -@run.autoconvert -def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - """Prefers the default parameters for the Optimizer and Scheduler.""" - return OptimizerSchedulerConfig() - - -@run.cli.factory -@run.autoconvert -def exposed_optimizer_recipe( - lr: float, optimizer: str, cosine_rampup_frac: float, cosine_hold_frac: float, interval: str, monitor: str -) -> OptimizerSchedulerConfig: - """This recipe exposes all parameters to the underlying OptimizerSchedulerConfig.""" - return OptimizerSchedulerConfig( - lr=lr, - optimizer=optimizer, - cosine_rampup_frac=cosine_rampup_frac, - cosine_hold_frac=cosine_hold_frac, - interval=interval, - monitor=monitor, - ) - - -@run.cli.factory -@run.autoconvert -def optimizer_recipe_with_kwarg_defaults( - lr: float = 1e-4, - optimizer: str = "adam", - cosine_rampup_frac: float = 0.01, - cosine_hold_frac: float = 0.05, - interval: str = "step", - monitor: str = "val_loss", -) -> OptimizerSchedulerConfig: - """This recipe exposes all parameters to the underlying OptimizerSchedulerConfig and provides defaults as kwargs.""" - return OptimizerSchedulerConfig( - lr=lr, - optimizer=optimizer, - cosine_rampup_frac=cosine_rampup_frac, - cosine_hold_frac=cosine_hold_frac, - interval=interval, - monitor=monitor, - ) - - -def biobert_lightning_module( - model_config: BioBertGenericConfig, tokenizer: Tokenizer, optim_config: OptimizerSchedulerConfig, num_steps: int -) -> BioBertLightningModule: - """Function that constructs a lightning module from the requisite configs. - - tokenizer: Tokenizer - must be the same tokenizer used by the DataModule. - num_steps: int - must match the number of steps in the DataConfig. - """ - model = BioBertLightningModule( - model_config, - tokenizer=tokenizer, - optimizer=MegatronOptimizerModule( - config=OptimizerConfig( - lr=optim_config.lr, - optimizer=optim_config.optimizer, - use_distributed_optimizer=True, - # Pass through fp16/bf16 settings to avoid errors around model having bf16 enabled but optimizer not. - # implies these configs must be coupled. 
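For concreteness, the scheduler wiring above turns the two cosine fractions into absolute step counts and derives the floor learning rate. The arithmetic below works that through with the default recipe values (`lr=1e-4`, `max_steps=55000`); it is an illustration only, since the scheduler itself comes from NeMo.

```python
import math

# Default recipe values from OptimizerSchedulerConfig / TrainingConfig above.
lr = 1e-4
num_steps = 55_000
cosine_rampup_frac = 0.01
cosine_hold_frac = 0.05

warmup_steps = int(math.ceil(num_steps * cosine_rampup_frac))  # 550 warmup steps
constant_steps = int(math.ceil(num_steps * cosine_hold_frac))  # 2750 steps held at peak lr
min_lr = lr / 100                                              # ~1e-6, floor of the decay

assert warmup_steps == 550 and constant_steps == 2750
assert math.isclose(min_lr, 1e-6)
```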
- fp16=model_config.fp16, - bf16=model_config.bf16, - ), - lr_scheduler=CosineAnnealingScheduler( - max_steps=num_steps, - # minimum learning rate is 1/100th of the initial learning rate, so eg lr=1e-3 -> min_lr=1e-5 - min_lr=optim_config.lr / 100, - warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), - interval=optim_config.interval, - monitor=optim_config.monitor, - constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), - ), - ), - ) - return model - - -@dataclass -class ExperimentConfig: - save_every_n_steps: int - result_dir: str - experiment_name: str - restore_from_checkpoint_path: Optional[str] - resume_if_exists: bool - wandb_options: WandbLoggerOptions = None # TODO(SKH) if we are passing a type in here its gonna blow up. - save_last_checkpoint: bool = True - metric_to_monitor_for_checkpoints: str = "reduced_train_loss" # TODO literal? - save_top_k: int = 2 - create_tensorboard_logger: bool = False - - -@run.cli.factory -@run.autoconvert -def experiment_config_recipe() -> ExperimentConfig: - return ExperimentConfig( - save_every_n_steps=100, - result_dir="./results", - experiment_name="default_experiment", - restore_from_checkpoint_path=None, - resume_if_exists=True, - save_last_checkpoint=True, - metric_to_monitor_for_checkpoints="reduced_train_loss", - save_top_k=2, - create_tensorboard_logger=False, - ) - - -@dataclass -class WandbConfig: - # NOTE(SKH) there is some duplication with WandbLoggerOptions - project: str # Must be set to log to wandb, this is the 'project' directory under your 'entity' - entity: str # Sometimes refers to team, sometimes username - offline: bool # If set does not log to wandb - - -def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: - checkpoint_callback = nl_callbacks.ModelCheckpoint( - save_last=experiment_config.save_last_checkpoint, - monitor=experiment_config.metric_to_monitor_for_checkpoints, - save_top_k=experiment_config.save_top_k, - every_n_train_steps=experiment_config.save_every_n_steps, - always_save_context=True, - ) - - wandb_options: Optional[WandbLoggerOptions] = ( - None - if wandb_config is None - else WandbLoggerOptions( - offline=wandb_config.offline, - project=wandb_config.project, - entity=wandb_config.entity, - log_model=False, - ) - ) - - # Setup the logger and train the model - nemo_logger = setup_nemo_lightning_logger( - root_dir=experiment_config.result_dir, - name=experiment_config.experiment_name, - initialize_tensorboard_logger=experiment_config.create_tensorboard_logger, - wandb_kwargs=wandb_options, - ckpt_callback=checkpoint_callback, - ) - return nemo_logger - - -def pretrain_partial( - model_config: ExposedModelConfig[ModelConfigT], - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - resume_if_exists: bool = True, - wandb_entity: Optional[str] = None, - wandb_project: Optional[str] = None, - wandb_offline: bool = True, -) -> run.Partial: - """Same as pretrain but in partial form instead of an entrypoint.""" - - return run.Partial( - pretrain, - model_config=model_config, - data_config=data_config, - parallel_config=parallel_config, - training_config=training_config, - optim_config=optim_config, - experiment_config=experiment_config, - # Remaining are things that live outside a config - resume_if_exists=resume_if_exists, - # These could live as their own config, but they dont make sense 
to use factories with since theyre dependent on the environment. - wandb_entity=wandb_entity, - wandb_project=wandb_project, - wandb_offline=wandb_offline, - ) - - -@run.cli.entrypoint -def pretrain( - model_config: ExposedModelConfig[ModelConfigT], # noqa - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - # Remaining are things that live outside a config - resume_if_exists: bool = True, - # These could live as their own config, but they dont make sense to use factories with since theyre dependent on the environment. - wandb_entity: Optional[str] = None, - wandb_project: Optional[str] = None, - wandb_offline: bool = True, - # ??? what was I doing with new_experiment title? - new_experiment_title="asdf", -): - model_config: ModelConfigT = model_config.exposed_to_internal_model_config() - - # Setup. - # Create requisite directory. - pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) - - if experiment_config.save_every_n_steps != training_config.val_check_interval: - logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") - experiment_config.save_every_n_steps = training_config.val_check_interval - - global_batch_size = infer_global_batch_size( - micro_batch_size=data_config.micro_batch_size, - num_nodes=parallel_config.num_nodes, - devices=parallel_config.num_devices, - accumulate_grad_batches=parallel_config.accumulate_grad_batches, - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - ) - - data: SingleCellDataModule = singlecell_data_module(data_config, global_batch_size) - # TODO there must be a way to do this automatically. - model_config.seq_length = data_config.seq_length - model_config.bf16 = training_config.precision == "bf16-mixed" - model_config.fp16 = training_config.precision == "16-mixed" - - model: BioBertLightningModule = biobert_lightning_module( - model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps - ) - trainer: nl.Trainer = setup_trainer_from_configs(parallel_config, training_config) - nemo_logger: nl.NeMoLogger = nemo_logger_factory( - experiment_config, wandb_config=WandbConfig(project=wandb_project, entity=wandb_entity, offline=wandb_offline) - ) - - llm.train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - resume=resume.AutoResume( - resume_if_exists=False, # To resume training a specific checkpoint simply set initial_ckpt_path in the ModelConfig. - resume_ignore_no_checkpoint=True, # When false this will throw an error with no existing checkpoint. - ), - ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py deleted file mode 100644 index 7d47f45267..0000000000 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ /dev/null @@ -1,134 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional, Union - -import nemo_run as run -import pydantic - -from bionemo.geneformer.run.factories import ( - DataConfig, - ExperimentConfig, - ExposedModelConfig, - ModelConfigT, - OptimizerSchedulerConfig, - ParallelConfig, - TrainingConfig, - pretrain_partial, -) - - -class NeMoRunConfig(pydantic.BaseModel): - # These are all mutually exclusive, I think thats important to capture. - # NOTE havent figured out how to use this config yet. - # could pass this into the entrypoint and do a branch based on the config - new_experiment_title: Optional[str] - resume_from_id: Optional[str] - resume_from_title: Optional[str] - - def __post_init__(self): - if not any([self.new_experiment_title, self.resume_from_id, self.resume_from_title]): - raise ValueError( - "Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. None are set." - ) - - if sum([bool(self.new_experiment_title), bool(self.resume_from_id), bool(self.resume_from_title)]) > 1: - raise ValueError( - "Exactly one of new_experiment_title, resume_from_id, resume_from_title must be set. More than one field was set." - ) - - -@run.cli.entrypoint -def run_again( - resume_from_id: Optional[ - str - ], # Note, in these cases we dont actually need the rest of the configs. Maybe these deserve distinct entrypoints. - resume_from_title: Optional[str], -): - """Example entrypoint of how to re-run an existing job.""" - assert ( - resume_from_id or resume_from_title - ), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." - assert not ( - resume_from_id and resume_from_title - ), "Exactly one of resume_from_id or resume_from_title must be set to rerun an experiment." - - # Setup the context manager with the correct entrypoint, expect these to be mutually exclusive - with run.Experiment.from_title(resume_from_title) if resume_from_title is not None else run.Experiment.from_id( - resume_from_id - ) as exp: - exp.executor = run.LocalExecutor() # Can we mutate? - exp.reset() - exp.run(direct=True, tail_logs=True, sequential=True) - - -@run.cli.entrypoint -def simple_example(this_or_that: Union[str, int]): - print(this_or_that) - - -@run.cli.entrypoint -def run_firsttime( - # NeMo Run controls. - experiment_title: str, - # Pretrain configuration requirements. - model_config: ExposedModelConfig[ModelConfigT], - data_config: DataConfig, - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - # Remaining are things that live outside a config - resume_if_exists: bool = True, - # WANDB - wandb_entity: Optional[str] = None, - wandb_project: Optional[str] = None, - wandb_offline: bool = True, - launcher: Optional[str] = None, -): - # Set launcher='torchrun' to execute on the cluster - # local_executor = run.LocalExecutor(launcher=launcher) - - local_executor = run.LocalExecutor(ntasks_per_node=parallel_config.num_devices, launcher=launcher) - with run.Experiment(title=experiment_title, executor=local_executor) as e: - # Input has to be a partial wrapper of pretrain? 
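To make the control flow of `run_firsttime` concrete, here is a condensed, hypothetical sketch of submitting a configured partial through NeMo-Run. It uses only calls that appear elsewhere in this patch (`run.Partial`, `run.LocalExecutor`, `run.Experiment`, `exp.add`, `exp.run`, `run.run`); `toy_task` is a stand-in for the real `pretrain_partial(...)` recipe.

```python
import nemo_run as run


@run.cli.entrypoint
def toy_task(message: str = "hello"):
    # Stand-in for the real pretrain entrypoint.
    print(f"running toy_task: {message}")


if __name__ == "__main__":
    # A Partial is a deferred, configurable call; an Experiment groups and executes such calls.
    recipe = run.Partial(toy_task, message="hello from nemo-run")
    local_executor = run.LocalExecutor(ntasks_per_node=1)

    with run.Experiment(title="toy-experiment", executor=local_executor) as e:
        e.add(recipe)
        e.run()

    # Alternatively, the slurm-run.py script above submits a recipe directly with:
    #   run.run(recipe, executor=executor, detach=True, dryrun=False)
```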
- e.add( - pretrain_partial( - model_config, - data_config, - parallel_config, - training_config, - optim_config, - experiment_config, - resume_if_exists, - wandb_entity, - wandb_project, - wandb_offline, - ) - ) - e.run() - - -if __name__ == "__main__": - run.cli.main(run_firsttime) - - """ - if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: - run.cli.main(run_firsttime) - else: - """ - # run.cli.main(simple_example) - # run.cli.main(run_again) From ac9a13f279a4f9a6f3fff03b8930907a2cc20fda Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 01:00:04 +0000 Subject: [PATCH 17/58] Updated design to match discussion with JSJ: instead of discriminated unions we will use GenericModels built into pydantic. Upside is we dont need to know all the variants at parse time, downside is we need to declare the concrete types at parse time. --- .../bionemo/geneformer/run/config_models.py | 179 +++++++++++++----- 1 file changed, 128 insertions(+), 51 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index f2d65e6e95..40c3328f6c 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -35,7 +35,7 @@ import pathlib from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar, Union +from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar import nemo_run as run import pytorch_lightning as pl @@ -49,7 +49,7 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.utils import logging -from pydantic import BaseModel, Field, ValidationError, field_serializer, field_validator, model_validator +from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer @@ -81,20 +81,54 @@ ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) +""" +This is actually easier to think about with DataModule beacuse there is no exposed/nonexposed relationship + +# DataConfig +make DataConfig[DataModuleT] -> DataModuleT + +in bionemo.llm.data.datamodule + + + + +BioNeMoDataModule + @abstractmethod + def from_data_config(cls, global_batch_size: int) -> type(cls): + ( This is generic, how do I make this thing from the config? should call out to the constructor and do the right stuff. ) + ... + +all compatable datamodules implement this method, this kinda sucks though because if you bring your own datamodule, +this isnt defined. Is this okay? I guess they just have to extend (combinator) + implement this method and its p straightforward. + +plus global_batch_size and micro_batch_size are distributed concepts, so it cant be generic over all ptl DataModules. + + + + +# ModelConfig +make ExposedModelConfig[ModelConfigT] -> ModelConfigT[ModelT] -> ModelT + the problem here is nested generics in a way that is probably more harmful than helpful. 
+ + probably still want to drop the ExposedConfig and just 'deal' with the fact that there are some naughty defaults in TransformerConfig +""" + class DataConfig(BaseModel, Generic[DataModuleT]): """Base class for all data configurations. This class is used to define the interface for all data configurations. It is used to define the data module that will be used in the training loop. - - !! note Children **MUST** include the field `data_config_type` to discriminate between available - data modules in the MasterConfig. Additionally, add the concrete type to the Union type annotation in MasterConfig. """ + # Are these indeed universal? micro_batch_size: int = 8 results_dir: str = "./results" + seq_length: int = 128 + # As an ABC this is okay but it makes the instantiation kinda tricky since were both generic over the DataModule + # but it also implies a 1-1 relationship between the data module and the data config + # I think we actually want, BioNeMoDataModule.from_data_config(global_batch_size) -> BioNeMoDataModule @abstractmethod def construct_data_module(self, global_batch_size: int) -> DataModuleT: """Construct the data module from the configuration. Cannot be defined generically.""" @@ -117,7 +151,6 @@ class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): result_dir: str = "./results" micro_batch_size: int = 8 - data_config_type: Literal["geneformer_pretraining_data_config"] = "geneformer_pretraining_data_config" data_dir: str seq_length: int = 2048 num_dataset_workers: int = 0 @@ -283,10 +316,6 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping the more esoteric configuration private to the underlying ModelConfigT. - - !! note Children **MUST** include the field to discriminate between available - bionemo_model_config_type: Literal["finetuning_seqlen_biobert"] = "finetuning_seqlen_biobert" # Immutable, declares how to discriminate between model types for pydantic - data modules in the MasterConfig. Additionally, add the concrete type to the Union type annotation in MasterConfig. """ # Pydantic stuff to allow arbitrary types + validators + serializers @@ -295,8 +324,11 @@ class Config: """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ - @abstractmethod - def model_class(self) -> Type[ModelConfigT]: ... + def model_class(self) -> Type[ModelConfigT]: + # How did this all work yesterday even? + # so we cant do it this way because we are kinda losing the magic of generics. + # ideally _the generics_ have all the methods we want implemented on them already. + return GeneformerConfig def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: """Converts the exposed dataclass to the underlying Transformer config. 
@@ -353,10 +385,6 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: nemo1_ckpt_path: Optional[str] = None biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec - @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") - def serialize_dtypes(self, v: torch.dtype) -> PrecisionTypes: - return dtypes.dtype_to_precision[v] - @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: @@ -386,11 +414,6 @@ def validate_activation_func(cls, activation_func: str) -> Callable: else: return func - @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") - @classmethod - def precision_validator(cls, v: PrecisionTypes) -> torch.dtype: - return dtypes.get_autocast_dtype(v) - @field_serializer("activation_func") def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: func_name = v.__name__ @@ -402,16 +425,14 @@ def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tenso else: raise ValueError(f"Unsupported activation function: {v}") + @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") + @classmethod + def precision_validator(cls, v: PrecisionTypes) -> torch.dtype: + return dtypes.get_autocast_dtype(v) -class ExposedGeneformerConfig(ExposedModelConfig[GeneformerConfig]): - """There are no additional arguments for Geneformer, so we simply plugin the associated types and move on.""" - - bionemo_model_config_type: Literal["geneformer"] = ( - "geneformer" # Immutable, declares how to discriminate between model types for pydantic - ) - - def model_class(self) -> Type[GeneformerConfig]: - return GeneformerConfig + @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") + def serialize_dtypes(self, v: torch.dtype) -> PrecisionTypes: + return dtypes.dtype_to_precision[v] class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): @@ -424,11 +445,6 @@ class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBer for ignoring extra additional layers used for finetuning. Layers with these keys are then randomly initialized. 
""" - # Used by discriminators - bionemo_model_config_type: Literal["finetuning_seqlen_biobert"] = ( - "finetuning_seqlen_biobert" # Immutable, declares how to discriminate between model types for pydantic - ) - # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None @@ -465,8 +481,8 @@ def geneformer10M_pretraining_recipe( nemo1_init_path: Optional[str] = None, initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, -) -> ExposedGeneformerConfig: - geneformer_config = ExposedGeneformerConfig( +) -> ExposedModelConfig[GeneformerConfig]: + geneformer_config = ExposedModelConfig( num_layers=6, hidden_size=256, ffn_hidden_size=512, @@ -552,7 +568,6 @@ class ExperimentConfig(BaseModel): result_dir: str experiment_name: str restore_from_checkpoint_path: Optional[str] - resume_if_exists: bool wandb_config: Optional[WandbConfig] = None save_last_checkpoint: bool = True metric_to_monitor_for_checkpoints: str = "reduced_train_loss" @@ -566,7 +581,6 @@ def experiment_config_recipe() -> ExperimentConfig: result_dir="./results", experiment_name="default_experiment", restore_from_checkpoint_path=None, - resume_if_exists=True, save_last_checkpoint=True, metric_to_monitor_for_checkpoints="reduced_train_loss", save_top_k=2, @@ -651,7 +665,57 @@ def pretrain( ) -class MasterConfig(BaseModel): +class MastererConfig(BaseModel, ABC): + data_config: DataConfig + parallel_config: ParallelConfig + training_config: TrainingConfig + bionemo_model_config: ExposedModelConfig[ModelConfigT] + optim_config: OptimizerSchedulerConfig + experiment_config: ExperimentConfig + wandb_config: Optional[WandbConfig] = None + + +""" +use the GenericModel abstraction in Pydantic for _runtime_ type resolution. + +use discriminated unions for parse-time type resolution (do we still have this?) + useful within sub-packages + +Random thing: check with george on how to configure PEFt + +bionemo/geneformer/model/geneformer.py +if __name__ == "__main__": + masterer_config = MastererConfig[GeneformerConfig](**params) + from bionemo.llm.entrypoint import train + train(master_config) + +bionemo/geneformer/model/geneformer2.py +if __name__ == "__main__": + masterer_config = MastererConfig[GeneformerConfig2](**params) + from bionemo.llm.entrypoint import train + pretrain(master_config) + +dino/dumbstuff/model/custom_dinos.py +if __name__ == "__main__": + # TODO register in pyproject.toml <-- optional + masterer_config = MastererConfig[CustomDino](**params) + from bionemo.llm.entrypoint import train + pretrain(master_config) + + for _ in whatever: + masterer_config.dtype = 'fp16' +""" + + +# Here in lies the meat of what is happening. + +# DataConfig -> some config that can make a data module (see ABC definition.) +DataConfigT = TypeVar("DataConfigT", bound=DataConfig) +# ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) 
+ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) + + +class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): """Mulling ways to make this generic over data modules: 1) ABC in our DataModule that supports DataConfig -> DataModule @@ -662,19 +726,16 @@ class MasterConfig(BaseModel): """ - data_config: Union[GeneformerPretrainingDataConfig] = Field(..., discriminator="data_config_type") + data_config: DataConfigT parallel_config: ParallelConfig training_config: TrainingConfig - # TODO expand this for all other relevant models here. - bionemo_model_config: Union[ExposedGeneformerConfig, ExposedFineTuneSeqLenBioBertConfig] = Field( - ..., discriminator="bionemo_model_config_type" - ) + bionemo_model_config: ExModelConfigT optim_config: OptimizerSchedulerConfig experiment_config: ExperimentConfig wandb_config: Optional[WandbConfig] = None @model_validator(mode="after") - def validate_master_config(self) -> "MasterConfig": + def validate_master_config(self) -> "MainConfig": self.bionemo_model_config.seq_length = self.data_config.seq_length # What other global validators should we set here? return self @@ -686,7 +747,7 @@ def recipes_to_config_json(model_cfg_type="geneformer"): data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe() parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() - if model_cfg_type == "geneformer": + if model_cfg_type == "geneformer" and False: bionemo_model_config = geneformer10M_pretraining_recipe() else: bionemo_model_config = geneformer_finetuning_regression_head_recipe() @@ -696,7 +757,7 @@ def recipes_to_config_json(model_cfg_type="geneformer"): wandb_config = WandbConfig(project="bionemo2-demo", entity="nvidia", offline=True) # Create the master config - master_config = MasterConfig( + master_config = MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]( data_config=data_config, parallel_config=parallel_config, training_config=training_config, @@ -728,14 +789,30 @@ def parse_args(): parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") return parser.parse_args() - def load_config(config_path: str) -> MasterConfig: + def load_config(config_path: str) -> MainConfig: with open(config_path, "r") as f: config_dict = json.load(f) - return MasterConfig(**config_dict) + # here we choose _which_ generics to parse. + # What does this look like in practice if we have a bunch of variants? + # how does the user choose which variant to parse? entrypoint? + # they could pass in a path via CLI, that would technically work. + return MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig](**config_dict) args = parse_args() config = load_config(args.config) - + # New + pretrain( + bionemo_exposed_model_config=config.bionemo_model_config, + data_config=config.data_config, + parallel_config=config.parallel_config, + training_config=config.training_config, + optim_config=config.optim_config, + experiment_config=config.experiment_config, + wandb_config=config.wandb_config, + resume_if_exists=False, + ) + exit() + # Old pretrain( bionemo_exposed_model_config=config.bionemo_model_config, data_config=config.data_config, From 467cb22a4b75334184e0bc81a11b8d221c527cd2 Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 20:41:50 +0000 Subject: [PATCH 18/58] Adds entrypoints for bionemo-geneformer-train and bionemo-geneformer-recipe reorganizes recipes, configs, and entrypoints. 
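The pydantic generic-model pattern these entrypoints rely on is easiest to see in isolation. Below is a minimal, self-contained sketch (illustrative class names only, not the real BioNeMo configs; assumes pydantic v2 generics, which the config models here already use) of how the concrete model/data config types are chosen at parse time rather than being registered in a discriminated union:

    from typing import Generic, Optional, TypeVar

    from pydantic import BaseModel

    class ModelConfigBase(BaseModel):
        seq_length: int = 128

    class FineTuneConfig(ModelConfigBase):
        initial_ckpt_path: Optional[str] = None

    class DataConfigBase(BaseModel):
        micro_batch_size: int = 8

    class PretrainDataConfig(DataConfigBase):
        data_dir: str

    ModelT = TypeVar("ModelT", bound=ModelConfigBase)
    DataT = TypeVar("DataT", bound=DataConfigBase)

    class MainishConfig(BaseModel, Generic[ModelT, DataT]):
        bionemo_model_config: ModelT
        data_config: DataT

    raw = {
        "bionemo_model_config": {"seq_length": 2048, "initial_ckpt_path": "some.ckpt"},
        "data_config": {"micro_batch_size": 4, "data_dir": "/data/cellxgene"},
    }

    # The caller names the concrete variants at parse time; pydantic validates the nested
    # dicts against exactly those types, so new variants never need a central registry.
    cfg = MainishConfig[FineTuneConfig, PretrainDataConfig](**raw)
    assert isinstance(cfg.bionemo_model_config, FineTuneConfig)
    assert isinstance(cfg.data_config, PretrainDataConfig)

The trade-off noted in the previous patch still applies: whoever deserializes the JSON has to name the concrete types, which is what the new --model-config-t / --data-config-t flags on bionemo-geneformer-train are for.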
--- .../bionemo-geneformer/pyproject.toml | 4 + .../bionemo/geneformer/run/config_models.py | 429 +----------------- .../src/bionemo/geneformer/run/main.py | 57 +++ .../src/bionemo/geneformer/run/recipes.py | 160 +++++++ .../src/bionemo/llm/utils/logger_utils.py | 9 + 5 files changed, 238 insertions(+), 421 deletions(-) create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py create mode 100644 sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py diff --git a/sub-packages/bionemo-geneformer/pyproject.toml b/sub-packages/bionemo-geneformer/pyproject.toml index df280a9829..b560d5df03 100644 --- a/sub-packages/bionemo-geneformer/pyproject.toml +++ b/sub-packages/bionemo-geneformer/pyproject.toml @@ -11,6 +11,10 @@ authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] dynamic = ["version"] dependencies = ['bionemo-core', 'bionemo-llm'] +[project.scripts] +bionemo-geneformer-train= "bionemo.geneformer.run.main:main" +bionemo-geneformer-recipe= "bionemo.geneformer.run.recipes:main" + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index 40c3328f6c..96118c1720 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -37,10 +37,10 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar -import nemo_run as run import pytorch_lightning as pl import torch import torch.nn.functional as F +from bionemo.llm.config.config_models import DataConfig, DataModuleT, ExperimentConfig, ExposedModelConfig, MainConfig, ModelConfigT, OptimizerSchedulerConfig, ParallelConfig, TrainingConfig from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -65,77 +65,6 @@ from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger -# If you'd like to register a custom activation function, you can add it to this dictionary to pass validation and allow serialization. -CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} - -# NOTE(SKH): DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. -for key in CUSTOM_ACTIVATION_FNS: - assert key not in dir(torch.nn.functional), f"Key {key} already exists in torch.nn.functional" - -# NOTE(SKH): it does not matter if values are duplicated as the key=>value mapping still does the right thing. Repeat values should be considered aliases. -REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { - v: k for k, v in CUSTOM_ACTIVATION_FNS.items() -} - - -ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) -DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) - -""" -This is actually easier to think about with DataModule beacuse there is no exposed/nonexposed relationship - -# DataConfig -make DataConfig[DataModuleT] -> DataModuleT - -in bionemo.llm.data.datamodule - - - - -BioNeMoDataModule - @abstractmethod - def from_data_config(cls, global_batch_size: int) -> type(cls): - ( This is generic, how do I make this thing from the config? should call out to the constructor and do the right stuff. ) - ... 
- -all compatable datamodules implement this method, this kinda sucks though because if you bring your own datamodule, -this isnt defined. Is this okay? I guess they just have to extend (combinator) + implement this method and its p straightforward. - -plus global_batch_size and micro_batch_size are distributed concepts, so it cant be generic over all ptl DataModules. - - - - -# ModelConfig -make ExposedModelConfig[ModelConfigT] -> ModelConfigT[ModelT] -> ModelT - the problem here is nested generics in a way that is probably more harmful than helpful. - - probably still want to drop the ExposedConfig and just 'deal' with the fact that there are some naughty defaults in TransformerConfig -""" - - -class DataConfig(BaseModel, Generic[DataModuleT]): - """Base class for all data configurations. - - This class is used to define the interface for all data configurations. It is used to define the data module that - will be used in the training loop. - """ - - # Are these indeed universal? - micro_batch_size: int = 8 - results_dir: str = "./results" - seq_length: int = 128 - - # As an ABC this is okay but it makes the instantiation kinda tricky since were both generic over the DataModule - # but it also implies a 1-1 relationship between the data module and the data config - # I think we actually want, BioNeMoDataModule.from_data_config(global_batch_size) -> BioNeMoDataModule - @abstractmethod - def construct_data_module(self, global_batch_size: int) -> DataModuleT: - """Construct the data module from the configuration. Cannot be defined generically.""" - ... - - -# TODO do we need this? @dataclass class GeneformerDataArtifacts: """Data artifacts produced by the geneformer preprocess.""" @@ -236,25 +165,6 @@ def geneformer_preprocess(data_config: GeneformerPretrainingDataConfig) -> Genef raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") -class ParallelConfig(BaseModel): - tensor_model_parallel_size: int = 1 - pipeline_model_parallel_size: int = 1 - accumulate_grad_batches: int = 1 - ddp: Literal["megatron"] = "megatron" - remove_unused_parameters: bool = True - num_devices: int = 1 - num_nodes: int = 1 - - @model_validator(mode="after") - def validate_devices(self): - # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel - if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: - raise ValidationError( - "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - ) - return self - - def simple_parallel_recipe( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 ) -> ParallelConfig: @@ -268,14 +178,6 @@ def simple_parallel_recipe( ) -class TrainingConfig(BaseModel): - max_steps: int - limit_val_batches: int - val_check_interval: int - # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. - precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" - accelerator: str = "gpu" - def default_trainer_config_recipe() -> TrainingConfig: return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) @@ -307,134 +209,6 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf return trainer -class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): - """BioNeMo model configuration class, wraps TransformerConfig and friends. - - This class is used to define the interface for all model configurations. 
It is **Exposed** to guard against ill-typed - or poorly defined fields in the underlying configuration objects. `ModelConfigT` declares the associated type of the - underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). - Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping - the more esoteric configuration private to the underlying ModelConfigT. - - """ - - # Pydantic stuff to allow arbitrary types + validators + serializers - class Config: - arbitrary_types_allowed = True - - """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ - - def model_class(self) -> Type[ModelConfigT]: - # How did this all work yesterday even? - # so we cant do it this way because we are kinda losing the magic of generics. - # ideally _the generics_ have all the methods we want implemented on them already. - return GeneformerConfig - - def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: - """Converts the exposed dataclass to the underlying Transformer config. - - The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to - hide fields that are either not serializable by Pydantic or that we do not want to expose. - - This is a good candidate for refactoring. - """ - - cls: Type[ModelConfigT] = self.model_class() - model_dict = {} - for attr in self.model_fields: - if attr not in model_dict and attr in cls.__dataclass_fields__: - model_dict[attr] = getattr(self, attr) - # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig - # the only constraint is that both must not be true. - model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] - model_dict["fp16"] = self.pipeline_dtype == dtypes.precision_to_dtype["16-mixed"] - result = cls(**model_dict) - - return result - - # NOTE: See PrecisionTypes for a list of valid literals that may be deserialized. - params_dtype: torch.dtype - pipeline_dtype: torch.dtype - autocast_dtype: torch.dtype - - num_layers: int = 6 - hidden_size: int = 256 - ffn_hidden_size: int = 512 - num_attention_heads: int = 4 - seq_length: int = 512 - fp32_residual_connection: bool = False - hidden_dropout: float = 0.02 - init_method_std: float = 0.02 - kv_channels: Optional[int] = None - apply_query_key_layer_scaling: bool = False - make_vocab_size_divisible_by: int = 128 - masked_softmax_fusion: bool = True - fp16_lm_cross_entropy: bool = False - gradient_accumulation_fusion: bool = False - layernorm_zero_centered_gamma: bool = False - layernorm_epsilon: float = 1.0e-12 - activation_func: Callable[[torch.Tensor, Any], torch.Tensor] = F.gelu - qk_layernorm: bool = False - apply_residual_connection_post_layernorm: bool = False - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - get_attention_mask_from_fusion: bool = False - attention_dropout: float = 0.1 - share_embeddings_and_output_weights: bool = True - enable_autocast: bool = False - nemo1_ckpt_path: Optional[str] = None - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec - - @field_validator("activation_func", mode="before") - @classmethod - def validate_activation_func(cls, activation_func: str) -> Callable: - """ - Validates the activation function, assumes this function exists in torch.nn.functional. 
For custom - activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. - - This method validates the provided activation function string and returns - a callable function based on the validation context using the provided validator in the base class. - Args: - activation_func (str): The activation function to be validated. - context (ValidationInfo): The context for validation. - Returns: - Callable: A callable function after validation. - - See Also: - CUSTOM_ACTIVATION_FNS - """ - func = getattr(torch.nn.functional, activation_func.lower(), None) - if func is None and activation_func in CUSTOM_ACTIVATION_FNS: - func = CUSTOM_ACTIVATION_FNS[activation_func] - return func - elif func is None: - raise ValidationError( - f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" - ) - else: - return func - - @field_serializer("activation_func") - def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: - func_name = v.__name__ - func = getattr(torch.nn.functional, func_name, None) - if func is not None: - return func_name - elif func in REVERSE_CUSTOM_ACTIVATION_FNS: - return REVERSE_CUSTOM_ACTIVATION_FNS[func] # Get the serialization key - else: - raise ValueError(f"Unsupported activation function: {v}") - - @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") - @classmethod - def precision_validator(cls, v: PrecisionTypes) -> torch.dtype: - return dtypes.get_autocast_dtype(v) - - @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") - def serialize_dtypes(self, v: torch.dtype) -> PrecisionTypes: - return dtypes.dtype_to_precision[v] - - class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): """Config for models that fine-tune a BioBERT model from a pre-trained checkpoint. @@ -482,6 +256,8 @@ def geneformer10M_pretraining_recipe( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, ) -> ExposedModelConfig[GeneformerConfig]: + raise NotImplementedError("Not a useful recipe, finetuning should be used instead (and renamed)") + # TODO this is ABC, cant instantiate it. geneformer_config = ExposedModelConfig( num_layers=6, hidden_size=256, @@ -518,15 +294,6 @@ def geneformer10M_pretraining_recipe( return geneformer_config -class OptimizerSchedulerConfig(BaseModel): - # TODO could use validators on optimizer, interval, and monitor. 
- - lr: float = 1e-4 - optimizer: str = "adam" - cosine_rampup_frac: float = 0.01 - cosine_hold_frac: float = 0.05 - interval: str = "step" - monitor: str = "val_loss" def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: @@ -563,16 +330,6 @@ def biobert_lightning_module( return model -class ExperimentConfig(BaseModel): - save_every_n_steps: int - result_dir: str - experiment_name: str - restore_from_checkpoint_path: Optional[str] - wandb_config: Optional[WandbConfig] = None - save_last_checkpoint: bool = True - metric_to_monitor_for_checkpoints: str = "reduced_train_loss" - save_top_k: int = 2 - create_tensorboard_logger: bool = False def experiment_config_recipe() -> ExperimentConfig: @@ -597,17 +354,6 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio always_save_context=True, ) - wandb_config: Optional[WandbConfig] = ( - None - if wandb_config is None - else WandbConfig( - offline=wandb_config.offline, - project=wandb_config.project, - entity=wandb_config.entity, - log_model=False, - ) - ) - nemo_logger = setup_nemo_lightning_logger( root_dir=experiment_config.result_dir, name=experiment_config.experiment_name, @@ -618,8 +364,7 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio return nemo_logger -@run.cli.entrypoint -def pretrain( +def train( bionemo_exposed_model_config: ExposedModelConfig, data_config: DataConfig[DataModuleT], parallel_config: ParallelConfig, @@ -645,8 +390,10 @@ def pretrain( pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, ) - data: SingleCellDataModule = data_config.construct_data_module(global_batch_size) + data: DataModuleT = data_config.construct_data_module(global_batch_size) + # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case have data.tokenizer, + # although this constraint is not documented. model: BioBertLightningModule = biobert_lightning_module( bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps ) @@ -662,164 +409,4 @@ def pretrain( resume_if_exists=resume_if_exists, resume_ignore_no_checkpoint=True, ), - ) - - -class MastererConfig(BaseModel, ABC): - data_config: DataConfig - parallel_config: ParallelConfig - training_config: TrainingConfig - bionemo_model_config: ExposedModelConfig[ModelConfigT] - optim_config: OptimizerSchedulerConfig - experiment_config: ExperimentConfig - wandb_config: Optional[WandbConfig] = None - - -""" -use the GenericModel abstraction in Pydantic for _runtime_ type resolution. - -use discriminated unions for parse-time type resolution (do we still have this?) - useful within sub-packages - -Random thing: check with george on how to configure PEFt - -bionemo/geneformer/model/geneformer.py -if __name__ == "__main__": - masterer_config = MastererConfig[GeneformerConfig](**params) - from bionemo.llm.entrypoint import train - train(master_config) - -bionemo/geneformer/model/geneformer2.py -if __name__ == "__main__": - masterer_config = MastererConfig[GeneformerConfig2](**params) - from bionemo.llm.entrypoint import train - pretrain(master_config) - -dino/dumbstuff/model/custom_dinos.py -if __name__ == "__main__": - # TODO register in pyproject.toml <-- optional - masterer_config = MastererConfig[CustomDino](**params) - from bionemo.llm.entrypoint import train - pretrain(master_config) - - for _ in whatever: - masterer_config.dtype = 'fp16' -""" - - -# Here in lies the meat of what is happening. 
- -# DataConfig -> some config that can make a data module (see ABC definition.) -DataConfigT = TypeVar("DataConfigT", bound=DataConfig) -# ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) -ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) - - -class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): - """Mulling ways to make this generic over data modules: - - 1) ABC in our DataModule that supports DataConfig -> DataModule - pros: - cons: - 2) Discriminated union on data_config, additionally needs a method that also takes this union and produces the correct data module. - 3) Pick one and highlight the other approach in either the SDD, PR, or both. - - """ - - data_config: DataConfigT - parallel_config: ParallelConfig - training_config: TrainingConfig - bionemo_model_config: ExModelConfigT - optim_config: OptimizerSchedulerConfig - experiment_config: ExperimentConfig - wandb_config: Optional[WandbConfig] = None - - @model_validator(mode="after") - def validate_master_config(self) -> "MainConfig": - self.bionemo_model_config.seq_length = self.data_config.seq_length - # What other global validators should we set here? - return self - - -def recipes_to_config_json(model_cfg_type="geneformer"): - """Simple example for creating a JSON from recipes.""" - - data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe() - parallel_config = simple_parallel_recipe() - training_config = default_trainer_config_recipe() - if model_cfg_type == "geneformer" and False: - bionemo_model_config = geneformer10M_pretraining_recipe() - else: - bionemo_model_config = geneformer_finetuning_regression_head_recipe() - - optim_config = default_adam_optimizer_with_cosine_annealing_recipe() - experiment_config = experiment_config_recipe() - wandb_config = WandbConfig(project="bionemo2-demo", entity="nvidia", offline=True) - - # Create the master config - master_config = MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]( - data_config=data_config, - parallel_config=parallel_config, - training_config=training_config, - bionemo_model_config=bionemo_model_config, - optim_config=optim_config, - experiment_config=experiment_config, - wandb_config=wandb_config, - ) - - # Serialize to JSON - json_str = master_config.model_dump_json(indent=2) - - # Save to file - with open( - "/workspaces/bionemo-fw-ea/sub-packages/bionemo-geneformer/src/bionemo/geneformer/conf/default-geneformer-config.json", - "w", - ) as f: - f.write(json_str) - - print("Configuration saved to config.json") - - -if __name__ == "__main__": - recipes_to_config_json("geneformer") - # recipes_to_config_json('finetune') - - def parse_args(): - parser = argparse.ArgumentParser(description="Run Geneformer pretraining") - parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") - return parser.parse_args() - - def load_config(config_path: str) -> MainConfig: - with open(config_path, "r") as f: - config_dict = json.load(f) - # here we choose _which_ generics to parse. - # What does this look like in practice if we have a bunch of variants? - # how does the user choose which variant to parse? entrypoint? - # they could pass in a path via CLI, that would technically work. 
- return MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig](**config_dict) - - args = parse_args() - config = load_config(args.config) - # New - pretrain( - bionemo_exposed_model_config=config.bionemo_model_config, - data_config=config.data_config, - parallel_config=config.parallel_config, - training_config=config.training_config, - optim_config=config.optim_config, - experiment_config=config.experiment_config, - wandb_config=config.wandb_config, - resume_if_exists=False, - ) - exit() - # Old - pretrain( - bionemo_exposed_model_config=config.bionemo_model_config, - data_config=config.data_config, - parallel_config=config.parallel_config, - training_config=config.training_config, - optim_config=config.optim_config, - experiment_config=config.experiment_config, - wandb_config=config.wandb_config, - resume_if_exists=False, - ) + ) \ No newline at end of file diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py new file mode 100644 index 0000000000..ea954d84d6 --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -0,0 +1,57 @@ +from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig +from bionemo.llm.train import train +from bionemo.llm.config.config_models import MainConfig +import argparse +import json +from typing import Optional + + +def main(): + def parse_args(): + parser = argparse.ArgumentParser(description="Run Geneformer pretraining") + parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") + parser.add_argument("--model-config-t", default=ExposedFineTuneSeqLenBioBertConfig, required=False, help="fully resolvable python import path to the ModelConfig object.") + parser.add_argument("--data-config-t", default=GeneformerPretrainingDataConfig, required=False, help="fully resolvable python import path to the ModelConfig object.") + parser.add_argument("--resume-if-exists", default=True, help="Resume training if a checkpoint exists that matches the current experiment configuration.") + return parser.parse_args() + + def string_to_class(path: str): + import importlib + module_path, class_name = path.rsplit('.', 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig: + with open(config_path, "r") as f: + config_dict = json.load(f) + + # model/data_config_t is used to select the parser dynamically. + if model_config_t is None: + # our parser doesnt like literals that are already imported. 
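+            # (Defensive fallback: the argparse default above is already the class object, so this
+            #  branch only fires if None is passed explicitly. A user-supplied value arrives as a
+            #  dotted import path, e.g.
+            #  "bionemo.geneformer.run.config_models.ExposedFineTuneSeqLenBioBertConfig",
+            #  and is resolved by string_to_class below.)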
+ model_config_t = ExposedFineTuneSeqLenBioBertConfig + elif isinstance(model_config_t, str): + model_config_t = string_to_class(model_config_t) + + if data_config_t is None: + data_config_t = GeneformerPretrainingDataConfig + elif isinstance(data_config_t, str): + data_config_t = string_to_class(data_config_t) + + return MainConfig[model_config_t, data_config_t](**config_dict) + + args = parse_args() + config = load_config(args.config, args.model_config_t, args.data_config_t) + # New + train( + bionemo_exposed_model_config=config.bionemo_model_config, + data_config=config.data_config, + parallel_config=config.parallel_config, + training_config=config.training_config, + optim_config=config.optim_config, + experiment_config=config.experiment_config, + wandb_config=config.wandb_config, + resume_if_exists=args.resume_if_exists, + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py new file mode 100644 index 0000000000..2f710a2396 --- /dev/null +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -0,0 +1,160 @@ +import argparse +from typing import Optional +from bionemo.core.utils.dtypes import PrecisionTypes +from bionemo.geneformer.api import GeneformerConfig +from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig +from bionemo.llm.config.config_models import ExperimentConfig, ExposedModelConfig, MainConfig, OptimizerSchedulerConfig, TrainingConfig, ParallelConfig +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption +from bionemo.llm.utils.logger_utils import WandbConfig +from typing import List +from nemo.utils import logging +import os + +def geneformer_small_data_recipe( + data_dir +) -> GeneformerPretrainingDataConfig: + """Recipe that produces the base geneformer small data configuration.""" + return GeneformerPretrainingDataConfig(data_dir=data_dir) + + +def full_geneformer_data_recipe( + data_dir +) -> GeneformerPretrainingDataConfig: + return GeneformerPretrainingDataConfig(data_dir=data_dir) + + +def simple_parallel_recipe( + tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 +) -> ParallelConfig: + assert ( + num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + return ParallelConfig( + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + num_devices=num_devices, + ) + +def geneformer_finetuning_regression_head_recipe( + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, +) -> ExposedFineTuneSeqLenBioBertConfig: + # NOTE (SKH): this recipe is sad because it isnt smart enough to know our validator is returning a dtype. 
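+    # The PrecisionTypes literal passed here (e.g. "bf16-mixed") is coerced into a torch.dtype by the
+    # `precision_validator` field validator on ExposedModelConfig (via dtypes.get_autocast_dtype), so
+    # static type checkers flag a mismatch even though the value validates cleanly at runtime.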
+ finetuning_config = ExposedFineTuneSeqLenBioBertConfig( + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, + ) + return finetuning_config + + +def default_trainer_config_recipe() -> TrainingConfig: + return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) + + +def geneformer10M_pretraining_recipe( + seq_length: int = 128, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, +) -> ExposedFineTuneSeqLenBioBertConfig: + geneformer_config = ExposedFineTuneSeqLenBioBertConfig( + num_layers=6, + hidden_size=256, + ffn_hidden_size=512, + num_attention_heads=4, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config + + +def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + return OptimizerSchedulerConfig() + +def experiment_config_recipe() -> ExperimentConfig: + return ExperimentConfig( + save_every_n_steps=100, + result_dir="./results", + experiment_name="default_experiment", + restore_from_checkpoint_path=None, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints="reduced_train_loss", + save_top_k=2, + create_tensorboard_logger=False, + ) + + +def main(): + + def parse_args(): + parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") + parser.add_argument("--dest", type=str, default='./geneformer-recipe.json', required=True, help="Path to the JSON configuration file.") + parser.add_argument("--data-dir", type=str, required=True, help="Path to the directory containing pretraining data.") + args = parser.parse_args() + return args + + + """Simple example for creating a JSON from recipes.""" + + args = parse_args() + data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe(data_dir=args.data_dir) + parallel_config = simple_parallel_recipe() + training_config = default_trainer_config_recipe() + bionemo_model_config = geneformer_finetuning_regression_head_recipe() + optim_config = default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config = experiment_config_recipe() + wandb_config = WandbConfig(project="bionemo2-demo", entity="nvidia", offline=True, tags=[], group="dev", id="dev", log_model=False, anonymous=True) + + # Create the master config + master_config = MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]( + data_config=data_config, + 
parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, + ) + + # Serialize to JSON + json_str = master_config.model_dump_json(indent=2) + + # Save to file + with open( + args.dest, + "w", + ) as f: + f.write(json_str) + logging.info(f'Saved configuration to {args.dest=}') diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py index 54121fa6a5..d7fb67968f 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py @@ -31,6 +31,15 @@ class WandbConfig(BaseModel): """Note: `name` controls the exp name is handled by the NeMoLogger so it is ommitted here. `directory` is also omitted since it is set by the NeMoLogger. + + Args: + entity: The team posting this run (default: your username or your default team) + project: The name of the project to which this run will belong. + tags: Tags associated with this run. + group: A unique string shared by all runs in a given group + offline: Run offline (data can be streamed later to wandb servers). + id: Sets the version, mainly used to resume a previous run. + anonymous: Enables or explicitly disables anonymous logging. """ # noqa: D205 entity: str # The team posting this run (default: your username or your default team) From 4393fc4628f61f75b92b21c0cad2b44d7ed34336 Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 20:48:49 +0000 Subject: [PATCH 19/58] add the bionemo.llm config and train files --- .../src/bionemo/llm/config/config_models.py | 272 ++++++++++++++++++ .../bionemo-llm/src/bionemo/llm/train.py | 155 ++++++++++ 2 files changed, 427 insertions(+) create mode 100644 sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py create mode 100644 sub-packages/bionemo-llm/src/bionemo/llm/train.py diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py new file mode 100644 index 0000000000..6b82a6b6e8 --- /dev/null +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -0,0 +1,272 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
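+
+# Shared, model-agnostic Pydantic configuration surface for BioNeMo training runs: the generic
+# DataConfig / ExposedModelConfig ABCs plus ParallelConfig, TrainingConfig, OptimizerSchedulerConfig,
+# ExperimentConfig, and the top-level MainConfig that ties them together. Typical use (sketch only;
+# the concrete Exposed*/Data* classes live in the per-model sub-packages, e.g. bionemo-geneformer):
+#
+#     config = MainConfig[MyExposedModelConfig, MyDataConfig](**json.loads(path.read_text()))
+#     train(...)  # see bionemo.llm.train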
+ + +from typing import Any, Callable, Dict, Generic, Literal, Optional, Type, TypeVar +from abc import ABC, abstractmethod +from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator +from nemo.lightning import resume +from nemo import lightning as nl +import pytorch_lightning as pl +import torch +from torch.nn import functional as F +from nemo.collections import llm + +from bionemo.core.utils import dtypes +from bionemo.geneformer.api import GeneformerConfig +from bionemo.llm.model.biobert.model import BioBertGenericConfig +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption +from bionemo.llm.utils.logger_utils import WandbConfig + +ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) +DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) + +# To register a custom activation function, add it to this dictionary to pass validation and allow serialization. +CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} + +# DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. +for key in CUSTOM_ACTIVATION_FNS: + assert key not in dir(torch.nn.functional), f"Key {key} already exists in torch.nn.functional" + +# It does not matter if values are duplicated as the key=>value mapping still does the right thing. Repeat values should be considered aliases. +REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { + v: k for k, v in CUSTOM_ACTIVATION_FNS.items() +} +class DataConfig(BaseModel, Generic[DataModuleT], ABC): + """Base class for all data configurations. + + This class is used to define the interface for all data configurations. It is used to define the data module that + will be used in the training loop. + """ + + micro_batch_size: int = 8 + result_dir: str = "./results" + seq_length: int = 128 + + @abstractmethod + def construct_data_module(self, global_batch_size: int) -> DataModuleT: + """Construct the data module from the configuration. Cannot be defined generically.""" + ... + +class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): + """BioNeMo model configuration class, wraps TransformerConfig and friends. + + This class is used to define the interface for all model configurations. It is **Exposed** to guard against ill-typed + or poorly defined fields in the underlying configuration objects. `ModelConfigT` declares the associated type of the + underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). + Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping + the more esoteric configuration private to the underlying ModelConfigT. + + """ + + # Pydantic stuff to allow arbitrary types + validators + serializers + class Config: + arbitrary_types_allowed = True + + """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ + + def model_class(self) -> Type[ModelConfigT]: + # How did this all work yesterday even? + # so we cant do it this way because we are kinda losing the magic of generics. + # ideally _the generics_ have all the methods we want implemented on them already. + # TODO (SKH) + raise NotImplementedError + return GeneformerConfig + + def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: + """Converts the exposed dataclass to the underlying Transformer config. 
+ + The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to + hide fields that are either not serializable by Pydantic or that we do not want to expose. + + This is a good candidate for refactoring. + """ + + cls: Type[ModelConfigT] = self.model_class() + model_dict = {} + for attr in self.model_fields: + if attr not in model_dict and attr in cls.__dataclass_fields__: + model_dict[attr] = getattr(self, attr) + # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig + # the only constraint is that both must not be true. + model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] + model_dict["fp16"] = self.pipeline_dtype == dtypes.precision_to_dtype["16-mixed"] + result = cls(**model_dict) + + return result + + # NOTE: See PrecisionTypes for a list of valid literals that may be deserialized. + params_dtype: torch.dtype + pipeline_dtype: torch.dtype + autocast_dtype: torch.dtype + + num_layers: int = 6 + hidden_size: int = 256 + ffn_hidden_size: int = 512 + num_attention_heads: int = 4 + seq_length: int = 512 + fp32_residual_connection: bool = False + hidden_dropout: float = 0.02 + init_method_std: float = 0.02 + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: bool = False + make_vocab_size_divisible_by: int = 128 + masked_softmax_fusion: bool = True + fp16_lm_cross_entropy: bool = False + gradient_accumulation_fusion: bool = False + layernorm_zero_centered_gamma: bool = False + layernorm_epsilon: float = 1.0e-12 + activation_func: Callable[[torch.Tensor, Any], torch.Tensor] = F.gelu + qk_layernorm: bool = False + apply_residual_connection_post_layernorm: bool = False + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + get_attention_mask_from_fusion: bool = False + attention_dropout: float = 0.1 + share_embeddings_and_output_weights: bool = True + enable_autocast: bool = False + nemo1_ckpt_path: Optional[str] = None + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec + + @field_validator("activation_func", mode="before") + @classmethod + def validate_activation_func(cls, activation_func: str) -> Callable: + """ + Validates the activation function, assumes this function exists in torch.nn.functional. For custom + activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. + + This method validates the provided activation function string and returns + a callable function based on the validation context using the provided validator in the base class. + Args: + activation_func (str): The activation function to be validated. + context (ValidationInfo): The context for validation. + Returns: + Callable: A callable function after validation. 
+ + See Also: + CUSTOM_ACTIVATION_FNS + """ + func = getattr(torch.nn.functional, activation_func.lower(), None) + if func is None and activation_func in CUSTOM_ACTIVATION_FNS: + func = CUSTOM_ACTIVATION_FNS[activation_func] + return func + elif func is None: + raise ValidationError( + f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" + ) + else: + return func + + @field_serializer("activation_func") + def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: + func_name = v.__name__ + func = getattr(torch.nn.functional, func_name, None) + if func is not None: + return func_name + elif func in REVERSE_CUSTOM_ACTIVATION_FNS: + return REVERSE_CUSTOM_ACTIVATION_FNS[func] # Get the serialization key + else: + raise ValueError(f"Unsupported activation function: {v}") + + @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") + @classmethod + def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: + return dtypes.get_autocast_dtype(v) + + @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") + def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: + return dtypes.dtype_to_precision[v] + +class ParallelConfig(BaseModel): + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + accumulate_grad_batches: int = 1 + ddp: Literal["megatron"] = "megatron" + remove_unused_parameters: bool = True + num_devices: int = 1 + num_nodes: int = 1 + + @model_validator(mode="after") + def validate_devices(self): + # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel + if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: + raise ValidationError( + "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + ) + return self + +class TrainingConfig(BaseModel): + max_steps: int + limit_val_batches: int + val_check_interval: int + # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. + precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" + accelerator: str = "gpu" + +class OptimizerSchedulerConfig(BaseModel): + # TODO validators on optimizer, interval, and monitor. + lr: float = 1e-4 + optimizer: str = "adam" + cosine_rampup_frac: float = 0.01 + cosine_hold_frac: float = 0.05 + interval: str = "step" + monitor: str = "val_loss" + +class ExperimentConfig(BaseModel): + save_every_n_steps: int + result_dir: str + experiment_name: str + restore_from_checkpoint_path: Optional[str] + wandb_config: Optional[WandbConfig] = None + save_last_checkpoint: bool = True + metric_to_monitor_for_checkpoints: str = "reduced_train_loss" + save_top_k: int = 2 + create_tensorboard_logger: bool = False + +# DataConfig -> some config that can make a data module (see ABC definition.) +DataConfigT = TypeVar("DataConfigT", bound=DataConfig) +# ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) +ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) + +class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): + ''' Main configuration class for BioNeMo. All serialized configs that are a valid MainConfig should be Runnable. + + This class is used to define the main configuration for BioNeMo. It defines the minimal pieces of configuration + to execution a training job with the NeMo2 training api. 
It accepts two generic type parameters which users + must define in their own environment for execution. + + Args: + data_config: Generic config type that contains instructions on instantiating the required DataModule. + parallel_config: The parallel configuration for the model. + training_config: The training configuration for the model. + bionemo_model_config: Generic ExposedModelConfig type. This class hides extra configuration parameters in the + underlying model configuration as well as providing + optim_config: The optimizer/scheduler configuration for the model. + experiment_config: The experiment configuration for the model. + wandb_config: Optional, the wandb configuration for the model. + ''' + data_config: DataConfigT + parallel_config: ParallelConfig + training_config: TrainingConfig + bionemo_model_config: ExModelConfigT + optim_config: OptimizerSchedulerConfig + experiment_config: ExperimentConfig + wandb_config: Optional[WandbConfig] = None + + @model_validator(mode="after") + def validate_master_config(self) -> "MainConfig": + self.bionemo_model_config.seq_length = self.data_config.seq_length + # What other global validators should we set here? + return self \ No newline at end of file diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py new file mode 100644 index 0000000000..666341b206 --- /dev/null +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pathlib +from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler +import math +from typing import Optional +from megatron.core.optimizer import OptimizerConfig +from bionemo.llm.config.config_models import DataModuleT, ExperimentConfig, ParallelConfig, TrainingConfig, ExposedModelConfig, DataConfig, OptimizerSchedulerConfig +from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.model import BioBertGenericConfig +from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger +from nemo import lightning as nl +from nemo.utils import logging +from nemo.collections import llm +from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from tokenizers import Tokenizer +from nemo.lightning import resume +from nemo.lightning.pytorch import callbacks as nl_callbacks + +def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + checkpoint_callback = nl_callbacks.ModelCheckpoint( + save_last=experiment_config.save_last_checkpoint, + monitor=experiment_config.metric_to_monitor_for_checkpoints, + save_top_k=experiment_config.save_top_k, + every_n_train_steps=experiment_config.save_every_n_steps, + always_save_context=True, + ) + + nemo_logger = setup_nemo_lightning_logger( + root_dir=experiment_config.result_dir, + name=experiment_config.experiment_name, + initialize_tensorboard_logger=experiment_config.create_tensorboard_logger, + wandb_config=wandb_config, + ckpt_callback=checkpoint_callback, + ) + return nemo_logger + + +def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ddp="megatron", + find_unused_parameters=True, + ckpt_include_optimizer=True, + ) + + trainer = nl.Trainer( + devices=parallel_config.num_devices, + max_steps=training_config.max_steps, + accelerator=training_config.accelerator, + strategy=strategy, + limit_val_batches=training_config.limit_val_batches, + val_check_interval=training_config.val_check_interval, + num_nodes=parallel_config.num_nodes, + callbacks=[ + RichModelSummary(max_depth=4), + LearningRateMonitor(), + ], + plugins=nl.MegatronMixedPrecision(precision=training_config.precision), + ) + return trainer + +def biobert_lightning_module( + bionemo_model_config: BioBertGenericConfig, + tokenizer: Tokenizer, + optim_config: OptimizerSchedulerConfig, + num_steps: int, +) -> BioBertLightningModule: + model = BioBertLightningModule( + bionemo_model_config, + tokenizer=tokenizer, + optimizer=MegatronOptimizerModule( + config=OptimizerConfig( + lr=optim_config.lr, + optimizer=optim_config.optimizer, + use_distributed_optimizer=True, + fp16=bionemo_model_config.fp16, + bf16=bionemo_model_config.bf16, + ), + lr_scheduler=CosineAnnealingScheduler( + max_steps=num_steps, + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), + ), + ), + ) + return model + + +def train( + bionemo_exposed_model_config: ExposedModelConfig, + data_config: 
DataConfig[DataModuleT], + parallel_config: ParallelConfig, + training_config: TrainingConfig, + optim_config: OptimizerSchedulerConfig, + experiment_config: ExperimentConfig, + wandb_config: Optional[WandbConfig], + resume_if_exists: bool = True, +): + bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() + pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) + + if experiment_config.save_every_n_steps != training_config.val_check_interval: + logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") + experiment_config.save_every_n_steps = training_config.val_check_interval + + global_batch_size = infer_global_batch_size( + micro_batch_size=data_config.micro_batch_size, + num_nodes=parallel_config.num_nodes, + devices=parallel_config.num_devices, + accumulate_grad_batches=parallel_config.accumulate_grad_batches, + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + ) + + data: DataModuleT = data_config.construct_data_module(global_batch_size) + + # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case have data.tokenizer, + # although this constraint is not documented. + model: BioBertLightningModule = biobert_lightning_module( + bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps + ) + trainer: nl.Trainer = setup_trainer(parallel_config, training_config) + nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config) + + llm.train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + resume=resume.AutoResume( + resume_if_exists=resume_if_exists, + resume_ignore_no_checkpoint=True, + ), + ) \ No newline at end of file From ec7587c6853d448667c7bdd21958f663694bb74b Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 21:47:38 +0000 Subject: [PATCH 20/58] Fixes the default config for geneformer to use the non-finetuning one. creates a new recipe to distinguish finetuning/pretraining (but is not exposed) --- .../bionemo/geneformer/run/config_models.py | 115 +++--------------- .../src/bionemo/geneformer/run/main.py | 53 ++++++-- .../src/bionemo/geneformer/run/recipes.py | 84 +++++++++---- .../bionemo-llm/src/bionemo/llm/train.py | 12 +- 4 files changed, 128 insertions(+), 136 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index 96118c1720..712920e0d1 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -29,18 +29,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import argparse -import json import math import pathlib -from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar +from typing import List, Optional, Type -import pytorch_lightning as pl -import torch -import torch.nn.functional as F -from bionemo.llm.config.config_models import DataConfig, DataModuleT, ExperimentConfig, ExposedModelConfig, MainConfig, ModelConfigT, OptimizerSchedulerConfig, ParallelConfig, TrainingConfig from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -49,16 +42,23 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.utils import logging -from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer -from bionemo.core.utils import dtypes from bionemo.core.utils.dtypes import PrecisionTypes from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig +from bionemo.llm.config.config_models import ( + DataConfig, + DataModuleT, + ExperimentConfig, + ExposedModelConfig, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, +) from bionemo.llm.model.biobert.lightning import BioBertLightningModule from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption from bionemo.llm.utils.datamodule_utils import infer_global_batch_size @@ -178,7 +178,6 @@ def simple_parallel_recipe( ) - def default_trainer_config_recipe() -> TrainingConfig: return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) @@ -208,6 +207,13 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf ) return trainer +class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): + # Custom parameters for FineTuning + initial_ckpt_path: Optional[str] = None + initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None + + def model_class(self) -> Type[GeneformerConfig]: + return GeneformerConfig class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): """Config for models that fine-tune a BioBERT model from a pre-trained checkpoint. @@ -231,75 +237,6 @@ def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: return FineTuneSeqLenBioBertConfig -def geneformer_finetuning_regression_head_recipe( - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, -) -> ExposedFineTuneSeqLenBioBertConfig: - # NOTE (SKH): this recipe is sad because it isnt smart enough to know our validator is returning a dtype. 
- finetuning_config = ExposedFineTuneSeqLenBioBertConfig( - params_dtype=precision, - pipeline_dtype=precision, - autocast_dtype=precision, - nemo1_ckpt_path=nemo1_init_path, - initial_ckpt_path=initial_ckpt_path, - initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, - ) - return finetuning_config - - -def geneformer10M_pretraining_recipe( - seq_length: int = 128, - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, -) -> ExposedModelConfig[GeneformerConfig]: - raise NotImplementedError("Not a useful recipe, finetuning should be used instead (and renamed)") - # TODO this is ABC, cant instantiate it. - geneformer_config = ExposedModelConfig( - num_layers=6, - hidden_size=256, - ffn_hidden_size=512, - num_attention_heads=4, - seq_length=seq_length, - fp32_residual_connection=False, - hidden_dropout=0.02, - init_method_std=0.02, - kv_channels=None, - apply_query_key_layer_scaling=False, - make_vocab_size_divisible_by=128, - masked_softmax_fusion=True, - fp16_lm_cross_entropy=False, - params_dtype=precision, - pipeline_dtype=precision, - autocast_dtype=precision, - gradient_accumulation_fusion=False, - layernorm_zero_centered_gamma=False, - layernorm_epsilon=1.0e-12, - activation_func="gelu", - qk_layernorm=False, - apply_residual_connection_post_layernorm=False, - bias_activation_fusion=True, - bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, - attention_dropout=0.1, - share_embeddings_and_output_weights=True, - enable_autocast=False, - biobert_spec_option=biobert_spec_option, - nemo1_ckpt_path=nemo1_init_path, - initial_ckpt_path=initial_ckpt_path, - ) - return geneformer_config - - - - -def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - return OptimizerSchedulerConfig() - - def biobert_lightning_module( bionemo_model_config: BioBertGenericConfig, tokenizer: Tokenizer, @@ -329,22 +266,6 @@ def biobert_lightning_module( ) return model - - - -def experiment_config_recipe() -> ExperimentConfig: - return ExperimentConfig( - save_every_n_steps=100, - result_dir="./results", - experiment_name="default_experiment", - restore_from_checkpoint_path=None, - save_last_checkpoint=True, - metric_to_monitor_for_checkpoints="reduced_train_loss", - save_top_k=2, - create_tensorboard_logger=False, - ) - - def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: checkpoint_callback = nl_callbacks.ModelCheckpoint( save_last=experiment_config.save_last_checkpoint, @@ -409,4 +330,4 @@ def train( resume_if_exists=resume_if_exists, resume_ignore_no_checkpoint=True, ), - ) \ No newline at end of file + ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index ea954d84d6..e0d75d4014 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -1,23 +1,55 @@ -from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig -from bionemo.llm.train import train -from bionemo.llm.config.config_models import MainConfig +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import argparse import json from typing import Optional +from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig +from bionemo.llm.config.config_models import MainConfig +from bionemo.llm.train import train + def main(): def parse_args(): parser = argparse.ArgumentParser(description="Run Geneformer pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") - parser.add_argument("--model-config-t", default=ExposedFineTuneSeqLenBioBertConfig, required=False, help="fully resolvable python import path to the ModelConfig object.") - parser.add_argument("--data-config-t", default=GeneformerPretrainingDataConfig, required=False, help="fully resolvable python import path to the ModelConfig object.") - parser.add_argument("--resume-if-exists", default=True, help="Resume training if a checkpoint exists that matches the current experiment configuration.") + parser.add_argument( + "--model-config-t", + default=ExposedGeneformerPretrainConfig, + required=False, + help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.", + ) + parser.add_argument( + "--data-config-t", + default=GeneformerPretrainingDataConfig, + required=False, + help="fully resolvable python import path to the ModelConfig object.", + ) + parser.add_argument( + "--resume-if-exists", + default=True, + help="Resume training if a checkpoint exists that matches the current experiment configuration.", + ) return parser.parse_args() def string_to_class(path: str): import importlib - module_path, class_name = path.rsplit('.', 1) + + module_path, class_name = path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) @@ -28,7 +60,7 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: # model/data_config_t is used to select the parser dynamically. if model_config_t is None: # our parser doesnt like literals that are already imported. 
- model_config_t = ExposedFineTuneSeqLenBioBertConfig + model_config_t = ExposedGeneformerPretrainConfig elif isinstance(model_config_t, str): model_config_t = string_to_class(model_config_t) @@ -36,7 +68,7 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: data_config_t = GeneformerPretrainingDataConfig elif isinstance(data_config_t, str): data_config_t = string_to_class(data_config_t) - + return MainConfig[model_config_t, data_config_t](**config_dict) args = parse_args() @@ -53,5 +85,6 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: resume_if_exists=args.resume_if_exists, ) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 2f710a2396..1faab241f3 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -1,25 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import argparse -from typing import Optional +from typing import List, Optional + +from nemo.utils import logging + from bionemo.core.utils.dtypes import PrecisionTypes -from bionemo.geneformer.api import GeneformerConfig -from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig -from bionemo.llm.config.config_models import ExperimentConfig, ExposedModelConfig, MainConfig, OptimizerSchedulerConfig, TrainingConfig, ParallelConfig +from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig +from bionemo.llm.config.config_models import ( + ExperimentConfig, + MainConfig, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, +) from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.logger_utils import WandbConfig -from typing import List -from nemo.utils import logging -import os -def geneformer_small_data_recipe( - data_dir -) -> GeneformerPretrainingDataConfig: + +def geneformer_small_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: """Recipe that produces the base geneformer small data configuration.""" return GeneformerPretrainingDataConfig(data_dir=data_dir) -def full_geneformer_data_recipe( - data_dir -) -> GeneformerPretrainingDataConfig: +def full_geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: return GeneformerPretrainingDataConfig(data_dir=data_dir) @@ -35,6 +53,7 @@ def simple_parallel_recipe( num_devices=num_devices, ) + def geneformer_finetuning_regression_head_recipe( precision: PrecisionTypes = "bf16-mixed", nemo1_init_path: Optional[str] = None, @@ -58,13 +77,13 @@ def default_trainer_config_recipe() -> TrainingConfig: def geneformer10M_pretraining_recipe( - seq_length: int = 128, + seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", nemo1_init_path: Optional[str] = None, initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, -) -> ExposedFineTuneSeqLenBioBertConfig: - geneformer_config = ExposedFineTuneSeqLenBioBertConfig( +) -> ExposedGeneformerPretrainConfig: + geneformer_config = ExposedGeneformerPretrainConfig( num_layers=6, hidden_size=256, ffn_hidden_size=512, @@ -103,6 +122,7 @@ def geneformer10M_pretraining_recipe( def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: return OptimizerSchedulerConfig() + def experiment_config_recipe() -> ExperimentConfig: return ExperimentConfig( save_every_n_steps=100, @@ -117,28 +137,44 @@ def experiment_config_recipe() -> ExperimentConfig: def main(): - def parse_args(): parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") - parser.add_argument("--dest", type=str, default='./geneformer-recipe.json', required=True, help="Path to the JSON configuration file.") - parser.add_argument("--data-dir", type=str, required=True, help="Path to the directory containing pretraining data.") + parser.add_argument( + "--dest", + type=str, + default="./geneformer-recipe.json", + required=True, + help="Path to the JSON configuration file.", + ) + parser.add_argument( + "--data-dir", type=str, required=True, help="Path to the directory containing pretraining data." 
+ ) args = parser.parse_args() return args - """Simple example for creating a JSON from recipes.""" args = parse_args() data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe(data_dir=args.data_dir) parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() - bionemo_model_config = geneformer_finetuning_regression_head_recipe() + # bionemo_model_config = geneformer_finetuning_regression_head_recipe() + bionemo_model_config = geneformer10M_pretraining_recipe() optim_config = default_adam_optimizer_with_cosine_annealing_recipe() experiment_config = experiment_config_recipe() - wandb_config = WandbConfig(project="bionemo2-demo", entity="nvidia", offline=True, tags=[], group="dev", id="dev", log_model=False, anonymous=True) + wandb_config = WandbConfig( + project="bionemo2-demo", + entity="nvidia", + offline=True, + tags=[], + group="dev", + id="dev", + log_model=False, + anonymous=True, + ) # Create the master config - master_config = MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]( + master_config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]( data_config=data_config, parallel_config=parallel_config, training_config=training_config, @@ -157,4 +193,4 @@ def parse_args(): "w", ) as f: f.write(json_str) - logging.info(f'Saved configuration to {args.dest=}') + logging.info(f"Saved configuration to {args.dest=}") diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 666341b206..b0e4675c74 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -52,7 +52,7 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio return nemo_logger -def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: +def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks = None) -> nl.Trainer: strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, @@ -60,6 +60,11 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf find_unused_parameters=True, ckpt_include_optimizer=True, ) + if callbacks is None: + callbacks = [ + RichModelSummary(max_depth=4), + LearningRateMonitor(), + ] trainer = nl.Trainer( devices=parallel_config.num_devices, @@ -69,10 +74,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf limit_val_batches=training_config.limit_val_batches, val_check_interval=training_config.val_check_interval, num_nodes=parallel_config.num_nodes, - callbacks=[ - RichModelSummary(max_depth=4), - LearningRateMonitor(), - ], + callbacks=callbacks, plugins=nl.MegatronMixedPrecision(precision=training_config.precision), ) return trainer From 6cd633d660901503be85d6b6a15bb3e93d065a6a Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 21:58:23 +0000 Subject: [PATCH 21/58] exposes initial_ckpt_path to the recipe maker --- .../src/bionemo/geneformer/run/recipes.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 1faab241f3..1ec5711dc1 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ 
b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -20,7 +20,11 @@ from nemo.utils import logging from bionemo.core.utils.dtypes import PrecisionTypes -from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig +from bionemo.geneformer.run.config_models import ( + ExposedFineTuneSeqLenBioBertConfig, + ExposedGeneformerPretrainConfig, + GeneformerPretrainingDataConfig, +) from bionemo.llm.config.config_models import ( ExperimentConfig, MainConfig, @@ -149,6 +153,9 @@ def parse_args(): parser.add_argument( "--data-dir", type=str, required=True, help="Path to the directory containing pretraining data." ) + parser.add_argument( + "--initial-ckpt-path", type=str, required=False, default=None, help="Path to an existing to a checkpoint directory to restore" + ) args = parser.parse_args() return args @@ -159,7 +166,7 @@ def parse_args(): parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() # bionemo_model_config = geneformer_finetuning_regression_head_recipe() - bionemo_model_config = geneformer10M_pretraining_recipe() + bionemo_model_config = geneformer10M_pretraining_recipe(initial_ckpt_path=args.initial_ckpt_path) optim_config = default_adam_optimizer_with_cosine_annealing_recipe() experiment_config = experiment_config_recipe() wandb_config = WandbConfig( From b05be9b10b7379cf3de3e099d9df4af1c1255f46 Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 4 Oct 2024 23:54:54 +0000 Subject: [PATCH 22/58] fix default factory for finetuning in the exposed geneformer configs --- .../bionemo/geneformer/run/config_models.py | 111 +----------------- .../src/bionemo/llm/config/config_models.py | 2 +- 2 files changed, 6 insertions(+), 107 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index 712920e0d1..afc2a57c76 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -31,7 +31,7 @@ import math import pathlib -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Optional, Type from megatron.core.optimizer import OptimizerConfig @@ -115,7 +115,7 @@ def geneformer_preprocess(self) -> GeneformerDataArtifacts: raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: - geneformer_data_artifacts: GeneformerDataArtifacts = geneformer_preprocess(self) + geneformer_data_artifacts: GeneformerDataArtifacts = self.geneformer_preprocess() data = SingleCellDataModule( seq_length=self.seq_length, tokenizer=geneformer_data_artifacts.tokenizer, @@ -133,55 +133,6 @@ def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: return data -def geneformer_small_data_recipe( - data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15_small/processed_data", -) -> GeneformerPretrainingDataConfig: - """Recipe that produces the base geneformer small data configuration.""" - return GeneformerPretrainingDataConfig(data_dir=data_dir) - - -def full_geneformer_data_recipe( - data_dir="/workspaces/bionemo-fw-ea/data/cellxgene_2023-12-15/processed_data", -) -> GeneformerPretrainingDataConfig: - return GeneformerPretrainingDataConfig(data_dir=data_dir) - - -def 
geneformer_preprocess(data_config: GeneformerPretrainingDataConfig) -> GeneformerDataArtifacts: - """Geneformer datamodule expects certain artifacts to be present in the data directory. - - This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. - """ - preprocessor = GeneformerPreprocess( - download_directory=pathlib.Path(data_config.train_data_path), - medians_file_path=pathlib.Path(data_config.train_data_path + "/medians.json"), - tokenizer_vocab_path=pathlib.Path(data_config.train_data_path + "/geneformer.vocab"), - ) - result = preprocessor.preprocess() - if "tokenizer" in result and "median_dict" in result: - logging.info("*************** Preprocessing Finished ************") - return GeneformerDataArtifacts(tokenizer=result["tokenizer"], median_dict=result["median_dict"]) - else: - logging.error("Preprocessing failed.") - raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") - - -def simple_parallel_recipe( - tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 -) -> ParallelConfig: - assert ( - num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size - ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - return ParallelConfig( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - num_devices=num_devices, - ) - - -def default_trainer_config_recipe() -> TrainingConfig: - return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) - - def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, @@ -210,7 +161,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None - initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None + initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) def model_class(self) -> Type[GeneformerConfig]: return GeneformerConfig @@ -227,11 +178,7 @@ class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBer # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None - initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None - - def __post_init__(self): - if not self.initial_ckpt_skip_keys_with_these_prefixes: - self.initial_ckpt_skip_keys_with_these_prefixes = ["regression_head"] + initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"]) def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: return FineTuneSeqLenBioBertConfig @@ -282,52 +229,4 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio wandb_config=wandb_config, ckpt_callback=checkpoint_callback, ) - return nemo_logger - - -def train( - bionemo_exposed_model_config: ExposedModelConfig, - data_config: DataConfig[DataModuleT], - parallel_config: ParallelConfig, - training_config: TrainingConfig, - optim_config: OptimizerSchedulerConfig, - experiment_config: ExperimentConfig, - wandb_config: Optional[WandbConfig], - resume_if_exists: bool = True, -): - bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() - 
pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) - - if experiment_config.save_every_n_steps != training_config.val_check_interval: - logging.warning("Mutating training_config.save_every_n_steps to be equal to val_check_interval.") - experiment_config.save_every_n_steps = training_config.val_check_interval - - global_batch_size = infer_global_batch_size( - micro_batch_size=data_config.micro_batch_size, - num_nodes=parallel_config.num_nodes, - devices=parallel_config.num_devices, - accumulate_grad_batches=parallel_config.accumulate_grad_batches, - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - ) - - data: DataModuleT = data_config.construct_data_module(global_batch_size) - - # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case have data.tokenizer, - # although this constraint is not documented. - model: BioBertLightningModule = biobert_lightning_module( - bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps - ) - trainer: nl.Trainer = setup_trainer(parallel_config, training_config) - nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config) - - llm.train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - resume=resume.AutoResume( - resume_if_exists=resume_if_exists, - resume_ignore_no_checkpoint=True, - ), - ) + return nemo_logger \ No newline at end of file diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py index 6b82a6b6e8..5ca3453506 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -138,7 +138,7 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: share_embeddings_and_output_weights: bool = True enable_autocast: bool = False nemo1_ckpt_path: Optional[str] = None - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec @field_validator("activation_func", mode="before") @classmethod From ae88b151eba81878b170a5f3e9367b1736b7f769 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 8 Oct 2024 19:41:31 +0000 Subject: [PATCH 23/58] checkpointing current work on ux --- scripts/protein/esm2/esm2_pretrain.py | 13 +++---- .../src/bionemo/geneformer/run/main.py | 5 ++- .../bionemo-llm/src/bionemo/llm/train.py | 36 ++++++++++++------- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/scripts/protein/esm2/esm2_pretrain.py b/scripts/protein/esm2/esm2_pretrain.py index 880ccaeefb..4cd9143b15 100644 --- a/scripts/protein/esm2/esm2_pretrain.py +++ b/scripts/protein/esm2/esm2_pretrain.py @@ -35,7 +35,7 @@ from bionemo.llm.model.biobert.lightning import BioBertLightningModule from bionemo.llm.model.biobert.model import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size -from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger +from bionemo.llm.utils.logger_utils import WandbConfig, WandbLoggerOptions, setup_nemo_lightning_logger __all__: Sequence[str] = ("main", "parser") @@ -143,10 +143,10 @@ def main( # for wandb integration # Please refer to 
https://pytorch-lightning.readthedocs.io/en/0.7.6/api/pytorch_lightning.loggers.html" - wandb_options: Optional[WandbLoggerOptions] = ( + wandb_config: Optional[WandbConfig] = ( None if wandb_project is None - else WandbLoggerOptions( + else WandbConfig( offline=wandb_offline, project=wandb_project, entity=wandb_entity, @@ -174,7 +174,6 @@ def main( plugins=nl.MegatronMixedPrecision(precision=precision), ) - tokenizer = get_tokenizer() # Initialize the data module. data = ESMDataModule( @@ -188,8 +187,10 @@ def main( max_seq_length=max_seq_length, num_workers=num_dataset_workers, random_mask_strategy=random_mask_strategy, + tokenizer = get_tokenizer() ) - + # NOTE(SKH) added this. + tokenizer = data._tokenizer # Configure the model need_megatron_variable_seq_lengths_reductions = ( pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length, @@ -242,7 +243,7 @@ def main( root_dir=result_dir, name=experiment_name, initialize_tensorboard_logger=create_tensorboard_logger, - wandb_kwargs=wandb_options, + wandb_config=wandb_config, ckpt_callback=checkpoint_callback, ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index e0d75d4014..f09c183436 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,7 +18,10 @@ import json from typing import Optional -from bionemo.geneformer.run.config_models import ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig +from bionemo.geneformer.run.config_models import ( + ExposedGeneformerPretrainConfig, + GeneformerPretrainingDataConfig, +) from bionemo.llm.config.config_models import MainConfig from bionemo.llm.train import train diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index b0e4675c74..a5218f6c1c 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -14,24 +14,35 @@ # limitations under the License. 
-import pathlib -from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler import math +import pathlib from typing import Optional + from megatron.core.optimizer import OptimizerConfig -from bionemo.llm.config.config_models import DataModuleT, ExperimentConfig, ParallelConfig, TrainingConfig, ExposedModelConfig, DataConfig, OptimizerSchedulerConfig +from nemo import lightning as nl +from nemo.collections import llm +from nemo.lightning import resume +from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler +from nemo.utils import logging +from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from tokenizers import Tokenizer + +from bionemo.llm.config.config_models import ( + DataConfig, + DataModuleT, + ExperimentConfig, + ExposedModelConfig, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, +) from bionemo.llm.model.biobert.lightning import BioBertLightningModule from bionemo.llm.model.biobert.model import BioBertGenericConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger -from nemo import lightning as nl -from nemo.utils import logging -from nemo.collections import llm -from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from tokenizers import Tokenizer -from nemo.lightning import resume -from nemo.lightning.pytorch import callbacks as nl_callbacks + def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: checkpoint_callback = nl_callbacks.ModelCheckpoint( @@ -52,7 +63,7 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio return nemo_logger -def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks = None) -> nl.Trainer: +def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None) -> nl.Trainer: strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, @@ -79,6 +90,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf ) return trainer + def biobert_lightning_module( bionemo_model_config: BioBertGenericConfig, tokenizer: Tokenizer, @@ -154,4 +166,4 @@ def train( resume_if_exists=resume_if_exists, resume_ignore_no_checkpoint=True, ), - ) \ No newline at end of file + ) From 1100906dcb4d5e64309079d4712fc78694d63c2f Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 8 Oct 2024 19:42:01 +0000 Subject: [PATCH 24/58] saving work --- scripts/protein/esm2/esm2_pretrain.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/protein/esm2/esm2_pretrain.py b/scripts/protein/esm2/esm2_pretrain.py index 4cd9143b15..501dbc63e9 100644 --- a/scripts/protein/esm2/esm2_pretrain.py +++ b/scripts/protein/esm2/esm2_pretrain.py @@ -35,7 +35,7 @@ from bionemo.llm.model.biobert.lightning import BioBertLightningModule from bionemo.llm.model.biobert.model import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size -from bionemo.llm.utils.logger_utils import WandbConfig, WandbLoggerOptions, setup_nemo_lightning_logger +from bionemo.llm.utils.logger_utils import WandbConfig, 
setup_nemo_lightning_logger __all__: Sequence[str] = ("main", "parser") @@ -174,7 +174,6 @@ def main( plugins=nl.MegatronMixedPrecision(precision=precision), ) - # Initialize the data module. data = ESMDataModule( train_cluster_path=train_cluster_path, @@ -187,7 +186,7 @@ def main( max_seq_length=max_seq_length, num_workers=num_dataset_workers, random_mask_strategy=random_mask_strategy, - tokenizer = get_tokenizer() + tokenizer=get_tokenizer(), ) # NOTE(SKH) added this. tokenizer = data._tokenizer From e74ae7ba21004e5a1ecda0cc81beef182f030193 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 16 Oct 2024 21:20:03 +0000 Subject: [PATCH 25/58] Adds supporting tests for the pydantic CLI interface. --- .../geneformer/test_pydantic_train.py | 167 ++++++++++ .../src/bionemo/geneformer/run/main.py | 12 +- .../src/bionemo/geneformer/run/recipes.py | 292 ++++++++++++++++-- .../src/bionemo/llm/config/config_models.py | 34 +- 4 files changed, 454 insertions(+), 51 deletions(-) create mode 100644 scripts/singlecell/geneformer/test_pydantic_train.py diff --git a/scripts/singlecell/geneformer/test_pydantic_train.py b/scripts/singlecell/geneformer/test_pydantic_train.py new file mode 100644 index 0000000000..c6188769c4 --- /dev/null +++ b/scripts/singlecell/geneformer/test_pydantic_train.py @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shlex +import subprocess +from pathlib import Path + +from lightning.fabric.plugins.environments.lightning import find_free_network_port + +from bionemo.testing.data.load import load + + +data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" + + +def test_bionemo2_rootdir(): + data_error_str = ( + "Please download test data with:\n" + "`python scripts/download_artifacts.py --models all --model_dir ./models --data all --data_dir ./ --verbose --source pbss`" + ) + assert data_path.exists(), f"Could not find test data directory.\n{data_error_str}" + assert data_path.is_dir(), f"Test data directory is supposed to be a directory.\n{data_error_str}" + + +def test_pretrain_cli_from_ckpt(tmpdir): + # Same as test_pretrain, but includes a checkpoint to initialize from. 
+ data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" + result_dir = Path(tmpdir.mkdir("results")) + + open_port = find_free_network_port() + config = "/workspaces/bionemo-fw-ea/test_config.json" + # Invoke with blocking + checkpoint_path: Path = load("geneformer/10M_240530:2.0") + cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test --data-path {data_path} --result-dir {result_dir} --initial-ckpt-path {checkpoint_path}""".strip() + # continue when finished + env = dict(**os.environ) # a local copy of the environment + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + # Now do pretrain + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + + cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() + env = dict(**os.environ) # a local copy of the environment + open_port = find_free_network_port() + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + # NOTE this looks a lot like a magic value. But we also could do json.loads(config)['experiment_config']['experiment_name'] + assert (result_dir / "test-experiment").exists(), "Could not find test experiment directory." + + +def test_pretrain_cli(tmpdir): + """trains from scratch""" + data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" + result_dir = Path(tmpdir.mkdir("results")) + + open_port = find_free_network_port() + config = "test_config.json" + # Invoke with blocking + cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test --data-path {data_path} --result-dir {result_dir}""".strip() + # continue when finished + env = dict(**os.environ) # a local copy of the environment + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + # Now do pretrain + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + + cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() + env = dict(**os.environ) # a local copy of the environment + open_port = find_free_network_port() + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + # NOTE this looks a lot like a magic value. But we also could do json.loads(config)['experiment_config']['experiment_name'] + assert (result_dir / "test-experiment").exists(), "Could not find test experiment directory." + + +def test_finetune_cli(tmpdir): + """Uses CLI to invoke the entrypoint""" + data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" + result_dir = Path(tmpdir.mkdir("results")) + checkpoint_path: Path = load("geneformer/10M_240530:2.0") + + open_port = find_free_network_port() + + # TODO use relative path when the test is working. 
+ config = "test_config.json" + + # TODO add initial path + cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test-finetune --data-path {data_path} --result-dir {result_dir} --initial-ckpt-path {checkpoint_path}""".strip() + # continue when finished + env = dict(**os.environ) # a local copy of the environment + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + import sys + + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + # Now do pretrain + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + + # TODO gotta set the right config options here. + # TODO set the parsing flag + cmd_str = f"""bionemo-geneformer-train --conf {config} """.strip() + env = dict(**os.environ) # a local copy of the environment + open_port = find_free_network_port() + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + print("starting the training invocation of evil") + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + # capture_output=True, + stdout=sys.stdout, + stderr=sys.stderr, + ) + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + # NOTE this looks a lot like a magic value. But we also could do json.loads(config)['experiment_config']['experiment_name'] + assert (result_dir / "test-experiment").exists(), "Could not find test experiment directory." diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index f09c183436..8b567a8be0 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -19,6 +19,7 @@ from typing import Optional from bionemo.geneformer.run.config_models import ( + ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig, ) @@ -44,7 +45,8 @@ def parse_args(): ) parser.add_argument( "--resume-if-exists", - default=True, + default=False, + action="store_true", help="Resume training if a checkpoint exists that matches the current experiment configuration.", ) return parser.parse_args() @@ -61,10 +63,13 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: config_dict = json.load(f) # model/data_config_t is used to select the parser dynamically. - if model_config_t is None: - # our parser doesnt like literals that are already imported. + if model_config_t is None or model_config_t == "ExposedGeneformerPretrainConfig": model_config_t = ExposedGeneformerPretrainConfig + elif model_config_t == "ExposedFineTuneSeqLenBioBertConfig": + # Hardcoded path for those who do not know the full path + model_config_t = ExposedFineTuneSeqLenBioBertConfig elif isinstance(model_config_t, str): + # We assume we get a string to some importable config... e.g. 
in the sub-package jensen, 'bionemo.jensen.configs.MyConfig' model_config_t = string_to_class(model_config_t) if data_config_t is None: @@ -76,7 +81,6 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: args = parse_args() config = load_config(args.config, args.model_config_t, args.data_config_t) - # New train( bionemo_exposed_model_config=config.bionemo_model_config, data_config=config.data_config, diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 1ec5711dc1..b76ebde2fe 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. - import argparse +from functools import partial from typing import List, Optional from nemo.utils import logging @@ -36,7 +36,20 @@ from bionemo.llm.utils.logger_utils import WandbConfig -def geneformer_small_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: +""" +This script is for defining pre-configured recipes. Recipes at the minimum provide the user with a template config file. +Additionally, it may be useful to define prepackaged recipes for common usecases such as tests. Here we define a the +following recipes: + +- example recipe with minimal data +- test recipe for running tests (same as above?) +- finetuning recipe with regression head based on the output of the test recipe. +- pretraining recipe on 10M sized model +- pretraining recipe on 106M sized model +""" + + +def geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: """Recipe that produces the base geneformer small data configuration.""" return GeneformerPretrainingDataConfig(data_dir=data_dir) @@ -46,7 +59,10 @@ def full_geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: def simple_parallel_recipe( - tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1 + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + num_devices: int = 1, + accumulate_grad_batches: int = 1, ) -> ParallelConfig: assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size @@ -64,15 +80,26 @@ def geneformer_finetuning_regression_head_recipe( initial_ckpt_path: Optional[str] = None, initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, ) -> ExposedFineTuneSeqLenBioBertConfig: - # NOTE (SKH): this recipe is sad because it isnt smart enough to know our validator is returning a dtype. - finetuning_config = ExposedFineTuneSeqLenBioBertConfig( + """NOTE on initial_ckpt_skip_keys_with_these_prefixes: configs define their own default with defaultfactory, so + when we get passed None, we defer to the default. Importantly, the 'do nothing' case is different, where the input + would be an empty list. 
+ """ + partial_finetuning_config = partial( + ExposedFineTuneSeqLenBioBertConfig, params_dtype=precision, pipeline_dtype=precision, autocast_dtype=precision, nemo1_ckpt_path=nemo1_init_path, initial_ckpt_path=initial_ckpt_path, - initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes, + biobert_spec_option=BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) + if initial_ckpt_skip_keys_with_these_prefixes: + finetuning_config = partial_finetuning_config( + initial_ckpt_skip_keys_with_these_prefixes=initial_ckpt_skip_keys_with_these_prefixes + ) + else: + # Use the sensible default when None is passed + finetuning_config = partial_finetuning_config() return finetuning_config @@ -80,12 +107,55 @@ def default_trainer_config_recipe() -> TrainingConfig: return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) -def geneformer10M_pretraining_recipe( +def geneformer10m_finetune_config( seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", nemo1_init_path: Optional[str] = None, initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_local_spec, + biobert_spec_option=BiobertSpecOption.bert_layer_with_transformer_engine_spec, +) -> ExposedFineTuneSeqLenBioBertConfig: + geneformer_config = ExposedFineTuneSeqLenBioBertConfig( + num_layers=6, + hidden_size=256, + ffn_hidden_size=512, + num_attention_heads=4, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config + + +def geneformer10M_pretraining_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: geneformer_config = ExposedGeneformerPretrainConfig( num_layers=6, @@ -140,33 +210,98 @@ def experiment_config_recipe() -> ExperimentConfig: ) -def main(): - def parse_args(): - parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") - parser.add_argument( - "--dest", - type=str, - default="./geneformer-recipe.json", - required=True, - help="Path to the JSON configuration file.", - ) - parser.add_argument( - "--data-dir", type=str, required=True, help="Path to the directory containing pretraining data." 
- ) - parser.add_argument( - "--initial-ckpt-path", type=str, required=False, default=None, help="Path to an existing to a checkpoint directory to restore" - ) - args = parser.parse_args() - return args +def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: + data_path = args.data_path + result_dir = args.result_dir - """Simple example for creating a JSON from recipes.""" + parallel_config = ParallelConfig( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1, num_devices=1, accumulate_grad_batches=2 + ) + training_config = TrainingConfig( + max_steps=55, limit_val_batches=2, val_check_interval=10, precision="bf16-mixed", accelerator="gpu" + ) + data_config = GeneformerPretrainingDataConfig( + seq_length=128, + micro_batch_size=2, + num_dataset_workers=0, + data_dir=data_path, + ) + experiment_config = ExperimentConfig( + save_every_n_steps=training_config.val_check_interval, + result_dir=result_dir, + experiment_name="test-experiment", + restore_from_checkpoint_path=None, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints="reduced_train_loss", + save_top_k=2, + create_tensorboard_logger=False, + ) - args = parse_args() - data_config: GeneformerPretrainingDataConfig = geneformer_small_data_recipe(data_dir=args.data_dir) + optim_config = OptimizerSchedulerConfig() + geneformer_config = geneformer10m_finetune_config( + seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path + ) + + return MainConfig( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=geneformer_config, + optim_config=optim_config, + experiment_config=experiment_config, + ) + + +def pretrain_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: + data_path = args.data_path + result_dir = args.result_dir + + parallel_config = ParallelConfig( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1, num_devices=1, accumulate_grad_batches=2 + ) + training_config = TrainingConfig( + max_steps=55, limit_val_batches=2, val_check_interval=10, precision="bf16-mixed", accelerator="gpu" + ) + data_config = GeneformerPretrainingDataConfig( + seq_length=128, + micro_batch_size=2, + num_dataset_workers=0, + data_dir=data_path, + ) + experiment_config = ExperimentConfig( + save_every_n_steps=training_config.val_check_interval, + result_dir=result_dir, + experiment_name="test-experiment", + restore_from_checkpoint_path=None, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints="reduced_train_loss", + save_top_k=2, + create_tensorboard_logger=False, + ) + + optim_config = OptimizerSchedulerConfig() + geneformer_config = geneformer10M_pretraining_config( + seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path + ) + + return MainConfig( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=geneformer_config, + optim_config=optim_config, + experiment_config=experiment_config, + ) + + +def geneformer10m_pretrain_recipe( + args, +) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: + data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_dir) parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() # bionemo_model_config = geneformer_finetuning_regression_head_recipe() - bionemo_model_config = 
geneformer10M_pretraining_recipe(initial_ckpt_path=args.initial_ckpt_path) + bionemo_model_config = geneformer10M_pretraining_config(initial_ckpt_path=args.initial_ckpt_path) optim_config = default_adam_optimizer_with_cosine_annealing_recipe() experiment_config = experiment_config_recipe() wandb_config = WandbConfig( @@ -179,9 +314,38 @@ def parse_args(): log_model=False, anonymous=True, ) + main_config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, + ) + return main_config + - # Create the master config - master_config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]( +def geneformer10m_finetune_recipe( + args, +) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: + data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) + parallel_config = simple_parallel_recipe() + training_config = default_trainer_config_recipe() + bionemo_model_config = geneformer_finetuning_regression_head_recipe(initial_ckpt_path=args.initial_ckpt_path) + optim_config = default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config = experiment_config_recipe() + wandb_config = WandbConfig( + project="bionemo2-demo", + entity="nvidia", + offline=True, + tags=[], + group="dev", + id="dev", + log_model=False, + anonymous=True, + ) + main_config = MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]( data_config=data_config, parallel_config=parallel_config, training_config=training_config, @@ -190,9 +354,66 @@ def parse_args(): experiment_config=experiment_config, wandb_config=wandb_config, ) + return main_config + + +def main(): + def parse_args(): + parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") + parser.add_argument( + "--recipe", + type=str, + choices=["test", "10m-pretrain", "test-finetune", "finetune"], + required=True, + help="Use one of the preconfigured recipes to create a template config file.", + ) + + parser.add_argument( + "--dest", + type=str, + default="./geneformer-recipe.json", + required=True, + help="Path to the JSON configuration file.", + ) + + parser.add_argument( + "--data-path", type=str, required=True, help="Path to the directory containing pretraining data." + ) + parser.add_argument( + "--result-dir", type=str, required=True, help="Path to the directory used to save results." + ) + + # Extra argument. + parser.add_argument( + "--initial-ckpt-path", + type=str, + required=False, + default=None, + help="Path to an existing to a checkpoint directory to restore an existing checkpoint. 
Not compatible with all recipes.", + ) + + args = parser.parse_args() + return args + + """Simple example for creating a JSON from recipes.""" + args = parse_args() + + if args.recipe == "test": + config = pretrain_test_recipe(args) + elif args.recipe == "10m-pretrain": + config = geneformer10m_pretrain_recipe(args) + elif args.recipe == "106m-pretrain": + # config = geneformer106m_pretrain_recipe(args) + raise NotImplementedError("106M pretraining recipe not implemented.") + elif args.recipe == "test-finetune": + config = finetune_test_recipe(args) + elif args.recipe == "finetune": + config = geneformer10m_finetune_recipe(args) + else: + raise ValueError("Invalid recipe choice.") # Serialize to JSON - json_str = master_config.model_dump_json(indent=2) + json_str = config.model_dump_json(indent=2) # Save to file with open( @@ -200,4 +421,9 @@ def parse_args(): "w", ) as f: f.write(json_str) + logging.info(f"Saved configuration to {args.dest=}") + + +if __name__ == "__main__": + main() diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py index 5ca3453506..3e7ffc7b08 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -14,22 +14,20 @@ # limitations under the License. -from typing import Any, Callable, Dict, Generic, Literal, Optional, Type, TypeVar from abc import ABC, abstractmethod -from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator -from nemo.lightning import resume -from nemo import lightning as nl +from typing import Any, Callable, Dict, Generic, Literal, Optional, Type, TypeVar + import pytorch_lightning as pl import torch +from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator from torch.nn import functional as F -from nemo.collections import llm from bionemo.core.utils import dtypes -from bionemo.geneformer.api import GeneformerConfig from bionemo.llm.model.biobert.model import BioBertGenericConfig from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.logger_utils import WandbConfig + ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) @@ -44,6 +42,8 @@ REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { v: k for k, v in CUSTOM_ACTIVATION_FNS.items() } + + class DataConfig(BaseModel, Generic[DataModuleT], ABC): """Base class for all data configurations. @@ -60,6 +60,7 @@ def construct_data_module(self, global_batch_size: int) -> DataModuleT: """Construct the data module from the configuration. Cannot be defined generically.""" ... + class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): """BioNeMo model configuration class, wraps TransformerConfig and friends. @@ -81,9 +82,7 @@ def model_class(self) -> Type[ModelConfigT]: # How did this all work yesterday even? # so we cant do it this way because we are kinda losing the magic of generics. # ideally _the generics_ have all the methods we want implemented on them already. - # TODO (SKH) raise NotImplementedError - return GeneformerConfig def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: """Converts the exposed dataclass to the underlying Transformer config. 
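As a rough sketch of the intended usage suggested by the code in this series (the recipe entrypoint writes a JSON config, and the train entrypoint parses it into the generic MainConfig and calls train()), the flow boils down to roughly the following. This is a hedged illustration, not part of any patch: the file name "my_config.json" and the resume_if_exists value are assumptions, while the imports, the MainConfig[...] parameterization, and the train() keyword arguments follow the code shown in these diffs.

import json

from bionemo.geneformer.run.config_models import (
    ExposedGeneformerPretrainConfig,
    GeneformerPretrainingDataConfig,
)
from bionemo.llm.config.config_models import MainConfig
from bionemo.llm.train import train

# Load a config produced by e.g. `bionemo-geneformer-recipe --recipe test ...`
# (the path here is illustrative only).
with open("my_config.json") as f:
    config_dict = json.load(f)

# Parameterizing MainConfig with the exposed model config and data config types lets
# pydantic validate the nested "bionemo_model_config" and "data_config" sections
# against the right schemas before training starts.
config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig](**config_dict)

train(
    bionemo_exposed_model_config=config.bionemo_model_config,
    data_config=config.data_config,
    parallel_config=config.parallel_config,
    training_config=config.training_config,
    optim_config=config.optim_config,
    experiment_config=config.experiment_config,
    wandb_config=config.wandb_config,
    resume_if_exists=False,  # assumed value; the CLI exposes this as a flag
)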
@@ -93,7 +92,6 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: This is a good candidate for refactoring. """ - cls: Type[ModelConfigT] = self.model_class() model_dict = {} for attr in self.model_fields: @@ -143,15 +141,16 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: - """ - Validates the activation function, assumes this function exists in torch.nn.functional. For custom + """Validates the activation function, assumes this function exists in torch.nn.functional. For custom activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. This method validates the provided activation function string and returns a callable function based on the validation context using the provided validator in the base class. + Args: activation_func (str): The activation function to be validated. context (ValidationInfo): The context for validation. + Returns: Callable: A callable function after validation. @@ -189,6 +188,7 @@ def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: return dtypes.dtype_to_precision[v] + class ParallelConfig(BaseModel): tensor_model_parallel_size: int = 1 pipeline_model_parallel_size: int = 1 @@ -207,6 +207,7 @@ def validate_devices(self): ) return self + class TrainingConfig(BaseModel): max_steps: int limit_val_batches: int @@ -215,6 +216,7 @@ class TrainingConfig(BaseModel): precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" accelerator: str = "gpu" + class OptimizerSchedulerConfig(BaseModel): # TODO validators on optimizer, interval, and monitor. lr: float = 1e-4 @@ -224,6 +226,7 @@ class OptimizerSchedulerConfig(BaseModel): interval: str = "step" monitor: str = "val_loss" + class ExperimentConfig(BaseModel): save_every_n_steps: int result_dir: str @@ -235,13 +238,15 @@ class ExperimentConfig(BaseModel): save_top_k: int = 2 create_tensorboard_logger: bool = False + # DataConfig -> some config that can make a data module (see ABC definition.) DataConfigT = TypeVar("DataConfigT", bound=DataConfig) # ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) + class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): - ''' Main configuration class for BioNeMo. All serialized configs that are a valid MainConfig should be Runnable. + """Main configuration class for BioNeMo. All serialized configs that are a valid MainConfig should be Runnable. This class is used to define the main configuration for BioNeMo. It defines the minimal pieces of configuration to execution a training job with the NeMo2 training api. It accepts two generic type parameters which users @@ -256,7 +261,8 @@ class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): optim_config: The optimizer/scheduler configuration for the model. experiment_config: The experiment configuration for the model. wandb_config: Optional, the wandb configuration for the model. 
- ''' + """ + data_config: DataConfigT parallel_config: ParallelConfig training_config: TrainingConfig @@ -269,4 +275,4 @@ class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): def validate_master_config(self) -> "MainConfig": self.bionemo_model_config.seq_length = self.data_config.seq_length # What other global validators should we set here? - return self \ No newline at end of file + return self From 592409703c4d4d18fc6bbb5b371b52ae930c20dc Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 17 Oct 2024 14:26:15 -0700 Subject: [PATCH 26/58] backing up --- .../bionemo-esm2/src/bionemo/esm2/data/datamodule.py | 5 +++++ sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py | 1 + .../src/bionemo/geneformer/run/config_models.py | 3 +++ .../bionemo-llm/src/bionemo/llm/config/config_models.py | 2 +- sub-packages/bionemo-llm/src/bionemo/llm/train.py | 2 +- 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/data/datamodule.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/data/datamodule.py index 7d32dfce6b..29be5590fc 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/data/datamodule.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/data/datamodule.py @@ -106,6 +106,11 @@ def __init__( rampup_batch_size=rampup_batch_size, ) + @property + def tokenizer(self) -> tokenizer.BioNeMoESMTokenizer: + """Returns the tokenizer.""" + return self._tokenizer + def setup(self, stage: str = "") -> None: """Setup the ESMDataModule. diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py index ab23c38c36..5f59cdd23c 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py @@ -314,6 +314,7 @@ class ESM2GenericConfig(BioBertGenericConfig[ESM2ModelT]): return_only_hidden_states: bool = False # return logits def __post_init__(self): + # TODO, as a validator? """Check compatibility between biobert_spec_option and apply_query_key_layer_scaling post initialization.""" super().__post_init__() if self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec: diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index afc2a57c76..2bb93d27b3 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -134,6 +134,7 @@ def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: + # TODO: lift into llm? strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, @@ -190,6 +191,7 @@ def biobert_lightning_module( optim_config: OptimizerSchedulerConfig, num_steps: int, ) -> BioBertLightningModule: + # TODO Lift into llm? model = BioBertLightningModule( bionemo_model_config, tokenizer=tokenizer, @@ -214,6 +216,7 @@ def biobert_lightning_module( return model def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + # TODO lift into llm? 
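+    # Construct the ModelCheckpoint callback from the ExperimentConfig fields referenced below.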
checkpoint_callback = nl_callbacks.ModelCheckpoint( save_last=experiment_config.save_last_checkpoint, monitor=experiment_config.metric_to_monitor_for_checkpoints, diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py index 3e7ffc7b08..3c69da654e 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -53,7 +53,7 @@ class DataConfig(BaseModel, Generic[DataModuleT], ABC): micro_batch_size: int = 8 result_dir: str = "./results" - seq_length: int = 128 + num_dataset_workers: int = 0 @abstractmethod def construct_data_module(self, global_batch_size: int) -> DataModuleT: diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index a5218f6c1c..788b3d10df 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -149,7 +149,7 @@ def train( data: DataModuleT = data_config.construct_data_module(global_batch_size) - # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case have data.tokenizer, + # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case has data.tokenizer, # although this constraint is not documented. model: BioBertLightningModule = biobert_lightning_module( bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps From de517f1779483fadf3fcfcfe594f5c9eba976686 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 22 Oct 2024 20:39:29 +0000 Subject: [PATCH 27/58] - adds esm2 configs with pydantic - adds entrypoints for esm2 recipe builder and execution --- sub-packages/bionemo-esm2/pyproject.toml | 4 + .../src/bionemo/esm2/model/model.py | 4 + .../bionemo-esm2/src/bionemo/esm2/run/main.py | 77 +++++++++++++++++++ .../src/bionemo/llm/config/config_models.py | 38 ++++++++- 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py diff --git a/sub-packages/bionemo-esm2/pyproject.toml b/sub-packages/bionemo-esm2/pyproject.toml index bcfe673533..9b65d9f7e2 100644 --- a/sub-packages/bionemo-esm2/pyproject.toml +++ b/sub-packages/bionemo-esm2/pyproject.toml @@ -11,6 +11,10 @@ authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] dynamic = ["version"] dependencies = ['bionemo-core', 'bionemo-llm'] +[project.scripts] +bionemo-esm2-train= "bionemo.esm2.run.main:main" +bionemo-esm2-recipe= "bionemo.esm2.run.recipes:main" + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py index 5f59cdd23c..54f2026d30 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py @@ -261,6 +261,10 @@ class ESM2GenericConfig(BioBertGenericConfig[ESM2ModelT]): return_only_hidden_states: Whether to return only hidden states. loss_reduction_class: Loss reduction class for the model. Default to BERTMLMLossWithReduction. """ + # ESM specific fields (these are repeated below) + use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. 
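+    # token_dropout is assumed to follow the upstream ESM-2 convention of rescaling embeddings for masked tokens (not verified here).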
+ token_dropout: bool = True + normalize_attention_scores: bool = False # When overriding fields in a dataclass _always_ declare types: https://github.com/python/cpython/issues/123269 model_cls: Type[ESM2Model] = ESM2Model diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py new file mode 100644 index 0000000000..9a9408e96e --- /dev/null +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -0,0 +1,77 @@ +import argparse +import json +from typing import Optional + +from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig +from bionemo.llm.config.config_models import MainConfig +from bionemo.llm.train import train + + +def main(): + def parse_args(): + parser = argparse.ArgumentParser(description="Run Geneformer pretraining") + parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") + parser.add_argument( + "--model-config-t", + default=ExposedESM2PretrainConfig, + required=False, + help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.", + ) + parser.add_argument( + "--data-config-t", + default=ESM2DataConfig, + required=False, + help="fully resolvable python import path to the ModelConfig object.", + ) + parser.add_argument( + "--resume-if-exists", + default=False, + action="store_true", + help="Resume training if a checkpoint exists that matches the current experiment configuration.", + ) + return parser.parse_args() + + def string_to_class(path: str): + import importlib + module_path, class_name = path.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + + def load_config(config_path: str, model_config_t: Optional[str], data_config_t: Optional[str]) -> MainConfig: + with open(config_path, "r") as f: + config_dict = json.load(f) + + # model/data_config_t is used to select the parser dynamically. + if model_config_t is None or model_config_t == "ExposedESM2PretrainConfig": + model_config_t = ExposedESM2PretrainConfig + elif model_config_t == "ExposedFineTuneSeqLenBioBertConfig": + # Hardcoded path for those who do not know the full path + # model_config_t = ExposedFineTuneSeqLenBioBertConfig + raise NotImplementedError() + elif isinstance(model_config_t, str): + # We assume we get a string to some importable config... e.g. 
in the sub-package jensen, 'bionemo.jensen.configs.MyConfig' + model_config_t = string_to_class(model_config_t) + + if data_config_t is None: + data_config_t = ESM2DataConfig + elif isinstance(data_config_t, str): + data_config_t = string_to_class(data_config_t) + + return MainConfig[model_config_t, data_config_t](**config_dict) + + args = parse_args() + config = load_config(args.config, args.model_config_t, args.data_config_t) + train( + bionemo_exposed_model_config=config.bionemo_model_config, + data_config=config.data_config, + parallel_config=config.parallel_config, + training_config=config.training_config, + optim_config=config.optim_config, + experiment_config=config.experiment_config, + wandb_config=config.wandb_config, + resume_if_exists=args.resume_if_exists, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py index 3c69da654e..689b945e37 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -15,7 +15,8 @@ from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Generic, Literal, Optional, Type, TypeVar +from dataclasses import field +from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar import pytorch_lightning as pl import torch @@ -54,12 +55,22 @@ class DataConfig(BaseModel, Generic[DataModuleT], ABC): micro_batch_size: int = 8 result_dir: str = "./results" num_dataset_workers: int = 0 + seq_length: int = 128 @abstractmethod def construct_data_module(self, global_batch_size: int) -> DataModuleT: """Construct the data module from the configuration. Cannot be defined generically.""" ... + def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + ''' Use custom implementation of this method to define the things inside global_config. + + The following expression will always be true: + + global_cfg.data_config == self + ''' + return global_cfg + class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): """BioNeMo model configuration class, wraps TransformerConfig and friends. @@ -72,6 +83,14 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): """ + # Restores weights from a pretrained checkpoint + initial_ckpt_path: Optional[str] = None + # Does not attempt to load keys with these prefixes (useful if you attached extra parameters and still want to load a set of weights) + initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) + + # TODO validator on num_attention_heads, ffn_hidden_size, and hidden_size as these have knowable constraints. + + # Pydantic stuff to allow arbitrary types + validators + serializers class Config: arbitrary_types_allowed = True @@ -84,6 +103,15 @@ def model_class(self) -> Type[ModelConfigT]: # ideally _the generics_ have all the methods we want implemented on them already. raise NotImplementedError + def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + ''' Use custom implementation of this method to define the things inside global_config. + + The following expression will always be true: + + global_cfg.bionemo_model_config == self + ''' + return global_cfg + def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: """Converts the exposed dataclass to the underlying Transformer config. 
@@ -276,3 +304,11 @@ def validate_master_config(self) -> "MainConfig": self.bionemo_model_config.seq_length = self.data_config.seq_length # What other global validators should we set here? return self + + @model_validator(mode="after") + def run_bionemo_model_config_model_validators(self) -> "MainConfig": + return self.bionemo_model_config.model_validator(self) + + @model_validator(mode="after") + def run_data_config_modeL_validators(self) -> "MainConfig": + return self.data_config.model_validator(self) From c7f679cedbc27e016b35b9609ccd3538b5fa4f9a Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 22 Oct 2024 20:43:42 +0000 Subject: [PATCH 28/58] add the missing recipes and init file --- .../src/bionemo/esm2/run/__init__.py | 0 .../src/bionemo/esm2/run/recipes.py | 272 ++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py create mode 100644 sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py new file mode 100644 index 0000000000..09226965c2 --- /dev/null +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -0,0 +1,272 @@ + +import importlib +from pathlib import Path +from bionemo.llm.config.config_models import ExperimentConfig, MainConfig, OptimizerSchedulerConfig, ParallelConfig, TrainingConfig +from bionemo.llm.utils.logger_utils import WandbConfig +from nemo.utils import logging +from typing import Optional +from bionemo.core.utils.dtypes import PrecisionTypes +from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption + +import argparse +import json +from typing import Optional + + +def simple_parallel_recipe( + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + num_devices: int = 1, + accumulate_grad_batches: int = 1, +) -> ParallelConfig: + assert ( + num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size + ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + return ParallelConfig( + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + num_devices=num_devices, + accumulate_grad_batches=accumulate_grad_batches, + ) + + +def default_trainer_config_recipe() -> TrainingConfig: + return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) + + +def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + return OptimizerSchedulerConfig() + + +def experiment_config_recipe() -> ExperimentConfig: + return ExperimentConfig( + save_every_n_steps=100, + result_dir="./results", + experiment_name="default_experiment", + restore_from_checkpoint_path=None, + save_last_checkpoint=True, + metric_to_monitor_for_checkpoints="reduced_train_loss", + save_top_k=2, + create_tensorboard_logger=False, + ) + + +def esm2_8m_model_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + 
variable_seq_lengths: bool = False, +) -> ExposedESM2PretrainConfig: + return ExposedESM2PretrainConfig( + seq_length=seq_length, + num_layers=6, + hidden_size=320, + num_attention_heads=20, + ffn_hidden_size=4*320, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, + # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities + initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, + variable_seq_lengths=variable_seq_lengths + ) + + +def esm2_650m_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + variable_seq_lengths: bool = False, +) -> ExposedESM2PretrainConfig: + return ExposedESM2PretrainConfig( + seq_length=seq_length, + num_layers=33, + hidden_size=1280, + num_attention_heads=20, + ffn_hidden_size=4*1280, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, + # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities + initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, + variable_seq_lengths=variable_seq_lengths + ) + + +''' + --train-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \ + --train-database-path ${TEST_DATA_DIR}/2024_03_sanity/train_sanity.db \ + --valid-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/valid_clusters.parquet \ + --valid-database-path ${TEST_DATA_DIR}/2024_03_sanity/validation.db \ + --result-dir ./results \ + --experiment-name test_experiment \ + --num-gpus 1 \ + --num-nodes 1 \ + --val-check-interval 10 \ + --num-dataset-workers 1 \ + --num-steps 10 \ + --max-seq-length 128 \ + --limit-val-batches 2 \ + --micro-batch-size 2 \ + --restore-from-checkpoint-path ${ESM2_650M_CKPT} +''' + +def esm2_8m_test_recipe(args): + parallel_config = simple_parallel_recipe() + training_config = default_trainer_config_recipe() + # $(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); + + # Find this from the test script... not sure what a sensible default is. 
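+    # These values mirror the ESM2 sanity-test CLI invocation in the module-level string above (128 max sequence length, micro batch size 2, 1 dataset worker).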
+ data_config = ESM2DataConfig( + min_seq_length=128, + max_seq_length=128, + micro_batch_size=2, + num_dataset_workers=1, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + # bionemo_model_config = geneformer_finetuning_regression_head_recipe() + bionemo_model_config = esm2_650m_config(seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path) + bionemo_model_config = esm2_8m_model_config(seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path) + + optim_config = default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config = experiment_config_recipe() + wandb_config = WandbConfig( + project="bionemo2-demo", + entity="nvidia", + offline=True, + tags=[], + group="dev", + id="dev", + log_model=False, + anonymous=True, + ) + main_config = MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, + ) + return main_config + +def main(): + def parse_args(): + parser = argparse.ArgumentParser(description="Create ESM2 configuration JSON.") + parser.add_argument( + "--recipe", + type=str, + choices=["test-8m"], + required=True, + help="Use one of the preconfigured recipes to create a template config file.", + ) + + parser.add_argument( + "--dest", + type=str, + default="./esm2-recipe.json", + required=True, + help="Path to the JSON configuration file.", + ) + + parser.add_argument( + '--train-cluster-path', type=Path, required=True, help='Path to the training cluster file.' + ) + parser.add_argument( + '--train-database-path', type=Path, required=True, help='Path to the training database file.' + ) + parser.add_argument( + '--valid-cluster-path', type=Path, required=True, help='Path to the validation cluster file.' + ) + parser.add_argument( + '--valid-database-path', type=Path, required=True, help='Path to the validation database file.' + ) + + # Extra argument. + parser.add_argument( + "--initial-ckpt-path", + type=str, + required=False, + default=None, + help="Path to an existing to a checkpoint directory to restore an existing checkpoint. Not compatible with all recipes.", + ) + + args = parser.parse_args() + return args + + """Simple example for creating a JSON from recipes.""" + args = parse_args() + + if args.recipe == "test-8m": + # Hardcoded test recipe. 
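+        # Builds a small 8M-parameter pretraining config from the cluster/database paths supplied on the command line.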
+ config = esm2_8m_test_recipe(args) + elif args.recipe == "test-finetune": + raise ValueError("Invalid recipe choice.") + # config = finetune_test_recipe(args) + else: + raise ValueError("Invalid recipe choice.") + + # Serialize to JSON + breakpoint() + json_str = config.model_dump_json(indent=2) + + # Save to file + with open( + args.dest, + "w", + ) as f: + f.write(json_str) + logging.info(f"Saved configuration to {args.dest=}") + +from pydantic import BaseModel, field_serializer, field_validator +from typing import Type +import torch +class MyConfig(BaseModel): + core_attention_override: Optional[Type[torch.nn.Module]] = None + + @field_serializer('core_attention_override') + def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: + if value is None: + return None + return f"{value.__module__}.{value.__name__}" + + @field_validator('core_attention_override', mode='before') + def validate_core_attention_override(cls, value): + if value is None: + return None + if isinstance(value, str): + module_name, class_name = value.rsplit(".", 1) + try: + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + if not issubclass(cls, torch.nn.Module): + raise ValueError(f"{cls} is not a subclass of torch.nn.Module") + return cls + except (ImportError, AttributeError): + raise ValueError(f"Cannot import {value}") + return value + +if __name__ == "__main__": + # NOTE: this is where I left off! + config = esm2_650m_config() + dumped = config.model_dump() + config_again = ExposedESM2PretrainConfig(**dumped) + + assert config_again == config + main() + + +# config.exposed_to_internal_bionemo_model_config() \ No newline at end of file From cec010da134d46cadca1ee9878c9697c6913d308 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 22 Oct 2024 20:44:07 +0000 Subject: [PATCH 29/58] add missing configs --- .../src/bionemo/esm2/run/config_models.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py new file mode 100644 index 0000000000..db582f2a06 --- /dev/null +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -0,0 +1,152 @@ +import importlib +import math +import pathlib +from dataclasses import dataclass, field +from typing import List, Optional, Type + +from pydantic import ValidationError, field_serializer, field_validator, model_validator +import torch + +from bionemo.esm2.data.datamodule import ESMDataModule +from bionemo.esm2.data.dataset import RandomMaskStrategy +from bionemo.esm2.data.tokenizer import get_tokenizer +from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention +from bionemo.esm2.model.model import ESM2Config +from megatron.core.optimizer import OptimizerConfig +from nemo import lightning as nl +from nemo.collections import llm +from nemo.lightning import resume +from nemo.lightning.pytorch import callbacks as nl_callbacks +from nemo.lightning.pytorch.optim import MegatronOptimizerModule +from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler +from nemo.utils import logging +from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from tokenizers import Tokenizer + +from bionemo.core.utils.dtypes import PrecisionTypes +from bionemo.geneformer.api import GeneformerConfig +from 
bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule +from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess +from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig +from bionemo.llm.config.config_models import ( + DataConfig, + DataModuleT, + ExperimentConfig, + ExposedModelConfig, + MainConfig, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, +) +from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption +from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger + +from pathlib import Path +class ESM2DataConfig(DataConfig[ESMDataModule]): + # defined in baseclass- listed here for exposure. + train_cluster_path: Path + train_database_path: Path + valid_cluster_path: Path + valid_database_path: Path + + micro_batch_size: int = 8 + result_dir: str = "./results" + min_seq_length: int = 128 + max_seq_length: int = 128 + random_mask_strategy: RandomMaskStrategy = RandomMaskStrategy.ALL_TOKENS + num_dataset_workers: int = 0 + + + def construct_data_module(self, global_batch_size: int) -> ESMDataModule: + tokenizer = get_tokenizer() + data = ESMDataModule( + train_cluster_path=self.train_cluster_path, + train_database_path=self.train_database_path, + valid_cluster_path=self.valid_cluster_path, + valid_database_path=self.valid_database_path, + global_batch_size=global_batch_size, + micro_batch_size=self.micro_batch_size, + min_seq_length=self.min_seq_length, + max_seq_length=self.max_seq_length, + num_workers=self.num_dataset_workers, + random_mask_strategy=self.random_mask_strategy, + tokenizer=tokenizer + ) + return data + +class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): + # ESM specific fields + use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. 
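+    # These mirror the ESM-specific fields on ESM2Config so they can be round-tripped through serialized recipe JSON.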
+ token_dropout: bool = True + normalize_attention_scores: bool = False + variable_seq_lengths: bool = False + core_attention_override: Type[torch.nn.Module] | None = None + + @field_validator("biobert_spec_option", mode="after") + @classmethod + def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: + # This has some more complicated validation I see + + if biobert_spec_option in (BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, BiobertSpecOption.esm2_bert_layer_local_spec): + return biobert_spec_option + else: + raise TypeError(f"Unsupported BiobertSpecOption: {biobert_spec_option=}, use one of {BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec}, {BiobertSpecOption.esm2_bert_layer_local_spec}") + + @field_serializer('core_attention_override') + def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: + if value is None: + return None + return f"{value.__module__}.{value.__name__}" + + @field_validator('core_attention_override', mode='before') + def validate_core_attention_override(cls, value): + if value is None: + return None + if isinstance(value, str): + module_name, class_name = value.rsplit(".", 1) + try: + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + if not issubclass(cls, torch.nn.Module): + raise ValueError(f"{cls} is not a subclass of torch.nn.Module") + return cls + except (ImportError, AttributeError): + raise ValueError(f"Cannot import {value}") + return value + + + + @model_validator(mode="after") + def validate_and_set_attention_and_scaling(self): + logging.info("Mutating apply_query_key_layer_scaling and core_attention_override based on biobert_spec_option..") + if self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec: + self.apply_query_key_layer_scaling = False + self.core_attention_override = ESM2TEDotProductAttention + elif self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_local_spec: + logging.warning( + "BiobertSpecOption.esm2_bert_layer_local_spec is deprecated. " + "Use BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec instead." 
+ ) + self.apply_query_key_layer_scaling = True + self.core_attention_override = ESM2DotProductAttention + return self + + def model_validator(self, global_cfg: MainConfig) -> MainConfig: + global_cfg = super().model_validator(global_cfg) + # Need to ensure that at the least we have access to min_seq_length and max_seq_length + if not isinstance(global_cfg.data_config, ESM2DataConfig): + raise TypeError(f"ESM2PretrainConfig requires ESM2DataConfig, got {global_cfg.data_config=}") + + pipeline_model_parallel_size, tensor_model_parallel_size = global_cfg.parallel_config.pipeline_model_parallel_size, global_cfg.parallel_config.tensor_model_parallel_size + min_seq_length, max_seq_length = global_cfg.data_config.min_seq_length, global_cfg.data_config.max_seq_length + assert self.variable_seq_lengths == (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length), 'Must set variable_seq_lengths = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length)' + return global_cfg + + def model_class(self) -> Type[ESM2Config]: + return ESM2Config + +# TODO NOTES on default configuration +# seq_length: int # max_sequence_length +# need_megatron_variable_seq_lengths_reductions = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) \ No newline at end of file From 08ef41f0c1862b2324a48bfe9c6d075f0e1e5e42 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 28 Oct 2024 20:28:01 +0000 Subject: [PATCH 30/58] saving work to merge from main --- scripts/protein/esm2/test_esm2_pretrain.py | 1 + scripts/protein/esm2/test_pydantic_train.py | 138 +++++++++++++++ .../geneformer/test_pydantic_train.py | 15 +- .../src/bionemo/esm2/run/__init__.py | 16 ++ .../src/bionemo/esm2/run/config_models.py | 91 +++++----- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 16 ++ .../src/bionemo/esm2/run/recipes.py | 166 ++++++++++++++---- .../bionemo/geneformer/run/config_models.py | 14 +- .../src/bionemo/geneformer/run/recipes.py | 52 +++++- .../src/bionemo/llm/config/config_models.py | 24 +-- 10 files changed, 432 insertions(+), 101 deletions(-) create mode 100644 scripts/protein/esm2/test_pydantic_train.py diff --git a/scripts/protein/esm2/test_esm2_pretrain.py b/scripts/protein/esm2/test_esm2_pretrain.py index 100b83db66..5a518f7bb9 100644 --- a/scripts/protein/esm2/test_esm2_pretrain.py +++ b/scripts/protein/esm2/test_esm2_pretrain.py @@ -141,6 +141,7 @@ def test_main_runs(tmpdir, dummy_protein_dataset, dummy_parquet_train_val_inputs def test_val_dataloader_in_main_runs_with_limit_val_batches( tmpdir, dummy_protein_dataset, dummy_parquet_train_val_inputs, limit_val_batches ): + # TODO: pydantic. """Ensures doesn't run out of validation samples whenever updating limit_val_batches logic. Args: diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py new file mode 100644 index 0000000000..a66a9603b3 --- /dev/null +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shlex +import sqlite3 +import subprocess +from pathlib import Path + +import pandas as pd +import pytest +from lightning.fabric.plugins.environments.lightning import find_free_network_port + +from bionemo.testing.data.load import load + + +data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" + + +def test_bionemo2_rootdir(): + data_error_str = ( + "Please download test data with:\n" + "`python scripts/download_artifacts.py --models all --model_dir ./models --data all --data_dir ./ --verbose --source pbss`" + ) + assert data_path.exists(), f"Could not find test data directory.\n{data_error_str}" + assert data_path.is_dir(), f"Test data directory is supposed to be a directory.\n{data_error_str}" + + +@pytest.fixture +def dummy_protein_dataset(tmp_path): + """Create a mock protein dataset.""" + db_file = tmp_path / "protein_dataset.db" + conn = sqlite3.connect(str(db_file)) + cursor = conn.cursor() + + cursor.execute( + """ + CREATE TABLE protein ( + id TEXT PRIMARY KEY, + sequence TEXT + ) + """ + ) + + proteins = [ + ("UniRef90_A", "ACDEFGHIKLMNPQRSTVWY"), + ("UniRef90_B", "DEFGHIKLMNPQRSTVWYAC"), + ("UniRef90_C", "MGHIKLMNPQRSTVWYACDE"), + ("UniRef50_A", "MKTVRQERLKSIVRI"), + ("UniRef50_B", "MRILERSKEPVSGAQLA"), + ] + cursor.executemany("INSERT INTO protein VALUES (?, ?)", proteins) + + conn.commit() + conn.close() + + return db_file + + +@pytest.fixture +def dummy_parquet_train_val_inputs(tmp_path): + """Create a mock protein train and val cluster parquet.""" + train_cluster_path = tmp_path / "train_clusters.parquet" + train_clusters = pd.DataFrame( + { + "ur90_id": [["UniRef90_A"], ["UniRef90_B", "UniRef90_C"]], + } + ) + train_clusters.to_parquet(train_cluster_path) + + valid_cluster_path = tmp_path / "valid_clusters.parquet" + valid_clusters = pd.DataFrame( + { + "ur50_id": ["UniRef50_A", "UniRef50_B", "UniRef50_A", "UniRef50_B"], # 2 IDs more than confest + } + ) + valid_clusters.to_parquet(valid_cluster_path) + return train_cluster_path, valid_cluster_path + + +def test_pretrain_pydantic_cli(dummy_protein_dataset, dummy_parquet_train_val_inputs, tmpdir): + # result_dir = Path(tmpdir.mkdir("results")) + train_cluster_path, valid_cluster_path = dummy_parquet_train_val_inputs + # result_dir = Path("/tmp/results").mkdir(exist_ok=True) + result_dir = tmpdir.mkdir("results") + + open_port = find_free_network_port() + config = "test_config.json" + + # Invoke with blocking + cmd_str = f"""bionemo-esm2-recipe --dest {config} --recipe test-tiny + --train-database-path {dummy_protein_dataset} + --train-cluster-path {train_cluster_path} + --valid-database-path {dummy_protein_dataset} + --valid-cluster-path {valid_cluster_path} + --result-dir {result_dir}""".strip() + print(cmd_str) + # continue when finished + env = dict(**os.environ) # a local copy of the environment + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + # Now do pretrain + if result.returncode != 0: + raise Exception(f"Pretrain script 
failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + + cmd_str = f"""bionemo-esm2-train --conf {config}""".strip() + env = dict(**os.environ) # a local copy of the environment + open_port = find_free_network_port() + env["MASTER_PORT"] = str(open_port) + cmd = shlex.split(cmd_str) + result = subprocess.run( + cmd, + cwd=tmpdir, + env=env, + capture_output=True, + ) + if result.returncode != 0: + raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + # NOTE this looks a lot like a magic value. But we also could do json.loads(config)['experiment_config']['experiment_name'] + assert (result_dir / "default_experiment").exists(), "Could not find test experiment directory." diff --git a/scripts/singlecell/geneformer/test_pydantic_train.py b/scripts/singlecell/geneformer/test_pydantic_train.py index c6188769c4..034e555ff1 100644 --- a/scripts/singlecell/geneformer/test_pydantic_train.py +++ b/scripts/singlecell/geneformer/test_pydantic_train.py @@ -57,7 +57,10 @@ def test_pretrain_cli_from_ckpt(tmpdir): ) # Now do pretrain if result.returncode != 0: - raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + print(f"{cmd_str=}") + print(f"{result.stdout=}") + print(f"{result.stderr=}") + raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() env = dict(**os.environ) # a local copy of the environment @@ -97,7 +100,10 @@ def test_pretrain_cli(tmpdir): ) # Now do pretrain if result.returncode != 0: - raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + print(f"{cmd_str=}") + print(f"{result.stdout=}") + print(f"{result.stderr=}") + raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() env = dict(**os.environ) # a local copy of the environment @@ -143,7 +149,10 @@ def test_finetune_cli(tmpdir): ) # Now do pretrain if result.returncode != 0: - raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") + print(f"{cmd_str=}") + print(f"{result.stdout=}") + print(f"{result.stderr=}") + raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") # TODO gotta set the right config options here. # TODO set the parsing flag diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py index e69de29bb2..79672139c9 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index db582f2a06..eb4d2389a2 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -1,49 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import importlib -import math -import pathlib -from dataclasses import dataclass, field -from typing import List, Optional, Type +from pathlib import Path +from typing import Optional, Type -from pydantic import ValidationError, field_serializer, field_validator, model_validator import torch +from nemo.utils import logging +from pydantic import field_serializer, field_validator, model_validator from bionemo.esm2.data.datamodule import ESMDataModule from bionemo.esm2.data.dataset import RandomMaskStrategy from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention from bionemo.esm2.model.model import ESM2Config -from megatron.core.optimizer import OptimizerConfig -from nemo import lightning as nl -from nemo.collections import llm -from nemo.lightning import resume -from nemo.lightning.pytorch import callbacks as nl_callbacks -from nemo.lightning.pytorch.optim import MegatronOptimizerModule -from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler -from nemo.utils import logging -from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from tokenizers import Tokenizer - -from bionemo.core.utils.dtypes import PrecisionTypes -from bionemo.geneformer.api import GeneformerConfig -from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule -from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess -from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig from bionemo.llm.config.config_models import ( DataConfig, - DataModuleT, - ExperimentConfig, ExposedModelConfig, MainConfig, - OptimizerSchedulerConfig, - ParallelConfig, - TrainingConfig, ) -from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption -from bionemo.llm.utils.datamodule_utils import infer_global_batch_size -from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger +from bionemo.llm.model.biobert.model import BiobertSpecOption + -from pathlib import Path class ESM2DataConfig(DataConfig[ESMDataModule]): # defined in baseclass- listed here for exposure. 
train_cluster_path: Path @@ -58,7 +49,6 @@ class ESM2DataConfig(DataConfig[ESMDataModule]): random_mask_strategy: RandomMaskStrategy = RandomMaskStrategy.ALL_TOKENS num_dataset_workers: int = 0 - def construct_data_module(self, global_batch_size: int) -> ESMDataModule: tokenizer = get_tokenizer() data = ESMDataModule( @@ -72,10 +62,11 @@ def construct_data_module(self, global_batch_size: int) -> ESMDataModule: max_seq_length=self.max_seq_length, num_workers=self.num_dataset_workers, random_mask_strategy=self.random_mask_strategy, - tokenizer=tokenizer + tokenizer=tokenizer, ) return data + class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): # ESM specific fields use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. @@ -89,18 +80,23 @@ class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: # This has some more complicated validation I see - if biobert_spec_option in (BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, BiobertSpecOption.esm2_bert_layer_local_spec): + if biobert_spec_option in ( + BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + BiobertSpecOption.esm2_bert_layer_local_spec, + ): return biobert_spec_option else: - raise TypeError(f"Unsupported BiobertSpecOption: {biobert_spec_option=}, use one of {BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec}, {BiobertSpecOption.esm2_bert_layer_local_spec}") + raise TypeError( + f"Unsupported BiobertSpecOption: {biobert_spec_option=}, use one of {BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec}, {BiobertSpecOption.esm2_bert_layer_local_spec}" + ) - @field_serializer('core_attention_override') + @field_serializer("core_attention_override") def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: if value is None: return None return f"{value.__module__}.{value.__name__}" - @field_validator('core_attention_override', mode='before') + @field_validator("core_attention_override", mode="before") def validate_core_attention_override(cls, value): if value is None: return None @@ -116,11 +112,11 @@ def validate_core_attention_override(cls, value): raise ValueError(f"Cannot import {value}") return value - - @model_validator(mode="after") def validate_and_set_attention_and_scaling(self): - logging.info("Mutating apply_query_key_layer_scaling and core_attention_override based on biobert_spec_option..") + logging.info( + "Mutating apply_query_key_layer_scaling and core_attention_override based on biobert_spec_option.." 
+ ) if self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec: self.apply_query_key_layer_scaling = False self.core_attention_override = ESM2TEDotProductAttention @@ -139,14 +135,21 @@ def model_validator(self, global_cfg: MainConfig) -> MainConfig: if not isinstance(global_cfg.data_config, ESM2DataConfig): raise TypeError(f"ESM2PretrainConfig requires ESM2DataConfig, got {global_cfg.data_config=}") - pipeline_model_parallel_size, tensor_model_parallel_size = global_cfg.parallel_config.pipeline_model_parallel_size, global_cfg.parallel_config.tensor_model_parallel_size + pipeline_model_parallel_size, tensor_model_parallel_size = ( + global_cfg.parallel_config.pipeline_model_parallel_size, + global_cfg.parallel_config.tensor_model_parallel_size, + ) min_seq_length, max_seq_length = global_cfg.data_config.min_seq_length, global_cfg.data_config.max_seq_length - assert self.variable_seq_lengths == (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length), 'Must set variable_seq_lengths = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length)' + assert ( + self.variable_seq_lengths + == (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) + ), "Must set variable_seq_lengths = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length)" return global_cfg def model_class(self) -> Type[ESM2Config]: - return ESM2Config + return ESM2Config + # TODO NOTES on default configuration # seq_length: int # max_sequence_length -# need_megatron_variable_seq_lengths_reductions = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) \ No newline at end of file +# need_megatron_variable_seq_lengths_reductions = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index 9a9408e96e..859b0575fe 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import argparse import json from typing import Optional diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 09226965c2..5894d67d36 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -1,17 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse import importlib from pathlib import Path -from bionemo.llm.config.config_models import ExperimentConfig, MainConfig, OptimizerSchedulerConfig, ParallelConfig, TrainingConfig -from bionemo.llm.utils.logger_utils import WandbConfig -from nemo.utils import logging from typing import Optional + +from nemo.utils import logging + from bionemo.core.utils.dtypes import PrecisionTypes from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig +from bionemo.llm.config.config_models import ( + ExperimentConfig, + MainConfig, + OptimizerSchedulerConfig, + ParallelConfig, + TrainingConfig, +) from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption - -import argparse -import json -from typing import Optional +from bionemo.llm.utils.logger_utils import WandbConfig def simple_parallel_recipe( @@ -31,18 +51,22 @@ def simple_parallel_recipe( ) -def default_trainer_config_recipe() -> TrainingConfig: +def default_training_config_recipe() -> TrainingConfig: return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) +def tiny_train_config_recipe() -> TrainingConfig: + return TrainingConfig(max_steps=10, limit_val_batches=2, val_check_interval=2) + + def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: return OptimizerSchedulerConfig() -def experiment_config_recipe() -> ExperimentConfig: +def experiment_config_recipe(result_dir="./results") -> ExperimentConfig: return ExperimentConfig( save_every_n_steps=100, - result_dir="./results", + result_dir=result_dir, experiment_name="default_experiment", restore_from_checkpoint_path=None, save_last_checkpoint=True, @@ -52,6 +76,31 @@ def experiment_config_recipe() -> ExperimentConfig: ) +def esm2_tiny_model_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + variable_seq_lengths: bool = False, +) -> ExposedESM2PretrainConfig: + return ExposedESM2PretrainConfig( + seq_length=seq_length, + num_layers=2, + hidden_size=32, + num_attention_heads=2, + ffn_hidden_size=4 * 32, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, + # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities + initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, + variable_seq_lengths=variable_seq_lengths, + ) + + def esm2_8m_model_config( seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", @@ -65,7 +114,7 @@ def esm2_8m_model_config( num_layers=6, hidden_size=320, num_attention_heads=20, - ffn_hidden_size=4*320, + ffn_hidden_size=4 * 
320, params_dtype=precision, pipeline_dtype=precision, autocast_dtype=precision, @@ -73,7 +122,7 @@ def esm2_8m_model_config( nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, - variable_seq_lengths=variable_seq_lengths + variable_seq_lengths=variable_seq_lengths, ) @@ -90,7 +139,7 @@ def esm2_650m_config( num_layers=33, hidden_size=1280, num_attention_heads=20, - ffn_hidden_size=4*1280, + ffn_hidden_size=4 * 1280, params_dtype=precision, pipeline_dtype=precision, autocast_dtype=precision, @@ -98,11 +147,11 @@ def esm2_650m_config( nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, - variable_seq_lengths=variable_seq_lengths + variable_seq_lengths=variable_seq_lengths, ) -''' +""" --train-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \ --train-database-path ${TEST_DATA_DIR}/2024_03_sanity/train_sanity.db \ --valid-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/valid_clusters.parquet \ @@ -118,11 +167,56 @@ def esm2_650m_config( --limit-val-batches 2 \ --micro-batch-size 2 \ --restore-from-checkpoint-path ${ESM2_650M_CKPT} -''' +""" + + +def esm2_tiny_test_recipe(args): + parallel_config = simple_parallel_recipe() + training_config = tiny_train_config_recipe() + # $(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); + + # Find this from the test script... not sure what a sensible default is. + data_config = ESM2DataConfig( + min_seq_length=128, + max_seq_length=128, + micro_batch_size=2, + num_dataset_workers=1, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + bionemo_model_config = esm2_tiny_model_config( + seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path + ) + + optim_config = default_adam_optimizer_with_cosine_annealing_recipe() + experiment_config = experiment_config_recipe(args.result_dir) + wandb_config = WandbConfig( + project="bionemo2-demo", + entity="nvidia", + offline=True, + tags=[], + group="dev", + id="dev", + log_model=False, + anonymous=True, + ) + main_config = MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, + ) + return main_config + def esm2_8m_test_recipe(args): parallel_config = simple_parallel_recipe() - training_config = default_trainer_config_recipe() + training_config = default_training_config_recipe() # $(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); # Find this from the test script... not sure what a sensible default is. 
@@ -136,12 +230,12 @@ def esm2_8m_test_recipe(args): valid_cluster_path=args.valid_cluster_path, valid_database_path=args.valid_database_path, ) - # bionemo_model_config = geneformer_finetuning_regression_head_recipe() - bionemo_model_config = esm2_650m_config(seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path) - bionemo_model_config = esm2_8m_model_config(seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path) + bionemo_model_config = esm2_8m_model_config( + seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path + ) optim_config = default_adam_optimizer_with_cosine_annealing_recipe() - experiment_config = experiment_config_recipe() + experiment_config = experiment_config_recipe(args.result_dir) wandb_config = WandbConfig( project="bionemo2-demo", entity="nvidia", @@ -163,13 +257,14 @@ def esm2_8m_test_recipe(args): ) return main_config + def main(): def parse_args(): parser = argparse.ArgumentParser(description="Create ESM2 configuration JSON.") parser.add_argument( "--recipe", type=str, - choices=["test-8m"], + choices=["test-8m", "test"], required=True, help="Use one of the preconfigured recipes to create a template config file.", ) @@ -183,18 +278,20 @@ def parse_args(): ) parser.add_argument( - '--train-cluster-path', type=Path, required=True, help='Path to the training cluster file.' + "--train-cluster-path", type=Path, required=True, help="Path to the training cluster file." ) parser.add_argument( - '--train-database-path', type=Path, required=True, help='Path to the training database file.' + "--train-database-path", type=Path, required=True, help="Path to the training database file." ) parser.add_argument( - '--valid-cluster-path', type=Path, required=True, help='Path to the validation cluster file.' + "--valid-cluster-path", type=Path, required=True, help="Path to the validation cluster file." ) parser.add_argument( - '--valid-database-path', type=Path, required=True, help='Path to the validation database file.' + "--valid-database-path", type=Path, required=True, help="Path to the validation database file." ) + parser.add_argument("--result-dir", type=Path, required=True, default="results", help="Path to store results") + # Extra argument. parser.add_argument( "--initial-ckpt-path", @@ -213,6 +310,9 @@ def parse_args(): if args.recipe == "test-8m": # Hardcoded test recipe. config = esm2_8m_test_recipe(args) + elif args.recipe == "test": + # Hardcoded test recipe. 
+ config = esm2_tiny_test_recipe(args) elif args.recipe == "test-finetune": raise ValueError("Invalid recipe choice.") # config = finetune_test_recipe(args) @@ -220,7 +320,6 @@ def parse_args(): raise ValueError("Invalid recipe choice.") # Serialize to JSON - breakpoint() json_str = config.model_dump_json(indent=2) # Save to file @@ -231,19 +330,23 @@ def parse_args(): f.write(json_str) logging.info(f"Saved configuration to {args.dest=}") -from pydantic import BaseModel, field_serializer, field_validator + from typing import Type + import torch +from pydantic import BaseModel, field_serializer, field_validator + + class MyConfig(BaseModel): core_attention_override: Optional[Type[torch.nn.Module]] = None - @field_serializer('core_attention_override') + @field_serializer("core_attention_override") def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: if value is None: return None return f"{value.__module__}.{value.__name__}" - @field_validator('core_attention_override', mode='before') + @field_validator("core_attention_override", mode="before") def validate_core_attention_override(cls, value): if value is None: return None @@ -259,6 +362,7 @@ def validate_core_attention_override(cls, value): raise ValueError(f"Cannot import {value}") return value + if __name__ == "__main__": # NOTE: this is where I left off! config = esm2_650m_config() @@ -269,4 +373,4 @@ def validate_core_attention_override(cls, value): main() -# config.exposed_to_internal_bionemo_model_config() \ No newline at end of file +# config.exposed_to_internal_bionemo_model_config() diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index 2bb93d27b3..fe1d7b0b89 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -36,8 +36,6 @@ from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl -from nemo.collections import llm -from nemo.lightning import resume from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.lightning.pytorch.optim import MegatronOptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler @@ -45,14 +43,12 @@ from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer -from bionemo.core.utils.dtypes import PrecisionTypes from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig from bionemo.llm.config.config_models import ( DataConfig, - DataModuleT, ExperimentConfig, ExposedModelConfig, OptimizerSchedulerConfig, @@ -60,8 +56,7 @@ TrainingConfig, ) from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertGenericConfig, BiobertSpecOption -from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.llm.model.biobert.model import BioBertGenericConfig from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger @@ -77,7 +72,7 @@ class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): """Configuration for the geneformer pre-training data module.""" # 
Shadow two attributes from the parent for visibility. - result_dir: str = "./results" + result_dir: str | pathlib.Path = "./results" micro_batch_size: int = 8 data_dir: str @@ -159,6 +154,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf ) return trainer + class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None @@ -167,6 +163,7 @@ class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): def model_class(self) -> Type[GeneformerConfig]: return GeneformerConfig + class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBertConfig]): """Config for models that fine-tune a BioBERT model from a pre-trained checkpoint. @@ -215,6 +212,7 @@ def biobert_lightning_module( ) return model + def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: # TODO lift into llm? checkpoint_callback = nl_callbacks.ModelCheckpoint( @@ -232,4 +230,4 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio wandb_config=wandb_config, ckpt_callback=checkpoint_callback, ) - return nemo_logger \ No newline at end of file + return nemo_logger diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index b76ebde2fe..e3316fb905 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -150,6 +150,49 @@ def geneformer10m_finetune_config( return geneformer_config +def geneformer_tiny_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, +) -> ExposedGeneformerPretrainConfig: + geneformer_config = ExposedGeneformerPretrainConfig( + num_layers=2, + hidden_size=32, + ffn_hidden_size=4 * 32, + num_attention_heads=2, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config + + def geneformer10M_pretraining_config( seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", @@ -252,7 +295,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, ) -def pretrain_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: +def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: data_path = args.data_path result_dir = 
args.result_dir @@ -280,9 +323,12 @@ def pretrain_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, Ge ) optim_config = OptimizerSchedulerConfig() - geneformer_config = geneformer10M_pretraining_config( + geneformer_config = geneformer_tiny_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) + # geneformer_config = geneformer10M_pretraining_config( + # seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path + # ) return MainConfig( data_config=data_config, @@ -399,7 +445,7 @@ def parse_args(): args = parse_args() if args.recipe == "test": - config = pretrain_test_recipe(args) + config = pretrain_tiny_test_recipe(args) elif args.recipe == "10m-pretrain": config = geneformer10m_pretrain_recipe(args) elif args.recipe == "106m-pretrain": diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py index 689b945e37..9f47f0307c 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py @@ -14,6 +14,7 @@ # limitations under the License. +import pathlib from abc import ABC, abstractmethod from dataclasses import field from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar @@ -53,7 +54,7 @@ class DataConfig(BaseModel, Generic[DataModuleT], ABC): """ micro_batch_size: int = 8 - result_dir: str = "./results" + result_dir: str | pathlib.Path = "./results" num_dataset_workers: int = 0 seq_length: int = 128 @@ -63,12 +64,12 @@ def construct_data_module(self, global_batch_size: int) -> DataModuleT: ... def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": - ''' Use custom implementation of this method to define the things inside global_config. + """Use custom implementation of this method to define the things inside global_config. The following expression will always be true: - global_cfg.data_config == self - ''' + global_cfg.data_config == self + """ return global_cfg @@ -90,7 +91,6 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): # TODO validator on num_attention_heads, ffn_hidden_size, and hidden_size as these have knowable constraints. - # Pydantic stuff to allow arbitrary types + validators + serializers class Config: arbitrary_types_allowed = True @@ -104,12 +104,12 @@ def model_class(self) -> Type[ModelConfigT]: raise NotImplementedError def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": - ''' Use custom implementation of this method to define the things inside global_config. + """Use custom implementation of this method to define the things inside global_config. The following expression will always be true: - global_cfg.bionemo_model_config == self - ''' + global_cfg.bionemo_model_config == self + """ return global_cfg def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: @@ -257,7 +257,7 @@ class OptimizerSchedulerConfig(BaseModel): class ExperimentConfig(BaseModel): save_every_n_steps: int - result_dir: str + result_dir: str | pathlib.Path experiment_name: str restore_from_checkpoint_path: Optional[str] wandb_config: Optional[WandbConfig] = None @@ -304,11 +304,11 @@ def validate_master_config(self) -> "MainConfig": self.bionemo_model_config.seq_length = self.data_config.seq_length # What other global validators should we set here? 
return self - + @model_validator(mode="after") def run_bionemo_model_config_model_validators(self) -> "MainConfig": return self.bionemo_model_config.model_validator(self) - + @model_validator(mode="after") - def run_data_config_modeL_validators(self) -> "MainConfig": + def run_data_config_model_validators(self) -> "MainConfig": return self.data_config.model_validator(self) From 8a377ea14271f9cee0a5a997c43e9aa6ec46c405 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 30 Oct 2024 16:43:31 +0000 Subject: [PATCH 31/58] backing up work before creating a repro example of the weirdness with modules --- scripts/protein/esm2/test_pydantic_train.py | 6 +- .../geneformer/test_pydantic_train.py | 30 +- scripts/singlecell/geneformer/train.py | 8 +- .../bionemo/geneformer/run/config_models.py | 82 +---- .../src/bionemo/geneformer/run/main.py | 2 +- .../src/bionemo/geneformer/run/recipes.py | 11 +- .../src/bionemo/llm/config/config_models.py | 314 ------------------ .../bionemo-llm/src/bionemo/llm/train.py | 6 +- 8 files changed, 26 insertions(+), 433 deletions(-) delete mode 100644 sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index a66a9603b3..5585d68717 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -97,16 +97,16 @@ def test_pretrain_pydantic_cli(dummy_protein_dataset, dummy_parquet_train_val_in result_dir = tmpdir.mkdir("results") open_port = find_free_network_port() - config = "test_config.json" + config = f"{result_dir}/test_config.json" # Invoke with blocking - cmd_str = f"""bionemo-esm2-recipe --dest {config} --recipe test-tiny + cmd_str = f"""bionemo-esm2-recipe --dest {config} --recipe test --train-database-path {dummy_protein_dataset} --train-cluster-path {train_cluster_path} --valid-database-path {dummy_protein_dataset} --valid-cluster-path {valid_cluster_path} --result-dir {result_dir}""".strip() - print(cmd_str) + # continue when finished env = dict(**os.environ) # a local copy of the environment env["MASTER_PORT"] = str(open_port) diff --git a/scripts/singlecell/geneformer/test_pydantic_train.py b/scripts/singlecell/geneformer/test_pydantic_train.py index 034e555ff1..7eeb47a613 100644 --- a/scripts/singlecell/geneformer/test_pydantic_train.py +++ b/scripts/singlecell/geneformer/test_pydantic_train.py @@ -41,11 +41,11 @@ def test_pretrain_cli_from_ckpt(tmpdir): result_dir = Path(tmpdir.mkdir("results")) open_port = find_free_network_port() - config = "/workspaces/bionemo-fw-ea/test_config.json" - # Invoke with blocking + # NOTE: if this test is ever failing, you may want to put the config somewhere easily accessible. 
+ config = f"{result_dir}/test_config.json" + # Invoke with blocking, continue when finished (and the json config is generated) checkpoint_path: Path = load("geneformer/10M_240530:2.0") cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test --data-path {data_path} --result-dir {result_dir} --initial-ckpt-path {checkpoint_path}""".strip() - # continue when finished env = dict(**os.environ) # a local copy of the environment env["MASTER_PORT"] = str(open_port) cmd = shlex.split(cmd_str) @@ -55,11 +55,7 @@ def test_pretrain_cli_from_ckpt(tmpdir): env=env, capture_output=True, ) - # Now do pretrain if result.returncode != 0: - print(f"{cmd_str=}") - print(f"{result.stdout=}") - print(f"{result.stderr=}") raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() @@ -74,8 +70,10 @@ def test_pretrain_cli_from_ckpt(tmpdir): capture_output=True, ) if result.returncode != 0: + # More helpful failure raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") - # NOTE this looks a lot like a magic value. But we also could do json.loads(config)['experiment_config']['experiment_name'] + + # Must match the experiment directory configured. assert (result_dir / "test-experiment").exists(), "Could not find test experiment directory." @@ -85,7 +83,7 @@ def test_pretrain_cli(tmpdir): result_dir = Path(tmpdir.mkdir("results")) open_port = find_free_network_port() - config = "test_config.json" + config = f"{result_dir}/test_config.json" # Invoke with blocking cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test --data-path {data_path} --result-dir {result_dir}""".strip() # continue when finished @@ -100,9 +98,6 @@ def test_pretrain_cli(tmpdir): ) # Now do pretrain if result.returncode != 0: - print(f"{cmd_str=}") - print(f"{result.stdout=}") - print(f"{result.stderr=}") raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") cmd_str = f"""bionemo-geneformer-train --conf {config}""".strip() @@ -130,8 +125,7 @@ def test_finetune_cli(tmpdir): open_port = find_free_network_port() - # TODO use relative path when the test is working. - config = "test_config.json" + config = f"{result_dir}/test_config.json" # TODO add initial path cmd_str = f"""bionemo-geneformer-recipe --dest {config} --recipe test-finetune --data-path {data_path} --result-dir {result_dir} --initial-ckpt-path {checkpoint_path}""".strip() @@ -149,28 +143,20 @@ def test_finetune_cli(tmpdir): ) # Now do pretrain if result.returncode != 0: - print(f"{cmd_str=}") - print(f"{result.stdout=}") - print(f"{result.stderr=}") raise Exception(f"Pretrain recipe failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") - # TODO gotta set the right config options here. - # TODO set the parsing flag cmd_str = f"""bionemo-geneformer-train --conf {config} """.strip() env = dict(**os.environ) # a local copy of the environment open_port = find_free_network_port() env["MASTER_PORT"] = str(open_port) cmd = shlex.split(cmd_str) - print("starting the training invocation of evil") result = subprocess.run( cmd, cwd=tmpdir, env=env, - # capture_output=True, stdout=sys.stdout, stderr=sys.stderr, ) if result.returncode != 0: raise Exception(f"Pretrain script failed:\n{cmd_str=}\n{result.stdout=}\n{result.stderr=}") - # NOTE this looks a lot like a magic value. 
But we also could do json.loads(config)['experiment_config']['experiment_name'] assert (result_dir / "test-experiment").exists(), "Could not find test experiment directory." diff --git a/scripts/singlecell/geneformer/train.py b/scripts/singlecell/geneformer/train.py index fbda6f2bb2..9b597aebc9 100644 --- a/scripts/singlecell/geneformer/train.py +++ b/scripts/singlecell/geneformer/train.py @@ -44,7 +44,7 @@ from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig, BiobertSpecOption from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size -from bionemo.llm.utils.logger_utils import WandbLoggerOptions, setup_nemo_lightning_logger +from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger __all__: Sequence[str] = ("main", "parser") @@ -149,10 +149,10 @@ def main( # for wandb integration # Please refer to https://pytorch-lightning.readthedocs.io/en/0.7.6/api/pytorch_lightning.loggers.html" - wandb_options: Optional[WandbLoggerOptions] = ( + wandb_options: Optional[WandbConfig] = ( None if wandb_project is None - else WandbLoggerOptions( + else WandbConfig( offline=wandb_offline, project=wandb_project, entity=wandb_entity, @@ -281,7 +281,7 @@ def main( root_dir=result_dir, name=experiment_name, initialize_tensorboard_logger=create_tensorboard_logger, - wandb_kwargs=wandb_options, + wandb_config=wandb_options, ckpt_callback=checkpoint_callback, ) llm.train( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index fe1d7b0b89..ae5ed376a1 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -13,22 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- import math import pathlib from dataclasses import dataclass, field @@ -47,7 +31,7 @@ from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig -from bionemo.llm.config.config_models import ( +from bionemo.llm.run.config_models import ( DataConfig, ExperimentConfig, ExposedModelConfig, @@ -56,7 +40,7 @@ TrainingConfig, ) from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertGenericConfig +from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger @@ -92,8 +76,7 @@ def test_data_path(self) -> str: return self.data_dir + "/test" def geneformer_preprocess(self) -> GeneformerDataArtifacts: - """Geneformer datamodule expects certain artifacts to be present in the data directory. - + """Geneformer datamodule expects certain artifacts to be present in the data directory. This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. """ preprocessor = GeneformerPreprocess( @@ -128,33 +111,6 @@ def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: return data -def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig) -> nl.Trainer: - # TODO: lift into llm? - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, - pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, - ddp="megatron", - find_unused_parameters=True, - ckpt_include_optimizer=True, - ) - - trainer = nl.Trainer( - devices=parallel_config.num_devices, - max_steps=training_config.max_steps, - accelerator=training_config.accelerator, - strategy=strategy, - limit_val_batches=training_config.limit_val_batches, - val_check_interval=training_config.val_check_interval, - num_nodes=parallel_config.num_nodes, - callbacks=[ - RichModelSummary(max_depth=4), - LearningRateMonitor(), - ], - plugins=nl.MegatronMixedPrecision(precision=training_config.precision), - ) - return trainer - - class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None @@ -182,38 +138,8 @@ def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: return FineTuneSeqLenBioBertConfig -def biobert_lightning_module( - bionemo_model_config: BioBertGenericConfig, - tokenizer: Tokenizer, - optim_config: OptimizerSchedulerConfig, - num_steps: int, -) -> BioBertLightningModule: - # TODO Lift into llm? - model = BioBertLightningModule( - bionemo_model_config, - tokenizer=tokenizer, - optimizer=MegatronOptimizerModule( - config=OptimizerConfig( - lr=optim_config.lr, - optimizer=optim_config.optimizer, - use_distributed_optimizer=True, - fp16=bionemo_model_config.fp16, - bf16=bionemo_model_config.bf16, - ), - lr_scheduler=CosineAnnealingScheduler( - max_steps=num_steps, - min_lr=optim_config.lr / 100, - warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), - interval=optim_config.interval, - monitor=optim_config.monitor, - constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), - ), - ), - ) - return model - - def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + raise Exception # TODO lift into llm? 
checkpoint_callback = nl_callbacks.ModelCheckpoint( save_last=experiment_config.save_last_checkpoint, diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 8b567a8be0..0f59a34cc2 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -23,7 +23,7 @@ ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig, ) -from bionemo.llm.config.config_models import MainConfig +from bionemo.llm.run.config_models import MainConfig from bionemo.llm.train import train diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index e3316fb905..a21adc7154 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -25,7 +25,7 @@ ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig, ) -from bionemo.llm.config.config_models import ( +from bionemo.llm.run.config_models import ( ExperimentConfig, MainConfig, OptimizerSchedulerConfig, @@ -261,7 +261,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, num_devices=1, accumulate_grad_batches=2 ) training_config = TrainingConfig( - max_steps=55, limit_val_batches=2, val_check_interval=10, precision="bf16-mixed", accelerator="gpu" + max_steps=10, limit_val_batches=2, val_check_interval=2, precision="bf16-mixed", accelerator="gpu" ) data_config = GeneformerPretrainingDataConfig( seq_length=128, @@ -303,7 +303,7 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi tensor_model_parallel_size=1, pipeline_model_parallel_size=1, num_devices=1, accumulate_grad_batches=2 ) training_config = TrainingConfig( - max_steps=55, limit_val_batches=2, val_check_interval=10, precision="bf16-mixed", accelerator="gpu" + max_steps=10, limit_val_batches=2, val_check_interval=2, precision="bf16-mixed", accelerator="gpu" ) data_config = GeneformerPretrainingDataConfig( seq_length=128, @@ -326,9 +326,6 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi geneformer_config = geneformer_tiny_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) - # geneformer_config = geneformer10M_pretraining_config( - # seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path - # ) return MainConfig( data_config=data_config, @@ -346,7 +343,6 @@ def geneformer10m_pretrain_recipe( data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_dir) parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() - # bionemo_model_config = geneformer_finetuning_regression_head_recipe() bionemo_model_config = geneformer10M_pretraining_config(initial_ckpt_path=args.initial_ckpt_path) optim_config = default_adam_optimizer_with_cosine_annealing_recipe() experiment_config = experiment_config_recipe() @@ -449,7 +445,6 @@ def parse_args(): elif args.recipe == "10m-pretrain": config = geneformer10m_pretrain_recipe(args) elif args.recipe == "106m-pretrain": - # config = geneformer106m_pretrain_recipe(args) raise NotImplementedError("106M pretraining recipe not implemented.") elif args.recipe == "test-finetune": config = finetune_test_recipe(args) diff 
--git a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py deleted file mode 100644 index 9f47f0307c..0000000000 --- a/sub-packages/bionemo-llm/src/bionemo/llm/config/config_models.py +++ /dev/null @@ -1,314 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pathlib -from abc import ABC, abstractmethod -from dataclasses import field -from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar - -import pytorch_lightning as pl -import torch -from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator -from torch.nn import functional as F - -from bionemo.core.utils import dtypes -from bionemo.llm.model.biobert.model import BioBertGenericConfig -from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption -from bionemo.llm.utils.logger_utils import WandbConfig - - -ModelConfigT = TypeVar("ModelConfigT", bound=BioBertGenericConfig) -DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) - -# To register a custom activation function, add it to this dictionary to pass validation and allow serialization. -CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} - -# DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. -for key in CUSTOM_ACTIVATION_FNS: - assert key not in dir(torch.nn.functional), f"Key {key} already exists in torch.nn.functional" - -# It does not matter if values are duplicated as the key=>value mapping still does the right thing. Repeat values should be considered aliases. -REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { - v: k for k, v in CUSTOM_ACTIVATION_FNS.items() -} - - -class DataConfig(BaseModel, Generic[DataModuleT], ABC): - """Base class for all data configurations. - - This class is used to define the interface for all data configurations. It is used to define the data module that - will be used in the training loop. - """ - - micro_batch_size: int = 8 - result_dir: str | pathlib.Path = "./results" - num_dataset_workers: int = 0 - seq_length: int = 128 - - @abstractmethod - def construct_data_module(self, global_batch_size: int) -> DataModuleT: - """Construct the data module from the configuration. Cannot be defined generically.""" - ... - - def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": - """Use custom implementation of this method to define the things inside global_config. - - The following expression will always be true: - - global_cfg.data_config == self - """ - return global_cfg - - -class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): - """BioNeMo model configuration class, wraps TransformerConfig and friends. 
- - This class is used to define the interface for all model configurations. It is **Exposed** to guard against ill-typed - or poorly defined fields in the underlying configuration objects. `ModelConfigT` declares the associated type of the - underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). - Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping - the more esoteric configuration private to the underlying ModelConfigT. - - """ - - # Restores weights from a pretrained checkpoint - initial_ckpt_path: Optional[str] = None - # Does not attempt to load keys with these prefixes (useful if you attached extra parameters and still want to load a set of weights) - initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) - - # TODO validator on num_attention_heads, ffn_hidden_size, and hidden_size as these have knowable constraints. - - # Pydantic stuff to allow arbitrary types + validators + serializers - class Config: - arbitrary_types_allowed = True - - """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ - - def model_class(self) -> Type[ModelConfigT]: - # How did this all work yesterday even? - # so we cant do it this way because we are kinda losing the magic of generics. - # ideally _the generics_ have all the methods we want implemented on them already. - raise NotImplementedError - - def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": - """Use custom implementation of this method to define the things inside global_config. - - The following expression will always be true: - - global_cfg.bionemo_model_config == self - """ - return global_cfg - - def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: - """Converts the exposed dataclass to the underlying Transformer config. - - The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to - hide fields that are either not serializable by Pydantic or that we do not want to expose. - - This is a good candidate for refactoring. - """ - cls: Type[ModelConfigT] = self.model_class() - model_dict = {} - for attr in self.model_fields: - if attr not in model_dict and attr in cls.__dataclass_fields__: - model_dict[attr] = getattr(self, attr) - # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig - # the only constraint is that both must not be true. - model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] - model_dict["fp16"] = self.pipeline_dtype == dtypes.precision_to_dtype["16-mixed"] - result = cls(**model_dict) - - return result - - # NOTE: See PrecisionTypes for a list of valid literals that may be deserialized. 
- params_dtype: torch.dtype - pipeline_dtype: torch.dtype - autocast_dtype: torch.dtype - - num_layers: int = 6 - hidden_size: int = 256 - ffn_hidden_size: int = 512 - num_attention_heads: int = 4 - seq_length: int = 512 - fp32_residual_connection: bool = False - hidden_dropout: float = 0.02 - init_method_std: float = 0.02 - kv_channels: Optional[int] = None - apply_query_key_layer_scaling: bool = False - make_vocab_size_divisible_by: int = 128 - masked_softmax_fusion: bool = True - fp16_lm_cross_entropy: bool = False - gradient_accumulation_fusion: bool = False - layernorm_zero_centered_gamma: bool = False - layernorm_epsilon: float = 1.0e-12 - activation_func: Callable[[torch.Tensor, Any], torch.Tensor] = F.gelu - qk_layernorm: bool = False - apply_residual_connection_post_layernorm: bool = False - bias_activation_fusion: bool = True - bias_dropout_fusion: bool = True - get_attention_mask_from_fusion: bool = False - attention_dropout: float = 0.1 - share_embeddings_and_output_weights: bool = True - enable_autocast: bool = False - nemo1_ckpt_path: Optional[str] = None - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec - - @field_validator("activation_func", mode="before") - @classmethod - def validate_activation_func(cls, activation_func: str) -> Callable: - """Validates the activation function, assumes this function exists in torch.nn.functional. For custom - activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. - - This method validates the provided activation function string and returns - a callable function based on the validation context using the provided validator in the base class. - - Args: - activation_func (str): The activation function to be validated. - context (ValidationInfo): The context for validation. - - Returns: - Callable: A callable function after validation. 
- - See Also: - CUSTOM_ACTIVATION_FNS - """ - func = getattr(torch.nn.functional, activation_func.lower(), None) - if func is None and activation_func in CUSTOM_ACTIVATION_FNS: - func = CUSTOM_ACTIVATION_FNS[activation_func] - return func - elif func is None: - raise ValidationError( - f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" - ) - else: - return func - - @field_serializer("activation_func") - def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: - func_name = v.__name__ - func = getattr(torch.nn.functional, func_name, None) - if func is not None: - return func_name - elif func in REVERSE_CUSTOM_ACTIVATION_FNS: - return REVERSE_CUSTOM_ACTIVATION_FNS[func] # Get the serialization key - else: - raise ValueError(f"Unsupported activation function: {v}") - - @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") - @classmethod - def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: - return dtypes.get_autocast_dtype(v) - - @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") - def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: - return dtypes.dtype_to_precision[v] - - -class ParallelConfig(BaseModel): - tensor_model_parallel_size: int = 1 - pipeline_model_parallel_size: int = 1 - accumulate_grad_batches: int = 1 - ddp: Literal["megatron"] = "megatron" - remove_unused_parameters: bool = True - num_devices: int = 1 - num_nodes: int = 1 - - @model_validator(mode="after") - def validate_devices(self): - # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel - if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: - raise ValidationError( - "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - ) - return self - - -class TrainingConfig(BaseModel): - max_steps: int - limit_val_batches: int - val_check_interval: int - # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. - precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" - accelerator: str = "gpu" - - -class OptimizerSchedulerConfig(BaseModel): - # TODO validators on optimizer, interval, and monitor. - lr: float = 1e-4 - optimizer: str = "adam" - cosine_rampup_frac: float = 0.01 - cosine_hold_frac: float = 0.05 - interval: str = "step" - monitor: str = "val_loss" - - -class ExperimentConfig(BaseModel): - save_every_n_steps: int - result_dir: str | pathlib.Path - experiment_name: str - restore_from_checkpoint_path: Optional[str] - wandb_config: Optional[WandbConfig] = None - save_last_checkpoint: bool = True - metric_to_monitor_for_checkpoints: str = "reduced_train_loss" - save_top_k: int = 2 - create_tensorboard_logger: bool = False - - -# DataConfig -> some config that can make a data module (see ABC definition.) -DataConfigT = TypeVar("DataConfigT", bound=DataConfig) -# ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) -ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) - - -class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): - """Main configuration class for BioNeMo. All serialized configs that are a valid MainConfig should be Runnable. - - This class is used to define the main configuration for BioNeMo. It defines the minimal pieces of configuration - to execution a training job with the NeMo2 training api. 
It accepts two generic type parameters which users - must define in their own environment for execution. - - Args: - data_config: Generic config type that contains instructions on instantiating the required DataModule. - parallel_config: The parallel configuration for the model. - training_config: The training configuration for the model. - bionemo_model_config: Generic ExposedModelConfig type. This class hides extra configuration parameters in the - underlying model configuration as well as providing - optim_config: The optimizer/scheduler configuration for the model. - experiment_config: The experiment configuration for the model. - wandb_config: Optional, the wandb configuration for the model. - """ - - data_config: DataConfigT - parallel_config: ParallelConfig - training_config: TrainingConfig - bionemo_model_config: ExModelConfigT - optim_config: OptimizerSchedulerConfig - experiment_config: ExperimentConfig - wandb_config: Optional[WandbConfig] = None - - @model_validator(mode="after") - def validate_master_config(self) -> "MainConfig": - self.bionemo_model_config.seq_length = self.data_config.seq_length - # What other global validators should we set here? - return self - - @model_validator(mode="after") - def run_bionemo_model_config_model_validators(self) -> "MainConfig": - return self.bionemo_model_config.model_validator(self) - - @model_validator(mode="after") - def run_data_config_model_validators(self) -> "MainConfig": - return self.data_config.model_validator(self) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 788b3d10df..87a3fbb1ad 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -29,7 +29,7 @@ from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer -from bionemo.llm.config.config_models import ( +from bionemo.llm.run.config_models import ( DataConfig, DataModuleT, ExperimentConfig, @@ -39,7 +39,7 @@ TrainingConfig, ) from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertGenericConfig +from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger @@ -92,7 +92,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf def biobert_lightning_module( - bionemo_model_config: BioBertGenericConfig, + bionemo_model_config: BioBertConfig, tokenizer: Tokenizer, optim_config: OptimizerSchedulerConfig, num_steps: int, From 26e76f5918c3703a31bf9f297a7519e1f3ec425f Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 30 Oct 2024 12:57:35 -0700 Subject: [PATCH 32/58] starting with a fresh repo --- .../bionemo/geneformer/run/config_models.py | 12 +- .../src/bionemo/geneformer/run/recipes.py | 2 +- .../src/bionemo/llm/run/__init__.py | 14 + .../src/bionemo/llm/run/config_models.py | 314 ++++++++++++++++++ .../bionemo-llm/src/bionemo/llm/train.py | 4 +- 5 files changed, 332 insertions(+), 14 deletions(-) create mode 100644 sub-packages/bionemo-llm/src/bionemo/llm/run/__init__.py create mode 100644 sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index ae5ed376a1..fce4ca98da 
100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -13,18 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math import pathlib from dataclasses import dataclass, field from typing import List, Optional, Type -from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.lightning.pytorch import callbacks as nl_callbacks -from nemo.lightning.pytorch.optim import MegatronOptimizerModule -from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.utils import logging -from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer from bionemo.geneformer.api import GeneformerConfig @@ -35,12 +30,7 @@ DataConfig, ExperimentConfig, ExposedModelConfig, - OptimizerSchedulerConfig, - ParallelConfig, - TrainingConfig, ) -from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger @@ -76,7 +66,7 @@ def test_data_path(self) -> str: return self.data_dir + "/test" def geneformer_preprocess(self) -> GeneformerDataArtifacts: - """Geneformer datamodule expects certain artifacts to be present in the data directory. + """Geneformer datamodule expects certain artifacts to be present in the data directory. This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. """ preprocessor = GeneformerPreprocess( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index a21adc7154..1fc9b447e8 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -25,6 +25,7 @@ ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig, ) +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.run.config_models import ( ExperimentConfig, MainConfig, @@ -32,7 +33,6 @@ ParallelConfig, TrainingConfig, ) -from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.logger_utils import WandbConfig diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/__init__.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/__init__.py new file mode 100644 index 0000000000..25e6abfbc5 --- /dev/null +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
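For orientation before the re-created `config_models` module below: a `MainConfig` serialized by a recipe entrypoint is meant to be re-validated later by parameterizing the generic model with the concrete exposed-model and data config types. A minimal sketch of that round-trip, using the ESM2 types that appear elsewhere in this series (the config path is hypothetical, and `model_validate_json` is standard Pydantic v2 rather than anything defined in this patch):

```python
from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig
from bionemo.llm.run.config_models import MainConfig

# Hypothetical path: whatever was passed to `--dest` when the recipe was generated.
with open("my_config.json") as f:
    raw = f.read()

# Parameterizing the generic MainConfig re-runs every model_validator
# (e.g. syncing seq_length from the data config onto the model config)
# while the JSON is parsed.
config = MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig].model_validate_json(raw)

# The exposed config can then be lowered to the underlying BioNeMo model config.
esm2_config = config.bionemo_model_config.exposed_to_internal_bionemo_model_config()
```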
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py new file mode 100644 index 0000000000..8cc333630b --- /dev/null +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -0,0 +1,314 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pathlib +from abc import ABC, abstractmethod +from dataclasses import field +from typing import Any, Callable, Dict, Generic, List, Literal, Optional, Type, TypeVar + +import pytorch_lightning as pl +import torch +from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator +from torch.nn import functional as F + +from bionemo.core.utils import dtypes +from bionemo.llm.model.biobert.model import BioBertConfig +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption +from bionemo.llm.utils.logger_utils import WandbConfig + + +ModelConfigT = TypeVar("ModelConfigT", bound=BioBertConfig) +DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) + +# To register a custom activation function, add it to this dictionary to pass validation and allow serialization. +CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} + +# DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. +for key in CUSTOM_ACTIVATION_FNS: + assert key not in dir(torch.nn.functional), f"Key {key} already exists in torch.nn.functional" + +# It does not matter if values are duplicated as the key=>value mapping still does the right thing. Repeat values should be considered aliases. +REVERSE_CUSTOM_ACTIVATION_FNS: Dict[Callable[[torch.Tensor, Any], torch.Tensor], str] = { + v: k for k, v in CUSTOM_ACTIVATION_FNS.items() +} + + +class DataConfig(BaseModel, Generic[DataModuleT], ABC): + """Base class for all data configurations. + + This class is used to define the interface for all data configurations. It is used to define the data module that + will be used in the training loop. + """ + + micro_batch_size: int = 8 + result_dir: str | pathlib.Path = "./results" + num_dataset_workers: int = 0 + seq_length: int = 128 + + @abstractmethod + def construct_data_module(self, global_batch_size: int) -> DataModuleT: + """Construct the data module from the configuration. Cannot be defined generically.""" + ... + + def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + """Use custom implementation of this method to define the things inside global_config. + + The following expression will always be true: + + global_cfg.data_config == self + """ + return global_cfg + + +class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): + """BioNeMo model configuration class, wraps TransformerConfig and friends. + + This class is used to define the interface for all model configurations. 
It is **Exposed** to guard against ill-typed + or poorly defined fields in the underlying configuration objects. `ModelConfigT` declares the associated type of the + underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). + Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping + the more esoteric configuration private to the underlying ModelConfigT. + + """ + + # Restores weights from a pretrained checkpoint + initial_ckpt_path: Optional[str] = None + # Does not attempt to load keys with these prefixes (useful if you attached extra parameters and still want to load a set of weights) + initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) + + # TODO validator on num_attention_heads, ffn_hidden_size, and hidden_size as these have knowable constraints. + + # Pydantic stuff to allow arbitrary types + validators + serializers + class Config: + arbitrary_types_allowed = True + + """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. """ + + def model_class(self) -> Type[ModelConfigT]: + # How did this all work yesterday even? + # so we cant do it this way because we are kinda losing the magic of generics. + # ideally _the generics_ have all the methods we want implemented on them already. + raise NotImplementedError + + def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + """Use custom implementation of this method to define the things inside global_config. + + The following expression will always be true: + + global_cfg.bionemo_model_config == self + """ + return global_cfg + + def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: + """Converts the exposed dataclass to the underlying Transformer config. + + The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to + hide fields that are either not serializable by Pydantic or that we do not want to expose. + + This is a good candidate for refactoring. + """ + cls: Type[ModelConfigT] = self.model_class() + model_dict = {} + for attr in self.model_fields: + if attr not in model_dict and attr in cls.__dataclass_fields__: + model_dict[attr] = getattr(self, attr) + # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig + # the only constraint is that both must not be true. + model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] + model_dict["fp16"] = self.pipeline_dtype == dtypes.precision_to_dtype["16-mixed"] + result = cls(**model_dict) + + return result + + # NOTE: See PrecisionTypes for a list of valid literals that may be deserialized. 
+ params_dtype: torch.dtype + pipeline_dtype: torch.dtype + autocast_dtype: torch.dtype + + num_layers: int = 6 + hidden_size: int = 256 + ffn_hidden_size: int = 512 + num_attention_heads: int = 4 + seq_length: int = 512 + fp32_residual_connection: bool = False + hidden_dropout: float = 0.02 + init_method_std: float = 0.02 + kv_channels: Optional[int] = None + apply_query_key_layer_scaling: bool = False + make_vocab_size_divisible_by: int = 128 + masked_softmax_fusion: bool = True + fp16_lm_cross_entropy: bool = False + gradient_accumulation_fusion: bool = False + layernorm_zero_centered_gamma: bool = False + layernorm_epsilon: float = 1.0e-12 + activation_func: Callable[[torch.Tensor, Any], torch.Tensor] = F.gelu + qk_layernorm: bool = False + apply_residual_connection_post_layernorm: bool = False + bias_activation_fusion: bool = True + bias_dropout_fusion: bool = True + get_attention_mask_from_fusion: bool = False + attention_dropout: float = 0.1 + share_embeddings_and_output_weights: bool = True + enable_autocast: bool = False + nemo1_ckpt_path: Optional[str] = None + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec + + @field_validator("activation_func", mode="before") + @classmethod + def validate_activation_func(cls, activation_func: str) -> Callable: + """Validates the activation function, assumes this function exists in torch.nn.functional. For custom + activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. + + This method validates the provided activation function string and returns + a callable function based on the validation context using the provided validator in the base class. + + Args: + activation_func (str): The activation function to be validated. + context (ValidationInfo): The context for validation. + + Returns: + Callable: A callable function after validation. 
+ + See Also: + CUSTOM_ACTIVATION_FNS + """ + func = getattr(torch.nn.functional, activation_func.lower(), None) + if func is None and activation_func in CUSTOM_ACTIVATION_FNS: + func = CUSTOM_ACTIVATION_FNS[activation_func] + return func + elif func is None: + raise ValidationError( + f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" + ) + else: + return func + + @field_serializer("activation_func") + def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: + func_name = v.__name__ + func = getattr(torch.nn.functional, func_name, None) + if func is not None: + return func_name + elif func in REVERSE_CUSTOM_ACTIVATION_FNS: + return REVERSE_CUSTOM_ACTIVATION_FNS[func] # Get the serialization key + else: + raise ValueError(f"Unsupported activation function: {v}") + + @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") + @classmethod + def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: + return dtypes.get_autocast_dtype(v) + + @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") + def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: + return dtypes.dtype_to_precision[v] + + +class ParallelConfig(BaseModel): + tensor_model_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + accumulate_grad_batches: int = 1 + ddp: Literal["megatron"] = "megatron" + remove_unused_parameters: bool = True + num_devices: int = 1 + num_nodes: int = 1 + + @model_validator(mode="after") + def validate_devices(self): + # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel + if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: + raise ValidationError( + "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" + ) + return self + + +class TrainingConfig(BaseModel): + max_steps: int + limit_val_batches: int + val_check_interval: int + # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. + precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" + accelerator: str = "gpu" + + +class OptimizerSchedulerConfig(BaseModel): + # TODO validators on optimizer, interval, and monitor. + lr: float = 1e-4 + optimizer: str = "adam" + cosine_rampup_frac: float = 0.01 + cosine_hold_frac: float = 0.05 + interval: str = "step" + monitor: str = "val_loss" + + +class ExperimentConfig(BaseModel): + save_every_n_steps: int + result_dir: str | pathlib.Path + experiment_name: str + restore_from_checkpoint_path: Optional[str] + wandb_config: Optional[WandbConfig] = None + save_last_checkpoint: bool = True + metric_to_monitor_for_checkpoints: str = "reduced_train_loss" + save_top_k: int = 2 + create_tensorboard_logger: bool = False + + +# DataConfig -> some config that can make a data module (see ABC definition.) +DataConfigT = TypeVar("DataConfigT", bound=DataConfig) +# ExposedModelConfig -> some config that can make a non-exposed model config (see ABC definition.) +ExModelConfigT = TypeVar("ExModelConfigT", bound=ExposedModelConfig) + + +class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): + """Main configuration class for BioNeMo. All serialized configs that are a valid MainConfig should be Runnable. + + This class is used to define the main configuration for BioNeMo. It defines the minimal pieces of configuration + to execution a training job with the NeMo2 training api. 
It accepts two generic type parameters which users + must define in their own environment for execution. + + Args: + data_config: Generic config type that contains instructions on instantiating the required DataModule. + parallel_config: The parallel configuration for the model. + training_config: The training configuration for the model. + bionemo_model_config: Generic ExposedModelConfig type. This class hides extra configuration parameters in the + underlying model configuration as well as providing + optim_config: The optimizer/scheduler configuration for the model. + experiment_config: The experiment configuration for the model. + wandb_config: Optional, the wandb configuration for the model. + """ + + data_config: DataConfigT + parallel_config: ParallelConfig + training_config: TrainingConfig + bionemo_model_config: ExModelConfigT + optim_config: OptimizerSchedulerConfig + experiment_config: ExperimentConfig + wandb_config: Optional[WandbConfig] = None + + @model_validator(mode="after") + def validate_master_config(self) -> "MainConfig": + self.bionemo_model_config.seq_length = self.data_config.seq_length + # What other global validators should we set here? + return self + + @model_validator(mode="after") + def run_bionemo_model_config_model_validators(self) -> "MainConfig": + return self.bionemo_model_config.model_validator(self) + + @model_validator(mode="after") + def run_data_config_model_validators(self) -> "MainConfig": + return self.data_config.model_validator(self) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 87a3fbb1ad..a397c49acb 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -29,6 +29,8 @@ from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer +from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.run.config_models import ( DataConfig, DataModuleT, @@ -38,8 +40,6 @@ ParallelConfig, TrainingConfig, ) -from bionemo.llm.model.biobert.lightning import BioBertLightningModule -from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger From 5127dc90a453586a196e3e495691b892f69803a2 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 30 Oct 2024 20:29:25 +0000 Subject: [PATCH 33/58] missing files with the correct paths --- sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py | 2 +- sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index eb4d2389a2..1aa597d25c 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -27,7 +27,7 @@ from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention from bionemo.esm2.model.model import ESM2Config -from bionemo.llm.config.config_models import ( +from bionemo.llm.run.config_models import ( DataConfig, ExposedModelConfig, MainConfig, diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py 
b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index 859b0575fe..e2ac55718c 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -19,7 +19,7 @@ from typing import Optional from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig -from bionemo.llm.config.config_models import MainConfig +from bionemo.llm.run.config_models import MainConfig from bionemo.llm.train import train From 090e1d8f658284dd38469ee015adfc4101601518 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 30 Oct 2024 22:26:48 +0000 Subject: [PATCH 34/58] fixed recipe --- sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 5894d67d36..cc8830d99a 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -23,7 +23,7 @@ from bionemo.core.utils.dtypes import PrecisionTypes from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig -from bionemo.llm.config.config_models import ( +from bionemo.llm.run.config_models import ( ExperimentConfig, MainConfig, OptimizerSchedulerConfig, From 82ae551f350914873b73d1e727c96dda78146a2f Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 14:21:50 +0000 Subject: [PATCH 35/58] Adds recipes for 8m, 650m, 3b parameter models. Adds docstrings for all, updates readme to document the workflow for using pydantic configs. --- README.md | 54 +++ scripts/protein/esm2/test_pydantic_train.py | 4 +- .../src/bionemo/esm2/run/config_models.py | 79 +++- .../src/bionemo/esm2/run/recipes.py | 408 ++++++++++-------- .../src/bionemo/llm/run/config_models.py | 86 +++- 5 files changed, 427 insertions(+), 204 deletions(-) diff --git a/README.md b/README.md index e1c5a9823c..446f457e4d 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,60 @@ python \ --restore-from-checkpoint-path ${ESM2_650M_CKPT} ``` +##### Running with Pydantic configs + +Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes +are available for 8m, 650m, and 3b ESM2 models. + +```bash +# The fastest transformer engine environment variables in testing were the following two +TEST_DATA_DIR=$(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); \ +bionemo-esm2-recipe \ +--train-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \ +--train-database-path ${TEST_DATA_DIR}/2024_03_sanity/train_sanity.db \ +--valid-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/valid_clusters.parquet \ +--valid-database-path ${TEST_DATA_DIR}/2024_03_sanity/validation.db \ +--result-dir ./results \ +--dest my_config.yaml \ +--recipe 8m +``` + +> NOTE: To pretrain from an existing checkpoint, simply pass in the path --initial-ckpt-path to the recipe command. This will populate the JSON with the correct field to ensure pretraining is initialized from an existing checkpoint. + +To submit a training job with the passed config, first update the json file with any additional execution parameters +of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. 
To do this, we need +three things: + +- Configuration file, the JSON produced by the previous step +- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against + those required for pretraining. Alternatively, things like fine-tuning with custom task heads may be specified here. + This allows for mixing/matching Data Modules with various tasks. +- Data Config type, this specifies how to parse, validate, and prepare the DataModule. This may change depending on task, +for example, pretraining ESM2 uses a protein cluster oriented sampling method. In the case of inference or fine-tuning +a pretrained model, a simple fasta file may be sufficient. There is a one-to-one relationship between DataConfig types +and DataModule types. + +> ⚠️ **Warning:** This setup does NO configuration of Weights and Biases. Edit your config JSON and populate it with your WandB details. + +``` +export NVTE_FUSED_ATTN=1 +export NVTE_FLASH_ATTN=0 + +bionemo-esm2-train \ +--data-config-t bionemo.esm2.run.config_models.ESM2DataConfig \ +--model-config-t bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \ +--config my_config.yaml +``` + +> NOTE: both data-config-t and model-config-t have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig + +DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import +the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to +mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule +type, and then pass in the config type to the training recipe. + + + ### Geneformer #### Running diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index 5585d68717..7a245c26ee 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -91,10 +91,8 @@ def dummy_parquet_train_val_inputs(tmp_path): def test_pretrain_pydantic_cli(dummy_protein_dataset, dummy_parquet_train_val_inputs, tmpdir): - # result_dir = Path(tmpdir.mkdir("results")) - train_cluster_path, valid_cluster_path = dummy_parquet_train_val_inputs - # result_dir = Path("/tmp/results").mkdir(exist_ok=True) result_dir = tmpdir.mkdir("results") + train_cluster_path, valid_cluster_path = dummy_parquet_train_val_inputs open_port = find_free_network_port() config = f"{result_dir}/test_config.json" diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index 1aa597d25c..7f58ac2c44 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -36,7 +36,28 @@ class ESM2DataConfig(DataConfig[ESMDataModule]): - # defined in baseclass- listed here for exposure. + """ + ESM2DataConfig is a configuration class for setting up the pre-training data module for ESM2. + + The ESM2DataModule implements the cluster oriented sampling method defined in the ESM2 publication. + + Attributes: + train_cluster_path (Path): Path to the training cluster data. + train_database_path (Path): Path to the training database. + valid_cluster_path (Path): Path to the validation cluster data. + valid_database_path (Path): Path to the validation database. + micro_batch_size (int): Size of the micro-batch. Default is 8. 
+        result_dir (str): Directory to store results. Default is "./results".
+        min_seq_length (int): Minimum sequence length. Default is 128.
+        max_seq_length (int): Maximum sequence length. Default is 128.
+        random_mask_strategy (RandomMaskStrategy): Strategy for random masking. Default is RandomMaskStrategy.ALL_TOKENS.
+        num_dataset_workers (int): Number of workers for the dataset. Default is 0.
+
+    Methods:
+        construct_data_module(global_batch_size: int) -> ESMDataModule:
+            Constructs and returns an ESMDataModule instance with the provided global batch size.
+    """
+
     train_cluster_path: Path
     train_database_path: Path
     valid_cluster_path: Path
@@ -50,6 +71,7 @@ class ESM2DataConfig(DataConfig[ESMDataModule]):
     num_dataset_workers: int = 0
 
     def construct_data_module(self, global_batch_size: int) -> ESMDataModule:
+        '''Constructs and returns an ESMDataModule instance with the provided global batch size.'''
        tokenizer = get_tokenizer()
        data = ESMDataModule(
            train_cluster_path=self.train_cluster_path,
@@ -68,8 +90,36 @@ class ESM2DataConfig(DataConfig[ESMDataModule]):
 
 
 class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]):
-    # ESM specific fields
-    use_esm_attention: bool = False  # Skip ESM2 custom attention for TE acceleration. Still passes golden value test.
+    """
+    Configuration class for ESM2 pretraining with select exposed parameters.
+
+    See the inherited ExposedModelConfig for attributes and methods from the base class. Use this class either
+    as a template or extension for custom configurations. Importantly, these kinds of classes should do two things:
+    select the attributes to expose to the user, and provide validation and serialization for those attributes.
+
+    Attributes:
+        use_esm_attention (bool): Flag to skip ESM2 custom attention for TE acceleration. Defaults to False.
+        token_dropout (bool): Flag to enable token dropout. Defaults to True.
+        normalize_attention_scores (bool): Flag to normalize attention scores. Defaults to False.
+        variable_seq_lengths (bool): Flag to enable variable sequence lengths. Defaults to False.
+        core_attention_override (Optional[Type[torch.nn.Module]]): Optional override for the core attention module. Defaults to None.
+
+    Methods:
+        restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption:
+            Validates the BiobertSpecOption to ensure it is compatible with ESM2.
+        serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]:
+            Serializes the core attention override module to a string.
+        validate_core_attention_override(cls, value):
+            Validates the core attention override module, ensuring it is a subclass of torch.nn.Module.
+        validate_and_set_attention_and_scaling(self):
+            Validates and sets the attention and scaling parameters based on the biobert_spec_option.
+        model_validator(self, global_cfg: MainConfig) -> MainConfig:
+            Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings.
+        model_class(self) -> Type[ESM2Config]:
+            Returns the model class associated with this configuration.
+    """
+
+    use_esm_attention: bool = False  # Skip ESM2 custom attention for TE acceleration. Still passes golden value test.
token_dropout: bool = True normalize_attention_scores: bool = False variable_seq_lengths: bool = False @@ -78,8 +129,7 @@ class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): @field_validator("biobert_spec_option", mode="after") @classmethod def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: - # This has some more complicated validation I see - + '''Validates the BiobertSpecOption to ensure it is compatible with ESM2. by restricting it to the specs compatable with ESM2''' if biobert_spec_option in ( BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, BiobertSpecOption.esm2_bert_layer_local_spec, @@ -92,12 +142,14 @@ def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) - @field_serializer("core_attention_override") def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: + '''Serializes the core attention override module to a string.''' if value is None: return None return f"{value.__module__}.{value.__name__}" @field_validator("core_attention_override", mode="before") def validate_core_attention_override(cls, value): + '''Validates the core attention override module, ensuring it is a subclass of torch.nn.Module.''' if value is None: return None if isinstance(value, str): @@ -114,6 +166,7 @@ def validate_core_attention_override(cls, value): @model_validator(mode="after") def validate_and_set_attention_and_scaling(self): + '''Validates and sets the attention and scaling parameters based on the biobert_spec_option.''' logging.info( "Mutating apply_query_key_layer_scaling and core_attention_override based on biobert_spec_option.." ) @@ -130,6 +183,14 @@ def validate_and_set_attention_and_scaling(self): return self def model_validator(self, global_cfg: MainConfig) -> MainConfig: + '''Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings. + + The global validator acts on the MainConfig, this couples together the ESM2DataConfig with ESM2PretrainingConfig. + Additionally, it provides validation for sequence length and parallelism settings. + + Args: + global_cfg (MainConfig): The global configuration object. + ''' global_cfg = super().model_validator(global_cfg) # Need to ensure that at the least we have access to min_seq_length and max_seq_length if not isinstance(global_cfg.data_config, ESM2DataConfig): @@ -147,9 +208,5 @@ def model_validator(self, global_cfg: MainConfig) -> MainConfig: return global_cfg def model_class(self) -> Type[ESM2Config]: - return ESM2Config - - -# TODO NOTES on default configuration -# seq_length: int # max_sequence_length -# need_megatron_variable_seq_lengths_reductions = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) + '''Returns the model class associated with this configuration.''' + return ESM2Config \ No newline at end of file diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index cc8830d99a..279c3c8f73 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -14,6 +14,10 @@ # limitations under the License. 
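+# Pre-configured ESM2 recipes ("test", "8m", "650m", "3b"). Each *_recipe function below returns a
+# MainConfig that main() serializes to a JSON template, which can then be passed to the
+# bionemo-esm2-train entrypoint via --config.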
+from typing import Type +import torch +from pydantic import BaseModel, field_serializer, field_validator + import argparse import importlib from pathlib import Path @@ -34,12 +38,228 @@ from bionemo.llm.utils.logger_utils import WandbConfig +def esm2_base_training_config() -> TrainingConfig: + '''Base training config for ESM2''' + return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=1500, precision='bf16-mixed') + + +def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: + '''Base optimizer scheduler config for ESM2''' + return OptimizerSchedulerConfig( + optimizer='adam', + lr=4e-4, + cosine_rampup_frac=.01, + cosine_hold_frac=.05, + interval='step', + monitor='val_loss', + ) + + +def esm2_base_parallel_config() -> ParallelConfig: + '''Base parallel config for ESM2''' + return ParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + accumulate_grad_batches=1, + ddp='megatron', + num_devices=1, + num_nodes=1, + ) + +def esm2_8m_wandb_config() -> WandbConfig: + '''Wandb config for ESM2 8m''' + wandb_config = WandbConfig( + entity='esm2-8m_pretraining', + project='esm2-8m_pretraining', + group='esm2-8m', + tags=['esm2-8m'], + offline=True, + anonymous=True, + id='1', + log_model=False, + ) + return wandb_config + +def esm2_8m_experiment_config(result_dir) -> ExperimentConfig: + '''Experiment config for ESM2 8m''' + return ExperimentConfig( + save_every_n_steps=50, # default set in previous script. + result_dir=result_dir, + experiment_name='esm2-8m', + restore_from_checkpoint_path=None + ) + +def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: + '''Model config for ESM2 8m''' + return ExposedESM2PretrainConfig( + num_layers=6, + hidden_size=320, + ffn_hidden_size=320 * 4, + num_attention_heads=20, + seq_length=1024, + biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + initial_ckpt_path=initial_ckpt_path, + params_dtype='bf16-mixed', + pipeline_dtype='bf16-mixed', + autocast_dtype='bf16-mixed', + ) + + +def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: + '''Recipe for ESM2 8m''' + data_config = ESM2DataConfig( + min_seq_length=1024, + max_seq_length=1024, + micro_batch_size=2, + num_dataset_workers=8, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + + return MainConfig( + data_config=data_config, + parallel_config=esm2_base_parallel_config(), + training_config=esm2_base_training_config(), # no changes for 8m + bionemo_model_config=esm2_8m_model_config(args.initial_ckpt_path), + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + experiment_config=esm2_8m_experiment_config(args.result_dir), + wandb_config=esm2_8m_wandb_config(), + ) + +def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: + '''Model config for ESM2 650m''' + return ExposedESM2PretrainConfig( + num_layers=6, + hidden_size=1280, + ffn_hidden_size=1280* 4, + seq_length=1024, + num_attention_heads=20, + biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + initial_ckpt_path=initial_ckpt_path, + params_dtype='bf16-mixed', + pipeline_dtype='bf16-mixed', + autocast_dtype='bf16-mixed' + ) + +def esm2_650m_wandb_config() -> WandbConfig: + '''Wandb config for ESM2 650m''' + return WandbConfig( + entity='esm2-650m_pretraining', + 
project='esm2-650m_pretraining', + group='esm2-650m', + tags=['esm2-650m'], + offline=True, + anonymous=True, + id='1', + log_model=False, + ) + +def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: + '''Experiment config for ESM2 650m''' + return ExperimentConfig( + save_every_n_steps=50, + result_dir=result_dir, + experiment_name='esm2-650m', + # TODO should this be exposed? + restore_from_checkpoint_path=None + ) + +def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: + '''Recipe for ESM2 650m''' + data_config = ESM2DataConfig( + min_seq_length=1024, + max_seq_length=1024, + micro_batch_size=1, + num_dataset_workers=8, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + + return MainConfig( + data_config=data_config, + parallel_config=esm2_base_parallel_config(), + training_config=esm2_base_training_config(), # no changes for 8m + bionemo_model_config=esm2_650m_model_config(args.initial_ckpt_path), + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + experiment_config=esm2_650m_experiment_config(args.result_dir), + wandb_config=esm2_650m_wandb_config(), + ) + +def esm2_3b_parallel_config() -> ParallelConfig: + '''Parallel config for ESM2 3b''' + return ParallelConfig( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + # TODO: is this correct? + accumulate_grad_batches=1, + ddp='megatron', + # NOTE assumes 8xGPU node. Can always edit the config. + num_devices=8, + ) + +def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: + '''Model config for ESM2 3b''' + return ExposedESM2PretrainConfig( + num_layers=36, + hidden_size=2560, + ffn_hidden_size=2560 * 4, + num_attention_heads=40, + seq_length=1024, + biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, + initial_ckpt_path=initial_ckpt_path, + params_dtype='bf16-mixed', + pipeline_dtype='bf16-mixed', + autocast_dtype='bf16-mixed' + ) + +def esm2_3b_wandb_config() -> WandbConfig: + '''Wandb config for ESM2 3b''' + return WandbConfig( + entity='esm2-3b_pretraining', + project='esm2-3b_pretraining', + group='esm2-3b', + tags=['esm2-3b'], + offline=True, + anonymous=True, + id='1', + log_model=False, + ) + +def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: + '''Recipe for ESM2 3b''' + data_config = ESM2DataConfig( + min_seq_length=1024, + max_seq_length=1024, + micro_batch_size=1, + num_dataset_workers=8, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + + return MainConfig( + data_config=data_config, + parallel_config=esm2_3b_parallel_config(), + training_config=esm2_base_training_config(), # no changes for 8m + bionemo_model_config=esm2_3b_model_config(args.initial_ckpt_path), + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + experiment_config=esm2_650m_experiment_config(args.result_dir), + wandb_config=esm2_3b_wandb_config(), + ) + + def simple_parallel_recipe( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, num_devices: int = 1, accumulate_grad_batches: int = 1, ) -> ParallelConfig: + '''Simple parallel recipe for ESM2''' assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size ), "devices must be divisible by 
tensor_model_parallel_size * pipeline_model_parallel_size" @@ -51,19 +271,18 @@ def simple_parallel_recipe( ) -def default_training_config_recipe() -> TrainingConfig: - return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) - - def tiny_train_config_recipe() -> TrainingConfig: + '''Tiny training config for ESM2''' return TrainingConfig(max_steps=10, limit_val_batches=2, val_check_interval=2) def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + '''Default optimizer scheduler config for ESM2''' return OptimizerSchedulerConfig() def experiment_config_recipe(result_dir="./results") -> ExperimentConfig: + '''Experiment config for ESM2''' return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, @@ -84,6 +303,7 @@ def esm2_tiny_model_config( biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, variable_seq_lengths: bool = False, ) -> ExposedESM2PretrainConfig: + '''Model config for ESM2 tiny''' return ExposedESM2PretrainConfig( seq_length=seq_length, num_layers=2, @@ -100,82 +320,11 @@ def esm2_tiny_model_config( variable_seq_lengths=variable_seq_lengths, ) - -def esm2_8m_model_config( - seq_length: int = 2048, - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, - variable_seq_lengths: bool = False, -) -> ExposedESM2PretrainConfig: - return ExposedESM2PretrainConfig( - seq_length=seq_length, - num_layers=6, - hidden_size=320, - num_attention_heads=20, - ffn_hidden_size=4 * 320, - params_dtype=precision, - pipeline_dtype=precision, - autocast_dtype=precision, - biobert_spec_option=biobert_spec_option, - nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, - # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities - initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, - variable_seq_lengths=variable_seq_lengths, - ) - - -def esm2_650m_config( - seq_length: int = 2048, - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, - variable_seq_lengths: bool = False, -) -> ExposedESM2PretrainConfig: - return ExposedESM2PretrainConfig( - seq_length=seq_length, - num_layers=33, - hidden_size=1280, - num_attention_heads=20, - ffn_hidden_size=4 * 1280, - params_dtype=precision, - pipeline_dtype=precision, - autocast_dtype=precision, - biobert_spec_option=biobert_spec_option, - nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, - # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities - initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, - variable_seq_lengths=variable_seq_lengths, - ) - - -""" - --train-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/train_clusters_sanity.parquet \ - --train-database-path ${TEST_DATA_DIR}/2024_03_sanity/train_sanity.db \ - --valid-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/valid_clusters.parquet \ - --valid-database-path ${TEST_DATA_DIR}/2024_03_sanity/validation.db \ - --result-dir ./results \ - --experiment-name test_experiment \ - --num-gpus 1 \ - --num-nodes 1 \ - 
--val-check-interval 10 \ - --num-dataset-workers 1 \ - --num-steps 10 \ - --max-seq-length 128 \ - --limit-val-batches 2 \ - --micro-batch-size 2 \ - --restore-from-checkpoint-path ${ESM2_650M_CKPT} -""" - - def esm2_tiny_test_recipe(args): + '''Test recipe for ESM2 tiny''' parallel_config = simple_parallel_recipe() training_config = tiny_train_config_recipe() - # $(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); - # Find this from the test script... not sure what a sensible default is. data_config = ESM2DataConfig( min_seq_length=128, max_seq_length=128, @@ -213,58 +362,13 @@ def esm2_tiny_test_recipe(args): ) return main_config - -def esm2_8m_test_recipe(args): - parallel_config = simple_parallel_recipe() - training_config = default_training_config_recipe() - # $(download_bionemo_data esm2/testdata_esm2_pretrain:2.0 --source $MY_DATA_SOURCE); - - # Find this from the test script... not sure what a sensible default is. - data_config = ESM2DataConfig( - min_seq_length=128, - max_seq_length=128, - micro_batch_size=2, - num_dataset_workers=1, - train_cluster_path=args.train_cluster_path, - train_database_path=args.train_database_path, - valid_cluster_path=args.valid_cluster_path, - valid_database_path=args.valid_database_path, - ) - bionemo_model_config = esm2_8m_model_config( - seq_length=data_config.max_seq_length, initial_ckpt_path=args.initial_ckpt_path - ) - - optim_config = default_adam_optimizer_with_cosine_annealing_recipe() - experiment_config = experiment_config_recipe(args.result_dir) - wandb_config = WandbConfig( - project="bionemo2-demo", - entity="nvidia", - offline=True, - tags=[], - group="dev", - id="dev", - log_model=False, - anonymous=True, - ) - main_config = MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]( - data_config=data_config, - parallel_config=parallel_config, - training_config=training_config, - bionemo_model_config=bionemo_model_config, - optim_config=optim_config, - experiment_config=experiment_config, - wandb_config=wandb_config, - ) - return main_config - - def main(): def parse_args(): parser = argparse.ArgumentParser(description="Create ESM2 configuration JSON.") parser.add_argument( "--recipe", type=str, - choices=["test-8m", "test"], + choices=["test", "8m", "650m", "3b"], required=True, help="Use one of the preconfigured recipes to create a template config file.", ) @@ -304,20 +408,20 @@ def parse_args(): args = parser.parse_args() return args - """Simple example for creating a JSON from recipes.""" + # Simple example for creating a JSON from recipes. args = parse_args() - if args.recipe == "test-8m": - # Hardcoded test recipe. - config = esm2_8m_test_recipe(args) + if args.recipe == "8m": + config = esm2_8m_recipe(args) + elif args.recipe == "650m": + config = esm2_650m_recipe(args) + elif args.recipe == "3b": + config = esm2_3b_recipe(args) elif args.recipe == "test": # Hardcoded test recipe. config = esm2_tiny_test_recipe(args) - elif args.recipe == "test-finetune": - raise ValueError("Invalid recipe choice.") - # config = finetune_test_recipe(args) else: - raise ValueError("Invalid recipe choice.") + raise ValueError(f"Invalid recipe choice. 
{args.recipe=}") # Serialize to JSON json_str = config.model_dump_json(indent=2) @@ -330,47 +434,5 @@ def parse_args(): f.write(json_str) logging.info(f"Saved configuration to {args.dest=}") - -from typing import Type - -import torch -from pydantic import BaseModel, field_serializer, field_validator - - -class MyConfig(BaseModel): - core_attention_override: Optional[Type[torch.nn.Module]] = None - - @field_serializer("core_attention_override") - def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: - if value is None: - return None - return f"{value.__module__}.{value.__name__}" - - @field_validator("core_attention_override", mode="before") - def validate_core_attention_override(cls, value): - if value is None: - return None - if isinstance(value, str): - module_name, class_name = value.rsplit(".", 1) - try: - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - if not issubclass(cls, torch.nn.Module): - raise ValueError(f"{cls} is not a subclass of torch.nn.Module") - return cls - except (ImportError, AttributeError): - raise ValueError(f"Cannot import {value}") - return value - - if __name__ == "__main__": - # NOTE: this is where I left off! - config = esm2_650m_config() - dumped = config.model_dump() - config_again = ExposedESM2PretrainConfig(**dumped) - - assert config_again == config - main() - - -# config.exposed_to_internal_bionemo_model_config() + main() \ No newline at end of file diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 8cc333630b..7d32455186 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -33,7 +33,7 @@ ModelConfigT = TypeVar("ModelConfigT", bound=BioBertConfig) DataModuleT = TypeVar("DataModuleT", bound=pl.LightningDataModule) -# To register a custom activation function, add it to this dictionary to pass validation and allow serialization. +# Activation functions not available in torch.nn.functional require custom serialization/validation. Add them here with a lookup key. CUSTOM_ACTIVATION_FNS: Dict[str, Callable[[torch.Tensor, Any], torch.Tensor]] = {} # DO NOT use keys that already exist in torch.nn.functional, as the torch.nn.functional functions are selected first. @@ -81,7 +81,6 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): underlying config (most commonly a BioBertGenericConfig, but could also be a TransformerConfig or something similar). Children should try to expose the minimal set of fields necessary for the user to configure the model while keeping the more esoteric configuration private to the underlying ModelConfigT. - """ # Restores weights from a pretrained checkpoint @@ -89,18 +88,12 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): # Does not attempt to load keys with these prefixes (useful if you attached extra parameters and still want to load a set of weights) initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) - # TODO validator on num_attention_heads, ffn_hidden_size, and hidden_size as these have knowable constraints. - # Pydantic stuff to allow arbitrary types + validators + serializers class Config: arbitrary_types_allowed = True - """ Use this class to hide fields that are not serializable by Pydantic that we do not want to expose. 
""" - def model_class(self) -> Type[ModelConfigT]: - # How did this all work yesterday even? - # so we cant do it this way because we are kinda losing the magic of generics. - # ideally _the generics_ have all the methods we want implemented on them already. + '''Returns the underlying model class that this config wraps.''' raise NotImplementedError def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": @@ -117,14 +110,13 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: The underlying ModelConfigT may both be incomplete and unserializable. We use this transformation as a way to hide fields that are either not serializable by Pydantic or that we do not want to expose. - - This is a good candidate for refactoring. """ cls: Type[ModelConfigT] = self.model_class() model_dict = {} for attr in self.model_fields: if attr not in model_dict and attr in cls.__dataclass_fields__: model_dict[attr] = getattr(self, attr) + # Now set fp16 and bf16 based on the precision for the underlying TransformerConfig=>ParallelConfig # the only constraint is that both must not be true. model_dict["bf16"] = self.pipeline_dtype == dtypes.precision_to_dtype["bf16-mixed"] @@ -198,6 +190,24 @@ def validate_activation_func(cls, activation_func: str) -> Callable: @field_serializer("activation_func") def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: + """ + Serializes a given activation function to its corresponding string representation. + + By default, all activation functions from `torch.nn.functional` are serialized to their name. User defined + activation functions should also be defined here with a custom mapping in CUSTOM_ACTIVATION_FNS defined at the + top of this file. This allows our Pydantic model to serialize and deserialize the activation function. + + Args: + v (Callable[[torch.Tensor, Any], torch.Tensor]): The activation function to serialize. + + Returns: + str: The name of the activation function if it is a standard PyTorch function, + or the corresponding serialization key if it is a custom activation function. + + Raises: + ValueError: If the activation function is not supported. 
+ """ + func_name = v.__name__ func = getattr(torch.nn.functional, func_name, None) if func is not None: @@ -210,10 +220,12 @@ def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tenso @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") @classmethod def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: + '''Validates the precision type and returns the corresponding torch dtype.''' return dtypes.get_autocast_dtype(v) @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: + '''Serializes the torch dtype to the corresponding precision type.''' return dtypes.dtype_to_precision[v] @@ -228,7 +240,7 @@ class ParallelConfig(BaseModel): @model_validator(mode="after") def validate_devices(self): - # I think we can do a 2x2 split on 2 gpus for pipeline/tensor model parallel + '''Validates the number of devices based on the tensor and pipeline model parallel sizes.''' if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: raise ValidationError( "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" @@ -237,16 +249,35 @@ def validate_devices(self): class TrainingConfig(BaseModel): + """ + TrainingConfig is a configuration class for training models. + Attributes: + max_steps (int): The maximum number of training steps. + limit_val_batches (int | float): The number of validation batches to use. Can be a fraction or a count. + val_check_interval (int): The interval (in steps) at which to check validation. + precision (Literal["32", "bf16-mixed", "16-mixed"], optional): The precision to use for training. Defaults to "bf16-mixed". + accelerator (str, optional): The type of accelerator to use for training. Defaults to "gpu". + """ + max_steps: int - limit_val_batches: int + limit_val_batches: int | float # Because this can be a fraction or a count... val_check_interval: int - # NOTE this matches whats used by nl.MegatronMixedPrecision which has a restricted set of precisions. precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" accelerator: str = "gpu" class OptimizerSchedulerConfig(BaseModel): - # TODO validators on optimizer, interval, and monitor. + """ + Configuration for the optimizer and learning rate scheduler. + Attributes: + lr (float): Learning rate for the optimizer. Default is 1e-4. + optimizer (str): Type of optimizer to use. Default is "adam". + cosine_rampup_frac (float): Fraction of total training steps for the cosine ramp-up phase. Default is 0.01. + cosine_hold_frac (float): Fraction of total training steps to hold the learning rate constant after ramp-up. Default is 0.05. + interval (str): Interval for updating the learning rate scheduler. Default is "step". + monitor (str): Metric to monitor for learning rate adjustments. Default is "val_loss". + """ + lr: float = 1e-4 optimizer: str = "adam" cosine_rampup_frac: float = 0.01 @@ -256,11 +287,24 @@ class OptimizerSchedulerConfig(BaseModel): class ExperimentConfig(BaseModel): + """ + Configuration class for setting up and managing experiment parameters. + Attributes: + save_every_n_steps (int): Number of steps between saving checkpoints. + result_dir (str | pathlib.Path): Directory where results will be saved. + experiment_name (str): Name of the experiment. + restore_from_checkpoint_path (Optional[str]): Path to restore from a checkpoint. Note: This does not invoke the checkpoint callback as expected. 
+ save_last_checkpoint (bool): Flag to save the last checkpoint. Default is True. + metric_to_monitor_for_checkpoints (str): Metric to monitor for saving top-k checkpoints. Default is "reduced_train_loss". + save_top_k (int): Number of top checkpoints to save based on the monitored metric. Default is 2. + create_tensorboard_logger (bool): Flag to create a TensorBoard logger. Default is False. + """ + save_every_n_steps: int result_dir: str | pathlib.Path experiment_name: str + # NOTE: restore_from_checkpoint_path does not invoke the checkpoint callback in the way we'd like. Avoid using. restore_from_checkpoint_path: Optional[str] - wandb_config: Optional[WandbConfig] = None save_last_checkpoint: bool = True metric_to_monitor_for_checkpoints: str = "reduced_train_loss" save_top_k: int = 2 @@ -280,6 +324,12 @@ class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): to execution a training job with the NeMo2 training api. It accepts two generic type parameters which users must define in their own environment for execution. + Additionally, this class assumes that the configs for ExposedModelConfig and DataConfig may have custom validators + implemented that operate on the entire MainConfig. This prevents the need from type based conditionals inside this + class while still allowing for custom validation global logic to be implemented in the underlying classes. For example, + some models may want to restrict their Datamodules seq_length to a certain value. + + Args: data_config: Generic config type that contains instructions on instantiating the required DataModule. parallel_config: The parallel configuration for the model. @@ -301,14 +351,16 @@ class MainConfig(BaseModel, Generic[ExModelConfigT, DataConfigT]): @model_validator(mode="after") def validate_master_config(self) -> "MainConfig": + '''Validates the master configuration object.''' self.bionemo_model_config.seq_length = self.data_config.seq_length - # What other global validators should we set here? return self @model_validator(mode="after") def run_bionemo_model_config_model_validators(self) -> "MainConfig": + '''Runs the model validators on the bionemo_model_config.''' return self.bionemo_model_config.model_validator(self) @model_validator(mode="after") def run_data_config_model_validators(self) -> "MainConfig": + '''Runs the model validators on the data_config.''' return self.data_config.model_validator(self) From 1d88386d8f633c09d557e3b03dac5f0b13d23b6e Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 15:36:30 +0000 Subject: [PATCH 36/58] Fixes a typo in the ESM2 readme, adds geneformer pydantic section to readme, updates recipes to include 10m and 106m models. Matches the structure of ESM2. 
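For reference, a minimal sketch (not a definitive implementation) of how a generated recipe JSON round-trips through the
typed config. It assumes the class names introduced in these patches and a `my_config.json` produced by the recipe
entrypoint; the actual loading logic lives in `bionemo.geneformer.run.main`:

```python
# Illustrative sketch: load a recipe JSON back into the typed MainConfig, which re-runs the
# Pydantic validators for every sub-config plus the model/data cross-validators on MainConfig.
import json

from bionemo.geneformer.run.config_models import (
    ExposedGeneformerPretrainConfig,
    GeneformerPretrainingDataConfig,
)
from bionemo.llm.run.config_models import MainConfig

# "my_config.json" is the file written by the bionemo-geneformer-recipe example in the README.
with open("my_config.json") as f:
    config_dict = json.load(f)

config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig](**config_dict)
print(config.training_config.max_steps)
```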
---
 README.md | 54 +++-
 .../bionemo/geneformer/run/config_models.py | 57 ++--
 .../src/bionemo/geneformer/run/main.py | 2 +-
 .../src/bionemo/geneformer/run/recipes.py | 291 +++++++++++++-----
 4 files changed, 296 insertions(+), 108 deletions(-)

diff --git a/README.md b/README.md
index 446f457e4d..3e92600313 100644
--- a/README.md
+++ b/README.md
@@ -212,10 +212,12 @@ bionemo-esm2-recipe \
 --valid-cluster-path ${TEST_DATA_DIR}/2024_03_sanity/valid_clusters.parquet \
 --valid-database-path ${TEST_DATA_DIR}/2024_03_sanity/validation.db \
 --result-dir ./results \
---dest my_config.yaml \
+--dest my_config.json \
 --recipe 8m
 ```
 
+> ⚠️ **IMPORTANT:** Inspect and edit the contents of the outputted my_config.json as you see fit.
+
 > NOTE: To pretrain from an existing checkpoint, simply pass in the path --initial-ckpt-path to the recipe command. This will populate the JSON with the correct field to ensure pretraining is initialized from an existing checkpoint.
 
 To submit a training job with the passed config, first update the json file with any additional execution parameters
@@ -240,7 +242,7 @@ export NVTE_FLASH_ATTN=0
 bionemo-esm2-train \
 --data-config-t bionemo.esm2.run.config_models.ESM2DataConfig \
 --model-config-t bionemo.esm2.run.config_models.ExposedESM2PretrainConfig \
---config my_config.yaml
+--config my_config.json
 ```
 
 > NOTE: both data-config-t and model-config-t have default values corresponding to ESM2DataConfig and ExposedESM2PretrainingConfig
@@ -303,6 +305,54 @@ python \
     --restore-from-checkpoint-path results/test_experiment/dev/checkpoints/test_experiment--val_loss=4.3506-epoch=1-last
 ```
 
+##### Running with Pydantic configs
+Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes
+are available for the 10m and 106m Geneformer models. Additionally, we provide an example fine-tuning recipe, where the objective
+is to 'regress' on token IDs rather than the traditional masked language model approach. In practice, you will likely
+need to implement your own DataModule, DataConfig, and fine-tuning model. You can use the same overall approach, but with
+customizations for your task.
+
+
+```bash
+TEST_DATA_DIR=$(download_bionemo_data single_cell/testdata-20240506 --source $MY_DATA_SOURCE); \
+bionemo-geneformer-recipe \
+    --recipe 10m-pretrain \
+    --dest my_config.json \
+    --data-path ${TEST_DATA_DIR}/cellxgene_2023-12-15_small/processed_data \
+    --result-dir ./results
+```
+> ⚠️ **IMPORTANT:** Inspect and edit the contents of the outputted my_config.json as you see fit.
+
+> NOTE: To pretrain from an existing checkpoint, simply pass in the path --initial-ckpt-path to the recipe command. This will populate the JSON with the correct field to ensure pretraining is initialized from an existing checkpoint.
+
+To submit a training job with the passed config, first update the json file with any additional execution parameters
+of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. To do this, we need
+three things:
+
+- Configuration file, the JSON produced by the previous step
+- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against
+  those required for pretraining. Alternatively, things like fine-tuning with custom task heads may be specified here.
+  This allows for mixing/matching Data Modules with various tasks.
+- Data Config type, this specifies how to parse, validate, and prepare the DataModule.
This may change depending on task, +for example, while fine-tuning you may want to use a custom Dataset/DataModule that includes PERTURB-seq. In this case, +the default pretraining DataConfig and DataModule will be insufficient. See ESM2 for additional example usecases. + +> ⚠️ **Warning:** This setup does NO configuration of Weights and Biases. Edit your config JSON and populate it with your WandB details. + +```bash +bionemo-esm2-train \ +--data-config-t bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \ +--model-config-t bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \ +--config my_config.json +``` + +> NOTE: both data-config-t and model-config-t have default values corresponding to GeneformerPretrainingDataConfig and ExposedGeneformerPretrainConfig + +DataConfigT and ModelConfigT can also refer to locally defined types by the user. As long as python knows how to import +the specified path, they may be configured. For example, you may have a custom Dataset/DataModule that you would like to +mix with an existing recipe. In this case, you define a DataConfig object with the generic specified as your DataModule +type, and then pass in the config type to the training recipe. + ## Updating License Header on Python Files diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index fce4ca98da..91fd1aa132 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -43,13 +43,35 @@ class GeneformerDataArtifacts: class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): - """Configuration for the geneformer pre-training data module.""" + ''' + Configuration class for Geneformer pretraining data. + + Expects train/test/val to be prior split by directory and processed by `sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/sc_memmap.py`. + + Attributes: + data_dir (str): Directory where the data is stored. + result_dir (str | pathlib.Path): Directory where the results will be stored. Defaults to "./results". + micro_batch_size (int): Size of the micro-batch. Defaults to 8. + seq_length (int): Sequence length for the data. Defaults to 2048. + num_dataset_workers (int): Number of workers for data loading. Defaults to 0. + + Properties: + train_data_path (str): Path to the training data. + val_data_path (str): Path to the validation data. + test_data_path (str): Path to the test data. + + Methods: + geneformer_preprocess() -> GeneformerDataArtifacts: + Preprocesses the data using a legacy preprocessor from BioNeMo 1 and returns the necessary artifacts. + construct_data_module(global_batch_size: int) -> SingleCellDataModule: + Constructs and returns a SingleCellDataModule using the preprocessed data artifacts. + ''' # Shadow two attributes from the parent for visibility. + data_dir: str result_dir: str | pathlib.Path = "./results" micro_batch_size: int = 8 - data_dir: str seq_length: int = 2048 num_dataset_workers: int = 0 @@ -83,6 +105,7 @@ def geneformer_preprocess(self) -> GeneformerDataArtifacts: raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: + ''' Downloads the requisite data artifacts and instantiates the DataModule. 
''' geneformer_data_artifacts: GeneformerDataArtifacts = self.geneformer_preprocess() data = SingleCellDataModule( seq_length=self.seq_length, @@ -102,6 +125,12 @@ def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): + ''' Exposes custom parameters for pretraining and binds the class to GeneformerConfig. + + Attributes: + initial_ckpt_path (str): Path to a directory containing checkpoint files for initializing the model. This is only + initial_ckpt_skip_keys_with_these_prefixes (List[str]): Skip any layer that contains this key during restoration. Useful for finetuning, set the names of the task heads so checkpoint restoration does not errorniously try to restore these. + ''' # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) @@ -125,25 +154,5 @@ class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBer initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"]) def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: - return FineTuneSeqLenBioBertConfig - - -def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: - raise Exception - # TODO lift into llm? - checkpoint_callback = nl_callbacks.ModelCheckpoint( - save_last=experiment_config.save_last_checkpoint, - monitor=experiment_config.metric_to_monitor_for_checkpoints, - save_top_k=experiment_config.save_top_k, - every_n_train_steps=experiment_config.save_every_n_steps, - always_save_context=True, - ) - - nemo_logger = setup_nemo_lightning_logger( - root_dir=experiment_config.result_dir, - name=experiment_config.experiment_name, - initialize_tensorboard_logger=experiment_config.create_tensorboard_logger, - wandb_config=wandb_config, - ckpt_callback=checkpoint_callback, - ) - return nemo_logger + ''' binds the class to FineTuneSeqLenBioBertConfig ''' + return FineTuneSeqLenBioBertConfig \ No newline at end of file diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 0f59a34cc2..e2eaf426af 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,6 +18,7 @@ import json from typing import Optional + from bionemo.geneformer.run.config_models import ( ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, @@ -76,7 +77,6 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: data_config_t = GeneformerPretrainingDataConfig elif isinstance(data_config_t, str): data_config_t = string_to_class(data_config_t) - return MainConfig[model_config_t, data_config_t](**config_dict) args = parse_args() diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 1fc9b447e8..5babf99c71 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -36,26 +36,174 @@ from bionemo.llm.utils.logger_utils import WandbConfig -""" -This script is for defining pre-configured recipes. Recipes at the minimum provide the user with a template config file. 
-Additionally, it may be useful to define prepackaged recipes for common usecases such as tests. Here we define a the -following recipes: +def geneformer_base_parallel_config() -> ParallelConfig: + '''Base parallel config for Geneformer''' + return ParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + accumulate_grad_batches=1, + ddp='megatron', + num_devices=1, + num_nodes=1, + ) + +def geneformer_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: + '''Base optimizer scheduler config for Geneformer''' + return OptimizerSchedulerConfig(lr=1e-3) # Matches bionemo1 -- example recipe with minimal data -- test recipe for running tests (same as above?) -- finetuning recipe with regression head based on the output of the test recipe. -- pretraining recipe on 10M sized model -- pretraining recipe on 106M sized model -""" +def geneformer_base_training_config() -> TrainingConfig: + '''Base training config for Geneformer''' + return TrainingConfig(max_steps=400000, limit_val_batches=8, val_check_interval=100, precision='bf16-mixed') # matches bionemo1 def geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: """Recipe that produces the base geneformer small data configuration.""" return GeneformerPretrainingDataConfig(data_dir=data_dir) +# 10m definitions +def geneformer_10m_model_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, +) -> ExposedGeneformerPretrainConfig: + '''Geneformer 10m model config settings''' + geneformer_config = ExposedGeneformerPretrainConfig( + num_layers=6, + hidden_size=256, + ffn_hidden_size=512, + num_attention_heads=4, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config -def full_geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: - return GeneformerPretrainingDataConfig(data_dir=data_dir) +def geneformer_10m_experiment_config(result_dir) -> ExperimentConfig: + '''Experiment config for Geneformer 10m''' + return ExperimentConfig( + save_every_n_steps=100, + result_dir=result_dir, + experiment_name="geneformer-10m", + restore_from_checkpoint_path=None + ) + +def geneformer_10m_wandb_config() -> WandbConfig: + '''Wandb config for Geneformer 10m''' + wandb_config = WandbConfig( + entity='geneformer-10m_pretraining', + project='geneformer-10m_pretraining', + group='geneformer-10m', + tags=['geneformer-10m'], + offline=True, + anonymous=True, + id='1', + log_model=False, + ) + return wandb_config + +# 106m definition, model, experiment, wandb, parallel +def 
geneformer_106m_parallel_config() -> ParallelConfig: + '''Base parallel config for Geneformer''' + return ParallelConfig( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + accumulate_grad_batches=1, + ddp='megatron', + num_devices=8, + num_nodes=1, + ) + +def geneformer_106m_experiment_config(result_dir) -> ExperimentConfig: + '''Experiment config for Geneformer 106m''' + return ExperimentConfig( + save_every_n_steps=100, + result_dir=result_dir, + experiment_name="geneformer-106m", + restore_from_checkpoint_path=None + ) + +def geneformer_106m_wandb_config() -> WandbConfig: + '''Wandb config for Geneformer 106m''' + wandb_config = WandbConfig( + entity='geneformer-106m_pretraining', + project='geneformer-106m_pretraining', + group='geneformer-106m', + tags=['geneformer-106m'], + offline=True, + anonymous=True, + id='1', + log_model=False, + ) + return wandb_config + +def geneformer_106m_model_config( + seq_length: int = 2048, + precision: PrecisionTypes = "bf16-mixed", + nemo1_init_path: Optional[str] = None, + initial_ckpt_path: Optional[str] = None, + biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, +) -> ExposedGeneformerPretrainConfig: + '''Geneformer 106m model config settings''' + geneformer_config = ExposedGeneformerPretrainConfig( + num_layers=12, + hidden_size=768, + ffn_hidden_size=3072, + num_attention_heads=12, + seq_length=seq_length, + fp32_residual_connection=False, + hidden_dropout=0.02, + init_method_std=0.02, + kv_channels=None, + apply_query_key_layer_scaling=False, + make_vocab_size_divisible_by=128, + masked_softmax_fusion=True, + fp16_lm_cross_entropy=False, + params_dtype=precision, + pipeline_dtype=precision, + autocast_dtype=precision, + gradient_accumulation_fusion=False, + layernorm_zero_centered_gamma=False, + layernorm_epsilon=1.0e-12, + activation_func="gelu", + qk_layernorm=False, + apply_residual_connection_post_layernorm=False, + bias_activation_fusion=True, + bias_dropout_fusion=True, + get_attention_mask_from_fusion=False, + attention_dropout=0.1, + share_embeddings_and_output_weights=True, + enable_autocast=False, + biobert_spec_option=biobert_spec_option, + nemo1_ckpt_path=nemo1_init_path, + initial_ckpt_path=initial_ckpt_path, + ) + return geneformer_config def simple_parallel_recipe( @@ -64,12 +212,14 @@ def simple_parallel_recipe( num_devices: int = 1, accumulate_grad_batches: int = 1, ) -> ParallelConfig: + '''Simple parallel config for Geneformer, only used in testing.''' assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" return ParallelConfig( tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size, + accumulate_grad_batches=accumulate_grad_batches, num_devices=num_devices, ) @@ -80,10 +230,7 @@ def geneformer_finetuning_regression_head_recipe( initial_ckpt_path: Optional[str] = None, initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, ) -> ExposedFineTuneSeqLenBioBertConfig: - """NOTE on initial_ckpt_skip_keys_with_these_prefixes: configs define their own default with defaultfactory, so - when we get passed None, we defer to the default. Importantly, the 'do nothing' case is different, where the input - would be an empty list. 
- """ + '''Recipe for finetuning a regression head on the masked tokens.''' partial_finetuning_config = partial( ExposedFineTuneSeqLenBioBertConfig, params_dtype=precision, @@ -104,16 +251,18 @@ def geneformer_finetuning_regression_head_recipe( def default_trainer_config_recipe() -> TrainingConfig: + '''Default trainer config for Geneformer''' return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) -def geneformer10m_finetune_config( +def geneformer_10m_finetune_config( seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", nemo1_init_path: Optional[str] = None, initial_ckpt_path: Optional[str] = None, biobert_spec_option=BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedFineTuneSeqLenBioBertConfig: + '''Geneformer 10m finetuning config settings''' geneformer_config = ExposedFineTuneSeqLenBioBertConfig( num_layers=6, hidden_size=256, @@ -157,6 +306,7 @@ def geneformer_tiny_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: + '''Geneformer tiny model config settings, used in testing.''' geneformer_config = ExposedGeneformerPretrainConfig( num_layers=2, hidden_size=32, @@ -193,54 +343,13 @@ def geneformer_tiny_config( return geneformer_config -def geneformer10M_pretraining_config( - seq_length: int = 2048, - precision: PrecisionTypes = "bf16-mixed", - nemo1_init_path: Optional[str] = None, - initial_ckpt_path: Optional[str] = None, - biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, -) -> ExposedGeneformerPretrainConfig: - geneformer_config = ExposedGeneformerPretrainConfig( - num_layers=6, - hidden_size=256, - ffn_hidden_size=512, - num_attention_heads=4, - seq_length=seq_length, - fp32_residual_connection=False, - hidden_dropout=0.02, - init_method_std=0.02, - kv_channels=None, - apply_query_key_layer_scaling=False, - make_vocab_size_divisible_by=128, - masked_softmax_fusion=True, - fp16_lm_cross_entropy=False, - params_dtype=precision, - pipeline_dtype=precision, - autocast_dtype=precision, - gradient_accumulation_fusion=False, - layernorm_zero_centered_gamma=False, - layernorm_epsilon=1.0e-12, - activation_func="gelu", - qk_layernorm=False, - apply_residual_connection_post_layernorm=False, - bias_activation_fusion=True, - bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, - attention_dropout=0.1, - share_embeddings_and_output_weights=True, - enable_autocast=False, - biobert_spec_option=biobert_spec_option, - nemo1_ckpt_path=nemo1_init_path, - initial_ckpt_path=initial_ckpt_path, - ) - return geneformer_config - - def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: + '''Default optimizer scheduler config for Geneformer. See OptimizerSchedulerConfig for defaults.''' return OptimizerSchedulerConfig() def experiment_config_recipe() -> ExperimentConfig: + '''Default experiment config for Geneformer. Used in testing. 
''' return ExperimentConfig( save_every_n_steps=100, result_dir="./results", @@ -254,6 +363,7 @@ def experiment_config_recipe() -> ExperimentConfig: def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: + '''Recipe for finetuning a regression head on the masked tokens.''' data_path = args.data_path result_dir = args.result_dir @@ -281,7 +391,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, ) optim_config = OptimizerSchedulerConfig() - geneformer_config = geneformer10m_finetune_config( + geneformer_config = geneformer_10m_finetune_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) @@ -296,6 +406,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: + '''Recipe for pretraining a tiny model. Used in testing.''' data_path = args.data_path result_dir = args.result_dir @@ -337,25 +448,39 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi ) -def geneformer10m_pretrain_recipe( +def geneformer_10m_pretrain_recipe( args, ) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: - data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_dir) + '''Recipe for pretraining the 10m model.''' + data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = simple_parallel_recipe() - training_config = default_trainer_config_recipe() - bionemo_model_config = geneformer10M_pretraining_config(initial_ckpt_path=args.initial_ckpt_path) - optim_config = default_adam_optimizer_with_cosine_annealing_recipe() - experiment_config = experiment_config_recipe() - wandb_config = WandbConfig( - project="bionemo2-demo", - entity="nvidia", - offline=True, - tags=[], - group="dev", - id="dev", - log_model=False, - anonymous=True, + training_config = geneformer_base_training_config() + bionemo_model_config = geneformer_10m_model_config(initial_ckpt_path=args.initial_ckpt_path) + optim_config = geneformer_base_optimizer_scheduler_config() + experiment_config = geneformer_10m_experiment_config(result_dir=args.result_dir) + wandb_config = geneformer_10m_wandb_config() + main_config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]( + data_config=data_config, + parallel_config=parallel_config, + training_config=training_config, + bionemo_model_config=bionemo_model_config, + optim_config=optim_config, + experiment_config=experiment_config, + wandb_config=wandb_config, ) + return main_config + +def geneformer_106m_pretrain_recipe( + args, +) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: + '''Recipe for pretraining the 106m model. 
Uses 8 GPUs for data parallelism''' + data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) + parallel_config = geneformer_106m_parallel_config() + training_config = geneformer_base_training_config() + bionemo_model_config = geneformer_106m_model_config(initial_ckpt_path=args.initial_ckpt_path) + optim_config = geneformer_base_optimizer_scheduler_config() + experiment_config = geneformer_106m_experiment_config(result_dir=args.result_dir) + wandb_config = geneformer_106m_wandb_config() main_config = MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]( data_config=data_config, parallel_config=parallel_config, @@ -368,9 +493,11 @@ def geneformer10m_pretrain_recipe( return main_config -def geneformer10m_finetune_recipe( + +def geneformer_10m_finetune_recipe( args, ) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: + '''Recipe for finetuning the 10m model on a token regression head. Used as an example and for testing. ''' data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() @@ -405,7 +532,7 @@ def parse_args(): parser.add_argument( "--recipe", type=str, - choices=["test", "10m-pretrain", "test-finetune", "finetune"], + choices=["test", "10m-pretrain", "106m-pretrain", "test-finetune", "finetune"], required=True, help="Use one of the preconfigured recipes to create a template config file.", ) @@ -425,7 +552,6 @@ def parse_args(): "--result-dir", type=str, required=True, help="Path to the directory used to save results." ) - # Extra argument. parser.add_argument( "--initial-ckpt-path", type=str, @@ -443,13 +569,16 @@ def parse_args(): if args.recipe == "test": config = pretrain_tiny_test_recipe(args) elif args.recipe == "10m-pretrain": - config = geneformer10m_pretrain_recipe(args) + config = geneformer_10m_pretrain_recipe(args) elif args.recipe == "106m-pretrain": + config = geneformer_106m_pretrain_recipe(args) raise NotImplementedError("106M pretraining recipe not implemented.") elif args.recipe == "test-finetune": + # Uses a bigger model because we have a pretrained model for it. config = finetune_test_recipe(args) elif args.recipe == "finetune": - config = geneformer10m_finetune_recipe(args) + # NOTE: this recipe finetunes a regression model on the masked tokens, if youre looking to finetune with a custom task, youll need to define your own classes. 
+ config = geneformer_10m_finetune_recipe(args) else: raise ValueError("Invalid recipe choice.") From 3b5c7695cee1a2c8e321ba3b7a6c4560ee7b9a89 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 15:38:57 +0000 Subject: [PATCH 37/58] formatting --- README.md | 16 +- .../src/bionemo/esm2/model/model.py | 1 + .../src/bionemo/esm2/run/__init__.py | 2 - .../src/bionemo/esm2/run/config_models.py | 32 ++-- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 3 +- .../src/bionemo/esm2/run/recipes.py | 153 +++++++++--------- .../src/bionemo/geneformer/run/__init__.py | 2 - .../bionemo/geneformer/run/config_models.py | 20 +-- .../src/bionemo/geneformer/run/main.py | 1 - .../src/bionemo/geneformer/run/recipes.py | 89 +++++----- .../src/bionemo/llm/run/config_models.py | 32 ++-- 11 files changed, 180 insertions(+), 171 deletions(-) diff --git a/README.md b/README.md index 3e92600313..f172c531cc 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ python \ ##### Running with Pydantic configs -Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes +Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes are available for 8m, 650m, and 3b ESM2 models. ```bash @@ -221,13 +221,13 @@ bionemo-esm2-recipe \ > NOTE: To pretrain from an existing checkpoint, simply pass in the path --initial-ckpt-path to the recipe command. This will populate the JSON with the correct field to ensure pretraining is initialized from an existing checkpoint. To submit a training job with the passed config, first update the json file with any additional execution parameters -of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. To do this, we need +of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. To do this, we need three things: - Configuration file, the JSON produced by the previous step -- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against +- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against those required for pretraining. Alternatively, things like fine-tuning with custom task heads may be specified here. - This allows for mixing/matching Data Modules with various tasks. + This allows for mixing/matching Data Modules with various tasks. - Data Config type, this specifies how to parse, validate, and prepare the DataModule. This may change depending on task, for example, pretraining ESM2 uses a protein cluster oriented sampling method. In the case of inference or fine-tuning a pretrained model, a simple fasta file may be sufficient. There is a one-to-one relationship between DataConfig types @@ -306,7 +306,7 @@ python \ ``` ##### Running with Pydantic configs -Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes +Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes are available for 10m, and 106m geneformer models. Additionally we provide an example recipe of finetuning, where the objective is to 'regress' on token IDs rather than the traditional masked language model approach. In practice, you will likely need to implement your own DataModule, DataConfig, and Finetuning model. 
You can use the same overall approach, but with @@ -326,13 +326,13 @@ bionemo-geneformer-recipe \ > NOTE: To pretrain from an existing checkpoint, simply pass in the path --initial-ckpt-path to the recipe command. This will populate the JSON with the correct field to ensure pretraining is initialized from an existing checkpoint. To submit a training job with the passed config, first update the json file with any additional execution parameters -of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. To do this, we need +of your choosing: number of devices, workers, steps, etc. Second, invoke our training entrypoint. To do this, we need three things: - Configuration file, the JSON produced by the previous step -- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against +- Model config type, in this case the pretraining config. This will validate the arguments in the config JSON against those required for pretraining. Alternatively, things like fine-tuning with custom task heads may be specified here. - This allows for mixing/matching Data Modules with various tasks. + This allows for mixing/matching Data Modules with various tasks. - Data Config type, this specifies how to parse, validate, and prepare the DataModule. This may change depending on task, for example, while fine-tuning you may want to use a custom Dataset/DataModule that includes PERTURB-seq. In this case, the default pretraining DataConfig and DataModule will be insufficient. See ESM2 for additional example usecases. diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py index 17051298a2..5aeeef9804 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py @@ -266,6 +266,7 @@ class ESM2GenericConfig(BioBertConfig[ESM2ModelT, MegatronLossType]): return_only_hidden_states: Whether to return only hidden states. loss_reduction_class: Loss reduction class for the model. Default to BERTMLMLossWithReduction. """ + # ESM specific fields (these are repeated below) use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. token_dropout: bool = True diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py index 79672139c9..25e6abfbc5 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/__init__.py @@ -12,5 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index 7f58ac2c44..95f73ae043 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -27,17 +27,16 @@ from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention from bionemo.esm2.model.model import ESM2Config +from bionemo.llm.model.biobert.model import BiobertSpecOption from bionemo.llm.run.config_models import ( DataConfig, ExposedModelConfig, MainConfig, ) -from bionemo.llm.model.biobert.model import BiobertSpecOption class ESM2DataConfig(DataConfig[ESMDataModule]): - """ - ESM2DataConfig is a configuration class for setting up the pre-training data module for ESM2. + """ESM2DataConfig is a configuration class for setting up the pre-training data module for ESM2. The ESM2DataModule implements the cluster oriented sampling method defined in the ESM2 publication. @@ -71,7 +70,7 @@ class ESM2DataConfig(DataConfig[ESMDataModule]): num_dataset_workers: int = 0 def construct_data_module(self, global_batch_size: int) -> ESMDataModule: - '''Constructs and returns an ESMDataModule instance with the provided global batch size.''' + """Constructs and returns an ESMDataModule instance with the provided global batch size.""" tokenizer = get_tokenizer() data = ESMDataModule( train_cluster_path=self.train_cluster_path, @@ -91,8 +90,7 @@ def construct_data_module(self, global_batch_size: int) -> ESMDataModule: class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): class ExposedESM2PretrainConfig: - """ - Configuration class for ESM2 pretraining with select exposed parameters. + """Configuration class for ESM2 pretraining with select exposed parameters. See the inherited ExposedModelConfig for attributes and methods from the base class. Use this class either as a template or extension for custom configurations. Importantly, these kinds of classes should do two things, @@ -120,7 +118,7 @@ class ExposedESM2PretrainConfig: Returns the model class associated with this configuration. """ - use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. + use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. token_dropout: bool = True normalize_attention_scores: bool = False variable_seq_lengths: bool = False @@ -129,7 +127,7 @@ class ExposedESM2PretrainConfig: @field_validator("biobert_spec_option", mode="after") @classmethod def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: - '''Validates the BiobertSpecOption to ensure it is compatible with ESM2. by restricting it to the specs compatable with ESM2''' + """Validates the BiobertSpecOption to ensure it is compatible with ESM2. 
by restricting it to the specs compatable with ESM2""" if biobert_spec_option in ( BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, BiobertSpecOption.esm2_bert_layer_local_spec, @@ -142,14 +140,14 @@ def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) - @field_serializer("core_attention_override") def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: - '''Serializes the core attention override module to a string.''' + """Serializes the core attention override module to a string.""" if value is None: return None return f"{value.__module__}.{value.__name__}" @field_validator("core_attention_override", mode="before") def validate_core_attention_override(cls, value): - '''Validates the core attention override module, ensuring it is a subclass of torch.nn.Module.''' + """Validates the core attention override module, ensuring it is a subclass of torch.nn.Module.""" if value is None: return None if isinstance(value, str): @@ -166,7 +164,7 @@ def validate_core_attention_override(cls, value): @model_validator(mode="after") def validate_and_set_attention_and_scaling(self): - '''Validates and sets the attention and scaling parameters based on the biobert_spec_option.''' + """Validates and sets the attention and scaling parameters based on the biobert_spec_option.""" logging.info( "Mutating apply_query_key_layer_scaling and core_attention_override based on biobert_spec_option.." ) @@ -183,14 +181,14 @@ def validate_and_set_attention_and_scaling(self): return self def model_validator(self, global_cfg: MainConfig) -> MainConfig: - '''Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings. - + """Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings. + The global validator acts on the MainConfig, this couples together the ESM2DataConfig with ESM2PretrainingConfig. Additionally, it provides validation for sequence length and parallelism settings. Args: - global_cfg (MainConfig): The global configuration object. - ''' + global_cfg (MainConfig): The global configuration object. 
+ """ global_cfg = super().model_validator(global_cfg) # Need to ensure that at the least we have access to min_seq_length and max_seq_length if not isinstance(global_cfg.data_config, ESM2DataConfig): @@ -208,5 +206,5 @@ def model_validator(self, global_cfg: MainConfig) -> MainConfig: return global_cfg def model_class(self) -> Type[ESM2Config]: - '''Returns the model class associated with this configuration.''' - return ESM2Config \ No newline at end of file + """Returns the model class associated with this configuration.""" + return ESM2Config diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index e2ac55718c..10a4a94956 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -49,6 +49,7 @@ def parse_args(): def string_to_class(path: str): import importlib + module_path, class_name = path.rsplit(".", 1) module = importlib.import_module(module_path) return getattr(module, class_name) @@ -90,4 +91,4 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 279c3c8f73..1bc2505d64 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -14,12 +14,7 @@ # limitations under the License. -from typing import Type -import torch -from pydantic import BaseModel, field_serializer, field_validator - import argparse -import importlib from pathlib import Path from typing import Optional @@ -27,6 +22,7 @@ from bionemo.core.utils.dtypes import PrecisionTypes from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig +from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.run.config_models import ( ExperimentConfig, MainConfig, @@ -34,63 +30,65 @@ ParallelConfig, TrainingConfig, ) -from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.logger_utils import WandbConfig def esm2_base_training_config() -> TrainingConfig: - '''Base training config for ESM2''' - return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=1500, precision='bf16-mixed') + """Base training config for ESM2""" + return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=1500, precision="bf16-mixed") def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: - '''Base optimizer scheduler config for ESM2''' + """Base optimizer scheduler config for ESM2""" return OptimizerSchedulerConfig( - optimizer='adam', + optimizer="adam", lr=4e-4, - cosine_rampup_frac=.01, - cosine_hold_frac=.05, - interval='step', - monitor='val_loss', + cosine_rampup_frac=0.01, + cosine_hold_frac=0.05, + interval="step", + monitor="val_loss", ) def esm2_base_parallel_config() -> ParallelConfig: - '''Base parallel config for ESM2''' + """Base parallel config for ESM2""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, accumulate_grad_batches=1, - ddp='megatron', + ddp="megatron", num_devices=1, num_nodes=1, ) + def esm2_8m_wandb_config() -> WandbConfig: - '''Wandb config for ESM2 8m''' + """Wandb config for ESM2 8m""" wandb_config = WandbConfig( - entity='esm2-8m_pretraining', - project='esm2-8m_pretraining', - 
group='esm2-8m', - tags=['esm2-8m'], + entity="esm2-8m_pretraining", + project="esm2-8m_pretraining", + group="esm2-8m", + tags=["esm2-8m"], offline=True, anonymous=True, - id='1', + id="1", log_model=False, ) return wandb_config + def esm2_8m_experiment_config(result_dir) -> ExperimentConfig: - '''Experiment config for ESM2 8m''' + """Experiment config for ESM2 8m""" return ExperimentConfig( - save_every_n_steps=50, # default set in previous script. + save_every_n_steps=50, # default set in previous script. result_dir=result_dir, - experiment_name='esm2-8m', - restore_from_checkpoint_path=None + experiment_name="esm2-8m", + restore_from_checkpoint_path=None, ) + def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - '''Model config for ESM2 8m''' + """Model config for ESM2 8m""" return ExposedESM2PretrainConfig( num_layers=6, hidden_size=320, @@ -99,14 +97,14 @@ def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: seq_length=1024, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, - params_dtype='bf16-mixed', - pipeline_dtype='bf16-mixed', - autocast_dtype='bf16-mixed', + params_dtype="bf16-mixed", + pipeline_dtype="bf16-mixed", + autocast_dtype="bf16-mixed", ) def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - '''Recipe for ESM2 8m''' + """Recipe for ESM2 8m""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -121,53 +119,57 @@ def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig return MainConfig( data_config=data_config, parallel_config=esm2_base_parallel_config(), - training_config=esm2_base_training_config(), # no changes for 8m + training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_8m_model_config(args.initial_ckpt_path), - optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m experiment_config=esm2_8m_experiment_config(args.result_dir), wandb_config=esm2_8m_wandb_config(), ) + def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - '''Model config for ESM2 650m''' + """Model config for ESM2 650m""" return ExposedESM2PretrainConfig( num_layers=6, hidden_size=1280, - ffn_hidden_size=1280* 4, + ffn_hidden_size=1280 * 4, seq_length=1024, num_attention_heads=20, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, - params_dtype='bf16-mixed', - pipeline_dtype='bf16-mixed', - autocast_dtype='bf16-mixed' + params_dtype="bf16-mixed", + pipeline_dtype="bf16-mixed", + autocast_dtype="bf16-mixed", ) + def esm2_650m_wandb_config() -> WandbConfig: - '''Wandb config for ESM2 650m''' + """Wandb config for ESM2 650m""" return WandbConfig( - entity='esm2-650m_pretraining', - project='esm2-650m_pretraining', - group='esm2-650m', - tags=['esm2-650m'], + entity="esm2-650m_pretraining", + project="esm2-650m_pretraining", + group="esm2-650m", + tags=["esm2-650m"], offline=True, anonymous=True, - id='1', + id="1", log_model=False, ) + def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: - '''Experiment config for ESM2 650m''' + """Experiment config for ESM2 650m""" return ExperimentConfig( save_every_n_steps=50, result_dir=result_dir, - experiment_name='esm2-650m', + experiment_name="esm2-650m", # TODO should this be exposed? 
- restore_from_checkpoint_path=None + restore_from_checkpoint_path=None, ) + def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - '''Recipe for ESM2 650m''' + """Recipe for ESM2 650m""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -182,27 +184,29 @@ def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConf return MainConfig( data_config=data_config, parallel_config=esm2_base_parallel_config(), - training_config=esm2_base_training_config(), # no changes for 8m + training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_650m_model_config(args.initial_ckpt_path), - optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m experiment_config=esm2_650m_experiment_config(args.result_dir), wandb_config=esm2_650m_wandb_config(), ) + def esm2_3b_parallel_config() -> ParallelConfig: - '''Parallel config for ESM2 3b''' + """Parallel config for ESM2 3b""" return ParallelConfig( tensor_model_parallel_size=2, pipeline_model_parallel_size=1, # TODO: is this correct? accumulate_grad_batches=1, - ddp='megatron', + ddp="megatron", # NOTE assumes 8xGPU node. Can always edit the config. num_devices=8, ) + def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - '''Model config for ESM2 3b''' + """Model config for ESM2 3b""" return ExposedESM2PretrainConfig( num_layers=36, hidden_size=2560, @@ -211,26 +215,28 @@ def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: seq_length=1024, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, - params_dtype='bf16-mixed', - pipeline_dtype='bf16-mixed', - autocast_dtype='bf16-mixed' + params_dtype="bf16-mixed", + pipeline_dtype="bf16-mixed", + autocast_dtype="bf16-mixed", ) + def esm2_3b_wandb_config() -> WandbConfig: - '''Wandb config for ESM2 3b''' + """Wandb config for ESM2 3b""" return WandbConfig( - entity='esm2-3b_pretraining', - project='esm2-3b_pretraining', - group='esm2-3b', - tags=['esm2-3b'], + entity="esm2-3b_pretraining", + project="esm2-3b_pretraining", + group="esm2-3b", + tags=["esm2-3b"], offline=True, anonymous=True, - id='1', + id="1", log_model=False, ) + def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - '''Recipe for ESM2 3b''' + """Recipe for ESM2 3b""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -245,9 +251,9 @@ def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig return MainConfig( data_config=data_config, parallel_config=esm2_3b_parallel_config(), - training_config=esm2_base_training_config(), # no changes for 8m + training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_3b_model_config(args.initial_ckpt_path), - optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m + optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m experiment_config=esm2_650m_experiment_config(args.result_dir), wandb_config=esm2_3b_wandb_config(), ) @@ -259,7 +265,7 @@ def simple_parallel_recipe( num_devices: int = 1, accumulate_grad_batches: int = 1, ) -> ParallelConfig: - '''Simple parallel recipe for ESM2''' + """Simple parallel recipe for ESM2""" assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size ), "devices must be divisible by tensor_model_parallel_size 
* pipeline_model_parallel_size" @@ -272,17 +278,17 @@ def simple_parallel_recipe( def tiny_train_config_recipe() -> TrainingConfig: - '''Tiny training config for ESM2''' + """Tiny training config for ESM2""" return TrainingConfig(max_steps=10, limit_val_batches=2, val_check_interval=2) def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - '''Default optimizer scheduler config for ESM2''' + """Default optimizer scheduler config for ESM2""" return OptimizerSchedulerConfig() def experiment_config_recipe(result_dir="./results") -> ExperimentConfig: - '''Experiment config for ESM2''' + """Experiment config for ESM2""" return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, @@ -303,7 +309,7 @@ def esm2_tiny_model_config( biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, variable_seq_lengths: bool = False, ) -> ExposedESM2PretrainConfig: - '''Model config for ESM2 tiny''' + """Model config for ESM2 tiny""" return ExposedESM2PretrainConfig( seq_length=seq_length, num_layers=2, @@ -320,8 +326,9 @@ def esm2_tiny_model_config( variable_seq_lengths=variable_seq_lengths, ) + def esm2_tiny_test_recipe(args): - '''Test recipe for ESM2 tiny''' + """Test recipe for ESM2 tiny""" parallel_config = simple_parallel_recipe() training_config = tiny_train_config_recipe() @@ -362,6 +369,7 @@ def esm2_tiny_test_recipe(args): ) return main_config + def main(): def parse_args(): parser = argparse.ArgumentParser(description="Create ESM2 configuration JSON.") @@ -434,5 +442,6 @@ def parse_args(): f.write(json_str) logging.info(f"Saved configuration to {args.dest=}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py index 79672139c9..25e6abfbc5 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/__init__.py @@ -12,5 +12,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index 91fd1aa132..e643224158 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -17,8 +17,6 @@ from dataclasses import dataclass, field from typing import List, Optional, Type -from nemo import lightning as nl -from nemo.lightning.pytorch import callbacks as nl_callbacks from nemo.utils import logging from tokenizers import Tokenizer @@ -28,10 +26,8 @@ from bionemo.geneformer.model.finetune_token_regressor import FineTuneSeqLenBioBertConfig from bionemo.llm.run.config_models import ( DataConfig, - ExperimentConfig, ExposedModelConfig, ) -from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger @dataclass @@ -43,8 +39,7 @@ class GeneformerDataArtifacts: class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): - ''' - Configuration class for Geneformer pretraining data. + """Configuration class for Geneformer pretraining data. 
Expects train/test/val to be prior split by directory and processed by `sub-packages/bionemo-geneformer/src/bionemo/geneformer/data/singlecell/sc_memmap.py`. @@ -65,7 +60,7 @@ class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): Preprocesses the data using a legacy preprocessor from BioNeMo 1 and returns the necessary artifacts. construct_data_module(global_batch_size: int) -> SingleCellDataModule: Constructs and returns a SingleCellDataModule using the preprocessed data artifacts. - ''' + """ # Shadow two attributes from the parent for visibility. data_dir: str @@ -105,7 +100,7 @@ def geneformer_preprocess(self) -> GeneformerDataArtifacts: raise ValueError("Preprocessing failed to create tokenizer and/or median dictionary.") def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: - ''' Downloads the requisite data artifacts and instantiates the DataModule. ''' + """Downloads the requisite data artifacts and instantiates the DataModule.""" geneformer_data_artifacts: GeneformerDataArtifacts = self.geneformer_preprocess() data = SingleCellDataModule( seq_length=self.seq_length, @@ -125,12 +120,13 @@ def construct_data_module(self, global_batch_size: int) -> SingleCellDataModule: class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): - ''' Exposes custom parameters for pretraining and binds the class to GeneformerConfig. + """Exposes custom parameters for pretraining and binds the class to GeneformerConfig. Attributes: initial_ckpt_path (str): Path to a directory containing checkpoint files for initializing the model. This is only initial_ckpt_skip_keys_with_these_prefixes (List[str]): Skip any layer that contains this key during restoration. Useful for finetuning, set the names of the task heads so checkpoint restoration does not errorniously try to restore these. 
- ''' + """ + # Custom parameters for FineTuning initial_ckpt_path: Optional[str] = None initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) @@ -154,5 +150,5 @@ class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBer initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"]) def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: - ''' binds the class to FineTuneSeqLenBioBertConfig ''' - return FineTuneSeqLenBioBertConfig \ No newline at end of file + """Binds the class to FineTuneSeqLenBioBertConfig""" + return FineTuneSeqLenBioBertConfig diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index e2eaf426af..76a6d0854d 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,7 +18,6 @@ import json from typing import Optional - from bionemo.geneformer.run.config_models import ( ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 5babf99c71..d0a44bb114 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -37,29 +37,34 @@ def geneformer_base_parallel_config() -> ParallelConfig: - '''Base parallel config for Geneformer''' + """Base parallel config for Geneformer""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, accumulate_grad_batches=1, - ddp='megatron', + ddp="megatron", num_devices=1, num_nodes=1, ) + def geneformer_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: - '''Base optimizer scheduler config for Geneformer''' - return OptimizerSchedulerConfig(lr=1e-3) # Matches bionemo1 + """Base optimizer scheduler config for Geneformer""" + return OptimizerSchedulerConfig(lr=1e-3) # Matches bionemo1 + def geneformer_base_training_config() -> TrainingConfig: - '''Base training config for Geneformer''' - return TrainingConfig(max_steps=400000, limit_val_batches=8, val_check_interval=100, precision='bf16-mixed') # matches bionemo1 + """Base training config for Geneformer""" + return TrainingConfig( + max_steps=400000, limit_val_batches=8, val_check_interval=100, precision="bf16-mixed" + ) # matches bionemo1 def geneformer_data_recipe(data_dir) -> GeneformerPretrainingDataConfig: """Recipe that produces the base geneformer small data configuration.""" return GeneformerPretrainingDataConfig(data_dir=data_dir) + # 10m definitions def geneformer_10m_model_config( seq_length: int = 2048, @@ -68,7 +73,7 @@ def geneformer_10m_model_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: - '''Geneformer 10m model config settings''' + """Geneformer 10m model config settings""" geneformer_config = ExposedGeneformerPretrainConfig( num_layers=6, hidden_size=256, @@ -104,64 +109,70 @@ def geneformer_10m_model_config( ) return geneformer_config + def geneformer_10m_experiment_config(result_dir) -> ExperimentConfig: - '''Experiment config for Geneformer 10m''' + """Experiment config for Geneformer 10m""" return ExperimentConfig( 
save_every_n_steps=100, result_dir=result_dir, experiment_name="geneformer-10m", - restore_from_checkpoint_path=None + restore_from_checkpoint_path=None, ) + def geneformer_10m_wandb_config() -> WandbConfig: - '''Wandb config for Geneformer 10m''' + """Wandb config for Geneformer 10m""" wandb_config = WandbConfig( - entity='geneformer-10m_pretraining', - project='geneformer-10m_pretraining', - group='geneformer-10m', - tags=['geneformer-10m'], + entity="geneformer-10m_pretraining", + project="geneformer-10m_pretraining", + group="geneformer-10m", + tags=["geneformer-10m"], offline=True, anonymous=True, - id='1', + id="1", log_model=False, ) return wandb_config + # 106m definition, model, experiment, wandb, parallel def geneformer_106m_parallel_config() -> ParallelConfig: - '''Base parallel config for Geneformer''' + """Base parallel config for Geneformer""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, accumulate_grad_batches=1, - ddp='megatron', + ddp="megatron", num_devices=8, num_nodes=1, ) + def geneformer_106m_experiment_config(result_dir) -> ExperimentConfig: - '''Experiment config for Geneformer 106m''' + """Experiment config for Geneformer 106m""" return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, experiment_name="geneformer-106m", - restore_from_checkpoint_path=None + restore_from_checkpoint_path=None, ) + def geneformer_106m_wandb_config() -> WandbConfig: - '''Wandb config for Geneformer 106m''' + """Wandb config for Geneformer 106m""" wandb_config = WandbConfig( - entity='geneformer-106m_pretraining', - project='geneformer-106m_pretraining', - group='geneformer-106m', - tags=['geneformer-106m'], + entity="geneformer-106m_pretraining", + project="geneformer-106m_pretraining", + group="geneformer-106m", + tags=["geneformer-106m"], offline=True, anonymous=True, - id='1', + id="1", log_model=False, ) return wandb_config + def geneformer_106m_model_config( seq_length: int = 2048, precision: PrecisionTypes = "bf16-mixed", @@ -169,7 +180,7 @@ def geneformer_106m_model_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: - '''Geneformer 106m model config settings''' + """Geneformer 106m model config settings""" geneformer_config = ExposedGeneformerPretrainConfig( num_layers=12, hidden_size=768, @@ -212,7 +223,7 @@ def simple_parallel_recipe( num_devices: int = 1, accumulate_grad_batches: int = 1, ) -> ParallelConfig: - '''Simple parallel config for Geneformer, only used in testing.''' + """Simple parallel config for Geneformer, only used in testing.""" assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" @@ -230,7 +241,7 @@ def geneformer_finetuning_regression_head_recipe( initial_ckpt_path: Optional[str] = None, initial_ckpt_skip_keys_with_these_prefixes: Optional[List[str]] = None, ) -> ExposedFineTuneSeqLenBioBertConfig: - '''Recipe for finetuning a regression head on the masked tokens.''' + """Recipe for finetuning a regression head on the masked tokens.""" partial_finetuning_config = partial( ExposedFineTuneSeqLenBioBertConfig, params_dtype=precision, @@ -251,7 +262,7 @@ def geneformer_finetuning_regression_head_recipe( def default_trainer_config_recipe() -> TrainingConfig: - '''Default trainer config for Geneformer''' + """Default trainer config for Geneformer""" return 
TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) @@ -262,7 +273,7 @@ def geneformer_10m_finetune_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option=BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedFineTuneSeqLenBioBertConfig: - '''Geneformer 10m finetuning config settings''' + """Geneformer 10m finetuning config settings""" geneformer_config = ExposedFineTuneSeqLenBioBertConfig( num_layers=6, hidden_size=256, @@ -306,7 +317,7 @@ def geneformer_tiny_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: - '''Geneformer tiny model config settings, used in testing.''' + """Geneformer tiny model config settings, used in testing.""" geneformer_config = ExposedGeneformerPretrainConfig( num_layers=2, hidden_size=32, @@ -344,12 +355,12 @@ def geneformer_tiny_config( def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - '''Default optimizer scheduler config for Geneformer. See OptimizerSchedulerConfig for defaults.''' + """Default optimizer scheduler config for Geneformer. See OptimizerSchedulerConfig for defaults.""" return OptimizerSchedulerConfig() def experiment_config_recipe() -> ExperimentConfig: - '''Default experiment config for Geneformer. Used in testing. ''' + """Default experiment config for Geneformer. Used in testing.""" return ExperimentConfig( save_every_n_steps=100, result_dir="./results", @@ -363,7 +374,7 @@ def experiment_config_recipe() -> ExperimentConfig: def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: - '''Recipe for finetuning a regression head on the masked tokens.''' + """Recipe for finetuning a regression head on the masked tokens.""" data_path = args.data_path result_dir = args.result_dir @@ -406,7 +417,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: - '''Recipe for pretraining a tiny model. Used in testing.''' + """Recipe for pretraining a tiny model. Used in testing.""" data_path = args.data_path result_dir = args.result_dir @@ -451,7 +462,7 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi def geneformer_10m_pretrain_recipe( args, ) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: - '''Recipe for pretraining the 10m model.''' + """Recipe for pretraining the 10m model.""" data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = simple_parallel_recipe() training_config = geneformer_base_training_config() @@ -470,10 +481,11 @@ def geneformer_10m_pretrain_recipe( ) return main_config + def geneformer_106m_pretrain_recipe( args, ) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: - '''Recipe for pretraining the 106m model. Uses 8 GPUs for data parallelism''' + """Recipe for pretraining the 106m model. 
Uses 8 GPUs for data parallelism""" data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = geneformer_106m_parallel_config() training_config = geneformer_base_training_config() @@ -493,11 +505,10 @@ def geneformer_106m_pretrain_recipe( return main_config - def geneformer_10m_finetune_recipe( args, ) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, GeneformerPretrainingDataConfig]: - '''Recipe for finetuning the 10m model on a token regression head. Used as an example and for testing. ''' + """Recipe for finetuning the 10m model on a token regression head. Used as an example and for testing.""" data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = simple_parallel_recipe() training_config = default_trainer_config_recipe() diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 7d32455186..be5a504d90 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -93,7 +93,7 @@ class Config: arbitrary_types_allowed = True def model_class(self) -> Type[ModelConfigT]: - '''Returns the underlying model class that this config wraps.''' + """Returns the underlying model class that this config wraps.""" raise NotImplementedError def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": @@ -190,8 +190,7 @@ def validate_activation_func(cls, activation_func: str) -> Callable: @field_serializer("activation_func") def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tensor]) -> str: - """ - Serializes a given activation function to its corresponding string representation. + """Serializes a given activation function to its corresponding string representation. By default, all activation functions from `torch.nn.functional` are serialized to their name. User defined activation functions should also be defined here with a custom mapping in CUSTOM_ACTIVATION_FNS defined at the @@ -207,7 +206,6 @@ def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tenso Raises: ValueError: If the activation function is not supported. 
""" - func_name = v.__name__ func = getattr(torch.nn.functional, func_name, None) if func is not None: @@ -220,12 +218,12 @@ def serialize_activation_func(self, v: Callable[[torch.Tensor, Any], torch.Tenso @field_validator("params_dtype", "pipeline_dtype", "autocast_dtype", mode="before") @classmethod def precision_validator(cls, v: dtypes.PrecisionTypes) -> torch.dtype: - '''Validates the precision type and returns the corresponding torch dtype.''' + """Validates the precision type and returns the corresponding torch dtype.""" return dtypes.get_autocast_dtype(v) @field_serializer("params_dtype", "pipeline_dtype", "autocast_dtype") def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: - '''Serializes the torch dtype to the corresponding precision type.''' + """Serializes the torch dtype to the corresponding precision type.""" return dtypes.dtype_to_precision[v] @@ -240,7 +238,7 @@ class ParallelConfig(BaseModel): @model_validator(mode="after") def validate_devices(self): - '''Validates the number of devices based on the tensor and pipeline model parallel sizes.''' + """Validates the number of devices based on the tensor and pipeline model parallel sizes.""" if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: raise ValidationError( "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" @@ -249,8 +247,8 @@ def validate_devices(self): class TrainingConfig(BaseModel): - """ - TrainingConfig is a configuration class for training models. + """TrainingConfig is a configuration class for training models. + Attributes: max_steps (int): The maximum number of training steps. limit_val_batches (int | float): The number of validation batches to use. Can be a fraction or a count. @@ -260,15 +258,15 @@ class TrainingConfig(BaseModel): """ max_steps: int - limit_val_batches: int | float # Because this can be a fraction or a count... + limit_val_batches: int | float # Because this can be a fraction or a count... val_check_interval: int precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" accelerator: str = "gpu" class OptimizerSchedulerConfig(BaseModel): - """ - Configuration for the optimizer and learning rate scheduler. + """Configuration for the optimizer and learning rate scheduler. + Attributes: lr (float): Learning rate for the optimizer. Default is 1e-4. optimizer (str): Type of optimizer to use. Default is "adam". @@ -287,8 +285,8 @@ class OptimizerSchedulerConfig(BaseModel): class ExperimentConfig(BaseModel): - """ - Configuration class for setting up and managing experiment parameters. + """Configuration class for setting up and managing experiment parameters. + Attributes: save_every_n_steps (int): Number of steps between saving checkpoints. result_dir (str | pathlib.Path): Directory where results will be saved. 
@@ -351,16 +349,16 @@ class while still allowing for custom validation global logic to be implemented @model_validator(mode="after") def validate_master_config(self) -> "MainConfig": - '''Validates the master configuration object.''' + """Validates the master configuration object.""" self.bionemo_model_config.seq_length = self.data_config.seq_length return self @model_validator(mode="after") def run_bionemo_model_config_model_validators(self) -> "MainConfig": - '''Runs the model validators on the bionemo_model_config.''' + """Runs the model validators on the bionemo_model_config.""" return self.bionemo_model_config.model_validator(self) @model_validator(mode="after") def run_data_config_model_validators(self) -> "MainConfig": - '''Runs the model validators on the data_config.''' + """Runs the model validators on the data_config.""" return self.data_config.model_validator(self) From 0306f9480fa4f34652e7c51449f116887ebbb5d8 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 18:09:54 +0000 Subject: [PATCH 38/58] formatting --- .../src/bionemo/geneformer/run/recipes.py | 8 +-- .../src/bionemo/llm/run/config_models.py | 26 ++++++--- .../bionemo-llm/src/bionemo/llm/train.py | 53 ++++++++++++++++++- .../src/bionemo/llm/utils/logger_utils.py | 2 +- 4 files changed, 77 insertions(+), 12 deletions(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index d0a44bb114..86b717b627 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -262,7 +262,7 @@ def geneformer_finetuning_regression_head_recipe( def default_trainer_config_recipe() -> TrainingConfig: - """Default trainer config for Geneformer""" + """Default trainer config for Geneformer.""" return TrainingConfig(max_steps=55000, limit_val_batches=2, val_check_interval=100) @@ -273,7 +273,7 @@ def geneformer_10m_finetune_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option=BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedFineTuneSeqLenBioBertConfig: - """Geneformer 10m finetuning config settings""" + """Geneformer 10m finetuning config settings.""" geneformer_config = ExposedFineTuneSeqLenBioBertConfig( num_layers=6, hidden_size=256, @@ -485,7 +485,7 @@ def geneformer_10m_pretrain_recipe( def geneformer_106m_pretrain_recipe( args, ) -> MainConfig[ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig]: - """Recipe for pretraining the 106m model. Uses 8 GPUs for data parallelism""" + """Recipe for pretraining the 106m model. 
Uses 8 GPUs for data parallelism.""" data_config: GeneformerPretrainingDataConfig = geneformer_data_recipe(data_dir=args.data_path) parallel_config = geneformer_106m_parallel_config() training_config = geneformer_base_training_config() @@ -537,7 +537,7 @@ def geneformer_10m_finetune_recipe( return main_config -def main(): +def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") parser.add_argument( diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index be5a504d90..e5ef95f4c7 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -89,7 +89,7 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) # Pydantic stuff to allow arbitrary types + validators + serializers - class Config: + class Config: # noqa: D106 arbitrary_types_allowed = True def model_class(self) -> Type[ModelConfigT]: @@ -161,11 +161,11 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: - """Validates the activation function, assumes this function exists in torch.nn.functional. For custom - activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. - - This method validates the provided activation function string and returns - a callable function based on the validation context using the provided validator in the base class. + """Validates the activation function, assumes this function exists in torch.nn.functional. + + For custom activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. This method + validates the provided activation function string and returns a callable function based on the validation + context using the provided validator in the base class. Args: activation_func (str): The activation function to be validated. @@ -228,6 +228,20 @@ def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: class ParallelConfig(BaseModel): + """ + ParallelConfig is a configuration class for setting up parallelism in model training. + Attributes: + tensor_model_parallel_size (int): The size of the tensor model parallelism. Default is 1. + pipeline_model_parallel_size (int): The size of the pipeline model parallelism. Default is 1. + accumulate_grad_batches (int): The number of batches to accumulate gradients over. Default is 1. + ddp (Literal["megatron"]): The distributed data parallel method to use. Default is "megatron". + remove_unused_parameters (bool): Whether to remove unused parameters. Default is True. + num_devices (int): The number of devices to use. Default is 1. + num_nodes (int): The number of nodes to use. Default is 1. + Methods: + validate_devices(): Validates the number of devices based on the tensor and pipeline model parallel sizes. 
+ """ + tensor_model_parallel_size: int = 1 pipeline_model_parallel_size: int = 1 accumulate_grad_batches: int = 1 diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index a397c49acb..d80d2af45c 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -45,6 +45,17 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: + """ + Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations. + + Args: + experiment_config (ExperimentConfig): Configuration object containing experiment settings such as + result directory, experiment name, checkpoint settings, and logger preferences. + wandb_config (Optional[WandbConfig]): Optional configuration object for Weights and Biases logging. + Returns: + nl.NeMoLogger: An instance of NeMoLogger configured with the specified settings. + """ + checkpoint_callback = nl_callbacks.ModelCheckpoint( save_last=experiment_config.save_last_checkpoint, monitor=experiment_config.metric_to_monitor_for_checkpoints, @@ -64,6 +75,20 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None) -> nl.Trainer: + """ + Set up the trainer for model training using the specified parallel and training configurations. + + Args: + parallel_config (ParallelConfig): Configuration for parallelism, including tensor and pipeline model parallel sizes, + number of devices, and number of nodes. + training_config (TrainingConfig): Configuration for training, including maximum steps, accelerator type, + validation batch limit, validation check interval, and precision. + callbacks (list, optional): List of callback functions to be used during training. Defaults to None, + in which case default callbacks (RichModelSummary and LearningRateMonitor) are used. + Returns: + nl.Trainer: Configured trainer object ready for model training. + """ + strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, @@ -97,6 +122,17 @@ def biobert_lightning_module( optim_config: OptimizerSchedulerConfig, num_steps: int, ) -> BioBertLightningModule: + """ + Creates a BioBertLightningModule with the specified configuration, tokenizer, and optimizer settings. + + Args: + bionemo_model_config (BioBertConfig): Configuration for the BioBert model. + tokenizer (Tokenizer): Tokenizer to be used with the model. + optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and learning rate scheduler. + num_steps (int): Total number of training steps. + Returns: + BioBertLightningModule: An instance of BioBertLightningModule configured with the provided settings. + """ model = BioBertLightningModule( bionemo_model_config, tokenizer=tokenizer, @@ -129,8 +165,23 @@ def train( optim_config: OptimizerSchedulerConfig, experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig], - resume_if_exists: bool = True, + resume_if_exists: bool = True ): + """ + Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary + variants for this method. + + Args: + bionemo_exposed_model_config (ExposedModelConfig): Configuration for the exposed BioNemo model. 
+ data_config (DataConfig[DataModuleT]): Configuration for the data module. + parallel_config (ParallelConfig): Configuration for parallel training. + training_config (TrainingConfig): Configuration for training parameters. + optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and scheduler. + experiment_config (ExperimentConfig): Configuration for the experiment. + wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging. + resume_if_exists (bool, optional): Flag to resume training if a checkpoint exists. Defaults to True. + """ + bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py index d7fb67968f..5f2f6b1957 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py @@ -68,7 +68,7 @@ def setup_nemo_lightning_logger( name: The name of the experiment. Results go into `root_dir`/`name` root_dir: The root directory to create the `name` directory in for saving run results. initialize_tensorboard_logger: Whether to initialize the tensorboard logger. - wandb_kwargs: The kwargs for the wandb logger. + wandb_config: The remaining configuration options for the wandb logger. ckpt_callback: The checkpoint callback to use, must be a child of the pytorch lightning ModelCheckpoint callback. NOTE the type annotation in the underlying NeMoCheckpoint constructor is incorrect. **kwargs: The kwargs for the NeMoLogger. From a361c8b02a6137f5b89f720f62da36e90c03a0f0 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 18:17:40 +0000 Subject: [PATCH 39/58] formatting --- .../src/bionemo/esm2/run/config_models.py | 57 +++++++++---------- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 2 +- .../src/bionemo/esm2/run/recipes.py | 44 +++++++------- .../bionemo/geneformer/run/config_models.py | 11 ++-- .../src/bionemo/geneformer/run/main.py | 2 +- .../src/bionemo/geneformer/run/recipes.py | 22 +++---- .../src/bionemo/llm/run/config_models.py | 15 ++--- .../bionemo-llm/src/bionemo/llm/train.py | 29 ++++------ 8 files changed, 89 insertions(+), 93 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index 95f73ae043..436093df11 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -89,34 +89,33 @@ def construct_data_module(self, global_batch_size: int) -> ESMDataModule: class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): - class ExposedESM2PretrainConfig: - """Configuration class for ESM2 pretraining with select exposed parameters. - - See the inherited ExposedModelConfig for attributes and methods from the base class. Use this class either - as a template or extension for custom configurations. Importantly, these kinds of classes should do two things, - select attributes to expose to the user, and provide validation and serialization any attributes. - - Attributes: - use_esm_attention (bool): Flag to skip ESM2 custom attention for TE acceleration. Defaults to False. - token_dropout (bool): Flag to enable token dropout. Defaults to True. - normalize_attention_scores (bool): Flag to normalize attention scores. 
Defaults to False. - variable_seq_lengths (bool): Flag to enable variable sequence lengths. Defaults to False. - core_attention_override (Optional[Type[torch.nn.Module]]): Optional override for core attention module. Defaults to None. - - Methods: - restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: - Validates the BiobertSpecOption to ensure it is compatible with ESM2. - serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: - Serializes the core attention override module to a string. - validate_core_attention_override(cls, value): - Validates the core attention override module, ensuring it is a subclass of torch.nn.Module. - validate_and_set_attention_and_scaling(self): - Validates and sets the attention and scaling parameters based on the biobert_spec_option. - model_validator(self, global_cfg: MainConfig) -> MainConfig: - Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings. - model_class(self) -> Type[ESM2Config]: - Returns the model class associated with this configuration. - """ + """Configuration class for ESM2 pretraining with select exposed parameters. + + See the inherited ExposedModelConfig for attributes and methods from the base class. Use this class either + as a template or extension for custom configurations. Importantly, these kinds of classes should do two things, + select attributes to expose to the user, and provide validation and serialization any attributes. + + Attributes: + use_esm_attention (bool): Flag to skip ESM2 custom attention for TE acceleration. Defaults to False. + token_dropout (bool): Flag to enable token dropout. Defaults to True. + normalize_attention_scores (bool): Flag to normalize attention scores. Defaults to False. + variable_seq_lengths (bool): Flag to enable variable sequence lengths. Defaults to False. + core_attention_override (Optional[Type[torch.nn.Module]]): Optional override for core attention module. Defaults to None. + + Methods: + restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: + Validates the BiobertSpecOption to ensure it is compatible with ESM2. + serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: + Serializes the core attention override module to a string. + validate_core_attention_override(cls, value): + Validates the core attention override module, ensuring it is a subclass of torch.nn.Module. + validate_and_set_attention_and_scaling(self): + Validates and sets the attention and scaling parameters based on the biobert_spec_option. + model_validator(self, global_cfg: MainConfig) -> MainConfig: + Validates the global configuration, ensuring compatibility with ESM2DataConfig and parallel settings. + model_class(self) -> Type[ESM2Config]: + Returns the model class associated with this configuration. + """ use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. token_dropout: bool = True @@ -127,7 +126,7 @@ class ExposedESM2PretrainConfig: @field_validator("biobert_spec_option", mode="after") @classmethod def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: - """Validates the BiobertSpecOption to ensure it is compatible with ESM2. by restricting it to the specs compatable with ESM2""" + """Validates the BiobertSpecOption to ensure it is compatible with ESM2. 
by restricting it to the specs compatable with ESM2.""" if biobert_spec_option in ( BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, BiobertSpecOption.esm2_bert_layer_local_spec, diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index 10a4a94956..eb77ca23ed 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -23,7 +23,7 @@ from bionemo.llm.train import train -def main(): +def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Run Geneformer pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 1bc2505d64..1f7fa1c340 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -34,12 +34,12 @@ def esm2_base_training_config() -> TrainingConfig: - """Base training config for ESM2""" + """Base training config for ESM2.""" return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=1500, precision="bf16-mixed") def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: - """Base optimizer scheduler config for ESM2""" + """Base optimizer scheduler config for ESM2.""" return OptimizerSchedulerConfig( optimizer="adam", lr=4e-4, @@ -51,7 +51,7 @@ def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: def esm2_base_parallel_config() -> ParallelConfig: - """Base parallel config for ESM2""" + """Base parallel config for ESM2.""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, @@ -63,7 +63,7 @@ def esm2_base_parallel_config() -> ParallelConfig: def esm2_8m_wandb_config() -> WandbConfig: - """Wandb config for ESM2 8m""" + """Wandb config for ESM2 8m.""" wandb_config = WandbConfig( entity="esm2-8m_pretraining", project="esm2-8m_pretraining", @@ -78,7 +78,7 @@ def esm2_8m_wandb_config() -> WandbConfig: def esm2_8m_experiment_config(result_dir) -> ExperimentConfig: - """Experiment config for ESM2 8m""" + """Experiment config for ESM2 8m.""" return ExperimentConfig( save_every_n_steps=50, # default set in previous script. 
result_dir=result_dir, @@ -88,7 +88,7 @@ def esm2_8m_experiment_config(result_dir) -> ExperimentConfig: def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - """Model config for ESM2 8m""" + """Model config for ESM2 8m.""" return ExposedESM2PretrainConfig( num_layers=6, hidden_size=320, @@ -104,7 +104,7 @@ def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - """Recipe for ESM2 8m""" + """Recipe for ESM2 8m.""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -128,7 +128,7 @@ def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - """Model config for ESM2 650m""" + """Model config for ESM2 650m.""" return ExposedESM2PretrainConfig( num_layers=6, hidden_size=1280, @@ -144,7 +144,7 @@ def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: def esm2_650m_wandb_config() -> WandbConfig: - """Wandb config for ESM2 650m""" + """Wandb config for ESM2 650m.""" return WandbConfig( entity="esm2-650m_pretraining", project="esm2-650m_pretraining", @@ -158,7 +158,7 @@ def esm2_650m_wandb_config() -> WandbConfig: def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: - """Experiment config for ESM2 650m""" + """Experiment config for ESM2 650m.""" return ExperimentConfig( save_every_n_steps=50, result_dir=result_dir, @@ -169,7 +169,7 @@ def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - """Recipe for ESM2 650m""" + """Recipe for ESM2 650m.""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -193,7 +193,7 @@ def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConf def esm2_3b_parallel_config() -> ParallelConfig: - """Parallel config for ESM2 3b""" + """Parallel config for ESM2 3b.""" return ParallelConfig( tensor_model_parallel_size=2, pipeline_model_parallel_size=1, @@ -206,7 +206,7 @@ def esm2_3b_parallel_config() -> ParallelConfig: def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: - """Model config for ESM2 3b""" + """Model config for ESM2 3b.""" return ExposedESM2PretrainConfig( num_layers=36, hidden_size=2560, @@ -222,7 +222,7 @@ def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: def esm2_3b_wandb_config() -> WandbConfig: - """Wandb config for ESM2 3b""" + """Wandb config for ESM2 3b.""" return WandbConfig( entity="esm2-3b_pretraining", project="esm2-3b_pretraining", @@ -236,7 +236,7 @@ def esm2_3b_wandb_config() -> WandbConfig: def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: - """Recipe for ESM2 3b""" + """Recipe for ESM2 3b.""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -265,7 +265,7 @@ def simple_parallel_recipe( num_devices: int = 1, accumulate_grad_batches: int = 1, ) -> ParallelConfig: - """Simple parallel recipe for ESM2""" + """Simple parallel recipe for ESM2.""" assert ( num_devices >= tensor_model_parallel_size * pipeline_model_parallel_size ), "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" @@ -278,17 +278,17 @@ def simple_parallel_recipe( def tiny_train_config_recipe() -> TrainingConfig: - """Tiny training config for ESM2""" + """Tiny training config for ESM2.""" return 
TrainingConfig(max_steps=10, limit_val_batches=2, val_check_interval=2) def default_adam_optimizer_with_cosine_annealing_recipe() -> OptimizerSchedulerConfig: - """Default optimizer scheduler config for ESM2""" + """Default optimizer scheduler config for ESM2.""" return OptimizerSchedulerConfig() def experiment_config_recipe(result_dir="./results") -> ExperimentConfig: - """Experiment config for ESM2""" + """Experiment config for ESM2.""" return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, @@ -309,7 +309,7 @@ def esm2_tiny_model_config( biobert_spec_option: BiobertSpecOption = BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, variable_seq_lengths: bool = False, ) -> ExposedESM2PretrainConfig: - """Model config for ESM2 tiny""" + """Model config for ESM2 tiny, used for testing.""" return ExposedESM2PretrainConfig( seq_length=seq_length, num_layers=2, @@ -328,7 +328,7 @@ def esm2_tiny_model_config( def esm2_tiny_test_recipe(args): - """Test recipe for ESM2 tiny""" + """Test recipe for ESM2 tiny, used for testing.""" parallel_config = simple_parallel_recipe() training_config = tiny_train_config_recipe() @@ -370,7 +370,7 @@ def esm2_tiny_test_recipe(args): return main_config -def main(): +def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Create ESM2 configuration JSON.") parser.add_argument( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py index e643224158..ff64d45f58 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/config_models.py @@ -71,19 +71,20 @@ class GeneformerPretrainingDataConfig(DataConfig[SingleCellDataModule]): num_dataset_workers: int = 0 @property - def train_data_path(self) -> str: + def train_data_path(self) -> str: # noqa: D102 return self.data_dir + "/train" @property - def val_data_path(self) -> str: + def val_data_path(self) -> str: # noqa: D102 return self.data_dir + "/val" @property - def test_data_path(self) -> str: + def test_data_path(self) -> str: # noqa: D102 return self.data_dir + "/test" def geneformer_preprocess(self) -> GeneformerDataArtifacts: """Geneformer datamodule expects certain artifacts to be present in the data directory. + This method uses a legacy 'preprocessor' from BioNeMo 1 to acquire the associated artifacts. 
""" preprocessor = GeneformerPreprocess( @@ -131,7 +132,7 @@ class ExposedGeneformerPretrainConfig(ExposedModelConfig[GeneformerConfig]): initial_ckpt_path: Optional[str] = None initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) - def model_class(self) -> Type[GeneformerConfig]: + def model_class(self) -> Type[GeneformerConfig]: # noqa: D102 return GeneformerConfig @@ -150,5 +151,5 @@ class ExposedFineTuneSeqLenBioBertConfig(ExposedModelConfig[FineTuneSeqLenBioBer initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"]) def model_class(self) -> Type[FineTuneSeqLenBioBertConfig]: - """Binds the class to FineTuneSeqLenBioBertConfig""" + """Binds the class to FineTuneSeqLenBioBertConfig.""" return FineTuneSeqLenBioBertConfig diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 76a6d0854d..79d7577e9b 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -27,7 +27,7 @@ from bionemo.llm.train import train -def main(): +def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Run Geneformer pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 86b717b627..e56b08c080 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -37,7 +37,7 @@ def geneformer_base_parallel_config() -> ParallelConfig: - """Base parallel config for Geneformer""" + """Base parallel config for Geneformer.""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, @@ -49,12 +49,12 @@ def geneformer_base_parallel_config() -> ParallelConfig: def geneformer_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: - """Base optimizer scheduler config for Geneformer""" + """Base optimizer scheduler config for Geneformer.""" return OptimizerSchedulerConfig(lr=1e-3) # Matches bionemo1 def geneformer_base_training_config() -> TrainingConfig: - """Base training config for Geneformer""" + """Base training config for Geneformer.""" return TrainingConfig( max_steps=400000, limit_val_batches=8, val_check_interval=100, precision="bf16-mixed" ) # matches bionemo1 @@ -73,7 +73,7 @@ def geneformer_10m_model_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: - """Geneformer 10m model config settings""" + """Geneformer 10m model config settings.""" geneformer_config = ExposedGeneformerPretrainConfig( num_layers=6, hidden_size=256, @@ -111,7 +111,7 @@ def geneformer_10m_model_config( def geneformer_10m_experiment_config(result_dir) -> ExperimentConfig: - """Experiment config for Geneformer 10m""" + """Experiment config for Geneformer 10m.""" return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, @@ -121,7 +121,7 @@ def geneformer_10m_experiment_config(result_dir) -> ExperimentConfig: def geneformer_10m_wandb_config() -> WandbConfig: - """Wandb config for Geneformer 10m""" + """Wandb config for Geneformer 10m.""" wandb_config = 
WandbConfig( entity="geneformer-10m_pretraining", project="geneformer-10m_pretraining", @@ -137,7 +137,7 @@ def geneformer_10m_wandb_config() -> WandbConfig: # 106m definition, model, experiment, wandb, parallel def geneformer_106m_parallel_config() -> ParallelConfig: - """Base parallel config for Geneformer""" + """Base parallel config for Geneformer.""" return ParallelConfig( tensor_model_parallel_size=1, pipeline_model_parallel_size=1, @@ -149,7 +149,7 @@ def geneformer_106m_parallel_config() -> ParallelConfig: def geneformer_106m_experiment_config(result_dir) -> ExperimentConfig: - """Experiment config for Geneformer 106m""" + """Experiment config for Geneformer 106m.""" return ExperimentConfig( save_every_n_steps=100, result_dir=result_dir, @@ -159,7 +159,7 @@ def geneformer_106m_experiment_config(result_dir) -> ExperimentConfig: def geneformer_106m_wandb_config() -> WandbConfig: - """Wandb config for Geneformer 106m""" + """Wandb config for Geneformer 106m.""" wandb_config = WandbConfig( entity="geneformer-106m_pretraining", project="geneformer-106m_pretraining", @@ -180,7 +180,7 @@ def geneformer_106m_model_config( initial_ckpt_path: Optional[str] = None, biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec, ) -> ExposedGeneformerPretrainConfig: - """Geneformer 106m model config settings""" + """Geneformer 106m model config settings.""" geneformer_config = ExposedGeneformerPretrainConfig( num_layers=12, hidden_size=768, @@ -537,7 +537,7 @@ def geneformer_10m_finetune_recipe( return main_config -def main(): # noqa: D103 +def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Create Geneformer configuration JSON.") parser.add_argument( diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index e5ef95f4c7..9aa0d7c69b 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -89,7 +89,7 @@ class ExposedModelConfig(BaseModel, Generic[ModelConfigT], ABC): initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=list) # Pydantic stuff to allow arbitrary types + validators + serializers - class Config: # noqa: D106 + class Config: # noqa: D106 arbitrary_types_allowed = True def model_class(self) -> Type[ModelConfigT]: @@ -161,10 +161,10 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: - """Validates the activation function, assumes this function exists in torch.nn.functional. - - For custom activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. This method - validates the provided activation function string and returns a callable function based on the validation + """Validates the activation function, assumes this function exists in torch.nn.functional. + + For custom activation functions, use the CUSTOM_ACTIVATION_FUNCTIONS dictionary in the module. This method + validates the provided activation function string and returns a callable function based on the validation context using the provided validator in the base class. Args: @@ -228,8 +228,8 @@ def serialize_dtypes(self, v: torch.dtype) -> dtypes.PrecisionTypes: class ParallelConfig(BaseModel): - """ - ParallelConfig is a configuration class for setting up parallelism in model training. 
+ """ParallelConfig is a configuration class for setting up parallelism in model training. + Attributes: tensor_model_parallel_size (int): The size of the tensor model parallelism. Default is 1. pipeline_model_parallel_size (int): The size of the pipeline model parallelism. Default is 1. @@ -238,6 +238,7 @@ class ParallelConfig(BaseModel): remove_unused_parameters (bool): Whether to remove unused parameters. Default is True. num_devices (int): The number of devices to use. Default is 1. num_nodes (int): The number of nodes to use. Default is 1. + Methods: validate_devices(): Validates the number of devices based on the tensor and pipeline model parallel sizes. """ diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index d80d2af45c..1920ec734e 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -45,17 +45,16 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: - """ - Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations. + """Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations. Args: - experiment_config (ExperimentConfig): Configuration object containing experiment settings such as + experiment_config (ExperimentConfig): Configuration object containing experiment settings such as result directory, experiment name, checkpoint settings, and logger preferences. wandb_config (Optional[WandbConfig]): Optional configuration object for Weights and Biases logging. + Returns: nl.NeMoLogger: An instance of NeMoLogger configured with the specified settings. """ - checkpoint_callback = nl_callbacks.ModelCheckpoint( save_last=experiment_config.save_last_checkpoint, monitor=experiment_config.metric_to_monitor_for_checkpoints, @@ -75,20 +74,19 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None) -> nl.Trainer: - """ - Set up the trainer for model training using the specified parallel and training configurations. + """Set up the trainer for model training using the specified parallel and training configurations. Args: - parallel_config (ParallelConfig): Configuration for parallelism, including tensor and pipeline model parallel sizes, + parallel_config (ParallelConfig): Configuration for parallelism, including tensor and pipeline model parallel sizes, number of devices, and number of nodes. - training_config (TrainingConfig): Configuration for training, including maximum steps, accelerator type, + training_config (TrainingConfig): Configuration for training, including maximum steps, accelerator type, validation batch limit, validation check interval, and precision. - callbacks (list, optional): List of callback functions to be used during training. Defaults to None, + callbacks (list, optional): List of callback functions to be used during training. Defaults to None, in which case default callbacks (RichModelSummary and LearningRateMonitor) are used. + Returns: nl.Trainer: Configured trainer object ready for model training. 
""" - strategy = nl.MegatronStrategy( tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, @@ -122,14 +120,14 @@ def biobert_lightning_module( optim_config: OptimizerSchedulerConfig, num_steps: int, ) -> BioBertLightningModule: - """ - Creates a BioBertLightningModule with the specified configuration, tokenizer, and optimizer settings. + """Creates a BioBertLightningModule with the specified configuration, tokenizer, and optimizer settings. Args: bionemo_model_config (BioBertConfig): Configuration for the BioBert model. tokenizer (Tokenizer): Tokenizer to be used with the model. optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and learning rate scheduler. num_steps (int): Total number of training steps. + Returns: BioBertLightningModule: An instance of BioBertLightningModule configured with the provided settings. """ @@ -165,11 +163,9 @@ def train( optim_config: OptimizerSchedulerConfig, experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig], - resume_if_exists: bool = True + resume_if_exists: bool = True, ): - """ - Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary - variants for this method. + """Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary variants for this method. Args: bionemo_exposed_model_config (ExposedModelConfig): Configuration for the exposed BioNemo model. @@ -181,7 +177,6 @@ def train( wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging. resume_if_exists (bool, optional): Flag to resume training if a checkpoint exists. Defaults to True. """ - bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() pathlib.Path(data_config.result_dir).mkdir(parents=True, exist_ok=True) From a4d0870ee6a821e3da7c45e55d4f919092ef1bbf Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 23:24:43 +0000 Subject: [PATCH 40/58] refactor train to match new lightning module workflow --- .../bionemo-llm/src/bionemo/llm/train.py | 68 +++++++------------ 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 1920ec734e..7074200039 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -18,6 +18,7 @@ import pathlib from typing import Optional +from bionemo.llm.lightning import BionemoLightningModule from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -29,7 +30,7 @@ from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary from tokenizers import Tokenizer -from bionemo.llm.model.biobert.lightning import BioBertLightningModule +from bionemo.llm.model.biobert.lightning import BioBertLightningModule, biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.run.config_models import ( DataConfig, @@ -114,47 +115,6 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf return trainer -def biobert_lightning_module( - bionemo_model_config: BioBertConfig, - tokenizer: Tokenizer, - optim_config: OptimizerSchedulerConfig, - num_steps: int, -) -> BioBertLightningModule: - """Creates a BioBertLightningModule with the specified configuration, 
tokenizer, and optimizer settings. - - Args: - bionemo_model_config (BioBertConfig): Configuration for the BioBert model. - tokenizer (Tokenizer): Tokenizer to be used with the model. - optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and learning rate scheduler. - num_steps (int): Total number of training steps. - - Returns: - BioBertLightningModule: An instance of BioBertLightningModule configured with the provided settings. - """ - model = BioBertLightningModule( - bionemo_model_config, - tokenizer=tokenizer, - optimizer=MegatronOptimizerModule( - config=OptimizerConfig( - lr=optim_config.lr, - optimizer=optim_config.optimizer, - use_distributed_optimizer=True, - fp16=bionemo_model_config.fp16, - bf16=bionemo_model_config.bf16, - ), - lr_scheduler=CosineAnnealingScheduler( - max_steps=num_steps, - min_lr=optim_config.lr / 100, - warmup_steps=int(math.ceil(num_steps * optim_config.cosine_rampup_frac)), - interval=optim_config.interval, - monitor=optim_config.monitor, - constant_steps=int(math.ceil(num_steps * optim_config.cosine_hold_frac)), - ), - ), - ) - return model - - def train( bionemo_exposed_model_config: ExposedModelConfig, data_config: DataConfig[DataModuleT], @@ -197,8 +157,28 @@ def train( # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case has data.tokenizer, # although this constraint is not documented. - model: BioBertLightningModule = biobert_lightning_module( - bionemo_model_config, tokenizer=data.tokenizer, optim_config=optim_config, num_steps=training_config.max_steps + + optimizer = MegatronOptimizerModule( + config=OptimizerConfig( + lr=optim_config.lr, + optimizer=optim_config.optimizer, + use_distributed_optimizer=True, + fp16=bionemo_model_config.fp16, + bf16=bionemo_model_config.bf16, + ), + lr_scheduler=CosineAnnealingScheduler( + max_steps=training_config.max_steps, + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), + ) + ) + + + model: BionemoLightningModule = biobert_lightning_module( + config=bionemo_model_config, tokenizer=data.tokenizer, optimizer=optimizer ) trainer: nl.Trainer = setup_trainer(parallel_config, training_config) nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config) From bca9c8d31b58ff87ff559bf0c1a04cebd7e15157 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 23:51:22 +0000 Subject: [PATCH 41/58] Fixes bug where we are using the wrong BioBertLightningModule method (should be BionemoLightningModule, using the lightning construction function). Introduces Nsys options to the CLI (argparse) Adds callbacks for garbage collection to the recipes. 
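For orientation, here is a minimal sketch of how the profiling and garbage-collection knobs introduced in this patch are expected to fit together. The names (`NsysConfig`, `TrainingConfig.gc_interval`, `setup_trainer`) come from the diffs below; the concrete values are illustrative only, and as the new CLI help text notes, profiles are only emitted when the whole program is launched under `nsys profile ... --capture-range=cudaProfilerApi --capture-range-end=stop`.

```python
# Illustrative sketch only; values are placeholders, names come from this patch.
from bionemo.llm.run.config_models import ParallelConfig, TrainingConfig
from bionemo.llm.train import NsysConfig, setup_trainer

# gc_interval > 0 turns on the synchronized GarbageCollectionCallback.
training_config = TrainingConfig(
    max_steps=500,
    limit_val_batches=2,
    val_check_interval=100,
    gc_interval=100,
)
parallel_config = ParallelConfig(num_devices=1)

# Profile steps 10-20 on rank 0; setup_trainer attaches the NsysCallback.
nsys_config = NsysConfig(start_step=10, end_step=20, ranks=[0])

trainer = setup_trainer(parallel_config, training_config, nsys_config=nsys_config)
```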
--- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 46 ++++++++++++++++- .../src/bionemo/geneformer/run/main.py | 49 ++++++++++++++++++- .../geneformer/scripts/test_pydantic_train.py | 0 .../src/bionemo/llm/run/config_models.py | 3 ++ .../bionemo-llm/src/bionemo/llm/train.py | 40 +++++++++++++-- 5 files changed, 132 insertions(+), 6 deletions(-) rename sub-packages/bionemo-geneformer/{src => tests}/bionemo/geneformer/scripts/test_pydantic_train.py (100%) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index eb77ca23ed..e8933f07d4 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -20,7 +20,7 @@ from bionemo.esm2.run.config_models import ESM2DataConfig, ExposedESM2PretrainConfig from bionemo.llm.run.config_models import MainConfig -from bionemo.llm.train import train +from bionemo.llm.train import NsysConfig, train def main(): # noqa: D103 @@ -45,6 +45,38 @@ def parse_args(): action="store_true", help="Resume training if a checkpoint exists that matches the current experiment configuration.", ) + + # Debug options. + parser.add_argument( + "--nsys-profiling", + action="store_true", + default=False, + help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: " + " `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop [regular python command here]`", + ) + # start, end, rank + parser.add_argument( + "--nsys-start-step", + type=int, + required=False, + default=0, + help="Start nsys profiling after this step.", + ) + parser.add_argument( + "--nsys-end-step", + type=int, + required=False, + help="End nsys profiling after this step.", + ) + # rank as list of integers + parser.add_argument( + "--nsys-ranks", + type=int, + nargs="+", + required=False, + default=[0], + help="Enable nsys profiling for these ranks.", + ) return parser.parse_args() def string_to_class(path: str): @@ -78,6 +110,17 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: args = parse_args() config = load_config(args.config, args.model_config_t, args.data_config_t) + + + if args.nsys_profiling: + nsys_config = NsysConfig( + start_step=args.nsys_start_step, + end_step=args.nsys_end_step, + ranks=args.nsys_ranks, + ) + else: + nsys_config = None + train( bionemo_exposed_model_config=config.bionemo_model_config, data_config=config.data_config, @@ -86,6 +129,7 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: optim_config=config.optim_config, experiment_config=config.experiment_config, wandb_config=config.wandb_config, + nsys_config=nsys_config, resume_if_exists=args.resume_if_exists, ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 79d7577e9b..366b58e4d1 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,14 +18,15 @@ import json from typing import Optional +from pydantic import BaseModel + from bionemo.geneformer.run.config_models import ( ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, GeneformerPretrainingDataConfig, ) from bionemo.llm.run.config_models import MainConfig -from 
bionemo.llm.train import train - +from bionemo.llm.train import NsysConfig, train def main(): # noqa: D103 def parse_args(): @@ -49,6 +50,39 @@ def parse_args(): action="store_true", help="Resume training if a checkpoint exists that matches the current experiment configuration.", ) + + # Debug options. + parser.add_argument( + "--nsys-profiling", + action="store_true", + default=False, + help="Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example: " + " `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop [regular python command here]`", + ) + # start, end, rank + parser.add_argument( + "--nsys-start-step", + type=int, + required=False, + default=0, + help="Start nsys profiling after this step.", + ) + parser.add_argument( + "--nsys-end-step", + type=int, + required=False, + help="End nsys profiling after this step.", + ) + # rank as list of integers + parser.add_argument( + "--nsys-ranks", + type=int, + nargs="+", + required=False, + default=[0], + help="Enable nsys profiling for these ranks.", + ) + return parser.parse_args() def string_to_class(path: str): @@ -80,6 +114,16 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: args = parse_args() config = load_config(args.config, args.model_config_t, args.data_config_t) + + if args.nsys_profiling: + nsys_config = NsysConfig( + start_step=args.nsys_start_step, + end_step=args.nsys_end_step, + ranks=args.nsys_ranks, + ) + else: + nsys_config = None + train( bionemo_exposed_model_config=config.bionemo_model_config, data_config=config.data_config, @@ -89,6 +133,7 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: experiment_config=config.experiment_config, wandb_config=config.wandb_config, resume_if_exists=args.resume_if_exists, + nsys_config=nsys_config ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/test_pydantic_train.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py similarity index 100% rename from sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/test_pydantic_train.py rename to sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 9aa0d7c69b..25a8601149 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -270,6 +270,7 @@ class TrainingConfig(BaseModel): val_check_interval (int): The interval (in steps) at which to check validation. precision (Literal["32", "bf16-mixed", "16-mixed"], optional): The precision to use for training. Defaults to "bf16-mixed". accelerator (str, optional): The type of accelerator to use for training. Defaults to "gpu". + gc_interval (int, optional): The interval of global steps at which to run synchronized garbage collection. Useful for synchronizing garbage collection when performing distributed training. Defaults to 0. """ max_steps: int @@ -277,6 +278,8 @@ class TrainingConfig(BaseModel): val_check_interval: int precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" accelerator: str = "gpu" + # NOTE: VERY important for distributed training performance. 
+ gc_interval: int = 0 class OptimizerSchedulerConfig(BaseModel): diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 7074200039..20c838170c 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -14,10 +14,13 @@ # limitations under the License. +from dataclasses import field import math import pathlib from typing import Optional +from pydantic import BaseModel + from bionemo.llm.lightning import BionemoLightningModule from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl @@ -44,6 +47,12 @@ from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger +class NsysConfig(BaseModel): + """Configuration for nsys profiling.""" + start_step: int = 0 + end_step: Optional[int] = None + ranks: list[int] = field(default_factory=lambda: [0]) + def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig]) -> nl.NeMoLogger: """Creates and returns a NeMoLogger instance configured based on the provided experiment and wandb configurations. @@ -74,7 +83,7 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio return nemo_logger -def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None) -> nl.Trainer: +def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None, nsys_config: NsysConfig | None = None) -> nl.Trainer: """Set up the trainer for model training using the specified parallel and training configurations. Args: @@ -101,6 +110,29 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf LearningRateMonitor(), ] + if training_config.gc_interval > 0: + callbacks.append( + nl_callbacks.GarbageCollectionCallback(gc_interval_train=training_config.gc_interval, gc_interval_val=training_config.gc_interval) + ) + + # TODO set these as flags, the following are needed: + ''' + nsys_profiling (bool) + nsys_start_step (int) when to start profiling + nsys_end_step (int) when to stop profiling + nsys_ranks (List[int]) which ranks to profile. + ''' + if nsys_config: + if nsys_config.end_step is None: + nsys_config.end_step = training_config.max_steps + callbacks.append( + nl_callbacks.NsysCallback( + start_step=nsys_config.start_step, end_step=nsys_config.end_step, ranks=nsys_config.ranks, gen_shape=True + ) + ) + + + trainer = nl.Trainer( devices=parallel_config.num_devices, max_steps=training_config.max_steps, @@ -123,6 +155,7 @@ def train( optim_config: OptimizerSchedulerConfig, experiment_config: ExperimentConfig, wandb_config: Optional[WandbConfig], + nsys_config: Optional[NsysConfig] = None, resume_if_exists: bool = True, ): """Train a BioNemo model using the provided configurations. Uses the ExposedModelConfig and DataConfig as the primary variants for this method. @@ -134,7 +167,8 @@ def train( training_config (TrainingConfig): Configuration for training parameters. optim_config (OptimizerSchedulerConfig): Configuration for the optimizer and scheduler. experiment_config (ExperimentConfig): Configuration for the experiment. - wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging. + wandb_config (Optional[WandbConfig]): Configuration for Weights and Biases logging.n + nsys_config (Optional[NsysConfig], optional): Configuration for nsys profiling. 
If None, is disabled. resume_if_exists (bool, optional): Flag to resume training if a checkpoint exists. Defaults to True. """ bionemo_model_config = bionemo_exposed_model_config.exposed_to_internal_bionemo_model_config() @@ -180,7 +214,7 @@ def train( model: BionemoLightningModule = biobert_lightning_module( config=bionemo_model_config, tokenizer=data.tokenizer, optimizer=optimizer ) - trainer: nl.Trainer = setup_trainer(parallel_config, training_config) + trainer: nl.Trainer = setup_trainer(parallel_config, training_config, nsys_config=nsys_config) nemo_logger: nl.NeMoLogger = nemo_logger_factory(experiment_config, wandb_config=wandb_config) llm.train( From f5e096cd05397547f2398858045fbd70b1ef69e1 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 4 Nov 2024 23:53:23 +0000 Subject: [PATCH 42/58] Formatting --- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 1 - .../src/bionemo/geneformer/run/main.py | 5 +-- .../src/bionemo/llm/run/config_models.py | 2 +- .../bionemo-llm/src/bionemo/llm/train.py | 39 +++++++++++-------- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index e8933f07d4..b2feda7781 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -111,7 +111,6 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: args = parse_args() config = load_config(args.config, args.model_config_t, args.data_config_t) - if args.nsys_profiling: nsys_config = NsysConfig( start_step=args.nsys_start_step, diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py index 366b58e4d1..24f1682e18 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/main.py @@ -18,8 +18,6 @@ import json from typing import Optional -from pydantic import BaseModel - from bionemo.geneformer.run.config_models import ( ExposedFineTuneSeqLenBioBertConfig, ExposedGeneformerPretrainConfig, @@ -28,6 +26,7 @@ from bionemo.llm.run.config_models import MainConfig from bionemo.llm.train import NsysConfig, train + def main(): # noqa: D103 def parse_args(): parser = argparse.ArgumentParser(description="Run Geneformer pretraining") @@ -133,7 +132,7 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: experiment_config=config.experiment_config, wandb_config=config.wandb_config, resume_if_exists=args.resume_if_exists, - nsys_config=nsys_config + nsys_config=nsys_config, ) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 25a8601149..716cbf3a40 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -279,7 +279,7 @@ class TrainingConfig(BaseModel): precision: Literal["32", "bf16-mixed", "16-mixed"] = "bf16-mixed" accelerator: str = "gpu" # NOTE: VERY important for distributed training performance. 
- gc_interval: int = 0 + gc_interval: int = 0 class OptimizerSchedulerConfig(BaseModel): diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 20c838170c..567cb02a41 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -14,14 +14,11 @@ # limitations under the License. -from dataclasses import field import math import pathlib +from dataclasses import field from typing import Optional -from pydantic import BaseModel - -from bionemo.llm.lightning import BionemoLightningModule from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -30,11 +27,11 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from nemo.lightning.pytorch.optim.lr_scheduler import CosineAnnealingScheduler from nemo.utils import logging +from pydantic import BaseModel from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from tokenizers import Tokenizer -from bionemo.llm.model.biobert.lightning import BioBertLightningModule, biobert_lightning_module -from bionemo.llm.model.biobert.model import BioBertConfig +from bionemo.llm.lightning import BionemoLightningModule +from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.run.config_models import ( DataConfig, DataModuleT, @@ -47,8 +44,10 @@ from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger + class NsysConfig(BaseModel): """Configuration for nsys profiling.""" + start_step: int = 0 end_step: Optional[int] = None ranks: list[int] = field(default_factory=lambda: [0]) @@ -83,7 +82,12 @@ def nemo_logger_factory(experiment_config: ExperimentConfig, wandb_config: Optio return nemo_logger -def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConfig, callbacks=None, nsys_config: NsysConfig | None = None) -> nl.Trainer: +def setup_trainer( + parallel_config: ParallelConfig, + training_config: TrainingConfig, + callbacks=None, + nsys_config: NsysConfig | None = None, +) -> nl.Trainer: """Set up the trainer for model training using the specified parallel and training configurations. Args: @@ -93,6 +97,7 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf validation batch limit, validation check interval, and precision. callbacks (list, optional): List of callback functions to be used during training. Defaults to None, in which case default callbacks (RichModelSummary and LearningRateMonitor) are used. + nsys_config (NsysConfig, optional): Configuration for nsys profiling. If None, is disabled. Returns: nl.Trainer: Configured trainer object ready for model training. @@ -112,27 +117,30 @@ def setup_trainer(parallel_config: ParallelConfig, training_config: TrainingConf if training_config.gc_interval > 0: callbacks.append( - nl_callbacks.GarbageCollectionCallback(gc_interval_train=training_config.gc_interval, gc_interval_val=training_config.gc_interval) + nl_callbacks.GarbageCollectionCallback( + gc_interval_train=training_config.gc_interval, gc_interval_val=training_config.gc_interval + ) ) # TODO set these as flags, the following are needed: - ''' + """ nsys_profiling (bool) nsys_start_step (int) when to start profiling nsys_end_step (int) when to stop profiling nsys_ranks (List[int]) which ranks to profile. 
- ''' + """ if nsys_config: if nsys_config.end_step is None: nsys_config.end_step = training_config.max_steps callbacks.append( nl_callbacks.NsysCallback( - start_step=nsys_config.start_step, end_step=nsys_config.end_step, ranks=nsys_config.ranks, gen_shape=True + start_step=nsys_config.start_step, + end_step=nsys_config.end_step, + ranks=nsys_config.ranks, + gen_shape=True, ) ) - - trainer = nl.Trainer( devices=parallel_config.num_devices, max_steps=training_config.max_steps, @@ -207,10 +215,9 @@ def train( interval=optim_config.interval, monitor=optim_config.monitor, constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), - ) + ), ) - model: BionemoLightningModule = biobert_lightning_module( config=bionemo_model_config, tokenizer=data.tokenizer, optimizer=optimizer ) From 48c02c3555ccc8d65d1d10ffed9f4c1f7ec50b63 Mon Sep 17 00:00:00 2001 From: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:31:18 -0700 Subject: [PATCH 43/58] Update README.md Co-authored-by: Farhad Ramezanghorbani Signed-off-by: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3c8c3e570..1fbc1924cc 100644 --- a/README.md +++ b/README.md @@ -338,7 +338,7 @@ the default pretraining DataConfig and DataModule will be insufficient. See ESM2 > ⚠️ **Warning:** This setup does NO configuration of Weights and Biases. Edit your config JSON and populate it with your WandB details. ```bash -bionemo-esm2-train \ +bionemo-geneformer-train \ --data-config-t bionemo.geneformer.run.config_models.GeneformerPretrainingDataConfig \ --model-config-t bionemo.geneformer.run.config_models.ExposedGeneformerPretrainConfig \ --config my_config.json From 9b2bc436f0dbb5b63c02a6c2d021002226cff053 Mon Sep 17 00:00:00 2001 From: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:53:06 -0700 Subject: [PATCH 44/58] Update sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py Co-authored-by: Farhad Ramezanghorbani Signed-off-by: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> --- sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index b2feda7781..dd8c988805 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -25,7 +25,7 @@ def main(): # noqa: D103 def parse_args(): - parser = argparse.ArgumentParser(description="Run Geneformer pretraining") + parser = argparse.ArgumentParser(description="Run ESM2 pretraining") parser.add_argument("--config", type=str, required=True, help="Path to the JSON configuration file") parser.add_argument( "--model-config-t", From 05669012f1ad243088f0308fac5cab0d98ff0d94 Mon Sep 17 00:00:00 2001 From: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:57:47 -0700 Subject: [PATCH 45/58] Apply suggestions from code review Co-authored-by: Farhad Ramezanghorbani Signed-off-by: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com> --- sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py 
b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index dd8c988805..6ae0b9f8a5 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -31,7 +31,7 @@ def parse_args(): "--model-config-t", default=ExposedESM2PretrainConfig, required=False, - help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedGeneformerPretrainConfig and ExposedFineTuneSeqLenBioBertConfig.", + help="fully resolvable python import path to the ModelConfig object. Builtin options are ExposedESM2PretrainConfig.", ) parser.add_argument( "--data-config-t", From 99d796b8694a2d36f74d4bfaa862341499a29446 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 5 Nov 2024 23:18:48 +0000 Subject: [PATCH 46/58] Addressed comments on PR --- scripts/protein/esm2/esm2_pretrain.py | 6 +- scripts/protein/esm2/test_pydantic_train.py | 43 +--------- .../src/bionemo/esm2/model/model.py | 5 -- .../src/bionemo/esm2/run/config_models.py | 29 +++---- .../bionemo-esm2/src/bionemo/esm2/run/main.py | 4 +- .../src/bionemo/esm2/run/recipes.py | 84 ++++++++----------- .../src/bionemo/geneformer/run/recipes.py | 6 +- .../src/bionemo/llm/run/config_models.py | 31 ++++--- .../bionemo-llm/src/bionemo/llm/train.py | 34 +++++--- 9 files changed, 107 insertions(+), 135 deletions(-) diff --git a/scripts/protein/esm2/esm2_pretrain.py b/scripts/protein/esm2/esm2_pretrain.py index dcf26c6323..ed88c6053f 100644 --- a/scripts/protein/esm2/esm2_pretrain.py +++ b/scripts/protein/esm2/esm2_pretrain.py @@ -189,6 +189,8 @@ def main( plugins=nl.MegatronMixedPrecision(precision=precision), ) + tokenizer = get_tokenizer() + # Initialize the data module. data = ESMDataModule( train_cluster_path=train_cluster_path, @@ -201,10 +203,8 @@ def main( max_seq_length=max_seq_length, num_workers=num_dataset_workers, random_mask_strategy=random_mask_strategy, - tokenizer=get_tokenizer(), + tokenizer=tokenizer, ) - # NOTE(SKH) added this. 
- tokenizer = data._tokenizer # Configure the model esm2_config = ESM2Config( seq_length=max_seq_length, diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index 7a245c26ee..1b02ecbf3b 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -24,6 +24,7 @@ from lightning.fabric.plugins.environments.lightning import find_free_network_port from bionemo.testing.data.load import load +from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" @@ -41,52 +42,14 @@ def test_bionemo2_rootdir(): @pytest.fixture def dummy_protein_dataset(tmp_path): """Create a mock protein dataset.""" - db_file = tmp_path / "protein_dataset.db" - conn = sqlite3.connect(str(db_file)) - cursor = conn.cursor() - - cursor.execute( - """ - CREATE TABLE protein ( - id TEXT PRIMARY KEY, - sequence TEXT - ) - """ - ) - - proteins = [ - ("UniRef90_A", "ACDEFGHIKLMNPQRSTVWY"), - ("UniRef90_B", "DEFGHIKLMNPQRSTVWYAC"), - ("UniRef90_C", "MGHIKLMNPQRSTVWYACDE"), - ("UniRef50_A", "MKTVRQERLKSIVRI"), - ("UniRef50_B", "MRILERSKEPVSGAQLA"), - ] - cursor.executemany("INSERT INTO protein VALUES (?, ?)", proteins) - - conn.commit() - conn.close() - + db_file = create_mock_protein_dataset(tmp_path) return db_file @pytest.fixture def dummy_parquet_train_val_inputs(tmp_path): """Create a mock protein train and val cluster parquet.""" - train_cluster_path = tmp_path / "train_clusters.parquet" - train_clusters = pd.DataFrame( - { - "ur90_id": [["UniRef90_A"], ["UniRef90_B", "UniRef90_C"]], - } - ) - train_clusters.to_parquet(train_cluster_path) - - valid_cluster_path = tmp_path / "valid_clusters.parquet" - valid_clusters = pd.DataFrame( - { - "ur50_id": ["UniRef50_A", "UniRef50_B", "UniRef50_A", "UniRef50_B"], # 2 IDs more than confest - } - ) - valid_clusters.to_parquet(valid_cluster_path) + train_cluster_path, valid_cluster_path = create_mock_parquet_train_val_inputs(tmp_path) return train_cluster_path, valid_cluster_path diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py index c189cd719d..b5b53b036c 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py @@ -273,11 +273,6 @@ class ESM2GenericConfig(BioBertConfig[ESM2ModelT, MegatronLossType]): loss_reduction_class: Loss reduction class for the model. Default to BERTMLMLossWithReduction. """ - # ESM specific fields (these are repeated below) - use_esm_attention: bool = False # Skip ESM2 custom attention for TE acceleration. Still passes golden value test. 
- token_dropout: bool = True - normalize_attention_scores: bool = False - # When overriding fields in a dataclass _always_ declare types: https://github.com/python/cpython/issues/123269 model_cls: Type[ESM2ModelT] = ESM2Model num_layers: int = 33 # 650M diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index 436093df11..1e939c6a95 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -70,7 +70,18 @@ class ESM2DataConfig(DataConfig[ESMDataModule]): num_dataset_workers: int = 0 def construct_data_module(self, global_batch_size: int) -> ESMDataModule: - """Constructs and returns an ESMDataModule instance with the provided global batch size.""" + """Constructs and returns an ESMDataModule instance with the provided global batch size. + + This method provides means for constructing the datamodule, any pre-requisites for the DataModule should be + aquired here. For example, tokenizers, preprocessing, may want to live in this method. + + Args: + global_batch_size (int): Global batch size for the data module. Global batch size must be a function of + parallelism settings and the `micro_batch_size` attribute. Since the DataConfig has no ownership over + parallelism configuration, we expect someone higher up on the ownership chain to provide the value to + this method. + + """ tokenizer = get_tokenizer() data = ESMDataModule( train_cluster_path=self.train_cluster_path, @@ -123,20 +134,6 @@ class ExposedESM2PretrainConfig(ExposedModelConfig[ESM2Config]): variable_seq_lengths: bool = False core_attention_override: Type[torch.nn.Module] | None = None - @field_validator("biobert_spec_option", mode="after") - @classmethod - def restrict_biobert_spec_to_esm2(cls, biobert_spec_option: BiobertSpecOption) -> BiobertSpecOption: - """Validates the BiobertSpecOption to ensure it is compatible with ESM2. by restricting it to the specs compatable with ESM2.""" - if biobert_spec_option in ( - BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, - BiobertSpecOption.esm2_bert_layer_local_spec, - ): - return biobert_spec_option - else: - raise TypeError( - f"Unsupported BiobertSpecOption: {biobert_spec_option=}, use one of {BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec}, {BiobertSpecOption.esm2_bert_layer_local_spec}" - ) - @field_serializer("core_attention_override") def serialize_core_attention_override(self, value: Optional[Type[torch.nn.Module]]) -> Optional[str]: """Serializes the core attention override module to a string.""" @@ -201,7 +198,7 @@ def model_validator(self, global_cfg: MainConfig) -> MainConfig: assert ( self.variable_seq_lengths == (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length) - ), "Must set variable_seq_lengths = (pipeline_model_parallel_size * tensor_model_parallel_size > 1 and min_seq_length != max_seq_length)" + ), "Must set variable_seq_lengths to True when min_seq_length != max_seq_length under pipeline or tensor parallelism." 
return global_cfg def model_class(self) -> Type[ESM2Config]: diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py index b2feda7781..9bfede751b 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/main.py @@ -93,10 +93,12 @@ def load_config(config_path: str, model_config_t: Optional[str], data_config_t: # model/data_config_t is used to select the parser dynamically. if model_config_t is None or model_config_t == "ExposedESM2PretrainConfig": model_config_t = ExposedESM2PretrainConfig - elif model_config_t == "ExposedFineTuneSeqLenBioBertConfig": + elif model_config_t == "ExposedFineTuneSeqModel": # Hardcoded path for those who do not know the full path # model_config_t = ExposedFineTuneSeqLenBioBertConfig raise NotImplementedError() + elif model_config_t == "ExposedFineTuneTokenModel": + raise NotImplementedError() elif isinstance(model_config_t, str): # We assume we get a string to some importable config... e.g. in the sub-package jensen, 'bionemo.jensen.configs.MyConfig' model_config_t = string_to_class(model_config_t) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 1f7fa1c340..34b82458c0 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -35,7 +35,7 @@ def esm2_base_training_config() -> TrainingConfig: """Base training config for ESM2.""" - return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=1500, precision="bf16-mixed") + return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=10_000, precision="bf16-mixed", include_perplexity=True) def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: @@ -43,10 +43,10 @@ def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: return OptimizerSchedulerConfig( optimizer="adam", lr=4e-4, - cosine_rampup_frac=0.01, - cosine_hold_frac=0.05, interval="step", monitor="val_loss", + lr_scheduler="warmup_anneal", + warmup_steps=2000 ) @@ -61,6 +61,18 @@ def esm2_base_parallel_config() -> ParallelConfig: num_nodes=1, ) +def esm2_base_data_config(args) -> ESM2DataConfig: + data_config = ESM2DataConfig( + min_seq_length=1024, + max_seq_length=1024, + micro_batch_size=1, + num_dataset_workers=8, + train_cluster_path=args.train_cluster_path, + train_database_path=args.train_database_path, + valid_cluster_path=args.valid_cluster_path, + valid_database_path=args.valid_database_path, + ) + return data_config def esm2_8m_wandb_config() -> WandbConfig: """Wandb config for ESM2 8m.""" @@ -68,7 +80,7 @@ def esm2_8m_wandb_config() -> WandbConfig: entity="esm2-8m_pretraining", project="esm2-8m_pretraining", group="esm2-8m", - tags=["esm2-8m"], + tags=["esm2", "pretraining"], offline=True, anonymous=True, id="1", @@ -82,7 +94,7 @@ def esm2_8m_experiment_config(result_dir) -> ExperimentConfig: return ExperimentConfig( save_every_n_steps=50, # default set in previous script. 
result_dir=result_dir, - experiment_name="esm2-8m", + experiment_name="esm2-8m-pretraining", restore_from_checkpoint_path=None, ) @@ -105,19 +117,8 @@ def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: """Recipe for ESM2 8m.""" - data_config = ESM2DataConfig( - min_seq_length=1024, - max_seq_length=1024, - micro_batch_size=2, - num_dataset_workers=8, - train_cluster_path=args.train_cluster_path, - train_database_path=args.train_database_path, - valid_cluster_path=args.valid_cluster_path, - valid_database_path=args.valid_database_path, - ) - return MainConfig( - data_config=data_config, + data_config=esm2_base_data_config(args), parallel_config=esm2_base_parallel_config(), training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_8m_model_config(args.initial_ckpt_path), @@ -130,7 +131,7 @@ def esm2_8m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: """Model config for ESM2 650m.""" return ExposedESM2PretrainConfig( - num_layers=6, + num_layers=33, hidden_size=1280, ffn_hidden_size=1280 * 4, seq_length=1024, @@ -149,7 +150,7 @@ def esm2_650m_wandb_config() -> WandbConfig: entity="esm2-650m_pretraining", project="esm2-650m_pretraining", group="esm2-650m", - tags=["esm2-650m"], + tags=["esm2", "pretraining"], offline=True, anonymous=True, id="1", @@ -162,7 +163,7 @@ def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: return ExperimentConfig( save_every_n_steps=50, result_dir=result_dir, - experiment_name="esm2-650m", + experiment_name="esm2-650m-pretraining", # TODO should this be exposed? restore_from_checkpoint_path=None, ) @@ -170,19 +171,8 @@ def esm2_650m_experiment_config(result_dir) -> ExperimentConfig: def esm2_650m_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: """Recipe for ESM2 650m.""" - data_config = ESM2DataConfig( - min_seq_length=1024, - max_seq_length=1024, - micro_batch_size=1, - num_dataset_workers=8, - train_cluster_path=args.train_cluster_path, - train_database_path=args.train_database_path, - valid_cluster_path=args.valid_cluster_path, - valid_database_path=args.valid_database_path, - ) - return MainConfig( - data_config=data_config, + data_config=esm2_base_data_config(args), parallel_config=esm2_base_parallel_config(), training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_650m_model_config(args.initial_ckpt_path), @@ -227,34 +217,34 @@ def esm2_3b_wandb_config() -> WandbConfig: entity="esm2-3b_pretraining", project="esm2-3b_pretraining", group="esm2-3b", - tags=["esm2-3b"], + tags=["esm2-650m"], offline=True, anonymous=True, id="1", log_model=False, ) +def esm2_3b_experiment_config(result_dir) -> ExperimentConfig: + """Experiment config for ESM2 650m.""" + return ExperimentConfig( + save_every_n_steps=50, + result_dir=result_dir, + experiment_name="esm2-3b-pretraining", + # TODO should this be exposed? 
+ restore_from_checkpoint_path=None, + ) + + def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: """Recipe for ESM2 3b.""" - data_config = ESM2DataConfig( - min_seq_length=1024, - max_seq_length=1024, - micro_batch_size=1, - num_dataset_workers=8, - train_cluster_path=args.train_cluster_path, - train_database_path=args.train_database_path, - valid_cluster_path=args.valid_cluster_path, - valid_database_path=args.valid_database_path, - ) - return MainConfig( - data_config=data_config, + data_config=esm2_base_data_config(args), parallel_config=esm2_3b_parallel_config(), training_config=esm2_base_training_config(), # no changes for 8m bionemo_model_config=esm2_3b_model_config(args.initial_ckpt_path), optim_config=esm2_base_optimizer_scheduler_config(), # no changes for 8m - experiment_config=esm2_650m_experiment_config(args.result_dir), + experiment_config=esm2_3b_experiment_config(args.result_dir), wandb_config=esm2_3b_wandb_config(), ) @@ -295,7 +285,7 @@ def experiment_config_recipe(result_dir="./results") -> ExperimentConfig: experiment_name="default_experiment", restore_from_checkpoint_path=None, save_last_checkpoint=True, - metric_to_monitor_for_checkpoints="reduced_train_loss", + metric_to_monitor_for_checkpoints="val_loss", save_top_k=2, create_tensorboard_logger=False, ) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index e56b08c080..1e8f051f48 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -50,7 +50,7 @@ def geneformer_base_parallel_config() -> ParallelConfig: def geneformer_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: """Base optimizer scheduler config for Geneformer.""" - return OptimizerSchedulerConfig(lr=1e-3) # Matches bionemo1 + return OptimizerSchedulerConfig(lr=1e-3, lr_scheduler='cosine') # Matches bionemo1 def geneformer_base_training_config() -> TrainingConfig: @@ -401,7 +401,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, create_tensorboard_logger=False, ) - optim_config = OptimizerSchedulerConfig() + optim_config = OptimizerSchedulerConfig(lr_scheduler='cosine') geneformer_config = geneformer_10m_finetune_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) @@ -444,7 +444,7 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi create_tensorboard_logger=False, ) - optim_config = OptimizerSchedulerConfig() + optim_config = OptimizerSchedulerConfig(lr_scheduler='cosine') geneformer_config = geneformer_tiny_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 716cbf3a40..5ddf5636a9 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -63,7 +63,7 @@ def construct_data_module(self, global_batch_size: int) -> DataModuleT: """Construct the data module from the configuration. Cannot be defined generically.""" ... - def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + def custom_model_validator(self, global_cfg: "MainConfig") -> "MainConfig": """Use custom implementation of this method to define the things inside global_config. 
The following expression will always be true: @@ -96,7 +96,7 @@ def model_class(self) -> Type[ModelConfigT]: """Returns the underlying model class that this config wraps.""" raise NotImplementedError - def model_validator(self, global_cfg: "MainConfig") -> "MainConfig": + def custom_model_validator(self, global_cfg: "MainConfig") -> "MainConfig": """Use custom implementation of this method to define the things inside global_config. The following expression will always be true: @@ -158,6 +158,13 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: nemo1_ckpt_path: Optional[str] = None biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec + @model_validator(mode="after") + def validate_ffn_hidden_size(self) -> 'ExposedModelConfig': + """Validates the ffn_hidden_size.""" + if not self.ffn_hidden_size == 4 * self.hidden_size: + raise ValidationError("ffn_hidden_size must be 4 * hidden_size") + return self + @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: @@ -271,6 +278,7 @@ class TrainingConfig(BaseModel): precision (Literal["32", "bf16-mixed", "16-mixed"], optional): The precision to use for training. Defaults to "bf16-mixed". accelerator (str, optional): The type of accelerator to use for training. Defaults to "gpu". gc_interval (int, optional): The interval of global steps at which to run synchronized garbage collection. Useful for synchronizing garbage collection when performing distributed training. Defaults to 0. + include_perplexity (bool, optional): Whether to include perplexity in the validation logs. Defaults to False. """ max_steps: int @@ -280,7 +288,7 @@ class TrainingConfig(BaseModel): accelerator: str = "gpu" # NOTE: VERY important for distributed training performance. gc_interval: int = 0 - + include_perplexity: bool = False class OptimizerSchedulerConfig(BaseModel): """Configuration for the optimizer and learning rate scheduler. @@ -288,18 +296,21 @@ class OptimizerSchedulerConfig(BaseModel): Attributes: lr (float): Learning rate for the optimizer. Default is 1e-4. optimizer (str): Type of optimizer to use. Default is "adam". - cosine_rampup_frac (float): Fraction of total training steps for the cosine ramp-up phase. Default is 0.01. - cosine_hold_frac (float): Fraction of total training steps to hold the learning rate constant after ramp-up. Default is 0.05. interval (str): Interval for updating the learning rate scheduler. Default is "step". monitor (str): Metric to monitor for learning rate adjustments. Default is "val_loss". + interval (str): Interval for updating the learning rate scheduler. Default is "step". + monitor (str): Metric to monitor for learning rate adjustments. Default is "val_loss". + warmup_steps (int): Number of warmup steps for use with the warmup annealing learning rate scheduler. Default is 0. + lr_scheduler (Literal['warmup_anneal', 'cosine']): Type of learning rate scheduler to use. Default is 'warmup_anneal'. NOTE this is likely to change. 
""" - lr: float = 1e-4 optimizer: str = "adam" - cosine_rampup_frac: float = 0.01 - cosine_hold_frac: float = 0.05 interval: str = "step" monitor: str = "val_loss" + cosine_rampup_frac: float = 0.01 + cosine_hold_frac: float = 0.05 + warmup_steps: int = 0 + lr_scheduler: Literal['warmup_anneal', 'cosine'] = 'warmup_anneal' class ExperimentConfig(BaseModel): @@ -374,9 +385,9 @@ def validate_master_config(self) -> "MainConfig": @model_validator(mode="after") def run_bionemo_model_config_model_validators(self) -> "MainConfig": """Runs the model validators on the bionemo_model_config.""" - return self.bionemo_model_config.model_validator(self) + return self.bionemo_model_config.custom_model_validator(self) @model_validator(mode="after") def run_data_config_model_validators(self) -> "MainConfig": """Runs the model validators on the data_config.""" - return self.data_config.model_validator(self) + return self.data_config.custom_model_validator(self) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 567cb02a41..cc22aebabf 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -19,6 +19,7 @@ from dataclasses import field from typing import Optional +from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -30,7 +31,7 @@ from pydantic import BaseModel from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from bionemo.llm.lightning import BionemoLightningModule +from bionemo.llm.lightning import BionemoLightningModule, PerplexityLoggingCallback from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.run.config_models import ( DataConfig, @@ -115,6 +116,9 @@ def setup_trainer( LearningRateMonitor(), ] + if training_config.include_perplexity: + callbacks.append(PerplexityLoggingCallback()) + if training_config.gc_interval > 0: callbacks.append( nl_callbacks.GarbageCollectionCallback( @@ -196,10 +200,26 @@ def train( ) data: DataModuleT = data_config.construct_data_module(global_batch_size) - # TODO BioBertDataModule or BioBertTokenizer abstractions. We know all DataModuleT in this case has data.tokenizer, # although this constraint is not documented. 
+ # TODO: need an abstraction for LrSchedulerConfig + if optim_config.lr_scheduler == 'cosine': + lr_scheduler=CosineAnnealingScheduler( + max_steps=training_config.max_steps, + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), + ) + elif optim_config.lr_scheduler == 'warmup_anneal': + lr_scheduler = WarmupAnnealDecayHoldScheduler( + warmup_steps=optim_config.warmup_steps, max_steps=training_config.max_steps, max_lr=optim_config.lr, min_lr=optim_config.lr / 10.0, anneal_percentage=0.10 + ) + else: + raise NotImplementedError(f"Scheduler {optim_config.lr_scheduler} not implemented.") + optimizer = MegatronOptimizerModule( config=OptimizerConfig( lr=optim_config.lr, @@ -208,14 +228,8 @@ def train( fp16=bionemo_model_config.fp16, bf16=bionemo_model_config.bf16, ), - lr_scheduler=CosineAnnealingScheduler( - max_steps=training_config.max_steps, - min_lr=optim_config.lr / 100, - warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)), - interval=optim_config.interval, - monitor=optim_config.monitor, - constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), - ), + lr_scheduler=lr_scheduler, + ) model: BionemoLightningModule = biobert_lightning_module( From 13d56c2814b3109d93d90cee17cc511646207dcd Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 5 Nov 2024 23:19:31 +0000 Subject: [PATCH 47/58] formatting --- scripts/protein/esm2/test_pydantic_train.py | 4 +-- .../src/bionemo/esm2/run/config_models.py | 6 ++-- .../src/bionemo/esm2/run/recipes.py | 20 ++++++++----- .../src/bionemo/geneformer/run/recipes.py | 6 ++-- .../src/bionemo/llm/run/config_models.py | 6 ++-- .../bionemo-llm/src/bionemo/llm/train.py | 29 ++++++++++--------- 6 files changed, 39 insertions(+), 32 deletions(-) diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index 1b02ecbf3b..2522e538f8 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -15,16 +15,14 @@ import os import shlex -import sqlite3 import subprocess from pathlib import Path -import pandas as pd import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.testing.data.load import load from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py index 1e939c6a95..7437e75f3e 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py @@ -73,14 +73,14 @@ def construct_data_module(self, global_batch_size: int) -> ESMDataModule: """Constructs and returns an ESMDataModule instance with the provided global batch size. This method provides means for constructing the datamodule, any pre-requisites for the DataModule should be - aquired here. For example, tokenizers, preprocessing, may want to live in this method. + aquired here. For example, tokenizers, preprocessing, may want to live in this method. 
Args: global_batch_size (int): Global batch size for the data module. Global batch size must be a function of parallelism settings and the `micro_batch_size` attribute. Since the DataConfig has no ownership over - parallelism configuration, we expect someone higher up on the ownership chain to provide the value to + parallelism configuration, we expect someone higher up on the ownership chain to provide the value to this method. - + """ tokenizer = get_tokenizer() data = ESMDataModule( diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 34b82458c0..9ef8bf2760 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -35,18 +35,19 @@ def esm2_base_training_config() -> TrainingConfig: """Base training config for ESM2.""" - return TrainingConfig(max_steps=500000, limit_val_batches=1.0, val_check_interval=10_000, precision="bf16-mixed", include_perplexity=True) + return TrainingConfig( + max_steps=500000, + limit_val_batches=1.0, + val_check_interval=10_000, + precision="bf16-mixed", + include_perplexity=True, + ) def esm2_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: """Base optimizer scheduler config for ESM2.""" return OptimizerSchedulerConfig( - optimizer="adam", - lr=4e-4, - interval="step", - monitor="val_loss", - lr_scheduler="warmup_anneal", - warmup_steps=2000 + optimizer="adam", lr=4e-4, interval="step", monitor="val_loss", lr_scheduler="warmup_anneal", warmup_steps=2000 ) @@ -61,7 +62,9 @@ def esm2_base_parallel_config() -> ParallelConfig: num_nodes=1, ) + def esm2_base_data_config(args) -> ESM2DataConfig: + """Base data config for ESM2.""" data_config = ESM2DataConfig( min_seq_length=1024, max_seq_length=1024, @@ -74,6 +77,7 @@ def esm2_base_data_config(args) -> ESM2DataConfig: ) return data_config + def esm2_8m_wandb_config() -> WandbConfig: """Wandb config for ESM2 8m.""" wandb_config = WandbConfig( @@ -224,6 +228,7 @@ def esm2_3b_wandb_config() -> WandbConfig: log_model=False, ) + def esm2_3b_experiment_config(result_dir) -> ExperimentConfig: """Experiment config for ESM2 650m.""" return ExperimentConfig( @@ -235,7 +240,6 @@ def esm2_3b_experiment_config(result_dir) -> ExperimentConfig: ) - def esm2_3b_recipe(args) -> MainConfig[ExposedESM2PretrainConfig, ESM2DataConfig]: """Recipe for ESM2 3b.""" return MainConfig( diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 1e8f051f48..7c74763ac8 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -50,7 +50,7 @@ def geneformer_base_parallel_config() -> ParallelConfig: def geneformer_base_optimizer_scheduler_config() -> OptimizerSchedulerConfig: """Base optimizer scheduler config for Geneformer.""" - return OptimizerSchedulerConfig(lr=1e-3, lr_scheduler='cosine') # Matches bionemo1 + return OptimizerSchedulerConfig(lr=1e-3, lr_scheduler="cosine") # Matches bionemo1 def geneformer_base_training_config() -> TrainingConfig: @@ -401,7 +401,7 @@ def finetune_test_recipe(args) -> MainConfig[ExposedFineTuneSeqLenBioBertConfig, create_tensorboard_logger=False, ) - optim_config = OptimizerSchedulerConfig(lr_scheduler='cosine') + optim_config = OptimizerSchedulerConfig(lr_scheduler="cosine") geneformer_config = geneformer_10m_finetune_config( 
seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) @@ -444,7 +444,7 @@ def pretrain_tiny_test_recipe(args) -> MainConfig[ExposedGeneformerPretrainConfi create_tensorboard_logger=False, ) - optim_config = OptimizerSchedulerConfig(lr_scheduler='cosine') + optim_config = OptimizerSchedulerConfig(lr_scheduler="cosine") geneformer_config = geneformer_tiny_config( seq_length=data_config.seq_length, initial_ckpt_path=args.initial_ckpt_path ) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 5ddf5636a9..624adfac18 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -159,7 +159,7 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec @model_validator(mode="after") - def validate_ffn_hidden_size(self) -> 'ExposedModelConfig': + def validate_ffn_hidden_size(self) -> "ExposedModelConfig": """Validates the ffn_hidden_size.""" if not self.ffn_hidden_size == 4 * self.hidden_size: raise ValidationError("ffn_hidden_size must be 4 * hidden_size") @@ -290,6 +290,7 @@ class TrainingConfig(BaseModel): gc_interval: int = 0 include_perplexity: bool = False + class OptimizerSchedulerConfig(BaseModel): """Configuration for the optimizer and learning rate scheduler. @@ -303,6 +304,7 @@ class OptimizerSchedulerConfig(BaseModel): warmup_steps (int): Number of warmup steps for use with the warmup annealing learning rate scheduler. Default is 0. lr_scheduler (Literal['warmup_anneal', 'cosine']): Type of learning rate scheduler to use. Default is 'warmup_anneal'. NOTE this is likely to change. """ + lr: float = 1e-4 optimizer: str = "adam" interval: str = "step" @@ -310,7 +312,7 @@ class OptimizerSchedulerConfig(BaseModel): cosine_rampup_frac: float = 0.01 cosine_hold_frac: float = 0.05 warmup_steps: int = 0 - lr_scheduler: Literal['warmup_anneal', 'cosine'] = 'warmup_anneal' + lr_scheduler: Literal["warmup_anneal", "cosine"] = "warmup_anneal" class ExperimentConfig(BaseModel): diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index cc22aebabf..084513aad0 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -19,7 +19,6 @@ from dataclasses import field from typing import Optional -from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from megatron.core.optimizer import OptimizerConfig from nemo import lightning as nl from nemo.collections import llm @@ -31,6 +30,7 @@ from pydantic import BaseModel from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary +from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.lightning import BionemoLightningModule, PerplexityLoggingCallback from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.run.config_models import ( @@ -204,22 +204,26 @@ def train( # although this constraint is not documented. 
# TODO: need an abstraction for LrSchedulerConfig - if optim_config.lr_scheduler == 'cosine': - lr_scheduler=CosineAnnealingScheduler( - max_steps=training_config.max_steps, - min_lr=optim_config.lr / 100, - warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)), - interval=optim_config.interval, - monitor=optim_config.monitor, - constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), + if optim_config.lr_scheduler == "cosine": + lr_scheduler = CosineAnnealingScheduler( + max_steps=training_config.max_steps, + min_lr=optim_config.lr / 100, + warmup_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_rampup_frac)), + interval=optim_config.interval, + monitor=optim_config.monitor, + constant_steps=int(math.ceil(training_config.max_steps * optim_config.cosine_hold_frac)), ) - elif optim_config.lr_scheduler == 'warmup_anneal': + elif optim_config.lr_scheduler == "warmup_anneal": lr_scheduler = WarmupAnnealDecayHoldScheduler( - warmup_steps=optim_config.warmup_steps, max_steps=training_config.max_steps, max_lr=optim_config.lr, min_lr=optim_config.lr / 10.0, anneal_percentage=0.10 + warmup_steps=optim_config.warmup_steps, + max_steps=training_config.max_steps, + max_lr=optim_config.lr, + min_lr=optim_config.lr / 10.0, + anneal_percentage=0.10, ) else: raise NotImplementedError(f"Scheduler {optim_config.lr_scheduler} not implemented.") - + optimizer = MegatronOptimizerModule( config=OptimizerConfig( lr=optim_config.lr, @@ -229,7 +233,6 @@ def train( bf16=bionemo_model_config.bf16, ), lr_scheduler=lr_scheduler, - ) model: BionemoLightningModule = biobert_lightning_module( From 79381a951c82e2ee2a6b72f605574595f06121eb Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 5 Nov 2024 23:30:53 +0000 Subject: [PATCH 48/58] moved esm2 scheduler into llm --- .../src/bionemo/llm}/model/lr_scheduler.py | 0 sub-packages/bionemo-llm/src/bionemo/llm/train.py | 7 ------- 2 files changed, 7 deletions(-) rename sub-packages/{bionemo-esm2/src/bionemo/esm2 => bionemo-llm/src/bionemo/llm}/model/lr_scheduler.py (100%) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/lr_scheduler.py b/sub-packages/bionemo-llm/src/bionemo/llm/model/lr_scheduler.py similarity index 100% rename from sub-packages/bionemo-esm2/src/bionemo/esm2/model/lr_scheduler.py rename to sub-packages/bionemo-llm/src/bionemo/llm/model/lr_scheduler.py diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 084513aad0..a3f7ceae85 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -126,13 +126,6 @@ def setup_trainer( ) ) - # TODO set these as flags, the following are needed: - """ - nsys_profiling (bool) - nsys_start_step (int) when to start profiling - nsys_end_step (int) when to stop profiling - nsys_ranks (List[int]) which ranks to profile. 
- """ if nsys_config: if nsys_config.end_step is None: nsys_config.end_step = training_config.max_steps From c4eff16798e086482e2cfeef14337e0e81762543 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 5 Nov 2024 23:42:55 +0000 Subject: [PATCH 49/58] addressed comments --- README.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1fbc1924cc..266387f1f1 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,19 @@ git tag MY-VERSION-TAG uv build /sub-packages/bionemo-core TWINE_PASSWORD="" TWINE_USERNAME="" uvx twine upload /sub-packages/bionemo-core/dist/* ``` +## Pydantic Configuration +BioNeMo 2 provides two entrypoints for models with both argparse and pydantic. Both documented in the `Models` section below. +Pydantic based configuration is designed to accept a configuration json file as input, along with context specific arguments (e.g., should we resume from existing checkpoints?). These JSON configs go through a Pydantic Validator, in this case referred to as `MainConfig`. This Config is composed of several other Pydantic models, see the class definition for details. To pre-populate a config with reasonable defaults for various standard models, we provide 'recipes.' These are simple methods that instantiate the config object and then serialize it to a JSON configuration file. From this file, you may either submit it directly, or modify the various parameters to meet your usecase. For example, Weights and biases, devices, precision, and dataset options are all extremely useful to modify. Then, you would submit this config for training. + +These two workflows are packaged as executables when esm2 or geneformer are installed with pip. These commands will appear as: + +```bash +bionemo-geneformer-recipe +bionemo-esm2-recipe +bionemo-geneformer-train +bionemo-esm2-train +``` ## Models ### ESM-2 @@ -201,7 +213,7 @@ python \ ##### Running with Pydantic configs Alternatively, we provide a validated and serialized configuration file entrypoint for executing the same workflow. Recipes -are available for 8m, 650m, and 3b ESM2 models. +are available for 8m, 650m, and 3b ESM2 models. You may select which preset config to use by setting the `--recipe` parameter. ```bash # The fastest transformer engine environment variables in testing were the following two @@ -277,7 +289,7 @@ train_geneformer \ --micro-batch-size 2 ``` -To fine-tune, you just need to specify a different combination of model and loss. Pass the path to the outputted config file from the previous step as the `--restore-from-checkpoint-path`, and also change +To fine-tune, you to specify a different combination of model and loss. Pass the path to the outputted config file from the previous step as the `--restore-from-checkpoint-path`, and also change `--training-model-config-class` to the newly created model-config-class. While no CLI option currently exists to hot swap in different data modules and processing functions _now_, you could @@ -308,7 +320,7 @@ Alternatively, we provide a validated and serialized configuration file entrypoi are available for 10m, and 106m geneformer models. Additionally we provide an example recipe of finetuning, where the objective is to 'regress' on token IDs rather than the traditional masked language model approach. In practice, you will likely need to implement your own DataModule, DataConfig, and Finetuning model. You can use the same overall approach, but with -customizations fory our task. +customizations for your task. 
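For readers following the fine-tuning workflow described in the paragraph above, a minimal sketch of such a custom `DataConfig` may help. This is an illustration only, not part of this patch series: `MyDataModule` and `MyFinetuneDataConfig` are placeholder names, while the `DataConfig` base class and its `construct_data_module` hook come from `bionemo.llm.run.config_models` as introduced in these patches.

```python
# Hypothetical sketch of a user-defined DataConfig for fine-tuning.
# Placeholder names: MyDataModule, MyFinetuneDataConfig.
import pytorch_lightning as pl

from bionemo.llm.run.config_models import DataConfig


class MyDataModule(pl.LightningDataModule):
    """Stand-in for your fine-tuning datamodule."""

    def __init__(self, data_dir: str, seq_length: int, micro_batch_size: int, global_batch_size: int):
        super().__init__()
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.micro_batch_size = micro_batch_size
        self.global_batch_size = global_batch_size


class MyFinetuneDataConfig(DataConfig[MyDataModule]):
    data_dir: str
    seq_length: int = 2048
    micro_batch_size: int = 8
    num_dataset_workers: int = 0

    def construct_data_module(self, global_batch_size: int) -> MyDataModule:
        # Prerequisites (tokenizers, preprocessing) belong here; the global
        # batch size is computed upstream from the parallel configuration.
        return MyDataModule(
            data_dir=self.data_dir,
            seq_length=self.seq_length,
            micro_batch_size=self.micro_batch_size,
            global_batch_size=global_batch_size,
        )
```

The config type would then be passed to the train entrypoint via `--data-config-t`, mirroring the commands shown elsewhere in this README.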
```bash From 67d83c335a051ad2b20e21d6a344cf34d34d1311 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 5 Nov 2024 23:57:01 +0000 Subject: [PATCH 50/58] added missing recipe argument for get_attention_mask_from_fusion --- sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py | 4 ++++ .../src/bionemo/geneformer/run/recipes.py | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py index 9ef8bf2760..9473cc69ce 100644 --- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py +++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/recipes.py @@ -113,6 +113,7 @@ def esm2_8m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: seq_length=1024, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, + get_attention_mask_from_fusion=True, params_dtype="bf16-mixed", pipeline_dtype="bf16-mixed", autocast_dtype="bf16-mixed", @@ -142,6 +143,7 @@ def esm2_650m_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: num_attention_heads=20, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, + get_attention_mask_from_fusion=True, params_dtype="bf16-mixed", pipeline_dtype="bf16-mixed", autocast_dtype="bf16-mixed", @@ -209,6 +211,7 @@ def esm2_3b_model_config(initial_ckpt_path=None) -> ExposedESM2PretrainConfig: seq_length=1024, biobert_spec_option=BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec, initial_ckpt_path=initial_ckpt_path, + get_attention_mask_from_fusion=True, params_dtype="bf16-mixed", pipeline_dtype="bf16-mixed", autocast_dtype="bf16-mixed", @@ -314,6 +317,7 @@ def esm2_tiny_model_config( pipeline_dtype=precision, autocast_dtype=precision, biobert_spec_option=biobert_spec_option, + get_attention_mask_from_fusion=True, nemo1_ckpt_path=str(nemo1_init_path) if nemo1_init_path is not None else None, # handle checkpoint resumption here rather than auto-resume so this supports fine-tuning capabilities initial_ckpt_path=str(initial_ckpt_path) if initial_ckpt_path is not None else None, diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index 7c74763ac8..df885ae67e 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -99,7 +99,7 @@ def geneformer_10m_model_config( apply_residual_connection_post_layernorm=False, bias_activation_fusion=True, bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, + get_attention_mask_from_fusion=True, attention_dropout=0.1, share_embeddings_and_output_weights=True, enable_autocast=False, @@ -206,7 +206,7 @@ def geneformer_106m_model_config( apply_residual_connection_post_layernorm=False, bias_activation_fusion=True, bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, + get_attention_mask_from_fusion=True, attention_dropout=0.1, share_embeddings_and_output_weights=True, enable_autocast=False, @@ -299,7 +299,7 @@ def geneformer_10m_finetune_config( apply_residual_connection_post_layernorm=False, bias_activation_fusion=True, bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, + get_attention_mask_from_fusion=True, attention_dropout=0.1, share_embeddings_and_output_weights=True, enable_autocast=False, @@ -343,7 
+343,7 @@ def geneformer_tiny_config( apply_residual_connection_post_layernorm=False, bias_activation_fusion=True, bias_dropout_fusion=True, - get_attention_mask_from_fusion=False, + get_attention_mask_from_fusion=True, attention_dropout=0.1, share_embeddings_and_output_weights=True, enable_autocast=False, From 0bdbff1a1892a2d5627cbd482a66f6d9e91a293a Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 00:19:14 +0000 Subject: [PATCH 51/58] removed hanging exception --- .../bionemo-geneformer/src/bionemo/geneformer/run/recipes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py index df885ae67e..2cbc1e3c1b 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/run/recipes.py @@ -583,7 +583,6 @@ def parse_args(): config = geneformer_10m_pretrain_recipe(args) elif args.recipe == "106m-pretrain": config = geneformer_106m_pretrain_recipe(args) - raise NotImplementedError("106M pretraining recipe not implemented.") elif args.recipe == "test-finetune": # Uses a bigger model because we have a pretrained model for it. config = finetune_test_recipe(args) From 3d50ad353e4b82308e3fe77676b4dc6817062836 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 00:51:43 +0000 Subject: [PATCH 52/58] missing refactor from certain tests --- scripts/protein/esm2/esm2_pretrain.py | 2 +- .../bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py | 2 +- .../tests/bionemo/llm}/model/test_lr_scheduler.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename sub-packages/{bionemo-esm2/tests/bionemo/esm2 => bionemo-llm/tests/bionemo/llm}/model/test_lr_scheduler.py (96%) diff --git a/scripts/protein/esm2/esm2_pretrain.py b/scripts/protein/esm2/esm2_pretrain.py index ed88c6053f..25741e7bd2 100644 --- a/scripts/protein/esm2/esm2_pretrain.py +++ b/scripts/protein/esm2/esm2_pretrain.py @@ -30,10 +30,10 @@ from bionemo.esm2.data.datamodule import ESMDataModule from bionemo.esm2.data.dataset import RandomMaskStrategy from bionemo.esm2.data.tokenizer import get_tokenizer -from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.lightning import PerplexityLoggingCallback from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BiobertSpecOption +from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.utils.datamodule_utils import float_or_int_or_none, infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py index 41ba01fbba..18be7eccf3 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py @@ -28,8 +28,8 @@ from bionemo.esm2.data.datamodule import ESMDataModule from bionemo.esm2.data.dataset import RandomMaskStrategy from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer, get_tokenizer -from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.model.biobert.lightning import biobert_lightning_module +from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.testing.data.load 
import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_lr_scheduler.py b/sub-packages/bionemo-llm/tests/bionemo/llm/model/test_lr_scheduler.py similarity index 96% rename from sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_lr_scheduler.py rename to sub-packages/bionemo-llm/tests/bionemo/llm/model/test_lr_scheduler.py index a0b0883e05..1b5549db00 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_lr_scheduler.py +++ b/sub-packages/bionemo-llm/tests/bionemo/llm/model/test_lr_scheduler.py @@ -16,7 +16,7 @@ import torch -from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHold, WarmupAnnealDecayHoldScheduler +from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHold, WarmupAnnealDecayHoldScheduler def test_warmup_anneal_decay_hold_scheduler_exists(): From 12133df0547fa12ebac7fd914627cd8dfe157947 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 01:41:07 +0000 Subject: [PATCH 53/58] last one --- sub-packages/bionemo-llm/src/bionemo/llm/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index a3f7ceae85..3a4d6ec1cd 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -30,7 +30,7 @@ from pydantic import BaseModel from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from bionemo.esm2.model.lr_scheduler import WarmupAnnealDecayHoldScheduler +from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.lightning import BionemoLightningModule, PerplexityLoggingCallback from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.run.config_models import ( From 0421bcef3911876a3a731ffe1944f3779b7c7cca Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 01:41:29 +0000 Subject: [PATCH 54/58] format --- sub-packages/bionemo-llm/src/bionemo/llm/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/train.py b/sub-packages/bionemo-llm/src/bionemo/llm/train.py index 3a4d6ec1cd..18ec5b1b83 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/train.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/train.py @@ -30,9 +30,9 @@ from pydantic import BaseModel from pytorch_lightning.callbacks import LearningRateMonitor, RichModelSummary -from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.lightning import BionemoLightningModule, PerplexityLoggingCallback from bionemo.llm.model.biobert.lightning import biobert_lightning_module +from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler from bionemo.llm.run.config_models import ( DataConfig, DataModuleT, From 958e0374f740df799591c1f883a4024076c6a136 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 18:01:32 +0000 Subject: [PATCH 55/58] remove validation check that wasx erronious --- .../src/bionemo/llm/run/config_models.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index 624adfac18..fbea3c3c91 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ 
b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -21,7 +21,7 @@ import pytorch_lightning as pl import torch -from pydantic import BaseModel, ValidationError, field_serializer, field_validator, model_validator +from pydantic import BaseModel, field_serializer, field_validator, model_validator from torch.nn import functional as F from bionemo.core.utils import dtypes @@ -158,13 +158,6 @@ def exposed_to_internal_bionemo_model_config(self) -> ModelConfigT: nemo1_ckpt_path: Optional[str] = None biobert_spec_option: BiobertSpecOption = BiobertSpecOption.bert_layer_with_transformer_engine_spec - @model_validator(mode="after") - def validate_ffn_hidden_size(self) -> "ExposedModelConfig": - """Validates the ffn_hidden_size.""" - if not self.ffn_hidden_size == 4 * self.hidden_size: - raise ValidationError("ffn_hidden_size must be 4 * hidden_size") - return self - @field_validator("activation_func", mode="before") @classmethod def validate_activation_func(cls, activation_func: str) -> Callable: @@ -189,7 +182,7 @@ def validate_activation_func(cls, activation_func: str) -> Callable: func = CUSTOM_ACTIVATION_FNS[activation_func] return func elif func is None: - raise ValidationError( + raise ValueError( f"activation_func must be a valid function in `torch.nn.functional`, got {activation_func=}" ) else: @@ -262,7 +255,7 @@ class ParallelConfig(BaseModel): def validate_devices(self): """Validates the number of devices based on the tensor and pipeline model parallel sizes.""" if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: - raise ValidationError( + raise ValueError( "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" ) return self From f699b26edc93ab9e4e1dc8785e4da558a1fb2b24 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 6 Nov 2024 18:01:46 +0000 Subject: [PATCH 56/58] formatting --- sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py index fbea3c3c91..c3c2ef292d 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/run/config_models.py @@ -255,9 +255,7 @@ class ParallelConfig(BaseModel): def validate_devices(self): """Validates the number of devices based on the tensor and pipeline model parallel sizes.""" if self.num_devices < self.tensor_model_parallel_size * self.pipeline_model_parallel_size: - raise ValueError( - "devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size" - ) + raise ValueError("devices must be divisible by tensor_model_parallel_size * pipeline_model_parallel_size") return self From 1768b0f297b7568c29fbb6d6d0d736131fab71f6 Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 7 Nov 2024 21:13:42 +0000 Subject: [PATCH 57/58] fix incorrect types for WandbConfig --- .../bionemo-llm/src/bionemo/llm/utils/logger_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py index 5f2f6b1957..e95162715b 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py @@ -42,14 +42,14 @@ class WandbConfig(BaseModel): anonymous: Enables or explicitly disables anonymous 
logging. """ # noqa: D205 - entity: str # The team posting this run (default: your username or your default team) + entity: str | None # The team posting this run (default: your username or your default team) project: str # The name of the project to which this run will belong. # name: #Display name for the run. "This is handled by NeMoLogger" # save_dir: #Path where data is saved. "This is handled by NeMoLogger" - tags: List[str] # Tags associated with this run. - group: str # A unique string shared by all runs in a given group + tags: List[str] | None # Tags associated with this run. + group: str | None # A unique string shared by all runs in a given group offline: bool # Run offline (data can be streamed later to wandb servers). - id: str # Sets the version, mainly used to resume a previous run. + id: str | None # Sets the version, mainly used to resume a previous run. anonymous: bool # Enables or explicitly disables anonymous logging. log_model: bool # Save checkpoints in wandb dir to upload on W&B servers. From 3fb021d8722be6b65bf741ea3e918cf2d96e83a5 Mon Sep 17 00:00:00 2001 From: Steven Date: Thu, 7 Nov 2024 21:50:49 +0000 Subject: [PATCH 58/58] fmt --- .../bionemo-llm/src/bionemo/llm/utils/logger_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py index e95162715b..ebba878c66 100644 --- a/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py +++ b/sub-packages/bionemo-llm/src/bionemo/llm/utils/logger_utils.py @@ -42,14 +42,14 @@ class WandbConfig(BaseModel): anonymous: Enables or explicitly disables anonymous logging. """ # noqa: D205 - entity: str | None # The team posting this run (default: your username or your default team) + entity: str | None # The team posting this run (default: your username or your default team) project: str # The name of the project to which this run will belong. # name: #Display name for the run. "This is handled by NeMoLogger" # save_dir: #Path where data is saved. "This is handled by NeMoLogger" - tags: List[str] | None # Tags associated with this run. - group: str | None # A unique string shared by all runs in a given group + tags: List[str] | None # Tags associated with this run. + group: str | None # A unique string shared by all runs in a given group offline: bool # Run offline (data can be streamed later to wandb servers). - id: str | None # Sets the version, mainly used to resume a previous run. + id: str | None # Sets the version, mainly used to resume a previous run. anonymous: bool # Enables or explicitly disables anonymous logging. log_model: bool # Save checkpoints in wandb dir to upload on W&B servers.
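Taken together, the later patches leave `OptimizerSchedulerConfig` selecting between a `cosine` and a `warmup_anneal` scheduler (the latter driven by `warmup_steps`), and `WandbConfig` now accepts `None` for `entity`, `tags`, `group`, and `id`. A minimal sketch of instantiating both with the new fields follows; the values are illustrative only, not defaults prescribed by the patches.

```python
# Illustrative only: exercising the updated config models from these patches.
from bionemo.llm.run.config_models import OptimizerSchedulerConfig
from bionemo.llm.utils.logger_utils import WandbConfig

# "warmup_anneal" uses warmup_steps; "cosine" instead uses
# cosine_rampup_frac / cosine_hold_frac as fractions of max_steps.
optim_config = OptimizerSchedulerConfig(
    optimizer="adam",
    lr=4e-4,
    lr_scheduler="warmup_anneal",
    warmup_steps=2000,
)

# entity, tags, group, and id are now typed as optional and may be None.
wandb_config = WandbConfig(
    entity=None,
    project="esm2-8m_pretraining",
    tags=None,
    group=None,
    offline=True,
    id=None,
    anonymous=True,
    log_model=False,
)
```

When `lr_scheduler="cosine"` is chosen instead, `cosine_rampup_frac` and `cosine_hold_frac` determine the warmup and hold phases as fractions of `max_steps`, matching the `CosineAnnealingScheduler` branch in `bionemo.llm.train`.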