chore: fix tutorial configs (layer_norm params, fsdp1 variant keys, app_state, use_weight_tying placement)

le1nux · le1nux · commit 2c5f78d023c0 · 2025-04-18T18:41:52.000+02:00
diff --git a/tutorials/getting_started/configs/example_config.yaml b/tutorials/getting_started/configs/example_config.yaml
@@ -235,25 +235,22 @@ model_raw:
             seq_length_dim: -2
             base_freq: 10000
     attention_implementation: manual
-    activation_type: gelu
+    activation_type: swiglu
     attention_norm_config:
-      norm_type: rms_norm
+      norm_type: layer_norm
       config:
-        ndim: ${model_raw.config.n_embd}
-        bias: true
-        epsilon: 1e-5
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1.0e-05
     ffn_norm_config:
-      norm_type: rms_norm
+      norm_type: layer_norm
       config:
-        ndim: ${model_raw.config.n_embd}
-        bias: true
-        epsilon: 1e-5
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1.0e-05
     lm_head_norm_config:
-      norm_type: rms_norm
+      norm_type: layer_norm
       config:
-        ndim: ${model_raw.config.n_embd}
-        bias: true
-        epsilon: 1e-5
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1.0e-05
     use_weight_tying: true
 
 lr_scheduler:
@@ -281,7 +278,7 @@ optimizer:
 
 gradient_clipper:
   component_key: gradient_clipper
-  variant_key: fsdp
+  variant_key: fsdp1
   config:
     wrapped_model:
       instance_key: wrapped_model
diff --git a/tutorials/warmstart/configs/pre_training_config.yaml b/tutorials/warmstart/configs/pre_training_config.yaml
@@ -156,6 +156,7 @@ model_raw:
   variant_key: gpt2
   config:
     use_meta_device: true
+    use_weight_tying: true
     sample_key: ${settings.referencing_keys.sample_key}
     poe_type: NOPE
     sequence_length: ${settings.step_profile.sequence_length}
@@ -196,7 +197,6 @@ model_raw:
         ndim: ${model_raw.config.n_embd}
         bias: true
         epsilon: 1e-5
-    use_weight_tying: true
 
 lr_scheduler:
   component_key: scheduler
@@ -228,7 +228,7 @@ optimizer:
 
 gradient_clipper:
   component_key: gradient_clipper
-  variant_key: fsdp
+  variant_key: fsdp1
   config:
     wrapped_model:
       instance_key: wrapped_model
diff --git a/tutorials/warmstart/configs/warmstart_config.yaml b/tutorials/warmstart/configs/warmstart_config.yaml
@@ -104,7 +104,7 @@ eval_dataloaders: []
 
 checkpoint_loading:
   component_key: checkpoint_loading
-  variant_key: fsdp
+  variant_key: fsdp1
   config:
     global_rank: ${settings.cuda_env.global_rank}
     block_names: [GPT2Block]
@@ -135,9 +135,23 @@ loss_fn:
     target_key: ${settings.referencing_keys.target_key}
     prediction_key: ${settings.referencing_keys.prediction_key}
 
+app_state:  # bundles wrapped model, optimizer and LR scheduler by reference
+  component_key: app_state
+  variant_key: raw
+  config:
+    model:
+      instance_key: wrapped_model
+      pass_type: BY_REFERENCE
+    optimizer:
+      instance_key: optimizer
+      pass_type: BY_REFERENCE
+    lr_scheduler:
+      instance_key: lr_scheduler
+      pass_type: BY_REFERENCE
+
 wrapped_model:
   component_key: model
-  variant_key: checkpointed
+  variant_key: fsdp1_checkpointed
   config:
     model:
       instance_key: model
@@ -169,6 +183,7 @@ model_raw:
   variant_key: gpt2
   config:
     use_meta_device: false
+    use_weight_tying: true
     sample_key: ${settings.referencing_keys.sample_key}
     poe_type: NOPE
     sequence_length: ${settings.step_profile.sequence_length}
@@ -255,7 +270,7 @@ optimizer_original:
 
 gradient_clipper:
   component_key: gradient_clipper
-  variant_key: fsdp
+  variant_key: fsdp1
   config:
     wrapped_model:
       instance_key: wrapped_model