From 9121adc9e7ac4c933ac2719a16fab4bee6b33dae Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 14:36:21 +0800 Subject: [PATCH 01/17] dropping the loss bias (tmp) --- RWKV-v5/src/model.py | 54 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 86cb7025..b090378f 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -803,34 +803,34 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # should not be allowed num_devices = self.trainer.num_devices - ### --- - ### Positional loss bias handling - ### --- + # ### --- + # ### Positional loss bias handling + # ### --- - # Get the starting and ending loss bias - loss_bias_start = self.position_loss_bias - loss_bias_end = 2.0 - loss_bias_start - - # Skip loss bias calculation, if loss_bias_start is 1.0 - if loss_bias_start == 1.0 or (is_training_run == False and self.position_loss_bias_in_validation == False): - seq_mask = ori_seq_mask - else: - # Lets get the torch mask sum - total_mask_sum = torch.sum(ori_seq_mask) - - # Lets get a linear multiplier for the loss bias - # seq_mask_sum = torch.sum(ori_seq_mask) - bias_mask = torch.linspace(loss_bias_start, loss_bias_end, int(total_mask_sum.item()), device=ori_seq_mask.device) - - # Boolean flag of seq_mask > 0 - seq_mask_index = ori_seq_mask[0] > 0 - - # Apply the bias mask only to positive seq_mask values - final_mask = torch.zeros(ori_seq_mask.shape[1], device=ori_seq_mask.device) - final_mask[seq_mask_index] = ori_seq_mask[0][seq_mask_index] * bias_mask - - # And save it as seq_mask - seq_mask = final_mask.unsqueeze(0) + # # Get the starting and ending loss bias + # loss_bias_start = self.position_loss_bias + # loss_bias_end = 2.0 - loss_bias_start + + # # Skip loss bias calculation, if loss_bias_start is 1.0 + # if loss_bias_start == 1.0 or (is_training_run == False and self.position_loss_bias_in_validation == False): + # seq_mask = ori_seq_mask + # else: + # # Lets get the torch mask sum + # total_mask_sum = torch.sum(ori_seq_mask) + + # # Lets get a linear multiplier for the loss bias + # # seq_mask_sum = torch.sum(ori_seq_mask) + # bias_mask = torch.linspace(loss_bias_start, loss_bias_end, int(total_mask_sum.item()), device=ori_seq_mask.device) + + # # Boolean flag of seq_mask > 0 + # seq_mask_index = ori_seq_mask[0] > 0 + + # # Apply the bias mask only to positive seq_mask values + # final_mask = torch.zeros(ori_seq_mask.shape[1], device=ori_seq_mask.device) + # final_mask[seq_mask_index] = ori_seq_mask[0][seq_mask_index] * bias_mask + + # # And save it as seq_mask + # seq_mask = final_mask.unsqueeze(0) ### --- ### Training cutoff logic handling From 7e2278a6c4084e6ffc8815a692f379d33db2d669 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 15:27:35 +0800 Subject: [PATCH 02/17] wip loss calc tweak --- RWKV-v5/src/model.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index b090378f..de823d83 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -832,6 +832,9 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # # And save it as seq_mask # seq_mask = final_mask.unsqueeze(0) + # Since we are no longer doing positional loss above, use seq_mask directly + seq_mask = ori_seq_mask + ### --- ### Training cutoff logic handling ### --- @@ -884,7 +887,7 @@ def compute_loss(self, 
batch, batch_idx, is_training_run: bool): return 0 # Checkpoint steps - def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, + def checkpointed_step(idx, targets, mask, last_shift_states, last_wkv_states, prev_steps): logits, new_shift_states, new_wkv_states = self( idx, last_shift_states, last_wkv_states) @@ -895,18 +898,26 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, targets = targets.contiguous() mask = mask.contiguous() - loss = F.cross_entropy(logits.view(-1, logits.size(-1)), + # Compute the token loss + token_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction="none") - - submask = mask.view(-1)[:loss.shape[0]] + submask = mask.view(-1)[:token_loss.shape[0]] submask_sum = torch.sum(submask) - loss = torch.sum(loss * submask) / total_mask_sum - loss = L2Wrap.apply(loss, logits, total_mask_sum, submask) + # The training loss to use + train_loss = torch.sum(token_loss * submask) / total_mask_sum + + # # Sample loss, without backprop + # sample_loss = torch.sum(token_loss * submask) / total_mask_sum + + + + segment_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) + + new_steps = prev_steps + submask_sum - new_loss = prev_loss + loss - return new_loss, new_shift_states, new_wkv_states, new_steps + return segment_loss, new_shift_states, new_wkv_states, new_steps total_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() steps = 0 @@ -1056,7 +1067,6 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, cur_idx, cur_tar, cur_msk, - torch.tensor(0, dtype=self.emb.weight.dtype, device=cur_device).requires_grad_(True), prv_shift_states, prv_wkv_states, steps, @@ -1067,7 +1077,7 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, # segment_loss_arr[i] = segment_loss # Perform the backward pass accordingly, for valid segments (besides the last segment) - # In this version, we do backward passes together the forward passes in the main segment loop + # In this version, we do backward passes together with the forward passes in the main segment loop # Instead of after all segment losses are computed if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: # The learning loss, should be normalized against the accumulation steps @@ -1133,26 +1143,25 @@ def checkpointed_step(idx, targets, mask, prev_loss, last_shift_states, segment_size = self.ctx_len for i in range(segment_count): if i < segment_count-1 and is_training_run: - total_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( + segment_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( checkpointed_step, idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], - total_loss, states.shift_states, states.wkv_states, steps, ) else: - total_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], - total_loss, states.shift_states, states.wkv_states, steps, ) + total_loss = total_loss + segment_loss states = BlockStateList(new_shift_states, new_wkv_states) gc.collect() From 1239547492399b73ea17c8cdecd68c9682af328e Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 
2024 16:02:13 +0800 Subject: [PATCH 03/17] refactoring loss handling --- RWKV-v5/src/model.py | 121 ++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 65 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index de823d83..9a9676d5 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -888,7 +888,7 @@ def compute_loss(self, batch, batch_idx, is_training_run: bool): # Checkpoint steps def checkpointed_step(idx, targets, mask, last_shift_states, - last_wkv_states, prev_steps): + last_wkv_states): logits, new_shift_states, new_wkv_states = self( idx, last_shift_states, last_wkv_states) @@ -903,29 +903,33 @@ def checkpointed_step(idx, targets, mask, last_shift_states, targets.view(-1), reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] - submask_sum = torch.sum(submask) # The training loss to use train_loss = torch.sum(token_loss * submask) / total_mask_sum + train_token_count = torch.sum(submask) - # # Sample loss, without backprop - # sample_loss = torch.sum(token_loss * submask) / total_mask_sum + # Sample loss, without backprop + sample_loss = torch.sum(token_loss * submask) / total_mask_sum + # L2Wrap for the backprop process + segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) + # Return the checkpoint values + return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count - segment_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) - - - new_steps = prev_steps + submask_sum - return segment_loss, new_shift_states, new_wkv_states, new_steps - - total_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() - steps = 0 + # Initialize the states, and compute the segment count states = BlockStateList.create(self.n_layer, B, C, self.n_head, self.head_size, seq.device, self.emb.weight.dtype) segment_count = math.ceil(T / self.ctx_len) + # Initialize the training loss, and the token count + training_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_() + training_tokens = 0 + + # Raw sample loss (before selective token training) + sampling_loss = 0 + ### --- ### Learning process logic (BPTT or not) ### --- @@ -1063,13 +1067,12 @@ def checkpointed_step(idx, targets, mask, last_shift_states, cur_msk = dummy_2d_zero # Segmented learning, applies the forward/pass over each chunk seperately - segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = checkpointed_step( cur_idx, cur_tar, cur_msk, prv_shift_states, - prv_wkv_states, - steps, + prv_wkv_states ) states = BlockStateList(new_shift_states, new_wkv_states) @@ -1079,90 +1082,68 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # Perform the backward pass accordingly, for valid segments (besides the last segment) # In this version, we do backward passes together with the forward passes in the main segment loop # Instead of after all segment losses are computed + # + # In the past, we have implemented to do all forward, and all backwards. 
But this was found to be "slow" if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: # The learning loss, should be normalized against the accumulation steps # as we are bypassing the pytorch lightning normalization # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward - learning_loss = segment_loss / gradient_accumulation_steps + learning_loss = segment_train_loss / gradient_accumulation_steps # Perform the backward pass accordingly, for valid segments (besides the last segment) if i == start_learning_segment + backward_segment_count - 1: # This is the last backward pass, we let the default pytorch lightning handle the backward pass # and return the segment loss as part of the total loss - total_loss = total_loss + segment_loss + training_loss = training_loss + segment_train_loss else: # Undocumented multiple backward pass support # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 self.manual_backward(learning_loss, optimizer, retain_graph=True) # Accumulate without gradient, as we already did the backward pass - total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) else: # Even if its not the segments we use for backward pass, we still need to accumulate the loss - total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) + # Add token count and raw sampling loss + training_tokens = training_tokens + segment_train_tokens + sampling_loss = sampling_loss + segment_sample_loss + # GC collect unused memory # gc.collect() # torch.cuda.empty_cache() - - # # Lets backpass the respective segments, in reverse - # # (including dummy backpass) - # for i in range(forward_segment_count-1, -1, -1): - # # Get the segment loss - # segment_loss = segment_loss_arr[i] - # - # # Compute the backward pass for the segment - # if i >= start_learning_segment and i < start_learning_segment + backward_segment_count: - # # The learning loss, should be normalized against the accumulation steps - # # as we are bypassing the pytorch lightning normalization - # # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward - # learning_loss = segment_loss / gradient_accumulation_steps - # - # # Perform the backward pass accordingly, for valid segments (besides the start_learning_segment) - # if i > start_learning_segment: - # # Undocumented multiple backward pass support - # # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 - # self.manual_backward(learning_loss, optimizer, retain_graph=True) - # - # # Accumulate without gradient, as we already did the backward pass - # total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) - # else: - # # This is the last backward pass, we let the default pytorch lightning handle the backward pass - # # and return the segment loss as part of the total loss - # total_loss = total_loss + segment_loss - # else: - # # Even if its not the segments we use for backward pass, we still need to accumulate the loss - # total_loss = total_loss + segment_loss.clone().detach().requires_grad_(False) - # - # # GC collect unused memory - # gc.collect() - # # torch.cuda.empty_cache() 
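[Editor's note] The per-segment backward pattern used in the BPTT branch above reduces to a fairly small idiom. Below is a minimal, self-contained sketch (an illustration with dummy shapes, a plain nn.Linear standing in for the RWKV blocks, and a hypothetical grad_accum_steps constant; the real code routes the call through Lightning's self.manual_backward and carries shift/wkv block states instead of a single tensor):

    import torch

    torch.manual_seed(0)
    cell = torch.nn.Linear(8, 8)        # stand-in for one segment's forward pass
    x = torch.randn(4, 1, 8)            # 4 dummy input segments
    state = torch.zeros(1, 8)           # recurrent state carried across segments
    grad_accum_steps = 2                # normalization Lightning would otherwise apply

    for i in range(x.shape[0]):
        state = cell(x[i] + state)      # segment forward; state flows to the next segment
        seg_loss = state.pow(2).mean()  # per-segment training loss
        # Backward immediately after each segment's forward pass, instead of one
        # big backward at the end; retain_graph keeps the earlier segments' graph
        # alive, since later segments still read from `state`.
        (seg_loss / grad_accum_steps).backward(retain_graph=True)

The state is deliberately not detached between segments here, mirroring the non-truncated path; the truncated-BPTT variant would detach it after each backward.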
else: + # # Normal operations without BPTT + # segment_size = self.ctx_len for i in range(segment_count): if i < segment_count-1 and is_training_run: - segment_loss, new_shift_states, new_wkv_states, steps = deepspeed_checkpoint( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = deepspeed_checkpoint( checkpointed_step, idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], states.shift_states, - states.wkv_states, - steps, + states.wkv_states ) else: - segment_loss, new_shift_states, new_wkv_states, steps = checkpointed_step( + segment_sample_loss, segment_train_loss, new_shift_states, new_wkv_states, segment_train_tokens = checkpointed_step( idx[:, i * segment_size:(i + 1) * segment_size], targets[:, i * segment_size:(i + 1) * segment_size], seq_mask[:, i * segment_size:(i + 1) * segment_size], states.shift_states, - states.wkv_states, - steps, + states.wkv_states ) - total_loss = total_loss + segment_loss + + # Add them up + training_loss = training_loss + segment_train_loss + training_tokens = training_tokens + segment_train_tokens + sampling_loss = sampling_loss + segment_sample_loss + # Update the states states = BlockStateList(new_shift_states, new_wkv_states) gc.collect() # torch.cuda.empty_cache() @@ -1171,24 +1152,34 @@ def checkpointed_step(idx, targets, mask, last_shift_states, if wandb.run is not None and is_training_run: global_rank = self.global_rank global_device_count = self.trainer.num_devices * self.trainer.num_nodes + microbatch_size = self.trainer.microbatch_size # Get the total dataset context length batch_ctx_len = 0 if "data_ctx_len" in batch: batch_ctx_len = torch.sum(batch["data_ctx_len"]).item() else: - batch_ctx_len = T * self.trainer.microbatch_size + batch_ctx_len = T * microbatch_size # Increment the counting tokens, and log it accordingly self._counting_tokens += batch_ctx_len # Log the line values wandb.log({ - 'global_rank': global_rank, - 'data_ctx_len': batch_ctx_len / self.trainer.microbatch_size, - 'train/loss': total_loss, + # The original loss and ctx_len (averaged by batch size) + 'train/loss': sampling_loss, + 'train/ctx_len': batch_ctx_len / microbatch_size, + + # The selective training tokens, and loss + 'train/tokens': training_tokens / microbatch_size, + 'train/sel_loss': training_loss, + + # Perf tracking f'perf/tokens_total.gpu.{global_rank}': self._counting_tokens, f'perf/tokens_per_sec.gpu.{global_rank}': self._counting_tokens / max(time.time() - self._counting_time_start, 1), + + # Step and trainer tracking + 'global_rank': global_rank, 'substep': (batch_idx * global_device_count + global_rank), 'trainer/global_step':self.global_step, 'trainer/learning_rate': self.trainer.optimizers[0].param_groups[0]['lr'], @@ -1196,8 +1187,8 @@ def checkpointed_step(idx, targets, mask, last_shift_states, }) # Throw if total loss is NaN - assert not torch.isnan(total_loss), "total_loss is NaN" - return total_loss + assert not torch.isnan(training_loss), "training_loss is NaN" + return training_loss # # Training and validation steps From d88c16aa933cc0b48620590fc75fd2fa8d96ed11 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 16:19:34 +0800 Subject: [PATCH 04/17] wip sel loss implementation --- RWKV-v5/src/model.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 
9a9676d5..bc93e907 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -904,13 +904,14 @@ def checkpointed_step(idx, targets, mask, last_shift_states, reduction="none") submask = mask.view(-1)[:token_loss.shape[0]] + # Sample loss, without backprop + with torch.no_grad(): + sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False) + # The training loss to use train_loss = torch.sum(token_loss * submask) / total_mask_sum train_token_count = torch.sum(submask) - # Sample loss, without backprop - sample_loss = torch.sum(token_loss * submask) / total_mask_sum - # L2Wrap for the backprop process segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask) @@ -1090,18 +1091,13 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # https://lightning.ai/docs/pytorch/2.0.4/common/lightning_module.html#backward learning_loss = segment_train_loss / gradient_accumulation_steps - # Perform the backward pass accordingly, for valid segments (besides the last segment) - if i == start_learning_segment + backward_segment_count - 1: - # This is the last backward pass, we let the default pytorch lightning handle the backward pass - # and return the segment loss as part of the total loss - training_loss = training_loss + segment_train_loss - else: - # Undocumented multiple backward pass support - # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 - self.manual_backward(learning_loss, optimizer, retain_graph=True) - - # Accumulate without gradient, as we already did the backward pass - training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) + # Undocumented multiple backward pass support + # https://github.com/Lightning-AI/lightning/blob/678f642808c54e4c490caee4df5d357301c976bb/tests/trainer/optimization/test_manual_optimization.py#L251 + self.manual_backward(learning_loss, optimizer, retain_graph=True) + + # Accumulate without gradient, as we already did the backward pass + # This does mean, that a single backward pass is "wasted" at the end + training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) else: # Even if its not the segments we use for backward pass, we still need to accumulate the loss training_loss = training_loss + segment_train_loss.clone().detach().requires_grad_(False) @@ -1167,12 +1163,12 @@ def checkpointed_step(idx, targets, mask, last_shift_states, # Log the line values wandb.log({ # The original loss and ctx_len (averaged by batch size) - 'train/loss': sampling_loss, 'train/ctx_len': batch_ctx_len / microbatch_size, + 'train/data_loss': sampling_loss, # The selective training tokens, and loss 'train/tokens': training_tokens / microbatch_size, - 'train/sel_loss': training_loss, + 'train/loss': training_loss, # Perf tracking f'perf/tokens_total.gpu.{global_rank}': self._counting_tokens, From 5fcde26a1cfde53e687eabd1b41aea65241f6cd2 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Wed, 17 Jan 2024 16:35:36 +0800 Subject: [PATCH 05/17] selective token loss threshold initial implementation --- RWKV-v5/src/model.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index bc93e907..7b1c2b0a 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -193,9 +193,14 @@ def __init__(self, adam_eps: float = 1.0e-08, weight_decay: 
float = 0.01,
                 warmup_steps: int = -1,
+
+                # loss bias start
                 position_loss_bias: float = 1.0,
                 position_loss_bias_in_validation: bool = False,
+
+                # Selective loss settings
+                selective_token_loss_threshold: float = 1.0,
+
                 # Backprop settings
                 grad_cp: bool = True,
                 bptt_learning: bool = True,
@@ -289,9 +294,10 @@ def __init__(self,
             print("====================================================================")
             self.bptt_truncated_learning = True

-        # Save the position loss params
+        # Save the position loss params, and selective loss settings
         self.position_loss_bias = position_loss_bias
         self.position_loss_bias_in_validation = position_loss_bias_in_validation
+        self.selective_token_loss_threshold = selective_token_loss_threshold

         dim_att = dim_att or n_embd
         dim_ffn = dim_ffn or int((n_embd * 3.5) // 32 * 32)
@@ -904,16 +910,29 @@ def checkpointed_step(idx, targets, mask, last_shift_states,
                                          reduction="none")
            submask = mask.view(-1)[:token_loss.shape[0]]

-           # Sample loss, without backprop
-           with torch.no_grad():
-               sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False)
-
-           # The training loss to use
-           train_loss = torch.sum(token_loss * submask) / total_mask_sum
-           train_token_count = torch.sum(submask)
+           # Selective token loss logic
+           if self.selective_token_loss_threshold > 0.0:
+
+               # Sample loss, without backprop
+               with torch.no_grad():
+                   sample_loss = (torch.sum(token_loss * submask) / total_mask_sum).clone().detach().requires_grad_(False)
+
+               # Selective loss gating
+               above_threshold = token_loss > self.selective_token_loss_threshold
+               train_mask = submask * above_threshold
+
+               # The training loss to use
+               train_loss = torch.sum(token_loss * train_mask) / total_mask_sum
+               train_token_count = torch.sum(train_mask)
+
+           else:
+               train_loss = torch.sum(token_loss * submask) / total_mask_sum
+               sample_loss = train_loss.clone().detach().requires_grad_(False)
+               train_token_count = torch.sum(submask)
+               train_mask = submask

            # L2Wrap for the backprop process
-           segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, submask)
+           segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, train_mask)

            # Return the checkpoint values
            return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count
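[Editor's note] The gating that PATCH 05 introduces can be reproduced in isolation. A minimal, runnable sketch, assuming made-up shapes and an illustrative threshold of 1.0 (the real logic lives inside checkpointed_step, and from PATCH 07 onwards it also rescales the L2Wrap factor by the kept-token ratio):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits = torch.randn(1, 6, 10, requires_grad=True)  # (batch, seq, vocab)
    targets = torch.randint(0, 10, (1, 6))
    submask = torch.ones(6)                             # all 6 tokens trainable
    threshold = 1.0                                     # selective_token_loss_threshold

    token_loss = F.cross_entropy(logits.view(-1, 10), targets.view(-1), reduction="none")
    total_mask_sum = torch.sum(submask)

    # Gate: only tokens the model still gets "wrong enough" contribute gradients
    train_mask = submask * (token_loss > threshold)
    train_loss = torch.sum(token_loss * train_mask) / total_mask_sum
    print(train_loss.item(), int(torch.sum(train_mask).item()), "tokens kept")

Note that the sum is still divided by total_mask_sum rather than by the number of gated tokens, so tokens the model already predicts well simply contribute zero loss instead of re-weighting the remainder.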
From 8aa0779f7a3a8da45c8d2dc4039f42c1a08ca0eb Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Wed, 17 Jan 2024 17:29:39 +0800
Subject: [PATCH 06/17] WIP data prefix mask

---
 RWKV-v5/src/data.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py
index 368e918a..b5a0a984 100644
--- a/RWKV-v5/src/data.py
+++ b/RWKV-v5/src/data.py
@@ -268,6 +268,15 @@ def encodeTokens(x):

         conversation_enabled = True

+    # Apply the data_prefix_skip_mask to the given mask
+    # where relevant, and disables the training mask for the first X tokens
+    data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] or 0
+    def apply_data_prefix_skip_mask(mask):
+        if data_prefix_skip_mask_enabled > 0:
+            for i in range(data_prefix_skip_mask_enabled):
+                mask[i] = 0
+        return mask
+
     # Maps the dataset record to the tokenized result
     # handles a wide variety of formats according to the data configuration
     #
@@ -375,7 +384,7 @@ def map_tokenizer(x):
            return {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
-               'attention_mask': attention_mask
+               'attention_mask': apply_data_prefix_skip_mask(attention_mask)
            }

        # Multi column merging support
@@ -443,7 +452,7 @@ def map_tokenizer(x):
            return {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
-               'attention_mask': attention_mask
+               'attention_mask': apply_data_prefix_skip_mask(attention_mask)
            }

        # Prompt completion support
@@ -472,12 +481,17 @@ def map_tokenizer(x):
            return {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
-               'attention_mask': attention_mask,
+               'attention_mask': apply_data_prefix_skip_mask(attention_mask),
            }

        # Fallback to standard text tokenization
        if 'text' in x:
-           return encodeTokens(x['text'])
+           ret = encodeTokens(x['text'])
+           return {
+               'input_ids': ret['input_ids'],
+               'token_type_ids': ret['token_type_ids'],
+               'attention_mask': apply_data_prefix_skip_mask(ret['attention_mask']),
+           }

        raise ValueError('Invalid dataset format, must contain either the configured "multi column" or prompt/completion or text')
@@ -902,6 +916,18 @@ def __init__(
        # prompt/completion format masking support
        disable_prompt_completion_mask: bool = False,

+       # ----------------------------
+       # Selective loss training
+       # ----------------------------
+
+       # Prefix token masking
+       #
+       # The rationale behind this is that the first X tokens of any new training record
+       # should not be "backpropped", as it is unfair to expect the model (or a human)
+       # to make any reasonable guesses at that stage. As such, this is used to "mask"
+       # the first X tokens from the loss calculation, so they are not backpropped.
+       data_prefix_skip_mask: int = 0,
+
        # ----------------------------
        # dataset packing support
        # ----------------------------

From 979f961e6f07baba03f0501639655a1de6960a1b Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Wed, 17 Jan 2024 17:36:26 +0800
Subject: [PATCH 07/17] experimental factoring

---
 RWKV-v5/src/model.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py
index 7b1c2b0a..fb075eba 100644
--- a/RWKV-v5/src/model.py
+++ b/RWKV-v5/src/model.py
@@ -126,7 +126,7 @@ def forward(self, x, last_state: BlockState):

 class L2Wrap(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, loss, y, token_amount, currentMask):
+    def forward(ctx, loss, y, factor, currentMask):
         # Currently (8th July 2023), save_for_backward causes an issue with
         # pytorch.compile (see: https://github.com/pytorch/pytorch/blob/e600505e3209eaf539e8bc99870ea55236cefbf5/torch/_dynamo/variables/higher_order_ops.py#L735)
         #
         #
         # See also:
         # - checkpointed_step
-        ctx.save_for_backward(y, token_amount, currentMask)
+        ctx.save_for_backward(y, factor, currentMask)
         return loss

     @staticmethod
     def backward(ctx, grad_output):
-        y, token_amount, currentMask = ctx.saved_tensors
+        y, factor, currentMask = ctx.saved_tensors

-        # to encourage the logits to be close to 0
-        factor = 1e-4 / token_amount
         maxx, ids = torch.max(y, -1, keepdim=True)
         gy = torch.zeros_like(y)
         gy.scatter_(-1, ids, maxx * factor)
@@ -910,6 +908,10 @@ def checkpointed_step(idx, targets, mask, last_shift_states,
                                          reduction="none")
            submask = mask.view(-1)[:token_loss.shape[0]]

+           # to encourage the logits to be close to 0
+           # the divisor (here total_mask_sum) is typically the total token count
+           L2Wrap_factor = 1e-4 / total_mask_sum
+
            # Selective token loss logic
            if self.selective_token_loss_threshold > 0.0:
@@ -925,6 +927,9 @@ def checkpointed_step(idx, targets, mask, last_shift_states,
                train_loss = torch.sum(token_loss * train_mask) / total_mask_sum
                train_token_count = torch.sum(train_mask)

+               # Adjust the factor accordingly
+ L2Wrap_factor = L2Wrap_factor * (torch.sum(submask) / train_token_count) + else: train_loss = torch.sum(token_loss * submask) / total_mask_sum sample_loss = train_loss.clone().detach().requires_grad_(False) @@ -932,7 +937,7 @@ def checkpointed_step(idx, targets, mask, last_shift_states, train_mask = submask # L2Wrap for the backprop process - segment_train_loss = L2Wrap.apply(train_loss, logits, total_mask_sum, train_mask) + segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask) # Return the checkpoint values return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count From 370b06e5ca4b5464ea25d38e2b13c47c64e941ce Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 07:40:54 +0800 Subject: [PATCH 08/17] minor notebook reorg --- ...et-packing.ipynb => dataset-packing.ipynb} | 0 .../minipile-validation.ipynb | 199 ++++++------------ ...-length.ipynb => sort-offset-length.ipynb} | 0 3 files changed, 62 insertions(+), 137 deletions(-) rename notebook/trainer-v5-validation/{test-dataset-packing.ipynb => dataset-packing.ipynb} (100%) rename notebook/trainer-v5-validation/{test-sort-offset-length.ipynb => sort-offset-length.ipynb} (100%) diff --git a/notebook/trainer-v5-validation/test-dataset-packing.ipynb b/notebook/trainer-v5-validation/dataset-packing.ipynb similarity index 100% rename from notebook/trainer-v5-validation/test-dataset-packing.ipynb rename to notebook/trainer-v5-validation/dataset-packing.ipynb diff --git a/notebook/trainer-v5-validation/minipile-validation.ipynb b/notebook/trainer-v5-validation/minipile-validation.ipynb index 0810e624..07b1dc74 100644 --- a/notebook/trainer-v5-validation/minipile-validation.ipynb +++ b/notebook/trainer-v5-validation/minipile-validation.ipynb @@ -26,7 +26,7 @@ "output_type": "stream", "text": [ "ENABLE_WANDB: True\n", - "GPU_DEVICES: auto\n", + "GPU_DEVICES: 1\n", "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation\n", "TRAINER_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5\n", "PROJECT_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer\n" @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -73,16 +73,16 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", - "Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.110, 13.33.33.102, ...\n", - "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\n", + "--2024-01-17 16:37:45-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.20, 13.33.33.110, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.\n", "HTTP request sent, awaiting response... 
" ] }, @@ -91,21 +91,21 @@ "output_type": "stream", "text": [ "302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UycA%7Eo%7EEWgjN6kwZtAZSB6k5Nz7B5MQttQCeTVw5OD5T1lTLfhRIX3aFxwLTAyMDMOnWK0KGcnWfha6OcBl9%7EGTSfu408xpCk-PyW0E9W45m5fvR5FqLWgR41zakLePM0Ssu0Wb2syrSKCFElocrwluDvNykuHhUQgdhN9hutXENfd6qC8LZmn68eo-PlqIh6ka8sFyfJa-Bteb3mT1SAPmW19if1jiwcWmtFrB-HrdVtrxrGf033MkimToaxtDR310VEkdYmVnwaPSRcd4Hkfc2CR%7Emdd%7Eg-nzfMERz7Qh2CM%7EV6KBEOB%7EfX2fXXI8mTPVJNqxcIw23ZBEhArczmQ__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 
13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1705739865&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Ca7T7yGaEKb-yz%7EGD34kXCNxrNYrwXXHAs9RwlCecKC9pUblLUPsz2wa1B-tAwJPnf3mjI8aBvhOpqsfeCg4oqM0TBWgwpHRxj%7E1bn8vjZRjYABwsTElLV-Z3rwgtVFKFCxtNQW1WWnf4AZmMDW8mqWjep48Y2-Mw6OzyZ3dWz6pOgA9%7E1osoqHjnZewkRB5RocVgOioqHAZRBc1mrqBd6yy%7E0oBixxb8pXzVOzU-J7JflEZBfvt2vGpuVNzOaYiwcAP7FOiWiFCBHjjWzeGYzcESofs%7E9%7EgALGuLQHGR8NGOZRlA4TvorBZIsd-V2abC1oO05yq8IRo5JmlCot6VQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-17 16:37:45-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/f526abddaa06d376443e69c9a6c0fcbe4302afc0cb1aed08faf3fb97fc5acd10?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.idx%3B+filename%3D%22minipile.idx%22%3B&Expires=1705739865&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvZjUyNmFiZGRhYTA2ZDM3NjQ0M2U2OWM5YTZjMGZjYmU0MzAyYWZjMGNiMWFlZDA4ZmFmM2ZiOTdmYzVhY2QxMD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Ca7T7yGaEKb-yz%7EGD34kXCNxrNYrwXXHAs9RwlCecKC9pUblLUPsz2wa1B-tAwJPnf3mjI8aBvhOpqsfeCg4oqM0TBWgwpHRxj%7E1bn8vjZRjYABwsTElLV-Z3rwgtVFKFCxtNQW1WWnf4AZmMDW8mqWjep48Y2-Mw6OzyZ3dWz6pOgA9%7E1osoqHjnZewkRB5RocVgOioqHAZRBc1mrqBd6yy%7E0oBixxb8pXzVOzU-J7JflEZBfvt2vGpuVNzOaYiwcAP7FOiWiFCBHjjWzeGYzcESofs%7E9%7EgALGuLQHGR8NGOZRlA4TvorBZIsd-V2abC1oO05yq8IRo5JmlCot6VQ__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.62, 13.33.88.7, ...\n", "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", "\n", " The file is already fully retrieved; nothing to do.\n", "\n", - "--2023-12-18 07:39:15-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", - "Resolving huggingface.co (huggingface.co)... 13.33.33.20, 13.33.33.102, 13.33.33.110, ...\n", - "Connecting to huggingface.co (huggingface.co)|13.33.33.20|:443... connected.\n", + "--2024-01-17 16:37:46-- https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.110, 13.33.33.20, ...\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\n", "HTTP request sent, awaiting response... 
302 Found\n", - "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", - "--2023-12-18 07:39:15-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1703115555&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMzExNTU1NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=iDC3gWLKONw98DKGci%7ECza5tw-GGam9Yzp2u-tzqIr8SbJm%7EAWlT8QTLEiof9HrYmViwjTxt7ccXAk1m0Y0h4RchnE3xV1kCcAmCd0i%7EYAn4beKa7SvTgUKETCWGax382LNRM-pFC81TOmrbCPKbMsQKIiKIHCZ6aSjWd%7E-cqNSWs8VhL2Zs9ACnYFQXK%7E%7EOuTklP53PG0BpAfa7IGNxMyLYqQVr%7EzFd2UQAIgqpB2otxphl-e526oYIIun0jb6zcer8Qe93kG4S9O%7ETCKYBYwa2DNEYeeJZT0PAzKQrtbLDHn3LRm%7ES-uit6k-ReRDRJNEwwLsrXo9afWtn%7E9DjxA__&Key-Pair-Id=KCD77M1F0VK2B\n", - "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 
13.33.88.54, 13.33.88.84, 13.33.88.7, ...\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1705739866&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=P90Ai0b76ySfWvt6dvtE2GVikpK-iG9tV1nPlNVKuj52n%7E2XxWBprGEOZ%7EUUK-WSakjaXqum1VlF8WfSB-HtsEbYLG4eWf5oIp2hFDtOZ1u5vxT6q1YaN3FTksCYAemZCYk3rAkyvucmjucOSmbt48eFgBovvQDKdazqtciuU6TQn0eQdxyo7YDY5VMXk8kDitYEjAZKrxxX28PuLV4h9hJxocQnWbDuSp4o7%7E1kih%7EIucA1cECAKfT4f8vUL3O9BGCh5FRb3xSdCyp5FnWrtnrj0eBk%7EyYgUSJziXXc-ZL9ExIdr2xFqVqzCrt3YIiR6uK8U5q6CD9GQbAnoVoquA__&Key-Pair-Id=KCD77M1F0VK2B [following]\n", + "--2024-01-17 16:37:46-- https://cdn-lfs-us-1.huggingface.co/repos/09/8d/098d39f30da901c320a0b91b647dbfcdb64742d734ad97ab2247383b7265662e/9917a52991b9ce5b0b05f92101962ba704cf3c4c20b64431ff8c45ba9d4141a5?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27minipile.bin%3B+filename%3D%22minipile.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1705739866&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTczOTg2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzA5LzhkLzA5OGQzOWYzMGRhOTAxYzMyMGEwYjkxYjY0N2RiZmNkYjY0NzQyZDczNGFkOTdhYjIyNDczODNiNzI2NTY2MmUvOTkxN2E1Mjk5MWI5Y2U1YjBiMDVmOTIxMDE5NjJiYTcwNGNmM2M0YzIwYjY0NDMxZmY4YzQ1YmE5ZDQxNDFhNT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=P90Ai0b76ySfWvt6dvtE2GVikpK-iG9tV1nPlNVKuj52n%7E2XxWBprGEOZ%7EUUK-WSakjaXqum1VlF8WfSB-HtsEbYLG4eWf5oIp2hFDtOZ1u5vxT6q1YaN3FTksCYAemZCYk3rAkyvucmjucOSmbt48eFgBovvQDKdazqtciuU6TQn0eQdxyo7YDY5VMXk8kDitYEjAZKrxxX28PuLV4h9hJxocQnWbDuSp4o7%7E1kih%7EIucA1cECAKfT4f8vUL3O9BGCh5FRb3xSdCyp5FnWrtnrj0eBk%7EyYgUSJziXXc-ZL9ExIdr2xFqVqzCrt3YIiR6uK8U5q6CD9GQbAnoVoquA__&Key-Pair-Id=KCD77M1F0VK2B\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.33.88.54, 13.33.88.62, 13.33.88.7, ...\n", "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.33.88.54|:443... connected.\n", "HTTP request sent, awaiting response... 
416 Requested Range Not Satisfiable\n", "\n", @@ -117,20 +117,19 @@ "source": [ "# Download the minipile files\n", "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.idx https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.idx\n", - "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.bin https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin\n", - "\n" + "!cd \"{PROJECT_DIR}\" && wget --continue -O dataset/minipile.bin https://huggingface.co/datasets/BlinkDL/minipile-tokenized/resolve/main/rwkv_vocab_v20230424/minipile.bin" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2023-12-18 07:32:33,143] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-17 16:41:50,714] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", "---- Initializing model ----\n", "No of layers: 12\n", @@ -140,115 +139,7 @@ "Emb scale: 0.0001\n", "Note: this process takes a significant time (and ram) for large models\n", "---- ----- ----\n", - "---\n", - "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", - "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", - "Detected CUDA files, patching ldflags\n", - "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", - "Building extension module wkv5...\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "ninja: no work to do.\n", - "Loading extension module wkv5...\n", - "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", - "---\n", - "65536 768 -0.0001 emb.weight\n", - "768 768 1.0 blocks.0.att.receptance.weight\n", - "768 768 1.0 blocks.0.att.key.weight\n", - "768 768 1.0 blocks.0.att.value.weight\n", - "768 768 0 blocks.0.att.output.weight\n", - "768 768 1.0 blocks.0.att.gate.weight\n", - "2688 768 1.0 blocks.0.ffn.key.weight\n", - "768 768 0 blocks.0.ffn.receptance.weight\n", - "768 2688 0 blocks.0.ffn.value.weight\n", - "768 768 1.0 blocks.1.att.receptance.weight\n", - "768 768 1.0 blocks.1.att.key.weight\n", - "768 768 1.0 blocks.1.att.value.weight\n", - "768 768 0 blocks.1.att.output.weight\n", - "768 768 1.0 blocks.1.att.gate.weight\n", - "2688 768 1.0 blocks.1.ffn.key.weight\n", - "768 768 0 blocks.1.ffn.receptance.weight\n", - "768 2688 0 blocks.1.ffn.value.weight\n", - "768 768 1.0 blocks.2.att.receptance.weight\n", - "768 768 1.0 blocks.2.att.key.weight\n", - "768 768 1.0 blocks.2.att.value.weight\n", - "768 768 0 blocks.2.att.output.weight\n", - "768 768 1.0 blocks.2.att.gate.weight\n", - "2688 768 1.0 blocks.2.ffn.key.weight\n", - "768 768 0 blocks.2.ffn.receptance.weight\n", - "768 2688 0 blocks.2.ffn.value.weight\n", - "768 768 1.0 blocks.3.att.receptance.weight\n", - "768 768 1.0 blocks.3.att.key.weight\n", - "768 768 1.0 blocks.3.att.value.weight\n", - "768 768 0 blocks.3.att.output.weight\n", - "768 768 1.0 blocks.3.att.gate.weight\n", - "2688 768 1.0 blocks.3.ffn.key.weight\n", - "768 768 0 blocks.3.ffn.receptance.weight\n", - "768 2688 0 blocks.3.ffn.value.weight\n", - "768 768 1.0 blocks.4.att.receptance.weight\n", - "768 768 1.0 blocks.4.att.key.weight\n", - "768 768 1.0 
blocks.4.att.value.weight\n", - "768 768 0 blocks.4.att.output.weight\n", - "768 768 1.0 blocks.4.att.gate.weight\n", - "2688 768 1.0 blocks.4.ffn.key.weight\n", - "768 768 0 blocks.4.ffn.receptance.weight\n", - "768 2688 0 blocks.4.ffn.value.weight\n", - "768 768 1.0 blocks.5.att.receptance.weight\n", - "768 768 1.0 blocks.5.att.key.weight\n", - "768 768 1.0 blocks.5.att.value.weight\n", - "768 768 0 blocks.5.att.output.weight\n", - "768 768 1.0 blocks.5.att.gate.weight\n", - "2688 768 1.0 blocks.5.ffn.key.weight\n", - "768 768 0 blocks.5.ffn.receptance.weight\n", - "768 2688 0 blocks.5.ffn.value.weight\n", - "768 768 1.0 blocks.6.att.receptance.weight\n", - "768 768 1.0 blocks.6.att.key.weight\n", - "768 768 1.0 blocks.6.att.value.weight\n", - "768 768 0 blocks.6.att.output.weight\n", - "768 768 1.0 blocks.6.att.gate.weight\n", - "2688 768 1.0 blocks.6.ffn.key.weight\n", - "768 768 0 blocks.6.ffn.receptance.weight\n", - "768 2688 0 blocks.6.ffn.value.weight\n", - "768 768 1.0 blocks.7.att.receptance.weight\n", - "768 768 1.0 blocks.7.att.key.weight\n", - "768 768 1.0 blocks.7.att.value.weight\n", - "768 768 0 blocks.7.att.output.weight\n", - "768 768 1.0 blocks.7.att.gate.weight\n", - "2688 768 1.0 blocks.7.ffn.key.weight\n", - "768 768 0 blocks.7.ffn.receptance.weight\n", - "768 2688 0 blocks.7.ffn.value.weight\n", - "768 768 1.0 blocks.8.att.receptance.weight\n", - "768 768 1.0 blocks.8.att.key.weight\n", - "768 768 1.0 blocks.8.att.value.weight\n", - "768 768 0 blocks.8.att.output.weight\n", - "768 768 1.0 blocks.8.att.gate.weight\n", - "2688 768 1.0 blocks.8.ffn.key.weight\n", - "768 768 0 blocks.8.ffn.receptance.weight\n", - "768 2688 0 blocks.8.ffn.value.weight\n", - "768 768 1.0 blocks.9.att.receptance.weight\n", - "768 768 1.0 blocks.9.att.key.weight\n", - "768 768 1.0 blocks.9.att.value.weight\n", - "768 768 0 blocks.9.att.output.weight\n", - "768 768 1.0 blocks.9.att.gate.weight\n", - "2688 768 1.0 blocks.9.ffn.key.weight\n", - "768 768 0 blocks.9.ffn.receptance.weight\n", - "768 2688 0 blocks.9.ffn.value.weight\n", - "768 768 1.0 blocks.10.att.receptance.weight\n", - "768 768 1.0 blocks.10.att.key.weight\n", - "768 768 1.0 blocks.10.att.value.weight\n", - "768 768 0 blocks.10.att.output.weight\n", - "768 768 1.0 blocks.10.att.gate.weight\n", - "2688 768 1.0 blocks.10.ffn.key.weight\n", - "768 768 0 blocks.10.ffn.receptance.weight\n", - "768 2688 0 blocks.10.ffn.value.weight\n", - "768 768 1.0 blocks.11.att.receptance.weight\n", - "768 768 1.0 blocks.11.att.key.weight\n", - "768 768 1.0 blocks.11.att.value.weight\n", - "768 768 0 blocks.11.att.output.weight\n", - "768 768 1.0 blocks.11.att.gate.weight\n", - "2688 768 1.0 blocks.11.ffn.key.weight\n", - "768 768 0 blocks.11.ffn.receptance.weight\n", - "768 2688 0 blocks.11.ffn.value.weight\n", - "65536 768 0.5 head.weight\n" + "Model exists, skipping init_model\n" ] } ], @@ -288,16 +179,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2023-12-18 11:57:34,927] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-17 16:42:01,086] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", - "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from 
within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--model.load_model=../model/L12-D768-world-init.pth'].\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=1', '--model.load_model=../model/L12-D768-world-init.pth'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/minipile-world-512.yaml', '--trainer.logger.init_args.name=infctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=1', '--model.load_model=../model/L12-D768-world-init.pth'].\n", "Seed set to 3941088705\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", @@ -330,14 +221,14 @@ "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.1 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.2 is available! 
To upgrade, please run:\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.0\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20231218_115739-69qe82py\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240117_164210-bvjhu7ex\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33minfctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test\u001b[0m\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/69qe82py\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/bvjhu7ex\u001b[0m\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/trainer-validaiton/infctx-v5-minipile-512 exists and is not empty.\n", "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", "#\n", @@ -360,7 +251,7 @@ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", "ninja: no work to do.\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.05427098274230957 seconds\n", + "Time to load fused_adam op: 0.05117464065551758 seconds\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "Loading `train_dataloader` to estimate number of stepping batches.\n", @@ -377,9 +268,43 @@ "192 M Total params\n", "771.232 Total estimated model params size (MB)\n", "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:104: Total length of `DataLoader` across ranks is zero. Please make sure this was your intention.\n", - "Epoch 0: 1%| | 1000/183005 [02:20<7:07:18, 7.10it/s, v_num=82py, train/loss=5/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + "Epoch 0: 1%| | 1000/183005 [02:24<7:19:45, 6.90it/s, v_num=u7ex, train/loss=5/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", - "Epoch 0: 40%|▍| 74044/183005 [2:55:32<4:18:19, 7.03it/s, v_num=82py, train/los" + "Epoch 0: 100%|█| 183005/183005 [7:26:54<00:00, 6.82it/s, v_num=u7ex, train/loss`Trainer.fit` stopped: `max_epochs=1` reached.\n", + "Epoch 0: 100%|█| 183005/183005 [7:26:54<00:00, 6.82it/s, v_num=u7ex, train/loss\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: epoch ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 ▃▅▂▂▁▂▃▃▄▂▃▃▄▅▅▆▆▆▇▆▆▆▆▇▇▇▇▇▇▇▇█████████\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: substep ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/ctx_len ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/data_loss █▇▅▇▆▄▇▄▅▅▄▄▄▂▅▃▅▃▄▄▄▂▃▃▃▄▃▁▃▂▃▂▁▃▂▃▂▃▂▂\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss █▇▄▆▃▅▄▃▄▄▄▅▃▄▅▃▄▂▃▂▄▃▂▂▄▃▃▂▃▂▂▃▃▃▃▂▁▂▁▃\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/tokens █▇▅▇▆▄▇▄▆▅▄▃▄▃▅▄▆▄▆▄▅▃▃▄▄▄▄▁▃▃▃▃▁▅▃▄▂▃▃▄\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate ████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: batchidx 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: epoch 0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: global_rank 0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_per_sec.gpu.0 55909.00069\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: perf/tokens_total.gpu.0 1499171840\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: substep 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/ctx_len 192.0\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/data_loss 2.9375\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 2.76562\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: train/tokens 127.375\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/global_step 183004\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: trainer/learning_rate 6e-05\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33minfctx-v5-validation - MiniPile (train-ctx=512, data-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/runs/bvjhu7ex\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-infctx-unit-test/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjExNjk3NDc5Mw==/version_details/v9\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240117_164210-bvjhu7ex/logs\u001b[0m\n" ] } ], diff --git a/notebook/trainer-v5-validation/test-sort-offset-length.ipynb b/notebook/trainer-v5-validation/sort-offset-length.ipynb similarity index 100% rename from notebook/trainer-v5-validation/test-sort-offset-length.ipynb 
rename to notebook/trainer-v5-validation/sort-offset-length.ipynb

From 6ec9641369b5109517e8409296c6c18b1969aa4b Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Thu, 18 Jan 2024 08:26:33 +0800
Subject: [PATCH 09/17] apply data prefix skip mask for rechunk text too

---
 RWKV-v5/src/data.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py
index b5a0a984..ce30618e 100644
--- a/RWKV-v5/src/data.py
+++ b/RWKV-v5/src/data.py
@@ -272,8 +272,9 @@ def encodeTokens(x):
     # where relevant, and disables the training mask for the first X tokens
     data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None
     def apply_data_prefix_skip_mask(mask):
-        if data_prefix_skip_mask_enabled > 0:
-            for i in range(data_prefix_skip_mask_enabled):
+        mask_len = len(mask)
+        if data_prefix_skip_mask_enabled > 0 and mask_len:
+            for i in range(max(data_prefix_skip_mask_enabled, mask_len)):
                 mask[i] = 0
         return mask

@@ -533,7 +534,7 @@ def rechunk_text(x):
             # with the newline token in between
             full_input_ids += x["input_ids"][i] + endOfDoc_tokenSet["input_ids"][0]
             full_token_type_ids += x["token_type_ids"][i] + endOfDoc_tokenSet["token_type_ids"][0]
-            full_attention_mask += x["attention_mask"][i] + endOfDoc_tokenSet["attention_mask"][0]
+            full_attention_mask += apply_data_prefix_skip_mask( x["attention_mask"][i] ) + endOfDoc_tokenSet["attention_mask"][0]

     # Total length, and sample count
     # note that the "remainder" will be discarded
@@ -554,7 +555,7 @@ def rechunk_text(x):
             # Push the sample to the output arrays
             out_input_ids.append(full_input_ids[start:end])
             out_token_type_ids.append(full_token_type_ids[start:end])
-            out_attention_mask.append(full_attention_mask[start:end])
+            out_attention_mask.append(apply_data_prefix_skip_mask( full_attention_mask[start:end] ))

     # Prepare and return the output object
     ret = {

From 8805a7372299f7cedfbf0a75878d2d8424055180 Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Thu, 18 Jan 2024 11:00:27 +0800
Subject: [PATCH 10/17] wip data masking tweak

---
 RWKV-v5/src/data.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py
index ce30618e..bd22786e 100644
--- a/RWKV-v5/src/data.py
+++ b/RWKV-v5/src/data.py
@@ -46,6 +46,19 @@ def prepare_data_static(**kargs):

     # =====================================================
+    # Util functions
+    #--------------------------------
+
+    # Apply the data_prefix_skip_mask to the given mask
+    # where relevant, and disables the training mask for the first X tokens
+    data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None
+    def apply_data_prefix_skip_mask(mask):
+        mask_len = len(mask)
+        if data_prefix_skip_mask_enabled > 0 and mask_len:
+            for i in range(max(data_prefix_skip_mask_enabled, mask_len)):
+                mask[i] = 0
+        return mask
+
     # Special handling for binidx
     #--------------------------------

@@ -66,7 +79,7 @@ def gen():
             yield {
                 'input_ids': tokens,
                 'token_type_ids': [0] * len(tokens),
-                'attention_mask': [1] * len(tokens)
+                'attention_mask': apply_data_prefix_skip_mask([1] * len(tokens))
             }

     # Load the huggingface dataset from the generator
@@ -268,16 +281,6 @@ def encodeTokens(x):

         conversation_enabled = True

-    # Apply the data_prefix_skip_mask to the given mask
-    # where relevant, and disables the training mask for the first X tokens
-    data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None
-    def apply_data_prefix_skip_mask(mask):
-        mask_len
= len(mask)
-        if data_prefix_skip_mask_enabled > 0 and mask_len:
-            for i in range(max(data_prefix_skip_mask_enabled, mask_len)):
-                mask[i] = 0
-        return mask
-
     # Maps the dataset record to the tokenized result
     # handles a wide variety of format according to the data configuration
     #

From cf43c2cf40d2f97cfc30d4b8b03a3199fbc3b9bd Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Thu, 18 Jan 2024 11:15:09 +0800
Subject: [PATCH 11/17] fixing data masking

---
 RWKV-v5/src/data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py
index bd22786e..9534b582 100644
--- a/RWKV-v5/src/data.py
+++ b/RWKV-v5/src/data.py
@@ -51,11 +51,11 @@ def prepare_data_static(**kargs):

     # Apply the data_prefix_skip_mask to the given mask
     # where relevant, and disables the training mask for the first X tokens
-    data_prefix_skip_mask_enabled = kargs["data_prefix_skip_mask"] is not None
+    data_prefix_skip_mask_val = int(kargs["data_prefix_skip_mask"])
     def apply_data_prefix_skip_mask(mask):
         mask_len = len(mask)
-        if data_prefix_skip_mask_enabled > 0 and mask_len:
-            for i in range(max(data_prefix_skip_mask_enabled, mask_len)):
+        if data_prefix_skip_mask_val > 0 and mask_len:
+            for i in range(min(data_prefix_skip_mask_val, mask_len)):
                 mask[i] = 0
         return mask

@@ -1052,4 +1052,4 @@ def val_dataloader(self):
             batch_size=1,
             # Pinned in GPU memory
             pin_memory=True
-        )
+        )
\ No newline at end of file

From f5fdb89b56fa8ef8a87edbb225b3dcd039af3eea Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Thu, 18 Jan 2024 11:19:06 +0800
Subject: [PATCH 12/17] skipped fully masked records

---
 RWKV-v5/src/data.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py
index 9534b582..b391a2df 100644
--- a/RWKV-v5/src/data.py
+++ b/RWKV-v5/src/data.py
@@ -583,6 +583,8 @@ def dataset_filter(x):
                 return False
             if kargs["max_token_size"] > 0 and row_length > kargs["max_token_size"]:
                 return False
+            if sum(x["attention_mask"]) <= 0:
+                return False
             return True
         src_dataset = src_dataset.filter(dataset_filter, num_proc=num_cpus)

From d3b3f182581b71e7169987416ad16558b7746e47 Mon Sep 17 00:00:00 2001
From: "picocreator (Eugene Cheah)"
Date: Thu, 18 Jan 2024 12:16:55 +0800
Subject: [PATCH 13/17] Fixing microbatches

---
 RWKV-v5/src/model.py | 22 +-
 .../config/enwiki_10k-world-full.yaml | 265 ++++++++++
 .../dataset-microbatch.ipynb | 469 ++++++++++++++++++
 3 files changed, 751 insertions(+), 5 deletions(-)
 create mode 100644 notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml
 create mode 100644 notebook/trainer-v5-validation/dataset-microbatch.ipynb

diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py
index fb075eba..1d065584 100644
--- a/RWKV-v5/src/model.py
+++ b/RWKV-v5/src/model.py
@@ -911,9 +911,18 @@ def checkpointed_step(idx, targets, mask, last_shift_states,
                                last_wkv_states, prev_steps):
             # to encourage the logits to be close to 0
             # factor_divisor is typically the total token count
             L2Wrap_factor = 1e-4 / total_mask_sum
+
+            # Submask count
+            submask_count = torch.sum(submask)

             # Selective token loss logic
-            if self.selective_token_loss_threshold > 0.0:
+            if submask_count <= 0.0:
+                train_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_()
+                sample_loss = train_loss.clone().detach().requires_grad_(False)
+                train_token_count = 0
+                train_mask = submask
+
+            elif self.selective_token_loss_threshold > 0.0:

                 # Sample loss, without backprop
                 with torch.no_grad():
@@ -928,16 +937,19 @@ def checkpointed_step(idx, targets, mask,
last_shift_states,
                 train_token_count = torch.sum(train_mask)

                 # Adjust the factor accordingly
-                L2Wrap_factor = L2Wrap_factor * (torch.sum(submask) / train_token_count)
+                L2Wrap_factor = L2Wrap_factor * (submask_count / train_token_count)

             else:
                 train_loss = torch.sum(token_loss * submask) / total_mask_sum
                 sample_loss = train_loss.clone().detach().requires_grad_(False)
-                train_token_count = torch.sum(submask)
+                train_token_count = submask_count
                 train_mask = submask

-            # L2Wrap for the backprop process
-            segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask)
+            if train_loss <= 0.0:
+                segment_train_loss = torch.tensor(0, dtype=self.emb.weight.dtype).requires_grad_()
+            else:
+                # L2Wrap for the backprop process
+                segment_train_loss = L2Wrap.apply(train_loss, logits, L2Wrap_factor, train_mask)

             # Return the checkpoint values
             return sample_loss, segment_train_loss, new_shift_states, new_wkv_states, train_token_count

diff --git a/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml b/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml
new file mode 100644
index 00000000..85b60f6a
--- /dev/null
+++ b/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml
@@ -0,0 +1,265 @@
+# lightning.pytorch==2.0.2
+seed_everything: 3941088705
+trainer:
+
+  #
+  # Configure the deepspeed strategy. We recommend you start with `deepspeed_stage_2_offload`
+  # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful
+  # for training LoRA on large models on a single GPU.
+  #
+  # In general you would want to use the following:
+  #
+  # - deepspeed_stage_1 : Each of your GPUs has too much vram, and you do not know what to do
+  #
+  # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpus each with sufficient vram
+  # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu
+  #
+  # - deepspeed_stage_3 : Split up the model across multiple gpus, useful for large models, at a performance cost
+  # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost
+  #
+  # For more details see:
+  # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
+  #
+  strategy: deepspeed_stage_2_offload
+
+  # Logger setting for wandb; if you want to enable wandb, uncomment the whole logger section
+  # ---
+  logger:
+    class_path: lightning.pytorch.loggers.WandbLogger
+    init_args:
+      name: 'infctx-v5-unit-test-baseline (train-ctx=4096, data-ctx=full)'
+      project: 'RWKV-infctx-unit-test'
+      tags: ['RWKV', 'infctx']
+
+  # Checkpoint settings for the training process
+  callbacks:
+    class_path: lightning.pytorch.callbacks.ModelCheckpoint
+    init_args:
+      # Configure this to the path you want to save your checkpoints to
+      # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
+      #
+      # to convert a checkpoint to a model, you can use the
+      # `python3 export_checkpoint.py ` script,
+      # which will create a `rwkv_model.pth` in the checkpoint directory.
+      #
+      # Do not use the `zero_to_fp32.py` script as that will have export format issues
+      dirpath: ../checkpoint/trainer-validaiton/infctx-v5-enwiki-10k-full
+      filename: null
+
+      # Save the top/last K checkpoints
+      save_top_k: 3
+      # Choose by the most recent checkpoints (step based)
+      monitor: 'step'
+      mode: max
+
+      # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
+      # useful to simplify checkpoint resume scripts, at a price of disk performance
+      save_last: false
+
+      # DO NOT set this as true, as the model weight exported will have format issues
+      # export as checkpoint, and use the `export_checkpoint.py` script to convert to a model instead
+      save_weights_only: false
+
+      # How frequently you want to save a checkpoint (in steps).
+      # This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
+      #
+      # In general you will want to avoid putting a low number (especially if accumulate_grad_batches <= 100)
+      # as the checkpoint process will pause all the gpu training for some time, slowing down the overall process
+      # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes
+      every_n_train_steps: 100
+      every_n_epochs: null
+      save_on_train_epoch_end: true
+      train_time_interval: null
+
+      # Other settings, you can probably leave alone
+      verbose: false
+      auto_insert_metric_name: true
+
+  ########################################
+  ## Training run parameter settings
+  ########################################
+
+  # Generally what you want to configure is the maximum number of epochs
+  # Leave it as -1, and it will keep going forever till interrupted
+  # Or set it as a number, and it will stop after that number of epochs
+  max_epochs: 1
+  min_epochs: null
+  max_steps: -1
+  min_steps: null
+  max_time: null
+
+  # Number of datasamples to train for each step; a data sample is considered
+  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
+  #
+  # This decides the number of datasamples to learn together from, before backpropagating
+  # any weight changes at the end of the batch.
+  #
+  # Recommended to be a big enough number (like 128/256) where it prevents the training
+  # loss from fluctuating in the process. But not too big of a number where the increased
+  # GPU vRAM / offloaded RAM usage will cause the training to crash.
+  #
+  # You are also recommended to configure this to a large enough number to fully utilize
+  # your GPU processing time %, and avoid idle time for the GPU between batches
+  #
+  # This number is divided by the number of GPUs, and nodes configured
+  # So if you have 4 GPUs, and 2 nodes, and this is configured as 128
+  # Each GPU will process 128/4/2 = 16 datasamples per step, via accumulate_grad_batches
+  target_batch_size: 16
+
+########################################
+## Training model settings
+########################################
+model:
+  # Model to start the finetune/training process from
+  load_model: ../model/L24-D2048-world-v5base-init.pth
+
+  # Context length to use for the training process
+  # the larger the number (and batch size), the larger the vram usage
+  #
+  # Note that if the datasample context length is larger than the ctx_len
+  # its training process would be split into ctx_len sized chunks.
+  #
+  # This allows the training of extremely large context length (eg.
100k),
+  # without eating up too much vram by keeping the training context length
+  # to a reasonable number suitable to the current GPU setup
+  ctx_len: 4096
+
+  # Data samples would be cut down to the respective max ctx_len_cutoffs
+  # values if it is larger than ctx_len. If the data sample is larger than
+  # the largest len_cutoff, the remaining data will be discarded
+  ctx_len_cutoffs: []
+  # Experimental settings, number of tokens to skip in the data sample
+  # prefix, for the respective cutoff length. Used to speed up the process
+  ctx_len_warmup_steps: []
+
+  # Learning rate of the training process
+  # ---
+
+  # Initial learning rate of the process
+  lr_init: 8e-4
+  # Final learning rate after the learning rate period
+  # learning rate will stay at final value from then onwards
+  lr_final: 4e-4
+
+  # Number of epochs to reduce the learning rate from lr_init to lr_final
+  # 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
+  # 0 means lr_final will apply immediately
+  # -1 means we take the current max_step / max_epoch as the period
+  lr_period: 1
+  # lr_period type if it's set, defaults to epoch
+  lr_period_type: epoch
+
+  # Adam optimizer settings
+  # You probably want to leave this alone, unless you know what you are doing
+  beta1: 0.9
+  beta2: 0.99
+  adam_eps: 1.0e-08
+  weight_decay: 0.01
+
+  # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
+  # this should be set as null, for non cuda core GPUs
+  torch_set_float32_matmul_precision: 'high'
+  # torch_set_float32_matmul_precision: null
+
+  # Segment based learning, used to work around training of large context length
+  # beyond what can be supported by the current GPU vram architecture
+  #
+  # This is not 1:1 equivalent to the same training process with the required vram,
+  # as the training process is split into multiple segments, part by part,
+  # with limited learnings from the previous segment.
+  bptt_learning: true
+
+  # Segmented range to perform backprop learning on
+  # 1 means to apply only for the last segment
+  # -1 means to apply for all segments
+  bptt_learning_range: -1
+
+data:
+  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
+  #
+  # Use this if you have built your own dataset and saved it with `save_to_disk()`
+  # with source left as null. Otherwise configure this to a directory in which the
+  # dataset will be built and tokenized by the huggingface dataset process.
+  data_path: ../datapath/enwiki_10k-world-4096/
+
+  # Otherwise provide the source path, which is used as huggingface dataset path
+  # this will be used to populate the dataset_path
+  #
+  # Use either the following
+  # - hugging face dataset
+  # - Directory path to a directory containing dataset files
+  # - Path to a single dataset file
+  # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then)
+  # - null
+  #
+  # If source is disabled, all other params, except data_path, are ignored
+  source: "teven/enwiki_10k"
+  # source: text
+  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
+
+  # Use data_dir, if you are using source=text/json/etc
+  # this should be relative to the trainer script path
+  source_data_dir: null
+
+  # After loading the dataset, split out test data used for unit-test,
+  # This process is skipped if the dataset includes a test split
+  # This process is skipped if set to zero
+  test_split: 0.01
+  test_split_shuffle: false
+
+  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
+  # If using a custom tokenizer, provide the tokenizer file path
+  # ---
+  tokenizer: world
+
+  # Minimum / Maximum token size of the dataset to use
+  # useful for filtering out small noisy data samples from large datasets
+  # (eg. removal of small articles of less than 512 tokens from wikipedia)
+  #
+  # This is ignored, if set to -1
+  min_token_size: -1
+  max_token_size: -1
+
+  # Rechunking of text dataset, this is done only when source is set as 'text'
+  # and will merge the various sentences into larger chunks up to the target size
+  #
+  # Defaults to 4096
+  #
+  # This is ignored, if source is not set as text
+  # This is ignored, if set to zero
+  # ---
+  # text_rechunk_size: 4096
+
+  # Apply text rechunk to the dataset, even if it's not a 'text' source
+  # This is done only after dataset filtering, and if source is not 'text'
+  # ---
+  text_rechunk_force: false
+
+  # Custom text column to use, useful for datasets with alternative training column labels
+  # This is checked before multi column merging, default is null (disabled)
+  # eg: 'code'
+  # ---
+  # custom_text_key: 'code'
+
+  # Multi Column merging process, default setting is used to support and merge
+  # "instruction", "input", "output", datasets.
To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # --- + # multi_column_keys: ['instruction', 'input', 'output'] + # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n'] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: '\n\n' + + # If processing prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + +# Path to the current checkpoint to continue training from +# Enable this to the last checkpoint after the first run +# (if it crash and you want to resume) +# ckpt_path: ../checkpoint/trainer-validaiton/infctx-unit-test-baseline/epoch=0-step=20.ckpt +ckpt_path: null diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb new file mode 100644 index 00000000..73ebc960 --- /dev/null +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -0,0 +1,469 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset microbatch testing\n", + "\n", + "Testing runs on multiple micro batch settings" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: False\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation\n", + "TRAINER_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /home/picocreator/rwkv-proj/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=False\n", + "WANDB_PREFIX=\"infctx-v5-microbatch\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 11:19:39,010] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "---- Initializing model ----\n", + "No of layers: 6\n", + "Embedding size: 512\n", + "Output model path: ../model/L6-D512-world-v5base-init.pth\n", + "Vocab size: 65536\n", + "Emb scale: 0.0001\n", + "Note: this process takes a significant time (and ram) for large models\n", + "---- ----- ----\n", + "Model exists, skipping init_model\n" + ] + } + ], + "source": [ + "# Init the model\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 ./init_model.py \\\n", + " --n_layer 6 --n_embd 512 \\\n", + " --vocab_size world --skip-if-exists \\\n", + " 
\"../model/L6-D512-world-v5base-init.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Map (num_proc=16): 100%|█████████| 10000/10000 [00:01<00:00, 9575.14 examples/s]\n", + "Filter (num_proc=16): 100%|█████| 10000/10000 [00:00<00:00, 12203.75 examples/s]\n", + "Map (num_proc=16): 100%|██████████| 9892/9892 [00:00<00:00, 20646.21 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 241357.37 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28064.93 examples\n" + ] + } + ], + "source": [ + "# Lets preload the requried dataset \n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 preload_datapath.py \"{NOTEBOOK_DIR}/config/enwiki_10k-world-full.yaml\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# microbatch=1\n", + "\n", + "Note: We are intentionally testing without rechunk, as that has known edge case issues." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 12:00:55,830] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 1 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=1', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 1 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=1', '--trainer.devices=auto'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 16\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 1\n", + " - accumulate_grad_batches: 16\n", + " - effective_batch_size: 16\n", + "\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 595479.80 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28472.64 examples\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/v5-enwiki-10k-full/ exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05255270004272461 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 16%|▏| 1600/9892 [00:55<04:49, 28.62it/s, v_num=mu7h, train/loss=5.310/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█| 9892/9892 [05:45<00:00, 28.62it/s, v_num=mu7h, train/loss=4.090\n", + "Validation: | | 0/? [00:00=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05469512939453125 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 21%|▋ | 34/162 [00:19<01:12, 1.76it/s, v_num=oczk, train/loss=8.880]^C\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...\n" + ] + } + ], + "source": [ + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/config/enwiki_10k-world-full.yaml\" \\\n", + " --model.load_model=\"../model/L6-D512-world-v5base-init.pth\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/v5-enwiki-10k-full/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Microbatch 2 - (deepspeed_stage_1)\" \\\n", + " --trainer.strategy=\"deepspeed_stage_1\" \\\n", + " --trainer.microbatch_size=2 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rwkv-infctx", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f24eb074ccbb390ceead13acd22c0d837a2347f3 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:22:28 +0800 Subject: [PATCH 14/17] disable selective token loss by default --- RWKV-v5/src/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 1d065584..f52d1d60 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -197,7 +197,7 @@ def __init__(self, position_loss_bias_in_validation: bool = False, # Selective loss settings - selective_token_loss_threshold: float = 1.0, + selective_token_loss_threshold: float = 0.0, # Backprop settings grad_cp: bool = True, From 25d1354dbe013d1df24e5cb64833b29f0033c97c Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:22:41 +0800 Subject: [PATCH 15/17] microbatch validation --- .../dataset-microbatch.ipynb | 357 +++++++++++++++++- 1 file changed, 338 insertions(+), 19 deletions(-) diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb index 73ebc960..df25fda5 100644 --- a/notebook/trainer-v5-validation/dataset-microbatch.ipynb +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -330,27 +330,230 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-01-18 12:11:50,734] [INFO] 
[real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=2', '--trainer.devices=auto'].\n", + "Seed set to 3941088705\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 16\n", + " - num_nodes: 1\n", + " - num_devices: 1\n", + " - microbatch_size: 2\n", + " - accumulate_grad_batches: 8\n", + " - effective_batch_size: 16\n", + "\n", + "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 530522.66 examp\n", + "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28455.25 examples\n", + "[rank: 0] Seed set to 3941088705\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory ../checkpoint/v5-enwiki-10k-full/ exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 8.000e-04 (0.0008)\n", + " - lr_final: 4.000e-04 (0.0004)\n", + "\n", + "Using /home/picocreator/.cache/torch_extensions/py311_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /home/picocreator/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05180692672729492 seconds\n", + "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading `train_dataloader` to estimate number of stepping batches.\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 33.6 M\n", + "1 | blocks | ModuleList | 20.5 M\n", + "2 | ln_out | LayerNorm | 1.0 K \n", + "3 | head | Linear | 33.6 M\n", + "--------------------------------------\n", + "87.6 M Trainable params\n", + "0 Non-trainable params\n", + "87.6 M Total params\n", + "350.405 Total estimated model params size (MB)\n", + "Epoch 0: 16%|▏| 800/4946 [00:35<03:05, 22.41it/s, v_num=3o87, train/loss=5.060]/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█| 4946/4946 [03:42<00:00, 22.19it/s, v_num=3o87, train/loss=5.720\n", + "Validation: | | 0/? 
[00:00\n", + " cli_main()\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 271, in cli_main\n", + " LightningCLI(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 386, in __init__\n", + " self._run_subcommand(self.subcommand)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 677, in _run_subcommand\n", + " fn(**fn_kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 544, in fit\n", + " call._call_and_handle_interrupt(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 43, in _call_and_handle_interrupt\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 102, in launch\n", + " return function(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 580, in _fit_impl\n", + " self._run(model, ckpt_path=ckpt_path)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 989, in _run\n", + " results = self._run_stage()\n", + " ^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 1035, in _run_stage\n", + " self.fit_loop.run()\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 202, in run\n", + " self.advance()\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 359, in advance\n", + " self.epoch_loop.run(self._data_fetcher)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 136, in run\n", + " self.advance(data_fetcher)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 240, in advance\n", + " batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 187, in run\n", + " self._optimizer_step(batch_idx, closure)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 265, in _optimizer_step\n", + " call._call_lightning_module_hook(\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 157, in _call_lightning_module_hook\n", + " output = fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/module.py\", line 1282, in optimizer_step\n", + " optimizer.step(closure=optimizer_closure)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py\", line 151, in step\n", + " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 264, in optimizer_step\n", + " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 230, in optimizer_step\n", + " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 123, in optimizer_step\n", + " closure_result = closure()\n", + " ^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 140, in __call__\n", + " self._result = self.closure(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/utils/_contextlib.py\", line 115, in decorate_context\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 126, in closure\n", + " step_output = self._step_fn()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 315, in _training_step\n", + " training_step_output = call._call_strategy_hook(trainer, \"training_step\", *kwargs.values())\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 309, in _call_strategy_hook\n", + " output = fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 381, in training_step\n", + " return self._forward_redirection(self.model, self.lightning_module, \"training_step\", *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 633, in __call__\n", + " wrapper_output = wrapper_module(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n", + " ret_val = func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1818, in forward\n", + " loss = self.module(*inputs, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 626, in wrapped_forward\n", + " out = method(*_args, **_kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1234, in training_step\n", + " total_loss = self.compute_loss(batch, batch_idx, True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1132, in compute_loss\n", + " self.manual_backward(learning_loss, optimizer, retain_graph=True)\n", + " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 779, in manual_backward\n", + " self.trainer.strategy.backward(loss, None, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 204, in backward\n", + " self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 112, in backward\n", + " deepspeed_engine.backward(tensor, *args, **kwargs)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n", + " ret_val = func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1940, in backward\n", + " self.optimizer.backward(loss, retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1953, in backward\n", + " self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/fp16/loss_scaler.py\", line 63, in backward\n", + " scaled_loss.backward(retain_graph=retain_graph)\n", + " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_tensor.py\", line 492, in backward\n", + " torch.autograd.backward(\n", + " File 
\"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/autograd/__init__.py\", line 251, in backward\n", + " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", + "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.69 GiB. GPU 0 has a total capacty of 22.16 GiB of which 1.03 GiB is free. Including non-PyTorch memory, this process has 21.09 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 4.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n" ] } ], @@ -439,7 +758,7 @@ " --trainer.callbacks.init_args.dirpath=\"../checkpoint/v5-enwiki-10k-full/\" \\\n", " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Microbatch 2 - (deepspeed_stage_1)\" \\\n", " --trainer.strategy=\"deepspeed_stage_1\" \\\n", - " --trainer.microbatch_size=2 \\\n", + " --trainer.microbatch_size=4 \\\n", " --trainer.devices=\"{GPU_DEVICES}\"" ] } From 86808e9cde9181c057ac16ddaf0175bdd7908285 Mon Sep 17 00:00:00 2001 From: "picocreator (Eugene Cheah)" Date: Thu, 18 Jan 2024 12:25:43 +0800 Subject: [PATCH 16/17] microbatch validation --- .../dataset-microbatch.ipynb | 245 +++++++++--------- 1 file changed, 116 insertions(+), 129 deletions(-) diff --git a/notebook/trainer-v5-validation/dataset-microbatch.ipynb b/notebook/trainer-v5-validation/dataset-microbatch.ipynb index df25fda5..cf45e52c 100644 --- a/notebook/trainer-v5-validation/dataset-microbatch.ipynb +++ b/notebook/trainer-v5-validation/dataset-microbatch.ipynb @@ -544,16 +544,16 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2024-01-18 12:17:38,330] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-01-18 12:18:03,234] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.1.1'\n", - "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=8', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=8', '--trainer.devices=auto'].\n",
+ "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=4', '--trainer.devices=auto'], args=['fit', '-c', '/home/picocreator/rwkv-proj/RWKV-infctx-trainer/notebook/trainer-v5-validation/config/enwiki_10k-world-full.yaml', '--model.load_model=../model/L6-D512-world-v5base-init.pth', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5-enwiki-10k-full/', '--trainer.logger.init_args.name=infctx-v5-microbatch - Microbatch 2 - (deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.microbatch_size=4', '--trainer.devices=auto'].\n",
 "Seed set to 3941088705\n",
 "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
 " return self.fget.__get__(instance, owner)()\n",
@@ -578,12 +578,12 @@
 " - target_batch_size: 16\n",
 " - num_nodes: 1\n",
 " - num_devices: 1\n",
- " - microbatch_size: 8\n",
- " - accumulate_grad_batches: 2\n",
+ " - microbatch_size: 4\n",
+ " - accumulate_grad_batches: 4\n",
 " - effective_batch_size: 16\n",
 "\n",
- "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 516553.02 examp\n",
- "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28147.80 examples\n",
+ "Saving the dataset (1/1 shards): 100%|█| 9892/9892 [00:00<00:00, 583184.18 examp\n",
+ "Saving the dataset (1/1 shards): 100%|█| 100/100 [00:00<00:00, 28878.44 examples\n",
 "[rank: 0] Seed set to 3941088705\n",
 "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n",
 "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n",
@@ -609,7 +609,7 @@
 "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
 "ninja: no work to do.\n",
 "Loading extension module fused_adam...\n",
- "Time to load fused_adam op: 0.05247139930725098 seconds\n",
+ "Time to load fused_adam op: 0.05039358139038086 seconds\n",
 "/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201450/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
 " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
 "Loading `train_dataloader` to estimate number of stepping batches.\n",
@@ -625,127 +625,114 @@
 "0 Non-trainable params\n",
 "87.6 M Total params\n",
 "350.405 Total estimated model params size (MB)\n",
- "Epoch 0: 1%| | 18/1237 [00:05<06:23, 3.18it/s, v_num=rhl5, train/loss=8.250]Traceback (most recent call last):\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 296, in \n",
- " cli_main()\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 271, in cli_main\n",
- " LightningCLI(\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 386, in __init__\n",
- " self._run_subcommand(self.subcommand)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/cli.py\", line 677, in _run_subcommand\n",
- " fn(**fn_kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 544, in fit\n",
- " call._call_and_handle_interrupt(\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 43, in _call_and_handle_interrupt\n",
- " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 102, in launch\n",
- " return function(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 580, in _fit_impl\n",
- " self._run(model, ckpt_path=ckpt_path)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 989, in _run\n",
- " results = self._run_stage()\n",
- " ^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py\", line 1035, in _run_stage\n",
- " self.fit_loop.run()\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 202, in run\n",
- " self.advance()\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py\", line 359, in advance\n",
- " self.epoch_loop.run(self._data_fetcher)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 136, in run\n",
- " self.advance(data_fetcher)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/training_epoch_loop.py\", line 240, in advance\n",
- " batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 187, in run\n",
- " self._optimizer_step(batch_idx, closure)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 265, in _optimizer_step\n",
- " call._call_lightning_module_hook(\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 157, in _call_lightning_module_hook\n",
- " output = fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/module.py\", line 1282, in optimizer_step\n",
- " optimizer.step(closure=optimizer_closure)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py\", line 151, in step\n",
- " step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py\", line 264, in optimizer_step\n",
- " optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 230, in optimizer_step\n",
- " return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 123, in optimizer_step\n",
- " closure_result = closure()\n",
- " ^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 140, in __call__\n",
- " self._result = self.closure(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/utils/_contextlib.py\", line 115, in decorate_context\n",
- " return func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 126, in closure\n",
- " step_output = self._step_fn()\n",
- " ^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/loops/optimization/automatic.py\", line 315, in _training_step\n",
- " training_step_output = call._call_strategy_hook(trainer, \"training_step\", *kwargs.values())\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py\", line 309, in _call_strategy_hook\n",
- " output = fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 381, in training_step\n",
- " return self._forward_redirection(self.model, self.lightning_module, \"training_step\", *args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 633, in __call__\n",
- " wrapper_output = wrapper_module(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n",
- " return self._call_impl(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n",
- " return forward_call(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n",
- " ret_val = func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1818, in forward\n",
- " loss = self.module(*inputs, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1518, in _wrapped_call_impl\n",
- " return self._call_impl(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1527, in _call_impl\n",
- " return forward_call(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 626, in wrapped_forward\n",
- " out = method(*_args, **_kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1234, in training_step\n",
- " total_loss = self.compute_loss(batch, batch_idx, True)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1132, in compute_loss\n",
- " self.manual_backward(learning_loss, optimizer, retain_graph=True)\n",
- " File \"/home/picocreator/rwkv-proj/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 779, in manual_backward\n",
- " self.trainer.strategy.backward(loss, None, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py\", line 204, in backward\n",
- " self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/deepspeed.py\", line 112, in backward\n",
- " deepspeed_engine.backward(tensor, *args, **kwargs)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n",
- " ret_val = func(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/engine.py\", line 1940, in backward\n",
- " self.optimizer.backward(loss, retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/zero/stage_1_and_2.py\", line 1953, in backward\n",
- " self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/deepspeed/runtime/fp16/loss_scaler.py\", line 63, in backward\n",
- " scaled_loss.backward(retain_graph=retain_graph)\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/_tensor.py\", line 492, in backward\n",
- " torch.autograd.backward(\n",
- " File \"/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/autograd/__init__.py\", line 251, in backward\n",
- " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n",
- "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.69 GiB. GPU 0 has a total capacty of 22.16 GiB of which 1.03 GiB is free. Including non-PyTorch memory, this process has 21.09 GiB memory in use. Of the allocated memory 16.32 GiB is allocated by PyTorch, and 4.10 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF\n"
+ "Epoch 0: 16%|▏| 400/2473 [00:30<02:37, 13.12it/s, v_num=jp9a, train/loss=6.780]/home/picocreator/anaconda3/envs/rwkv-infctx/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+ " warnings.warn(\n",
+ "Epoch 0: 100%|█| 2473/2473 [03:04<00:00, 13.41it/s, v_num=jp9a, train/loss=6.660\n",
+ "Validation: | | 0/? [00:00
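Note: the notebook run above recovers from the CUDA OOM by halving the microbatch: `--trainer.microbatch_size` drops from 8 to 4, the trainer raises `accumulate_grad_batches` from 2 to 4, and the effective batch size stays at the configured `target_batch_size` of 16 (hence the steps per epoch roughly doubling from 1237 to 2473). A minimal sketch of the bookkeeping implied by the config printout follows; `derive_accumulation` is a hypothetical helper written for illustration, not a function in RWKV-infctx-trainer.

# Hypothetical helper illustrating the batch-size arithmetic printed in the
# run above; not trainer code.
def derive_accumulation(target_batch_size: int, microbatch_size: int, num_devices: int) -> int:
    # Each forward/backward pass sees microbatch_size * num_devices samples,
    # so the remaining factor must come from gradient accumulation.
    per_pass = microbatch_size * num_devices
    assert target_batch_size % per_pass == 0, "target_batch_size must divide evenly"
    return target_batch_size // per_pass

assert derive_accumulation(16, 8, 1) == 2   # OOM run: microbatch 8, accumulate 2
assert derive_accumulation(16, 4, 1) == 4   # retry:   microbatch 4, accumulate 4

Smaller microbatches cut peak activation memory per pass at the price of more passes per optimizer step, which is why the retry completes the epoch instead of aborting at step 18.
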