From 65a5c49ebe18d7a58c435347a09d74020fbb0f4f Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Wed, 6 Mar 2024 09:54:16 +0000 Subject: [PATCH 01/13] tweak to data set processing, to have test dataset split overwrite --- RWKV-v5/src/data.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 9219d1e2..22ccb777 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -45,7 +45,9 @@ def prepare_data_static( source_dataset_params: dict = None, # Source dataset split to use source_dataset_split: str = "train", - # Test split of source data, if it was not already done + # test dataset split (if any) + test_dataset_split: str = "test", + # Test split of source data, if the test_dataset_split was not found test_split: float = 0.01, test_split_shuffle: bool = False, # Text rechunking size @@ -298,7 +300,7 @@ def gen(): # Load the dataset src_dataset = load_dataset(**load_dataset_params) - # If for some reason the dataset is a "test" only split, and missing a "train" split, we remap it as a "train" split + # If for some reason the dataset missing the "train" split, we throw accordingly if source_dataset_split not in src_dataset.keys(): raise ValueError('Dataset missing split: ' + source_dataset_split) @@ -306,6 +308,16 @@ def gen(): src_dataset["train"] = src_dataset[source_dataset_split] del src_dataset[source_dataset_split] + # If test split exists, and != "test", we will move it to "test" + if test_dataset_split != "test" and test_dataset_split in src_dataset.keys(): + src_dataset["test"] = src_dataset[test_dataset_split] + del src_dataset[test_dataset_split] + + # Remove all splits, that is not "train" or "test" + for key in src_dataset.keys(): + if key not in ["train", "test"]: + del src_dataset[key] + # If an int value is used, it is interprated as document count # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset if 
kargs["dataset_offset"] > 0 or kargs["dataset_length"] > 0: From 2c424d59e98a779985129903ad579f5c39cfa07a Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Wed, 6 Mar 2024 10:40:37 +0000 Subject: [PATCH 02/13] wip test column overwrite --- RWKV-v5/src/data.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 22ccb777..077da458 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -309,12 +309,17 @@ def gen(): del src_dataset[source_dataset_split] # If test split exists, and != "test", we will move it to "test" - if test_dataset_split != "test" and test_dataset_split in src_dataset.keys(): - src_dataset["test"] = src_dataset[test_dataset_split] - del src_dataset[test_dataset_split] + # or clear existing test (if not exists). This will allow the test_split fallback to work + if test_dataset_split != "test": + if test_dataset_split in src_dataset.keys(): + src_dataset["test"] = src_dataset[test_dataset_split] + del src_dataset[test_dataset_split] + elif "test" in src_dataset.keys(): + del src_dataset["test"] # Remove all splits, that is not "train" or "test" - for key in src_dataset.keys(): + src_dataset_keys = list(src_dataset.keys()) + for key in src_dataset_keys: if key not in ["train", "test"]: del src_dataset[key] From 4f2c9e28180f720261e2efbd0eb2ae95e204dd48 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Wed, 6 Mar 2024 11:16:19 +0000 Subject: [PATCH 03/13] fixing prefix/suffix masking for multi-key dataset, and handling of int values as str in dataset --- RWKV-v5/src/data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 077da458..2cf139c6 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -392,7 +392,7 @@ def encodeTokens(x): type_arr = [] mask_arr = [] for i in range(len(x)): - enc_str = world_tokenizer_encode(x[i], 
world_add_endoftext_token=world_add_endoftext_token) + enc_str = world_tokenizer_encode(str(x[i]), world_add_endoftext_token=world_add_endoftext_token) id_arr.append(enc_str) type_arr.append([0] * len(enc_str)) mask_arr.append([1] * len(enc_str)) @@ -405,7 +405,7 @@ def encodeTokens(x): } # Else we encode the string and return it following the HF tokenizer format - enc_str = world_tokenizer_encode(x, world_add_endoftext_token=world_add_endoftext_token) + enc_str = world_tokenizer_encode(str(x), world_add_endoftext_token=world_add_endoftext_token) return { 'input_ids': enc_str, 'token_type_ids': [0] * len(enc_str), @@ -591,7 +591,7 @@ def map_tokenizer(x): # that have data in them num_columns = 0 for i in range(len(multi_column_keys)): - if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0: + if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(str(x[multi_column_keys[i]])) > 0: num_columns += 1 # If we have more than 1 column, we will have to merge them if num_columns > 1: @@ -606,18 +606,18 @@ def map_tokenizer(x): # Lets loop through each column for i in range(len(multi_column_keys)): # And process the column if it has data - if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0: + if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(str(x[multi_column_keys[i]])) > 0: # Add the separator if this is not the first item if not is_first_item and multi_column_separator_encodings is not None: input_ids += multi_column_separator_encodings['input_ids'] token_type_ids += multi_column_separator_encodings['token_type_ids'] - attention_mask += multi_column_separator_encodings['attention_mask'] + attention_mask += ([0] * len(multi_column_separator_encodings['input_ids'])) # Add the prefix if len(multi_column_prefix_encodings) > i and multi_column_prefix_encodings[i] is not None: input_ids += 
multi_column_prefix_encodings[i]['input_ids'] token_type_ids += multi_column_prefix_encodings[i]['token_type_ids'] - attention_mask += multi_column_prefix_encodings[i]['attention_mask'] + attention_mask += ([0] * len(multi_column_prefix_encodings[i]['input_ids'])) # Tokenize the column column_encodings = encodeTokens(x[multi_column_keys[i]]) @@ -641,7 +641,7 @@ def map_tokenizer(x): if len(multi_column_suffix_encodings) > i and multi_column_suffix_encodings[i] is not None: input_ids += multi_column_suffix_encodings[i]['input_ids'] token_type_ids += multi_column_suffix_encodings[i]['token_type_ids'] - attention_mask += multi_column_suffix_encodings[i]['attention_mask'] + attention_mask += ([0] * len(multi_column_suffix_encodings[i]['input_ids'])) # Set the first item flag to false is_first_item = False From bacff8d6f0f1e6892b0dfc5819621cb13837a221 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Wed, 6 Mar 2024 11:32:28 +0000 Subject: [PATCH 04/13] fixing instruct tuning data processing --- RWKV-v5/src/data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 2cf139c6..5f44fab7 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -404,6 +404,14 @@ def encodeTokens(x): 'attention_mask': mask_arr } + # Empty / Null string handling + if x is None or len(str(x)) == 0: + return { + 'input_ids': [[]], + 'token_type_ids': [[]], + 'attention_mask': [[]], + } + # Else we encode the string and return it following the HF tokenizer format enc_str = world_tokenizer_encode(str(x), world_add_endoftext_token=world_add_endoftext_token) return { From 90faac0527753ce958b5d81fb4f335c087431fb4 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Wed, 6 Mar 2024 12:07:16 +0000 Subject: [PATCH 05/13] better empty string handling --- RWKV-v5/src/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index 5f44fab7..de065af1 100644 
--- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -407,9 +407,9 @@ def encodeTokens(x): # Empty / Null string handling if x is None or len(str(x)) == 0: return { - 'input_ids': [[]], - 'token_type_ids': [[]], - 'attention_mask': [[]], + 'input_ids': [], + 'token_type_ids': [], + 'attention_mask': [], } # Else we encode the string and return it following the HF tokenizer format From a9c0633792f107bf2562f477c55e1a583b185900 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Thu, 7 Mar 2024 05:16:21 +0000 Subject: [PATCH 06/13] datapacking improvements for multi-turn data & masking --- RWKV-v5/src/data.py | 85 ++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index de065af1..f0540a8d 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -9,7 +9,7 @@ from datasets import load_from_disk, load_dataset, concatenate_datasets, Dataset, Features, Value, Sequence from transformers import PreTrainedTokenizerFast, AutoTokenizer from multiprocessing import cpu_count -import gc, yaml +import gc, yaml, json num_cpus = cpu_count() num_workers = cpu_count() if cpu_count() < 8 else 8 @@ -384,28 +384,51 @@ def gen(): # Function used to tokenize the dataset as per HF tokenizer format # if given the textual data, it will return the tokenized data - def encodeTokens(x): + def encodeTokens(x, enforceSingleItem = False): if world_tokenizer is True: - # If x is an array of strings, we encode them seperately, and conslidate the result - if isinstance(x, list): - id_arr = [] - type_arr = [] - mask_arr = [] - for i in range(len(x)): - enc_str = world_tokenizer_encode(str(x[i]), world_add_endoftext_token=world_add_endoftext_token) - id_arr.append(enc_str) - type_arr.append([0] * len(enc_str)) - mask_arr.append([1] * len(enc_str)) - - # Consolidate the result + + # Empty / Null string handling + if x is None: return { - 'input_ids': id_arr, - 'token_type_ids': 
type_arr, - 'attention_mask': mask_arr + 'input_ids': [], + 'token_type_ids': [], + 'attention_mask': [], } + # If x is an array of strings, we encode them seperately, and conslidate the result + if isinstance(x, list): + if enforceSingleItem: + # Converts it from list to str + x = json.dumps(x) + else: + + # Handles it as an array of string, that needs conversion + id_arr = [] + type_arr = [] + mask_arr = [] + for i in range(len(x)): + enc_str = world_tokenizer_encode(str(x[i]), world_add_endoftext_token=world_add_endoftext_token) + id_arr.append(enc_str) + type_arr.append([0] * len(enc_str)) + mask_arr.append([1] * len(enc_str)) + + # Consolidate the result + return { + 'input_ids': id_arr, + 'token_type_ids': type_arr, + 'attention_mask': mask_arr + } + + # Converting from dictionary + if isinstance(x, dict): + # Dictionary to json string + x = json.dumps(x) + + # Enforce string type + x = str(x) + # Empty / Null string handling - if x is None or len(str(x)) == 0: + if len(x) == 0: return { 'input_ids': [], 'token_type_ids': [], @@ -449,23 +472,23 @@ def encodeTokens(x): # Tokenize the multi column strings for i in range(len(multi_column_keys)): if multi_column_prefix is not None and multi_column_prefix[i] is not None: - multi_column_prefix_encodings.append(encodeTokens(multi_column_prefix[i])) + multi_column_prefix_encodings.append(encodeTokens(multi_column_prefix[i], enforceSingleItem=True)) if multi_column_suffix is not None and multi_column_suffix[i] is not None: - multi_column_suffix_encodings.append(encodeTokens(multi_column_suffix[i])) + multi_column_suffix_encodings.append(encodeTokens(multi_column_suffix[i], enforceSingleItem=True)) # Tokenize the multi column separator if multi_column_separator is not None and len(multi_column_separator) > 0: - multi_column_separator_encodings = encodeTokens(multi_column_separator) + multi_column_separator_encodings = encodeTokens(multi_column_separator, enforceSingleItem=True) conversation_prefix_encoding_map = {} 
conversation_suffix_encoding_map = {} - conversation_end_of_conversation_token = encodeTokens(kargs["conversation_end_of_conversation"]) if kargs["conversation_end_of_conversation"] is not None else None + conversation_end_of_conversation_token = encodeTokens(kargs["conversation_end_of_conversation"], enforceSingleItem=True) if kargs["conversation_end_of_conversation"] is not None else None conversation_enabled = False if 'conversation_format' in kargs and kargs["conversation_format"] is not None: if kargs["conversation_format"] == "iopairs": # preencode all prefixes (keyed by the input key) for key, prefix in kargs['conversation_input_key_prefix_map'].items(): - conversation_prefix_encoding_map[key] = encodeTokens(prefix) + conversation_prefix_encoding_map[key] = encodeTokens(prefix, enforceSingleItem=True) conversation_enabled = True elif kargs["conversation_format"] == "sender": # preencode all prefixes (keyed by the sender value) @@ -473,10 +496,10 @@ def encodeTokens(x): for input_key, value in kargs['conversation_input_key_map'].items(): if input_key not in conversation_prefix_encoding_map: conversation_prefix_encoding_map[input_key] = {} - conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel)) + conversation_prefix_encoding_map[input_key][key] = encodeTokens(value.replace('{sender}', relabel), enforceSingleItem=True) for key, suffix in kargs['conversation_sender_suffix'].items(): - conversation_suffix_encoding_map[key] = encodeTokens(suffix) + conversation_suffix_encoding_map[key] = encodeTokens(suffix, enforceSingleItem=True) # example conversation_prefix_encoding_map['message']['user'] = encodeTokens('\n\nUser:') conversation_enabled = True @@ -496,7 +519,7 @@ def map_tokenizer(x): # Custom text column support if kargs["custom_text_key"] is not None: if kargs["custom_text_key"] in x: - return encodeTokens(x[kargs["custom_text_key"]]) + return encodeTokens(x[kargs["custom_text_key"]], enforceSingleItem=True) if 
conversation_enabled: conv_key = kargs['conversation_key'] if 'conversation_key' in kargs else None @@ -524,7 +547,7 @@ def map_tokenizer(x): attention_mask += prefix['attention_mask'] # Tokenize the column - column_encodings = encodeTokens(value) + column_encodings = encodeTokens(value, enforceSingleItem=True) # Add the column input_ids += column_encodings['input_ids'] @@ -562,7 +585,7 @@ def map_tokenizer(x): attention_mask += prefix['attention_mask'] # Tokenize the column - column_encodings = encodeTokens(turn[key]) + column_encodings = encodeTokens(turn[key], enforceSingleItem=True) # Add the column input_ids += column_encodings['input_ids'] @@ -628,7 +651,7 @@ def map_tokenizer(x): attention_mask += ([0] * len(multi_column_prefix_encodings[i]['input_ids'])) # Tokenize the column - column_encodings = encodeTokens(x[multi_column_keys[i]]) + column_encodings = encodeTokens(x[multi_column_keys[i]], enforceSingleItem=True) # Add the column input_ids += column_encodings['input_ids'] @@ -670,8 +693,8 @@ def map_tokenizer(x): # Tokenize both prompt and completion # Note that the tokenizer will process and return the input_ids in batches - prompt_encodings = encodeTokens(x['prompt']) - completion_encodings = encodeTokens(x['completion']) + prompt_encodings = encodeTokens(x['prompt'], enforceSingleItem=True) + completion_encodings = encodeTokens(x['completion'], enforceSingleItem=True) # Join the two input_ids lists input_ids = prompt_encodings['input_ids'] + completion_encodings['input_ids'] From 6561ff07234b537d687011bcff77996ebde231d9 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Thu, 7 Mar 2024 05:55:06 +0000 Subject: [PATCH 07/13] wip runs --- .../Eagle-2T-retune/data-prep.ipynb | 248 ++ .../Eagle-2T-retune/eagle-1b5-runs.ipynb | 2558 +++++++++++++++++ .../Eagle-2T-retune/eagle-7b-15t-runs.ipynb | 512 ++++ .../retune-data-build-no-mask.yaml | 658 +++++ .../retune-data-build-with-mask.yaml | 658 +++++ .../Eagle-2T-retune/retune-train-no-mask.yaml 
| 119 + .../retune-train-with-mask.yaml | 119 + 7 files changed, 4872 insertions(+) create mode 100644 notebook/major-runs/Eagle-2T-retune/data-prep.ipynb create mode 100644 notebook/major-runs/Eagle-2T-retune/eagle-1b5-runs.ipynb create mode 100644 notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask.yaml create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-data-build-with-mask.yaml create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml diff --git a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb new file mode 100644 index 00000000..cfcefcf9 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform validation runs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune\n", + "TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "WANDB_PREFIX=\"Eagle-Base Validation\"\n", + "DEEPSPEED_STRAT=\"deepspeed_stage_2\"\n", + "\n", + "EXPERIMENT_NAME=\"Baseline Validation\"\n", + "LEARNING_RATE=\"5e-6\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# The model sizing\n", + 
"MODEL_PATH=\"/workspace/main-models/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth\"\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-data-build-with-mask.yaml\n", + ">> Preparing dataset - index: 0 - name: lambada-train\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (3/3 shards): 100%|█| 58333/58333 [00:05<00:00, 11324.73 exam\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 126.77 examples/s]\n", + ">> Preparing dataset - index: 1 - name: enwiki-train\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (7/7 shards): 100%|█| 124218/124218 [00:10<00:00, 11520.94 ex\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 137.22 examples/s]\n", + ">> Preparing dataset - index: 2 - name: balanced-copa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5084.24 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 40.54 examples/s]\n", + ">> Preparing dataset - index: 3 - name: balanced-copa-options\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5172.33 examples/\n", + "Saving the dataset (1/1 
shards): 100%|█████| 1/1 [00:00<00:00, 47.23 examples/s]\n", + ">> Preparing dataset - index: 4 - name: MedText-QA\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4383.39 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 48.29 examples/s]\n", + ">> Preparing dataset - index: 5 - name: ALMA-prompt-completion\n", + "Saving the dataset (1/1 shards): 100%|█| 2655/2655 [00:00<00:00, 10088.94 exampl\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.14 examples/s]\n", + ">> Preparing dataset - index: 6 - name: openbookqa-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3677.74 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 47.12 examples/s]\n", + ">> Preparing dataset - index: 7 - name: winogrande-debiased-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3314.15 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.89 examples/s]\n", + ">> Preparing dataset - index: 8 - name: winogrande-l-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 320/320 [00:00<00:00, 6636.75 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 48.08 examples/s]\n", + ">> Preparing dataset - index: 9 - name: arc_easy-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4255.67 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.26 examples/s]\n", + ">> Preparing dataset - index: 10 - name: arc_challenge-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4559.27 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.50 examples/s]\n", + ">> Preparing dataset - index: 11 - name: piqa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 329/329 [00:00<00:00, 5538.62 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 
45.80 examples/s]\n", + ">> -----------------------------------\n", + ">> Dataset Mixing mode: shuffle\n", + ">> Saving dataset to data_path : /datapath/eval-retune/pack-with-mask/\n", + "Saving the dataset (10/10 shards): 100%|█| 186975/186975 [00:19<00:00, 9718.66 e\n", + "Saving the dataset (1/1 shards): 100%|███| 12/12 [00:00<00:00, 78.55 examples/s]\n", + ">> Dataset saved to data_path\n", + ">> -----------------------------------\n", + ">> Performing dataset counting\n", + ">> -----------------------------------\n", + ">> Final dataset count ( train ) : 186,975 samples/chunks/packs\n", + ">> Final dataset count ( test ) : 12 samples\n", + ">> -----------------------------------\n", + "Map (num_proc=160): 100%|██████| 186975/186975 [00:26<00:00, 7129.50 examples/s]\n", + "num_proc must be <= 12. Reducing num_proc to 12 for dataset of size 12.\n", + "Map (num_proc=12): 100%|█████████████████| 12/12 [00:03<00:00, 3.94 examples/s]\n", + ">> -----------------------------------\n", + ">> Final 'train' dataset token count ...\n", + ">> - Total tokens : 761,896,534\n", + ">> - Valid tokens : 752,459,956\n", + ">> - Hidden tokens : 9,436,578\n", + ">> -----------------------------------\n", + ">> Final 'test' dataset token count ...\n", + ">> - Total tokens : 8,973\n", + ">> - Valid tokens : 8,373\n", + ">> - Hidden tokens : 600\n", + ">> -----------------------------------\n" + ] + } + ], + "source": [ + "# Lets build the giant datapack\n", + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-with-mask.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask.yaml\n", + ">> Preparing dataset - index: 0 - name: lambada-train\n", + "Warning: packing_enable=true, with text rechunking 
(either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (3/3 shards): 100%|█| 58333/58333 [00:05<00:00, 10980.23 exam\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 147.90 examples/s]\n", + ">> Preparing dataset - index: 1 - name: enwiki-train\n", + "Map (num_proc=160): 100%|███| 1000000/1000000 [00:23<00:00, 41789.88 examples/s]\n", + "Filter (num_proc=160): 100%|█| 1000000/1000000 [00:06<00:00, 145873.85 examples/\n", + "Map (num_proc=160): 100%|█████| 472276/472276 [00:14<00:00, 32941.99 examples/s]\n", + "Map (num_proc=160): 100%|██████| 124218/124218 [00:14<00:00, 8836.64 examples/s]\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (7/7 shards): 100%|█| 124218/124218 [00:18<00:00, 6767.74 exa\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 147.10 examples/s]\n", + ">> Preparing dataset - index: 2 - name: balanced-copa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5194.39 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.37 examples/s]\n", + ">> Preparing dataset - index: 3 - name: balanced-copa-options\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5217.77 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.50 examples/s]\n", + ">> Preparing dataset - index: 4 - name: MedText-QA\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4315.27 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.01 examples/s]\n", + ">> Preparing dataset - index: 5 - name: ALMA-prompt-completion\n", + "Saving the dataset (1/1 shards): 100%|█| 2655/2655 [00:00<00:00, 10170.66 exampl\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.02 examples/s]\n", + ">> Preparing dataset - index: 6 - name: 
openbookqa-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3712.04 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.65 examples/s]\n", + ">> Preparing dataset - index: 7 - name: winogrande-debiased-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3443.72 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.23 examples/s]\n", + ">> Preparing dataset - index: 8 - name: winogrande-l-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 320/320 [00:00<00:00, 6878.55 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.79 examples/s]\n", + ">> Preparing dataset - index: 9 - name: arc_easy-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4188.07 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.60 examples/s]\n", + ">> Preparing dataset - index: 10 - name: arc_challenge-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4681.21 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.06 examples/s]\n", + ">> Preparing dataset - index: 11 - name: piqa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 329/329 [00:00<00:00, 5461.27 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.44 examples/s]\n", + ">> -----------------------------------\n", + ">> Dataset Mixing mode: shuffle\n", + ">> Saving dataset to data_path : /datapath/eval-retune/pack-no-mask/\n", + "Saving the dataset (10/10 shards): 100%|█| 186975/186975 [00:18<00:00, 10043.09 \n", + "Saving the dataset (1/1 shards): 100%|███| 12/12 [00:00<00:00, 78.26 examples/s]\n", + ">> Dataset saved to data_path\n", + ">> -----------------------------------\n", + ">> Performing dataset counting\n", + ">> -----------------------------------\n", + ">> Final dataset count ( train ) : 186,975 
samples/chunks/packs\n", + ">> Final dataset count ( test ) : 12 samples\n", + ">> -----------------------------------\n", + "Map (num_proc=160): 100%|██████| 186975/186975 [00:27<00:00, 6744.73 examples/s]\n", + "num_proc must be <= 12. Reducing num_proc to 12 for dataset of size 12.\n", + "Map (num_proc=12): 100%|█████████████████| 12/12 [00:01<00:00, 6.01 examples/s]\n", + ">> -----------------------------------\n", + ">> Final 'train' dataset token count ...\n", + ">> - Total tokens : 761,896,534\n", + ">> - Valid tokens : 754,446,928\n", + ">> - Hidden tokens : 7,449,606\n", + ">> -----------------------------------\n", + ">> Final 'test' dataset token count ...\n", + ">> - Total tokens : 8,973\n", + ">> - Valid tokens : 8,651\n", + ">> - Hidden tokens : 322\n", + ">> -----------------------------------\n" + ] + } + ], + "source": [ + "# Lets build the giant datapack\n", + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask.yaml\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/major-runs/Eagle-2T-retune/eagle-1b5-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/eagle-1b5-runs.ipynb new file mode 100644 index 00000000..545edf1b --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/eagle-1b5-runs.ipynb @@ -0,0 +1,2558 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Retune runs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: True\n", + 
"GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune\n", + "TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "WANDB_PREFIX=\"Eagle-Retune\"\n", + "DEEPSPEED_STRAT=\"deepspeed_stage_2\"\n", + "LEARNING_RATE=\"5e-6\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# The model to start from\n", + "MODEL_PATH=\"/workspace/main-models/RWKV-5-World-1B5-v2-20231025-ctx4096.pth\"\n", + "MICROBATCH_SIZE=16\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-07 03:11:29,596] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. 
Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml', '--model.load_model=/workspace/main-models/RWKV-5-World-1B5-v2-20231025-ctx4096.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/1B5-No-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 1B5-No-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=16', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml', '--model.load_model=/workspace/main-models/RWKV-5-World-1B5-v2-20231025-ctx4096.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/1B5-No-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 1B5-No-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=16', '--model.ctx_len=4096', '--trainer.devices=auto'].\n", + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 4158660442\n", + "Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. 
It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 1024\n", + " - num_nodes: 1\n", + " - num_devices: 8\n", + " - microbatch_size: 16\n", + " - accumulate_grad_batches: 8\n", + " - effective_batch_size: 1024\n", + "\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. 
To train without an epoch limit, set `max_epochs=-1`.\n", + "[rank: 0] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8\n", + "[2024-03-07 03:11:45,444] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,466] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,519] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,551] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,556] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,587] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 03:11:45,598] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. 
Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[rank: 5] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "[rank: 1] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 3] Seed set to 4158660442\n", + "[rank: 2] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 4] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 7] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 6] Seed set to 4158660442\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "[rank: 5] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 4] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 7] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 6] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 3] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 1] Seed set to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 2] Seed set 
to 4158660442\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_031217-l4bxp4sg\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 1B5-No-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/l4bxp4sg\u001b[0m\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When 
resuming from checkpoint, the estimated time is inaccurate\n", + "#LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05144309997558594 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10137343406677246 seconds\n", + "Time to load fused_adam op: 0.10144162178039551 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10212945938110352 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10202336311340332 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10243558883666992 seconds\n", + "Time to load fused_adam op: 0.10201096534729004 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10181879997253418 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 134 M \n", + "1 | blocks | ModuleList | 1.3 B \n", + "2 | ln_out | LayerNorm | 4.1 K \n", + "3 | head | Linear | 134 M \n", + "--------------------------------------\n", + "1.6 B Trainable params\n", + "0 Non-trainable params\n", + "1.6 B Total params\n", + "6,311.018 Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▌ | 200/1461 [08:02<50:43, 0.41it/s, v_num=p4sg]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|▉| 1460/1461 [54:17<00:02, 0.45it/s, v_num=p4sg, train/tok=6.29e+/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "[rank5]:[2024-03-07 04:07:18,571] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 04:07:18,571] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 04:07:18,571] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 04:07:18,572] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 04:07:18,572] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 04:07:18,572] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", 
+ "[rank5]:[2024-03-07 04:07:18,573] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 04:07:18,573] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 04:07:18,573] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 04:07:19,273] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 04:07:19,273] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 04:07:19,273] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 04:07:19,274] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 04:07:19,274] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 04:07:19,274] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 04:07:19,274] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 04:07:19,274] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 04:07:19,274] 
torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 04:07:19,483] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 04:07:19,483] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 04:07:19,483] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 04:07:19,484] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 04:07:19,527] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 
04:07:19,527] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 04:07:19,527] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 04:07:19,528] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 04:07:19,587] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 04:07:19,587] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 04:07:19,587] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 04:07:19,588] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 04:07:19,694] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 04:07:19,694] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 04:07:19,694] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 04:07:19,695] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 04:07:20,296] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 04:07:20,296] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 04:07:20,296] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 04:07:20,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 04:07:20,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 04:07:20,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 
04:07:20,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 04:07:20,297] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 04:07:20,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 04:07:20,469] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 04:07:20,469] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 04:07:20,469] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 04:07:20,470] torch._dynamo.convert_frame: 
[WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 1461/1461 [55:06<00:00, 0.44it/s, v_num=p4sg, train/tok=6.29e+\n", + "Validation: | | 0/? [00:00> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 2] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 3] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 1] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 6] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 7] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 5] Seed set to 984264868\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Loading dataset from data_path: 
/datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_041224-7uaksxd6\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 1B5-With-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/7uaksxd6\u001b[0m\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /checkpoint/retune/1B5-With-Mask exists and is not empty.\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - 
CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.0517115592956543 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10147452354431152 seconds\n", + "Time to load fused_adam op: 0.10148239135742188 seconds\n", + "Time to load fused_adam op: 0.10145282745361328 seconds\n", + "Time to load fused_adam op: 0.10155653953552246 seconds\n", + "Time to load fused_adam op: 0.10177063941955566 seconds\n", + "Time to load fused_adam op: 0.1015777587890625 seconds\n", + "Time to load fused_adam op: 0.10151982307434082 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 134 M \n", + "1 | blocks | ModuleList | 1.3 B \n", + "2 | ln_out | LayerNorm | 4.1 K \n", + "3 | head | Linear | 134 M \n", + "--------------------------------------\n", + "1.6 B Trainable params\n", + "0 Non-trainable params\n", + "1.6 B Total params\n", + "6,311.018 Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▌ | 200/1461 [07:56<50:04, 0.42it/s, v_num=sxd6]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|▉| 1460/1461 [54:25<00:02, 0.45it/s, v_num=sxd6, train/tok=6.29e+/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + 
"/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:110: UserWarning: 'has_cuda' is deprecated, please use 'torch.backends.cuda.is_built()'\n", + " torch.has_cuda,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:111: UserWarning: 'has_cudnn' is deprecated, please use 'torch.backends.cudnn.is_available()'\n", + " torch.has_cudnn,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:117: UserWarning: 'has_mps' is deprecated, please use 'torch.backends.mps.is_built()'\n", + " torch.has_mps,\n", + "/usr/local/lib/python3.10/dist-packages/torch/overrides.py:118: UserWarning: 'has_mkldnn' is deprecated, please use 'torch.backends.mkldnn.is_available()'\n", + " torch.has_mkldnn,\n", + "[rank3]:[2024-03-07 05:07:30,910] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 05:07:30,910] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 05:07:30,910] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", 
+ "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 05:07:30,911] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 05:07:31,032] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 05:07:31,032] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 05:07:31,032] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 05:07:31,033] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 05:07:31,033] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 05:07:31,033] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 05:07:31,033] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 05:07:31,033] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 05:07:31,033] 
torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 05:07:31,401] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 05:07:31,401] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 05:07:31,401] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 05:07:31,402] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 05:07:31,477] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 
05:07:31,477] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 05:07:31,477] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 05:07:31,477] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 05:07:31,477] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 05:07:31,477] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 05:07:31,478] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 05:07:31,478] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 05:07:31,478] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 05:07:31,495] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 05:07:31,495] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 05:07:31,495] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 05:07:31,496] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 05:07:31,577] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 05:07:31,577] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 05:07:31,577] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 05:07:31,578] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 05:07:31,578] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 05:07:31,578] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 05:07:31,579] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 05:07:31,579] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 05:07:31,579] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 05:07:31,950] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 05:07:31,950] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 05:07:31,950] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 05:07:31,951] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 05:07:31,951] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 05:07:31,951] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 
05:07:31,952] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 05:07:31,952] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 05:07:31,952] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 05:07:32,228] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 05:07:32,228] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 05:07:32,228] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 05:07:32,229] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 05:07:32,229] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 05:07:32,229] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 05:07:32,230] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 05:07:32,230] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 05:07:32,230] torch._dynamo.convert_frame: 
[WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 1461/1461 [55:10<00:00, 0.44it/s, v_num=sxd6, train/tok=6.29e+\n", + "Validation: | | 0/? [00:00> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 2] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 6] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 5] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 7] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 3] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 4] Seed set to 378326663\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset 
load finished: /datapath/eval-retune/pack-no-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: W&B API key is configured. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_044154-p8i98m91\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-No-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/p8i98m91\u001b[0m\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", 
+ "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.04762744903564453 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10148453712463379 seconds\n", + "Time to load fused_adam op: 0.10180044174194336 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10169410705566406 seconds\n", + "Time to load fused_adam op: 0.10160326957702637 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10586071014404297 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10921812057495117 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10483813285827637 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▎ | 400/2922 [28:06<2:57:13, 0.24it/s, v_num=8m91]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 32%|▎| 936/2922 [1:11:09<2:30:59, 0.22it/s, v_num=8m91, train/tok=2.0" + ] + } + ], + "source": [ + "# The 7B model\n", + "EXPERIMENT_NAME=\"7B-15t-No-Mask\"\n", + "\n", + "# Perform the validation\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_TORCH_COMPILE=1 && \\\n", + " export RWKV_NO_CUDA=0 && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/retune-train-no-mask.yaml\" \\\n", + " --model.load_model=\"{MODEL_PATH}\" \\\n", + " --model.lr_init={LEARNING_RATE} \\\n", + " --model.lr_final={LEARNING_RATE} \\\n", + " --data.skip_datapath_setup=True \\\n", + " --trainer.callbacks.init_args.dirpath=\"/checkpoint/retune/{EXPERIMENT_NAME}/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - {EXPERIMENT_NAME} ({DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.target_batch_size=1024 \\\n", + " --trainer.microbatch_size={MICROBATCH_SIZE} \\\n", + 
" --model.ctx_len=4096 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "EXPERIMENT_NAME=\"7B-15t-No-Mask\"\n", + "CKPT_DIR=\"last.ckpt\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"/checkpoint/retune/{EXPERIMENT_NAME}/{CKPT_DIR}/\" \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXPERIMENT_NAME=\"7B-15t-No-Mask\"\n", + "!cd \"/workspace/main-models/R4-retune/\" && \\\n", + " huggingface-cli upload rwkv-x-dev/eagle-7b-experiment \"./R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The 7B model\n", + "EXPERIMENT_NAME=\"7B-15t-With-Mask\"\n", + "\n", + "# Perform the validation\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_TORCH_COMPILE=1 && \\\n", + " export RWKV_NO_CUDA=0 && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/retune-train-with-mask.yaml\" \\\n", + " --model.load_model=\"{MODEL_PATH}\" \\\n", + " --model.lr_init={LEARNING_RATE} \\\n", + " --model.lr_final={LEARNING_RATE} \\\n", + " --data.skip_datapath_setup=True \\\n", + " --trainer.callbacks.init_args.dirpath=\"/checkpoint/retune/{EXPERIMENT_NAME}/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - {EXPERIMENT_NAME} ({DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.target_batch_size=1024 \\\n", + " --trainer.microbatch_size={MICROBATCH_SIZE} \\\n", + " --model.ctx_len=4096 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "EXPERIMENT_NAME=\"7B-15t-With-Mask\"\n", + "CKPT_DIR=\"last.ckpt\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"/checkpoint/retune/{EXPERIMENT_NAME}/{CKPT_DIR}/\" \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXPERIMENT_NAME=\"7B-15t-With-Mask\"\n", + "!cd \"/workspace/main-models/R4-retune/\" && \\\n", + " huggingface-cli upload rwkv-x-dev/eagle-7b-experiment \"./R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask.yaml new file mode 100644 index 00000000..a742f7fe --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask.yaml @@ -0,0 +1,658 @@ +# +# Custom multiple datasource, built as a single datapack +# +datapack: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # If using relative path, this should be relative to the trainer script path + data_path: /datapath/eval-retune/pack-no-mask/ + + # Data path storage options, this is 
used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Mixing mode to use, this is used to alternate between datasets + # + # - concat : Keep It Simple Silly, lets just concat the datasets together + # - shuffle : Dataset is mixed on a per sample level + # + # (@TODO: Advance operations) + # - batch : Meaning one dataset worth per batch, partial batches are discarded + mixing_mode: "shuffle" + +# +# Default settings used across all datasets in the datapack +# These settings can be overriden by the dataset specific settings +# +default: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # Datapath here is entirely optional, and only used if you intend to save each individual dataset + # seperately (makes it easier to tweak and rebuild the datapack if it crash mid-way) + # + # The dataset index will be appended to the default value, if set + # --- + data_path: /datapath/eval-retune/partial-no-mask/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Additional source dataset params, used to grab subsets of the dataset + # --- + # source_dataset_params: + # language: en + + # Sort the dataset by length, useful to reduce gpu waiting time (also useful for RWKV long context coherence) + # --- + # sort_by_length: false + # sort_asc: True # Sort in ascending order, true = shortest first, false = longest first + + # Limit the document count, to an offset/length limit + # If an int value is used, it is interprated as document count + # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset + # --- + # dataset_offset: -1 + # dataset_length: -1 + + # Use data_dir, if you are using source=text/json/etc + # If using relative path, this should be relative to the trainer script path + # source_data_dir: ../dataset-text/ + + # After loading the dataset, split out test data used for validation, + # This process is skipped if the dataset includes a test split + # + # If given a float value, a percentage of the dataset is used (1.0 being 100%) + # If given an int value, the number of data sample is used. 
+ # + # Due to the limitaitons in the trainer process, there is always a minimum of 1 test sample + test_split: 1 # Intentionally set to a low sample for test, cause the real eval is humans + test_split_shuffle: True + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the HF tokenizer name/path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less then 1024 tokens from wikipedia) + # + # This is ignored, if set to -1 + # --- + min_token_size: -1 + max_token_size: -1 + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # + # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc. + # you should use single quotes to ensure such values dun get escaped. + # eg. 
multi_column_suffix: ['\n\n'] + # + # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 + # Need to use " or the new lines won't be tokenized properly + # --- + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" + + # Iopairs specific config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # Sender specific config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} + + # If processing 
prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset split usage + # ---------------------------- + + source_dataset_split: "train" + test_dataset_split: "do-not-use-test-split" + + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. 
+ # + # This is tagged to datapack.batchsize, unless overriden here or on a dataset level + # --- + # packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 4096 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + + # ---------------------------- + # Specal use caes flags + # ---------------------------- + + # Reverse the training dataset order before saving, this is useful for, + # optimizing dataset packing process, when using packing_in_sequence + # and sort_by_length desc order together + reverse_train_dataset_before_save: False + +# +# The dataset specific settings +# +dataset: + + # --- + # Text based dataset + # --- + + - # Lambada training text + # https://huggingface.co/datasets/lambada + source: "lambada" + name: "lambada-train" + # 4k rechunk forced + text_rechunk_force: True + + - # Enwiki training text + # https://huggingface.co/datasets/teven/enwiki_100k + source: "teven/enwiki_100k" + name: "enwiki-train" + # 4k rechunk forced + min_token_size: 256 + text_rechunk_force: True + + # --- + # Copa style + # --- + + # Copa trained using + # https://huggingface.co/datasets/pkavumba/balanced-copa + + - # Balanced copa, framed as choices + source: "pkavumba/balanced-copa" + name: "balanced-copa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["### Premise:\n", "\n\n### Question:\nWhich choice was the", "1) ", "2) ", "\n### Answer:\n"] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: 
[true, true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as options + source: "pkavumba/balanced-copa" + name: "balanced-copa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["Context: ", "\n\nQuestion: Which option was the", "1. ", "2. ", "\nAnswer: "] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: [true, true, true, true, true] + multi_column_separator: "" + + # --- + # Prompt completion / Q&A datasets + # --- + + - # Question answer pair medical text + # https://huggingface.co/datasets/BI55/MedText + source: "BI55/MedText" + name: "MedText-QA" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["Prompt", "Completion"] + multi_column_prefix: ["Question:\n", "Answer:\n"] + multi_column_suffix: ["", ""] + multi_column_train_mask: [true, true] + multi_column_separator: "\n\n" + + - # Language translation prompt/completion + # https://huggingface.co/datasets/kristaller486/ALMA-prompt-completion + source: "kristaller486/ALMA-prompt-completion" + name: "ALMA-prompt-completion" + # 4k packing + packing_enable: True + # Prompt completion, nothing else else + + # --- + # openbookqa + # --- + + # openbookqa + # https://huggingface.co/datasets/allenai/openbookqa + + - # Openbookqa training, with the json + source: "allenai/openbookqa" + name: "openbookqa-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["fact1", "question_stem", "choices", "answerKey"] + multi_column_prefix: [">>> Premise:\n", "\n\nChoose the best option to complete the following:\n", "\n\nUsing the text options found in the following JSON:\n", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "\n\nAnswer using only the label given in the json", ""] + multi_column_train_mask: [true, true, true, true] + 
multi_column_separator: "" + + # --- + # Winogrande + # --- + + # Copa trained using + # https://huggingface.co/datasets/winogrande + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-debiased-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_debiased + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following sentence:\n", "\n1) ", "\n2) ", "\n\nAnswer:\n"] + multi_column_suffix: ["\n\n Choose either 1 or 2, for which option is the best fit to replace _ in the sentence\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-l-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_l + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following statement: `", "\n1. ", "\n2. 
", "\n\nAnswer:\n"] + multi_column_suffix: ["`\n\n Choose 1 or 2, for which choice is the best fit to replace _ in the statement, answer only with the number given\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # logiqa + # --- + + # logiqa + # https://huggingface.co/datasets/lucasmccabe/logiqa + # ( This has a pyarrow error somehow ?, probably cause its an array/list internally ) + + # - # Openbookqa training, with the json + # source: "lucasmccabe/logiqa" + # name: "logiqa-options" + # # 4k packing + # packing_enable: True + + # # Question / Answer pairings + # multi_column_keys: ["context", "query", "options", "correct_option"] + # multi_column_prefix: [">>> Context:\n", "\n\n>>> Query:\n", "\n\nAnswer with the array index position (starting from 0), for the most appropriate option for the given query: ", "\n\n>>> Answer:\n"] + # multi_column_suffix: ["", "", "", ""] + # multi_column_train_mask: [true, true, true, true] + # multi_column_separator: "" + + # --- + # arc_easy + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_easy + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_easy" + name: "arc_easy-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answerKey"] + multi_column_prefix: ["Question: ", "\n\nUsing the text options found in the following JSON:\n", "\n\nAnswer: "] + multi_column_suffix: ["", "\n\nAnswer using only the corresponding label given in the json", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # arc_challenge + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_challenge + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_challenge" + name: "arc_challenge-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["choices", 
"question", "answerKey"] + multi_column_prefix: ["Using the text found in the following:\n", "\n\nQuestion: ", "\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer using only the respective label given", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # Piqa + # --- + + # Copa trained using + # https://huggingface.co/datasets/piqa + + - # Balanced copa, framed as choices + source: "piqa" + name: "piqa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["goal", "sol1", "sol2", "label"] + multi_column_prefix: ["# Goal: ", "\n\n0) ", "\n1) ", "\n\n# Answer: "] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # Instruct datasets + # --- + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "Open-Orca/OpenOrca" + # name: "OpenOrca" + + # multi_column_keys: ["system_prompt", "question", "response"] + # multi_column_prefix: ["Instruction:\n", "", ""] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "" + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "teknium/openhermes" + # name: "openhermes-1-instruct" + + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["", "", ""] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # --- + # Chat datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Capybara" + # name: "Capybara-chat" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\n>>> User: " + + # 
# Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\n>>> User: ", 'output': "\n\n>>> Assistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Pure-Dove" + # name: "Pure-Dove" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # --- + # Other datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "teknium/OpenHermes-2.5" + # name: "openhermes-2-convo" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'conversations' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'value': "\n\n{sender}: "} + # conversation_sender_key: 'from' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + # - # Instruct, input, output format + # # With the instruction format changed, to fix the formatting + # # https://huggingface.co/datasets/Darok/Lamini-instructions-to-french + # source: "Darok/Lamini-instructions-to-french" + # name: "Lamini-instructions-to-french" + + # multi_column_keys: ["Input", "Response"] + # multi_column_prefix: ["### Instruction:\nPlease translate the next sentence into French\n\n### Input:\n", "### Output:\n"] + # multi_column_suffix: ["", ""] + # multi_column_train_mask: [true, true] + # multi_column_separator: "\n\n" + + # - # Long range instruction format + # # https://huggingface.co/datasets/THUDM/LongAlign-10k/ + # source: "THUDM/LongAlign-10k" + # name: "LongAlign-10k" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'messages' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'content': "\n\n{sender}: "} + # conversation_sender_key: 'role' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + ###################################################### + # Note: You can probably throw in enwiki if you want + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "teven/enwiki_100k" + + # # Optional, provide a name for the dataset + # name: "enwiki_100k" + + # # Minimum / Maximum token size of the dataset to use + # min_token_size: 1024 + # max_token_size: -1 + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # - # SuperWiki (Multi-lingual) + # # https://huggingface.co/datasets/RyokoExtra/SuperWIKI-Cleaned + # source: "RyokoExtra/SuperWIKI-Cleaned" + + # # Optional, provide a name for the dataset + # name: "super_wiki" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # source_dataset_split: lang25 + + # # Custom text column to use, useful for dataset with alternative training columns labels + 
# # This is checked before multi column merging, default is null (disabled) + # # If set this takes priority + # # eg: 'code' + # # --- + # custom_text_key: 'text' + + # # All other settings found in default can be overriden here + # # --- + # # ... + + ###################################################### + # Note: We found the ML generated textbooks + # too low in perplexity that it hurts the model + # so we are using the original enwiki_100k & superwiki + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "TanvirOnHF/muse_textbooks" + + # # Optional, provide a name for the dataset + # name: "muse_textbooks" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + ###################################################### diff --git a/notebook/major-runs/Eagle-2T-retune/retune-data-build-with-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-data-build-with-mask.yaml new file mode 100644 index 00000000..37995eae --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-data-build-with-mask.yaml @@ -0,0 +1,658 @@ +# +# Custom multiple datasource, built as a single datapack +# +datapack: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # If using relative path, this should be relative to the trainer script path + data_path: /datapath/eval-retune/pack-with-mask/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Mixing mode to use, this is used to alternate between datasets + # + # - concat : Keep It Simple Silly, lets just concat the datasets together + # - shuffle : Dataset is mixed on a per sample level + # + # (@TODO: Advance operations) + # - batch : Meaning one dataset worth per batch, partial batches are discarded + mixing_mode: "shuffle" + +# +# Default settings used across all datasets in the datapack +# These settings can be overriden by the dataset specific settings +# +default: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # Datapath here is entirely optional, and only used if you intend to save each individual dataset + # seperately (makes it easier to tweak and rebuild the datapack if it crash mid-way) + # + # The dataset index will be appended to the default value, if set + # --- + data_path: /datapath/eval-retune/partial-with-mask/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Additional source dataset params, used to grab subsets of the dataset + # --- + # source_dataset_params: + # language: en + + # Sort the dataset by length, useful to reduce gpu waiting time (also useful for RWKV long context coherence) + # --- + # sort_by_length: false + # sort_asc: True # Sort in ascending order, true = shortest first, false = longest first + + # Limit the document count, to an offset/length limit + # If an int value is used, it is interpreted as document count + # If a floating value (<1.0) is used, it is interpreted as a percentage of the dataset + # --- + # dataset_offset: -1 + # dataset_length: -1 + + # Use data_dir, if you are using source=text/json/etc + # If using relative path, this should be relative to the trainer script path + # source_data_dir: ../dataset-text/ + + # After loading the dataset, split out test data used for validation, + # This process is skipped if the dataset includes a test split + # + # If given a float value, a percentage of the dataset is used (1.0 being 100%) + # If given an int value, the number of data sample is used. 
+ # + # Due to the limitations in the trainer process, there is always a minimum of 1 test sample + test_split: 1 # Intentionally set to a low sample for test, cause the real eval is humans + test_split_shuffle: True + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the HF tokenizer name/path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less than 1024 tokens from wikipedia) + # + # This is ignored, if set to -1 + # --- + min_token_size: -1 + max_token_size: -1 + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # + # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc. + # you should use single quotes to ensure such values don't get escaped. + # eg. 
multi_column_suffix: ['\n\n'] + # + # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 + # Need to use " or the new lines won't be tokenized properly + # --- + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, false, true] + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" + + # Iopairs specific config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # Sender specific config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} + + # If processing 
prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset split usage + # ---------------------------- + + source_dataset_split: "train" + test_dataset_split: "do-not-use-test-split" + + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. 
+ # + # This is tagged to datapack.batchsize, unless overriden here or on a dataset level + # --- + # packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 4096 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + + # ---------------------------- + # Specal use caes flags + # ---------------------------- + + # Reverse the training dataset order before saving, this is useful for, + # optimizing dataset packing process, when using packing_in_sequence + # and sort_by_length desc order together + reverse_train_dataset_before_save: False + +# +# The dataset specific settings +# +dataset: + + # --- + # Text based dataset + # --- + + - # Lambada training text + # https://huggingface.co/datasets/lambada + source: "lambada" + name: "lambada-train" + # 4k rechunk forced + text_rechunk_force: True + + - # Enwiki training text + # https://huggingface.co/datasets/teven/enwiki_100k + source: "teven/enwiki_100k" + name: "enwiki-train" + # 4k rechunk forced + min_token_size: 256 + text_rechunk_force: True + + # --- + # Copa style + # --- + + # Copa trained using + # https://huggingface.co/datasets/pkavumba/balanced-copa + + - # Balanced copa, framed as choices + source: "pkavumba/balanced-copa" + name: "balanced-copa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["### Premise:\n", "\n\n### Question:\nWhich choice was the", "1) ", "2) ", "\n### Answer:\n"] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: 
[false, false, false, false, true] + multi_column_separator: "" + + - # Balanced copa, framed as options + source: "pkavumba/balanced-copa" + name: "balanced-copa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["Context: ", "\n\nQuestion: Which option was the", "1. ", "2. ", "\nAnswer: "] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: [false, false, false, false, true] + multi_column_separator: "" + + # --- + # Prompt completion / Q&A datasets + # --- + + - # Question answer pair medical text + # https://huggingface.co/datasets/BI55/MedText + source: "BI55/MedText" + name: "MedText-QA" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["Prompt", "Completion"] + multi_column_prefix: ["Question:\n", "Answer:\n"] + multi_column_suffix: ["", ""] + multi_column_train_mask: [false, true] + multi_column_separator: "\n\n" + + - # Language translation prompt/completion + # https://huggingface.co/datasets/kristaller486/ALMA-prompt-completion + source: "kristaller486/ALMA-prompt-completion" + name: "ALMA-prompt-completion" + # 4k packing + packing_enable: True + # Prompt completion, nothing else else + + # --- + # openbookqa + # --- + + # openbookqa + # https://huggingface.co/datasets/allenai/openbookqa + + - # Openbookqa training, with the json + source: "allenai/openbookqa" + name: "openbookqa-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["fact1", "question_stem", "choices", "answerKey"] + multi_column_prefix: [">>> Premise:\n", "\n\nChoose the best option to complete the following:\n", "\n\nUsing the text options found in the following JSON:\n", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "\n\nAnswer using only the label given in the json", ""] + multi_column_train_mask: [false, false, false, true] + 
multi_column_separator: "" + + # --- + # Winogrande + # --- + + # Copa trained using + # https://huggingface.co/datasets/winogrande + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-debiased-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_debiased + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following sentence:\n", "\n1) ", "\n2) ", "\n\nAnswer:\n"] + multi_column_suffix: ["\n\n Choose either 1 or 2, for which option is the best fit to replace _ in the sentence\n", "", "", ""] + multi_column_train_mask: [false, false, false, true] + multi_column_separator: "" + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-l-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_l + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following statement: `", "\n1. ", "\n2. 
", "\n\nAnswer:\n"] + multi_column_suffix: ["`\n\n Choose 1 or 2, for which choice is the best fit to replace _ in the statement, answer only with the number given\n", "", "", ""] + multi_column_train_mask: [false, false, false, true] + multi_column_separator: "" + + # --- + # logiqa + # --- + + # logiqa + # https://huggingface.co/datasets/lucasmccabe/logiqa + # ( This has a pyarrow error somehow ?, probably cause its an array/list internally ) + + # - # Openbookqa training, with the json + # source: "lucasmccabe/logiqa" + # name: "logiqa-options" + # # 4k packing + # packing_enable: True + + # # Question / Answer pairings + # multi_column_keys: ["context", "query", "options", "correct_option"] + # multi_column_prefix: [">>> Context:\n", "\n\n>>> Query:\n", "\n\nAnswer with the array index position (starting from 0), for the most appropriate option for the given query: ", "\n\n>>> Answer:\n"] + # multi_column_suffix: ["", "", "", ""] + # multi_column_train_mask: [false, false, false, true] + # multi_column_separator: "" + + # --- + # arc_easy + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_easy + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_easy" + name: "arc_easy-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answerKey"] + multi_column_prefix: ["Question: ", "\n\nUsing the text options found in the following JSON:\n", "\n\nAnswer: "] + multi_column_suffix: ["", "\n\nAnswer using only the corresponding label given in the json", ""] + multi_column_train_mask: [false, false, true] + multi_column_separator: "" + + # --- + # arc_challenge + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_challenge + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_challenge" + name: "arc_challenge-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: 
["choices", "question", "answerKey"] + multi_column_prefix: ["Using the text found in the following:\n", "\n\nQuestion: ", "\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer using only the respective label given", "", ""] + multi_column_train_mask: [false, false, true] + multi_column_separator: "" + + # --- + # Piqa + # --- + + # Copa trained using + # https://huggingface.co/datasets/piqa + + - # Balanced copa, framed as choices + source: "piqa" + name: "piqa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["goal", "sol1", "sol2", "label"] + multi_column_prefix: ["# Goal: ", "\n\n0) ", "\n1) ", "\n\n# Answer: "] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [false, false, false, true] + multi_column_separator: "" + + # --- + # Instruct datasets + # --- + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "Open-Orca/OpenOrca" + # name: "OpenOrca" + + # multi_column_keys: ["system_prompt", "question", "response"] + # multi_column_prefix: ["Instruction:\n", "", ""] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [false, false, true] + # multi_column_separator: "" + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "teknium/openhermes" + # name: "openhermes-1-instruct" + + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["", "", ""] + # multi_column_train_mask: [false, false, true] + # multi_column_separator: "\n\n" + + # --- + # Chat datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Capybara" + # name: "Capybara-chat" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: 
"\n\n>>> User: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\n>>> User: ", 'output': "\n\n>>> Assistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Pure-Dove" + # name: "Pure-Dove" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # --- + # Other datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "teknium/OpenHermes-2.5" + # name: "openhermes-2-convo" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'conversations' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'value': "\n\n{sender}: "} + # conversation_sender_key: 'from' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + # - # Instruct, input, output format + # # With the instruction format changed, to fix the formatting + # # https://huggingface.co/datasets/Darok/Lamini-instructions-to-french + # source: "Darok/Lamini-instructions-to-french" + # name: "Lamini-instructions-to-french" + + # multi_column_keys: ["Input", "Response"] + # multi_column_prefix: ["### Instruction:\nPlease translate the next sentence into French\n\n### Input:\n", "### Output:\n"] + # multi_column_suffix: ["", ""] + # multi_column_train_mask: [false, true] + # multi_column_separator: "\n\n" + + # - # Long range instruction format + # # https://huggingface.co/datasets/THUDM/LongAlign-10k/ + # source: "THUDM/LongAlign-10k" + # name: "LongAlign-10k" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'messages' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'content': "\n\n{sender}: "} + # conversation_sender_key: 'role' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + ###################################################### + # Note: You can probably throw in enwiki if you want + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "teven/enwiki_100k" + + # # Optional, provide a name for the dataset + # name: "enwiki_100k" + + # # Minimum / Maximum token size of the dataset to use + # min_token_size: 1024 + # max_token_size: -1 + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # - # SuperWiki (Multi-lingual) + # # https://huggingface.co/datasets/RyokoExtra/SuperWIKI-Cleaned + # source: "RyokoExtra/SuperWIKI-Cleaned" + + # # Optional, provide a name for the dataset + # name: "super_wiki" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # source_dataset_split: lang25 + + # # Custom text column to use, useful for dataset with alternative training columns labels + 
# # This is checked before multi column merging, default is null (disabled) + # # If set this takes priority + # # eg: 'code' + # # --- + # custom_text_key: 'text' + + # # All other settings found in default can be overriden here + # # --- + # # ... + + ###################################################### + # Note: We found the ML generated textbooks + # too low in perplexity that it hurts the model + # so we are using the original enwiki_100k & superwiki + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "TanvirOnHF/muse_textbooks" + + # # Optional, provide a name for the dataset + # name: "muse_textbooks" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + ###################################################### diff --git a/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml new file mode 100644 index 00000000..d4516fc6 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml @@ -0,0 +1,119 @@ +############################################### +## +## See the full `config-example.yaml` for more +## detailes on the trainer/model configs +## +############################################### + +trainer: + # Multi node training settings + num_nodes: 1 + microbatch_size: 8 + strategy: deepspeed_stage_2 + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Resonable batch size, for a more realistic it/s rate + # this is currently overwritten in the notebook + target_batch_size: 1024 + + # Logger setting for 
wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'Eagle-2T-R4' + project: 'RWKV-V5-Eagle-2T-R4' + tags: ['Eagle', 'RWKV-V5'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: /checkpoint/retune/Eagle-R4-no-mask/ + filename: null + + # Save the top/last K checkpoints + save_top_k: 5 + # Choose the most recent checkpoints by steps + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: true + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 25 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other pytorch lightning settings, which in most cases you can remove/ignore + # --- + # verbose: false + # auto_insert_metric_name: true + +model: + # The model to load + load_model: /workspace/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth + + # Starting and ending learning rate + lr_init: 5e-6 + lr_final: 5e-6 + + # Training context length, note that the dataset can be + # larger then the context size, in which the trainer + # will process the dataset in chunks + ctx_len: 4096 + + # BPTT learning, this allows you to run the trainer against dataset + # larger then its training context length + bptt_learning: true + bptt_learning_range: 1 + +######################################## +## Training model settings +######################################## +data: + # Skip the datapath setup + # + # ignored if using the preload_datapath.py, useful for speeding up the trainer startup + # provided you have your datasets all properly preinitialized + # --- + skip_datapath_setup: True + + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. 
+ data_path: /datapath/eval-retune/pack-no-mask/ + +# Path to the current checkpoint to continue training from +# this should be the directory path, and ends with `.ckpt/` +# ckpt_path: /checkpoint/Eagle-2T-p1/last.ckpt diff --git a/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml new file mode 100644 index 00000000..5edab433 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml @@ -0,0 +1,119 @@ +############################################### +## +## See the full `config-example.yaml` for more +## detailes on the trainer/model configs +## +############################################### + +trainer: + # Multi node training settings + num_nodes: 1 + microbatch_size: 8 + strategy: deepspeed_stage_2 + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Resonable batch size, for a more realistic it/s rate + # this is currently overwritten in the notebook + target_batch_size: 1024 + + # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'Eagle-2T-R4' + project: 'RWKV-V5-Eagle-2T-R4' + tags: ['Eagle', 'RWKV-V5'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. 
+ # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: /checkpoint/retune/Eagle-R4-with-mask/ + filename: null + + # Save the top/last K checkpoints + save_top_k: 5 + # Choose the most recent checkpoints by steps + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: true + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. + # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 25 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other pytorch lightning settings, which in most cases you can remove/ignore + # --- + # verbose: false + # auto_insert_metric_name: true + +model: + # The model to load + load_model: /workspace/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth + + # Starting and ending learning rate + lr_init: 5e-6 + lr_final: 5e-6 + + # Training context length, note that the dataset can be + # larger then the context size, in which the trainer + # will process the dataset in chunks + ctx_len: 4096 + + # BPTT learning, this allows you to run the trainer against dataset + # larger then its training context length + bptt_learning: true + bptt_learning_range: 1 + +######################################## +## Training 
model settings +######################################## +data: + # Skip the datapath setup + # + # ignored if using the preload_datapath.py, useful for speeding up the trainer startup + # provided you have your datasets all properly preinitialized + # --- + skip_datapath_setup: True + + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. + data_path: /datapath/eval-retune/pack-with-mask/ + +# Path to the current checkpoint to continue training from +# this should be the directory path, and ends with `.ckpt/` +# ckpt_path: /checkpoint/Eagle-2T-p1/last.ckpt From f2acb6adb7f05029435af8dbb8b68aa029f311e0 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Thu, 7 Mar 2024 08:32:33 +0000 Subject: [PATCH 08/13] wip test split testing --- .../Eagle-2T-retune/eagle-7b-15t-runs.ipynb | 188 ++++++- .../Eagle-2T-retune/eagle-7b-base-runs.ipynb | 487 ++++++++++++++++++ 2 files changed, 670 insertions(+), 5 deletions(-) create mode 100644 notebook/major-runs/Eagle-2T-retune/eagle-7b-base-runs.ipynb diff --git a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb index 53122919..9b4ec590 100644 --- a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -371,7 +371,162 @@ " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", - "Epoch 0: 32%|▎| 936/2922 [1:11:09<2:30:59, 0.22it/s, v_num=8m91, train/tok=2.0" + "Epoch 0: 100%|▉| 2921/2922 [3:40:32<00:04, 0.22it/s, v_num=8m91, train/tok=6.29[rank5]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] 
torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 08:23:07,953] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable 
TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 08:23:07,954] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 
08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 08:23:07,955] 
torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + 
"[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 08:23:07,955] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 2922/2922 [3:41:00<00:00, 0.22it/s, v_num=8m91, train/tok=6.29\n", + "Validation: | | 0/? 
[00:00> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 2] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 1] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 6] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 3] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 4] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 5] Seed set to 2332856914\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_060016-3k4mg2rv\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-Base-No-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/3k4mg2rv\u001b[0m\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, 
the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.05054736137390137 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10110187530517578 seconds\n", + "Time to load fused_adam op: 0.10103106498718262 seconds\n", + "Time to load fused_adam op: 0.10123372077941895 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10117864608764648 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10124063491821289 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10187435150146484 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10411763191223145 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▎ | 400/2922 [27:47<2:55:13, 0.24it/s, v_num=g2rv]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 70%|▋| 2036/2922 [2:31:01<1:05:43, 0.22it/s, v_num=g2rv, train/tok=4." 
+ ] + } + ], + "source": [ + "# The 7B model\n", + "EXPERIMENT_NAME=\"7B-Base-No-Mask\"\n", + "\n", + "# Perform the validation\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_TORCH_COMPILE=1 && \\\n", + " export RWKV_NO_CUDA=0 && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/retune-train-no-mask.yaml\" \\\n", + " --model.load_model=\"{MODEL_PATH}\" \\\n", + " --model.lr_init={LEARNING_RATE} \\\n", + " --model.lr_final={LEARNING_RATE} \\\n", + " --data.skip_datapath_setup=True \\\n", + " --trainer.callbacks.init_args.dirpath=\"/checkpoint/retune/{EXPERIMENT_NAME}/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - {EXPERIMENT_NAME} ({DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.target_batch_size=1024 \\\n", + " --trainer.microbatch_size={MICROBATCH_SIZE} \\\n", + " --model.ctx_len=4096 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "EXPERIMENT_NAME=\"7B-Base-No-Mask\"\n", + "CKPT_DIR=\"last.ckpt\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"/checkpoint/retune/{EXPERIMENT_NAME}/{CKPT_DIR}/\" \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXPERIMENT_NAME=\"7B-Base-No-Mask\"\n", + "!cd \"/workspace/main-models/R4-retune/\" && \\\n", + " huggingface-cli upload rwkv-x-dev/eagle-7b-experiment \"./R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The 7B model\n", + "EXPERIMENT_NAME=\"7B-Base-With-Mask\"\n", 
+ "\n", + "# Perform the validation\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export RWKV_TORCH_COMPILE=1 && \\\n", + " export RWKV_NO_CUDA=0 && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{NOTEBOOK_DIR}/retune-train-with-mask.yaml\" \\\n", + " --model.load_model=\"{MODEL_PATH}\" \\\n", + " --model.lr_init={LEARNING_RATE} \\\n", + " --model.lr_final={LEARNING_RATE} \\\n", + " --data.skip_datapath_setup=True \\\n", + " --trainer.callbacks.init_args.dirpath=\"/checkpoint/retune/{EXPERIMENT_NAME}/\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - {EXPERIMENT_NAME} ({DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.target_batch_size=1024 \\\n", + " --trainer.microbatch_size={MICROBATCH_SIZE} \\\n", + " --model.ctx_len=4096 \\\n", + " --trainer.devices=\"{GPU_DEVICES}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets export the model from the checkpoint\n", + "EXPERIMENT_NAME=\"7B-Base-With-Mask\"\n", + "CKPT_DIR=\"last.ckpt\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python export_checkpoint.py \"/checkpoint/retune/{EXPERIMENT_NAME}/{CKPT_DIR}/\" \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"/workspace/main-models/R4-retune/R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXPERIMENT_NAME=\"7B-Base-With-Mask\"\n", + "!cd \"/workspace/main-models/R4-retune/\" && \\\n", + " huggingface-cli upload rwkv-x-dev/eagle-7b-experiment \"./R4-{EXPERIMENT_NAME}.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": 
{ + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c7ce34d90e33d675e76997833ed1521436572667 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Thu, 7 Mar 2024 09:50:32 +0000 Subject: [PATCH 09/13] WIP train, and 32k dataset --- .../Eagle-2T-retune/data-prep.ipynb | 10 + .../Eagle-2T-retune/eagle-7b-15t-runs.ipynb | 293 +++++++- .../Eagle-2T-retune/eagle-7b-base-runs.ipynb | 455 +++++++++++- .../retune-data-build-no-mask-32k.yaml | 658 ++++++++++++++++++ 4 files changed, 1406 insertions(+), 10 deletions(-) create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml diff --git a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb index cfcefcf9..ad4561d8 100644 --- a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb @@ -222,6 +222,16 @@ "# Lets build the giant datapack\n", "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask.yaml\"" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets build the giant datapack\n", + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-32k.yaml\"" + ] } ], "metadata": { diff --git a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb index 9b4ec590..3fc7ae81 100644 --- a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb @@ -585,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -593,7 
+593,8 @@ "output_type": "stream", "text": [ "Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.\n", - "R4-7B-15t-No-Mask.pth: 50%|███████▍ | 7.50G/15.0G [03:25<02:52, 43.6MB/s]" + "R4-7B-15t-No-Mask.pth: 100%|███████████████| 15.0G/15.0G [06:34<00:00, 38.1MB/s]\n", + "https://huggingface.co/rwkv-x-dev/eagle-7b-experiment/blob/main/R4-7B-15t-No-Mask.pth\n" ] } ], @@ -607,7 +608,293 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-07 08:36:08,716] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml', '--model.load_model=/workspace/main-models/Eagle-2T/chunk8-1-0.85.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-With-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml', '--model.load_model=/workspace/main-models/Eagle-2T/chunk8-1-0.85.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-With-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'].\n", + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 3634212335\n", + "Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 1024\n", + " - num_nodes: 1\n", + " - num_devices: 8\n", + " - microbatch_size: 8\n", + " - accumulate_grad_batches: 16\n", + " - effective_batch_size: 1024\n", + "\n", + "[rank: 0] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8\n", + "[2024-03-07 08:36:52,845] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:52,879] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:52,927] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:52,977] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:52,986] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:52,993] [INFO] 
[real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 08:36:53,043] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. 
Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[rank: 4] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 5] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 1] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 6] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. 
It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 3] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 2] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 7] Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "[rank: 5] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 1] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 6] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 3] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 2] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 7] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load 
finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 4] Seed set to 3634212335\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_083754-1d2d77a7\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/1d2d77a7\u001b[0m\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py 
important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.048171281814575195 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10106372833251953 seconds\n", + "Time to load fused_adam op: 0.10101103782653809 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10218214988708496 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10167479515075684 seconds\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10123562812805176 seconds\n", + "Time to load fused_adam op: 0.10174727439880371 seconds\n", + "Time to load fused_adam op: 0.10139894485473633 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▎ | 400/2922 [27:59<2:56:26, 0.24it/s, v_num=77a7]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 32%|▎| 943/2922 [1:10:51<2:28:41, 0.22it/s, v_num=77a7, train/tok=2.0" + ] + } + ], "source": [ "# The 7B model\n", "EXPERIMENT_NAME=\"7B-15t-With-Mask\"\n", diff --git a/notebook/major-runs/Eagle-2T-retune/eagle-7b-base-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/eagle-7b-base-runs.ipynb index b445e0c9..ca1677d8 100644 --- a/notebook/major-runs/Eagle-2T-retune/eagle-7b-base-runs.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/eagle-7b-base-runs.ipynb @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -346,7 +346,162 @@ " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", - "Epoch 0: 70%|▋| 2036/2922 [2:31:01<1:05:43, 0.22it/s, v_num=g2rv, train/tok=4." 
+ "Epoch 0: 100%|▉| 2921/2922 [3:36:02<00:04, 0.23it/s, v_num=g2rv, train/tok=6.29[rank1]:[2024-03-07 09:37:05,097] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 09:37:05,097] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 09:37:05,097] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + 
"[rank3]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + 
"[rank0]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 09:37:05,098] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + 
"[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 09:37:05,099] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 
09:37:05,099] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 09:37:05,100] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 2922/2922 [3:36:30<00:00, 0.22it/s, v_num=g2rv, train/tok=6.29\n", + "Validation: | | 0/? 
[00:00> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 1] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 4] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 6] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 3] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 7] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "[rank: 2] Seed set to 1642324168\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", + ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_094842-9u7s91jy\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-Base-With-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/9u7s91jy\u001b[0m\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#LOCAL_RANK: 2 
- CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.047239065170288086 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.1010434627532959 seconds\n", + "Time to load fused_adam op: 0.10138964653015137 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10202407836914062 seconds\n", + "Time to load fused_adam op: 0.10114169120788574 seconds\n", + "Time to load fused_adam op: 0.10132956504821777 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.1024618148803711 seconds\n", + "Time to load fused_adam op: 0.10168576240539551 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n" + ] + } + ], "source": [ "# The 7B model\n", "EXPERIMENT_NAME=\"7B-Base-With-Mask\"\n", diff --git a/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml b/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml new file mode 100644 index 00000000..3f16d6e9 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml @@ -0,0 +1,658 @@ +# +# Custom multiple datasource, built as a single datapack +# +datapack: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # If using relative path, this should be relative to the trainer script path + data_path: /datapath/eval-retune/pack-no-mask-32k/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Mixing mode to use, this is used to alternate between datasets + # + # - concat : Keep It Simple Silly, let's just concat the datasets together + # - shuffle : Dataset is mixed on a per sample level + # + # (@TODO: Advanced operations) + # - batch : Meaning one dataset worth per batch, partial batches are discarded + mixing_mode: "shuffle" + +# +# Default settings used across all datasets in the datapack +# These settings can be overridden by the dataset specific settings +# +default: + + # dataset_path for the prebuilt dataset, to save into using HF `save_to_disk()` + # + # Datapath here is entirely optional, and only used if you intend to save each individual dataset + # separately (makes it easier to tweak and rebuild the datapack if it crashes mid-way) + # + # The dataset index will be appended to the default value, if set + # --- + data_path: /datapath/eval-retune/partial-no-mask-32k/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options have only been tested to work with AWS S3, and backblaze.
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Additional source dataset params, used to grab subsets of the dataset + # --- + # source_dataset_params: + # language: en + + # Sort the dataset by length, useful to reduce gpu waiting time (also useful for RWKV long context coherence) + # --- + # sort_by_length: false + # sort_asc: True # Sort in ascending order, true = shortest first, false = longest first + + # Limit the document count, to an offset/length limit + # If an int value is used, it is interprated as document count + # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset + # --- + # dataset_offset: -1 + # dataset_length: -1 + + # Use data_dir, if you are using source=text/json/etc + # If using relative path, this should be relative to the trainer script path + # source_data_dir: ../dataset-text/ + + # After loading the dataset, split out test data used for validation, + # This process is skipped if the dataset includes a test split + # + # If given a float value, a percentage of the dataset is used (1.0 being 100%) + # If given an int value, the number of data sample is used. 
+ # + # Due to the limitaitons in the trainer process, there is always a minimum of 1 test sample + test_split: 1 # Intentionally set to a low sample for test, cause the real eval is humans + test_split_shuffle: True + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the HF tokenizer name/path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less then 1024 tokens from wikipedia) + # + # This is ignored, if set to -1 + # --- + min_token_size: -1 + max_token_size: -1 + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # + # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc. + # you should use single quotes to ensure such values dun get escaped. + # eg. 
multi_column_suffix: ['\n\n'] + # + # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 + # Need to use " or the new lines won't be tokenized properly + # --- + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" + + # Iopairs specific config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # Sender specific config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} + + # If processing 
prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset split usage + # ---------------------------- + + source_dataset_split: "train" + test_dataset_split: "do-not-use-test-split" + + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 32768 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. 
+ # + # This is tagged to datapack.batchsize, unless overriden here or on a dataset level + # --- + # packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: 32768 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 32768 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + + # ---------------------------- + # Specal use caes flags + # ---------------------------- + + # Reverse the training dataset order before saving, this is useful for, + # optimizing dataset packing process, when using packing_in_sequence + # and sort_by_length desc order together + reverse_train_dataset_before_save: False + +# +# The dataset specific settings +# +dataset: + + # --- + # Text based dataset + # --- + + - # Lambada training text + # https://huggingface.co/datasets/lambada + source: "lambada" + name: "lambada-train" + # 4k rechunk forced + text_rechunk_force: True + + - # Enwiki training text + # https://huggingface.co/datasets/teven/enwiki_100k + source: "teven/enwiki_100k" + name: "enwiki-train" + # 4k rechunk forced + min_token_size: 256 + text_rechunk_force: True + + # --- + # Copa style + # --- + + # Copa trained using + # https://huggingface.co/datasets/pkavumba/balanced-copa + + - # Balanced copa, framed as choices + source: "pkavumba/balanced-copa" + name: "balanced-copa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["### Premise:\n", "\n\n### Question:\nWhich choice was the", "1) ", "2) ", "\n### Answer:\n"] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: 
[true, true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as options + source: "pkavumba/balanced-copa" + name: "balanced-copa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["Context: ", "\n\nQuestion: Which option was the", "1. ", "2. ", "\nAnswer: "] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: [true, true, true, true, true] + multi_column_separator: "" + + # --- + # Prompt completion / Q&A datasets + # --- + + - # Question answer pair medical text + # https://huggingface.co/datasets/BI55/MedText + source: "BI55/MedText" + name: "MedText-QA" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["Prompt", "Completion"] + multi_column_prefix: ["Question:\n", "Answer:\n"] + multi_column_suffix: ["", ""] + multi_column_train_mask: [true, true] + multi_column_separator: "\n\n" + + - # Language translation prompt/completion + # https://huggingface.co/datasets/kristaller486/ALMA-prompt-completion + source: "kristaller486/ALMA-prompt-completion" + name: "ALMA-prompt-completion" + # 4k packing + packing_enable: True + # Prompt completion, nothing else else + + # --- + # openbookqa + # --- + + # openbookqa + # https://huggingface.co/datasets/allenai/openbookqa + + - # Openbookqa training, with the json + source: "allenai/openbookqa" + name: "openbookqa-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["fact1", "question_stem", "choices", "answerKey"] + multi_column_prefix: [">>> Premise:\n", "\n\nChoose the best option to complete the following:\n", "\n\nUsing the text options found in the following JSON:\n", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "\n\nAnswer using only the label given in the json", ""] + multi_column_train_mask: [true, true, true, true] + 
multi_column_separator: "" + + # --- + # Winogrande + # --- + + # Copa trained using + # https://huggingface.co/datasets/winogrande + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-debiased-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_debiased + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following sentence:\n", "\n1) ", "\n2) ", "\n\nAnswer:\n"] + multi_column_suffix: ["\n\n Choose either 1 or 2, for which option is the best fit to replace _ in the sentence\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-l-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_l + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following statement: `", "\n1. ", "\n2. 
", "\n\nAnswer:\n"] + multi_column_suffix: ["`\n\n Choose 1 or 2, for which choice is the best fit to replace _ in the statement, answer only with the number given\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # logiqa + # --- + + # logiqa + # https://huggingface.co/datasets/lucasmccabe/logiqa + # ( This has a pyarrow error somehow ?, probably cause its an array/list internally ) + + # - # Openbookqa training, with the json + # source: "lucasmccabe/logiqa" + # name: "logiqa-options" + # # 4k packing + # packing_enable: True + + # # Question / Answer pairings + # multi_column_keys: ["context", "query", "options", "correct_option"] + # multi_column_prefix: [">>> Context:\n", "\n\n>>> Query:\n", "\n\nAnswer with the array index position (starting from 0), for the most appropriate option for the given query: ", "\n\n>>> Answer:\n"] + # multi_column_suffix: ["", "", "", ""] + # multi_column_train_mask: [true, true, true, true] + # multi_column_separator: "" + + # --- + # arc_easy + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_easy + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_easy" + name: "arc_easy-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answerKey"] + multi_column_prefix: ["Question: ", "\n\nUsing the text options found in the following JSON:\n", "\n\nAnswer: "] + multi_column_suffix: ["", "\n\nAnswer using only the corresponding label given in the json", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # arc_challenge + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_challenge + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_challenge" + name: "arc_challenge-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["choices", 
"question", "answerKey"] + multi_column_prefix: ["Using the text found in the following:\n", "\n\nQuestion: ", "\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer using only the respective label given", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # Piqa + # --- + + # Copa trained using + # https://huggingface.co/datasets/piqa + + - # Balanced copa, framed as choices + source: "piqa" + name: "piqa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["goal", "sol1", "sol2", "label"] + multi_column_prefix: ["# Goal: ", "\n\n0) ", "\n1) ", "\n\n# Answer: "] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # Instruct datasets + # --- + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "Open-Orca/OpenOrca" + # name: "OpenOrca" + + # multi_column_keys: ["system_prompt", "question", "response"] + # multi_column_prefix: ["Instruction:\n", "", ""] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "" + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "teknium/openhermes" + # name: "openhermes-1-instruct" + + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["", "", ""] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # --- + # Chat datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Capybara" + # name: "Capybara-chat" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\n>>> User: " + + # 
# Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\n>>> User: ", 'output': "\n\n>>> Assistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Pure-Dove" + # name: "Pure-Dove" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # --- + # Other datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "teknium/OpenHermes-2.5" + # name: "openhermes-2-convo" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'conversations' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'value': "\n\n{sender}: "} + # conversation_sender_key: 'from' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + # - # Instruct, input, output format + # # With the instruction format changed, to fix the formatting + # # https://huggingface.co/datasets/Darok/Lamini-instructions-to-french + # source: "Darok/Lamini-instructions-to-french" + # name: "Lamini-instructions-to-french" + + # multi_column_keys: ["Input", "Response"] + # multi_column_prefix: ["### Instruction:\nPlease translate the next sentence into French\n\n### Input:\n", "### Output:\n"] + # multi_column_suffix: ["", ""] + # multi_column_train_mask: [true, true] + # multi_column_separator: "\n\n" + + # - # Long range instruction format + # # https://huggingface.co/datasets/THUDM/LongAlign-10k/ + # source: "THUDM/LongAlign-10k" + # name: "LongAlign-10k" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'messages' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'content': "\n\n{sender}: "} + # conversation_sender_key: 'role' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + ###################################################### + # Note: You can probably throw in enwiki if you want + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "teven/enwiki_100k" + + # # Optional, provide a name for the dataset + # name: "enwiki_100k" + + # # Minimum / Maximum token size of the dataset to use + # min_token_size: 1024 + # max_token_size: -1 + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # - # SuperWiki (Multi-lingual) + # # https://huggingface.co/datasets/RyokoExtra/SuperWIKI-Cleaned + # source: "RyokoExtra/SuperWIKI-Cleaned" + + # # Optional, provide a name for the dataset + # name: "super_wiki" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # source_dataset_split: lang25 + + # # Custom text column to use, useful for dataset with alternative training columns labels + 
# # This is checked before multi column merging, default is null (disabled) + # # If set this takes priority + # # eg: 'code' + # # --- + # custom_text_key: 'text' + + # # All other settings found in default can be overriden here + # # --- + # # ... + + ###################################################### + # Note: We found the ML generated textbooks + # too low in perplexity that it hurts the model + # so we are using the original enwiki_100k & superwiki + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "TanvirOnHF/muse_textbooks" + + # # Optional, provide a name for the dataset + # name: "muse_textbooks" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + ###################################################### From 05eb723e61e5735b9e0e2a4147495ac706585854 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Thu, 7 Mar 2024 20:57:48 +0000 Subject: [PATCH 10/13] wip dataprep --- .../Eagle-2T-retune/data-prep.ipynb | 144 ++- .../Eagle-2T-retune/eagle-7b-15t-runs.ipynb | 844 ++++++++++++++++-- .../Eagle-2T-retune/eagle-7b-base-runs.ipynb | 742 ++++++++++++++- .../retune-data-build-no-mask-no-text.yaml | 658 ++++++++++++++ .../retune-train-no-mask-32k.yaml | 119 +++ .../Eagle-2T-retune/retune-train-no-mask.yaml | 2 +- .../retune-train-with-mask.yaml | 2 +- 7 files changed, 2424 insertions(+), 87 deletions(-) create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-no-text.yaml create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-train-no-mask-32k.yaml diff --git a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb index ad4561d8..3c50068a 100644 --- a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb +++ 
b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb @@ -223,6 +223,136 @@ "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask.yaml\"" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-32k.yaml\n", + ">> Preparing dataset - index: 0 - name: lambada-train\n", + "Map (num_proc=160): 100%|███████████| 2662/2662 [00:09<00:00, 294.89 examples/s]\n", + "Filter (num_proc=160): 100%|████████| 2662/2662 [00:03<00:00, 723.14 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 2661/2661 [00:06<00:00, 436.35 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 7221/7221 [00:06<00:00, 1196.52 examples/s]\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (3/3 shards): 100%|█| 7221/7221 [00:08<00:00, 860.36 examples\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 122.92 examples/s]\n", + ">> Preparing dataset - index: 1 - name: enwiki-train\n", + "Map (num_proc=160): 100%|███| 1000000/1000000 [00:21<00:00, 46232.78 examples/s]\n", + "Filter (num_proc=160): 100%|█| 1000000/1000000 [00:07<00:00, 136515.60 examples/\n", + "Map (num_proc=160): 100%|█████| 472276/472276 [00:14<00:00, 33143.01 examples/s]\n", + "Map (num_proc=160): 100%|████████| 15456/15456 [00:12<00:00, 1265.51 examples/s]\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (7/7 shards): 100%|█| 15456/15456 [00:18<00:00, 850.72 exampl\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 108.06 examples/s]\n", + ">> Preparing dataset - index: 2 - name: balanced-copa-choices\n", + "Map 
(num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 944.86 examples/s]\n", + "Filter (num_proc=160): 100%|████████| 1000/1000 [00:01<00:00, 972.65 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:02<00:00, 414.00 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 540.03 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4872.92 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.52 examples/s]\n", + ">> Preparing dataset - index: 3 - name: balanced-copa-options\n", + "Map (num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 935.64 examples/s]\n", + "Filter (num_proc=160): 100%|████████| 1000/1000 [00:01<00:00, 968.80 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:02<00:00, 433.41 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 528.35 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5012.50 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.30 examples/s]\n", + ">> Preparing dataset - index: 4 - name: MedText-QA\n", + "Map (num_proc=160): 100%|██████████| 1412/1412 [00:01<00:00, 1370.04 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1412/1412 [00:00<00:00, 1415.80 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1411/1411 [00:02<00:00, 658.51 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1411/1411 [00:01<00:00, 766.49 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4222.99 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.36 examples/s]\n", + ">> Preparing dataset - index: 5 - name: ALMA-prompt-completion\n", + "Map (num_proc=160): 100%|█████| 117404/117404 [00:01<00:00, 84427.68 examples/s]\n", + "Filter (num_proc=160): 100%|█| 117404/117404 [00:01<00:00, 103449.36 examples/s]\n", + "Map (num_proc=160): 100%|█████| 117403/117403 
[00:02<00:00, 48025.94 examples/s]\n", + "Map (num_proc=160): 100%|█████| 117403/117403 [00:01<00:00, 61484.38 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 411/411 [00:00<00:00, 1180.88 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.24 examples/s]\n", + ">> Preparing dataset - index: 6 - name: openbookqa-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 4957/4957 [00:01<00:00, 4669.88 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 4957/4957 [00:00<00:00, 5115.47 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 4956/4956 [00:02<00:00, 2154.53 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 4956/4956 [00:01<00:00, 2685.23 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 1163.91 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.53 examples/s]\n", + ">> Preparing dataset - index: 7 - name: winogrande-debiased-choices\n", + "Map (num_proc=160): 100%|██████████| 9248/9248 [00:01<00:00, 8858.91 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 9248/9248 [00:01<00:00, 9138.30 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 9247/9247 [00:02<00:00, 3961.32 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 9247/9247 [00:01<00:00, 4858.31 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3421.11 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 24.57 examples/s]\n", + ">> Preparing dataset - index: 8 - name: winogrande-l-choices\n", + "Map (num_proc=160): 100%|███████| 10234/10234 [00:01<00:00, 10099.86 examples/s]\n", + "Filter (num_proc=160): 100%|████| 10234/10234 [00:00<00:00, 10713.05 examples/s]\n", + "Map (num_proc=160): 100%|████████| 10233/10233 [00:02<00:00, 4447.36 examples/s]\n", + "Map (num_proc=160): 100%|████████| 10233/10233 [00:01<00:00, 5208.21 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3172.52 
examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 38.39 examples/s]\n", + ">> Preparing dataset - index: 9 - name: arc_easy-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 2251/2251 [00:01<00:00, 2023.89 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 2251/2251 [00:00<00:00, 2316.50 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 2250/2250 [00:02<00:00, 997.01 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 2250/2250 [00:01<00:00, 1159.56 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4271.29 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.69 examples/s]\n", + ">> Preparing dataset - index: 10 - name: arc_challenge-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 1119/1119 [00:01<00:00, 1016.30 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1119/1119 [00:01<00:00, 1080.18 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1118/1118 [00:02<00:00, 458.11 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1118/1118 [00:01<00:00, 602.26 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4528.08 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.31 examples/s]\n", + ">> Preparing dataset - index: 11 - name: piqa-choices\n", + "Map (num_proc=160): 100%|███████| 16113/16113 [00:01<00:00, 14850.29 examples/s]\n", + "Filter (num_proc=160): 100%|████| 16113/16113 [00:01<00:00, 15600.36 examples/s]\n", + "Map (num_proc=160): 100%|████████| 16112/16112 [00:02<00:00, 7100.73 examples/s]\n", + "Map (num_proc=160): 100%|████████| 16112/16112 [00:01<00:00, 8771.49 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 884.08 examples/s\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.64 examples/s]\n", + ">> -----------------------------------\n", + ">> Dataset Mixing mode: shuffle\n", + ">> Saving dataset 
to data_path : /datapath/eval-retune/pack-no-mask-32k/\n", + "Saving the dataset (10/10 shards): 100%|█| 24528/24528 [00:19<00:00, 1269.30 exa\n", + "Saving the dataset (1/1 shards): 100%|███| 12/12 [00:00<00:00, 77.37 examples/s]\n", + ">> Dataset saved to data_path\n", + ">> -----------------------------------\n", + ">> Performing dataset counting\n", + ">> -----------------------------------\n", + ">> Final dataset count ( train ) : 24,528 samples/chunks/packs\n", + ">> Final dataset count ( test ) : 12 samples\n", + ">> -----------------------------------\n", + "Map (num_proc=160): 100%|█████████| 24528/24528 [00:31<00:00, 767.98 examples/s]\n", + "num_proc must be <= 12. Reducing num_proc to 12 for dataset of size 12.\n", + "Map (num_proc=12): 100%|█████████████████| 12/12 [00:03<00:00, 3.63 examples/s]\n", + ">> -----------------------------------\n", + ">> Final 'train' dataset token count ...\n", + ">> - Total tokens : 757,250,147\n", + ">> - Valid tokens : 749,800,541\n", + ">> - Hidden tokens : 7,449,606\n", + ">> -----------------------------------\n", + ">> Final 'test' dataset token count ...\n", + ">> - Total tokens : 66,317\n", + ">> - Valid tokens : 65,995\n", + ">> - Hidden tokens : 322\n", + ">> -----------------------------------\n" + ] + } + ], + "source": [ + "# Lets build the giant datapack\n", + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-32k.yaml\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -230,7 +360,7 @@ "outputs": [], "source": [ "# Lets build the giant datapack\n", - "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-32k.yaml\"" + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-no-text.yaml\"" ] } ], @@ -239,18 +369,6 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - 
"version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb index 3fc7ae81..0eaed5e7 100644 --- a/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/eagle-7b-15t-runs.ipynb @@ -606,19 +606,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2024-03-07 08:36:08,716] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:45:37,633] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml', '--model.load_model=/workspace/main-models/Eagle-2T/chunk8-1-0.85.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-With-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml', '--model.load_model=/workspace/main-models/Eagle-2T/chunk8-1-0.85.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-With-Mask/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'].\n", - "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 3634212335\n", - "Seed set to 3634212335\n", + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 1545430736\n", + "Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", "---\n", @@ -646,15 +646,15 @@ " - accumulate_grad_batches: 16\n", " - effective_batch_size: 1024\n", "\n", - "[rank: 0] Seed set to 3634212335\n", + "[rank: 0] Seed set to 1545430736\n", "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8\n", - "[2024-03-07 08:36:52,845] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:52,879] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:52,927] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:52,977] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:52,986] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:52,993] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2024-03-07 08:36:53,043] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:22,906] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:22,911] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:22,936] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:22,982] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:23,046] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:23,058] [INFO] 
[real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-07 10:46:23,061] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", @@ -669,25 +669,25 @@ "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", - "[rank: 4] Seed set to 3634212335\n", + "[rank: 3] Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 5] Seed set to 3634212335\n", + "[rank: 6] Seed set to 1545430736\n", + "[rank: 7] Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. 
This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 1] Seed set to 3634212335\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 6] Seed set to 3634212335\n", + "[rank: 4] Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 3] Seed set to 3634212335\n", + "[rank: 2] Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 2] Seed set to 3634212335\n", + "[rank: 5] Seed set to 1545430736\n", + "[rank: 1] Seed set to 1545430736\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", - "[rank: 7] Seed set to 3634212335\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n", "---\n", @@ -715,6 +715,14 @@ "---\n", "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", "---\n", "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", @@ -722,6 +730,9 @@ "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", "Building extension module wkv5...\n", "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", "ninja: no work to do.\n", "Loading extension module wkv5...\n", "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", @@ -747,42 +758,36 @@ "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", "Building extension module wkv5...\n", "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "---\n", - "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", "ninja: no work to do.\n", "Loading extension module wkv5...\n", "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", "---\n", - "Loading extension module wkv5...\n", - "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", - "---\n", - "[rank: 5] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + "[rank: 6] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 1] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + "[rank: 4] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 6] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + "[rank: 7] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", ">> Loading 
dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 3] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + "[rank: 1] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 2] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + "[rank: 3] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 7] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + "[rank: 2] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", - "[rank: 4] Seed set to 3634212335\n", - "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + "[rank: 5] Seed set to 1545430736\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Loading dataset from data_path: /datapath/eval-retune/pack-with-mask/\n", ">> Dataset load finished: /datapath/eval-retune/pack-with-mask/\n", @@ -792,16 +797,19 @@ "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! 
To upgrade, please run:\n", "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_083754-1d2d77a7\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_104733-tez49nfn\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-With-Mask (deepspeed_stage_2)\u001b[0m\n", "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/1d2d77a7\u001b[0m\n", - "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/tez49nfn\u001b[0m\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /checkpoint/retune/7B-15t-With-Mask exists and is not empty.\n", "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "#\n", "# RWKV 
lighting_trainer.py important notes \n", "# https://github.com/RWKV/RWKV-infctx-trainer \n", @@ -810,14 +818,12 @@ "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", "# - When resuming from checkpoint, the estimated time is inaccurate\n", "#\n", - "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "\n", "[RWKV.model] Configuring optimizer with\n", " - lr_init: 5.000e-06 (5e-06)\n", " - lr_final: 5.000e-06 (5e-06)\n", "\n", - "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", - "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", @@ -832,33 +838,33 @@ "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", "ninja: no work to do.\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.048171281814575195 seconds\n", + "Time to load fused_adam op: 0.04868745803833008 seconds\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "Loading extension module fused_adam...\n", "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10106372833251953 seconds\n", - "Time to load fused_adam op: 0.10101103782653809 seconds\n", "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10103034973144531 seconds\n", + "Time to load fused_adam op: 0.10106563568115234 seconds\n", + "Time to load fused_adam op: 0.10108828544616699 seconds\n", + "Time to load fused_adam op: 0.10117268562316895 seconds\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10218214988708496 seconds\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10167479515075684 seconds\n", - "Loading extension module fused_adam...\n", - "Loading extension module fused_adam...\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Time to load fused_adam op: 0.10123562812805176 seconds\n", - "Time to load fused_adam op: 0.10174727439880371 seconds\n", - "Time to load fused_adam op: 0.10139894485473633 seconds\n", + "Time to load fused_adam op: 0.10134577751159668 seconds\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10547518730163574 seconds\n", + "Time to load fused_adam op: 0.10280680656433105 seconds\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", @@ -875,7 +881,7 @@ "0 Non-trainable params\n", "7.5 B Total params\n", "30,072.177Total estimated model params size (MB)\n", - "Epoch 0: 14%|██▎ | 400/2922 [27:59<2:56:26, 0.24it/s, v_num=77a7]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + "Epoch 0: 14%|██▎ | 400/2922 [27:58<2:56:26, 0.24it/s, v_num=9nfn]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", @@ -891,7 +897,162 @@ " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", " warnings.warn(\n", - "Epoch 0: 32%|▎| 943/2922 [1:10:51<2:28:41, 0.22it/s, v_num=77a7, train/tok=2.0" + "Epoch 0: 100%|▉| 2921/2922 [3:57:16<00:04, 0.21it/s, v_num=9nfn, train/tok=6.29[rank2]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] 
torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 14:45:33,787] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable 
TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 14:45:33,788] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 
14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 14:45:33,789] 
torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 14:45:33,789] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + 
"[rank7]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 14:45:33,790] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 2922/2922 [3:57:43<00:00, 0.20it/s, v_num=9nfn, train/tok=6.29\n", + "Validation: | | 0/? 
[00:00> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 1] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 2] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 5] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 4] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 3] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 6] Seed set to 2778646456\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "Enabling DeepSpeed BF16. 
Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_150047-r0433zwk\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-32k-No-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/r0433zwk\u001b[0m\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance 
penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using 
bptt_learning_range=1 instead)\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.054949045181274414 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10108590126037598 seconds\n", + "Time to load fused_adam op: 0.10102558135986328 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10142970085144043 seconds\n", + "Time to load fused_adam op: 0.10099005699157715 seconds\n", + "Time to load fused_adam op: 0.10146355628967285 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10216784477233887 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.11625409126281738 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 21%|███▉ | 80/384 [43:34<2:45:35, 0.03it/s, v_num=3zwk]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█████████████████▉| 383/384 [3:36:33<00:33, 0.03it/s, v_num=3zwk][rank0]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose 
recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 18:38:06,751] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 18:38:06,752] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 18:38:06,752] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 18:38:06,752] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 18:38:06,752] 
torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 18:38:06,752] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 18:38:06,752] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to 
diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: 
[WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 18:38:06,753] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|██████████████████| 384/384 [3:37:11<00:00, 0.03it/s, v_num=3zwk]\n", + "Validation: | | 0/? [00:00> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 2] Seed set to 986777407\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 6] Seed set to 986777407\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 1] Seed set to 986777407\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 7] Seed set to 986777407\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 3] Seed set to 986777407\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "[rank: 5] Seed set to 986777407\n", + "initializing deepspeed distributed: 
GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask-32k/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240307_135626-jv8r2vut\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-Base-32k-No-Mask (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/jv8r2vut\u001b[0m\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with 
datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "[WARNING]: unlimited bptt_learning_range 
across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "[WARNING]: unlimited bptt_learning_range across multiple GPU's has a performance penalty with datasets of mixed sizes due to its constant need to keep all GPU's in sync (consider using bptt_learning_range=1 instead)\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.04860186576843262 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10135221481323242 seconds\n", + "Time to load fused_adam op: 0.10137391090393066 seconds\n", + "Time to load fused_adam op: 0.10134077072143555 seconds\n", + "Time to load fused_adam op: 0.10113263130187988 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10140562057495117 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10206794738769531 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10189056396484375 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 21%|███▉ | 80/384 [43:08<2:43:54, 0.03it/s, v_num=2vut]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|█████████████████▉| 383/384 [3:35:52<00:33, 0.03it/s, v_num=2vut][rank0]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose 
recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-07 17:33:05,019] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 17:33:05,020] 
torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-07 17:33:05,020] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to 
diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-07 17:33:05,021] 
torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-07 17:33:05,021] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-07 17:33:05,022] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-07 17:33:05,022] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-07 17:33:05,022] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|██████████████████| 384/384 [3:36:31<00:00, 0.03it/s, v_num=2vut]\n", + "Validation: | | 0/? [00:00 + # secret: + # endpoint_url: + + # Mixing mode to use, this is used to alternate between datasets + # + # - concat : Keep It Simple Silly, lets just concat the datasets together + # - shuffle : Dataset is mixed on a per sample level + # + # (@TODO: Advance operations) + # - batch : Meaning one dataset worth per batch, partial batches are discarded + mixing_mode: "shuffle" + +# +# Default settings used across all datasets in the datapack +# These settings can be overriden by the dataset specific settings +# +default: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # Datapath here is entirely optional, and only used if you intend to save each individual dataset + # seperately (makes it easier to tweak and rebuild the datapack if it crash mid-way) + # + # The dataset index will be appended to the default value, if set + # --- + data_path: /datapath/eval-retune/partial-no-mask-no-text/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Additional source dataset params, used to grab subsets of the dataset + # --- + # source_dataset_params: + # language: en + + # Sort the dataset by length, useful to reduce gpu waiting time (also useful for RWKV long context coherence) + # --- + # sort_by_length: false + # sort_asc: True # Sort in ascending order, true = shortest first, false = longest first + + # Limit the document count, to an offset/length limit + # If an int value is used, it is interprated as document count + # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset + # --- + # dataset_offset: -1 + # dataset_length: -1 + + # Use data_dir, if you are using source=text/json/etc + # If using relative path, this should be relative to the trainer script path + # source_data_dir: ../dataset-text/ + + # After loading the dataset, split out test data used for validation, + # This process is skipped if the dataset includes a test split + # + # If given a float value, a percentage of the dataset is used (1.0 being 100%) + # If given an int value, the number of data sample is used. 
+ # + # Due to the limitaitons in the trainer process, there is always a minimum of 1 test sample + test_split: 1 # Intentionally set to a low sample for test, cause the real eval is humans + test_split_shuffle: True + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the HF tokenizer name/path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less then 1024 tokens from wikipedia) + # + # This is ignored, if set to -1 + # --- + min_token_size: -1 + max_token_size: -1 + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # + # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc. + # you should use single quotes to ensure such values dun get escaped. + # eg. 
multi_column_suffix: ['\n\n'] + # + # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 + # Need to use " or the new lines won't be tokenized properly + # --- + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" + + # Iopairs specific config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # Sender specific config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} + + # If processing 
prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset split usage + # ---------------------------- + + source_dataset_split: "train" + test_dataset_split: "do-not-use-test-split" + + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. 
+ # + # This is tagged to datapack.batchsize, unless overriden here or on a dataset level + # --- + # packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 4096 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + + # ---------------------------- + # Specal use caes flags + # ---------------------------- + + # Reverse the training dataset order before saving, this is useful for, + # optimizing dataset packing process, when using packing_in_sequence + # and sort_by_length desc order together + reverse_train_dataset_before_save: False + +# +# The dataset specific settings +# +dataset: + + # # --- + # # Text based dataset + # # --- + + # - # Lambada training text + # # https://huggingface.co/datasets/lambada + # source: "lambada" + # name: "lambada-train" + # # 4k rechunk forced + # text_rechunk_force: True + + # - # Enwiki training text + # # https://huggingface.co/datasets/teven/enwiki_100k + # source: "teven/enwiki_100k" + # name: "enwiki-train" + # # 4k rechunk forced + # min_token_size: 256 + # text_rechunk_force: True + + # --- + # Copa style + # --- + + # Copa trained using + # https://huggingface.co/datasets/pkavumba/balanced-copa + + - # Balanced copa, framed as choices + source: "pkavumba/balanced-copa" + name: "balanced-copa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["### Premise:\n", "\n\n### Question:\nWhich choice was the", "1) ", "2) ", "\n### Answer:\n"] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] 
+ multi_column_train_mask: [true, true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as options + source: "pkavumba/balanced-copa" + name: "balanced-copa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["Context: ", "\n\nQuestion: Which option was the", "1. ", "2. ", "\nAnswer: "] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: [true, true, true, true, true] + multi_column_separator: "" + + # --- + # Prompt completion / Q&A datasets + # --- + + - # Question answer pair medical text + # https://huggingface.co/datasets/BI55/MedText + source: "BI55/MedText" + name: "MedText-QA" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["Prompt", "Completion"] + multi_column_prefix: ["Question:\n", "Answer:\n"] + multi_column_suffix: ["", ""] + multi_column_train_mask: [true, true] + multi_column_separator: "\n\n" + + - # Language translation prompt/completion + # https://huggingface.co/datasets/kristaller486/ALMA-prompt-completion + source: "kristaller486/ALMA-prompt-completion" + name: "ALMA-prompt-completion" + # 4k packing + packing_enable: True + # Prompt completion, nothing else else + + # --- + # openbookqa + # --- + + # openbookqa + # https://huggingface.co/datasets/allenai/openbookqa + + - # Openbookqa training, with the json + source: "allenai/openbookqa" + name: "openbookqa-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["fact1", "question_stem", "choices", "answerKey"] + multi_column_prefix: [">>> Premise:\n", "\n\nChoose the best option to complete the following:\n", "\n\nUsing the text options found in the following JSON:\n", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "\n\nAnswer using only the label given in the json", ""] + multi_column_train_mask: [true, 
true, true, true] + multi_column_separator: "" + + # --- + # Winogrande + # --- + + # Copa trained using + # https://huggingface.co/datasets/winogrande + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-debiased-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_debiased + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following sentence:\n", "\n1) ", "\n2) ", "\n\nAnswer:\n"] + multi_column_suffix: ["\n\n Choose either 1 or 2, for which option is the best fit to replace _ in the sentence\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as choices + source: "winogrande" + name: "winogrande-l-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_l + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following statement: `", "\n1. ", "\n2. 
", "\n\nAnswer:\n"] + multi_column_suffix: ["`\n\n Choose 1 or 2, for which choice is the best fit to replace _ in the statement, answer only with the number given\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # logiqa + # --- + + # logiqa + # https://huggingface.co/datasets/lucasmccabe/logiqa + # ( This has a pyarrow error somehow ?, probably cause its an array/list internally ) + + # - # Openbookqa training, with the json + # source: "lucasmccabe/logiqa" + # name: "logiqa-options" + # # 4k packing + # packing_enable: True + + # # Question / Answer pairings + # multi_column_keys: ["context", "query", "options", "correct_option"] + # multi_column_prefix: [">>> Context:\n", "\n\n>>> Query:\n", "\n\nAnswer with the array index position (starting from 0), for the most appropriate option for the given query: ", "\n\n>>> Answer:\n"] + # multi_column_suffix: ["", "", "", ""] + # multi_column_train_mask: [true, true, true, true] + # multi_column_separator: "" + + # --- + # arc_easy + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_easy + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_easy" + name: "arc_easy-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answerKey"] + multi_column_prefix: ["Question: ", "\n\nUsing the text options found in the following JSON:\n", "\n\nAnswer: "] + multi_column_suffix: ["", "\n\nAnswer using only the corresponding label given in the json", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # arc_challenge + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_challenge + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_challenge" + name: "arc_challenge-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["choices", 
"question", "answerKey"] + multi_column_prefix: ["Using the text found in the following:\n", "\n\nQuestion: ", "\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer using only the respective label given", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # Piqa + # --- + + # Copa trained using + # https://huggingface.co/datasets/piqa + + - # Balanced copa, framed as choices + source: "piqa" + name: "piqa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["goal", "sol1", "sol2", "label"] + multi_column_prefix: ["# Goal: ", "\n\n0) ", "\n1) ", "\n\n# Answer: "] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # Instruct datasets + # --- + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "Open-Orca/OpenOrca" + # name: "OpenOrca" + + # multi_column_keys: ["system_prompt", "question", "response"] + # multi_column_prefix: ["Instruction:\n", "", ""] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "" + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "teknium/openhermes" + # name: "openhermes-1-instruct" + + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["", "", ""] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # --- + # Chat datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Capybara" + # name: "Capybara-chat" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\n>>> User: " + + # 
# Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\n>>> User: ", 'output': "\n\n>>> Assistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Pure-Dove" + # name: "Pure-Dove" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # --- + # Other datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "teknium/OpenHermes-2.5" + # name: "openhermes-2-convo" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'conversations' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'value': "\n\n{sender}: "} + # conversation_sender_key: 'from' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + # - # Instruct, input, output format + # # With the instruction format changed, to fix the formatting + # # https://huggingface.co/datasets/Darok/Lamini-instructions-to-french + # source: "Darok/Lamini-instructions-to-french" + # name: "Lamini-instructions-to-french" + + # multi_column_keys: ["Input", "Response"] + # multi_column_prefix: ["### Instruction:\nPlease translate the next sentence into French\n\n### Input:\n", "### Output:\n"] + # multi_column_suffix: ["", ""] + # multi_column_train_mask: [true, true] + # multi_column_separator: "\n\n" + + # - # Long range instruction format + # # https://huggingface.co/datasets/THUDM/LongAlign-10k/ + # source: "THUDM/LongAlign-10k" + # name: "LongAlign-10k" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. 
+ # # --- + # conversation_format: 'sender' + # conversation_key: 'messages' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'content': "\n\n{sender}: "} + # conversation_sender_key: 'role' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + ###################################################### + # Note: You can probably throw in enwiki if you want + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "teven/enwiki_100k" + + # # Optional, provide a name for the dataset + # name: "enwiki_100k" + + # # Minimum / Maximum token size of the dataset to use + # min_token_size: 1024 + # max_token_size: -1 + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # - # SuperWiki (Multi-lingual) + # # https://huggingface.co/datasets/RyokoExtra/SuperWIKI-Cleaned + # source: "RyokoExtra/SuperWIKI-Cleaned" + + # # Optional, provide a name for the dataset + # name: "super_wiki" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # source_dataset_split: lang25 + + # # Custom text column to use, useful for dataset with alternative training columns labels + 
# # This is checked before multi column merging, default is null (disabled) + # # If set this takes priority + # # eg: 'code' + # # --- + # custom_text_key: 'text' + + # # All other settings found in default can be overriden here + # # --- + # # ... + + ###################################################### + # Note: We found the ML generated textbooks + # too low in perplexity that it hurts the model + # so we are using the original enwiki_100k & superwiki + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "TanvirOnHF/muse_textbooks" + + # # Optional, provide a name for the dataset + # name: "muse_textbooks" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + ###################################################### diff --git a/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask-32k.yaml b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask-32k.yaml new file mode 100644 index 00000000..634d556e --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask-32k.yaml @@ -0,0 +1,119 @@ +############################################### +## +## See the full `config-example.yaml` for more +## detailes on the trainer/model configs +## +############################################### + +trainer: + # Multi node training settings + num_nodes: 1 + microbatch_size: 8 + strategy: deepspeed_stage_2 + + # Generally what you want to configure is the maximum number of epochs + # Leave it as -1, and it will keep going forever till interrupted + # Or set it as a number, and it will stop after that number of epochs + max_epochs: 1 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + + # Resonable batch size, for a more realistic it/s rate + # this is currently overwritten in the notebook + target_batch_size: 1024 + + # Logger 
setting for wandb, if you want to enable wandb, uncomment the whole logger section + # --- + logger: + class_path: lightning.pytorch.loggers.WandbLogger + init_args: + name: 'Eagle-2T-R4' + project: 'RWKV-V5-Eagle-2T-R4' + tags: ['Eagle', 'RWKV-V5'] + + # Checkpoint settings for the training process + callbacks: + class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + # Configure this to the path you want to save your checkpoints to + # note that a subdir will be created with the name `epoch=x-step=y.ckpt` + # + # to convert a checkpoint to a model, you can use the + # `python3 export_checkpoint.py ` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: /checkpoint/retune/Eagle-R4-no-mask/ + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose the most recent checkpoints by steps + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: true + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 5 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other pytorch lightning settings, which in most cases you can remove/ignore + # --- + # verbose: false + # auto_insert_metric_name: true + +model: + # The model to load + load_model: /workspace/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth + + # Starting and ending learning rate + lr_init: 5e-6 + lr_final: 5e-6 + + # Training context length, note that the dataset can be + # larger then the context size, in which the trainer + # will process the dataset in chunks + ctx_len: 4096 + + # BPTT learning, this allows you to run the trainer against dataset + # larger then its training context length + bptt_learning: true + bptt_learning_range: -1 + +######################################## +## Training model settings +######################################## +data: + # Skip the datapath setup + # + # ignored if using the preload_datapath.py, useful for speeding up the trainer startup + # provided you have your datasets all properly preinitialized + # --- + skip_datapath_setup: True + + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. 
+ data_path: /datapath/eval-retune/pack-no-mask-32k/ + +# Path to the current checkpoint to continue training from +# this should be the directory path, and ends with `.ckpt/` +# ckpt_path: /checkpoint/Eagle-2T-p1/last.ckpt diff --git a/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml index d4516fc6..7b7203a4 100644 --- a/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml +++ b/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml @@ -49,7 +49,7 @@ trainer: filename: null # Save the top/last K checkpoints - save_top_k: 5 + save_top_k: 3 # Choose the most recent checkpoints by steps monitor: 'step' mode: max diff --git a/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml b/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml index 5edab433..60669879 100644 --- a/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml +++ b/notebook/major-runs/Eagle-2T-retune/retune-train-with-mask.yaml @@ -49,7 +49,7 @@ trainer: filename: null # Save the top/last K checkpoints - save_top_k: 5 + save_top_k: 3 # Choose the most recent checkpoints by steps monitor: 'step' mode: max From 11a8f302394dbbf7b5a043250f1bdc3d624a345d Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Fri, 8 Mar 2024 21:56:47 +0000 Subject: [PATCH 11/13] handling data as boolean --- RWKV-v5/src/data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/RWKV-v5/src/data.py b/RWKV-v5/src/data.py index f0540a8d..8987defe 100644 --- a/RWKV-v5/src/data.py +++ b/RWKV-v5/src/data.py @@ -424,6 +424,13 @@ def encodeTokens(x, enforceSingleItem = False): # Dictionary to json string x = json.dumps(x) + # Converting from boolean + if isinstance(x, bool): + if x: + x = "true" + else: + x = "false" + # Enforce string type x = str(x) From d92047c4250915f497181a47f7cedb32beea584c Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Fri, 8 Mar 2024 21:57:09 +0000 
Subject: [PATCH 12/13] wip simpleRWKV support --- RWKV-v5/src/model.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/RWKV-v5/src/model.py b/RWKV-v5/src/model.py index 86e79b2b..39f4d6f3 100644 --- a/RWKV-v5/src/model.py +++ b/RWKV-v5/src/model.py @@ -1526,7 +1526,7 @@ class SimpleRWKV(): def __init__( self, model_path: str, - ctx_len:int = 1024, + ctx_len:int = 256, device:str = "cuda", dtype:str = "fp32" ): @@ -1610,10 +1610,38 @@ def _forward( # The all_logits array, if requested all_logits_arr = None - # For each token, process the state, in batches up to ctx_len - for i in range(0, token_len, self.ctx_len): + # Number of times we can do batched + full_len_chunk = token_len // self.ctx_len + full_len_remain = token_len % self.ctx_len + + # # For each token, we can process in full ctx_len batches + # for i in range(0, full_len_chunk * self.ctx_len, self.ctx_len): + # # Token set + # token_set = tokens[i:i+self.ctx_len] + + # # Check if tokens are already tensors + # batch_tokens = torch.tensor( + # token_set, + # dtype=torch.long, device=self.device + # ).unsqueeze(0) + + # # Compute the logits and state + # logits_arr, shift_states, wkv_states = self.model.forward( + # batch_tokens, shift_states, wkv_states + # ) + + # # Build the all_logits array + # if all_logits: + # if all_logits_arr is None: + # all_logits_arr = logits_arr[0] + # else: + # all_logits_arr = torch.cat([all_logits_arr, logits_arr[0]], dim=0) + + # For each remaining token, after the full batches + # full_len_chunk * self.ctx_len + for i in range(0, token_len, 1): # Token set - token_set = tokens[i:i+self.ctx_len] + token_set = tokens[i:i+1] # Check if tokens are already tensors batch_tokens = torch.tensor( From fb6023ef4ebf5e90085ff7cd0b2d92507407ee09 Mon Sep 17 00:00:00 2001 From: "Eugene Cheah (picocreator)" Date: Fri, 8 Mar 2024 21:57:17 +0000 Subject: [PATCH 13/13] wip data prep --- .../Eagle-2T-retune/data-prep.ipynb | 215 +++- 
.../retune-extd-data-build.yaml | 694 +++++++++++ .../Eagle-2T-retune/retune-extd-runs.ipynb | 1110 +++++++++++++++++ .../Eagle-2T-retune/retune-extd-train.yaml | 119 ++ 4 files changed, 2136 insertions(+), 2 deletions(-) create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-extd-data-build.yaml create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-extd-runs.ipynb create mode 100644 notebook/major-runs/Eagle-2T-retune/retune-extd-train.yaml diff --git a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb index 3c50068a..071ebef9 100644 --- a/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb +++ b/notebook/major-runs/Eagle-2T-retune/data-prep.ipynb @@ -355,13 +355,212 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-data-build-no-mask-no-text.yaml\n", + ">> Preparing dataset - index: 0 - name: balanced-copa-choices\n", + "Map (num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 749.89 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1000/1000 [00:00<00:00, 1110.68 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:02<00:00, 452.39 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 589.03 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5295.46 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.34 examples/s]\n", + ">> Preparing dataset - index: 1 - name: balanced-copa-options\n", + "Map (num_proc=160): 100%|███████████| 1000/1000 [00:01<00:00, 949.99 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1000/1000 [00:00<00:00, 1047.64 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 
[00:02<00:00, 462.29 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 999/999 [00:01<00:00, 590.28 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5193.26 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.34 examples/s]\n", + ">> Preparing dataset - index: 2 - name: MedText-QA\n", + "Map (num_proc=160): 100%|██████████| 1412/1412 [00:00<00:00, 1440.98 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1412/1412 [00:00<00:00, 1467.08 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1411/1411 [00:02<00:00, 684.53 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1411/1411 [00:01<00:00, 815.58 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4376.36 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.90 examples/s]\n", + ">> Preparing dataset - index: 3 - name: ALMA-prompt-completion\n", + "Map (num_proc=160): 100%|█████| 117404/117404 [00:01<00:00, 96403.92 examples/s]\n", + "Filter (num_proc=160): 100%|█| 117404/117404 [00:01<00:00, 114128.31 examples/s]\n", + "Map (num_proc=160): 100%|█████| 117403/117403 [00:02<00:00, 55460.76 examples/s]\n", + "Map (num_proc=160): 100%|█████| 117403/117403 [00:01<00:00, 63525.84 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 2655/2655 [00:00<00:00, 10049.60 exampl\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 40.57 examples/s]\n", + ">> Preparing dataset - index: 4 - name: openbookqa-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 4957/4957 [00:00<00:00, 5020.61 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 4957/4957 [00:00<00:00, 5386.22 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 4956/4956 [00:02<00:00, 2307.23 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 4956/4956 [00:01<00:00, 2834.50 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 2867.67 examples/\n", + 
"Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 46.20 examples/s]\n", + ">> Preparing dataset - index: 5 - name: winogrande-debiased-choices\n", + "Map (num_proc=160): 100%|██████████| 9248/9248 [00:00<00:00, 9789.46 examples/s]\n", + "Filter (num_proc=160): 100%|██████| 9248/9248 [00:00<00:00, 10080.53 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 9247/9247 [00:02<00:00, 4182.93 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 9247/9247 [00:01<00:00, 5169.89 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 2348.11 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.32 examples/s]\n", + ">> Preparing dataset - index: 6 - name: winogrande-l-choices\n", + "Map (num_proc=160): 100%|███████| 10234/10234 [00:00<00:00, 10300.11 examples/s]\n", + "Filter (num_proc=160): 100%|████| 10234/10234 [00:00<00:00, 10846.91 examples/s]\n", + "Map (num_proc=160): 100%|████████| 10233/10233 [00:02<00:00, 4792.37 examples/s]\n", + "Map (num_proc=160): 100%|████████| 10233/10233 [00:01<00:00, 5768.97 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 320/320 [00:00<00:00, 5209.75 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.06 examples/s]\n", + ">> Preparing dataset - index: 7 - name: arc_easy-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 2251/2251 [00:00<00:00, 2320.81 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 2251/2251 [00:00<00:00, 2281.46 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 2250/2250 [00:02<00:00, 1034.23 examples/s]\n", + "Map (num_proc=160): 100%|██████████| 2250/2250 [00:01<00:00, 1319.60 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3744.25 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.58 examples/s]\n", + ">> Preparing dataset - index: 8 - name: arc_challenge-answer-choice\n", + "Map (num_proc=160): 100%|██████████| 
1119/1119 [00:01<00:00, 1100.39 examples/s]\n", + "Filter (num_proc=160): 100%|███████| 1119/1119 [00:00<00:00, 1173.84 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1118/1118 [00:02<00:00, 515.15 examples/s]\n", + "Map (num_proc=160): 100%|███████████| 1118/1118 [00:01<00:00, 626.97 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4203.71 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.67 examples/s]\n", + ">> Preparing dataset - index: 9 - name: piqa-choices\n", + "Map (num_proc=160): 100%|███████| 16113/16113 [00:00<00:00, 16423.87 examples/s]\n", + "Filter (num_proc=160): 100%|████| 16113/16113 [00:00<00:00, 18913.24 examples/s]\n", + "Map (num_proc=160): 100%|████████| 16112/16112 [00:02<00:00, 7504.01 examples/s]\n", + "Map (num_proc=160): 100%|████████| 16112/16112 [00:01<00:00, 9118.82 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 329/329 [00:00<00:00, 4519.52 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 42.57 examples/s]\n", + ">> -----------------------------------\n", + ">> Dataset Mixing mode: shuffle\n", + ">> Saving dataset to data_path : /datapath/eval-retune/pack-no-mask-no-text/\n", + "Saving the dataset (1/1 shards): 100%|█| 4424/4424 [00:00<00:00, 5063.70 example\n", + "Saving the dataset (1/1 shards): 100%|███| 10/10 [00:00<00:00, 64.43 examples/s]\n", + ">> Dataset saved to data_path\n", + ">> -----------------------------------\n", + ">> Performing dataset counting\n", + ">> -----------------------------------\n", + ">> Final dataset count ( train ) : 4,424 samples/chunks/packs\n", + ">> Final dataset count ( test ) : 10 samples\n", + ">> -----------------------------------\n", + "Map (num_proc=160): 100%|███████████| 4424/4424 [00:22<00:00, 197.19 examples/s]\n", + "num_proc must be <= 10. 
Reducing num_proc to 10 for dataset of size 10.\n", + "Map (num_proc=10): 100%|█████████████████| 10/10 [00:02<00:00, 4.14 examples/s]\n", + ">> -----------------------------------\n", + ">> Final 'train' dataset token count ...\n", + ">> - Total tokens : 14,167,638\n", + ">> - Valid tokens : 6,718,032\n", + ">> - Hidden tokens : 7,449,606\n", + ">> -----------------------------------\n", + ">> Final 'test' dataset token count ...\n", + ">> - Total tokens : 781\n", + ">> - Valid tokens : 459\n", + ">> - Hidden tokens : 322\n", + ">> -----------------------------------\n" + ] + } + ], "source": [ "# Lets build the giant datapack\n", "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-data-build-no-mask-no-text.yaml\"" ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Starting datapack build process for: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune5-data-build.yaml\n", + ">> Preparing dataset - index: 0 - name: lambada-train\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (3/3 shards): 100%|█| 58333/58333 [00:05<00:00, 10642.25 exam\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 96.40 examples/s]\n", + ">> Preparing dataset - index: 1 - name: enwiki-train\n", + "Warning: packing_enable=true, with text rechunking (either auto, or forced) - packing_enable will be treated as false\n", + "Saving the dataset (7/7 shards): 100%|█| 124218/124218 [00:11<00:00, 11074.44 ex\n", + "Saving the dataset (1/1 shards): 100%|████| 1/1 [00:00<00:00, 120.13 examples/s]\n", + ">> Preparing dataset - index: 2 - name: balanced-copa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4993.37 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 
[00:00<00:00, 46.95 examples/s]\n", + ">> Preparing dataset - index: 3 - name: balanced-copa-options\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5056.73 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.42 examples/s]\n", + ">> Preparing dataset - index: 4 - name: MedText-QA\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3783.83 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.22 examples/s]\n", + ">> Preparing dataset - index: 5 - name: ALMA-prompt-completion\n", + "Saving the dataset (1/1 shards): 100%|█| 2655/2655 [00:00<00:00, 10179.92 exampl\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 32.20 examples/s]\n", + ">> Preparing dataset - index: 6 - name: openbookqa-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3284.31 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.81 examples/s]\n", + ">> Preparing dataset - index: 7 - name: winogrande-debiased-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 3196.16 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.81 examples/s]\n", + ">> Preparing dataset - index: 8 - name: winogrande-l-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 320/320 [00:00<00:00, 5800.67 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 41.56 examples/s]\n", + ">> Preparing dataset - index: 9 - name: logiqa-options\n", + "Saving the dataset (1/1 shards): 100%|█| 480/480 [00:00<00:00, 5771.96 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 37.33 examples/s]\n", + ">> Preparing dataset - index: 10 - name: arc_easy-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4189.54 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 45.12 examples/s]\n", + ">> 
Preparing dataset - index: 11 - name: arc_challenge-answer-choice\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 4675.01 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 44.19 examples/s]\n", + ">> Preparing dataset - index: 12 - name: piqa-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 329/329 [00:00<00:00, 5608.34 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.35 examples/s]\n", + ">> Preparing dataset - index: 13 - name: boolq-choices\n", + "Saving the dataset (1/1 shards): 100%|█| 480/480 [00:00<00:00, 7115.70 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 41.09 examples/s]\n", + ">> Preparing dataset - index: 14 - name: mmlu-choices\n", + "Map (num_proc=160): 100%|█████████████| 285/285 [00:01<00:00, 276.45 examples/s]\n", + "Filter (num_proc=160): 100%|██████████| 285/285 [00:01<00:00, 274.18 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 284/284 [00:01<00:00, 145.31 examples/s]\n", + "Map (num_proc=160): 100%|█████████████| 284/284 [00:01<00:00, 176.93 examples/s]\n", + "Saving the dataset (1/1 shards): 100%|█| 160/160 [00:00<00:00, 5182.07 examples/\n", + "Saving the dataset (1/1 shards): 100%|█████| 1/1 [00:00<00:00, 43.49 examples/s]\n", + ">> -----------------------------------\n", + ">> Dataset Mixing mode: shuffle\n", + ">> Saving dataset to data_path : /datapath/eval-retune/pack-5-no-mask/\n", + "Saving the dataset (10/10 shards): 100%|█| 188095/188095 [00:21<00:00, 8880.70 e\n", + "Saving the dataset (1/1 shards): 100%|███| 15/15 [00:00<00:00, 72.47 examples/s]\n", + ">> Dataset saved to data_path\n", + ">> -----------------------------------\n", + ">> Performing dataset counting\n", + ">> -----------------------------------\n", + ">> Final dataset count ( train ) : 188,095 samples/chunks/packs\n", + ">> Final dataset count ( test ) : 15 samples\n", + ">> -----------------------------------\n", + "Map 
(num_proc=160): 100%|██████| 188095/188095 [00:30<00:00, 6192.88 examples/s]\n", + "num_proc must be <= 15. Reducing num_proc to 15 for dataset of size 15.\n", + "Map (num_proc=15): 100%|█████████████████| 15/15 [00:02<00:00, 5.65 examples/s]\n", + ">> -----------------------------------\n", + ">> Final 'train' dataset token count ...\n", + ">> - Total tokens : 764,990,976\n", + ">> - Valid tokens : 757,024,325\n", + ">> - Hidden tokens : 7,966,651\n", + ">> -----------------------------------\n", + ">> Final 'test' dataset token count ...\n", + ">> - Total tokens : 9,469\n", + ">> - Valid tokens : 9,055\n", + ">> - Hidden tokens : 414\n", + ">> -----------------------------------\n" + ] + } + ], + "source": [ + "# Lets build the giant datapack\n", + "!cd \"{TRAINER_DIR}\" && python3 datapack_build.py \"{NOTEBOOK_DIR}/retune-extd-data-build.yaml\"" + ] } ], "metadata": { @@ -369,6 +568,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebook/major-runs/Eagle-2T-retune/retune-extd-data-build.yaml b/notebook/major-runs/Eagle-2T-retune/retune-extd-data-build.yaml new file mode 100644 index 00000000..708b9e37 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-extd-data-build.yaml @@ -0,0 +1,694 @@ +# +# Custom multiple datasource, built as a single datapack +# +datapack: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # If using relative path, this should be relative to the trainer script path + data_path: /datapath/eval-retune/pack-5-no-mask/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. 
See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Mixing mode to use, this is used to alternate between datasets + # + # - concat : Keep It Simple Silly, lets just concat the datasets together + # - shuffle : Dataset is mixed on a per sample level + # + # (@TODO: Advance operations) + # - batch : Meaning one dataset worth per batch, partial batches are discarded + mixing_mode: "shuffle" + +# +# Default settings used across all datasets in the datapack +# These settings can be overriden by the dataset specific settings +# +default: + + # dataset_path for the prebuilt dataset, to save into using HF `save _to_disk()` + # + # Datapath here is entirely optional, and only used if you intend to save each individual dataset + # seperately (makes it easier to tweak and rebuild the datapack if it crash mid-way) + # + # The dataset index will be appended to the default value, if set + # --- + data_path: /datapath/eval-retune/partial-xl-no-mask/ + + # Data path storage options, this is used to support cloud storage + # via the huggingface dataset API. See: + # https://huggingface.co/docs/datasets/v2.16.1/en/filesystems#amazon-s3 + # + # Note: As of Jan 2023, these options has been only tested to work with AWS S3, and backblaze. 
YMMV + # For S3 bucket support you will also need to install s3fs `python3 -m pip install s3fs` + # + # If you want to reduce the risk of accidental key/secret commits, you can use + # `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables instead + # + # For datapath, it should use the `s3://bucket-name/subpath` format + # --- + # data_path_storage_options: + # key: + # secret: + # endpoint_url: + + # Additional source dataset params, used to grab subsets of the dataset + # --- + # source_dataset_params: + # language: en + + # Sort the dataset by length, useful to reduce gpu waiting time (also useful for RWKV long context coherence) + # --- + # sort_by_length: false + # sort_asc: True # Sort in ascending order, true = shortest first, false = longest first + + # Limit the document count, to an offset/length limit + # If an int value is used, it is interprated as document count + # If a floating value (<1.0) is used, it is interprated as a percentage of the dataset + # --- + # dataset_offset: -1 + # dataset_length: -1 + + # Use data_dir, if you are using source=text/json/etc + # If using relative path, this should be relative to the trainer script path + # source_data_dir: ../dataset-text/ + + # After loading the dataset, split out test data used for validation, + # This process is skipped if the dataset includes a test split + # + # If given a float value, a percentage of the dataset is used (1.0 being 100%) + # If given an int value, the number of data sample is used. 
+ # + # Due to the limitaitons in the trainer process, there is always a minimum of 1 test sample + test_split: 1 # Intentionally set to a low sample for test, cause the real eval is humans + test_split_shuffle: True + + # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer + # If using a custom tokenizer, provide the HF tokenizer name/path + # --- + tokenizer: world + + # Minimum / Maximum token size of the dataset to use + # useful for filtering out small noisy data samples from large datasets + # (eg. removal of small articles of less then 1024 tokens from wikipedia) + # + # This is ignored, if set to -1 + # --- + min_token_size: -1 + max_token_size: -1 + + # Custom text column to use, useful for dataset with alternative training columns labels + # This is checked before multi column merging, default is null (disabled) + # eg: 'code' + # --- + # custom_text_key: 'code' + + # Multi Column merging process, default setting is used to support and merge + # "instruction", "input", "output", datasets. To disable set multi_column_keys to [] + # + # A minimum of 2 columns is required, with non empty data, for the merge to occur + # If no match is found, this will fallback to the default prompt/completion or text column, + # or throw an error if the default fallback is not found + # + # IMPORTANT NOTE: as newlines are commonly used for multi_column_suffix, etc. + # you should use single quotes to ensure such values dun get escaped. + # eg. 
multi_column_suffix: ['\n\n'] + # + # See: https://github.com/RWKV/RWKV-infctx-trainer/issues/34 + # Need to use " or the new lines won't be tokenized properly + # --- + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # Conversation merging process + # useful for merging full conversational datasets, into single documents + # default is off, (or set conversation_key to []) + # conversation_formatting supports "iopairs" or "sender" for now. + # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser:" + + # Iopairs specific config + # This means that every object in the conversation object is a pair of input output. + # In future it will also support a format where one of the keys dictates the format style + # if conversation_key is set to null, it will use the root object as the conversation object + # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # Sender specific config + # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # conversation_input_key_map: {'message': "\n\n{sender}: ", 'context': ''} + # conversation_sender_key: 'sender' + # conversation_sender_value_map: {'user': 'User', 'assistant': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'assistant': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'assistant': "", 'system': ""} + + # If processing 
prompt/completion jsonl pairs, the prompt is masked by default + # use this flag to disable this default behaviour + # --- + # disable_prompt_completion_mask: false + + # ---------------------------- + # Dataset split usage + # ---------------------------- + + source_dataset_split: "train" + test_dataset_split: "do-not-use-test-split" + + # ---------------------------- + # Rechunking support + # ---------------------------- + + # Rechunking of text dataset, this is done only when source is set as 'text' + # and will merge the various sentencees, into larger chunks up to the target size + # + # Defaults to 2048 + # + # This is ignored, if source is not set as text (unless text_rechunk_force) + # This is ignored, if set to zero / -1 + # --- + text_rechunk_size: 4096 + + # Apply text rechunk to the dataset, even if its not a 'text' source + # This is done only after dataset filtering, and if source is not 'text' + # --- + text_rechunk_force: False + + # Used to disable the automated text rechunkin for text files, if set as false + # --- + text_rechunk_auto: True + + # ---------------------------- + # Dataset packing support + # Recommended to be used with mixed documents sized finetuning + # For foundation model "from scratch", rechunking is typically used instead + # ---------------------------- + + # Boolean flag to enable / disable dataset packing + packing_enable: True + + # Used to ensure all training samples wihin this batch size is the same length + # Ideally this should align exactly with your real "batch size" + # + # Uses, `8 * (3 * 4 * 5 * 6 * 7) = 20160` for default, as it should align across + # a large number of batch size combinations. This helps reduce the amount of + # misaligned batches, and thus reduce the amount of wasted training time. 
+ # + # This is tagged to datapack.batchsize, unless overriden here or on a dataset level + # --- + # packing_batchsize: 20160 + + # Chunking size to align within each batch, this ideally should be equal to + # the training context length used. + packing_chunksize: 4096 + + # Minimum size to pack up to, this should be a multiple of packing_chunksize + # defautls to -1, which equals to packing_chunksize + packing_min_ctx_len: 4096 + + # Pack the data sequentially if possible, in accordance to the dataset sequence + # this can be used together with sort_by_length, otherwise a shuffle will be done + packing_in_sequence: False + + # ---------------------------- + # Specal use caes flags + # ---------------------------- + + # Reverse the training dataset order before saving, this is useful for, + # optimizing dataset packing process, when using packing_in_sequence + # and sort_by_length desc order together + reverse_train_dataset_before_save: False + +# +# The dataset specific settings +# +dataset: + + # --- + # Text based dataset + # --- + + - # Lambada training text + # https://huggingface.co/datasets/lambada + source: "lambada" + name: "lambada-train" + # 4k rechunk forced + text_rechunk_force: True + + - # Enwiki training text + # https://huggingface.co/datasets/teven/enwiki_100k + source: "teven/enwiki_100k" + name: "enwiki-train" + # 4k rechunk forced + min_token_size: 256 + text_rechunk_force: True + + # --- + # Copa style + # --- + + # Copa trained using + # https://huggingface.co/datasets/pkavumba/balanced-copa + + - # Balanced copa, framed as choices + source: "pkavumba/balanced-copa" + name: "balanced-copa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["### Premise:\n", "\n\n### Question:\nWhich choice was the", "1) ", "2) ", "\n### Answer:\n"] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: 
[true, true, true, true, true] + multi_column_separator: "" + + - # Balanced copa, framed as options + source: "pkavumba/balanced-copa" + name: "balanced-copa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["premise", "question", "choice1", "choice2", "label"] + multi_column_prefix: ["Context: ", "\n\nQuestion: Which option was the", "1. ", "2. ", "\nAnswer: "] + multi_column_suffix: ["", "?\n\n", "\n", "\n", ""] + multi_column_train_mask: [true, true, true, true, true] + multi_column_separator: "" + + # --- + # Prompt completion / Q&A datasets + # --- + + - # Question answer pair medical text + # https://huggingface.co/datasets/BI55/MedText + source: "BI55/MedText" + name: "MedText-QA" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["Prompt", "Completion"] + multi_column_prefix: ["Question:\n", "Answer:\n"] + multi_column_suffix: ["", ""] + multi_column_train_mask: [true, true] + multi_column_separator: "\n\n" + + - # Language translation prompt/completion + # https://huggingface.co/datasets/kristaller486/ALMA-prompt-completion + source: "kristaller486/ALMA-prompt-completion" + name: "ALMA-prompt-completion" + # 4k packing + packing_enable: True + # Prompt completion, nothing else else + + # --- + # openbookqa + # --- + + # openbookqa + # https://huggingface.co/datasets/allenai/openbookqa + + - # Openbookqa training, with the json + source: "allenai/openbookqa" + name: "openbookqa-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["fact1", "question_stem", "choices", "answerKey"] + multi_column_prefix: [">>> Premise:\n", "\n\nChoose the best option to complete the following:\n", "\n\nUsing the text options found in the following JSON:\n", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "\n\nAnswer using only the label given in the json", ""] + multi_column_train_mask: [true, true, true, true] + 
multi_column_separator: "" + + # --- + # Winogrande + # --- + + # Copa trained using + # https://huggingface.co/datasets/winogrande + + - # Balanced winogrande, framed as choices + source: "winogrande" + name: "winogrande-debiased-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_debiased + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following sentence:\n", "\n1) ", "\n2) ", "\n\nAnswer:\n"] + multi_column_suffix: ["\n\n Choose either 1 or 2, for which option is the best fit to replace _ in the sentence\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + - # Balanced winogrande, framed as choices + source: "winogrande" + name: "winogrande-l-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: winogrande_l + + # Question / Answer pairings + multi_column_keys: ["sentence", "option1", "option2", "answer"] + multi_column_prefix: ["For the following statement: `", "\n1. ", "\n2. 
", "\n\nAnswer:\n"] + multi_column_suffix: ["`\n\n Choose 1 or 2, for which choice is the best fit to replace _ in the statement, answer only with the number given\n", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # logiqa + # --- + + # logiqa + # https://huggingface.co/datasets/lucasmccabe/logiqa + + - # Openbookqa training, with the json + source: "lucasmccabe/logiqa" + name: "logiqa-options" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["context", "query", "options", "correct_option"] + multi_column_prefix: [">>> Context:\n", "\n\n>>> Query:\n", "\n\nAnswer with the array index position (starting from 0), for the most appropriate option for the given query: ", "\n\n>>> Answer:\n"] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # arc_easy + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_easy + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_easy" + name: "arc_easy-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answerKey"] + multi_column_prefix: ["Question: ", "\n\nUsing the text options found in the following JSON:\n", "\n\nAnswer: "] + multi_column_suffix: ["", "\n\nAnswer using only the corresponding label given in the json", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # arc_challenge + # --- + + # arc_easy + # https://huggingface.co/datasets/ibragim-bad/arc_challenge + + - # Openbookqa training, with the json + source: "ibragim-bad/arc_challenge" + name: "arc_challenge-answer-choice" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["choices", "question", "answerKey"] + multi_column_prefix: ["Using the text found in the following:\n", "\n\nQuestion: ", 
"\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer using only the respective label given", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # Piqa + # --- + + # Copa trained using + # https://huggingface.co/datasets/piqa + + - # Balanced copa, framed as choices + source: "piqa" + name: "piqa-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["goal", "sol1", "sol2", "label"] + multi_column_prefix: ["# Goal: ", "\n\n0) ", "\n1) ", "\n\n# Answer: "] + multi_column_suffix: ["", "", "", ""] + multi_column_train_mask: [true, true, true, true] + multi_column_separator: "" + + # --- + # Boolq + # --- + + - # Boolq true/false + source: "boolq" + name: "boolq-choices" + # 4k packing + packing_enable: True + + # Question / Answer pairings + multi_column_keys: ["passage", "question", "answer"] + multi_column_prefix: ["Study the following passage:\n\n", "\n\nQuestion: ", "\n\nAnswer: "] + multi_column_suffix: ["\n\nAnswer the question, with either true or false only", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # MMLU + # --- + + - # Balanced winogrande, framed as choices + source: "cais/mmlu" + name: "mmlu-choices" + # 4k packing + packing_enable: True + source_dataset_params: + name: all + source_dataset_split: "dev" + + # Question / Answer pairings + multi_column_keys: ["question", "choices", "answer"] + multi_column_prefix: ["\n\n### Question:\n", "\n\nAnswer with the array index (0 indexed), for the most appropriate option in the question: ", "\n\n### Answer:\n"] + multi_column_suffix: ["", "", ""] + multi_column_train_mask: [true, true, true] + multi_column_separator: "" + + # --- + # Instruct datasets + # --- + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "Open-Orca/OpenOrca" + # name: "OpenOrca" + + # multi_column_keys: ["system_prompt", "question", "response"] 
+ # multi_column_prefix: ["Instruction:\n", "", ""] + # multi_column_suffix: ["\n\n", "\n\n", "\n\n"] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "" + + # - # Instruct, input, output format + # # https://huggingface.co/datasets/teknium/openhermes + # source: "teknium/openhermes" + # name: "openhermes-1-instruct" + + # multi_column_keys: ["instruction", "input", "output"] + # multi_column_prefix: ["Instruction:\n", "Input:\n", "Output:\n"] + # multi_column_suffix: ["", "", ""] + # multi_column_train_mask: [true, true, true] + # multi_column_separator: "\n\n" + + # --- + # Chat datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Capybara" + # name: "Capybara-chat" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\n>>> User: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\n>>> User: ", 'output': "\n\n>>> Assistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "LDJnr/Pure-Dove" + # name: "Pure-Dove" + + # # Conversation merging process= + # # --- + # conversation_format: 'iopairs' + # conversation_key: 'conversation' + # conversation_end_of_conversation: "\n\nUser: " + + # # Iopairs specific config + # # --- + # conversation_input_key_prefix_map: {'input': "\n\nUser: ", 'output': "\n\nAssistant: "} + # conversation_input_key_mask: {'input': false, 'output': True} + # conversation_sender_suffix: {'input': "", 'output': ""} + + # --- + # Other datasets + # --- + + # - # Conversation format + # # https://huggingface.co/datasets/teknium/OpenHermes-2.5 + # source: "teknium/OpenHermes-2.5" + # name: "openhermes-2-convo" + + # # 
Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # conversation_formatting supports "iopairs" or "sender" for now. + # # --- + # conversation_format: 'sender' + # conversation_key: 'conversations' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'value': "\n\n{sender}: "} + # conversation_sender_key: 'from' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + # - # Instruct, input, output format + # # With the instruction format changed, to fix the formatting + # # https://huggingface.co/datasets/Darok/Lamini-instructions-to-french + # source: "Darok/Lamini-instructions-to-french" + # name: "Lamini-instructions-to-french" + + # multi_column_keys: ["Input", "Response"] + # multi_column_prefix: ["### Instruction:\nPlease translate the next sentence into French\n\n### Input:\n", "### Output:\n"] + # multi_column_suffix: ["", ""] + # multi_column_train_mask: [true, true] + # multi_column_separator: "\n\n" + + # - # Long range instruction format + # # https://huggingface.co/datasets/THUDM/LongAlign-10k/ + # source: "THUDM/LongAlign-10k" + # name: "LongAlign-10k" + + # # Conversation merging process + # # useful for merging full conversational datasets, into single documents + # # default is off, (or set conversation_key to []) + # # 
conversation_formatting supports "iopairs" or "sender" for now. + # # --- + # conversation_format: 'sender' + # conversation_key: 'messages' + # conversation_end_of_conversation: "\n\nUser: " + + # # Sender specific config + # # This means that every object in the conversation object is a single message (with sender and message keys - or similar) + # # The output is dictated by the input key map, the rest of the "sender_" config is keyed by the value of the sender key + # # --- + # conversation_input_key_map: {'content': "\n\n{sender}: "} + # conversation_sender_key: 'role' + # conversation_sender_value_map: {'user': 'User', 'human': 'User', 'assistant': 'Assistant', 'gpt': 'Assistant', 'system': 'System'} + # conversation_sender_mask: {'user': false, 'human': false, 'assistant': True, 'gpt': True, 'system': false} + # conversation_sender_suffix: {'user': "", 'human': "", 'assistant': "", 'gpt': "", 'system': ""} + + ###################################################### + # Note: You can probably throw in enwiki if you want + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "teven/enwiki_100k" + + # # Optional, provide a name for the dataset + # name: "enwiki_100k" + + # # Minimum / Maximum token size of the dataset to use + # min_token_size: 1024 + # max_token_size: -1 + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # - # SuperWiki (Multi-lingual) + # # https://huggingface.co/datasets/RyokoExtra/SuperWIKI-Cleaned + # source: "RyokoExtra/SuperWIKI-Cleaned" + + # # Optional, provide a name for the dataset + # name: "super_wiki" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + + # source_dataset_split: lang25 + + # # Custom text column to 
use, useful for dataset with alternative training columns labels + # # This is checked before multi column merging, default is null (disabled) + # # If set this takes priority + # # eg: 'code' + # # --- + # custom_text_key: 'text' + + # # All other settings found in default can be overriden here + # # --- + # # ... + + ###################################################### + # Note: We found the ML generated textbooks + # too low in perplexity that it hurts the model + # so we are using the original enwiki_100k & superwiki + ###################################################### + # - # Text book is all you need + # # https://huggingface.co/datasets/TanvirOnHF/muse_textbooks + # source: "TanvirOnHF/muse_textbooks" + + # # Optional, provide a name for the dataset + # name: "muse_textbooks" + + # # Various over write settings + # # --- + # text_rechunk_size: 32768 + # text_rechunk_force: True + # packing_enable: False + # max_token_size: -1 + ###################################################### diff --git a/notebook/major-runs/Eagle-2T-retune/retune-extd-runs.ipynb b/notebook/major-runs/Eagle-2T-retune/retune-extd-runs.ipynb new file mode 100644 index 00000000..65e01e25 --- /dev/null +++ b/notebook/major-runs/Eagle-2T-retune/retune-extd-runs.ipynb @@ -0,0 +1,1110 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Perform Retune runs" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ENABLE_WANDB: True\n", + "GPU_DEVICES: auto\n", + "NOTEBOOK_DIR: /workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune\n", + "TRAINER_DIR: /workspace/picocreator/RWKV-infctx-trainer/RWKV-v5\n", + "PROJECT_DIR: /workspace/picocreator/RWKV-infctx-trainer\n" + ] + } + ], + "source": [ + "GPU_DEVICES=\"auto\"\n", + "ENABLE_WANDB=True\n", + "WANDB_PREFIX=\"Eagle-Retune\"\n", + "DEEPSPEED_STRAT=\"deepspeed_stage_2\"\n", + 
"LEARNING_RATE=\"5e-6\"\n", + "\n", + "print(\"ENABLE_WANDB:\", ENABLE_WANDB)\n", + "print(\"GPU_DEVICES:\", GPU_DEVICES)\n", + "\n", + "if ENABLE_WANDB:\n", + " WANDB_MODE=\"online\"\n", + "else:\n", + " WANDB_MODE=\"disabled\"\n", + "\n", + "# The model to start from\n", + "MICROBATCH_SIZE=8\n", + "\n", + "# Computing the notebook, and various paths\n", + "import os\n", + "NOTEBOOK_DIR=os.path.dirname(os.path.abspath(\"__file__\"))\n", + "PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, \"../../../\"))\n", + "TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, \"./RWKV-v5/\"))\n", + "\n", + "print(\"NOTEBOOK_DIR:\", NOTEBOOK_DIR)\n", + "print(\"TRAINER_DIR:\", TRAINER_DIR)\n", + "print(\"PROJECT_DIR:\", PROJECT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-08 02:07:35,449] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:518: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. 
To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml', '--model.load_model=/workspace/main-models/R4-retune/R4-7B-15t-No-Mask.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-extd-e2/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-extd-e2 (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'], args=['fit', '-c', '/workspace/picocreator/RWKV-infctx-trainer/notebook/major-runs/Eagle-2T-retune/retune-train-no-mask.yaml', '--model.load_model=/workspace/main-models/R4-retune/R4-7B-15t-No-Mask.pth', '--model.lr_init=5e-6', '--model.lr_final=5e-6', '--data.skip_datapath_setup=True', '--trainer.callbacks.init_args.dirpath=/checkpoint/retune/7B-15t-extd-e2/', '--trainer.logger.init_args.name=Eagle-Retune - 7B-15t-extd-e2 (deepspeed_stage_2)', '--trainer.strategy=deepspeed_stage_2', '--trainer.target_batch_size=1024', '--trainer.microbatch_size=8', '--model.ctx_len=4096', '--trainer.devices=auto'].\n", + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:40: No seed found, seed set to 2276849947\n", + "Seed set to 2276849947\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "\n", + "\n", + "[RWKV.Trainer] Applying 'target_batch_size' with the following:\n", + " - target_batch_size: 1024\n", + " - num_nodes: 1\n", + " - num_devices: 8\n", + " - microbatch_size: 8\n", + " - accumulate_grad_batches: 16\n", + " - effective_batch_size: 1024\n", + "\n", + "[rank: 0] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8\n", + "[2024-03-08 02:08:21,435] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,522] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,578] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,620] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,699] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,711] [INFO] 
[real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-08 02:08:21,714] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. 
Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[RWKV.model][WARNING] - torch.compile is enabled, but this has been observed to perform worse, or even crash in some setup. Ensure to test if you actually measure speedups over JIT before using for large training runs'\n", + "[RWKV.model] Running RWKV infctx using 'torch-compile' with torch '2.1.1+cu121'\n", + "[rank: 2] Seed set to 2276849947\n", + "[rank: 6] Seed set to 2276849947\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 7] Seed set to 2276849947\n", + "[rank: 3] Seed set to 2276849947\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. 
It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 5] Seed set to 2276849947\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "[rank: 1] Seed set to 2276849947\n", + "[rank: 4] Seed set to 2276849947\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "---\n", + "[RWKV.TimeMix] Compiling CUDA kernel with HEAD_SIZE=64\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/wkv5/build.ninja...\n", + "Building extension module wkv5...\n", + "Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "Loading extension module wkv5...\n", + "[RWKV.TimeMix] CUDA kernel compiled & loaded globally\n", + "---\n", + "[rank: 6] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 7] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 3] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 1] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 4] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 5] Seed set to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 2] Seed set 
to 2276849947\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240308_020938-5owxkov9\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-extd-e2 (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/5owxkov9\u001b[0m\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: 
[0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.048902273178100586 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. 
It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10107994079589844 seconds\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.1011803150177002 seconds\n", + "Time to load fused_adam op: 0.10128021240234375 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10105180740356445 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10143899917602539 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.10274028778076172 seconds\n", + "Time to load fused_adam op: 0.10399985313415527 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▎ | 400/2922 [27:59<2:56:30, 0.24it/s, v_num=kov9]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|▉| 2921/2922 [3:37:56<00:04, 0.22it/s, v_num=kov9, train/tok=6.29[rank6]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] to diagnose 
recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-08 05:48:16,297] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-08 05:48:16,298] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 05:48:16,299] 
torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to 
diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 05:48:16,299] 
torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-08 05:48:16,299] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-08 05:48:16,301] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-08 05:48:16,302] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 2922/2922 [3:38:24<00:00, 0.22it/s, v_num=kov9, train/tok=6.29\n", + "Validation: | | 0/? [00:00> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 3] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 6] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 1] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 2] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 7] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "[rank: 5] Seed set to 3817360836\n", + "initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8\n", + ">> Loading 
dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Loading dataset from data_path: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + ">> Dataset load finished: /datapath/eval-retune/pack-no-mask/\n", + "Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: wandb version 0.16.4 is available! To upgrade, please run:\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: $ pip install wandb --upgrade\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.16.3\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20240308_060303-h6vapdvr\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mEagle-Retune - 7B-15t-extd-e3 (deepspeed_stage_2)\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4\u001b[0m\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-V5-Eagle-2T-R4/runs/h6vapdvr\u001b[0m\n", + "LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]\n", + "#\n", + "# RWKV 
lighting_trainer.py important notes \n", + "# https://github.com/RWKV/RWKV-infctx-trainer \n", + "#\n", + "# - Ensure your host is not running cuda 12.0 (use either 11.8, or >=12.1), as this is known to have freeze issues\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\n", + "#\n", + "\n", + "[RWKV.model] Configuring optimizer with\n", + " - lr_init: 5.000e-06 (5e-06)\n", + " - lr_final: 5.000e-06 (5e-06)\n", + "\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", + "Detected CUDA files, patching ldflags\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", + "Building extension module fused_adam...\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "ninja: no work to do.\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.04831433296203613 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10107970237731934 seconds\n", + "Time to load fused_adam op: 0.10109663009643555 seconds\n", + "Time to load fused_adam op: 0.1010141372680664 seconds\n", + "Time to load fused_adam op: 0.10109901428222656 seconds\n", + "Time to load fused_adam op: 0.10119318962097168 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Loading extension module fused_adam...\n", + "Time to load fused_adam op: 0.10131072998046875 seconds\n", + "Loading extension module fused_adam...\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "Time to load fused_adam op: 0.1020510196685791 seconds\n", + "/usr/local/lib/python3.10/dist-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at ../torch/csrc/tensor/python_tensor.cpp:83.)\n", + " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", + "\n", + " | Name | Type | Params\n", + "--------------------------------------\n", + "0 | emb | Embedding | 268 M \n", + "1 | blocks | ModuleList | 7.0 B \n", + "2 | ln_out | LayerNorm | 8.2 K \n", + "3 | head | Linear | 268 M \n", + "--------------------------------------\n", + "7.5 B Trainable params\n", + "0 Non-trainable params\n", + "7.5 B Total params\n", + "30,072.177Total estimated model params size (MB)\n", + "Epoch 0: 14%|██▎ | 400/2922 [28:00<2:56:32, 0.24it/s, v_num=pdvr]/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. 
Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n", + " warnings.warn(\n", + "Epoch 0: 100%|▉| 2921/2922 [3:40:19<00:04, 0.22it/s, v_num=pdvr, train/tok=6.29[rank3]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank3]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank1]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose 
recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank4]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank5]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank7]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank0]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank6]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/model.py:103)\n", + "[rank2]:[2024-03-08 09:44:13,106] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 09:44:13,107] 
torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank4]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank3]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to 
diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank4]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank4]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank4]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank1]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank5]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 09:44:13,107] 
torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank0]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank6]:[2024-03-08 09:44:13,107] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: 'forward' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:276)\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see 
https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank0]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank0]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank0]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank6]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank6]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank6]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' (/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank7]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] torch._dynamo hit config.cache_size_limit (64)\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] function: '_forward_cuda' 
(/workspace/picocreator/RWKV-infctx-trainer/RWKV-v5/src/module/TimeMix.py:284)\n", + "[rank2]:[2024-03-08 09:44:13,108] torch._dynamo.convert_frame: [WARNING] to diagnose recompilation issues, set env variable TORCHDYNAMO_REPORT_GUARD_FAILURES=1 and also see https://pytorch.org/docs/master/compile/troubleshooting.html.\n", + "Epoch 0: 100%|█| 2922/2922 [3:40:46<00:00, 0.22it/s, v_num=pdvr, train/tok=6.29\n", + "Validation: | | 0/? [00:00` script, + # which will create a `rwkv_model.pth` in the checkpoint directory. + # + # Do not use the `zero_to_fp32.py` script as that will have export format issues + dirpath: /checkpoint/retune/Eagle-R4-no-mask/ + filename: null + + # Save the top/last K checkpoints + save_top_k: 3 + # Choose the most recent checkpoints by steps + monitor: 'step' + mode: max + + # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt' + # useful to simply checkpoint resume scripts, at a price of disk performance + save_last: true + + # DO NOT set this as true, as the model weight exported will have format issues + # expert as checkpoint, and use the `export_checkpoint.py` script to convert to model instead + save_weights_only: false + + # How frequent you want to save a checkpoint for every step. 
+ # This will happen for every X data sample, where X = every_n_train_steps * accumulate_grad_batches + # + # In general you will want to avoid putting a low number (expecially if accumulate_grad_batches <= 100) + # as the checkpoint process, will pause all the gpu training for some time, slowing down the overall process + # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes + every_n_train_steps: 25 + every_n_epochs: null + save_on_train_epoch_end: true + train_time_interval: null + + # Other pytorch lightning settings, which in most cases you can remove/ignore + # --- + # verbose: false + # auto_insert_metric_name: true + +model: + # The model to load + load_model: /workspace/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth + + # Starting and ending learning rate + lr_init: 5e-6 + lr_final: 5e-6 + + # Training context length, note that the dataset can be + # larger then the context size, in which the trainer + # will process the dataset in chunks + ctx_len: 4096 + + # BPTT learning, this allows you to run the trainer against dataset + # larger then its training context length + bptt_learning: true + bptt_learning_range: 1 + +######################################## +## Training model settings +######################################## +data: + # Skip the datapath setup + # + # ignored if using the preload_datapath.py, useful for speeding up the trainer startup + # provided you have your datasets all properly preinitialized + # --- + skip_datapath_setup: True + + # dataset_path for the prebuilt dataset, using HF `load_from_disk()` + # + # Use this if you have built your own dataset and saved it with `save_to_disk()` + # with source left as null. Other wise configure this to a directory which the + # dataset will be built and tokenized by the huggingface dataset process. 
+ data_path: /datapath/eval-retune/pack-5-no-mask/ + +# Path to the current checkpoint to continue training from +# this should be the directory path, and ends with `.ckpt/` +# ckpt_path: /checkpoint/Eagle-2T-p1/last.ckpt