From 5557cd3db684e890198d00204048103ac4c9a0e8 Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Mon, 18 Mar 2024 00:07:23 +0100
Subject: [PATCH 1/9] feat: added example instruction multi-turn dataset

---
 data/lorem_ipsum_instruct_multi_turn.idx   | Bin 0 -> 48 bytes
 data/lorem_ipsum_instruct_multi_turn.jsonl |   5 +++++
 2 files changed, 5 insertions(+)
 create mode 100644 data/lorem_ipsum_instruct_multi_turn.idx
 create mode 100644 data/lorem_ipsum_instruct_multi_turn.jsonl

diff --git a/data/lorem_ipsum_instruct_multi_turn.idx b/data/lorem_ipsum_instruct_multi_turn.idx
new file mode 100644
index 0000000000000000000000000000000000000000..9bcc11df1b167097a6983cdb8f9923bfd4f681f3
GIT binary patch
literal 48
wcmZo*nX1YF0kKmwycxXx+opI2c!vSW<=#8nruZf@dfNl3g^b>gZBtV906|L+H~;_u

literal 0
HcmV?d00001

diff --git a/data/lorem_ipsum_instruct_multi_turn.jsonl b/data/lorem_ipsum_instruct_multi_turn.jsonl
new file mode 100644
index 000000000..3ee98d518
--- /dev/null
+++ b/data/lorem_ipsum_instruct_multi_turn.jsonl
@@ -0,0 +1,5 @@
+{"conversations": ["0 Who is the president of the United States", "Joe Biden"]}
+{"conversations": ["1 Who is the chancellor of Germany", "Olaf Scholz", "Thank you."]}
+{"conversations": ["2 What is the most effective weapon in CS? ", "Are you referring to Counter Strike 2?", "Yes.", "The most effective weapon from a damage point of view is the AWP"]}
+{"conversations": ["3 What is the capital of France", "Paris"]}
+{"conversations": ["4 What is the capital of Germany", "Berlin"]}
\ No newline at end of file

From c42ce122d1888a4dfccd682ecc807ea6a60ab58e Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Mon, 18 Mar 2024 00:08:24 +0100
Subject: [PATCH 2/9] refactor: towards instruction dataset support in MemMap
 datasets

---
 src/modalities/config/config.py              |  3 +-
 src/modalities/dataloader/dataset.py         | 84 +++++++++++++++++---
 src/modalities/dataloader/dataset_factory.py |  9 ++-
 3 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
index 353cf14e0..d520b6e8e 100644
--- a/src/modalities/config/config.py
+++ b/src/modalities/config/config.py
@@ -191,7 +191,8 @@ class MemMapDatasetConfig(BaseModel):
     index_path: Optional[FilePath] = None
     block_size: Annotated[int, Field(strict=True, gt=0)]
     tokenizer: PydanticTokenizerIFType
-    jq_pattern: str
+    tokenization_jq_patterns: Dict[str, str]
+    pass_through_jq_patterns: Optional[Dict[str, str]] = None
     sample_key: str

diff --git a/src/modalities/dataloader/dataset.py b/src/modalities/dataloader/dataset.py
index ef0ae2ad1..3bdc7b92b 100644
--- a/src/modalities/dataloader/dataset.py
+++ b/src/modalities/dataloader/dataset.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import jq
 import numpy as np
@@ -30,9 +30,10 @@ def __init__(
         raw_data_path: Path,
         block_size: int,
         tokenizer: PreTrainedTokenizer,
-        sample_key: str,
+        sample_key: str,  # TODO Max: is sample key really necessary?
+        tokenization_jq_patterns: Dict[str, str],
+        pass_through_jq_patterns: Dict[str, str] = None,
         index_path: Optional[Path] = None,
-        jq_pattern: str = ".text",
     ):
         """
         Pytorch Dataset with mmap support.

         :param raw_data_path: Path to a jsonl file, which holds text data
         :param block_size: alias for max sequence length. The amount of tokens the model can handle.
         :param tokenizer: PretrainedTokenizer required to tokenize text data on the fly.
         :param jq_pattern: jq-pattern applied on every jsonl-entry. Results are afterwards tokenized and packed
         :param index_path: Path to an index file, which indicates the start character/byte position
             and length of samples given in `raw_data_path`.
             If not defined, an index next to `raw_data_path` is picked,
             by replacing its suffix with ".idx".
         :param sample_key: model-specific parameter to indicate where in the BatchEncoding the input_token_ids are.
             TODO: If this setting should support multi-modal features using separately encoded inputs,
             this needs to get replaced with a list of sample keys!
""" - super().__init__(raw_data_path=raw_data_path, block_size=block_size, sample_key=sample_key) + super().__init__(raw_data_path=raw_data_path, + block_size=block_size, sample_key=sample_key) + + self.tokenization_jq_filter = {key: jq.compile(pattern) for key, pattern in tokenization_jq_patterns.items()} + self.pass_through_jq_filter = {key: jq.compile(pattern) for key, pattern in pass_through_jq_patterns.items()} if pass_through_jq_patterns else {} self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path) - self.jq_filter = jq.compile(jq_pattern) self.tokenizer = tokenizer def __len__(self) -> int: return len(self.reader) - def __getitem__(self, idx: int) -> BatchEncoding: + def __getitem__(self, idx: int) -> Dict[str, Any]: self._check_if_inbounds(idx) - return self.tokenizer( - self.jq_filter.input_text(self.reader[idx]).first(), - max_length=self.block_size, - padding="max_length", - truncation=True, - ) + + item = {} + # applying jq filter for which we want to tokenize the text + for key, jq_filter in self.tokenization_jq_filter.items(): + text = jq_filter.input_text(self.reader[idx]).first() + + tokens = self.tokenizer( + jq_filter.input_text(self.reader[idx]).first(), + max_length=self.block_size, + padding="max_length", + truncation=True, + ) + item[key] = tokens + + # applying jq filter for which we want to pass through the raw data without tokenization + for key, jq_filter in self.pass_through_jq_filter.items(): + item[key] = jq_filter.input_text(self.reader[idx]).first() + return item class PackedMemMapDatasetBase(Dataset): @@ -157,3 +173,47 @@ def _generate_packing_index(self) -> List[Tuple[int, int]]: curr_offset = segment_offset curr_len = segment_len return index + + +class DictMemMapDataset(Dataset): + def __init__( + self, + raw_data_path: Path, + block_size: int, + tokenizer: PreTrainedTokenizer, + sample_key: str, + index_path: Optional[Path] = None, + jq_pattern: str = ".text", + ): + """ + Pytorch Dataset with mmap support. + + :param raw_data_path: Path to a jsonl file, which holds text data + :param block_size: alias for max sequence length. The amount of tokens the model can handle. + :param tokenizer: PretrainedTokenizer required to tokenize text data on the fly. + :param jq_pattern: jq-pattern applied on every jsonl-entry. Results are afterwards tokenized and packed + :param index_path: Path to an index file, which indicates the start character/byte position + and length of samples given in `raw_data_path`. + If not defined, an index next to `raw_data_path` is picked, + by replacing its suffix with ".idx". + :param sample_key: model-specific parameter to indicate where in the BatchEncoding the input_token_ids are. + TODO: If this setting should support multi-modal features using separately encoded inputs, + this needs to get replaced with a list of sample keys! 
+ """ + super().__init__(raw_data_path=raw_data_path, block_size=block_size, sample_key=sample_key) + + self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path) + self.jq_filter = jq.compile(jq_pattern) + self.tokenizer = tokenizer + + def __len__(self) -> int: + return len(self.reader) + + def __getitem__(self, idx: int) -> BatchEncoding: + self._check_if_inbounds(idx) + return self.tokenizer( + self.jq_filter.input_text(self.reader[idx]).first(), + max_length=self.block_size, + padding="max_length", + truncation=True, + ) \ No newline at end of file diff --git a/src/modalities/dataloader/dataset_factory.py b/src/modalities/dataloader/dataset_factory.py index 157e98d07..24d7b5fdd 100644 --- a/src/modalities/dataloader/dataset_factory.py +++ b/src/modalities/dataloader/dataset_factory.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Optional +from typing import Dict, Optional from pydantic import FilePath from torch.utils.data.dataset import Dataset @@ -32,9 +32,11 @@ def get_mem_map_dataset( block_size: int, tokenizer: PreTrainedTokenizer, sample_key: str, + tokenization_jq_patterns: Dict[str, str], index_path: Optional[Path] = None, - jq_pattern: str = ".text", + pass_through_jq_patterns: Optional[Dict[str, str]] = None ) -> MemMapDataset: + # TODO this was part of the old Dataloader implementation. # we need to check if this is actually wanted generally. tokenizer.pad_token = tokenizer.eos_token @@ -45,7 +47,8 @@ def get_mem_map_dataset( tokenizer=tokenizer, sample_key=sample_key, index_path=index_path, - jq_pattern=jq_pattern, + tokenization_jq_patterns=tokenization_jq_patterns, + pass_through_jq_patterns=pass_through_jq_patterns ) return dataset From 0447328d51596fd89dc8df0347eb40b13b305697 Mon Sep 17 00:00:00 2001 From: Max Luebbering Date: Thu, 21 Mar 2024 15:05:31 +0100 Subject: [PATCH 3/9] docs: started documentation on instruction tuning --- MMAP_DATASET_README.md | 68 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/MMAP_DATASET_README.md b/MMAP_DATASET_README.md index 4cfe38e19..ed753eae3 100644 --- a/MMAP_DATASET_README.md +++ b/MMAP_DATASET_README.md @@ -112,3 +112,71 @@ def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) return np.concatenate((doc_idx_first, doc_idx_last)) ``` + + +# Fine-tuning Datasets + +## Instruction Tuning +Datasets, such as Bactrian or LIMA, come in different formats. Before instruction-tuning a model with one of these datasets the user has to +transform the dataset into the following format JSONL, inspired by Fast Chat. The listing below showcases an exemplary sample from the JSONL file. +The `id` represents the incremental sample id. `Conversations` contains the multi-turn messages between different parties. Here, we depicted messages +between a human and a gpt model. Finally, the format allows for the specification of further, arbitrary key-value pairs such as instructions and roles. + +```JSON +{ + "id": 0, + "conversations": [ + { + "from": "human", + "value": "What is up?" + }, + { + "from": "gpt", + "value": "Hello! How can I help you today?" + }, + { + "from": "human", + "value": "Who are you?" + }, + { + "from": "gpt", + "value": "You can call me Vicuna, and I was trained by Large Model Systems Organization (LMSYS) researchers as a language model." + }, + { + "from": "human", + "value": "Goodbye" + }, + { + "from": "gpt", + "value": "Goodbye! 
+
+During the instantiation of the MemMap file, we specify the JQ patterns that determine which fields in the JSON are supposed to be tokenized and additionally pass a list of special tokens, e.g., `<bos>`, `<eos>`, `<pad>`, etc., to the constructor.
+Each one of the special tokens is mapped to a single, individual token id once during the instantiation of the MemMap file.
+
+When the dataloader iterates over the MemMap file, the `__getitem__()` method tokenizes the sample as specified in the JQ patterns list and enriches the resulting dictionary with the token ids of the special tokens.
+
+The dataloader packs multiple samples into a `DatasetBatch` and calls the `Collator` to bring the samples into the correct format for training.
+
+The collator is instantiated with information on how to assemble the entire prompt from the `conversations` and the optional key-value pairs.
+In practice, the YAML configuration has the following structure:
+
+```YAML
+special_tokens:
+  bos_token: <bos>
+  eos_token: <eos>
+
+loss_masking_jq_patterns:
+  - .conversations[] | select(.from == "human")
+  - .instruction
+  - .role
+
+message_construction: [role, instruction, conversations]
+```
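+
+The loss-masking patterns can be tried out directly with the same jq bindings the dataset implementation uses. The sketch below is illustrative only: the trailing `| .value` is added here to extract the message text, and the helper name is an assumption:
+
+```python
+from typing import Dict, List
+
+import jq
+
+# mirrors loss_masking_jq_patterns from the YAML configuration above
+LOSS_MASKING_JQ_PATTERNS = [
+    '.conversations[] | select(.from == "human") | .value',
+    ".instruction",
+    ".role",
+]
+
+
+def loss_masked_texts(sample: Dict) -> List[str]:
+    """Collect the raw strings whose token ids should not contribute to the loss."""
+    texts = []
+    for pattern in LOSS_MASKING_JQ_PATTERNS:
+        texts.extend(jq.compile(pattern).input(sample).all())
+    return texts
+```
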
From 2c46986a4b75f392e86c4437ba538a909b8f18b1 Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Thu, 21 Mar 2024 15:06:21 +0100
Subject: [PATCH 4/9] refactor: drafted instruction tuning setup

---
 config_files/config_lorem_ipsum.yaml | 19 +++++++++++++++----
 src/modalities/dataloader/dataset.py |  7 ++++++-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/config_files/config_lorem_ipsum.yaml b/config_files/config_lorem_ipsum.yaml
index c9f012913..48bd3fa85 100644
--- a/config_files/config_lorem_ipsum.yaml
+++ b/config_files/config_lorem_ipsum.yaml
@@ -1,7 +1,7 @@
 settings:
   experiment_id: ${modalities_env:experiment_id}
   referencing_keys:
-    sample_key: input_ids
+    sample_key: tokenized_input
     target_key: target_ids
   training:
     callback_interval_in_samples: 6
@@ -17,6 +17,7 @@ settings:
     world_size: ${cuda_env:WORLD_SIZE}
   paths:
     checkpointing_path: data/checkpoints
+
 tokenizer:
   component_key: tokenizer
   variant_key: gpt2_tokenizer_fast
@@ -34,10 +35,20 @@ train_dataset:
   component_key: dataset
   variant_key: mem_map_dataset
   config:
-    raw_data_path: data/lorem_ipsum.jsonl
-    index_path: data/lorem_ipsum.idx
+    raw_data_path: data/lorem_ipsum_instruct_multi_turn.jsonl
+    index_path: data/lorem_ipsum_instruct_multi_turn.idx
     block_size: ${settings.training.sequence_length}
-    jq_pattern: ".text"
+    tokenization_jq_patterns:
+      ${settings.referencing_keys.sample_key}: .conversations
+    pass_through_jq_patterns:
+      raw_text: .conversations
+
+    # tokenization_jq_patterns:
+    #   - new_key: input_ids # ${settings.referencing_keys.sample_key}
+    #     jq_pattern: .text
+    # pass_through_jq_patterns:
+    #   - new_key: raw_text
+    #     jq_pattern: .text
     sample_key: ${settings.referencing_keys.sample_key}
     tokenizer:
       instance_key: tokenizer
diff --git a/src/modalities/dataloader/dataset.py b/src/modalities/dataloader/dataset.py
index 3bdc7b92b..1b7f29513 100644
--- a/src/modalities/dataloader/dataset.py
+++ b/src/modalities/dataloader/dataset.py
@@ -34,6 +34,8 @@ def __init__(
         tokenization_jq_patterns: Dict[str, str],
         pass_through_jq_patterns: Dict[str, str] = None,
         index_path: Optional[Path] = None,
+        special_tokens_map: Optional[Dict[str, str]] = None,
+        # {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>", "unk_token": "<unk>", "mask_token": "<mask>"}
     ):
@@ -58,6 +60,8 @@ def __init__(
         self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path)
         self.tokenizer = tokenizer
+        if special_tokens_map:
+            self.special_tokens_map = {k: self.tokenizer.tokenizer(v) for k, v in special_tokens_map.items()}

     def __len__(self) -> int:
         return len(self.reader)

     def __getitem__(self, idx: int) -> Dict[str, Any]:
         self._check_if_inbounds(idx)
@@ -71,7 +75,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
         text = jq_filter.input_text(self.reader[idx]).first()

         tokens = self.tokenizer(
-            jq_filter.input_text(self.reader[idx]).first(),
+            text,
             max_length=self.block_size,
             padding="max_length",
             truncation=True,
@@ -81,6 +85,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
         # applying jq filter for which we want to pass through the raw data without tokenization
         for key, jq_filter in self.pass_through_jq_filter.items():
             item[key] = jq_filter.input_text(self.reader[idx]).first()
+        item = {**item, **self.special_tokens_map}
         return item

From f2a164d821dbfba4d75a46e337c319ee7de04b8f Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Tue, 2 Apr 2024 00:57:38 +0200
Subject: [PATCH 5/9] feat: towards generic tokenization specifications

---
 src/modalities/dataloader/dataset.py | 151 +++++++++++++++++++++------
 1 file changed, 118 insertions(+), 33 deletions(-)

diff --git a/src/modalities/dataloader/dataset.py b/src/modalities/dataloader/dataset.py
index 1b7f29513..2335803f6 100644
--- a/src/modalities/dataloader/dataset.py
+++ b/src/modalities/dataloader/dataset.py
@@ -1,10 +1,13 @@
 from __future__ import annotations

+import json
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

 import jq
 import numpy as np
+from pydantic import BaseModel, validator
 from torch.utils.data.dataset import Dataset as TorchdataSet
 from tqdm import tqdm
 from transformers import BatchEncoding, PreTrainedTokenizer
@@ -14,10 +17,9 @@


 class Dataset(TorchdataSet):
-    def __init__(self, raw_data_path: Path, block_size: int, sample_key: str):
+    def __init__(self, raw_data_path: Path, block_size: int):
         self.raw_data_path = raw_data_path
         self.block_size = block_size
-        self.sample_key = sample_key

     def _check_if_inbounds(self, idx: int):
         if not 0 <= idx < len(self):
@@ -30,24 +32,25 @@ class MemMapDataset(Dataset):
     def __init__(
         self,
         raw_data_path: Path,
         block_size: int,
         tokenizer: PreTrainedTokenizer,
         sample_key: str,  # TODO Max: is sample key really necessary?
         tokenization_jq_patterns: Dict[str, str],
         pass_through_jq_patterns: Dict[str, str] = None,
         index_path: Optional[Path] = None,
         special_tokens_map: Optional[Dict[str, str]] = None,
         # {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>", "unk_token": "<unk>", "mask_token": "<mask>"}
     ):
         """
         Pytorch Dataset with mmap support.

         :param raw_data_path: Path to a jsonl file, which holds text data
         :param block_size: alias for max sequence length. The amount of tokens the model can handle.
         :param tokenizer: PretrainedTokenizer required to tokenize text data on the fly.
         :param jq_pattern: jq-pattern applied on every jsonl-entry. Results are afterwards tokenized and packed
         :param index_path: Path to an index file, which indicates the start character/byte position
             and length of samples given in `raw_data_path`.
             If not defined, an index next to `raw_data_path` is picked,
             by replacing its suffix with ".idx".
         :param sample_key: model-specific parameter to indicate where in the BatchEncoding the input_token_ids are.
             TODO: If this setting should support multi-modal features using separately encoded inputs,
             this needs to get replaced with a list of sample keys!
""" - super().__init__(raw_data_path=raw_data_path, - block_size=block_size, sample_key=sample_key) + super().__init__(raw_data_path=raw_data_path, block_size=block_size, sample_key=sample_key) + self.sample_key = sample_key self.tokenization_jq_filter = {key: jq.compile(pattern) for key, pattern in tokenization_jq_patterns.items()} - self.pass_through_jq_filter = {key: jq.compile(pattern) for key, pattern in pass_through_jq_patterns.items()} if pass_through_jq_patterns else {} + self.pass_through_jq_filter = ( + {key: jq.compile(pattern) for key, pattern in pass_through_jq_patterns.items()} + if pass_through_jq_patterns + else {} + ) self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path) self.tokenizer = tokenizer - if special_tokens_map: + if special_tokens_map is not None: self.special_tokens_map = {k: self.tokenizer.tokenizer(v) for k, v in special_tokens_map.items()} def __len__(self) -> int: @@ -180,45 +186,124 @@ def _generate_packing_index(self) -> List[Tuple[int, int]]: return index -class DictMemMapDataset(Dataset): +class TransformOperation(Enum): + TOKENIZE = "tokenize" + PASS_THROUGH = "pass_through" + + +class SampleTransform(BaseModel): + json_indexation_pattern: List[str] + new_key: Optional[str] = None + transform_operation: TransformOperation = TransformOperation.TOKENIZE + + @validator("json_indexation_pattern", pre=True, each_item=False) + def _check_at_least_one_item(cls, v): + if not v: + raise ValueError("json_indexation_pattern must contain at least one item") + return v + + def __init__(self, **data): + super().__init__(**data) + if self.new_key is None and self.json_indexation_pattern: + self.new_key = self.json_indexation_pattern[-1] + + +class SFTMemMapDataset(Dataset): def __init__( self, raw_data_path: Path, block_size: int, tokenizer: PreTrainedTokenizer, - sample_key: str, + sample_transforms: List[SampleTransform], index_path: Optional[Path] = None, - jq_pattern: str = ".text", ): - """ - Pytorch Dataset with mmap support. - - :param raw_data_path: Path to a jsonl file, which holds text data - :param block_size: alias for max sequence length. The amount of tokens the model can handle. - :param tokenizer: PretrainedTokenizer required to tokenize text data on the fly. - :param jq_pattern: jq-pattern applied on every jsonl-entry. Results are afterwards tokenized and packed - :param index_path: Path to an index file, which indicates the start character/byte position - and length of samples given in `raw_data_path`. - If not defined, an index next to `raw_data_path` is picked, - by replacing its suffix with ".idx". - :param sample_key: model-specific parameter to indicate where in the BatchEncoding the input_token_ids are. - TODO: If this setting should support multi-modal features using separately encoded inputs, - this needs to get replaced with a list of sample keys! 
- """ - super().__init__(raw_data_path=raw_data_path, block_size=block_size, sample_key=sample_key) + super().__init__(raw_data_path=raw_data_path, block_size=block_size) self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path) - self.jq_filter = jq.compile(jq_pattern) self.tokenizer = tokenizer + self.indexation_pattern_to_sample_transforms = {} + for sample_transform in sample_transforms: + if sample_transform.json_indexation_pattern not in self.indexation_pattern_to_sample_transforms: + self.indexation_pattern_to_sample_transforms[sample_transform.json_indexation_pattern] = [] + self.indexation_pattern_to_sample_transforms[sample_transform.json_indexation_pattern].append( + sample_transform + ) def __len__(self) -> int: return len(self.reader) def __getitem__(self, idx: int) -> BatchEncoding: self._check_if_inbounds(idx) - return self.tokenizer( - self.jq_filter.input_text(self.reader[idx]).first(), - max_length=self.block_size, - padding="max_length", - truncation=True, - ) \ No newline at end of file + item = json.loads(self.reader[idx]) + # conversations -> * -> value -> tokenize value + self._transform_json_dict( + element=item, + current_path=[], + indexation_pattern_to_sample_transforms=self.indexation_pattern_to_sample_transforms, + ) + return item + + def _transform_json_dict( + self, + element: Dict | List | str, + current_path: List[str], + indexation_pattern_to_sample_transforms: Dict[str, List[SampleTransform]], + ): + def run_transform( + current_path: List[str], + element: str, + indexation_pattern_to_sample_transforms: Dict[str, List[SampleTransform]], + ): + current_pattern_string = ".".join(current_path) + transformed_element = {} + if current_pattern_string in indexation_pattern_to_sample_transforms: + sample_transforms = indexation_pattern_to_sample_transforms[current_pattern_string] + for sample_transform in sample_transforms: + if sample_transform.transform_operation == TransformOperation.TOKENIZE: + tokens = self.tokenizer( + element, + max_length=self.block_size, + padding="max_length", + truncation=True, + ) + transformed_element[sample_transform.new_key] = tokens + elif sample_transform.transform_operation == TransformOperation.PASS_THROUGH: + transformed_element[sample_transform.new_key] = element + return transformed_element + + if isinstance(element, dict): + transformed_elements_list = [] + + for key, sub_element in element.items(): + if not isinstance(element, dict) or not isinstance(element, list): + transformed_sub_element: Dict = run_transform( + current_path=current_path + [key], + element=sub_element, + indexation_pattern_to_sample_transforms=indexation_pattern_to_sample_transforms, + ) + else: + transformed_sub_element = self._transform_json_dict( + sub_element, current_path + [key], indexation_pattern_to_sample_transforms + ) + transformed_elements_list.append(transformed_sub_element) + + transformed_elements_dict = {k: v for d in transformed_elements_list for k, v in d.items()} + return transformed_elements_dict + + elif isinstance(element, list): + transformed_elements_list = [] + for sub_element in element: + # Note that, we don't execute run_transform here, as we only tokenize the values + # of dictionaries and not of lists. + # If this is required, there is still the possibility to add this functionality. 
                transformed_sub_element = self._transform_json_dict(
                    sub_element, current_path + ["*"], indexation_pattern_to_sample_transforms
                )
                transformed_elements_list.append(transformed_sub_element)

            # In this case, we have a nested list and therefore no key to construct a dictionary from
            if current_path[-1] == "*":
                return transformed_elements_list
            # In this case, we don't have a nested list and can construct a dictionary from the list
            else:
                return {current_path[-1]: transformed_elements_list}

From 95b1976f4814bde2123b0c295882cd4a70bdef0d Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Wed, 3 Apr 2024 10:58:18 +0200
Subject: [PATCH 6/9] feat: added lorem_ipsum_sft.jsonl

---
 data/lorem_ipsum_sft.jsonl | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 data/lorem_ipsum_sft.jsonl

diff --git a/data/lorem_ipsum_sft.jsonl b/data/lorem_ipsum_sft.jsonl
new file mode 100644
index 000000000..3c1a0e894
--- /dev/null
+++ b/data/lorem_ipsum_sft.jsonl
@@ -0,0 +1,10 @@
+{"id": 0, "conversations": [{"from": "human_1", "value": "What is up?"}, {"from": "gpt", "value": "Hello! How can I help you today?"}, {"from": "human_1", "value": "Who are you?"}, {"from": "gpt", "value": "You can call me Mody, and I was trained by the modalities team as a language model."}, {"from": "human_2", "value": "Goodbye"}, {"from": "gpt", "value": "Goodbye! If you have any more questions in the future, don't hesitate to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 1, "conversations": [{"from": "human_1", "value": "How does AI work?"}, {"from": "gpt", "value": "AI, or artificial intelligence, operates by analyzing data and patterns to make decisions or predictions. Would you like a more detailed explanation?"}, {"from": "human_1", "value": "Yes, please."}, {"from": "gpt", "value": "Sure, AI systems, such as neural networks, are trained using large amounts of data. They learn to recognize patterns and make predictions. It's like how a child learns from experience."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 2, "conversations": [{"from": "human_1", "value": "Can you write a poem?"}, {"from": "gpt", "value": "Of course! Give me a topic, and I'll write something for you."}, {"from": "human_1", "value": "Write about the sea."}, {"from": "gpt", "value": "The sea, a vast expanse, so deep and wide, Whispers secrets at the tide's low ebb, Its waves, the rhythm of time's endless web, A world of mystery beneath its hide."}, {"from": "human_2", "value": "That's beautiful, thanks!"}, {"from": "gpt", "value": "You're welcome! I'm glad you liked it. If you have any more requests, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 3, "conversations": [{"from": "human_1", "value": "What's the weather like today?"}, {"from": "gpt", "value": "I'm not able to check real-time data, but I can tell you how to find out using a weather app or website. Would that help?"}, {"from": "human_1", "value": "Yes, please."}, {"from": "gpt", "value": "You can use weather websites like Weather.com or apps like AccuWeather to get current weather conditions and forecasts."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 4, "conversations": [{"from": "human_1", "value": "What is the capital of France?"}, {"from": "gpt", "value": "The capital of France is Paris. It's known for its art, culture, and history. Have you ever been there?"}, {"from": "human_1", "value": "No, but I'd love to visit someday."}, {"from": "gpt", "value": "It's a beautiful city with so much to see and do. I hope you get the chance to visit soon!"}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 5, "conversations": [{"from": "human_1", "value": "What is the meaning of life?"}, {"from": "gpt", "value": "The meaning of life is a philosophical question that has been debated for centuries. Some believe it's about finding happiness, others think it's about fulfilling a purpose. What do you think?"}, {"from": "human_1", "value": "I think it's about making the most of the time we have and finding joy in the little things."}, {"from": "gpt", "value": "That's a beautiful perspective. It's important to appreciate the present and find joy in everyday moments."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 6, "conversations": [{"from": "human_1", "value": "What is the best way to learn a new language?"}, {"from": "gpt", "value": "The best way to learn a new language is to practice regularly, immerse yourself in the language, and use a variety of resources like books, apps, and language classes. It's also helpful to practice with native speakers."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Learning a new language can be a rewarding experience. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 7, "conversations": [{"from": "human_1", "value": "What is the best way to stay healthy?"}, {"from": "gpt", "value": "The best way to stay healthy is to eat a balanced diet, exercise regularly, get enough sleep, and manage stress. It's also important to see your doctor for regular check-ups and screenings."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Taking care of your health is important for your overall well-being. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 8, "conversations": [{"from": "human_1", "value": "What is the best way to study for exams?"}, {"from": "gpt", "value": "The best way to study for exams is to create a study schedule, break down the material into manageable chunks, and use active learning techniques like flashcards or practice tests. It's also important to get plenty of rest and stay hydrated."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Studying can be challenging, but with the right strategies, you can succeed. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 9, "conversations": [{"from": "human_1", "value": "What is the best way to save money?"}, {"from": "gpt", "value": "The best way to save money is to create a budget, track your expenses, and look for ways to cut costs. You can also set financial goals and automate your savings to make it easier to save."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Saving money is an important skill that can help you achieve your financial goals. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}

From d68b314a0e2aabb3a208c525d03c2ccfbc775d1a Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Wed, 3 Apr 2024 11:43:45 +0200
Subject: [PATCH 7/9] test: drafted sft dataset test

---
 tests/dataloader/test_sft_dataset.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 tests/dataloader/test_sft_dataset.py

diff --git a/tests/dataloader/test_sft_dataset.py b/tests/dataloader/test_sft_dataset.py
new file mode 100644
index 000000000..4543eb90a
--- /dev/null
+++ b/tests/dataloader/test_sft_dataset.py
@@ -0,0 +1,25 @@
+def test_create_packed_dataset(indexed_dummy_data_path, gpt2_tokenizer):
+    pass
+    # block_size = 5
+    # packed_generator = SFTMemMapDataset(
+    #     src_path=indexed_dummy_data_path.raw_data_path, tokenizer=gpt2_tokenizer, number_of_processes=2
+    # )
+    # default_packed_dataset_path = packed_generator._default_destination_path()
+    # assert not default_packed_dataset_path.is_file()
+    # packed_generator.run()
+    # packed_dataset = PackedMemMapDatasetContinuous(
+    #     default_packed_dataset_path, block_size=block_size, sample_key="input_ids"
+    # )
+
+    # start_of_jsonl_content = "0 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"
+    # tokenized_start_of_jsonl_content = gpt2_tokenizer(start_of_jsonl_content)["input_ids"]
+    # packed_dataset_iterator = iter(packed_dataset)
+    # np.testing.assert_equal(tokenized_start_of_jsonl_content[:block_size], next(packed_dataset_iterator)["input_ids"])
+    # np.testing.assert_equal(
+    #     tokenized_start_of_jsonl_content[block_size : 2 * block_size], next(packed_dataset_iterator)["input_ids"]
+    # )
+    # assert len(packed_dataset._embedded_stream_data.index_base) == 12
+
+    # # check validity of index section in packed dataset
+    # for idx, (offset, entry_length) in enumerate(packed_dataset._embedded_stream_data.index_base[:-1]):
+    #     assert offset + entry_length == packed_dataset._embedded_stream_data.index_base[idx + 1][0]

From 1f26cfca2536248742d5f543d8cf5f4d5d2ebb31 Mon Sep 17 00:00:00 2001
From: Max Luebbering
Date: Wed, 3 Apr 2024 11:45:40 +0200
Subject: [PATCH 8/9] refactor: rolled back MemMapDataset changes

---
 src/modalities/dataloader/dataset.py | 50 ++++++++--------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

diff --git a/src/modalities/dataloader/dataset.py b/src/modalities/dataloader/dataset.py
index 2335803f6..d7e7e13b8 100644
--- a/src/modalities/dataloader/dataset.py
+++ b/src/modalities/dataloader/dataset.py
@@ -3,7 +3,7 @@
 import json
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple

 import jq
 import numpy as np
@@ -32,12 +32,9 @@ def __init__(
         raw_data_path: Path,
         block_size: int,
         tokenizer: PreTrainedTokenizer,
-        sample_key: str,  # TODO Max: is sample key really necessary?
-        tokenization_jq_patterns: Dict[str, str],
-        pass_through_jq_patterns: Dict[str, str] = None,
+        sample_key: str,
         index_path: Optional[Path] = None,
-        special_tokens_map: Optional[Dict[str, str]] = None,
-        # {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>", "unk_token": "<unk>", "mask_token": "<mask>"}
+        jq_pattern: str = ".text",
     ):
@@ -52,35 +49,20 @@ def __init__(
-        self.tokenization_jq_filter = {key: jq.compile(pattern) for key, pattern in tokenization_jq_patterns.items()}
-        self.pass_through_jq_filter = (
-            {key: jq.compile(pattern) for key, pattern in pass_through_jq_patterns.items()}
-            if pass_through_jq_patterns
-            else {}
-        )
-
         self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path)
+        self.jq_filter = jq.compile(jq_pattern)
         self.tokenizer = tokenizer
-        if special_tokens_map is not None:
-            self.special_tokens_map = {k: self.tokenizer.tokenizer(v) for k, v in special_tokens_map.items()}

     def __len__(self) -> int:
         return len(self.reader)

-    def __getitem__(self, idx: int) -> Dict[str, Any]:
+    def __getitem__(self, idx: int) -> BatchEncoding:
         self._check_if_inbounds(idx)
-
-        item = {}
-        # applying jq filter for which we want to tokenize the text
-        for key, jq_filter in self.tokenization_jq_filter.items():
-            text = jq_filter.input_text(self.reader[idx]).first()
-
-            tokens = self.tokenizer(
-                text,
-                max_length=self.block_size,
-                padding="max_length",
-                truncation=True,
-            )
-            item[key] = tokens
-
-        # applying jq filter for which we want to pass through the raw data without tokenization
-        for key, jq_filter in self.pass_through_jq_filter.items():
-            item[key] = jq_filter.input_text(self.reader[idx]).first()
-        item = {**item, **self.special_tokens_map}
-        return item
+        return self.tokenizer(
+            self.jq_filter.input_text(self.reader[idx]).first(),
+            max_length=self.block_size,
+            padding="max_length",
+            truncation=True,
+        )


 class PackedMemMapDatasetBase(Dataset):
@@ -121,9 +96,10 @@ def __init__(self, raw_data_path: Path, block_size: int, sample_key: str):
             TODO: If this setting should support multi-modal features using separately encoded inputs,
             this needs to get replaced with a list of sample keys!
""" - super().__init__(raw_data_path=raw_data_path, block_size=block_size, sample_key=sample_key) + super().__init__(raw_data_path=raw_data_path, block_size=block_size) self._embedded_stream_data = EmbeddedStreamData(raw_data_path) self._token_size_in_bytes = self._embedded_stream_data.token_size_in_bytes + self.sample_key = sample_key try: self._token_dtype = self.np_dtype_from_num_bytes[self._token_size_in_bytes] except KeyError: From 3b18573cec6761f1e20f311d8789e47f87228edd Mon Sep 17 00:00:00 2001 From: Max Luebbering Date: Wed, 3 Apr 2024 11:46:07 +0200 Subject: [PATCH 9/9] feat: added documentation for instruction tuning --- MMAP_DATASET_README.md | 130 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 13 deletions(-) diff --git a/MMAP_DATASET_README.md b/MMAP_DATASET_README.md index ed753eae3..81b39e983 100644 --- a/MMAP_DATASET_README.md +++ b/MMAP_DATASET_README.md @@ -117,17 +117,15 @@ def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): # Fine-tuning Datasets ## Instruction Tuning -Datasets, such as Bactrian or LIMA, come in different formats. Before instruction-tuning a model with one of these datasets the user has to -transform the dataset into the following format JSONL, inspired by Fast Chat. The listing below showcases an exemplary sample from the JSONL file. -The `id` represents the incremental sample id. `Conversations` contains the multi-turn messages between different parties. Here, we depicted messages -between a human and a gpt model. Finally, the format allows for the specification of further, arbitrary key-value pairs such as instructions and roles. +Instruction tuning datasets, such as Bactrian or LIMA, generally come in diverse formats. Therefore, before instruction-tuning a model with one of these datasets the user has to transform the dataset into the following format JSONL, inspired by Fast Chat. The listing below showcases an exemplary sample from the JSONL file. +The `id` represents the incremental sample id. `Conversations` contains the multi-turn messages between different parties. Here, we depicted messages between a human and a gpt model. Finally, the format allows for the specification of further, arbitrary key-value pairs such as instructions and roles. ```JSON { "id": 0, "conversations": [ { - "from": "human", + "from": "human_1", "value": "What is up?" }, { @@ -135,15 +133,15 @@ between a human and a gpt model. Finally, the format allows for the specificatio "value": "Hello! How can I help you today?" }, { - "from": "human", + "from": "human_1", "value": "Who are you?" }, { "from": "gpt", - "value": "You can call me Vicuna, and I was trained by Large Model Systems Organization (LMSYS) researchers as a language model." + "value": "You can call me Mody, and I was trained by the modalities team as a language model." }, { - "from": "human", + "from": "human_2", "value": "Goodbye" }, { @@ -153,17 +151,74 @@ between a human and a gpt model. Finally, the format allows for the specificatio ] # optional / arbitrary key value pairs e.g.: - "instruction": "Role: Vicuna, trained by Large Model Systems Organization (LMSYS) researchers" - "role": "Vicuna, trained by Large Model Systems Organization (LMSYS) researchers" + "instruction": "You are Mody, a helpful LLM trained by the modalities team" + "role": "Mody, a helpful LLM trained by the modalities team" } ``` +All JSONL files for instruction tuning have to follow this format. 
+Given a prepared JSONL file, the training / processing flow can be described as follows:

 During the instantiation of the MemMap file, we specify the JQ patterns that determine which fields in the JSON are supposed to be tokenized and additionally pass a list of special tokens, e.g., `<bos>`, `<eos>`, `<pad>`, etc., to the constructor.
 Each one of the special tokens is mapped to a single, individual token id once during the instantiation of the MemMap file.

-When the dataloader iterates over the MemMap file, the `__getitem__()` method tokenizes the sample as specified in the JQ patterns list and enriches the resulting dictionary with the token ids of the special tokens.
+When the dataloader iterates over the MemMap file, the `__getitem__()` method tokenizes the sample as specified in the JQ patterns list and enriches the resulting dictionary with the token ids of the special tokens that we pre-computed during the MemMap file instantiation.
+In other words, we extract the desired keys from the raw text dictionary, tokenize the content, build a new dictionary with the tokenized data and add the representation of the special tokens to it.
+
+Given the MemMap parameterization
+
+```
+ tokenization_jq_patterns = [".conversations[].value", ".instruction", ".role"]
+ pass_through_jq_patterns = [".id"]
+ special_tokens_map = {"b_instruction_token": "place_holder_token_100", ... }
+```
+
+a sample is transformed into the following structure:
+
+```JSON
+{
+    "id": 0,
+    "conversations": [
+        {
+            "from": "human_1",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        },
+        {
+            "from": "human_1",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        },
+        {
+            "from": "human_2",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized from>",
+            "value": "<tokenized value>"
+        }
+    ],
+
+    # optional / arbitrary key-value pairs, e.g.:
+    "instruction": "<tokenized instruction>",
+    "role": "<tokenized role>",
+    "special_tokens": {"bos_token": "<bos_token_id>", "eos_token": "<eos_token_id>", "pad_token": "<pad_token_id>",
+                       "unk_token": "<unk_token_id>", "mask_token": "<mask_token_id>",
+                       "b_role_token": "<b_role_token_id>", "e_role_token": "<e_role_token_id>",
+                       "b_instruction_token": "<b_instruction_token_id>", "e_instruction_token": "<e_instruction_token_id>"}
+}
+```
+
-The dataloader packs multiple samples into a `DatasetBatch` and calls the `Collator` to bring the samples into the correct format for training.
+The dataloader packs multiple samples into a `DatasetBatch` and calls the `Collator` to bring the batch of samples into the correct format for training.

 The collator is instantiated with information on how to assemble the entire prompt from the `conversations` and the optional key-value pairs.
 In practice, the YAML configuration has the following structure:

 ```YAML
 special_tokens:
   bos_token: <bos>
   eos_token: <eos>
+  b_role_token: <b_role>
+  e_role_token: <e_role>
+  b_instruction_token: <b_instruction>
+  e_instruction_token: <e_instruction>

 loss_masking_jq_patterns:
   - .conversations[] | select(.from == "human")
   - .instruction
   - .role

-message_construction: [role, instruction, conversations]
+message_construction:
+  - b_role_token
+  - role
+  - e_role_token
+  - b_instruction_token
+  - instruction
+  - e_instruction_token
+  - conversations
+
+assistant_role: gpt
 ```
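+
+The sketch below illustrates the collation logic implied by this configuration. It is a minimal, illustrative sketch only; the segment strings, the special-token literals and the helper name are assumptions, not part of the code base:
+
+```python
+from typing import Dict, List, Tuple
+
+ASSISTANT_ROLE = "gpt"  # mirrors assistant_role in the YAML above
+
+
+def assemble_prompt(sample: Dict) -> List[Tuple[str, bool]]:
+    """Return (text, contributes_to_loss) segments in message_construction order."""
+    segments = [
+        ("<b_role>", False),
+        (sample["role"], False),
+        ("<e_role>", False),
+        ("<b_instruction>", False),
+        (sample["instruction"], False),
+        ("<e_instruction>", False),
+    ]
+    for message in sample["conversations"]:
+        # Only assistant turns contribute to the loss; human turns, role and
+        # instruction are masked, matching loss_masking_jq_patterns.
+        contributes = message["from"] == ASSISTANT_ROLE
+        segments.append((f"{message['from']}: {message['value']}", contributes))
+    return segments
+```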
+
+To reduce the complexity of this example, we assume that each word is represented by exactly one token and disregard punctuation. For the same reason, we also do not replace each word with its token id.
+
+Given this simplification, the batch is represented by the following data structure:
+
+```JSON
+{
+    "samples": torch.Tensor([
+        <
+        (b_role_token)
+        Mody, a helpful LLM trained by the modalities team
+        (e_role_token)
+
+        (b_instruction_token)
+        You are Mody, a helpful LLM trained by the modalities team
+        (e_instruction_token)
+
+        human_1: What is up?
+        gpt: Hello! How can I help you today?
+
+        human_1: Who are you?
+        gpt:
+        (b_assistant_token)
+        You can call me Mody, and I was trained by the modalities team as a language model.
+        (e_assistant_token)
+
+        human_2: Goodbye
+        gpt: Goodbye! If you have any more questions in the future, don't hesitate to ask.
+        >
+        ...
+    ]),
+    "targets": <the token ids of the assembled prompts, shifted for next-token prediction>,
+    "loss_mask": <mask that zeroes out all tokens except the assistant answers>
+}
+```
\ No newline at end of file