diff --git a/MMAP_DATASET_README.md b/MMAP_DATASET_README.md
index 4cfe38e19..81b39e983 100644
--- a/MMAP_DATASET_README.md
+++ b/MMAP_DATASET_README.md
@@ -112,3 +112,175 @@ def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)
return np.concatenate((doc_idx_first, doc_idx_last))
```
+
+
+# Fine-tuning Datasets
+
+## Instruction Tuning
+Instruction tuning datasets, such as Bactrian or LIMA, generally come in diverse formats. Therefore, before instruction-tuning a model with one of these datasets, the user has to transform the dataset into the following JSONL format, which is inspired by FastChat. The listing below showcases an exemplary sample from such a JSONL file.
+The `id` field represents the incremental sample id. `conversations` contains the multi-turn messages between the different parties. Here, we depict messages between a human and a GPT model. Finally, the format allows for the specification of further, arbitrary key-value pairs such as instructions and roles.
+
+```JSON
+{
+ "id": 0,
+ "conversations": [
+ {
+ "from": "human_1",
+ "value": "What is up?"
+ },
+ {
+ "from": "gpt",
+ "value": "Hello! How can I help you today?"
+ },
+ {
+ "from": "human_1",
+ "value": "Who are you?"
+ },
+ {
+ "from": "gpt",
+ "value": "You can call me Mody, and I was trained by the modalities team as a language model."
+ },
+ {
+ "from": "human_2",
+ "value": "Goodbye"
+ },
+ {
+ "from": "gpt",
+ "value": "Goodbye! If you have any more questions in the future, don't hesitate to ask."
+ }
+    ],
+
+    # optional / arbitrary key-value pairs, e.g.:
+    "instruction": "You are Mody, a helpful LLM trained by the modalities team",
+    "role": "Mody, a helpful LLM trained by the modalities team"
+}
+```
+All JSONL files for instruction tuning have to follow this format.
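+
+For example, a source dataset consisting of simple prompt/response pairs could be converted with a few lines of Python; the source field names (`prompt`, `response`) and the output path are hypothetical:
+
+```Python
+import json
+
+# Hypothetical source records with simple prompt/response pairs.
+source_records = [{"prompt": "What is up?", "response": "Hello! How can I help you today?"}]
+
+with open("instruct_data.jsonl", "w", encoding="utf-8") as f:
+    for sample_id, record in enumerate(source_records):
+        converted = {
+            "id": sample_id,
+            "conversations": [
+                {"from": "human_1", "value": record["prompt"]},
+                {"from": "gpt", "value": record["response"]},
+            ],
+        }
+        f.write(json.dumps(converted) + "\n")
+```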
+
+Given a prepared JSONL file, the training / processing flow can be described as follows:
+During the instantiation of the MemMap file, we specify the JQ patterns that determine which fields in the JSON are supposed to be tokenized, and additionally pass a list of special tokens, e.g., `bos_token`, `eos_token`, `b_instruction_token` etc., to the constructor.
+Each one of the special tokens is mapped to a single, individual token id once during the instantiation of the MemMap file.
+
+When the dataloader iterates over the MemMap file, the `__getitem__()` method tokenizes the sample as specified in the JQ patterns list and enriches the resulting dictionary with the token ids of the special tokens that we pre-computed during the MemMap file instantiation.
+In other words, we extract the desired keys from the raw text dictionary, tokenize the content, build a new dictionary with the tokenized data and add the representation of special tokens to it.
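+
+As a minimal sketch of this flow (the helper `transform_sample`, the use of the `jq` Python bindings, and the GPT-2 tokenizer are illustrative assumptions, not the actual modalities implementation):
+
+```Python
+from typing import Dict, List
+
+import jq
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+# Computed once at MemMap instantiation: map each special token to a single token id.
+special_tokens_map = {"b_instruction_token": "place_holder_token_100"}
+special_token_ids = {
+    name: tokenizer.convert_tokens_to_ids(token) for name, token in special_tokens_map.items()
+}
+
+
+def transform_sample(raw_json: Dict, tokenization_jq_patterns: List[str]) -> Dict:
+    # Executed per sample in __getitem__: extract the configured fields, tokenize
+    # them, and enrich the result with the pre-computed special token ids.
+    sample = {}
+    for pattern in tokenization_jq_patterns:
+        texts = jq.compile(pattern).input(raw_json).all()
+        sample[pattern] = [tokenizer(text)["input_ids"] for text in texts]
+    sample["special_tokens"] = special_token_ids
+    return sample
+```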
+
+Given the MemMap parameterization
+
+```
+    tokenization_jq_patterns = [".conversations[].value", ".instruction", ".role"]
+ pass_through_jq_patterns = [".id"]
+ special_tokens_map = {"b_instruction_token": "place_holder_token_100", ... }
+```
+
+the `__getitem__()` method returns a dictionary of the following structure, where the angle-bracket placeholders stand for the respective token ids:
+
+```JSON
+{
+    "id": 0,
+    "conversations": [
+        {
+            "from": "human_1",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        },
+        {
+            "from": "human_1",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        },
+        {
+            "from": "human_2",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        },
+        {
+            "from": "gpt",
+            "from_tokenized": "<tokenized_from>",
+            "value": "<tokenized_value>"
+        }
+    ],
+
+    # optional / arbitrary key-value pairs, e.g.:
+    "instruction": "<tokenized_instruction>",
+    "role": "<tokenized_role>",
+    "special_tokens": {"bos_token": "<bos_token_id>", "eos_token": "<eos_token_id>", ..., "unk_token": "<unk_token_id>",
+                       "mask_token": "<mask_token_id>", "b_role_token": "<b_role_token_id>", "e_role_token": "<e_role_token_id>",
+                       "b_instruction_token": "<b_instruction_token_id>", "e_instruction_token": "<e_instruction_token_id>"}
+}
+```
+
+
+The dataloader packs multiple samples into a `DatasetBatch` and calls the `Collator` to bring the batch of samples into the correct format for training.
+
+The collator is instantiated with information on how to assemble the entire prompt from the `conversations` and the optional key-value pairs.
+In practice, the YAML configuration has the following structure:
+
+```YAML
+special_tokens:
+  bos_token: <bos_token>
+  eos_token: <eos_token>
+  b_role_token: <b_role_token>
+  e_role_token: <e_role_token>
+  b_instruction_token: <b_instruction_token>
+  e_instruction_token: <e_instruction_token>
+
+loss_masking_jq_patterns:
+  - .conversations[] | select(.from | startswith("human"))
+ - .instruction
+ - .role
+
+message_construction:
+ - b_role_token
+ - role
+ - e_role_token
+ - b_instruction_token
+ - instruction
+ - e_instruction_token
+ - conversations
+
+assistant_role: gpt
+```
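+
+To make the assembly step concrete, the following is a minimal, hypothetical sketch of how a prompt could be built from the `message_construction` list; the function name and the simplified string-level handling are illustrative assumptions, not the actual `Collator` implementation:
+
+```Python
+from typing import Dict, List
+
+
+def build_prompt(sample: Dict, message_construction: List[str], special_tokens: Dict[str, str]) -> str:
+    # Walk the message_construction list: entries naming a special token are
+    # replaced by the token itself, "conversations" expands to the formatted
+    # turns, and every other entry is looked up in the sample.
+    parts = []
+    for entry in message_construction:
+        if entry in special_tokens:
+            parts.append(special_tokens[entry])
+        elif entry == "conversations":
+            parts.extend(f"{turn['from']}: {turn['value']}" for turn in sample["conversations"])
+        else:
+            parts.append(str(sample[entry]))
+    return "\n".join(parts)
+```
+
+With the configuration above, the role and instruction blocks are each wrapped in their begin/end tokens before the conversation turns are appended; the `loss_masking_jq_patterns` then determine which spans are excluded from the loss.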
+
+To reduce the complexity of this example, we assume that each word is represented by exactly one token and disregard punctuation. Similarly, we do not replace each word with its token id.
+
+Given the simplification, the batch is represented by the following data structure:
+
+
+```JSON
+{
+    "samples": torch.Tensor([
+
+ <
+ (b_instruction_token)
+ You are Mody, a helpful LLM trained by the modalities team
+ (e_instruction_token)
+
+ (b_role_token)
+ Mody, a helpful LLM trained by the modalities team
+        (e_role_token)
+
+ human_1: What is up?
+ gpt: Hello! How can I help you today?
+
+ human_1: Who are you?
+ gpt:
+ (b_assistant_token)
+ You can call me Mody, and I was trained by the modalities team as a language model.
+ (e_assistant_token)
+
+ human_2: Goodbye
+ gpt: Goodbye! If you have any more questions in the future, don't hesitate to ask.>
+ ...
+    ]),
+    "targets":
+    "loss_mask":
+}
+```
\ No newline at end of file
diff --git a/config_files/training/config_lorem_ipsum.yaml b/config_files/training/config_lorem_ipsum.yaml
index e4251a2f7..ed3dfdee0 100644
--- a/config_files/training/config_lorem_ipsum.yaml
+++ b/config_files/training/config_lorem_ipsum.yaml
@@ -2,7 +2,7 @@ settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
- sample_key: input_ids
+ sample_key: tokenized_input
target_key: target_ids
training:
training_log_interval_in_steps: 2
@@ -20,6 +20,12 @@ settings:
paths:
checkpointing_path: data/checkpoints
+tokenizer:
+ component_key: tokenizer
+ variant_key: gpt2_tokenizer_fast
+ config:
+ tokenizer_file: data/tokenizer/tokenizer_gpt2.json
+
collate_fn:
component_key: collate_fn
variant_key: gpt_2_llm_collator
@@ -31,8 +37,21 @@ train_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
- raw_data_path: ./data/lorem_ipsum.pbin
+ raw_data_path: data/lorem_ipsum_instruct_multi_turn.jsonl
+ index_path: data/lorem_ipsum_instruct_multi_turn.idx
sequence_length: ${settings.training.sequence_length}
+ block_size: ${settings.training.sequence_length}
+ tokenization_jq_patterns:
+ ${settings.referencing_keys.sample_key}: .conversations
+ pass_through_jq_patterns:
+ raw_text: .conversations
sample_key: ${settings.referencing_keys.sample_key}
train_dataloader:
diff --git a/data/lorem_ipsum_instruct_multi_turn.idx b/data/lorem_ipsum_instruct_multi_turn.idx
new file mode 100644
index 000000000..9bcc11df1
Binary files /dev/null and b/data/lorem_ipsum_instruct_multi_turn.idx differ
diff --git a/data/lorem_ipsum_instruct_multi_turn.jsonl b/data/lorem_ipsum_instruct_multi_turn.jsonl
new file mode 100644
index 000000000..3ee98d518
--- /dev/null
+++ b/data/lorem_ipsum_instruct_multi_turn.jsonl
@@ -0,0 +1,5 @@
+{"Conversations": ["0 Who is the president of the United States", "Joe Biden"]}
+{"Conversations": ["1 Who is the chancellor of Germany", "Olaf Scholz", "Thank you."]}
+{"conversations": ["2 What is the most effective weapon in CS? ", "Are you referring to Counter Strike 2?", "Yes.", "The most effective weapon from a damage point of view is the AWP"]}
+{"conversations": ["3 What is the capital of France", "Paris"]}
+{"conversations": ["4 What is the capital of Germany", "Berlin"]}
\ No newline at end of file
diff --git a/data/lorem_ipsum_sft.jsonl b/data/lorem_ipsum_sft.jsonl
new file mode 100644
index 000000000..3c1a0e894
--- /dev/null
+++ b/data/lorem_ipsum_sft.jsonl
@@ -0,0 +1,10 @@
+{"id": 0, "conversations": [{"from": "human_1", "value": "What is up?`"}, {"from": "gpt", "value": "Hello! How can I help you today?"}, {"from": "human_1", "value": "Who are you?"}, {"from": "gpt", "value": "You can call me Mody, and I was trained by the modalities team as a language model."}, {"from": "human_2", "value": "Goodbye"}, {"from": "gpt", "value": "Goodbye! If you have any more questions in the future, don't hesitate to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 1, "conversations": [{"from": "human_1", "value": "How does AI work?"}, {"from": "gpt", "value": "AI, or artificial intelligence, operates by analyzing data and patterns to make decisions or predictions. Would you like a more detailed explanation?"}, {"from": "human_1", "value": "Yes, please."}, {"from": "gpt", "value": "Sure, AI systems, such as neural networks, are trained using large amounts of data. They learn to recognize patterns and make predictions. It's like how a child learns from experience."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 2, "conversations": [{"from": "human_1", "value": "Can you write a poem?"}, {"from": "gpt", "value": "Of course! Give me a topic, and I'll write something for you."}, {"from": "human_1", "value": "Write about the sea."}, {"from": "gpt", "value": "The sea, a vast expanse, so deep and wide, Whispers secrets at the tide's low ebb, Its waves, the rhythm of time's endless web, A world of mystery beneath its hide."}, {"from": "human_2", "value": "That's beautiful, thanks!"}, {"from": "gpt", "value": "You're welcome! I'm glad you liked it. If you have any more requests, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 3, "conversations": [{"from": "human_1", "value": "What's the weather like today?"}, {"from": "gpt", "value": "I'm not able to check real-time data, but I can tell you how to find out using a weather app or website. Would that help?"}, {"from": "human_1", "value": "Yes, please."}, {"from": "gpt", "value": "You can use weather websites like Weather.com or apps like AccuWeather to get current weather conditions and forecasts."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 4, "conversations": [{"from": "human_1", "value": "What is the capital of France?"}, {"from": "gpt", "value": "The capital of France is Paris. It's known for its art, culture, and history. Have you ever been there?"}, {"from": "human_1", "value": "No, but I'd love to visit someday."}, {"from": "gpt", "value": "It's a beautiful city with so much to see and do. I hope you get the chance to visit soon!"}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 5, "conversations": [{"from": "human_1", "value": "What is the meaning of life?"}, {"from": "gpt", "value": "The meaning of life is a philosophical question that has been debated for centuries. Some believe it's about finding happiness, others think it's about fulfilling a purpose. What do you think?"}, {"from": "human_1", "value": "I think it's about making the most of the time we have and finding joy in the little things."}, {"from": "gpt", "value": "That's a beautiful perspective. It's important to appreciate the present and find joy in everyday moments."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 6, "conversations": [{"from": "human_1", "value": "What is the best way to learn a new language?"}, {"from": "gpt", "value": "The best way to learn a new language is to practice regularly, immerse yourself in the language, and use a variety of resources like books, apps, and language classes. It's also helpful to practice with native speakers."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Learning a new language can be a rewarding experience. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 7, "conversations": [{"from": "human_1", "value": "What is the best way to stay healthy?"}, {"from": "gpt", "value": "The best way to stay healthy is to eat a balanced diet, exercise regularly, get enough sleep, and manage stress. It's also important to see your doctor for regular check-ups and screenings."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Taking care of your health is important for your overall well-being. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 8, "conversations": [{"from": "human_1", "value": "What is the best way to study for exams?"}, {"from": "gpt", "value": "The best way to study for exams is to create a study schedule, break down the material into manageable chunks, and use active learning techniques like flashcards or practice tests. It's also important to get plenty of rest and stay hydrated."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Studying can be challenging, but with the right strategies, you can succeed. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
+{"id": 9, "conversations": [{"from": "human_1", "value": "What is the best way to save money?"}, {"from": "gpt", "value": "The best way to save money is to create a budget, track your expenses, and look for ways to cut costs. You can also set financial goals and automate your savings to make it easier to save."}, {"from": "human_1", "value": "Thank you for the advice."}, {"from": "gpt", "value": "You're welcome! Saving money is an important skill that can help you achieve your financial goals. If you have any more questions, feel free to ask."}], "instruction": "You are Mody, a helpful LLM trained by the modalities team", "role": "Mody, a helpful LLM trained by the modalities team", "special_tokens": {"bos_token": "bos", "eos_token": "eos"}}
diff --git a/src/modalities/config/config.py b/src/modalities/config/config.py
index c1f62ed4a..f0455c975 100644
--- a/src/modalities/config/config.py
+++ b/src/modalities/config/config.py
@@ -261,7 +261,8 @@ class MemMapDatasetConfig(BaseModel):
index_path: Optional[FilePath] = None
sequence_length: Annotated[int, Field(strict=True, gt=1)]
tokenizer: PydanticTokenizerIFType
- jq_pattern: str
+ tokenization_jq_patterns: Dict[str, str]
+ pass_through_jq_patterns: Optional[Dict[str, str]] = None
sample_key: str
diff --git a/src/modalities/dataloader/dataset.py b/src/modalities/dataloader/dataset.py
index 1730971f4..5212f1502 100644
--- a/src/modalities/dataloader/dataset.py
+++ b/src/modalities/dataloader/dataset.py
@@ -1,15 +1,16 @@
from __future__ import annotations
+import json
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import jq
import numpy as np
-from pydantic import BaseModel
+from pydantic import BaseModel, validator
from torch.utils.data.dataset import Dataset as TorchdataSet
from tqdm import tqdm
-from transformers import BatchEncoding
+from transformers import BatchEncoding, PreTrainedTokenizer
from modalities.tokenization.tokenizer_wrapper import TokenizerWrapper
@@ -138,6 +139,7 @@ def __init__(self, raw_data_path: Path, sample_key: str):
super().__init__(raw_data_path=raw_data_path, sample_key=sample_key)
self._embedded_stream_data = EmbeddedStreamData(raw_data_path)
self._token_size_in_bytes = self._embedded_stream_data.token_size_in_bytes
+ self.sample_key = sample_key
try:
self._token_dtype_on_disk = self.np_dtype_of_tokens_on_disk_from_bytes[self._token_size_in_bytes]
self._token_dtype_in_ram = self.type_converter_for_torch[self._token_size_in_bytes]
@@ -241,3 +243,126 @@ def _generate_packing_index(self) -> List[Tuple[int, int]]:
curr_offset = segment_offset
curr_len = segment_len
return index
+
+
+class TransformOperation(Enum):
+ TOKENIZE = "tokenize"
+ PASS_THROUGH = "pass_through"
+
+
+class SampleTransform(BaseModel):
+ json_indexation_pattern: List[str]
+ new_key: Optional[str] = None
+ transform_operation: TransformOperation = TransformOperation.TOKENIZE
+
+ @validator("json_indexation_pattern", pre=True, each_item=False)
+ def _check_at_least_one_item(cls, v):
+ if not v:
+ raise ValueError("json_indexation_pattern must contain at least one item")
+ return v
+
+ def __init__(self, **data):
+ super().__init__(**data)
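+        # Default the output key to the last segment of the indexation pattern
+        # if no explicit new_key was provided.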
+ if self.new_key is None and self.json_indexation_pattern:
+ self.new_key = self.json_indexation_pattern[-1]
+
+
+class SFTMemMapDataset(Dataset):
+ def __init__(
+ self,
+ raw_data_path: Path,
+ block_size: int,
+ tokenizer: PreTrainedTokenizer,
+ sample_transforms: List[SampleTransform],
+ index_path: Optional[Path] = None,
+ ):
+ super().__init__(raw_data_path=raw_data_path, block_size=block_size)
+
+ self.reader = LargeFileLinesReader(self.raw_data_path, index_path=index_path)
+ self.tokenizer = tokenizer
+        # Keyed by the dot-joined indexation pattern (lists are not hashable), matching
+        # the dot-joined current path used for lookups in _transform_json_dict.
+        self.indexation_pattern_to_sample_transforms = {}
+        for sample_transform in sample_transforms:
+            pattern_key = ".".join(sample_transform.json_indexation_pattern)
+            if pattern_key not in self.indexation_pattern_to_sample_transforms:
+                self.indexation_pattern_to_sample_transforms[pattern_key] = []
+            self.indexation_pattern_to_sample_transforms[pattern_key].append(sample_transform)
+
+ def __len__(self) -> int:
+ return len(self.reader)
+
+ def __getitem__(self, idx: int) -> BatchEncoding:
+ self._check_if_inbounds(idx)
+ item = json.loads(self.reader[idx])
+ # conversations -> * -> value -> tokenize value
+        transformed_item = self._transform_json_dict(
+            element=item,
+            current_path=[],
+            indexation_pattern_to_sample_transforms=self.indexation_pattern_to_sample_transforms,
+        )
+        return transformed_item
+
+ def _transform_json_dict(
+ self,
+ element: Dict | List | str,
+ current_path: List[str],
+ indexation_pattern_to_sample_transforms: Dict[str, List[SampleTransform]],
+ ):
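+        # Recursively walk the parsed JSON sample. Dict keys extend the current path,
+        # list elements contribute a "*" segment, and leaf values whose dot-joined
+        # path matches a registered pattern are tokenized or passed through.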
+ def run_transform(
+ current_path: List[str],
+ element: str,
+ indexation_pattern_to_sample_transforms: Dict[str, List[SampleTransform]],
+ ):
+ current_pattern_string = ".".join(current_path)
+ transformed_element = {}
+ if current_pattern_string in indexation_pattern_to_sample_transforms:
+ sample_transforms = indexation_pattern_to_sample_transforms[current_pattern_string]
+ for sample_transform in sample_transforms:
+ if sample_transform.transform_operation == TransformOperation.TOKENIZE:
+ tokens = self.tokenizer(
+ element,
+ max_length=self.block_size,
+ padding="max_length",
+ truncation=True,
+ )
+ transformed_element[sample_transform.new_key] = tokens
+ elif sample_transform.transform_operation == TransformOperation.PASS_THROUGH:
+ transformed_element[sample_transform.new_key] = element
+ return transformed_element
+
+ if isinstance(element, dict):
+ transformed_elements_list = []
+
+ for key, sub_element in element.items():
+                # Only leaf values (neither dict nor list) are candidates for a direct transform.
+                if not isinstance(sub_element, (dict, list)):
+ transformed_sub_element: Dict = run_transform(
+ current_path=current_path + [key],
+ element=sub_element,
+ indexation_pattern_to_sample_transforms=indexation_pattern_to_sample_transforms,
+ )
+ else:
+ transformed_sub_element = self._transform_json_dict(
+ sub_element, current_path + [key], indexation_pattern_to_sample_transforms
+ )
+ transformed_elements_list.append(transformed_sub_element)
+
+ transformed_elements_dict = {k: v for d in transformed_elements_list for k, v in d.items()}
+ return transformed_elements_dict
+
+ elif isinstance(element, list):
+ transformed_elements_list = []
+ for sub_element in element:
+ # Note that, we don't execute run_transform here, as we only tokenize the values
+ # of dictionaries and not of lists.
+ # If this is required, there is still the possibility to add this functionality.
+ transformed_sub_element = self._transform_json_dict(
+ sub_element, current_path + ["*"], indexation_pattern_to_sample_transforms
+ )
+ transformed_elements_list.append(transformed_sub_element)
+
+        # In this case, we have a nested list and therefore no key to construct a dictionary from
+        if current_path and current_path[-1] == "*":
+ return transformed_elements_list
+ # In this case, we don't have a nested list and can construct a dictionary from the list
+ else:
+ return {current_path[-1]: transformed_elements_list}
diff --git a/src/modalities/dataloader/dataset_factory.py b/src/modalities/dataloader/dataset_factory.py
index 990ffb227..d1ba9639f 100644
--- a/src/modalities/dataloader/dataset_factory.py
+++ b/src/modalities/dataloader/dataset_factory.py
@@ -1,5 +1,5 @@
from pathlib import Path
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
from pydantic import FilePath
from torch.utils.data.dataset import Dataset
@@ -43,8 +43,9 @@ def get_mem_map_dataset(
sequence_length: int,
tokenizer: PreTrainedTokenizer,
sample_key: str,
+ tokenization_jq_patterns: Dict[str, str],
index_path: Optional[Path] = None,
- jq_pattern: str = ".text",
+ pass_through_jq_patterns: Optional[Dict[str, str]] = None,
) -> MemMapDataset:
dataset = MemMapDataset(
raw_data_path=raw_data_path,
@@ -52,7 +53,8 @@ def get_mem_map_dataset(
tokenizer=tokenizer,
sample_key=sample_key,
index_path=index_path,
- jq_pattern=jq_pattern,
+ tokenization_jq_patterns=tokenization_jq_patterns,
+ pass_through_jq_patterns=pass_through_jq_patterns,
)
return dataset
diff --git a/tests/dataloader/test_sft_dataset.py b/tests/dataloader/test_sft_dataset.py
new file mode 100644
index 000000000..4543eb90a
--- /dev/null
+++ b/tests/dataloader/test_sft_dataset.py
@@ -0,0 +1,25 @@
+import pytest
+
+
+@pytest.mark.skip(reason="SFTMemMapDataset test not yet implemented; see commented sketch below.")
+def test_create_packed_dataset(indexed_dummy_data_path, gpt2_tokenizer):
+    pass
+ # block_size = 5
+ # packed_generator = SFTMemMapDataset(
+ # src_path=indexed_dummy_data_path.raw_data_path, tokenizer=gpt2_tokenizer, number_of_processes=2
+ # )
+ # default_packed_dataset_path = packed_generator._default_destination_path()
+ # assert not default_packed_dataset_path.is_file()
+ # packed_generator.run()
+ # packed_dataset = PackedMemMapDatasetContinuous(
+ # default_packed_dataset_path, block_size=block_size, sample_key="input_ids"
+ # )
+
+ # start_of_jsonl_content = "0 Lorem ipsum dolor sit amet, consetetur sadipscing elitr"
+ # tokenized_start_of_jsonl_content = gpt2_tokenizer(start_of_jsonl_content)["input_ids"]
+ # packed_dataset_iterator = iter(packed_dataset)
+ # np.testing.assert_equal(tokenized_start_of_jsonl_content[:block_size], next(packed_dataset_iterator)["input_ids"])
+ # np.testing.assert_equal(
+ # tokenized_start_of_jsonl_content[block_size : 2 * block_size], next(packed_dataset_iterator)["input_ids"]
+ # )
+ # assert len(packed_dataset._embedded_stream_data.index_base) == 12
+
+ # # check validity of index section in packed dataset
+ # for idx, (offset, entry_length) in enumerate(packed_dataset._embedded_stream_data.index_base[:-1]):
+ # assert offset + entry_length == packed_dataset._embedded_stream_data.index_base[idx + 1][0]