Add PT/SFT training scripts (#18)
iMountTai authored Mar 5, 2024
1 parent 8ab9aaa commit 7f32e79
Showing 6 changed files with 1,255 additions and 0 deletions.
84 changes: 84 additions & 0 deletions scripts/training/build_dataset.py
@@ -0,0 +1,84 @@
import logging
import os
from typing import Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers


IGNORE_INDEX = -100

logger = logging.getLogger(__name__)

PROMPT_TEMPLATE = (
"[INST] {instruction} [/INST]"
)

def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int, data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):

    def tokenization(examples):
        sources = []
        targets = []
        prompt = PROMPT_TEMPLATE
        for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
            # Fold an optional input field into the instruction before applying the template.
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = prompt.format_map({'instruction': instruction})
            target = f"{output}{tokenizer.eos_token}"

            sources.append(source)
            targets.append(target)

        tokenized_sources = tokenizer(sources, return_attention_mask=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            # Skip examples whose prompt alone already fills the context window.
            if len(s) >= max_seq_length:
                continue
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            # Mask the prompt tokens with IGNORE_INDEX so loss is computed only on the response.
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results

    logger.info("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]
    for file in data_path:

        if data_cache_dir is None:
            data_cache_dir = str(os.path.dirname(file))
        cache_path = os.path.join(data_cache_dir, os.path.basename(file).split('.')[0] + f"_{max_seq_length}")
        os.makedirs(cache_path, exist_ok=True)
        try:
            # Reuse a previously tokenized dataset if one is cached on disk.
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info(f'training datasets-{file} has been loaded from disk')
        except Exception:
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            tokenization_func = tokenization
            tokenized_dataset = raw_dataset.map(
                tokenization_func,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset = tokenized_dataset
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])
    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
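
For reference, build_instruction_dataset is self-contained enough to run outside the training scripts. A minimal usage sketch follows; the tokenizer checkpoint and JSON file name are placeholders chosen for illustration, not values taken from this commit.

# Hypothetical usage sketch; the checkpoint and data file below are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
train_dataset = build_instruction_dataset(
    data_path=["data/sft_example.json"],  # Alpaca-style records with instruction/input/output fields
    tokenizer=tokenizer,
    max_seq_length=1024,
    preprocessing_num_workers=8,
)
# Each example is a dict of torch tensors; the prompt span of "labels" is masked with -100.
print(train_dataset[0]["input_ids"].shape, train_dataset[0]["labels"])

Because the prompt tokens are masked with IGNORE_INDEX, the SFT cross-entropy loss is computed only on the response span.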
27 changes: 27 additions & 0 deletions scripts/training/ds_zero2_no_offload.json
@@ -0,0 +1,27 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 100,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1e-10
},

"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 1e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e8,
"contiguous_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
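
This ZeRO Stage 2 configuration leaves most values as "auto" so they can be resolved from the Hugging Face Trainer arguments at launch time. A sketch of how such a file is typically wired in (the argument values are illustrative, not taken from the commit's training scripts):

# Hypothetical wiring of the DeepSpeed config into a transformers Trainer run.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,   # fills "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=8,   # fills "gradient_accumulation_steps": "auto"
    fp16=True,                       # fills "fp16.enabled": "auto"
    max_grad_norm=1.0,               # fills "gradient_clipping": "auto"
    deepspeed="scripts/training/ds_zero2_no_offload.json",
)

The run is then started with a distributed launcher such as torchrun or deepspeed, and the Trainer passes the resolved configuration to the DeepSpeed engine.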
