Add PT/SFT training scripts (#18)
iMountTai authored Mar 5, 2024
1 parent 8ab9aaa commit 7f32e79
Showing 6 changed files with 1,255 additions and 0 deletions.
84 changes: 84 additions & 0 deletions scripts/training/build_dataset.py
@@ -0,0 +1,84 @@
import logging
import os
from typing import Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers


IGNORE_INDEX = -100

logger = logging.getLogger(__name__)

PROMPT_TEMPLATE = (
"[INST] {instruction} [/INST]"
)

def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int, data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):

    def tokenization(examples):
        sources = []
        targets = []
        prompt = PROMPT_TEMPLATE
        for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
            # Fold an optional input field into the instruction before applying the template.
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = prompt.format_map({'instruction': instruction})
            target = f"{output}{tokenizer.eos_token}"

            sources.append(source)
            targets.append(target)

        tokenized_sources = tokenizer(sources, return_attention_mask=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            # Skip examples whose prompt alone already fills the context window.
            if len(s) >= max_seq_length:
                continue
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            # Mask the prompt tokens with IGNORE_INDEX so loss is computed only on the response.
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results

    logger.info("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]
    for file in data_path:

        if data_cache_dir is None:
            data_cache_dir = str(os.path.dirname(file))
        cache_path = os.path.join(data_cache_dir, os.path.basename(file).split('.')[0] + f"_{max_seq_length}")
        os.makedirs(cache_path, exist_ok=True)
        try:
            # Reuse a previously tokenized dataset if one is cached on disk.
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info(f'training datasets-{file} has been loaded from disk')
        except Exception:
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            tokenization_func = tokenization
            tokenized_dataset = raw_dataset.map(
                tokenization_func,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset = tokenized_dataset
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])
    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
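
For reference, build_instruction_dataset is self-contained enough to run outside the training scripts. A minimal usage sketch follows; the tokenizer checkpoint and JSON file name are placeholders chosen for illustration, not values taken from this commit.

# Hypothetical usage sketch; the checkpoint and data file below are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
train_dataset = build_instruction_dataset(
    data_path=["data/sft_example.json"],  # Alpaca-style records with instruction/input/output fields
    tokenizer=tokenizer,
    max_seq_length=1024,
    preprocessing_num_workers=8,
)
# Each example is a dict of torch tensors; the prompt span of "labels" is masked with -100.
print(train_dataset[0]["input_ids"].shape, train_dataset[0]["labels"])

Because the prompt tokens are masked with IGNORE_INDEX, the SFT cross-entropy loss is computed only on the response span.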
27 changes: 27 additions & 0 deletions scripts/training/ds_zero2_no_offload.json
@@ -0,0 +1,27 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 100,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1e-10
},

"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 1e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e8,
"contiguous_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
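
This ZeRO Stage 2 configuration leaves most values as "auto" so they can be resolved from the Hugging Face Trainer arguments at launch time. A sketch of how such a file is typically wired in (the argument values are illustrative, not taken from the commit's training scripts):

# Hypothetical wiring of the DeepSpeed config into a transformers Trainer run.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,   # fills "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=8,   # fills "gradient_accumulation_steps": "auto"
    fp16=True,                       # fills "fp16.enabled": "auto"
    max_grad_norm=1.0,               # fills "gradient_clipping": "auto"
    deepspeed="scripts/training/ds_zero2_no_offload.json",
)

The run is then started with a distributed launcher such as torchrun or deepspeed, and the Trainer passes the resolved configuration to the DeepSpeed engine.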
