-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
check wiki for details: https://github.com/ymcui/Chinese-Mixtral/wiki
- Loading branch information
Showing
6 changed files
with
1,255 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import logging | ||
import os | ||
from typing import Union, List | ||
import datasets | ||
import torch | ||
from datasets import load_dataset, concatenate_datasets | ||
import transformers | ||
|
||
|
||
# Label value written over prompt positions so that only response tokens
# carry a real label.  -100 matches the default ``ignore_index`` of
# ``torch.nn.CrossEntropyLoss`` -- presumably the training loss relies on that.
IGNORE_INDEX = -100

# Use the real module name (not the literal string '__name__') so this logger
# participates in the package's logger hierarchy and can be configured by it.
logger = logging.getLogger(__name__)

# Instruction wrapper applied to every prompt before tokenization.
PROMPT_TEMPLATE = (
    "[INST] {instruction} [/INST]"
)
|
||
def build_instruction_dataset(data_path: Union[List[str],str],
                tokenizer: transformers.PreTrainedTokenizer,
                max_seq_length: int, data_cache_dir = None,
                preprocessing_num_workers = None,
                ):
    """Build a tokenized instruction-tuning dataset from one or more JSON files.

    Each record must provide ``instruction``, ``input`` and ``output`` fields.
    The instruction (plus the optional input, joined by a newline) is wrapped
    in ``PROMPT_TEMPLATE``; the output followed by the tokenizer's EOS token
    is the training target.  Prompt positions in ``labels`` are set to
    ``IGNORE_INDEX``.

    Args:
        data_path: A single JSON file path, or a list/tuple of paths.
        tokenizer: Tokenizer used for both prompts and targets.
        max_seq_length: Sequences are truncated to this many tokens; examples
            whose prompt alone is already this long (or longer) are dropped.
        data_cache_dir: Directory in which processed datasets are cached.
            When ``None``, each file is cached next to itself.
        preprocessing_num_workers: Forwarded as ``num_proc`` to ``Dataset.map``.

    Returns:
        The concatenation of the ``train`` splits of all processed files,
        formatted as torch tensors.
    """

    def tokenization(examples):
        # Batched map function: ``examples`` maps column name -> list of values.
        sources = []
        targets = []
        for instruction, input_text, output in zip(
            examples['instruction'], examples['input'], examples['output']
        ):
            if input_text:
                # Optional extra context is appended to the instruction.
                instruction = instruction + '\n' + input_text
            sources.append(PROMPT_TEMPLATE.format_map({'instruction': instruction}))
            targets.append(f"{output}{tokenizer.eos_token}")

        # Prompts use the tokenizer's default special-token handling; targets
        # are tokenized without special tokens because EOS is already appended
        # to the target text above.
        tokenized_sources = tokenizer(sources, return_attention_mask=False)
        tokenized_targets = tokenizer(
            targets, return_attention_mask=False, add_special_tokens=False
        )

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            if len(s) >= max_seq_length:
                # No room left for any target token after truncation: skip.
                continue
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            # Mask the prompt so only the response positions carry labels.
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        return {'input_ids': all_input_ids, 'labels': all_labels}

    # Use the module logger (as the rest of this function does) rather than
    # the root logger.
    logger.warning("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]

    for file in data_path:
        # Resolve the cache root per file.  Overwriting ``data_cache_dir``
        # here would make every later file reuse the first file's directory
        # when no cache directory was supplied.
        if data_cache_dir is None:
            cache_root = str(os.path.dirname(file))
        else:
            cache_root = data_cache_dir
        cache_path = os.path.join(
            cache_root,
            os.path.basename(file).split('.')[0] + f"_{max_seq_length}",
        )
        os.makedirs(cache_path, exist_ok=True)
        try:
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info('training datasets-%s has been loaded from disk', file)
        except Exception:
            # Deliberate best effort: any failure to read the cache (missing,
            # empty or unreadable) falls back to rebuilding it from the raw file.
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            processed_dataset = raw_dataset.map(
                tokenization,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])

    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
{ | ||
"fp16": { | ||
"enabled": "auto", | ||
"loss_scale": 0, | ||
"loss_scale_window": 100, | ||
"initial_scale_power": 16, | ||
"hysteresis": 2, | ||
"min_loss_scale": 1e-10 | ||
}, | ||
|
||
"zero_optimization": { | ||
"stage": 2, | ||
"allgather_partitions": true, | ||
"allgather_bucket_size": 1e8, | ||
"overlap_comm": true, | ||
"reduce_scatter": true, | ||
"reduce_bucket_size": 1e8, | ||
"contiguous_gradients": true | ||
}, | ||
|
||
"gradient_accumulation_steps": "auto", | ||
"gradient_clipping": "auto", | ||
"steps_per_print": 2000, | ||
"train_batch_size": "auto", | ||
"train_micro_batch_size_per_gpu": "auto", | ||
"wall_clock_breakdown": false | ||
} |
Oops, something went wrong.