>>> import sys
>>> from flagai.trainer import Trainer
>>> from transformers import T5ForConditionalGeneration, T5Tokenizer
>>> from torch.utils.data import Dataset
>>> import torch
>>> ## Inherit the Trainer and
>>> ## override the forward_step function
>>> class MyTrainer(Trainer):
>>>     def forward_step(self, data, model, mems):
>>>         """
>>>         Args:
>>>             data: a dict that contains a batch of inputs
>>>         Returns:
>>>             output: a dict that contains `loss`
>>>         """
>>>         model_outputs = model(**data)
>>>         output = {}
>>>         output['loss'] = model_outputs.loss
>>>         output['logits'] = model_outputs.logits
>>>         output['hidden_states'] = model_outputs.decoder_hidden_states
>>>         return output
>>> # get a customized trainer instance
>>> trainer = MyTrainer(
>>>     env_type='pytorch',
>>>     epochs=1,
>>>     batch_size=4,
>>>     eval_interval=10,
>>>     log_interval=10,
>>>     experiment_name='t5-11b',
>>>     pytorch_device='cuda:0',
>>>     load_dir=None,
>>>     lr=1e-4,
>>>     fp16=False)
>>> # use Hugging Face transformers to load the tokenizer and model
>>> model_name = 't5-11b'
>>> tokenizer = T5Tokenizer.from_pretrained(model_name)
>>> model = T5ForConditionalGeneration.from_pretrained(model_name)
>>> print("loading model & tokenizer is done!")
>>> src_dir = 'train_inputs.txt'
>>> tgt_dir = 'train_targets.txt'
>>> model_dir = "./t5-11b" # model location
>>> maxlen = 1024
>>> def read_file():
>>>     src = []
>>>     tgt = []
>>>     with open(src_dir, 'r', encoding='utf-8') as f:
>>>         lines = f.readlines()
>>>         for line in lines:
>>>             src.append(line.strip('\n').lower())
>>>     with open(tgt_dir, 'r', encoding='utf-8') as f:
>>>         lines = f.readlines()
>>>         for line in lines:
>>>             tgt.append(line.strip('\n').lower())
>>>     return src, tgt
>>> class BertSeq2seqDataset(Dataset):
>>>     def __init__(self, sents_src, sents_tgt, tokenizer, maxlen=512):
>>>         super(BertSeq2seqDataset, self).__init__()
>>>         self.sents_src = sents_src
>>>         self.sents_tgt = sents_tgt
>>>         self.tokenizer = tokenizer
>>>         self.maxlen = maxlen
>>>     def __getitem__(self, i):
>>>         src = self.sents_src[i]
>>>         tgt = self.sents_tgt[i]
>>>         # truncate to maxlen so the stored maxlen is actually used
>>>         inputs = self.tokenizer(src, max_length=self.maxlen, truncation=True)
>>>         with self.tokenizer.as_target_tokenizer():
>>>             labels = self.tokenizer(tgt, max_length=self.maxlen, truncation=True)
>>>         output = {}
>>>         output['input_ids'] = inputs.input_ids
>>>         output['labels'] = labels.input_ids
>>>         return output
>>>     def __len__(self):
>>>         return len(self.sents_src)
>>> def seq2seq_collate_fn(batch):
>>>     def padding(indice, max_length, pad_idx=0):
>>>         pad_indice = [
>>>             item + [pad_idx] * max(0, max_length - len(item))
>>>             for item in indice
>>>         ]
>>>         return torch.tensor(pad_indice)
>>>     token_ids = [data["input_ids"] for data in batch]
>>>     max_length_tk = max([len(t) for t in token_ids])
>>>     labels = [data["labels"] for data in batch]
>>>     max_length_lb = max([len(t) for t in labels])
>>>     token_ids_padded = padding(token_ids, max_length_tk)
>>>     # pad labels with -100 so padded positions are ignored by the loss
>>>     labels_padded = padding(labels, max_length_lb, pad_idx=-100)
>>>     data = {"input_ids": token_ids_padded, "labels": labels_padded}
>>>     return data
>>> sents_src, sents_tgt = read_file()
>>> data_len = len(sents_tgt)
>>> train_size = int(data_len * 0.8)
>>> train_src = sents_src[:train_size]
>>> train_tgt = sents_tgt[:train_size]
>>> val_src = sents_src[train_size:]
>>> val_tgt = sents_tgt[train_size:]
>>> train_dataset = BertSeq2seqDataset(train_src,
>>>                                     train_tgt,
>>>                                     tokenizer=tokenizer,
>>>                                     maxlen=maxlen)
>>> val_dataset = BertSeq2seqDataset(val_src,
>>>                                  val_tgt,
>>>                                  tokenizer=tokenizer,
>>>                                  maxlen=maxlen)
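Before kicking off training, it can help to sanity-check one batch produced by the collate function. The snippet below is only an optional check using a plain PyTorch DataLoader; the Trainer builds its batches itself from `train_dataset` and `collate_fn`.
>>> from torch.utils.data import DataLoader
>>> check_loader = DataLoader(train_dataset, batch_size=2, collate_fn=seq2seq_collate_fn)
>>> batch = next(iter(check_loader))
>>> print(batch['input_ids'].shape, batch['labels'].shape)  # both 2 x (padded length)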
>>> ## Training
>>> trainer.train(model,
>>>               train_dataset=train_dataset,
>>>               collate_fn=seq2seq_collate_fn)
A t5-11b model does not fit on a single 32GB V100, so we need a few tricks to cut down GPU memory usage.
The first trick is to cast the model parameters to fp16.
>>> trainer = MyTrainer(
>>>     env_type='pytorch',
>>>     epochs=1,
>>>     batch_size=1,
>>>     eval_interval=10,
>>>     log_interval=10,
>>>     experiment_name='t5-11b',
>>>     pytorch_device='cuda:0',
>>>     load_dir=None,
>>>     lr=1e-4,
>>>     fp16=True)  # changed to `True`
The second trick is gradient checkpointing: do not keep the intermediate activations of the forward pass, but recompute them during the backward pass. With fp16 parameters and gradient checkpointing, you can now train/finetune t5-11b with batch_size=1.
>>> model.gradient_checkpointing_enable()  # enable gradient checkpointing on the Hugging Face model
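To verify how much memory each trick saves, you can read PyTorch's peak-memory counter around a few training steps. This is an optional sketch using standard torch.cuda utilities, not part of the FlagAI API.
>>> torch.cuda.reset_peak_memory_stats()
>>> # ... run a short training job here ...
>>> print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.1f} GiB")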
To scale the batch size further, we can use data parallelism across multiple GPUs.
>>> trainer = Trainer(
>>>     env_type="pytorchDDP",
>>>     epochs=1,
>>>     batch_size=1,
>>>     eval_interval=10,
>>>     log_interval=10,
>>>     experiment_name='t5-11b',
>>>     load_dir=None,
>>>     lr=1e-4,
>>>     fp16=True,
>>>     checkpoint_activations=False,
>>>     # the following six options are for pytorchDDP
>>>     master_ip='127.0.0.1',
>>>     master_port=17750,
>>>     num_nodes=1,
>>>     num_gpus=2,
>>>     hostfile='hostfile',  # the hostfile sets up the number of nodes & gpus
>>>     training_script=__file__,
>>> )
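The hostfile lists the machines to launch on and how many GPU slots each provides. For a single node with two GPUs, a minimal hostfile would typically be one line in the DeepSpeed-style `ip slots=n` format; treat the exact syntax as an assumption and check it against your FlagAI version.
>>> with open('hostfile', 'w') as f:
>>>     f.write('127.0.0.1 slots=2\n')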
With DeepSpeed's CPU offload and ZeRO stage 2, the batch size on a single GPU can be increased to 4.
>>> trainer = Trainer(
>>>     env_type="deepspeed",  # env_type
>>>     epochs=1,
>>>     batch_size=4,  # CPU offload + ZeRO stage 2 allow a larger per-GPU batch size
>>>     eval_interval=10,
>>>     log_interval=10,
>>>     experiment_name='t5-11b',
>>>     load_dir=None,
>>>     lr=1e-4,
>>>     fp16=True,
>>>     checkpoint_activations=False,
>>>     # parallel settings
>>>     master_ip='127.0.0.1',
>>>     master_port=17750,
>>>     num_nodes=1,
>>>     num_gpus=2,
>>>     hostfile='hostfile',
>>>     training_script=__file__,
>>>     # deepspeed
>>>     deepspeed_config='deepspeed.json'
>>> )
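CPU offload and ZeRO stage 2 are enabled in the `deepspeed.json` file rather than in the Trainer arguments. The sketch below writes a minimal config using standard DeepSpeed keys; the exact values (and whether FlagAI overrides the batch-size fields from its own arguments) are assumptions to adapt to your setup.
>>> import json
>>> ds_config = {
>>>     "train_micro_batch_size_per_gpu": 4,
>>>     "gradient_accumulation_steps": 1,
>>>     "fp16": {"enabled": True},                        # match fp16=True above
>>>     "zero_optimization": {
>>>         "stage": 2,                                   # ZeRO stage 2: shard optimizer states and gradients
>>>         "offload_optimizer": {"device": "cpu"},       # keep optimizer states in CPU memory
>>>         "contiguous_gradients": True,
>>>         "overlap_comm": True,
>>>     },
>>> }
>>> with open('deepspeed.json', 'w') as f:
>>>     json.dump(ds_config, f, indent=2)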
Finally, use your imagination: all of these tricks can be combined, e.g. DeepSpeed ZeRO together with Megatron-LM model parallelism.
>>> trainer = Trainer(
>>>     env_type="deepspeed+mpu",  # env_type: deepspeed + megatron-lm model parallelism
>>>     epochs=1,
>>>     batch_size=1,
>>>     eval_interval=10,
>>>     log_interval=10,
>>>     experiment_name='t5-11b',
>>>     load_dir=None,
>>>     lr=1e-4,
>>>     fp16=True,
>>>     checkpoint_activations=False,
>>>     # parallel settings
>>>     master_ip='127.0.0.1',
>>>     master_port=17750,
>>>     num_nodes=1,
>>>     num_gpus=2,
>>>     hostfile='hostfile',
>>>     training_script=__file__,
>>>     # deepspeed
>>>     deepspeed_config='deepspeed.json',
>>>     # megatron-lm
>>>     model_parallel_size=2
>>> )
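As a rough sanity check on the parallel layout (standard Megatron/DeepSpeed accounting, not a FlagAI API): with 1 node x 2 GPUs and model_parallel_size=2, every layer is split across both GPUs and the data-parallel degree drops to 1.
>>> world_size = 1 * 2                            # num_nodes * num_gpus
>>> data_parallel_size = world_size // 2          # world_size // model_parallel_size = 1
>>> effective_batch_size = 1 * data_parallel_size # per-GPU batch_size * data-parallel degree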