diff --git a/configs/common/models/t5.py b/configs/common/models/t5.py
index 2d5625c41..a23c637fd 100644
--- a/configs/common/models/t5.py
+++ b/configs/common/models/t5.py
@@ -13,7 +13,6 @@
     attention_probs_dropout_prob=0.1,
     max_position_embeddings=512,
     embedding_dropout_prob=0.1,
-    num_tokentypes=0,
     initializer_range=0.02,
     layernorm_eps=1e-5,
     bias_gelu_fusion=True,
diff --git a/libai/engine/default.py b/libai/engine/default.py
index 1e670b651..c5c8cf21f 100644
--- a/libai/engine/default.py
+++ b/libai/engine/default.py
@@ -303,7 +303,17 @@ def __init__(self, cfg):
         self.auto_scale_hyperparams(cfg, self.train_loader)
 
         # Assume these objects must be constructed in this order.
+        dist.synchronize()
+        start_time = time.time()
+        logger.info("> Start building model...")
         self.model = self.build_model(cfg)
+
+        dist.synchronize()
+        logger.info(
+            ">>> done with building model. "
+            "Building time: {:.3f} seconds".format(time.time() - start_time)
+        )
+
         self.optimizer = self.build_optimizer(cfg, self.model)
         self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)
 
diff --git a/libai/inference/generator/generation_logits_processor.py b/libai/inference/generator/generation_logits_processor.py
index f650177d6..7ca04684b 100644
--- a/libai/inference/generator/generation_logits_processor.py
+++ b/libai/inference/generator/generation_logits_processor.py
@@ -19,11 +19,8 @@
 import math
 from typing import Callable, List, Tuple
 
-import numpy as np
 import oneflow as flow
 
-from libai.utils import distributed as dist
-
 
 class LogitsProcessorList(list):
     def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> flow.Tensor:
@@ -125,20 +122,7 @@ def __call__(self, input_ids, scores, current_tokens, beam_group_idx) -> flow.Te
             previous_group_tokens = current_tokens[
                 batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
             ]
-            # TODO: bincount
-            previous_group_tokens = (
-                previous_group_tokens.to_global(
-                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                    placement=flow.placement("cuda", list(range(dist.get_world_size()))),
-                )
-                .to_local()
-                .numpy()
-            )
-            token_frequency = np.bincount(previous_group_tokens, minlength=vocab_size)
-            token_frequency = token_frequency.to_global(
-                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
-            )
+            token_frequency = flow.bincount(previous_group_tokens, minlength=vocab_size)
             scores[batch_idx * group_size : (batch_idx + 1) * group_size] = (
                 scores[batch_idx * group_size : (batch_idx + 1) * group_size]
                 - self._diversity_penalty * token_frequency
diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py
index ed14c2e19..5e2513bba 100644
--- a/libai/inference/generator/generation_utils.py
+++ b/libai/inference/generator/generation_utils.py
@@ -468,7 +468,7 @@ def greedy_search(
         stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
         # keep track of which sequences are already finished
-        unfinished_sequences = flow.zeros(input_ids.shape[0]).fill_(1)
+        unfinished_sequences = flow.ones(input_ids.shape[0])
         cur_len = input_ids.shape[-1]
 
         while True:
             # prepare model inputs
@@ -517,6 +517,10 @@
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
         return input_ids
 
     def multinomial_sample(
@@ -553,7 +557,7 @@
         stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
         logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
 
-        unfinished_sequences = flow.zeros(input_ids.shape[0]).fill_(1)
+        unfinished_sequences = flow.ones(input_ids.shape[0])
         cur_len = input_ids.shape[-1]
 
         while True:
@@ -612,6 +616,10 @@ def multinomial_sample(
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
         return input_ids
 
     def beam_search(
@@ -747,6 +755,10 @@ def beam_search(
             beam_indices=beam_indices,
         )
 
+        # Release records
+        self.past_key_values = [None] * len(self.decoder.layers)
+        self.encoder_states = None
+
        return sequence_outputs["sequences"]
 
     @flow.no_grad()
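
Note on the generation changes above: flow.bincount replaces the to_global/to_local/numpy round trip, so token frequencies are now computed directly on the tensor without leaving the device, and each search loop (greedy, multinomial, beam) now clears self.past_key_values and self.encoder_states on exit so cached activations are released between calls. A minimal sketch of the bincount-based diversity penalty; the toy values for prev_tokens, scores, and diversity_penalty are hypothetical stand-ins for the processor's real inputs:

    import oneflow as flow

    vocab_size = 8
    # tokens already emitted by earlier beam groups at this step (toy values)
    prev_tokens = flow.tensor([1, 3, 3, 5])
    # logits of the current beam group over the vocabulary (toy values)
    scores = flow.zeros(vocab_size)
    diversity_penalty = 0.5

    # count how often each vocab id appears among the previous groups' tokens
    token_frequency = flow.bincount(prev_tokens, minlength=vocab_size)
    # subtract the penalty so already-chosen tokens become less attractive
    scores = scores - diversity_penalty * token_frequency
    print(scores)  # token 3 (picked twice) is penalized twice as hard as 1 and 5
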
diff --git a/libai/inference/text_generation.py b/libai/inference/text_generation.py
index d003d63b8..96647f6f1 100644
--- a/libai/inference/text_generation.py
+++ b/libai/inference/text_generation.py
@@ -14,18 +14,10 @@
 # limitations under the License.
 
 from libai.inference.basic import BasePipeline
-from libai.tokenizer import T5Tokenizer
 from libai.utils import distributed as dist
 
 
 class TextGenerationPipeline(BasePipeline):
-    def build_tokenizer(self, cfg):
-        tokenizer = T5Tokenizer(
-            "data_test/t5_inference_model/spiece.model",
-            add_bos_token=True,
-        )
-        return tokenizer
-
     def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
         """load pretrained model.
 
@@ -48,12 +40,21 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"):
                 model_type="t5",
             )
             return model_loader.load()
-        else:
-            return super().load_pretrain_weight(
+        elif mode == "libai":
+            from projects.MT5.utils.mt5_loader import T5LoaderLibai
+
+            model_loader = T5LoaderLibai(
                 libai_cfg_model,
+                libai_cfg_model.cfg,
                 model_path,
-                mode=mode,
             )
+            return model_loader.load()
+        elif mode == "random":
+            from libai.engine import DefaultTrainer
+
+            return DefaultTrainer.build_model(self.cfg)
+        else:
+            raise NotImplementedError
 
     def _parse_parameters(self, **pipeline_parameters):
         preprocess_params = {}
@@ -82,8 +83,11 @@ def forward(self, encoder_input_dict, **kwargs) -> dict:
         return {"return_ids": outputs}
 
     def postprocess(self, model_output_dict, **kwargs) -> dict:
-        text = self.tokenizer.decode(model_output_dict["return_ids"][0], skip_special_tokens=True)
-        records = {"generated_text": text}
+        return_ids = model_output_dict["return_ids"]
+        records = [
+            {"generated_text": self.tokenizer.decode(return_ids[i], skip_special_tokens=True)}
+            for i in range(return_ids.size(0))
+        ]
         return records
 
 
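With postprocess now returning one record per generated sequence and load_pretrain_weight switching on an explicit mode ("huggingface", "libai", "random"), calling code would look roughly like the sketch below; the config path, checkpoint path, and constructor keywords are illustrative, not part of this patch:

    from libai.inference.text_generation import TextGenerationPipeline

    # mode selects the loader branch added above; "random" skips weight
    # loading entirely and just builds the model from the config.
    pipeline = TextGenerationPipeline(
        "projects/MT5/configs/t5_inference.py",
        model_path="/path/to/t5_checkpoint",
        mode="libai",
    )

    # postprocess now yields a list with one dict per sequence in the batch
    records = pipeline("translate English to German: The house is wonderful.")
    for record in records:
        print(record["generated_text"])
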
diff --git a/projects/MAE/modeling/mae.py b/projects/MAE/modeling/mae.py
index 221e9634b..c0ac7ca99 100644
--- a/projects/MAE/modeling/mae.py
+++ b/projects/MAE/modeling/mae.py
@@ -362,8 +362,8 @@ def forward_loss(self, imgs, pred, mask):
             target = (target - mean) / (var + 1.0e-6) ** 0.5
 
         loss = (pred - target) ** 2
-        # We want the prev loss to be calculated with float32,
-        # and mean/sum below to be calculated with float16.
+        # We want the prev loss to be calculated with float16,
+        # and mean/sum below to be calculated with float32.
         # this amp_white_identity will affect preceding ops to be float16
         loss = flow._C.amp_white_identity(loss)
         # this amp_black_identity will affect succeeding ops to be float32
diff --git a/projects/MT5/configs/t5_inference.py b/projects/MT5/configs/t5_inference.py
index 5f6ab7a84..528aca972 100644
--- a/projects/MT5/configs/t5_inference.py
+++ b/projects/MT5/configs/t5_inference.py
@@ -1,7 +1,9 @@
 from .mt5_base import cfg
 from libai.config import LazyCall
+from libai.tokenizer import T5Tokenizer
 from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
 from configs.common.train import train
+from configs.common.data.t5_dataset import tokenization
 
 cfg.update(
     model_type="t5",
@@ -38,3 +40,7 @@
 )
 
 model = LazyCall(MT5Model)(cfg=cfg)
+tokenization.tokenizer = LazyCall(T5Tokenizer)(
+    vocab_file="/path/to/spiece.model",
+    add_bos_token=True,
+)
diff --git a/projects/T5/configs/t5_model_config.py b/projects/T5/configs/t5_model_config.py
index 3fd2a3b66..50523f756 100644
--- a/projects/T5/configs/t5_model_config.py
+++ b/projects/T5/configs/t5_model_config.py
@@ -12,7 +12,6 @@
     attention_probs_dropout_prob=0.1,
     relative_attention_num_buckets=32,
     embedding_dropout_prob=0.1,
-    num_tokentypes=0,
     initializer_range=0.02,
     layernorm_eps=1e-5,
     amp_enabled=False,
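
The tokenizer is now built from the lazy config (tokenization.tokenizer) instead of a hard-coded path inside TextGenerationPipeline; /path/to/spiece.model is deliberately left as a placeholder to be overridden per deployment. A sketch of how a consumer might materialize it, assuming LiBai's detectron2-style LazyConfig.load and instantiate helpers:

    from libai.config import LazyConfig, instantiate

    cfg = LazyConfig.load("projects/MT5/configs/t5_inference.py")
    # override the placeholder vocab path before instantiating
    cfg.tokenization.tokenizer.vocab_file = "/data/t5/spiece.model"
    tokenizer = instantiate(cfg.tokenization.tokenizer)

    print(tokenizer.tokenize("Hello world"))
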