diff --git a/libai/layers/layer_norm.py b/libai/layers/layer_norm.py index b154645cc..98f8027b8 100644 --- a/libai/layers/layer_norm.py +++ b/libai/layers/layer_norm.py @@ -126,4 +126,4 @@ def __init__(self, normalized_shape, eps=1e-6, layer_idx=0): self.l2norm_epsilon = eps def forward(self, hidden_states): - return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon) + return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon) diff --git a/projects/T5/configs/mt5_pretrain.py b/projects/T5/configs/mt5_pretrain.py index bd9fd3bd8..304e6c5d3 100644 --- a/projects/T5/configs/mt5_pretrain.py +++ b/projects/T5/configs/mt5_pretrain.py @@ -18,7 +18,7 @@ train_data_path = "projects/T5/data/training_data/part_0" pretrained_model_path = None -micro_batch_size = 64 +micro_batch_size = 4 optim["lr"] = 1e-4 # dataloader @@ -30,7 +30,7 @@ ) ], collate_fn=collate_fn( - vocab_size=12902, + vocab_size=12900, max_seq_length=512, noise_density=0.15, mean_noise_span_length=3, @@ -43,7 +43,7 @@ model = LazyCall(T5ForPreTraining)(cfg=cfg) # model config -model.cfg.vocab_size = 12902 +model.cfg.vocab_size = 12900 model.cfg.hidden_size = 512 model.cfg.hidden_layers = 8 model.cfg.num_attention_heads = 6 @@ -53,7 +53,7 @@ model.cfg.attention_probs_dropout_prob = 0.0 model.cfg.embedding_dropout_prob = 0.0 model.cfg.layernorm_eps = 1e-6 -model.cfg.model_type = "mt5" + model.cfg.pretrained_model_path = pretrained_model_path train.update( @@ -63,7 +63,7 @@ train_epoch=1, train_iter=24000, log_period=10, - amp=dict(enabled=False), + amp=dict(enabled=True), warmup_ratio=1 / 24, # checkpointer=dict(period=10, max_to_keep=20), dist=dict( @@ -89,3 +89,5 @@ train.zero_optimization.enabled = True train.zero_optimization.stage = 2 +train.activation_checkpoint.enabled = False +train.num_accumulation_steps = 8 diff --git a/projects/T5/configs/t5_model_config.py b/projects/T5/configs/t5_model_config.py index 50523f756..53e124239 100644 --- a/projects/T5/configs/t5_model_config.py +++ b/projects/T5/configs/t5_model_config.py @@ -15,7 +15,7 @@ initializer_range=0.02, layernorm_eps=1e-5, amp_enabled=False, - model_type="t5", + model_type="mt5", ) cfg = DictConfig(cfg) diff --git a/projects/T5/models/attention.py b/projects/T5/models/attention.py index a825f681a..20b2b0a08 100644 --- a/projects/T5/models/attention.py +++ b/projects/T5/models/attention.py @@ -147,17 +147,13 @@ def forward( use_cache (bool, optional): it will be set to True, when the model is in the inference phase and used for incremental decoding. Defaults to False. """ - - # hidden_states, encoder_states: [S(0), B] - # attention_mask: [S(0), B] - if encoder_states is not None: encoder_states = encoder_states.to_global(placement=hidden_states.placement) if attention_mask is not None: attention_mask = attention_mask.to_global(placement=hidden_states.placement) - bsz, real_seq_length = hidden_states.size()[:2] + real_seq_length, bsz = hidden_states.size()[:2] if past_key_value is not None: assert ( @@ -166,47 +162,39 @@ def forward( f"Got {len(past_key_value)} past states.\n" real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - key_length = real_seq_length if encoder_states is None else encoder_states.shape[1] + key_length = real_seq_length if encoder_states is None else encoder_states.shape[0] if self.is_cross_attention: - # if it is cross attention, key and value should be calculated only once, and the - # result can be reused. 
query = self.query(hidden_states) - query = query.view(bsz, -1, self.num_heads, self.head_size) - query = query.permute(0, 2, 1, 3) + query = query.view(-1, bsz, self.num_heads, self.head_size) + query = query.permute(1, 2, 0, 3) # bsz, num_head, seq_len, head_size + if past_key_value is not None: key, value = past_key_value elif encoder_states is not None: key_value = self.key_value(encoder_states) - key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size) - key_value = key_value.permute(0, 2, 1, 3) + key_value = key_value.view(-1, bsz, self.num_heads, 2 * self.head_size) + key_value = key_value.permute(1, 2, 0, 3) key, value = flow.chunk(key_value, chunks=2, dim=-1) else: raise ValueError( "past_key_value and encoder_states cannot be None at the same time." ) else: - # if it is self attention, query, key, and value are all obtained from hidden_states. - # when in the inference phase of an incremental decoder, - # hidden_states is the last-added state, - # the full key and value could be obtained by concatenating with past_key_value. query_key_value = self.query_key_value(hidden_states) - query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size) - query_key_value = query_key_value.permute( - 0, 2, 1, 3 - ) # [bsz, num_heads, src_len, 3 * head_size] - query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1) + attention_scores, value = flow._C.fused_self_attention( + query_key_value, head_size=self.head_size, alpha=1 + ) if past_key_value is not None: past_key, past_value = past_key_value key = flow.cat((past_key.type_as(key), key), dim=2) value = flow.cat((past_value.type_as(value), value), dim=2) - # query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size] if use_cache: past_key_value = (key, value) - # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] - attention_scores = flow.matmul(query, key, transpose_b=True) + if self.is_cross_attention: + attention_scores = flow.matmul(query, key, transpose_b=True, alpha=1) if position_bias is None: if not self.has_relative_attention_bias: @@ -223,35 +211,27 @@ def forward( if past_key_value is not None: position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - position_bias = position_bias + (1 - attention_mask) * -1000 - position_bias = position_bias.to_global(placement=attention_scores.placement) - - attention_scores = attention_scores + position_bias - - # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] if attention_mask is not None: - attention_scores = flow.mul(attention_scores, attention_mask) - attention_scores = attention_scores - 10000.0 * (1 - attention_mask) - # TODO(xingyu.liao): graph will occur `where_scalar` errors - # when using `masked_fill` - # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) + if use_cache: + attention_mask = attention_mask.expand_as(attention_scores) + + attention_weights = flow._C.fused_bias_add_scale_mask_softmax_dropout( + attention_scores, + position_bias, + attention_mask, + fill_value=-10000.0, + scale=1, + p=self.attention_dropout_prob, + )[0] else: + attention_scores = attention_scores + position_bias attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] attention_weights = self.dropout(attention_weights) - # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] context = 
flow.matmul(attention_weights, value) - # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] - context = context.transpose(1, 2) - # Concat multi-head results from - # [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size] - # SBP sign: [S(0), S(2)] - # [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B] + context = flow._C.transpose(context, perm=(2, 0, 1, 3)) + output = self.dense(context.flatten(2)) output = self.output_dropout(output) @@ -272,7 +252,6 @@ def extra_repr(self) -> str: def _relative_position_bucket( self, relative_position, bidirectional=True, num_buckets=32, max_distance=128 ): - # relative_position: (seq_len, seq_len) relative_buckets = 0 if bidirectional: num_buckets //= 2 diff --git a/projects/T5/models/embedding.py b/projects/T5/models/embedding.py index 3d51a16b0..5c981ab78 100644 --- a/projects/T5/models/embedding.py +++ b/projects/T5/models/embedding.py @@ -97,15 +97,9 @@ def __init__( ) ) self.init_method(self.weight) - # FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now. - # self._fill_padding_idx_with_zero() def forward(self, input_ids): weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight - # embeddings with sbp sign: [B, B] - # [B, B] x [S(0), B] --> [S(0), B] - # ↑ ↑ ↑ - # embed pos_ids pos_embed input_embeds = flow._C.gather(weight, input_ids, axis=0) return input_embeds diff --git a/projects/T5/models/layer_norm.py b/projects/T5/models/layer_norm.py index b7e9864a5..35c3262a1 100644 --- a/projects/T5/models/layer_norm.py +++ b/projects/T5/models/layer_norm.py @@ -33,4 +33,4 @@ def __init__(self, normalized_shape, eps=1e-6, layer_idx=0): self.l2norm_epsilon = eps def forward(self, hidden_states): - return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon) + return flow._C.rms_norm(hidden_states, self.weight, self.weight.shape, self.l2norm_epsilon) diff --git a/projects/T5/models/logits.py b/projects/T5/models/logits.py index 27fd2ae01..0eb8e213a 100644 --- a/projects/T5/models/logits.py +++ b/projects/T5/models/logits.py @@ -13,40 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import oneflow as flow from oneflow import nn from libai.layers import Linear -from libai.utils import distributed as dist class LMLogits(nn.Module): - def __init__(self, vocab_size, hidden_size=None, bias=False, model_type="t5", layer_idx=-1): + def __init__(self, vocab_size, hidden_size=None, bias=False, layer_idx=-1): super().__init__() - self.model_type = model_type - if model_type == "t5": - self.bias = ( - nn.Parameter( - flow.zeros( - (vocab_size,), - dtype=flow.float32, - placement=dist.get_layer_placement(layer_idx), - sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]), - ) - ) - if bias - else None - ) - elif model_type == "mt5": - self.linear = Linear(hidden_size, vocab_size, bias=False, layer_idx=layer_idx) + self.linear = Linear(hidden_size, vocab_size, bias=bias, layer_idx=layer_idx) - def forward(self, input, word_embeddings=None): - if self.model_type == "t5": - w = word_embeddings.to_global(placement=input.placement) - input = input.to_global(grad_sbp=input.sbp) - logits = flow._C.matmul(input, w, transpose_b=True) - if self.bias is not None: - logits = logits + self.bias - else: - logits = self.linear(input) + def forward(self, input): + logits = self.linear(input) return logits diff --git a/projects/T5/models/mlp.py b/projects/T5/models/mlp.py index 3a69d5816..7160599b0 100644 --- a/projects/T5/models/mlp.py +++ b/projects/T5/models/mlp.py @@ -13,58 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import oneflow as flow from oneflow import nn -from libai.layers import Linear, build_activation - - -class T5MLP(nn.Module): - def __init__( - self, - hidden_size, - ffn_hidden_size, - output_dropout_prob=0.0, - init_method=nn.init.xavier_normal_, - output_layer_init_method=None, - *, - layer_idx=0, - ): - super().__init__() - self.output_dropout_prob = output_dropout_prob - - if output_layer_init_method is None: - output_layer_init_method = init_method - - self.dense_h_to_4h = Linear( - hidden_size, - ffn_hidden_size, - bias=False, - parallel="col", - skip_bias_add=False, - init_method=init_method, - layer_idx=layer_idx, - ) - - self.activation_func = build_activation("relu") - - self.dense_4h_to_h = Linear( - ffn_hidden_size, - hidden_size, - bias=False, - parallel="row", - skip_bias_add=False, - init_method=output_layer_init_method, - layer_idx=layer_idx, - ) - - self.dropout = nn.Dropout(self.output_dropout_prob) - - def forward(self, hidden_states): - intermediate = self.dense_h_to_4h(hidden_states) - intermediate = self.activation_func(intermediate) - output = self.dense_4h_to_h(intermediate) - output = self.dropout(output) - return output +from libai.layers import Linear class MT5MLP(nn.Module): @@ -104,8 +56,6 @@ def __init__( layer_idx=layer_idx, ) - self.activation_func = build_activation("gelu_tanh") - self.wo = Linear( ffn_hidden_size, hidden_size, @@ -120,9 +70,8 @@ def __init__( def forward(self, hidden_states): wi_0_out = self.wi_0(hidden_states) - hidden_gelu = self.activation_func(wi_0_out) hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear + hidden_states = flow._C.fused_fast_gelu_mul(wi_0_out, hidden_linear) output = self.wo(hidden_states) output = self.dropout(output) return output diff --git a/projects/T5/models/t5_model.py b/projects/T5/models/t5_model.py index 2551d6c54..251961b27 100644 --- a/projects/T5/models/t5_model.py +++ b/projects/T5/models/t5_model.py @@ -17,7 +17,7 @@ import oneflow.nn as nn from libai.config import configurable -from 
libai.layers import Linear, LMLogits +from libai.layers import Linear from libai.models.t5_model import T5Loss from libai.models.utils import init_method_normal, scaled_init_method_normal from libai.utils import distributed as dist @@ -45,10 +45,9 @@ def __init__( initializer_range=0.02, layernorm_eps=1e-12, amp_enabled=False, - model_type="t5", ) -> None: super().__init__() - self.model_type = model_type + self.model_type = "mt5" init_method = init_method_normal(initializer_range) scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers) self.embedding = T5Embedding( @@ -75,7 +74,6 @@ def __init__( init_method=init_method, output_layer_init_method=scaled_init_method, layer_idx=i, - model_type=model_type, has_relative_attention_bias=bool(i == 0), ) for i in range(hidden_layers) @@ -107,7 +105,6 @@ def __init__( init_method=init_method, output_layer_init_method=scaled_init_method, layer_idx=i, - model_type=model_type, has_relative_attention_bias=bool(i - hidden_layers == 0), ) for i in range(hidden_layers, 2 * hidden_layers) @@ -127,12 +124,7 @@ def __init__( self.encoder_states = None self.past_length = 0 - if model_type == "mt5": - self.lm_head = Linear( - hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1 - ) - else: - self.lm_head = LMLogits(vocab_size, bias=False) + self.lm_head = Linear(hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1) @classmethod def from_config(cls, cfg): @@ -150,7 +142,7 @@ def from_config(cls, cfg): "initializer_range": cfg.initializer_range, "layernorm_eps": cfg.layernorm_eps, "amp_enabled": cfg.amp_enabled, - "model_type": cfg.model_type, + # "model_type": cfg.model_type, } def forward( @@ -177,8 +169,10 @@ def forward( encoder_decoder_position_bias = None self.set_cache(encoder_states=None, past_key_values=None) encoder_attn_mask = self.extended_attn_mask(encoder_attn_mask) - enc_embedding_output = self.embedding(encoder_input_ids) - enc_hidden_states = enc_embedding_output + + enc_hidden_states = self.embedding(encoder_input_ids) + + enc_hidden_states = enc_hidden_states.transpose(0, 1) for layer in self.encoder.layers: enc_hidden_states, position_bias = layer( @@ -193,8 +187,10 @@ def forward( ) encoder_decoder_attn_mask = self.extended_attn_mask(encoder_decoder_attn_mask) - dec_embedding_output = self.embedding(decoder_input_ids) - dec_hidden_states = dec_embedding_output + dec_hidden_states = self.embedding(decoder_input_ids) + + dec_hidden_states = dec_hidden_states.transpose(0, 1) + if use_cache: presents = [] @@ -219,10 +215,7 @@ def forward( decoder_states = self.decoder.final_layernorm(dec_hidden_states) - if self.model_type == "mt5": - logits = self.lm_head(decoder_states) - else: - logits = self.lm_head(decoder_states, self.embedding.word_embeddings.weight) + logits = self.lm_head(decoder_states) return logits @@ -271,7 +264,7 @@ def forward( encoder_decoder_attn_mask, use_cache=use_cache, ) - + logits = logits.transpose(0, 1) if lm_labels is not None: lm_loss = self.loss_func(logits, lm_labels, loss_mask) return lm_loss @@ -342,3 +335,14 @@ def set_pipeline_stage_id(model): dist_utils.get_layer_stage_id(model.t5_model.decoder.final_layernorm.layer_idx), dist.get_layer_placement(model.t5_model.decoder.final_layernorm.layer_idx), ) + + @staticmethod + def set_activation_checkpoint(model): + for module_block in model.modules(): + # Old API in OneFlow 0.8 + if hasattr(module_block, "origin"): + if isinstance(module_block.origin, TransformerLayer): + 
module_block.config.activation_checkpointing = True + else: + if isinstance(module_block.to(nn.Module), TransformerLayer): + module_block.to(nn.graph.GraphModule).activation_checkpointing = True diff --git a/projects/T5/models/transformer_layer.py b/projects/T5/models/transformer_layer.py index c23cb903d..9b3f8f671 100644 --- a/projects/T5/models/transformer_layer.py +++ b/projects/T5/models/transformer_layer.py @@ -19,7 +19,7 @@ from libai.utils import distributed as dist from projects.T5.models.attention import MultiheadAttention from projects.T5.models.layer_norm import LayerNorm -from projects.T5.models.mlp import MT5MLP, T5MLP +from projects.T5.models.mlp import MT5MLP class TransformerLayer(nn.Module): @@ -60,7 +60,6 @@ def __init__( output_layer_init_method=None, *, layer_idx=0, - model_type="t5", has_relative_attention_bias=False ): super().__init__() @@ -104,24 +103,14 @@ def __init__( self.post_cross_attention_layernorm = LayerNorm( self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx ) - if model_type == "mt5": - self.mlp = MT5MLP( - self.hidden_size, - self.ffn_hidden_size, - self.output_dropout_prob, - self.init_method, - output_layer_init_method=self.output_layer_init_method, - layer_idx=self.layer_idx, - ) - elif model_type == "t5": - self.mlp = T5MLP( - self.hidden_size, - self.ffn_hidden_size, - self.output_dropout_prob, - self.init_method, - output_layer_init_method=self.output_layer_init_method, - layer_idx=self.layer_idx, - ) + self.mlp = MT5MLP( + self.hidden_size, + self.ffn_hidden_size, + self.output_dropout_prob, + self.init_method, + output_layer_init_method=self.output_layer_init_method, + layer_idx=self.layer_idx, + ) def forward( self, @@ -152,10 +141,8 @@ def forward( use_cache: it will be set to `True` when the model is in the inference phase and used for incremental decoding. 
""" - # Change placement for pipeline parallelsim hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx)) - # hidden_states shape: (batch_size, seq_length, hidden_size) if attention_mask is not None: attention_mask = attention_mask.to_global( placement=dist.get_layer_placement(self.layer_idx) diff --git a/tests/model_utils/test_t5_loader.py b/tests/model_utils/test_t5_loader.py index 62587b3cb..2bd1d59e9 100644 --- a/tests/model_utils/test_t5_loader.py +++ b/tests/model_utils/test_t5_loader.py @@ -26,15 +26,15 @@ from libai.utils import distributed as dist from libai.utils.file_utils import get_data_from_cache from libai.utils.logger import setup_logger -from projects.MT5.configs.mt5_base import cfg as libai_cfg -from projects.MT5.mt5_model import MT5Model from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace +from projects.T5.configs.t5_model_config import cfg as libai_cfg +from projects.T5.models.t5_model import T5Model -PRETRAINED_MODEL_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/t5_utils/pytorch_model.bin" # noqa -PRETRAINED_MODEL_CONFIG_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/t5_utils/config.json" # noqa +PRETRAINED_MODEL_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/mt5_utils/pytorch_model.bin" # noqa +PRETRAINED_MODEL_CONFIG_URL = "http://oneflow-static.oss-cn-beijing.aliyuncs.com/ci-files/dataset/libai/model_utils_test/mt5_utils/config.json" # noqa -PRETRAINED_MODEL_MD5 = "952862a8ba425a25739a69e5f33b0df8" -PRETRAINED_MODEL_CONFIG_MD5 = "7ebc91dc4377c01190f4116c3c1ac6cd" +PRETRAINED_MODEL_MD5 = "4c9c0be541b89de9b01c597ec4cc371a" +PRETRAINED_MODEL_CONFIG_MD5 = "b159e41603b7eeaf9a9c489165bbcaca" TEST_OUTPUT = os.path.join(os.getenv("TEST_OUTPUT", "output_unittest"), "test_t5_utils") @@ -64,18 +64,21 @@ def setUp(self) -> None: [101, 2009, 1005, 1055, 2986, 2651, 1012, 102], [101, 2028, 12314, 3377, 102, 0, 0, 0], [101, 2064, 2017, 3305, 2009, 102, 0, 0], + [101, 2064, 2017, 3305, 2009, 102, 0, 0], ] self.encoder_att_mask = [ [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1], ] self.decoder_input_ids = [ [101, 2009, 1005, 1055, 2986], [101, 2028, 12314, 3377, 102], [101, 2064, 2017, 3305, 2009], + [101, 2064, 2017, 3305, 2009], ] - self.decoder_att_mask = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]] + self.decoder_att_mask = [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]] @classmethod def tearDownClass(cls) -> None: @@ -96,13 +99,13 @@ def test_t5_loader_with_data_tensor_parallel(self): # load model load_func = T5LoaderHuggerFace( - model=MT5Model, + model=T5Model, libai_cfg=libai_cfg, pretrained_model_path=self.pretrained_model_path, hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, embedding_dropout_prob=0.0, - model_type="t5", + model_type="mt5", ) model = load_func.load() model.eval() @@ -121,13 +124,13 @@ def test_t5_loader_with_data_tensor_parallel(self): ) encode_att_mask = flow.tensor( self.encoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) decoder_att_mask = flow.tensor( self.decoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) @@ -137,7 +140,7 @@ def 
test_t5_loader_with_data_tensor_parallel(self): ) self.assertTrue( np.allclose( - np.array(-9836561.0), + np.array(-1.1011268e08), logits.sum().data.numpy(), ) ) @@ -150,20 +153,20 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): data_parallel_size=2, tensor_parallel_size=1, pipeline_parallel_size=2, - pipeline_num_layers=24, + pipeline_num_layers=16, ) ) dist.setup_dist_util(dist_cfg) # load model load_func = T5LoaderHuggerFace( - model=MT5Model, + model=T5Model, libai_cfg=libai_cfg, pretrained_model_path=self.pretrained_model_path, hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, embedding_dropout_prob=0.0, - model_type="t5", + model_type="mt5", ) model = load_func.load() model.eval() @@ -182,13 +185,13 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): ) encode_att_mask = flow.tensor( self.encoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) decoder_att_mask = flow.tensor( self.decoder_att_mask, - dtype=flow.long, + dtype=flow.bool, sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=dist.get_layer_placement(0), ) @@ -198,7 +201,7 @@ def test_t5_loader_with_data_tensor_pipeline_parallel(self): ) self.assertTrue( np.allclose( - np.array(-9836561.0), + np.array(-1.1011268e08), logits.sum().data.numpy(), ) )
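
For reference, below is a minimal standalone sketch (not part of the patch) of two conventions the changes above rely on: the (seq_len, batch, hidden) activation layout introduced by the `.transpose(0, 1)` calls around the encoder/decoder, and the `flow._C.rms_norm` call that replaces `flow._C.rms_layer_norm`. It assumes an OneFlow build that exposes these functional ops with the signatures used in the patch itself; tensor names and sizes are illustrative only.

```python
# Illustrative sketch only, mirroring the call sites introduced in the patch.
# Assumes an OneFlow build that exposes flow._C.rms_norm; shapes are arbitrary.
import oneflow as flow

batch_size, seq_length, hidden_size = 4, 512, 512
eps = 1e-6

# The patch feeds embeddings to the encoder/decoder in (seq_len, batch, hidden)
# layout, hence the .transpose(0, 1) after the embedding lookup and the
# logits.transpose(0, 1) before the loss is computed.
hidden_states = flow.randn(batch_size, seq_length, hidden_size)
hidden_states = hidden_states.transpose(0, 1)  # -> (seq_len, batch, hidden)

# RMSNorm as called in the patched LayerNorm.forward:
# flow._C.rms_norm(x, weight, normalized_shape, eps) normalizes over the
# trailing dims given by weight.shape and scales by the learned weight.
weight = flow.ones(hidden_size)
normed = flow._C.rms_norm(hidden_states, weight, weight.shape, eps)
print(normed.shape)  # same shape as the input: (512, 4, 512)
```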