diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index 9f456ba30..e33ded613 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -219,6 +219,7 @@
     "nn.functional.ctc_loss",
     "nn.functional.channel_shuffle",
     "nn.functional.multi_head_attention_forward",
+    "nn.TransformerEncoderLayer",
     "sigmoid",
     "logsigmoid",
     "sgn",
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index 86e9d4d11..7075c9889 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -5859,6 +5859,13 @@
     XPU: _dirichlet_grad_xpu
   autogen: _dirichlet_grad.out
 
+# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
+- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
+  variants: function
+  dispatch:
+    XPU: transformer_encoder_layer_forward
+  autogen: _transformer_encoder_layer_fwd.out
+
 # Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
 - func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
   dispatch:
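
Rough usage sketch (not part of the patch): the newly registered operator is normally reached through the nn.TransformerEncoderLayer inference fast path rather than called directly. The snippet below assumes an available XPU device and a PyTorch build that includes this change; whether the fast path actually fires depends on the layer configuration (eval mode, no autograd, supported activation and eps settings).

```python
import torch
import torch.nn as nn

# Assumption: an "xpu" device is visible and this op is registered for it.
layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True).to("xpu").eval()
src = torch.randn(2, 16, 64, device="xpu")

with torch.no_grad():
    # On the fast path this dispatches to torch._transformer_encoder_layer_fwd,
    # which the yaml change above maps to the XPU transformer_encoder_layer_forward kernel.
    out = layer(src)

print(out.shape)  # torch.Size([2, 16, 64])
```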