ROCm · mozga-amd · Nov 12, 2024 · Dec 16, 2024 · Dec 19, 2024 · Dec 20, 2024
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -16,6 +16,7 @@ using OutElementOp = PassThrough;
 
 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
+    // clang-format off
     ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Wmma_CShuffle<
         NDimSpatial,
         ck::tensor_layout::convolution::GNDHWC,
@@ -52,11 +53,11 @@ using DeviceConvBwdWeightInstance =
         1,                    // BBlockTransferSrcScalarPerVector
         8,                    // BBlockTransferDstScalarPerVector_BK1
         true,                 // BBlockLdsExtraN
-        4,
-        2,
-        S<1, 32, 1, 8>,
-        1>;
-
+        4,                    // CShuffleMXdlPerWavePerShuffle
+        2,                    // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        1>;                   // CBlockTransferScalarPerVector_NWaveNPerXdl
+// clang-format on
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                                      InDataType,

@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -17,6 +17,7 @@ using OutElementOp = PassThrough;
 
 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
+    // clang-format off
     ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
         NDimSpatial,
         ck::tuple_element_t<NDimSpatial - 1,
@@ -42,7 +43,7 @@ using DeviceConvBwdWeightInstance =
         256,                  // BlockSize
         128,                  // MPerBlock
         128,                  // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
         8,                    // K1
         32,                   // MPerXdl
         32,                   // NPerXdl
@@ -52,20 +53,21 @@ using DeviceConvBwdWeightInstance =
         S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
         S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
         2,                    // ABlockTransferSrcVectorDim
-        8,                    // ABlockTransferSrcScalarPerVector
+        1,                    // ABlockTransferSrcScalarPerVector
         2,                    // ABlockTransferDstScalarPerVector_K1
         true,                 // ABlockLdsAddExtraM
         S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
         S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
         S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
         2,                    // BBlockTransferSrcVectorDim
-        8,                    // BBlockTransferSrcScalarPerVector
+        1,                    // BBlockTransferSrcScalarPerVector
         2,                    // BBlockTransferDstScalarPerVector_K1
         true,                 // BBlockLdsAddExtraN
         1,                    // CShuffleMXdlPerWavePerShuffle
         1,                    // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
         128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+                                                 // clang-format on
 
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,

@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -41,7 +41,7 @@ using DeviceConvBwdWeightInstance =
         256,                  // BlockSize
         128,                  // MPerBlock
         128,                  // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
         8,                    // K1
         32,                   // MPerXdl
         32,                   // NPerXdl
@@ -51,16 +51,16 @@ using DeviceConvBwdWeightInstance =
         S<0, 3, 1, 2>,        // ABlockTransferThreadClusterArrangeOrder
         S<0, 2, 1, 3>,        // ABlockTransferSrcAccessOrder
         2,                    // ABlockTransferSrcVectorDim
-        8,                    // ABlockTransferSrcScalarPerVector
+        1,                    // ABlockTransferSrcScalarPerVector
         2,                    // ABlockTransferDstScalarPerVector_K1
-        true,                 // ABlockLdsAddExtraM
+        false,                // ABlockLdsAddExtraM
         S<1, 4, 16, 4>,       // BBlockTransferThreadClusterLengths_K0_N_K1
         S<0, 3, 1, 2>,        // BBlockTransferThreadClusterArrangeOrder
         S<0, 2, 1, 3>,        // BBlockTransferSrcAccessOrder
         2,                    // BBlockTransferSrcVectorDim
-        8,                    // BBlockTransferSrcScalarPerVector
+        1,                    // BBlockTransferSrcScalarPerVector
         2,                    // BBlockTransferDstScalarPerVector_K1
-        true,                 // BBlockLdsAddExtraN
+        false,                // BBlockLdsAddExtraN
         1,                    // CShuffleMXdlPerWavePerShuffle
         1,                    // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock

@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "common.hpp"
 
@@ -18,6 +18,7 @@ using OutElementOp = PassThrough;
 
 template <ck::index_t NDimSpatial>
 using DeviceConvBwdWeightInstance =
+    // clang-format off
     ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
         NDimSpatial,
         ck::tuple_element_t<NDimSpatial - 1,
@@ -43,7 +44,7 @@ using DeviceConvBwdWeightInstance =
         256,                  // BlockSize
         128,                  // MPerBlock
         128,                  // NPerBlock
-        4,                    // K0PerBlock
+        32,                   // K0PerBlock
         8,                    // K1
         32,                   // MPerXdl
         32,                   // NPerXdl
@@ -67,8 +68,11 @@ using DeviceConvBwdWeightInstance =
         1,                    // CShuffleNXdlPerWavePerShuffle
         S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
         2,                    // CBlockTransferScalarPerVector_NWaveNPerXdl
-        ComputeTypeA,         // ComputeTypeA
-        ComputeTypeB>;        // ComputeTypeB
+        ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
+        ck::BlockGemmPipelineVersion::v1,          // BlkGemmPipelineVer
+        ComputeTypeA,                              // ComputeTypeA
+        ComputeTypeB>;                             // ComputeTypeB
+// clang-format on
 
 template <ck::index_t NDimSpatial>
 using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,