From 9ef14e9655a62ce0a303f8459b8259794c146b29 Mon Sep 17 00:00:00 2001
From: LeiWang1999 <leiwang1999@outlook.com>
Date: Sun, 1 Sep 2024 08:51:26 +0000
Subject: [PATCH] Remove shared memory hack

---
 bitblas/gpu/matmul_mma.py            | 8 ++++----
 bitblas/gpu/matmul_mma_dequantize.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bitblas/gpu/matmul_mma.py b/bitblas/gpu/matmul_mma.py
index 5d92f99b1..3dafd3959 100644
--- a/bitblas/gpu/matmul_mma.py
+++ b/bitblas/gpu/matmul_mma.py
@@ -571,9 +571,9 @@ def fetch_to_shared(block, idx, vec_len, can_swizzle=False, is_smooth=False, red
             # Apply Swizzling
             sch.annotate(block_read, ann_key="permuted_layout", ann_val=can_swizzle)
             # if not, apply padding to alleviate bank conflict
-            if not (can_swizzle or is_smooth):
-                pad_offset = 8 if intrin_info.in_dtype == "float16" else 16
-                sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset)
+            # if not (can_swizzle or is_smooth):
+            #     pad_offset = 8 if intrin_info.in_dtype == "float16" else 16
+            #     sch.storage_align(block_read, 0, axis=-2, factor=16, offset=pad_offset)
             sch.annotate(f_2, "pragma_unroll_explicit", False)
             return block_read
 
@@ -648,7 +648,7 @@ def inverse_permutation(i, j, ii, jj):
             auto_inline_consumer_chain(sch, accumulator_shared_to_global)
             sch.reverse_compute_at(
                 accumulator_shared_to_global,
-                sch.get_loops(store)[-5],
+                sch.get_loops(store)[-6],
                 preserve_unit_loops=True,
             )
             vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global))
diff --git a/bitblas/gpu/matmul_mma_dequantize.py b/bitblas/gpu/matmul_mma_dequantize.py
index 6bc0e39bc..f6f1e0989 100644
--- a/bitblas/gpu/matmul_mma_dequantize.py
+++ b/bitblas/gpu/matmul_mma_dequantize.py
@@ -578,7 +578,7 @@ def get_idx():
             auto_inline_consumer_chain(sch, accumulator_shared_to_global)
             sch.reverse_compute_at(
                 accumulator_shared_to_global,
-                sch.get_loops(store)[-5],
+                sch.get_loops(store)[-6],
                 preserve_unit_loops=True,
             )
             vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global))
@@ -1075,7 +1075,7 @@ def get_idx():
             auto_inline_consumer_chain(sch, accumulator_shared_to_global)
             sch.reverse_compute_at(
                 accumulator_shared_to_global,
-                sch.get_loops(store)[-5],
+                sch.get_loops(store)[-6],
                 preserve_unit_loops=True,
             )
             vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global))
@@ -1675,7 +1675,7 @@ def get_idx():
             auto_inline_consumer_chain(sch, accumulator_shared_to_global)
             sch.reverse_compute_at(
                 accumulator_shared_to_global,
-                sch.get_loops(store)[-5],
+                sch.get_loops(store)[-6],
                 preserve_unit_loops=True,
             )
             vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global))
@@ -2194,7 +2194,7 @@ def get_idx():
             auto_inline_consumer_chain(sch, accumulator_shared_to_global)
             sch.reverse_compute_at(
                 accumulator_shared_to_global,
-                sch.get_loops(store)[-5],
+                sch.get_loops(store)[-6],
                 preserve_unit_loops=True,
             )
             vec_len = get_coalesced_veclen(sch.get(accumulator_shared_to_global))