diff --git a/bitblas/gpu/intrin/lop3.py b/bitblas/gpu/intrin/lop3.py index a6a7011a0..aee3eac8b 100644 --- a/bitblas/gpu/intrin/lop3.py +++ b/bitblas/gpu/intrin/lop3.py @@ -1127,7 +1127,7 @@ def fast_decode_desc(compressed: T.handle, decompressed: T.handle, scale: T.hand 1, ], dtype=target_dtype, - scope="global", + scope="local", ) with T.block("root"): T.reads(Compressed[0:n_storage_elems], Scale[0:1]) @@ -1173,7 +1173,7 @@ def fast_decode_impl(compressed: T.handle, decompressed: T.handle, scale: T.hand dtype=target_dtype, offset_factor=1, strides=[s0], - scope="global", + scope="local", ) with T.block("root"): T.reads(Compressed[0:n_storage_elems], Scale[0:1])