Skip to content

Commit 2bca384

Browse files
authored
BUGFIX: layernorm mean, rstd
The mean and rstd tensors were allocated on the default "cuda" device instead of on x.device, which breaks when the input lives on a non-default CUDA device; allocate them with device=x.device.
1 parent da2626b commit 2bca384

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

mamba_ssm/ops/triton/layernorm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,8 @@ def _layer_norm_fwd(
143143
assert residual_out.stride(-1) == 1
144144
else:
145145
residual_out = None
146-
mean = torch.empty((M,), dtype=torch.float32, device="cuda") if not is_rms_norm else None
147-
rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
146+
mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None
147+
rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
148148
# Less than 64KB per feature: enqueue fused kernel
149149
MAX_FUSED_SIZE = 65536 // x.element_size()
150150
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))

0 commit comments

Comments (0)