diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 13e9e0230ca8d..de67d2f65bcc0 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -840,8 +840,8 @@ void cuLoadWriteStridedInputs( { int i1 = i1_block+thr_load_row_off; if (i1 < i1_end) { - T curr_mean = mean[i1]; - T curr_rstd = rstd[i1]; + T_ACC curr_mean = mean[i1]; + T_ACC curr_rstd = rstd[i1]; for (int k = 0; k < blockDim.y; ++k) { int i2 = i2_off + k; int load_idx = i1*N+i2; diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 5d7d77dcd4134..83d0afe8eac5f 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -1092,7 +1092,8 @@ def _get_amdsmi_device_index(device: Optional[Union[int, Device]]) -> int: def _get_amdsmi_memory_usage(device: Optional[Union[Device, int]] = None) -> int: handle = _get_amdsmi_handler() device = _get_amdsmi_device_index(device) - return amdsmi.amdsmi_get_gpu_vram_usage(handle)["vram_used"] + handle = amdsmi.amdsmi_get_processor_handles()[device] + return amdsmi.amdsmi_get_gpu_activity(handle)["umc_activity"] def _get_amdsmi_utilization(device: Optional[Union[Device, int]] = None) -> int: