diff --git a/setup.py b/setup.py index e18283622c4..7818a598244 100644 --- a/setup.py +++ b/setup.py @@ -129,9 +129,11 @@ def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "torchvision", "csrc") - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob( - os.path.join(extensions_dir, "ops", "*.cpp") - ) + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + main_file = ( + glob.glob(os.path.join(extensions_dir, "*.cpp")) + + glob.glob(os.path.join(extensions_dir, "ops", "*.cpp")) + + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + ) source_cpu = ( glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) diff --git a/test/test_ops.py b/test/test_ops.py index 5747038a3cb..787521722c5 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -122,8 +122,7 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist tol = 5e-3 else: tol = 4e-3 - - if x_dtype == torch.bfloat16: + elif x_dtype == torch.bfloat16: tol = 5e-3 pool_size = 5 @@ -509,7 +508,7 @@ def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype, - ) + ) @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @@ -730,7 +729,7 @@ def _create_tensors_with_iou(self, N, iou_thresh): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("seed", range(10)) - def test_nms_ref(self, iou, seed): + def test_nms_ref(self, iou, seed, dtype=torch.float): torch.random.manual_seed(seed) err_msg = "NMS incompatible between CPU and reference implementation for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) @@ -738,6 +737,11 @@ def test_nms_ref(self, iou, seed): keep = ops.nms(boxes, scores, iou) torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) + if dtype == torch.bfloat16: + keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + torch.testing.assert_close(keep_ref_float, keep_dtype) + def test_nms_input_errors(self): with pytest.raises(RuntimeError): ops.nms(torch.rand(4), torch.rand(3), 0.5) @@ -769,17 +773,6 @@ def test_qnms(self, iou, scale, zero_point): torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cpu(self, iou, dtype=torch.float): - err_msg = "NMS incompatible between float and {dtype} for IoU={}" - - boxes, scores = self._create_tensors_with_iou(1000, iou) - r_ref = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) - r_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) - - is_eq = torch.allclose(r_ref, r_dtype) - assert is_eq, err_msg.format(iou) - @pytest.mark.parametrize( "device", ( @@ -815,7 +808,7 @@ def test_autocast(self, iou, dtype): @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) def test_autocast_cpu(self, iou, dtype): with torch.cpu.amp.autocast(): - self.test_nms_cpu(iou=iou, dtype=dtype) + self.test_nms_ref(iou=iou, seed=0, dtype=dtype) @pytest.mark.parametrize( "device", diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp index e3ee94d390a..2acd0f5d0dc 100644 --- a/torchvision/csrc/ops/autocast/nms_kernel.cpp +++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp @@ -9,13 +9,13 @@ namespace ops { namespace { -template +template at::Tensor nms_autocast( const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key); - + return nms( at::autocast::cached_cast(at::kFloat, dets, device_type), at::autocast::cached_cast(at::kFloat, scores, device_type), @@ -25,11 +25,17 @@ at::Tensor nms_autocast( } // namespace TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { - m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); + m.impl( + TORCH_SELECTIVE_NAME("torchvision::nms"), + TORCH_FN( + (nms_autocast))); } TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { - m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); + m.impl( + TORCH_SELECTIVE_NAME("torchvision::nms"), + TORCH_FN( + (nms_autocast))); } } // namespace ops diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp index 8748ef73c1d..919393a5ef0 100644 --- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp @@ -9,7 +9,7 @@ namespace ops { namespace { -template +template at::Tensor roi_align_autocast( const at::Tensor& input, const at::Tensor& rois, @@ -35,13 +35,17 @@ at::Tensor roi_align_autocast( TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { m.impl( TORCH_SELECTIVE_NAME("torchvision::roi_align"), - TORCH_FN((roi_align_autocast))); + TORCH_FN((roi_align_autocast< + c10::DispatchKey::Autocast, + c10::DeviceType::CUDA>))); } TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { m.impl( TORCH_SELECTIVE_NAME("torchvision::roi_align"), - TORCH_FN((roi_align_autocast))); + TORCH_FN((roi_align_autocast< + c10::DispatchKey::AutocastCPU, + c10::DeviceType::CPU>))); } } // namespace ops