diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml index 3871dca340f..8da613cf194 100644 --- a/.github/workflows/build-cmake.yml +++ b/.github/workflows/build-cmake.yml @@ -26,6 +26,7 @@ jobs: runner: ${{ matrix.runner }} gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main script: | set -euo pipefail @@ -46,6 +47,7 @@ jobs: with: repository: pytorch/vision runner: ${{ matrix.runner }} + test-infra-ref: main script: | set -euo pipefail @@ -71,6 +73,7 @@ jobs: runner: ${{ matrix.runner }} gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main script: | set -euo pipefail diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 779da13e3a2..51cc77b3cea 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -18,6 +18,7 @@ jobs: with: repository: pytorch/vision upload-artifact: docs + test-infra-ref: main script: | set -euo pipefail @@ -25,7 +26,7 @@ jobs: export GPU_ARCH_TYPE=cpu export GPU_ARCH_VERSION='' ./.github/scripts/setup-env.sh - + # Prepare conda CONDA_PATH=$(which conda) eval "$(${CONDA_PATH} shell.bash hook)" @@ -36,13 +37,13 @@ jobs: # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't # have to pay attention in all other workflows? export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" - + cd docs - + echo '::group::Install doc requirements' pip install --progress-bar=off -r requirements.txt echo '::endgroup::' - + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then echo '::group::Enable version string sanitization' # This environment variable just has to exist and must not be empty. The actual value is arbitrary. @@ -66,9 +67,9 @@ jobs: cp $file build/html/_generated_ipynb_notebooks/ fi done - + cp -r build/html "${RUNNER_ARTIFACT_DIR}" - + # On PRs we also want to upload the docs into our S3 bucket for preview. if [[ ${{ github.event_name == 'pull_request' }} ]]; then cp -r build/html/* "${RUNNER_DOCS_DIR}" @@ -85,9 +86,10 @@ jobs: repository: pytorch/vision download-artifact: docs ref: gh-pages + test-infra-ref: main script: | set -euo pipefail - + REF_TYPE=${{ github.ref_type }} REF_NAME=${{ github.ref_name }} @@ -112,14 +114,14 @@ jobs: rm -rf "${TARGET_FOLDER}"/* mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" git add "${TARGET_FOLDER}" || true - + if [[ "${TARGET_FOLDER}" == main ]]; then mkdir -p _static rm -rf _static/* cp -r "${TARGET_FOLDER}"/_static/* _static git add _static || true fi - + git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' git config http.postBuffer 524288000 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 8203bb61e4f..b8dc5566cc7 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,6 +14,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/vision + test-infra-ref: main script: | set -euo pipefail @@ -23,14 +24,14 @@ jobs: conda create --name ci --quiet --yes python=3.8 pip conda activate ci echo '::endgroup::' - + echo '::group::Install lint tools' pip install --progress-bar=off pre-commit echo '::endgroup::' - + set +e pre-commit run --all-files - + if [ $? 
-ne 0 ]; then git --no-pager diff exit 1 @@ -40,9 +41,10 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/vision + test-infra-ref: main script: | set -euo pipefail - + echo '::group::Setup environment' CONDA_PATH=$(which conda) eval "$(${CONDA_PATH} shell.bash hook)" @@ -53,7 +55,7 @@ jobs: conda activate ci export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" echo '::endgroup::' - + echo '::group::Install lint tools' curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o ./clang-format chmod +x ./clang-format @@ -62,7 +64,7 @@ jobs: echo '::group::Lint C source' set +e ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format - + if [ $? -ne 0 ]; then git --no-pager diff exit 1 @@ -74,23 +76,24 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/vision + test-infra-ref: main script: | set -euo pipefail - + export PYTHON_VERSION=3.8 export GPU_ARCH_TYPE=cpu export GPU_ARCH_VERSION='' ./.github/scripts/setup-env.sh - + CONDA_PATH=$(which conda) eval "$(${CONDA_PATH} shell.bash hook)" conda activate ci - + echo '::group::Install lint tools' pip install --progress-bar=off mypy echo '::endgroup::' - + echo '::group::Lint Python types' mypy --install-types --non-interactive --config-file mypy.ini echo '::endgroup::' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 22e1a4ac18d..eb6290fdfe9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,6 +33,7 @@ jobs: gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} timeout: 120 + test-infra-ref: main script: | set -euo pipefail @@ -41,7 +42,7 @@ jobs: export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} ./.github/scripts/unittest.sh - + unittests-macos: strategy: matrix: @@ -62,6 +63,7 @@ jobs: # and needs roughly 2 hours to just run the test suite timeout: 240 runner: ${{ matrix.runner }} + test-infra-ref: main script: | set -euo pipefail @@ -94,6 +96,7 @@ jobs: gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} timeout: 120 + test-infra-ref: main script: | set -euxo pipefail @@ -102,13 +105,14 @@ jobs: export VSDEVCMD_ARGS="" export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} - + ./.github/scripts/unittest.sh onnx: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/vision + test-infra-ref: main script: | set -euo pipefail @@ -117,20 +121,20 @@ jobs: export GPU_ARCH_VERSION='' ./.github/scripts/setup-env.sh - + # Prepare conda CONDA_PATH=$(which conda) eval "$(${CONDA_PATH} shell.bash hook)" conda activate ci - + echo '::group::Install ONNX' pip install --progress-bar=off onnx onnxruntime echo '::endgroup::' - + echo '::group::Install testing utilities' pip install --progress-bar=off pytest echo '::endgroup::' - + echo '::group::Run ONNX tests' pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py echo '::endgroup::' @@ -139,6 +143,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/vision + test-infra-ref: main script: | set -euo pipefail @@ -147,21 +152,21 @@ jobs: export GPU_ARCH_VERSION='' ./.github/scripts/setup-env.sh - + # Prepare conda CONDA_PATH=$(which conda) eval "$(${CONDA_PATH} shell.bash hook)" conda activate ci - + echo '::group::Pre-download 
model weights' pip install --progress-bar=off aiohttp aiofiles tqdm python scripts/download_model_urls.py echo '::endgroup::' - + echo '::group::Install testing utilities' pip install --progress-bar=off pytest echo '::endgroup::' - + echo '::group::Run extended unittests' export PYTORCH_TEST_WITH_EXTENDED=1 pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 665d833b60a..8e4889b9ba7 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -18,6 +18,7 @@ jobs: with: repository: pytorch/vision required_checks: "Build Linux,Build M1,Build Macos,Build Windows,Tests,CMake,Lint,Docs" + test-infra-ref: main secrets: ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} GITHUB_DEPLOY_KEY : ${{ secrets.VISION_GITHUB_DEPLOY_KEY }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b41c0fe8939..32a89df7792 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -226,3 +226,5 @@ please read [GitHub's guides](https://docs.github.com/en/github/collaborating-wi By contributing to Torchvision, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. + +Contributors are also required to [sign our Contributor License Agreement](https://code.facebook.com/cla). diff --git a/README.md b/README.md index 373b6b79548..e9de2b833cf 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Torchvision currently supports the following video backends: conflicting version of ffmpeg installed. Currently, this is only supported on Linux. ``` -conda install -c conda-forge ffmpeg +conda install -c conda-forge 'ffmpeg<4.3' python setup.py install ``` diff --git a/docs/source/conf.py b/docs/source/conf.py index cd3a28658cb..a3be2282a47 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -371,7 +371,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): used within the autoclass directive. """ - if getattr(obj, ".__name__", "").endswith(("_Weights", "_QuantizedWeights")): + if getattr(obj, "__name__", "").endswith(("_Weights", "_QuantizedWeights")): if len(obj) == 0: lines[:] = ["There are no available pre-trained weights."] diff --git a/references/segmentation/README.md b/references/segmentation/README.md index 2c7391c8380..2c8e581dac1 100644 --- a/references/segmentation/README.md +++ b/references/segmentation/README.md @@ -1,7 +1,7 @@ # Semantic segmentation reference training scripts This folder contains reference training scripts for semantic segmentation. -They serve as a log of how to train specific models, as provide baseline +They serve as a log of how to train specific models and provide baseline training and evaluation scripts to quickly bootstrap research. All models have been trained on 8x V100 GPUs. 
diff --git a/test/assets/fakedata/draw_boxes_util.png b/test/assets/fakedata/draw_boxes_util.png index d38f8be78ac..ee5dac329e0 100644 Binary files a/test/assets/fakedata/draw_boxes_util.png and b/test/assets/fakedata/draw_boxes_util.png differ diff --git a/test/common_utils.py b/test/common_utils.py index a1d188efdae..99c7931587d 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -406,6 +406,7 @@ def make_bounding_boxes( canvas_size=DEFAULT_SIZE, *, format=tv_tensors.BoundingBoxFormat.XYXY, + num_boxes=1, dtype=None, device="cpu", ): @@ -419,8 +420,7 @@ def sample_position(values, max_value): dtype = dtype or torch.float32 - num_objects = 1 - h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size] + h, w = [torch.randint(1, s, (num_boxes,)) for s in canvas_size] y = sample_position(h, canvas_size[0]) x = sample_position(w, canvas_size[1]) @@ -443,12 +443,11 @@ def sample_position(values, max_value): ) -def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"): +def make_detection_masks(size=DEFAULT_SIZE, *, num_masks=1, dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" - num_objects = 1 return tv_tensors.Mask( torch.testing.make_tensor( - (num_objects, *size), + (num_masks, *size), low=0, high=2, dtype=dtype or torch.bool, diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py deleted file mode 100644 index b26bcff3246..00000000000 --- a/test/prototype_common_utils.py +++ /dev/null @@ -1,82 +0,0 @@ -import collections.abc -import dataclasses -from typing import Optional, Sequence - -import pytest -import torch -from torch.nn.functional import one_hot - -from torchvision.prototype import tv_tensors - -from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader - - -@dataclasses.dataclass -class LabelLoader(TensorLoader): - categories: Optional[Sequence[str]] - - -def _parse_categories(categories): - if categories is None: - num_categories = int(torch.randint(1, 11, ())) - elif isinstance(categories, int): - num_categories = categories - categories = [f"category{idx}" for idx in range(num_categories)] - elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): - categories = list(categories) - num_categories = len(categories) - else: - raise pytest.UsageError( - f"`categories` can either be `None` (default), an integer, or a sequence of strings, " - f"but got '{categories}' instead." - ) - return categories, num_categories - - -def make_label_loader(*, extra_dims=(), categories=None, dtype=torch.int64): - categories, num_categories = _parse_categories(categories) - - def fn(shape, dtype, device): - # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, - # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 - data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) - return tv_tensors.Label(data, categories=categories) - - return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories) - - -make_label = from_loader(make_label_loader) - - -@dataclasses.dataclass -class OneHotLabelLoader(TensorLoader): - categories: Optional[Sequence[str]] - - -def make_one_hot_label_loader(*, categories=None, extra_dims=(), dtype=torch.int64): - categories, num_categories = _parse_categories(categories) - - def fn(shape, dtype, device): - if num_categories == 0: - data = torch.empty(shape, dtype=dtype, device=device) - else: - # The idiom `make_label_loader(..., dtype=torch.int64); ...; one_hot(...).to(dtype)` is intentional - # since `one_hot` only supports int64 - label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device) - data = one_hot(label, num_classes=num_categories).to(dtype) - return tv_tensors.OneHotLabel(data, categories=categories) - - return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories) - - -def make_one_hot_label_loaders( - *, - categories=(1, 0, None), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.int64, torch.float32), -): - for params in combinations_grid(categories=categories, extra_dims=extra_dims, dtype=dtypes): - yield make_one_hot_label_loader(**params) - - -make_one_hot_labels = from_loaders(make_one_hot_label_loaders) diff --git a/test/smoke_test.py b/test/smoke_test.py index 6cc07c00aed..d672d46ad9e 100644 --- a/test/smoke_test.py +++ b/test/smoke_test.py @@ -59,7 +59,7 @@ def smoke_test_torchvision_resnet50_classify(device: str = "cpu") -> None: model.eval() # Step 2: Initialize the inference transforms - preprocess = weights.transforms() + preprocess = weights.transforms(antialias=(device != "mps")) # antialias not supported on MPS # Step 3: Apply inference preprocessing transforms batch = preprocess(img).unsqueeze(0) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index fb3f5744e54..15fa4c391b6 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -2,7 +2,6 @@ import itertools import math import os -import warnings from functools import partial from typing import Sequence @@ -569,23 +568,6 @@ def test_resize_antialias(device, dt, size, interpolation): assert_equal(resized_tensor, resize_result) -def test_resize_antialias_default_warning(): - - img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) - - match = "The default value of the antialias" - with pytest.warns(UserWarning, match=match): - F.resize(img, size=(20, 20)) - with pytest.warns(UserWarning, match=match): - F.resized_crop(img, 0, 0, 10, 10, size=(20, 20)) - - # For modes that aren't bicubic or bilinear, don't throw a warning - with warnings.catch_warnings(): - warnings.simplefilter("error") - F.resize(img, size=(20, 20), interpolation=NEAREST) - F.resized_crop(img, 0, 0, 10, 10, size=(20, 20), interpolation=NEAREST) - - def check_functional_vs_PIL_vs_scripted( fn, fn_pil, fn_t, config, device, dtype, channels=3, tol=2.0 + 1e-10, agg_method="max" ): diff --git a/test/test_models.py b/test/test_models.py index 76bddebefe4..33c6a84c941 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -1057,25 +1057,5 @@ def test_raft(model_fn, scripted): _assert_expected(flow_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1) -def test_presets_antialias(): 
- - img = torch.randint(0, 256, size=(1, 3, 224, 224), dtype=torch.uint8) - - match = "The default value of the antialias parameter" - with pytest.warns(UserWarning, match=match): - models.ResNet18_Weights.DEFAULT.transforms()(img) - with pytest.warns(UserWarning, match=match): - models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms()(img) - - with warnings.catch_warnings(): - warnings.simplefilter("error") - models.ResNet18_Weights.DEFAULT.transforms(antialias=True)(img) - models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms(antialias=True)(img) - - models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT.transforms()(img) - models.video.R3D_18_Weights.DEFAULT.transforms()(img) - models.optical_flow.Raft_Small_Weights.DEFAULT.transforms()(img, img) - - if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 9794b196a70..3f2e5015863 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1,41 +1,42 @@ +import collections.abc import re import PIL.Image import pytest import torch -from common_utils import assert_equal +from common_utils import assert_equal, make_bounding_boxes, make_detection_masks, make_image, make_video -from prototype_common_utils import make_label from torchvision.prototype import transforms, tv_tensors from torchvision.transforms.v2._utils import check_type, is_pure_tensor from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, pil_to_tensor, to_pil_image from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video -from transforms_v2_legacy_utils import ( - DEFAULT_EXTRA_DIMS, - make_bounding_boxes, - make_detection_mask, - make_image, - make_video, -) -BATCH_EXTRA_DIMS = [extra_dims for extra_dims in DEFAULT_EXTRA_DIMS if extra_dims] +def _parse_categories(categories): + if categories is None: + num_categories = int(torch.randint(1, 11, ())) + elif isinstance(categories, int): + num_categories = categories + categories = [f"category{idx}" for idx in range(num_categories)] + elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): + categories = list(categories) + num_categories = len(categories) + else: + raise pytest.UsageError( + f"`categories` can either be `None` (default), an integer, or a sequence of strings, " + f"but got '{categories}' instead." + ) + return categories, num_categories -def parametrize(transforms_with_inputs): - return pytest.mark.parametrize( - ("transform", "input"), - [ - pytest.param( - transform, - input, - id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", - ) - for transform, inputs in transforms_with_inputs - for idx, input in enumerate(inputs) - ], - ) + +def make_label(*, extra_dims=(), categories=10, dtype=torch.int64, device="cpu"): + categories, num_categories = _parse_categories(categories) + # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, + # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 + data = torch.testing.make_tensor(extra_dims, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) + return tv_tensors.Label(data, categories=categories) class TestSimpleCopyPaste: @@ -167,7 +168,7 @@ def test__get_params(self, mocker): flat_inputs = [ make_image(size=canvas_size, color_space="RGB"), - make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), + make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_shape[0]), ] params = transform._get_params(flat_inputs) @@ -203,9 +204,9 @@ def test__transform_culling(self, mocker): ) bounding_boxes = make_bounding_boxes( - format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size ) - masks = make_detection_mask(size=canvas_size, batch_dims=(batch_size,)) + masks = make_detection_masks(size=canvas_size, num_masks=batch_size) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -241,7 +242,7 @@ def test__transform_bounding_boxes_clamping(self, mocker): ) bounding_boxes = make_bounding_boxes( - format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size ) mock = mocker.patch( "torchvision.prototype.transforms._geometry.F.clamp_bounding_boxes", wraps=clamp_bounding_boxes @@ -389,27 +390,27 @@ def make_tv_tensors(): pil_image = to_pil_image(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), } yield (pil_image, target) tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), } yield (tensor_image, target) tv_tensor_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), } yield (tv_tensor_image, target) diff --git a/test/test_transforms.py b/test/test_transforms.py index 7581bf33220..7c92baa9f5c 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -3,7 +3,6 @@ import random import re import textwrap -import warnings from functools import partial import numpy as np @@ -440,16 +439,6 @@ def 
test_resize_antialias_error(): t(img) -def test_resize_antialias_default_warning(): - - img = Image.new("RGB", size=(10, 10), color=127) - # We make sure we don't warn for PIL images since the default behaviour doesn't change - with warnings.catch_warnings(): - warnings.simplefilter("error") - transforms.Resize((20, 20))(img) - transforms.RandomResizedCrop((20, 20))(img) - - @pytest.mark.parametrize("height, width", ((32, 64), (64, 32))) def test_resize_size_equals_small_edge_size(height, width): # Non-regression test for https://github.com/pytorch/vision/issues/5405 diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index e2ab5673f1e..eac52dafc17 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -1,6 +1,5 @@ import os import sys -import warnings import numpy as np import PIL.Image @@ -428,22 +427,6 @@ def test_resized_crop_save_load(self, tmpdir): fn = T.RandomResizedCrop(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) - def test_antialias_default_warning(self): - - img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) - - match = "The default value of the antialias" - with pytest.warns(UserWarning, match=match): - T.Resize((20, 20))(img) - with pytest.warns(UserWarning, match=match): - T.RandomResizedCrop((20, 20))(img) - - # For modes that aren't bicubic or bilinear, don't throw a warning - with warnings.catch_warnings(): - warnings.simplefilter("error") - T.Resize((20, 20), interpolation=NEAREST)(img) - T.RandomResizedCrop((20, 20), interpolation=NEAREST)(img) - def _test_random_affine_helper(device, **kwargs): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 5ab35fc873b..4f8d0027bd6 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1,348 +1,5283 @@ +import contextlib +import decimal +import functools +import inspect import itertools -import pathlib +import math import pickle import random -import warnings +import re +from copy import deepcopy +from pathlib import Path +from unittest import mock import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.ops +import torchvision.transforms.v2 as transforms + +from common_utils import ( + assert_equal, + cache, + cpu_and_cuda, + freeze_rng_state, + ignore_jit_no_profile_information_warning, + make_bounding_boxes, + make_detection_masks, + make_image, + make_image_pil, + make_image_tensor, + make_segmentation_mask, + make_video, + make_video_tensor, + needs_cuda, + set_rng_seed, +) + +from torch import nn +from torch.testing import assert_close +from torch.utils._pytree import tree_flatten, tree_map +from torch.utils.data import DataLoader, default_collate +from torchvision import tv_tensors +from torchvision.ops.boxes import box_iou + +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.functional import pil_modes_mapping, to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import check_type, is_pure_tensor +from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs +from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal + + +# turns all warnings into errors for this module +pytestmark = pytest.mark.filterwarnings("error") + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +def 
_to_tolerances(maybe_tolerance_dict): + if not isinstance(maybe_tolerance_dict, dict): + return dict(rtol=None, atol=None) + + tolerances = dict(rtol=0, atol=0) + tolerances.update(maybe_tolerance_dict) + return tolerances + + +def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces closes results for inputs on GPU and CPU.""" + if input.device.type != "cuda": + return + + input_cuda = input.as_subclass(torch.Tensor) + input_cpu = input_cuda.to("cpu") + + with freeze_rng_state(): + actual = kernel(input_cuda, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input_cpu, *args, **kwargs) + + assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) + + +@cache +def _script(obj): + try: + return torch.jit.script(obj) + except Exception as error: + name = getattr(obj, "__name__", obj.__class__.__name__) + raise AssertionError(f"Trying to `torch.jit.script` '{name}' raised the error above.") from error + + +def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" + if input.device.type != "cpu": + return + + kernel_scripted = _script(kernel) + + input = input.as_subclass(torch.Tensor) + with ignore_jit_no_profile_information_warning(): + actual = kernel_scripted(input, *args, **kwargs) + expected = kernel(input, *args, **kwargs) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + +def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces close results for batched and unbatched inputs.""" + unbatched_input = input.as_subclass(torch.Tensor) + + for batch_dims in [(2,), (2, 1)]: + repeats = [*batch_dims, *[1] * input.ndim] + + actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) + + expected = kernel(unbatched_input, *args, **kwargs) + # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata + if isinstance(expected, torch.Tensor): + expected = expected.repeat(repeats) + else: + tensor, *metadata = expected + expected = (tensor.repeat(repeats), *metadata) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: + degenerate_batched_input = torch.empty( + degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device + ) + + output = kernel(degenerate_batched_input, *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.shape[: -input.ndim] == degenerate_batch_dims + + +def check_kernel( + kernel, + input, + *args, + check_cuda_vs_cpu=True, + check_scripted_vs_eager=True, + check_batched_vs_unbatched=True, + **kwargs, +): + initial_input_version = input._version + + output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + # check that no inplace operation happened + assert input._version == initial_input_version + + if kernel not in {F.to_dtype_image, F.to_dtype_video}: + assert output.dtype == input.dtype + assert output.device == input.device + + if check_cuda_vs_cpu: + _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) + + if check_scripted_vs_eager: + 
_check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) + + if check_batched_vs_unbatched: + _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) + + +def _check_functional_scripted_smoke(functional, input, *args, **kwargs): + """Checks if the functional can be scripted and the scripted version can be called without error.""" + if not isinstance(input, tv_tensors.Image): + return + + functional_scripted = _script(functional) + with ignore_jit_no_profile_information_warning(): + functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) + + +def check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): + unknown_input = object() + with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): + functional(unknown_input, *args, **kwargs) + + with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: + output = functional(input, *args, **kwargs) + + spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") + + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and functional is not F.convert_bounding_box_format: + assert output.format == input.format + + if check_scripted_smoke: + _check_functional_scripted_smoke(functional, input, *args, **kwargs) + + +def check_functional_kernel_signature_match(functional, *, kernel, input_type): + """Checks if the signature of the functional matches the kernel signature.""" + functional_params = list(inspect.signature(functional).parameters.values())[1:] + kernel_params = list(inspect.signature(kernel).parameters.values())[1:] + + if issubclass(input_type, tv_tensors.TVTensor): + # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be + # explicitly passed to the kernel. + explicit_metadata = { + tv_tensors.BoundingBoxes: {"format", "canvas_size"}, + } + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + functional_params = iter(functional_params) + for functional_param, kernel_param in zip(functional_params, kernel_params): + try: + # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out + # functional parameters that have no kernel equivalent while keeping the order intact. + while functional_param.name != kernel_param.name: + functional_param = next(functional_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " + f"has no corresponding parameter on the functional `{functional.__name__}`." + ) from None + + if issubclass(input_type, PIL.Image.Image): + # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check + # them in the first place. 
+ functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty + + assert functional_param == kernel_param + + +def _check_transform_v1_compatibility(transform, input, *, rtol, atol): + """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static + ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version + can be called without error.""" + if not (type(input) is torch.Tensor or isinstance(input, PIL.Image.Image)): + return + + v1_transform_cls = transform._v1_transform_cls + if v1_transform_cls is None: + return + + if hasattr(v1_transform_cls, "get_params"): + assert type(transform).get_params is v1_transform_cls.get_params + + v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) + + with freeze_rng_state(): + output_v2 = transform(input) + + with freeze_rng_state(): + output_v1 = v1_transform(input) + + assert_close(F.to_image(output_v2), F.to_image(output_v1), rtol=rtol, atol=atol) + + if isinstance(input, PIL.Image.Image): + return + + _script(v1_transform)(input) + + +def _make_transform_sample(transform, *, image_or_video, adapter): + device = image_or_video.device if isinstance(image_or_video, torch.Tensor) else "cpu" + size = F.get_size(image_or_video) + input = dict( + image_or_video=image_or_video, + image_tv_tensor=make_image(size, device=device), + video_tv_tensor=make_video(size, device=device), + image_pil=make_image_pil(size), + bounding_boxes_xyxy=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYXY, device=device), + bounding_boxes_xywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYWH, device=device), + bounding_boxes_cxcywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.CXCYWH, device=device), + bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [2, 0, 1, 1], # x1 > x2, y1 < y2 + [0, 2, 1, 1], # x1 < x2, y1 > y2 + [2, 2, 1, 1], # x1 > x2, y1 > y2 + ], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.XYWH, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.CXCYWH, + canvas_size=size, + device=device, + ), + detection_mask=make_detection_masks(size, device=device), + segmentation_mask=make_segmentation_mask(size, device=device), + int=0, + float=0.0, + bool=True, + none=None, + str="str", + path=Path.cwd(), + object=object(), + tensor=torch.empty(5), + array=np.empty(5), + ) + if adapter is not None: + input = adapter(transform, input, device) + return input + + +def _check_transform_sample_input_smoke(transform, input, *, adapter): + # This is a bunch of input / output convention checks, using a big sample with different parts as input. 
+ + if not check_type(input, (is_pure_tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video)): + return + + sample = _make_transform_sample( + # adapter might change transform inplace + transform=transform if adapter is None else deepcopy(transform), + image_or_video=input, + adapter=adapter, + ) + for container_type in [dict, list, tuple]: + if container_type is dict: + input = sample + else: + input = container_type(sample.values()) + + input_flat, input_spec = tree_flatten(input) + + with freeze_rng_state(): + torch.manual_seed(0) + output = transform(input) + output_flat, output_spec = tree_flatten(output) + + assert output_spec == input_spec + + for output_item, input_item, should_be_transformed in zip( + output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) + ): + if should_be_transformed: + assert type(output_item) is type(input_item) + else: + assert output_item is input_item + + # Enforce that the transform does not turn a degenerate bounding box, e.g. marked by RandomIoUCrop (or any other + # future transform that does this), back into a valid one. + for degenerate_bounding_boxes in ( + bounding_box + for name, bounding_box in sample.items() + if "degenerate" in name and isinstance(bounding_box, tv_tensors.BoundingBoxes) + ): + sample = dict( + boxes=degenerate_bounding_boxes, + labels=torch.randint(10, (degenerate_bounding_boxes.shape[0],), device=degenerate_bounding_boxes.device), + ) + assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + + +def check_transform(transform, input, check_v1_compatibility=True, check_sample_input=True): + pickle.loads(pickle.dumps(transform)) + + output = transform(input) + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat): + assert output.format == input.format + + if check_sample_input: + _check_transform_sample_input_smoke( + transform, input, adapter=check_sample_input if callable(check_sample_input) else None + ) + + if check_v1_compatibility: + _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) + + return output + + +def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): + def wrapper(input, *args, **kwargs): + transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) + return transform(input) + + wrapper.__name__ = transform_cls.__name__ + + return wrapper + + +def param_value_parametrization(**kwargs): + """Helper function to turn + + @pytest.mark.parametrize( + ("param", "value"), + ("a", 1), + ("a", 2), + ("a", 3), + ("b", -1.0) + ("b", 1.0) + ) + + into + + @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) + """ + return pytest.mark.parametrize( + ("param", "value"), + [(param, value) for param, values in kwargs.items() for value in values], + ) + + +def adapt_fill(value, *, dtype): + """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" + if value is None: + return value + + max_value = get_max_value(dtype) + value_type = float if dtype.is_floating_point else int + + if isinstance(value, (int, float)): + return value_type(value * max_value) + elif isinstance(value, (list, tuple)): + return type(value)(value_type(v * max_value) for v in value) + else: + raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") + + +EXHAUSTIVE_TYPE_FILLS = [ + None, + 1, + 0.5, + [1], + [0.2], + (0,), + (0.7,), + [1, 0, 1], + [0.1, 0.2, 
0.3], + (0, 1, 0), + (0.9, 0.234, 0.314), +] +CORRECTNESS_FILLS = [ + v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) +] + + +# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well +INTERPOLATION_MODES = [ + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, +] + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + + def affine_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output_xyxy = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format + ) + + if clamp: + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + output = F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + ) + else: + # We leave the bounding box as float64 so the caller gets the full precision to perform any additional + # operation + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( + bounding_boxes.shape + ), + format=format, + canvas_size=canvas_size, + ) + + +class TestResize: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [17, [17], (17,), [12, 13], (12, 13)] + + def _make_max_size_kwarg(self, *, use_max_size, size): + if use_max_size: + if not (isinstance(size, int) or len(size) == 1): + # This would result in an `ValueError` + return None + + max_size = (size if isinstance(size, int) else size[0]) + 1 + else: + max_size = None + + return dict(max_size=max_size) + + def _compute_output_size(self, *, input_size, size, max_size): + if not (isinstance(size, int) or len(size) == 1): + return tuple(size) + + if not isinstance(size, int): + size = size[0] + + old_height, old_width = input_size + ratio = old_width / old_height + if ratio > 1: + new_height = size + new_width = int(ratio * new_height) + else: + new_width = size + new_height = int(new_width / ratio) + + if max_size is not None and max(new_height, new_width) > max_size: + # Need to recompute the aspect ratio, since it might have changed due to rounding + ratio = new_width / new_height + if ratio > 1: + new_width = max_size + new_height = int(new_width / ratio) + else: + new_height = max_size + new_width = int(new_height * ratio) + + return new_height, new_width + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + 
@pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. + # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. + atol = 30 if transforms.InterpolationMode.BICUBIC and dtype is torch.uint8 else 1 + check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol) + + check_kernel( + F.resize_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=size, + interpolation=interpolation, + **max_size_kwarg, + antialias=antialias, + check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes( + format=format, + canvas_size=self.INPUT_SIZE, + dtype=dtype, + device=device, + ) + check_kernel( + F.resize_bounding_boxes, + bounding_boxes, + canvas_size=bounding_boxes.canvas_size, + size=size, + **max_size_kwarg, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1]) + + def test_kernel_video(self): + check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, size, make_input): + check_functional( + F.resize, + make_input(self.INPUT_SIZE), + size=size, + antialias=True, + check_scripted_smoke=not isinstance(size, int), + ) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resize_image, torch.Tensor), + (F._resize_image_pil, PIL.Image.Image), + (F.resize_image, tv_tensors.Image), + (F.resize_bounding_boxes, tv_tensors.BoundingBoxes), + (F.resize_mask, tv_tensors.Mask), + (F.resize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_transform(self, size, 
device, make_input): + check_transform( + transforms.Resize(size=size, antialias=True), + make_input(self.INPUT_SIZE, device=device), + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + def _check_output_size(self, input, output, *, size, max_size): + assert tuple(F.get_size(output)) == self._compute_output_size( + input_size=F.get_size(input), size=size, max_size=max_size + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. + # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_image_correctness(self, size, interpolation, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) + expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) + + self._check_output_size(image, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): + old_height, old_width = bounding_boxes.canvas_size + new_height, new_width = self._compute_output_size( + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return bounding_boxes + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + return reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) + + actual = fn(bounding_boxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) + + self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + with ( + contextlib.nullcontext() + if isinstance(input, PIL.Image.Image) + # This error is triggered in PyTorch core + else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") + ): + 
F.resize( + input, + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + def test_functional_pil_antialias_warning(self): + with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_max_size_error(self, size, make_input): + if isinstance(size, int) or len(size) == 1: + max_size = (size if isinstance(size, int) else size[0]) - 1 + match = "must be strictly greater than the requested size" + else: + # value can be anything other than None + max_size = -1 + match = "size should be an int or a sequence of length 1" + + with pytest.raises(ValueError, match=match): + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to + # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a + # difference and thus we don't test it here. + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + return + + expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) + actual = F.resize( + input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True + ) + + assert_equal(actual, expected) + + def test_transform_unknown_size_error(self): + with pytest.raises(ValueError, match="size can either be an integer or a sequence of one or two integers"): + transforms.Resize(size=object()) + + @pytest.mark.parametrize( + "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_noop(self, size, make_input): + input = make_input(self.INPUT_SIZE) + + output = F.resize(input, size=F.get_size(input), antialias=True) + + # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there + # is a good reason to break this, feel free to downgrade to an equality check. + if isinstance(input, tv_tensors.TVTensor): + # We can't test identity directly, since that checks for the identity of the Python object. Since all + # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check + # that the underlying storage is the same + assert output.data_ptr() == input.data_ptr() + else: + assert output is input + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_no_regression_5405(self, make_input): + # Checks that `max_size` is not ignored if `size == small_edge_size` + # See https://github.com/pytorch/vision/issues/5405 + + input = make_input(self.INPUT_SIZE) + + size = min(F.get_size(input)) + max_size = size + 1 + output = F.resize(input, size=size, max_size=max_size, antialias=True) + + assert max(F.get_size(output)) == max_size + + def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs): + # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming + # from PIL or our own I/O functions do not have a batch dimensions and are thus 3D, i.e. (C, H, W). Still, the + # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create + # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels + # last although PyTorch doesn't recognizes it as such. + emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1 + + image = make_image( + *args, + batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims, + memory_format=memory_format, + **kwargs, + ) + + if emulate_channels_last: + image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) + + return image + + def _check_stride(self, image, *, memory_format): + C, H, W = F.get_dimensions(image) + if memory_format is torch.contiguous_format: + expected_stride = (H * W, W, 1) + elif memory_format is torch.channels_last: + expected_stride = (1, W * C, C) + else: + raise ValueError(f"Unknown memory_format: {memory_format}") + + assert image.stride() == expected_stride + + # TODO: We can remove this test and related torchvision workaround + # once we fixed related pytorch issue: https://github.com/pytorch/pytorch/issues/68430 + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device): + size = self.OUTPUT_SIZES[0] + + input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format) + + # Smoke test to make sure we aren't starting with wrong assumptions + self._check_stride(input, memory_format=memory_format) + + output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias) + + self._check_stride(output, memory_format=memory_format) + + def test_float16_no_rounding(self): + # Make sure Resize() doesn't round float16 images + # Non-regression test for https://github.com/pytorch/vision/issues/7667 + + input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16) + output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True) + + assert output.dtype is torch.float16 + assert (output.round() - output).abs().sum() > 0 + + +class TestHorizontalFlip: + 
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.horizontal_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.horizontal_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.horizontal_flip_image, torch.Tensor), + (F._horizontal_flip_image_pil, PIL.Image.Image), + (F.horizontal_flip_image, tv_tensors.Image), + (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.horizontal_flip_mask, tv_tensors.Mask), + (F.horizontal_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [-1, 0, bounding_boxes.canvas_size[1]], + [0, 1, 0], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomHorizontalFlip(p=0) + 
+ output = transform(input) + + assert_equal(output, input) + + +class TestAffine: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], + # float + scale=[0.5], + # float, int, + # one-list of float, one-list of int, one-tuple of float, one-tuple of int + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + # The special case for shear makes sure we pick a value that is supported while JIT scripting + _MINIMAL_AFFINE_KWARGS = { + k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + translate=[None, (0.5, 0.5)], + scale=[None, (0.75, 1.25)], + shear=[None, (12, 30, -17, 5), 10, (-5, 12)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { + k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() + } + + def _check_kernel(self, kernel, input, *args, **kwargs): + kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() + kwargs_.update(kwargs) + check_kernel(kernel, input, *args, **kwargs_) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + self._check_kernel( + F.affine_image, + make_image(dtype=dtype, device=device), + **{param: value}, + check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), + check_cuda_vs_cpu=dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True, + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + self._check_kernel( + F.affine_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + 
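# Illustrative sketch (editorial aside, not part of the patch), assuming the v2
# functional API: the shear values enumerated above rely on `F.affine` treating a
# single number as a shear parallel to the x axis, i.e. shear=s behaves like
# shear=[s, 0.0].
import torch
from torchvision.transforms.v2 import functional as F

image = torch.randint(0, 256, (3, 17, 11), dtype=torch.uint8)

scalar_shear = F.affine(image, angle=0.0, translate=[0, 0], scale=1.0, shear=10.0)
pair_shear = F.affine(image, angle=0.0, translate=[0, 0], scale=1.0, shear=[10.0, 0.0])

torch.testing.assert_close(scalar_shear, pair_shear)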
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + self._check_kernel(F.affine_mask, make_mask()) + + def test_kernel_video(self): + self._check_kernel(F.affine_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.affine_image, torch.Tensor), + (F._affine_image_pil, PIL.Image.Image), + (F.affine_image, tv_tensors.Image), + (F.affine_bounding_boxes, tv_tensors.BoundingBoxes), + (F.affine_mask, tv_tensors.Mask), + (F.affine_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.affine( + image, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + expected = F.to_image( + F.affine( + F.to_pil_image(image), + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomAffine( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is 
transforms.InterpolationMode.NEAREST else 8 + + def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): + rot = math.radians(angle) + cx, cy = center + tx, ty = translate + sx, sy = [math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)] + + c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) + t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) + c_matrix_inv = np.linalg.inv(c_matrix) + rs_matrix = np.array( + [ + [scale * math.cos(rot), -scale * math.sin(rot), 0], + [scale * math.sin(rot), scale * math.cos(rot), 0], + [0, 0, 1], + ] + ) + shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) + shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) + rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) + true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) + return true_matrix[:2, :] + + def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): + if center is None: + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] + + return reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=self._compute_affine_matrix( + angle=angle, translate=translate, scale=scale, shear=shear, center=center + ), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.affine( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + expected = self._reference_affine_bounding_boxes( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) + + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"]) + @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"]) + @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed): + image = make_image() + height, width = 
F.get_size(image) + + transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) + + torch.manual_seed(seed) + params = transform._get_params([image]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + width_max = int(round(translate[0] * width)) + height_max = int(round(translate[1] * height)) + assert -width_max <= params["translate"][0] <= width_max + assert -height_max <= params["translate"][1] <= height_max + else: + assert params["translate"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, (int, float)): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + elif len(shear) == 4: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param in {"degrees", "shear"} and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomAffine(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) + def test_transform_translate_range_error(self, translate): + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(degrees=0, translate=translate) + + @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) + def test_transform_scale_range_error(self, scale): + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(degrees=0, scale=scale) + + def test_transform_negative_shear_error(self): + with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): + transforms.RandomAffine(degrees=0, shear=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestVerticalFlip: + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.vertical_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + 
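# Illustrative sketch (editorial aside, not part of the patch): a numeric sanity
# check of the matrix composition used by `_compute_affine_matrix` above. Because
# the rotation/scale/shear factor carries no translation component,
# T @ C @ RSS @ C^-1 maps the chosen center to center + translate, independent of
# angle, scale, and shear. The parameter values below are arbitrary.
import math
import numpy as np

angle, scale, shear_x = math.radians(30.0), 0.75, math.radians(10.0)
cx, cy = 5.0, 3.0
tx, ty = 2.0, -1.0

c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]], dtype=np.float64)
t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]], dtype=np.float64)
rs_matrix = np.array(
    [
        [scale * math.cos(angle), -scale * math.sin(angle), 0],
        [scale * math.sin(angle), scale * math.cos(angle), 0],
        [0, 0, 1],
    ]
)
shear_matrix = np.array([[1, -math.tan(shear_x), 0], [0, 1, 0], [0, 0, 1]])

matrix = t_matrix @ c_matrix @ rs_matrix @ shear_matrix @ np.linalg.inv(c_matrix)

center = np.array([cx, cy, 1.0])
np.testing.assert_allclose(matrix @ center, [cx + tx, cy + ty, 1.0])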
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.vertical_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.vertical_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.vertical_flip_image, torch.Tensor), + (F._vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image, tv_tensors.Image), + (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.vertical_flip_mask, tv_tensors.Mask), + (F.vertical_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, bounding_boxes.canvas_size[0]], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomVerticalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestRotate: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} 
+ + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + check_kernel( + F.rotate_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.rotate_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) + + def test_kernel_video(self): + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rotate_image, torch.Tensor), + (F._rotate_image_pil, PIL.Image.Image), + (F.rotate_image, tv_tensors.Image), + (F.rotate_bounding_boxes, tv_tensors.BoundingBoxes), + (F.rotate_mask, tv_tensors.Mask), + (F.rotate_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.rotate(image, 
angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill) + expected = F.to_image( + F.rotate( + F.to_pil_image(image), angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, expand, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomRotation( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, + center=center, + interpolation=interpolation, + expand=expand, + fill=fill, + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix): + if not expand: + return canvas_size, (0.0, 0.0) + + input_height, input_width = canvas_size + + input_image_frame = np.array( + [ + [0.0, 0.0, 1.0], + [0.0, input_height, 1.0], + [input_width, input_height, 1.0], + [input_width, 0.0, 1.0], + ], + dtype=np.float64, + ) + output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T) + + recenter_x = float(np.min(output_image_frame[:, 0])) + recenter_y = float(np.min(output_image_frame[:, 1])) + + output_width = int(np.max(output_image_frame[:, 0]) - recenter_x) + output_height = int(np.max(output_image_frame[:, 1]) - recenter_y) + + return (output_height, output_width), (recenter_x, recenter_y) + + def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): + x, y = recenter_xy + if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY: + translate = [x, y, x, y] + else: + translate = [x, y, 0.0, 0.0] + return tv_tensors.wrap( + (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes + ) + + def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): + if center is None: + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] + cx, cy = center + + a = np.cos(angle * np.pi / 180.0) + b = np.sin(angle * np.pi / 180.0) + affine_matrix = np.array( + [ + [a, b, cx - cx * a - b * cy], + [-b, a, cy + cx * b - a * cy], + ], + ) + + new_canvas_size, recenter_xy = self._compute_output_canvas_size( + expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix + ) + + output = reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=new_canvas_size, + clamp=False, + ) + + return F.clamp_bounding_boxes(self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy)).to( + bounding_boxes + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("expand", [False, True]) + 
@pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transform_get_params_bounds(self, degrees, seed): + transform = transforms.RandomRotation(degrees=degrees) + + torch.manual_seed(seed) + params = transform._get_params([]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + @pytest.mark.parametrize("param", ["degrees", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param == "degrees" and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomRotation(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestContainerTransforms: + class BuiltinTransform(transforms.Transform): + def _transform(self, inpt, params): + return inpt + + class PackedInputTransform(nn.Module): + def forward(self, sample): + assert len(sample) == 2 + return sample + + class UnpackedInputTransform(nn.Module): + def forward(self, image, label): + return image, label + + @pytest.mark.parametrize( + "transform_cls", [transforms.Compose, functools.partial(transforms.RandomApply, p=1), transforms.RandomOrder] + ) + @pytest.mark.parametrize( + "wrapped_transform_clss", + [ + [BuiltinTransform], + [PackedInputTransform], + [UnpackedInputTransform], + [BuiltinTransform, BuiltinTransform], + [PackedInputTransform, PackedInputTransform], + [UnpackedInputTransform, 
UnpackedInputTransform], + [BuiltinTransform, PackedInputTransform, BuiltinTransform], + [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], + [PackedInputTransform, BuiltinTransform, PackedInputTransform], + [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], + ], + ) + @pytest.mark.parametrize("unpack", [True, False]) + def test_packed_unpacked(self, transform_cls, wrapped_transform_clss, unpack): + needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in wrapped_transform_clss) + needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in wrapped_transform_clss) + assert not (needs_packed_inputs and needs_unpacked_inputs) + + transform = transform_cls([cls() for cls in wrapped_transform_clss]) + + image = make_image() + label = 3 + packed_input = (image, label) + + def call_transform(): + if unpack: + return transform(*packed_input) + else: + return transform(packed_input) + + if needs_unpacked_inputs and not unpack: + with pytest.raises(TypeError, match="missing 1 required positional argument"): + call_transform() + elif needs_packed_inputs and unpack: + with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): + call_transform() + else: + output = call_transform() + + assert isinstance(output, tuple) and len(output) == 2 + assert output[0] is image + assert output[1] is label + + def test_compose(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + @pytest.mark.parametrize("p", [0.0, 1.0]) + @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) + def test_random_apply(self, p, sequence_type): + transform = transforms.RandomApply( + sequence_type( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ), + p=p, + ) + + # This needs to be a pure tensor (or a PIL image), because otherwise check_transforms skips the v1 compatibility + # check + input = make_image_tensor() + output = check_transform(transform, input, check_v1_compatibility=issubclass(sequence_type, nn.ModuleList)) + + if p == 1: + assert_equal(output, F.vertical_flip(F.horizontal_flip(input))) + else: + assert output is input + + @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) + def test_random_choice(self, p): + transform = transforms.RandomChoice( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ], + p=p, + ) + + input = make_image() + output = check_transform(transform, input) + + p_horz, p_vert = p + if p_horz: + assert_equal(output, F.horizontal_flip(input)) + else: + assert_equal(output, F.vertical_flip(input)) + + def test_random_order(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + # We can't really check whether the transforms are actually applied in random order. However, horizontal and + # vertical flip are commutative. Meaning, even under the assumption that the transform applies them in random + # order, we can use a fixed order to compute the expected value. 
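# Illustrative sketch (editorial aside, not part of the patch): the commutativity
# argument in the comment above, spelled out on a plain tensor. Flipping
# horizontally and then vertically equals the reverse order (both amount to a 180
# degree rotation), so a fixed order is a valid expected value even when the
# container applies the two flips in random order.
import torch
from torchvision.transforms.v2 import functional as F

img = torch.rand(3, 8, 6)
torch.testing.assert_close(
    F.vertical_flip(F.horizontal_flip(img)),
    F.horizontal_flip(F.vertical_flip(img)),
)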
+ expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + def test_errors(self): + for cls in [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]: + with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): + cls(lambda x: x) + + with pytest.raises(ValueError, match="at least one transform"): + transforms.Compose([]) + + for p in [-1, 2]: + with pytest.raises(ValueError, match=re.escape("value in the interval [0.0, 1.0]")): + transforms.RandomApply([lambda x: x], p=p) + + for transforms_, p in [([lambda x: x], []), ([], [1.0])]: + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice(transforms_, p=p) + + +class TestToDtype: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.to_dtype_image, make_image_tensor), + (F.to_dtype_image, make_image), + (F.to_dtype_video, make_video), + ], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): + check_kernel( + kernel, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_functional(self, make_input, input_dtype, output_dtype, device, scale): + check_functional( + F.to_dtype, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + @pytest.mark.parametrize("as_dict", (True, False)) + def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): + input = make_input(dtype=input_dtype, device=device) + if as_dict: + output_dtype = {type(input): output_dtype} + check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input, check_sample_input=not as_dict) + + def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): + input_dtype = image.dtype + output_dtype = dtype + + if not scale: + return image.to(dtype) + + if output_dtype == input_dtype: + return image + + def fn(value): + if input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return 
value / factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype, device=image.device) + + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_image_correctness(self, input_dtype, output_dtype, device, scale): + if input_dtype.is_floating_point and output_dtype == torch.int64: + pytest.xfail("float to int64 conversion is not supported") + + input = make_image(dtype=input_dtype, device=device) + + out = F.to_dtype(input, dtype=output_dtype, scale=scale) + expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) + + if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + torch.testing.assert_close(out, expected, atol=1, rtol=0) + else: + torch.testing.assert_close(out, expected) + + def was_scaled(self, inpt): + # this assumes the target dtype is float + return inpt.max() <= 1 + + def make_inpt_with_bbox_and_mask(self, make_input): + H, W = 10, 10 + inpt_dtype = torch.uint8 + bbox_dtype = torch.float32 + mask_dtype = torch.bool + sample = { + "inpt": make_input(size=(H, W), dtype=inpt_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), + "mask": make_detection_masks(size=(H, W), dtype=mask_dtype), + } + + return sample, inpt_dtype, bbox_dtype, mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + @pytest.mark.parametrize("scale", (True, False)) + def test_dtype_not_a_dict(self, make_input, scale): + # assert only inpt gets transformed when dtype isn't a dict + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) + + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + if scale: + assert self.was_scaled(out["inpt"]) + else: + assert not self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_others_catch_all_and_none(self, make_input): + # make sure "others" works as a catch-all and that None means no conversion + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_typical_use_case(self, make_input): + # Typical use-case: want to convert dtype and scale for inpt and just dtype for masks. 
+ # This just makes sure we now have a decent API for this + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype( + dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True + )(sample) + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + assert self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_errors_warnings(self, make_input): + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + + with pytest.raises(ValueError, match="No dtype was specified for"): + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) + with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): + transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) + with pytest.warns(UserWarning, match="no scaling will be done"): + out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + +class TestAdjustBrightness: + _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] + _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.adjust_brightness_image, make_image), + (F.adjust_brightness_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_brightness_image, torch.Tensor), + (F._adjust_brightness_image_pil, PIL.Image.Image), + (F.adjust_brightness_image, tv_tensors.Image), + (F.adjust_brightness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) + def test_image_correctness(self, brightness_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_brightness(image, brightness_factor=brightness_factor) + expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) + + torch.testing.assert_close(actual, expected) + + +class TestCutMixMixUp: + class DummyDataset: + def __init__(self, size, num_classes): + self.size = size + self.num_classes = num_classes + assert size < num_classes + + def __getitem__(self, idx): + img = torch.rand(3, 100, 100) + label = idx # This ensures all labels in a batch are unique and makes testing easier + return img, label + + def __len__(self): + return self.size + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def 
test_supported_input_structure(self, T): + + batch_size = 32 + num_classes = 100 + + dataset = self.DummyDataset(size=batch_size, num_classes=num_classes) + + cutmix_mixup = T(num_classes=num_classes) + + dl = DataLoader(dataset, batch_size=batch_size) + + # Input sanity checks + img, target = next(iter(dl)) + input_img_size = img.shape[-3:] + assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) + assert target.shape == (batch_size,) + + def check_output(img, target): + assert img.shape == (batch_size, *input_img_size) + assert target.shape == (batch_size, num_classes) + torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) + num_non_zero_labels = (target != 0).sum(axis=-1) + assert (num_non_zero_labels == 2).all() + + # After Dataloader, as unpacked input + img, target = next(iter(dl)) + assert target.shape == (batch_size,) + img, target = cutmix_mixup(img, target) + check_output(img, target) + + # After Dataloader, as packed input + packed_from_dl = next(iter(dl)) + assert isinstance(packed_from_dl, list) + img, target = cutmix_mixup(packed_from_dl) + check_output(img, target) + + # As collation function. We expect default_collate to be used by users. + def collate_fn_1(batch): + return cutmix_mixup(default_collate(batch)) + + def collate_fn_2(batch): + return cutmix_mixup(*default_collate(batch)) + + for collate_fn in (collate_fn_1, collate_fn_2): + dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) + img, target = next(iter(dl)) + check_output(img, target) + + @needs_cuda + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_cpu_vs_gpu(self, T): + num_classes = 10 + batch_size = 3 + H, W = 12, 12 + + imgs = torch.rand(batch_size, 3, H, W) + labels = torch.randint(0, num_classes, (batch_size,)) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_error(self, T): + + num_classes = 10 + batch_size = 9 + + imgs = torch.rand(batch_size, 3, 12, 12) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + for input_with_bad_type in ( + F.to_pil_image(imgs[0]), + tv_tensors.Mask(torch.rand(12, 12)), + tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + ): + with pytest.raises(ValueError, match="does not support PIL images, "): + cutmix_mixup(input_with_bad_type) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + cutmix_mixup({"img": imgs, "Nothing_else": 3}) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label + # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently + cutmix_mixup(imgs) + + with pytest.raises(ValueError, match="When using the default labels_getter"): + cutmix_mixup(imgs, "not_a_tensor") + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3))) + + with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): + cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) + + with pytest.raises(ValueError, match="does not match the batch size of the labels"): + cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # The purpose of this check is more about documenting the current + # behaviour of what happens on a Compose(), rather than actually + # asserting the expected behaviour. We may support Compose() in the + # future, e.g. for 2 consecutive CutMix? + labels = torch.randint(0, num_classes, size=(batch_size,)) + transforms.Compose([cutmix_mixup, cutmix_mixup])(imgs, labels) + + +@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) +@pytest.mark.parametrize("sample_type", (tuple, list, dict)) +def test_labels_getter_default_heuristic(key, sample_type): + labels = torch.arange(10) + sample = {key: labels, "another_key": "whatever"} + if sample_type is not dict: + sample = sample_type((None, sample, "whatever_again")) + assert transforms._utils._find_labels_default_heuristic(sample) is labels + + if key.lower() != "labels": + # If "labels" is in the dict (case-insensitive), + # it takes precedence over other keys which would otherwise be a match + d = {key: "something_else", "labels": labels} + assert transforms._utils._find_labels_default_heuristic(d) is labels + + +class TestShapeGetters: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_dimensions_image, make_image_tensor), + (F._get_dimensions_image_pil, make_image_pil), + (F.get_dimensions_image, make_image), + (F.get_dimensions_video, make_video), + ], + ) + def test_get_dimensions(self, kernel, make_input): + size = (10, 10) + color_space, num_channels = "RGB", 3 + + input = make_input(size, color_space=color_space) + + assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_channels_image, make_image_tensor), + (F._get_num_channels_image_pil, make_image_pil), + (F.get_num_channels_image, make_image), + (F.get_num_channels_video, make_video), + ], + ) + def test_get_num_channels(self, kernel, make_input): + color_space, num_channels = "RGB", 3 + + input = make_input(color_space=color_space) + + assert kernel(input) == F.get_num_channels(input) == num_channels + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_size_image, make_image_tensor), + (F._get_size_image_pil, make_image_pil), + (F.get_size_image, make_image), + (F.get_size_bounding_boxes, make_bounding_boxes), + (F.get_size_mask, make_detection_masks), + (F.get_size_mask, make_segmentation_mask), + (F.get_size_video, make_video), + ], + ) + def test_get_size(self, kernel, make_input): + size = (10, 10) + + input = make_input(size) + + assert kernel(input) == F.get_size(input) == list(size) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_frames_video, make_video_tensor), + (F.get_num_frames_video, make_video), + ], + ) + def 
test_get_num_frames(self, kernel, make_input): + num_frames = 4 + + input = make_input(num_frames=num_frames) + + assert kernel(input) == F.get_num_frames(input) == num_frames + + @pytest.mark.parametrize( + ("functional", "make_input"), + [ + (F.get_dimensions, make_bounding_boxes), + (F.get_dimensions, make_detection_masks), + (F.get_dimensions, make_segmentation_mask), + (F.get_num_channels, make_bounding_boxes), + (F.get_num_channels, make_detection_masks), + (F.get_num_channels, make_segmentation_mask), + (F.get_num_frames, make_image_pil), + (F.get_num_frames, make_image), + (F.get_num_frames, make_bounding_boxes), + (F.get_num_frames, make_detection_masks), + (F.get_num_frames, make_segmentation_mask), + ], + ) + def test_unsupported_types(self, functional, make_input): + input = make_input() + + with pytest.raises(TypeError, match=re.escape(str(type(input)))): + functional(input) + + +class TestRegisterKernel: + @pytest.mark.parametrize("functional", (F.resize, "resize")) + def test_register_kernel(self, functional): + class CustomTVTensor(tv_tensors.TVTensor): + pass + + kernel_was_called = False + + @F.register_kernel(functional, CustomTVTensor) + def new_resize(dp, *args, **kwargs): + nonlocal kernel_was_called + kernel_was_called = True + return dp + + t = transforms.Resize(size=(224, 224), antialias=True) + + my_dp = CustomTVTensor(torch.rand(3, 10, 10)) + out = t(my_dp) + assert out is my_dp + assert kernel_was_called + + # Sanity check to make sure we didn't override the kernel of other types + t(torch.rand(3, 10, 10)).shape == (3, 224, 224) + t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) + + def test_errors(self): + with pytest.raises(ValueError, match="Could not find functional with name"): + F.register_kernel("bad_name", tv_tensors.Image) + + with pytest.raises(ValueError, match="Kernels can only be registered on functionals"): + F.register_kernel(tv_tensors.Image, F.resize) + + with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"): + F.register_kernel(F.resize, object) + + with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"): + F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image) + + class CustomTVTensor(tv_tensors.TVTensor): + pass + + def resize_custom_tv_tensor(): + pass + + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + with pytest.raises(ValueError, match="already has a kernel registered for type"): + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + +class TestGetKernel: + # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination + # would also be fine + KERNELS = { + torch.Tensor: F.resize_image, + PIL.Image.Image: F._resize_image_pil, + tv_tensors.Image: F.resize_image, + tv_tensors.BoundingBoxes: F.resize_bounding_boxes, + tv_tensors.Mask: F.resize_mask, + tv_tensors.Video: F.resize_video, + } + + @pytest.mark.parametrize("input_type", [str, int, object]) + def test_unsupported_types(self, input_type): + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, input_type) + + def test_exact_match(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check the exact matching afterwards. 
+ def resize_with_pure_kernels(): + pass + + for input_type, kernel in self.KERNELS.items(): + _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) + + assert _get_kernel(resize_with_pure_kernels, input_type) is kernel + + def test_builtin_tv_tensor_subclass(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched + # to the kernel of the corresponding superclass + def resize_with_pure_kernels(): + pass + + class MyImage(tv_tensors.Image): + pass + + class MyBoundingBoxes(tv_tensors.BoundingBoxes): + pass + + class MyMask(tv_tensors.Mask): + pass + + class MyVideo(tv_tensors.Video): + pass + + for custom_tv_tensor_subclass in [ + MyImage, + MyBoundingBoxes, + MyMask, + MyVideo, + ]: + builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] + builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] + _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( + builtin_tv_tensor_kernel + ) + + assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel + + def test_tv_tensor_subclass(self): + class MyTVTensor(tv_tensors.TVTensor): + pass + + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, MyTVTensor) + + def resize_my_tv_tensor(): + pass + + _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) + + assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor + + def test_pil_image_subclass(self): + opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + loaded_image = opened_image.convert("RGB") + + # check the assumptions + assert isinstance(opened_image, PIL.Image.Image) + assert type(opened_image) is not PIL.Image.Image + + assert type(loaded_image) is PIL.Image.Image + + size = [17, 11] + for image in [opened_image, loaded_image]: + kernel = _get_kernel(F.resize, type(image)) + + output = kernel(image, size=size) + + assert F.get_size(output) == size + + +class TestPermuteChannels: + _DEFAULT_PERMUTATION = [2, 0, 1] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.permute_channels_image, make_image_tensor), + # FIXME + # check_kernel does not support PIL kernel, but it should + (F.permute_channels_image, make_image), + (F.permute_channels_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.permute_channels_image, torch.Tensor), + (F._permute_channels_image_pil, PIL.Image.Image), + (F.permute_channels_image, tv_tensors.Image), + (F.permute_channels_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) + + def reference_image_correctness(self, image, permutation): + channel_images = image.split(1, dim=-3) + permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] + return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) + + @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) + @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) + def test_image_correctness(self, permutation, batch_dims): + image = make_image(batch_dims=batch_dims) + + actual = F.permute_channels(image, permutation=permutation) + expected = self.reference_image_correctness(image, permutation=permutation) + + torch.testing.assert_close(actual, expected) + + +class TestElastic: + def _make_displacement(self, inpt): + return torch.rand( + 1, + *F.get_size(inpt), + 2, + dtype=torch.float32, + device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", + ) + + @param_value_parametrization( + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + image = make_image_tensor(dtype=dtype, device=device) + + check_kernel( + F.elastic_image, + image, + displacement=self._make_displacement(image), + **{param: value}, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + check_cuda_vs_cpu=dtype is not torch.float16, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.elastic_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + displacement=self._make_displacement(bounding_boxes), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + mask = make_mask() + check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + input = make_input() + check_functional(F.elastic, input, displacement=self._make_displacement(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.elastic_image, torch.Tensor), + (F._elastic_image_pil, PIL.Image.Image), + (F.elastic_image, tv_tensors.Image), + (F.elastic_bounding_boxes, tv_tensors.BoundingBoxes), + (F.elastic_mask, tv_tensors.Mask), + (F.elastic_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def 
test_displacement_error(self, make_input): + input = make_input() + + with pytest.raises(TypeError, match="displacement should be a Tensor"): + F.elastic(input, displacement=None) + + with pytest.raises(ValueError, match="displacement shape should be"): + F.elastic(input, displacement=torch.rand(F.get_size(input))) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, size, device): + check_transform( + transforms.ElasticTransform(), + make_input(size, device=device), + # We updated gaussian blur kernel generation with a faster and numerically more stable version + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + +class TestToPureTensor: + def test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_masks(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, tv_tensors.TVTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) + else: + assert isinstance(out_value, type(input_value)) + + +class TestCrop: + INPUT_SIZE = (21, 11) + + CORRECTNESS_CROP_KWARGS = [ + # center + dict(top=5, left=5, height=10, width=5), + # larger than input, i.e. pad + dict(top=-5, left=-5, height=30, width=20), + # sides: left, right, top, bottom + dict(top=-5, left=-5, height=30, width=10), + dict(top=-5, left=5, height=30, width=10), + dict(top=-5, left=-5, height=20, width=20), + dict(top=5, left=-5, height=20, width=20), + # corners: top-left, top-right, bottom-left, bottom-right + dict(top=-5, left=-5, height=20, width=10), + dict(top=-5, left=5, height=20, width=10), + dict(top=5, left=-5, height=20, width=10), + dict(top=5, left=5, height=20, width=10), + ] + MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, kwargs, dtype, device): + check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_box(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + def test_kernel_video(self): + check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, 
make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.crop_image, torch.Tensor), + (F._crop_image_pil, PIL.Image.Image), + (F.crop_image, tv_tensors.Image), + (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.crop_mask, tv_tensors.Mask), + (F.crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + def test_functional_image_correctness(self, kwargs): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = F.crop(image, **kwargs) + expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) + + assert_equal(actual, expected) + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, param, value, make_input): + input = make_input(self.INPUT_SIZE) + + check_sample_input = True + if param == "fill": + if isinstance(value, (tuple, list)): + if isinstance(input, tv_tensors.Mask): + pytest.skip("F.pad_mask doesn't support non-scalar fill.") + else: + check_sample_input = False + + kwargs = dict( + # 1. size is required + # 2. the fill parameter only has an affect if we need padding + size=[s + 4 for s in self.INPUT_SIZE], + fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), + ) + else: + kwargs = {param: value} + + check_transform( + transforms.RandomCrop(**kwargs, pad_if_needed=True), + input, + check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), + check_sample_input=check_sample_input, + ) + + @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) + def test_transform_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) + def test_transform_insufficient_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 3 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + with pytest.raises(ValueError, match="larger than (padded )?input image size"): + transform(inpt) + + def test_transform_pad_if_needed(self): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s * 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, pad_if_needed=True) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=CORRECTNESS_FILLS, + padding_mode=["constant", "edge", "reflect", "symmetric"], + ) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, seed): + kwargs = {param: value} + if param != "size": + # 1. size is required + # 2. 
the fill / padding_mode parameters only have an affect if we need padding + kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] + if param == "fill": + kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) + + transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) + + image = make_image(self.INPUT_SIZE) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + assert_equal(actual, expected) + + def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_bounding_boxes_helper( + bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = F.crop(bounding_boxes, **kwargs) + expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): + input_size = [s * 2 for s in output_size] + bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_errors(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + +class TestErase: + INPUT_SIZE = (17, 11) + FUNCTIONAL_KWARGS = dict( + zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): 
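+        # FUNCTIONAL_KWARGS (see the class attribute above) erases a fixed 10x8 patch starting at i=2, j=2 with a scalar value of 0.0.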
+ check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, dtype, device): + input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) + input_version = input._version + + output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.erase_image, torch.Tensor), + (F._erase_image_pil, PIL.Image.Image), + (F.erase_image, tv_tensors.Image), + (F.erase_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + with pytest.warns(UserWarning, match="currently passing through inputs of type"): + check_transform( + transforms.RandomErasing(p=1), + input, + check_v1_compatibility=not isinstance(input, PIL.Image.Image), + ) + + def _reference_erase_image(self, image, *, i, j, h, w, v): + mask = torch.zeros_like(image, dtype=torch.bool) + mask[..., i : i + h, j : j + w] = True + + # The broadcasting and type casting logic is handled automagically in the kernel through indexing + value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) + + erased_image = torch.empty_like(image) + erased_image[mask] = value.flatten() + erased_image[~mask] = image[~mask] + + return erased_image + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dtype, device): + image = make_image(dtype=dtype, device=device) + + actual = F.erase(image, **self.FUNCTIONAL_KWARGS) + expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) + + assert_equal(actual, expected) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, dtype, device, seed): + transform = transforms.RandomErasing(**{param: value}, p=1) + + image = make_image(dtype=dtype, device=device) + + with freeze_rng_state(): + torch.manual_seed(seed) + # This emulates the random apply check that happens before _get_params is called + torch.rand(1) + 
params = transform._get_params([image]) + + torch.manual_seed(seed) + actual = transform(image) + + expected = self._reference_erase_image(image, **params) + + assert_equal(actual, expected) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform._get_params([make_image()]) + + +class TestGaussianBlur: + @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]]) + @pytest.mark.parametrize("sigma", [None, 1.0, 1, (0.5,), [0.3], (0.3, 0.7), [0.9, 0.2]]) + def test_kernel_image(self, kernel_size, sigma): + check_kernel( + F.gaussian_blur_image, + make_image(), + kernel_size=kernel_size, + sigma=sigma, + check_scripted_vs_eager=not (isinstance(kernel_size, int) or isinstance(sigma, (float, int))), + ) + + def test_kernel_image_errors(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="kernel_size is a sequence its length should be 2"): + F.gaussian_blur_image(image, kernel_size=[1, 2, 3]) + + for kernel_size in [2, -1]: + with pytest.raises(ValueError, match="kernel_size should have odd and positive integers"): + F.gaussian_blur_image(image, kernel_size=kernel_size) + + with pytest.raises(ValueError, match="sigma is a sequence, its length should be 2"): + F.gaussian_blur_image(image, kernel_size=1, sigma=[1, 2, 3]) + + with pytest.raises(TypeError, match="sigma should be either float or sequence of floats"): + F.gaussian_blur_image(image, kernel_size=1, sigma=object()) + + with pytest.raises(ValueError, match="sigma should have positive values"): + F.gaussian_blur_image(image, kernel_size=1, sigma=-1) + + def test_kernel_video(self): + check_kernel(F.gaussian_blur_video, make_video(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.gaussian_blur, make_input(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.gaussian_blur_image, torch.Tensor), + (F._gaussian_blur_image_pil, PIL.Image.Image), + (F.gaussian_blur_image, tv_tensors.Image), + (F.gaussian_blur_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.gaussian_blur, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("sigma", [5, 2.0, (0.5, 2), [1.3, 2.7]]) + def test_transform(self, make_input, device, sigma): + check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) + + def test_assertions(self): + with pytest.raises(ValueError, 
match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. Got 3"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.GaussianBlur(3, sigma={}) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) + def test__get_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform._get_params([]) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == sigma + elif isinstance(sigma, list) and len(sigma) == 1: + assert params["sigma"][0] == params["sigma"][1] == sigma[0] + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] + + # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # { + # "10_12_3__3_3_0.8": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8), + # "10_12_3__3_3_0.5": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5), + # "10_12_3__3_5_0.8": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8), + # "10_12_3__3_5_0.5": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5), + # "26_28_1__23_23_1.7": cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7), + # } + REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS = torch.load( + Path(__file__).parent / "assets" / "gaussian_blur_opencv_results.pt" + ) + + @pytest.mark.parametrize( + ("dimensions", "kernel_size", "sigma"), + [ + ((3, 10, 12), (3, 3), 0.8), + ((3, 10, 12), (3, 3), 0.5), + ((3, 10, 12), (3, 5), 0.8), + ((3, 10, 12), (3, 5), 0.5), + ((1, 26, 28), (23, 23), 1.7), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float64, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dimensions, kernel_size, sigma, dtype, device): + if dtype is torch.float16 and device == "cpu": + pytest.skip("The CPU implementation of float16 on CPU differs from opencv") + + num_channels, height, width = dimensions + + reference_results_key = f"{height}_{width}_{num_channels}__{kernel_size[0]}_{kernel_size[1]}_{sigma}" + expected = ( + torch.tensor(self.REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS[reference_results_key]) + .reshape(height, width, num_channels) + .permute(2, 0, 1) + .to(dtype=dtype, device=device) + ) + + image = tv_tensors.Image( + torch.arange(num_channels * height * width, dtype=torch.uint8) + .reshape(height, width, num_channels) + .permute(2, 0, 1), + dtype=dtype, + device=device, + ) + + actual = F.gaussian_blur_image(image, kernel_size=kernel_size, sigma=sigma) + + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + +class TestAutoAugmentTransforms: + # These transforms have a lot of branches in their `forward()` passes which are conditioned on random sampling. + # It's typically very hard to test the effect on some parameters without heavy mocking logic. 
+ # This class adds correctness tests for the kernels that are specific to those transforms. The rest of kernels, e.g. + # rotate, are tested in their respective classes. The rest of the tests here are mostly smoke tests. + + def _reference_shear_translate(self, image, *, transform_id, magnitude, interpolation, fill): + if isinstance(image, PIL.Image.Image): + input = image + else: + input = F.to_pil_image(image) + + matrix = { + "ShearX": (1, magnitude, 0, 0, 1, 0), + "ShearY": (1, 0, 0, magnitude, 1, 0), + "TranslateX": (1, 0, -int(magnitude), 0, 1, 0), + "TranslateY": (1, 0, 0, 0, 1, -int(magnitude)), + }[transform_id] + + output = input.transform( + input.size, PIL.Image.AFFINE, matrix, resample=pil_modes_mapping[interpolation], fill=fill + ) + + if isinstance(image, PIL.Image.Image): + return output + else: + return F.to_image(output) + + @pytest.mark.parametrize("transform_id", ["ShearX", "ShearY", "TranslateX", "TranslateY"]) + @pytest.mark.parametrize("magnitude", [0.3, -0.2, 0.0]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("input_type", ["Tensor", "PIL"]) + def test_correctness_shear_translate(self, transform_id, magnitude, interpolation, fill, input_type): + # ShearX/Y and TranslateX/Y are the only ops that are native to the AA transforms. They are modeled after the + # reference implementation: + # https://github.com/tensorflow/models/blob/885fda091c46c59d6c7bb5c7e760935eacc229da/research/autoaugment/augmentation_transforms.py#L273-L362 + # All other ops are checked in their respective dedicated tests. + + image = make_image(dtype=torch.uint8, device="cpu") + if input_type == "PIL": + image = F.to_pil_image(image) + + if "Translate" in transform_id: + # For TranslateX/Y magnitude is a value in pixels + magnitude *= min(F.get_size(image)) + + actual = transforms.AutoAugment()._apply_image_or_video_transform( + image, + transform_id=transform_id, + magnitude=magnitude, + interpolation=interpolation, + fill={type(image): fill}, + ) + expected = self._reference_shear_translate( + image, transform_id=transform_id, magnitude=magnitude, interpolation=interpolation, fill=fill + ) + + if input_type == "PIL": + actual, expected = F.to_image(actual), F.to_image(expected) + + if "Shear" in transform_id and input_type == "Tensor": + mae = (actual.float() - expected.float()).abs().mean() + assert mae < (12 if interpolation is transforms.InterpolationMode.NEAREST else 5) + else: + assert_close(actual, expected, rtol=0, atol=1) + + def _sample_input_adapter(self, transform, input, device): + adapted_input = {} + image_or_video_found = False + for key, value in input.items(): + if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)): + # AA transforms don't support bounding boxes or masks + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): + if image_or_video_found: + # AA transforms only support a single image or video + continue + image_or_video_found = True + adapted_input[key] = value + return adapted_input + + @pytest.mark.parametrize( + "transform", + [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()], + ) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + 
@pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_smoke(self, transform, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + input = make_input(dtype=dtype, device=device) + + with freeze_rng_state(): + # By default every test starts from the same random seed. This leads to minimal coverage of the sampling + # that happens inside forward(). To avoid calling the transform multiple times to achieve higher coverage, + # we build a reproducible random seed from the input type, dtype, and device. + torch.manual_seed(hash((make_input, dtype, device))) + + # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1 + # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks + # here and only check if we can script the v2 transform and subsequently call the result. + check_transform( + transform, input, check_v1_compatibility=False, check_sample_input=self._sample_input_adapter + ) + + if type(input) is torch.Tensor and dtype is torch.uint8: + _script(transform)(input) + + def test_auto_augment_policy_error(self): + with pytest.raises(ValueError, match="provided policy"): + transforms.AutoAugment(policy=None) + + @pytest.mark.parametrize("severity", [0, 11]) + def test_aug_mix_severity_error(self, severity): + with pytest.raises(ValueError, match="severity must be between"): + transforms.AugMix(severity=severity) + + +class TestConvertBoundingBoxFormat: + old_new_formats = list(itertools.permutations(iter(tv_tensors.BoundingBoxFormat), 2)) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel(self, old_format, new_format): + check_kernel( + F.convert_bounding_box_format, + make_bounding_boxes(format=old_format), + new_format=new_format, + old_format=old_format, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("inplace", [False, True]) + def test_kernel_noop(self, format, inplace): + input = make_bounding_boxes(format=format).as_subclass(torch.Tensor) + input_version = input._version + + output = F.convert_bounding_box_format(input, old_format=format, new_format=format, inplace=inplace) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel_inplace(self, old_format, new_format): + input = make_bounding_boxes(format=old_format).as_subclass(torch.Tensor) + input_version = input._version + + output_out_of_place = F.convert_bounding_box_format(input, old_format=old_format, new_format=new_format) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.convert_bounding_box_format( + input, old_format=old_format, new_format=new_format, inplace=True + ) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_functional(self, old_format, new_format): + check_functional(F.convert_bounding_box_format, make_bounding_boxes(format=old_format), 
new_format=new_format) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("format_type", ["enum", "str"]) + def test_transform(self, old_format, new_format, format_type): + check_transform( + transforms.ConvertBoundingBoxFormat(new_format.name if format_type == "str" else new_format), + make_bounding_boxes(format=old_format), + ) + + def _reference_convert_bounding_box_format(self, bounding_boxes, new_format): + return tv_tensors.wrap( + torchvision.ops.box_convert( + bounding_boxes.as_subclass(torch.Tensor), + in_fmt=bounding_boxes.format.name.lower(), + out_fmt=new_format.name.lower(), + ).to(bounding_boxes.dtype), + like=bounding_boxes, + format=new_format, + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn_type", ["functional", "transform"]) + def test_correctness(self, old_format, new_format, dtype, device, fn_type): + bounding_boxes = make_bounding_boxes(format=old_format, dtype=dtype, device=device) + + if fn_type == "functional": + fn = functools.partial(F.convert_bounding_box_format, new_format=new_format) + else: + fn = transforms.ConvertBoundingBoxFormat(format=new_format) + + actual = fn(bounding_boxes) + expected = self._reference_convert_bounding_box_format(bounding_boxes, new_format) + + assert_equal(actual, expected) + + def test_errors(self): + input_tv_tensor = make_bounding_boxes() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + + for input in [input_tv_tensor, input_pure_tensor]: + with pytest.raises(TypeError, match="missing 1 required argument: 'new_format'"): + F.convert_bounding_box_format(input) + + with pytest.raises(ValueError, match="`old_format` has to be passed"): + F.convert_bounding_box_format(input_pure_tensor, new_format=input_tv_tensor.format) + + with pytest.raises(ValueError, match="`old_format` must not be passed"): + F.convert_bounding_box_format( + input_tv_tensor, old_format=input_tv_tensor.format, new_format=input_tv_tensor.format + ) + + +class TestResizedCrop: + INPUT_SIZE = (17, 11) + CROP_KWARGS = dict(top=2, left=2, height=5, width=7) + OUTPUT_SIZE = (19, 32) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.resized_crop_image, make_image), + (F.resized_crop_bounding_boxes, make_bounding_boxes), + (F.resized_crop_mask, make_segmentation_mask), + (F.resized_crop_mask, make_detection_masks), + (F.resized_crop_video, make_video), + ], + ) + def test_kernel(self, kernel, make_input): + input = make_input(self.INPUT_SIZE) + if isinstance(input, tv_tensors.BoundingBoxes): + extra_kwargs = dict(format=input.format) + elif isinstance(input, tv_tensors.Mask): + extra_kwargs = dict() + else: + extra_kwargs = dict(antialias=True) + + check_kernel(kernel, input, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, **extra_kwargs) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional( + F.resized_crop, make_input(self.INPUT_SIZE), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, antialias=True + ) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resized_crop_image, torch.Tensor), + (F._resized_crop_image_pil, PIL.Image.Image), + (F.resized_crop_image, tv_tensors.Image), + (F.resized_crop_bounding_boxes, tv_tensors.BoundingBoxes), + 
(F.resized_crop_mask, tv_tensors.Mask), + (F.resized_crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + ) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, param, value, make_input): + check_transform( + transforms.RandomResizedCrop(size=self.OUTPUT_SIZE, **{param: value}, antialias=True), + make_input(self.INPUT_SIZE), + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. + # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + def test_functional_image_correctness(self, interpolation): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = F.resized_crop( + image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True + ) + expected = F.to_image( + F.resized_crop( + F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation + ) + ) + + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size): + new_height, new_width = size + + crop_affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + [0, 0, 1], + ], + ) + resize_affine_matrix = np.array( + [ + [new_width / width, 0, 0], + [0, new_height / height, 0], + [0, 0, 1], + ], + ) + affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] + + return reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=size, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_functional_bounding_boxes_correctness(self, format): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) + + actual = F.resized_crop(bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + expected = self._reference_resized_crop_bounding_boxes( + bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE + ) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_transform_errors_warnings(self): + with pytest.raises(ValueError, match="provide only two dimensions"): + transforms.RandomResizedCrop(size=(1, 2, 3)) + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=123) + + for param in ["scale", "ratio"]: + with pytest.warns(match="Scale and ratio should be of kind"): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, **{param: [1, 0]}) + + +class TestPad: + EXHAUSTIVE_TYPE_PADDINGS = [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + CORRECTNESS_PADDINGS = [ + padding + for padding in EXHAUSTIVE_TYPE_PADDINGS + if isinstance(padding, int) or isinstance(padding, list) and len(padding) > 1 + ] + PADDING_MODES = ["constant", "symmetric", "edge", "reflect"] + + @param_value_parametrization( + 
padding=EXHAUSTIVE_TYPE_PADDINGS, + fill=EXHAUSTIVE_TYPE_FILLS, + padding_mode=PADDING_MODES, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + kwargs = {param: value} + if param != "padding": + kwargs["padding"] = [1] + + image = make_image(dtype=dtype, device=device) + + check_kernel( + F.pad_image, + image, + **kwargs, + check_scripted_vs_eager=not ( + (param == "padding" and isinstance(value, int)) + # See https://github.com/pytorch/vision/pull/7252#issue-1585585521 for details + or ( + param == "fill" + and ( + isinstance(value, tuple) or (isinstance(value, list) and any(isinstance(v, int) for v in value)) + ) + ) + ), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, format): + bounding_boxes = make_bounding_boxes(format=format) + check_kernel( + F.pad_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + ) + + @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) + def test_kernel_bounding_boxes_errors(self, padding_mode): + bounding_boxes = make_bounding_boxes() + with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): + F.pad_bounding_boxes( + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + padding_mode=padding_mode, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.pad_mask, make_mask(), padding=[1]) + + @pytest.mark.parametrize("fill", [[1], (0,), [1, 0, 1], (0, 1, 0)]) + def test_kernel_mask_errors(self, fill): + with pytest.raises(ValueError, match="Non-scalar fill value is not supported"): + check_kernel(F.pad_mask, make_segmentation_mask(), padding=[1], fill=fill) + + def test_kernel_video(self): + check_kernel(F.pad_video, make_video(), padding=[1]) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.pad, make_input(), padding=[1]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.pad_image, torch.Tensor), + # The PIL kernel uses fill=0 as default rather than fill=None as all others. + # Since the whole fill story is already really inconsistent, we won't introduce yet another case to allow + # for this test to pass. + # See https://github.com/pytorch/vision/issues/6623 for a discussion. 
+ # (F._pad_image_pil, PIL.Image.Image), + (F.pad_image, tv_tensors.Image), + (F.pad_bounding_boxes, tv_tensors.BoundingBoxes), + (F.pad_mask, tv_tensors.Mask), + (F.pad_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.pad, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, make_input): + check_transform(transforms.Pad(padding=[1]), make_input()) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize( + ("padding_mode", "fill"), + [ + *[("constant", fill) for fill in CORRECTNESS_FILLS], + *[(padding_mode, None) for padding_mode in ["symmetric", "edge", "reflect"]], + ], + ) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_image_correctness(self, padding, padding_mode, fill, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = fn(image, padding=padding, padding_mode=padding_mode, fill=fill) + expected = F.to_image(F.pad(F.to_pil_image(image), padding=padding, padding_mode=padding_mode, fill=fill)) + + assert_equal(actual, expected) + + def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding): + if isinstance(padding, int): + padding = [padding] + left, top, right, bottom = padding * (4 // len(padding)) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + ) + + height = bounding_boxes.canvas_size[0] + top + bottom + width = bounding_boxes.canvas_size[1] + left + right + + return reference_affine_bounding_boxes_helper( + bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + actual = fn(bounding_boxes, padding=padding) + expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding) + + assert_equal(actual, expected) + + +class TestCenterCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [(3, 5), (5, 3), (4, 4), (21, 9), (13, 15), (19, 14), 3, (4,), [5], INPUT_SIZE] + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, output_size, dtype, device): + check_kernel( + F.center_crop_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + output_size=output_size, + check_scripted_vs_eager=not 
isinstance(output_size, int), + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, output_size, format): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) + check_kernel( + F.center_crop_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.center_crop_mask, make_mask(), output_size=self.OUTPUT_SIZES[0]) + + def test_kernel_video(self): + check_kernel(F.center_crop_video, make_video(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.center_crop, make_input(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.center_crop_image, torch.Tensor), + (F._center_crop_image_pil, PIL.Image.Image), + (F.center_crop_image, tv_tensors.Image), + (F.center_crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.center_crop_mask, tv_tensors.Mask), + (F.center_crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, make_input): + check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_image_correctness(self, output_size, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, output_size) + expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size)) + + assert_equal(actual, expected) + + def _reference_center_crop_bounding_boxes(self, bounding_boxes, output_size): + image_height, image_width = bounding_boxes.canvas_size + if isinstance(output_size, int): + output_size = (output_size, output_size) + elif len(output_size) == 1: + output_size *= 2 + crop_height, crop_width = output_size + + top = int(round((image_height - crop_height) / 2)) + left = int(round((image_width - crop_width) / 2)) + + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_bounding_boxes_helper( + bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_bounding_boxes_correctness(self, output_size, format, dtype, device, fn): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + 
+ actual = fn(bounding_boxes, output_size) + expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size) + + assert_equal(actual, expected) + + +class TestPerspective: + COEFFICIENTS = [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], + ] + START_END_POINTS = [ + ([[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]), + ] + MINIMAL_KWARGS = dict(startpoints=None, endpoints=None, coefficients=COEFFICIENTS[0]) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + if param == "fill": + kwargs["coefficients"] = self.COEFFICIENTS[0] + + check_kernel( + F.perspective_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + def test_kernel_image_error(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="startpoints/endpoints or the coefficients must have non `None` values"): + F.perspective_image(image, startpoints=None, endpoints=None) + + with pytest.raises( + ValueError, match="startpoints/endpoints and the coefficients shouldn't be defined concurrently" + ): + startpoints, endpoints = self.START_END_POINTS[0] + coefficients = self.COEFFICIENTS[0] + F.perspective_image(image, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients) + + with pytest.raises(ValueError, match="coefficients should have 8 float values"): + F.perspective_image(image, startpoints=None, endpoints=None, coefficients=list(range(7))) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, param, value, format): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + + bounding_boxes = make_bounding_boxes(format=format) + + check_kernel( + F.perspective_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + def test_kernel_bounding_boxes_error(self): + bounding_boxes = make_bounding_boxes() + format, canvas_size = bounding_boxes.format, bounding_boxes.canvas_size + bounding_boxes = bounding_boxes.as_subclass(torch.Tensor) + + with pytest.raises(RuntimeError, match="Denominator is zero"): + F.perspective_bounding_boxes( + bounding_boxes, + format=format, + canvas_size=canvas_size, + startpoints=None, + endpoints=None, + coefficients=[0.0] * 8, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.perspective_mask, make_mask(), **self.MINIMAL_KWARGS) + + def test_kernel_video(self): + check_kernel(F.perspective_video, make_video(), 
**self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.perspective, make_input(), **self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.perspective_image, torch.Tensor), + (F._perspective_image_pil, PIL.Image.Image), + (F.perspective_image, tv_tensors.Image), + (F.perspective_bounding_boxes, tv_tensors.BoundingBoxes), + (F.perspective_mask, tv_tensors.Mask), + (F.perspective_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.perspective, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("distortion_scale", [0.5, 0.0, 1.0]) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, distortion_scale, make_input): + check_transform(transforms.RandomPerspective(distortion_scale=distortion_scale, p=1), make_input()) + + @pytest.mark.parametrize("distortion_scale", [-1, 2]) + def test_transform_error(self, distortion_scale): + with pytest.raises(ValueError, match="distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=distortion_scale) + + @pytest.mark.parametrize("coefficients", COEFFICIENTS) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_image_functional_correctness(self, coefficients, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.perspective( + image, startpoints=None, endpoints=None, coefficients=coefficients, interpolation=interpolation, fill=fill + ) + expected = F.to_image( + F.perspective( + F.to_pil_image(image), + startpoints=None, + endpoints=None, + coefficients=coefficients, + interpolation=interpolation, + fill=fill, + ) + ) + + if interpolation is transforms.InterpolationMode.BILINEAR: + abs_diff = (actual.float() - expected.float()).abs() + assert (abs_diff > 1).float().mean() < 7e-2 + mae = abs_diff.mean() + assert mae < 3 + else: + assert_equal(actual, expected) + + def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints): + format = bounding_boxes.format + canvas_size = bounding_boxes.canvas_size + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + coefficients = _get_perspective_coeffs(endpoints, startpoints) + + def perspective_bounding_boxes(bounding_boxes): + m1 = np.array( + [ + [coefficients[0], coefficients[1], coefficients[2]], + [coefficients[3], coefficients[4], coefficients[5]], + ] + ) + m2 = np.array( + [ + [coefficients[6], coefficients[7], 1.0], + [coefficients[6], coefficients[7], 1.0], + ] + ) + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + + numerator = points @ m1.T + denominator = points @ m2.T + transformed_points = numerator / 
denominator
+
+            output_xyxy = torch.Tensor(
+                [
+                    float(np.min(transformed_points[:, 0])),
+                    float(np.min(transformed_points[:, 1])),
+                    float(np.max(transformed_points[:, 0])),
+                    float(np.max(transformed_points[:, 1])),
+                ]
+            )
+
+            output = F.convert_bounding_box_format(
+                output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format
+            )
+
+            # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
+            return F.clamp_bounding_boxes(
+                output,
+                format=format,
+                canvas_size=canvas_size,
+            ).to(dtype=dtype, device=device)
+
+        return tv_tensors.BoundingBoxes(
+            torch.cat([perspective_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape(
+                bounding_boxes.shape
+            ),
+            format=format,
+            canvas_size=canvas_size,
+        )
+
+    @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS)
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device):
+        bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device)
+
+        actual = F.perspective(bounding_boxes, startpoints=startpoints, endpoints=endpoints)
+        expected = self._reference_perspective_bounding_boxes(
+            bounding_boxes, startpoints=startpoints, endpoints=endpoints
+        )
+
+        assert_close(actual, expected, rtol=0, atol=1)
+
+
+class TestEqualize:
+    @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel_image(self, dtype, device):
+        check_kernel(F.equalize_image, make_image(dtype=dtype, device=device))
+
+    def test_kernel_video(self):
+        check_kernel(F.equalize_video, make_video())
+
+    @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video])
+    def test_functional(self, make_input):
+        check_functional(F.equalize, make_input())
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.equalize_image, torch.Tensor),
+            (F._equalize_image_pil, PIL.Image.Image),
+            (F.equalize_image, tv_tensors.Image),
+            (F.equalize_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.equalize, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_video],
+    )
+    def test_transform(self, make_input):
+        check_transform(transforms.RandomEqualize(p=1), make_input())
+
+    @pytest.mark.parametrize(("low", "high"), [(0, 64), (64, 192), (192, 256), (0, 1), (127, 128), (255, 256)])
+    @pytest.mark.parametrize("fn", [F.equalize, transform_cls_to_functional(transforms.RandomEqualize, p=1)])
+    def test_image_correctness(self, low, high, fn):
+        # We are not using the default `make_image` here since that uniformly samples the values over the whole value
+        # range. Since the whole point of F.equalize is to transform an arbitrary distribution of values into a uniform
+        # one over the full range, the information gain is low if we already provide something really close to the
+        # expected value.
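+        # Instead, sample the values from a narrow [low, high) band so that equalization actually has to stretch the histogram.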
+        image = tv_tensors.Image(
+            torch.testing.make_tensor((3, 117, 253), dtype=torch.uint8, device="cpu", low=low, high=high)
+        )
+
+        actual = fn(image)
+        expected = F.to_image(F.equalize(F.to_pil_image(image)))
+
+        assert_equal(actual, expected)
+
+
+class TestUniformTemporalSubsample:
+    def test_kernel_video(self):
+        check_kernel(F.uniform_temporal_subsample_video, make_video(), num_samples=2)
+
+    @pytest.mark.parametrize("make_input", [make_video_tensor, make_video])
+    def test_functional(self, make_input):
+        check_functional(F.uniform_temporal_subsample, make_input(), num_samples=2)
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.uniform_temporal_subsample_video, torch.Tensor),
+            (F.uniform_temporal_subsample_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.uniform_temporal_subsample, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize("make_input", [make_video_tensor, make_video])
+    def test_transform(self, make_input):
+        check_transform(transforms.UniformTemporalSubsample(num_samples=2), make_input())
+
+    def _reference_uniform_temporal_subsample_video(self, video, *, num_samples):
+        # Adapted from
+        # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19
+        t = video.shape[-4]
+        assert num_samples > 0 and t > 0
+        # Sample by nearest neighbor interpolation if num_samples > t.
+        indices = torch.linspace(0, t - 1, num_samples, device=video.device)
+        indices = torch.clamp(indices, 0, t - 1).long()
+        return tv_tensors.Video(torch.index_select(video, -4, indices))
+
+    CORRECTNESS_NUM_FRAMES = 5
+
+    @pytest.mark.parametrize("num_samples", list(range(1, CORRECTNESS_NUM_FRAMES + 1)))
+    @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    @pytest.mark.parametrize(
+        "fn", [F.uniform_temporal_subsample, transform_cls_to_functional(transforms.UniformTemporalSubsample)]
+    )
+    def test_video_correctness(self, num_samples, dtype, device, fn):
+        video = make_video(num_frames=self.CORRECTNESS_NUM_FRAMES, dtype=dtype, device=device)
+
+        actual = fn(video, num_samples=num_samples)
+        expected = self._reference_uniform_temporal_subsample_video(video, num_samples=num_samples)
+
+        assert_equal(actual, expected)
+
+
+class TestNormalize:
+    MEANS_STDS = [
+        ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]),
+    ]
+    MEAN, STD = MEANS_STDS[0]
+
+    @pytest.mark.parametrize(("mean", "std"), [*MEANS_STDS, (0.5, 2.0)])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel_image(self, mean, std, device):
+        check_kernel(F.normalize_image, make_image(dtype=torch.float32, device=device), mean=self.MEAN, std=self.STD)
+
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel_image_inplace(self, device):
+        input = make_image_tensor(dtype=torch.float32, device=device)
+        input_version = input._version
+
+        output_out_of_place = F.normalize_image(input, mean=self.MEAN, std=self.STD)
+        assert output_out_of_place.data_ptr() != input.data_ptr()
+        assert output_out_of_place is not input
+
+        output_inplace = F.normalize_image(input, mean=self.MEAN, std=self.STD, inplace=True)
+        assert output_inplace.data_ptr() == input.data_ptr()
+        assert output_inplace._version > input_version
+        assert output_inplace is input
+
+        assert_equal(output_inplace, output_out_of_place)
+
+    def test_kernel_video(self):
+        check_kernel(F.normalize_video, make_video(dtype=torch.float32), mean=self.MEAN, std=self.STD)
+
+    @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video])
+    def test_functional(self, make_input):
+        check_functional(F.normalize, make_input(dtype=torch.float32), mean=self.MEAN, std=self.STD)
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.normalize_image, torch.Tensor),
+            (F.normalize_image, tv_tensors.Image),
+            (F.normalize_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.normalize, kernel=kernel, input_type=input_type)
+
+    def test_functional_error(self):
+        with pytest.raises(TypeError, match="should be a float tensor"):
+            F.normalize_image(make_image(dtype=torch.uint8), mean=self.MEAN, std=self.STD)
+
+        with pytest.raises(ValueError, match="tensor image of size"):
+            F.normalize_image(torch.rand(16, 16, dtype=torch.float32), mean=self.MEAN, std=self.STD)
+
+        for std in [0, [0, 0, 0], [0, 1, 1]]:
+            with pytest.raises(ValueError, match="std evaluated to zero, leading to division by zero"):
+                F.normalize_image(make_image(dtype=torch.float32), mean=self.MEAN, std=std)
+
+    def _sample_input_adapter(self, transform, input, device):
+        adapted_input = {}
+        for key, value in input.items():
+            if isinstance(value, PIL.Image.Image):
+                # normalize doesn't support PIL images
+                continue
+            elif check_type(value, (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)):
+                # normalize doesn't support integer images
+                value = F.to_dtype(value, torch.float32, scale=True)
+            adapted_input[key] = value
+        return adapted_input
+
+    @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video])
+    def test_transform(self, make_input):
+        check_transform(
+            transforms.Normalize(mean=self.MEAN, std=self.STD),
+            make_input(dtype=torch.float32),
+            check_sample_input=self._sample_input_adapter,
+        )
+
+    def _reference_normalize_image(self, image, *, mean, std):
+        image = image.numpy()
+        mean, std = [np.array(stat, dtype=image.dtype).reshape((-1, 1, 1)) for stat in [mean, std]]
+        return tv_tensors.Image((image - mean) / std)
+
+    @pytest.mark.parametrize(("mean", "std"), MEANS_STDS)
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
+    @pytest.mark.parametrize("fn", [F.normalize, transform_cls_to_functional(transforms.Normalize)])
+    def test_correctness_image(self, mean, std, dtype, fn):
+        image = make_image(dtype=dtype)
+
+        actual = fn(image, mean=mean, std=std)
+        expected = self._reference_normalize_image(image, mean=mean, std=std)
+
+        assert_equal(actual, expected)
+
+
+class TestClampBoundingBoxes:
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel(self, format, dtype, device):
+        bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device)
+        check_kernel(
+            F.clamp_bounding_boxes,
+            bounding_boxes,
+            format=bounding_boxes.format,
+            canvas_size=bounding_boxes.canvas_size,
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    def test_functional(self, format):
+        check_functional(F.clamp_bounding_boxes, make_bounding_boxes(format=format))
+
+    def test_errors(self):
+        input_tv_tensor = make_bounding_boxes()
+        input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor)
+        format, canvas_size = input_tv_tensor.format,
input_tv_tensor.canvas_size + + for format_, canvas_size_ in [(None, None), (format, None), (None, canvas_size)]: + with pytest.raises( + ValueError, match="For pure tensor inputs, `format` and `canvas_size` have to be passed." + ): + F.clamp_bounding_boxes(input_pure_tensor, format=format_, canvas_size=canvas_size_) + + for format_, canvas_size_ in [(format, canvas_size), (format, None), (None, canvas_size)]: + with pytest.raises( + ValueError, match="For bounding box tv_tensor inputs, `format` and `canvas_size` must not be passed." + ): + F.clamp_bounding_boxes(input_tv_tensor, format=format_, canvas_size=canvas_size_) + + def test_transform(self): + check_transform(transforms.ClampBoundingBoxes(), make_bounding_boxes()) + + +class TestInvert: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.invert_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.invert_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.invert, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.invert_image, torch.Tensor), + (F._invert_image_pil, PIL.Image.Image), + (F.invert_image, tv_tensors.Image), + (F.invert_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.invert, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomInvert(p=1), make_input()) + + @pytest.mark.parametrize("fn", [F.invert, transform_cls_to_functional(transforms.RandomInvert, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.invert(F.to_pil_image(image))) + + assert_equal(actual, expected) + + +class TestPosterize: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.posterize_image, make_image(dtype=dtype, device=device), bits=1) + + def test_kernel_video(self): + check_kernel(F.posterize_video, make_video(), bits=1) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.posterize, make_input(), bits=1) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.posterize_image, torch.Tensor), + (F._posterize_image_pil, PIL.Image.Image), + (F.posterize_image, tv_tensors.Image), + (F.posterize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.posterize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomPosterize(bits=1, p=1), make_input()) + + @pytest.mark.parametrize("bits", [1, 4, 8]) + @pytest.mark.parametrize("fn", [F.posterize, transform_cls_to_functional(transforms.RandomPosterize, p=1)]) + def test_correctness_image(self, bits, fn): + image = 
make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, bits=bits) + expected = F.to_image(F.posterize(F.to_pil_image(image), bits=bits)) + + assert_equal(actual, expected) + + +class TestSolarize: + def _make_threshold(self, input, *, factor=0.5): + dtype = input.dtype if isinstance(input, torch.Tensor) else torch.uint8 + return (float if dtype.is_floating_point else int)(get_max_value(dtype) * factor) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + image = make_image(dtype=dtype, device=device) + check_kernel(F.solarize_image, image, threshold=self._make_threshold(image)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.solarize_video, video, threshold=self._make_threshold(video)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + input = make_input() + check_functional(F.solarize, input, threshold=self._make_threshold(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.solarize_image, torch.Tensor), + (F._solarize_image_pil, PIL.Image.Image), + (F.solarize_image, tv_tensors.Image), + (F.solarize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.solarize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize(("dtype", "threshold"), [(torch.uint8, 256), (torch.float, 1.5)]) + def test_functional_error(self, dtype, threshold): + with pytest.raises(TypeError, match="Threshold should be less or equal the maximum value of the dtype"): + F.solarize(make_image(dtype=dtype), threshold=threshold) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + input = make_input() + check_transform(transforms.RandomSolarize(threshold=self._make_threshold(input), p=1), input) + + @pytest.mark.parametrize("threshold_factor", [0.0, 0.1, 0.5, 0.9, 1.0]) + @pytest.mark.parametrize("fn", [F.solarize, transform_cls_to_functional(transforms.RandomSolarize, p=1)]) + def test_correctness_image(self, threshold_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + threshold = self._make_threshold(image, factor=threshold_factor) + + actual = fn(image, threshold=threshold) + expected = F.to_image(F.solarize(F.to_pil_image(image), threshold=threshold)) + + assert_equal(actual, expected) + + +class TestAutocontrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.autocontrast_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.autocontrast_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.autocontrast, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.autocontrast_image, torch.Tensor), + (F._autocontrast_image_pil, PIL.Image.Image), + (F.autocontrast_image, tv_tensors.Image), + (F.autocontrast_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.autocontrast, kernel=kernel, input_type=input_type) + + 
@pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAutocontrast(p=1), make_input(), check_v1_compatibility=dict(rtol=0, atol=1)) + + @pytest.mark.parametrize("fn", [F.autocontrast, transform_cls_to_functional(transforms.RandomAutocontrast, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.autocontrast(F.to_pil_image(image))) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustSharpness: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_sharpness_image, make_image(dtype=dtype, device=device), sharpness_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_sharpness_video, make_video(), sharpness_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_sharpness, make_input(), sharpness_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_sharpness_image, torch.Tensor), + (F._adjust_sharpness_image_pil, PIL.Image.Image), + (F.adjust_sharpness_image, tv_tensors.Image), + (F.adjust_sharpness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_sharpness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1), make_input()) + + def test_functional_error(self): + with pytest.raises(TypeError, match="can have 1 or 3 channels"): + F.adjust_sharpness(make_image(color_space="RGBA"), sharpness_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_sharpness(make_image(), sharpness_factor=-1) + + @pytest.mark.parametrize("sharpness_factor", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize( + "fn", [F.adjust_sharpness, transform_cls_to_functional(transforms.RandomAdjustSharpness, p=1)] + ) + def test_correctness_image(self, sharpness_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, sharpness_factor=sharpness_factor) + expected = F.to_image(F.adjust_sharpness(F.to_pil_image(image), sharpness_factor=sharpness_factor)) + + assert_equal(actual, expected) + + +class TestAdjustContrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_contrast_image, make_image(dtype=dtype, device=device), contrast_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_contrast_video, make_video(), contrast_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_contrast, make_input(), contrast_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_contrast_image, torch.Tensor), + (F._adjust_contrast_image_pil, PIL.Image.Image), + (F.adjust_contrast_image, tv_tensors.Image), + (F.adjust_contrast_video, 
tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_contrast, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_contrast(make_image(color_space="RGBA"), contrast_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_contrast(make_image(), contrast_factor=-1) + + @pytest.mark.parametrize("contrast_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, contrast_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_contrast(image, contrast_factor=contrast_factor) + expected = F.to_image(F.adjust_contrast(F.to_pil_image(image), contrast_factor=contrast_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustGamma: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_gamma_image, make_image(dtype=dtype, device=device), gamma=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_gamma_video, make_video(), gamma=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_gamma, make_input(), gamma=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_gamma_image, torch.Tensor), + (F._adjust_gamma_image_pil, PIL.Image.Image), + (F.adjust_gamma_image, tv_tensors.Image), + (F.adjust_gamma_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_gamma, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(ValueError, match="Gamma should be a non-negative real number"): + F.adjust_gamma(make_image(), gamma=-1) + + @pytest.mark.parametrize("gamma", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize("gain", [0.1, 1.0, 2.0]) + def test_correctness_image(self, gamma, gain): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_gamma(image, gamma=gamma, gain=gain) + expected = F.to_image(F.adjust_gamma(F.to_pil_image(image), gamma=gamma, gain=gain)) + + assert_equal(actual, expected) + + +class TestAdjustHue: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_hue_image, make_image(dtype=dtype, device=device), hue_factor=0.25) + + def test_kernel_video(self): + check_kernel(F.adjust_hue_video, make_video(), hue_factor=0.25) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_hue, make_input(), hue_factor=0.25) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_hue_image, torch.Tensor), + (F._adjust_hue_image_pil, PIL.Image.Image), + (F.adjust_hue_image, tv_tensors.Image), + (F.adjust_hue_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_hue, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + 
F.adjust_hue(make_image(color_space="RGBA"), hue_factor=0.25) + + for hue_factor in [-1, 1]: + with pytest.raises(ValueError, match=re.escape("is not in [-0.5, 0.5]")): + F.adjust_hue(make_image(), hue_factor=hue_factor) + + @pytest.mark.parametrize("hue_factor", [-0.5, -0.3, 0.0, 0.2, 0.5]) + def test_correctness_image(self, hue_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_hue(image, hue_factor=hue_factor) + expected = F.to_image(F.adjust_hue(F.to_pil_image(image), hue_factor=hue_factor)) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestAdjustSaturation: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_saturation_image, make_image(dtype=dtype, device=device), saturation_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_saturation_video, make_video(), saturation_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_saturation, make_input(), saturation_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_saturation_image, torch.Tensor), + (F._adjust_saturation_image_pil, PIL.Image.Image), + (F.adjust_saturation_image, tv_tensors.Image), + (F.adjust_saturation_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_saturation, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_saturation(make_image(color_space="RGBA"), saturation_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_saturation(make_image(), saturation_factor=-1) + + @pytest.mark.parametrize("saturation_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, saturation_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_saturation(image, saturation_factor=saturation_factor) + expected = F.to_image(F.adjust_saturation(F.to_pil_image(image), saturation_factor=saturation_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestFiveTenCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZE = (3, 5) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("kernel", [F.five_crop_image, F.ten_crop_image]) + def test_kernel_image(self, dtype, device, kernel): + check_kernel( + kernel, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=self.OUTPUT_SIZE, + check_batched_vs_unbatched=False, + ) + + @pytest.mark.parametrize("kernel", [F.five_crop_video, F.ten_crop_video]) + def test_kernel_video(self, kernel): + check_kernel(kernel, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZE, check_batched_vs_unbatched=False) + + def _functional_wrapper(self, fn): + # This wrapper is needed to make five_crop / ten_crop compatible with check_functional, since that requires a + # single output rather than a sequence. 
+ @functools.wraps(fn) + def wrapper(*args, **kwargs): + outputs = fn(*args, **kwargs) + return outputs[0] + + return wrapper + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("functional", [F.five_crop, F.ten_crop]) + def test_functional(self, make_input, functional): + check_functional( + self._functional_wrapper(functional), + make_input(self.INPUT_SIZE), + size=self.OUTPUT_SIZE, + check_scripted_smoke=False, + ) + + @pytest.mark.parametrize( + ("functional", "kernel", "input_type"), + [ + (F.five_crop, F.five_crop_image, torch.Tensor), + (F.five_crop, F._five_crop_image_pil, PIL.Image.Image), + (F.five_crop, F.five_crop_image, tv_tensors.Image), + (F.five_crop, F.five_crop_video, tv_tensors.Video), + (F.ten_crop, F.ten_crop_image, torch.Tensor), + (F.ten_crop, F._ten_crop_image_pil, PIL.Image.Image), + (F.ten_crop, F.ten_crop_image, tv_tensors.Image), + (F.ten_crop, F.ten_crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, functional, kernel, input_type): + check_functional_kernel_signature_match(functional, kernel=kernel, input_type=input_type) + + class _TransformWrapper(nn.Module): + # This wrapper is needed to make FiveCrop / TenCrop compatible with check_transform, since that requires a + # single output rather than a sequence. + _v1_transform_cls = None + + def _extract_params_for_v1_transform(self): + return dict(five_ten_crop_transform=self.five_ten_crop_transform) + + def __init__(self, five_ten_crop_transform): + super().__init__() + type(self)._v1_transform_cls = type(self) + self.five_ten_crop_transform = five_ten_crop_transform + + def forward(self, input: torch.Tensor) -> torch.Tensor: + outputs = self.five_ten_crop_transform(input) + return outputs[0] + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform(self, make_input, transform_cls): + check_transform( + self._TransformWrapper(transform_cls(size=self.OUTPUT_SIZE)), + make_input(self.INPUT_SIZE), + check_sample_input=False, + ) + + @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_masks]) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform_error(self, make_input, transform_cls): + transform = transform_cls(size=self.OUTPUT_SIZE) + + with pytest.raises(TypeError, match="not supported"): + transform(make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("fn", [F.five_crop, transform_cls_to_functional(transforms.FiveCrop)]) + def test_correctness_image_five_crop(self, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, size=self.OUTPUT_SIZE) + expected = F.five_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + @pytest.mark.parametrize("fn_or_class", [F.ten_crop, transforms.TenCrop]) + @pytest.mark.parametrize("vertical_flip", [False, True]) + def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip): + if fn_or_class is transforms.TenCrop: + fn = transform_cls_to_functional(fn_or_class, size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + kwargs = dict() + else: + fn = fn_or_class + kwargs = dict(size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + image = make_image(self.INPUT_SIZE, 
dtype=torch.uint8, device="cpu") + + actual = fn(image, **kwargs) + expected = F.ten_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + +class TestColorJitter: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + check_transform( + transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25), + make_input(dtype=dtype, device=device), + ) + + def test_transform_noop(self): + input = make_image() + input_version = input._version + + transform = transforms.ColorJitter() + output = transform(input) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + def test_transform_error(self): + with pytest.raises(ValueError, match="must be non negative"): + transforms.ColorJitter(brightness=-1) + + for brightness in [object(), [1, 2, 3]]: + with pytest.raises(TypeError, match="single number or a sequence with length 2"): + transforms.ColorJitter(brightness=brightness) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(brightness=(-1, 0.5)) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(hue=1) + + @pytest.mark.parametrize("brightness", [None, 0.1, (0.2, 0.3)]) + @pytest.mark.parametrize("contrast", [None, 0.4, (0.5, 0.6)]) + @pytest.mark.parametrize("saturation", [None, 0.7, (0.8, 0.9)]) + @pytest.mark.parametrize("hue", [None, 0.3, (-0.1, 0.2)]) + def test_transform_correctness(self, brightness, contrast, saturation, hue): + image = make_image(dtype=torch.uint8, device="cpu") + + transform = transforms.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + + with freeze_rng_state(): + torch.manual_seed(0) + actual = transform(image) + + torch.manual_seed(0) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestRgbToGrayscale: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.rgb_to_grayscale_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_functional(self, make_input): + check_functional(F.rgb_to_grayscale, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rgb_to_grayscale_image, torch.Tensor), + (F._rgb_to_grayscale_image_pil, PIL.Image.Image), + (F.rgb_to_grayscale_image, tv_tensors.Image), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rgb_to_grayscale, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("transform", [transforms.Grayscale(), transforms.RandomGrayscale(p=1)]) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def 
test_transform(self, transform, make_input): + check_transform(transform, make_input()) + + @pytest.mark.parametrize("num_output_channels", [1, 3]) + @pytest.mark.parametrize("fn", [F.rgb_to_grayscale, transform_cls_to_functional(transforms.Grayscale)]) + def test_image_correctness(self, num_output_channels, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, num_output_channels=num_output_channels) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_output_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + @pytest.mark.parametrize("num_input_channels", [1, 3]) + def test_random_transform_correctness(self, num_input_channels): + image = make_image( + color_space={ + 1: "GRAY", + 3: "RGB", + }[num_input_channels], + dtype=torch.uint8, + device="cpu", + ) + + transform = transforms.RandomGrayscale(p=1) + + actual = transform(image) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_input_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + +class TestRandomZoomOut: + # Tests are light because this largely relies on the already tested `pad` kernels. + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.RandomZoomOut(p=1), make_input()) + + def test_transform_error(self): + for side_range in [None, 1, [1, 2, 3]]: + with pytest.raises( + ValueError if isinstance(side_range, list) else TypeError, match="should be a sequence of length 2" + ): + transforms.RandomZoomOut(side_range=side_range) + + for side_range in [[0.5, 1.5], [2.0, 1.0]]: + with pytest.raises(ValueError, match="Invalid side range"): + transforms.RandomZoomOut(side_range=side_range) + + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_params_correctness(self, side_range, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + transform = transforms.RandomZoomOut(side_range=side_range) + + input = make_input() + height, width = F.get_size(input) + + params = transform._get_params([input]) + assert "padding" in params + + padding = params["padding"] + assert len(padding) == 4 + + assert 0 <= padding[0] <= (side_range[1] - 1) * width + assert 0 <= padding[1] <= (side_range[1] - 1) * height + assert 0 <= padding[2] <= (side_range[1] - 1) * width + assert 0 <= padding[3] <= (side_range[1] - 1) * height + + +class TestRandomPhotometricDistort: + # Tests are light because this largely relies on the already tested + # `adjust_{brightness,contrast,saturation,hue}` and `permute_channels` kernels. 
+ + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + check_transform( + transforms.RandomPhotometricDistort( + brightness=(0.3, 0.4), contrast=(0.5, 0.6), saturation=(0.7, 0.8), hue=(-0.1, 0.2), p=1 + ), + make_input(dtype=dtype, device=device), + ) + + +class TestScaleJitter: + # Tests are light because this largely relies on the already tested `resize` kernels. + + INPUT_SIZE = (17, 11) + TARGET_SIZE = (12, 13) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + check_transform(transforms.ScaleJitter(self.TARGET_SIZE), make_input(self.INPUT_SIZE, device=device)) + + def test__get_params(self): + input_size = self.INPUT_SIZE + target_size = self.TARGET_SIZE + scale_range = (0.5, 1.5) + + transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) + params = transform._get_params([make_image(input_size)]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + height, width = size + + r_min = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[0] + r_max = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[1] + + assert int(input_size[0] * r_min) <= height <= int(input_size[0] * r_max) + assert int(input_size[1] * r_min) <= width <= int(input_size[1] * r_max) + + +class TestLinearTransform: + def _make_matrix_and_vector(self, input, *, device=None): + device = device or input.device + numel = math.prod(F.get_dimensions(input)) + transformation_matrix = torch.randn((numel, numel), device=device) + mean_vector = torch.randn((numel,), device=device) + return transformation_matrix, mean_vector + + def _sample_input_adapter(self, transform, input, device): + return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} -import PIL.Image -import pytest -import torch -import torchvision.transforms.v2 as transforms + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + input = make_input(dtype=dtype, device=device) + check_transform( + transforms.LinearTransformation(*self._make_matrix_and_vector(input)), + input, + check_sample_input=self._sample_input_adapter, + ) -from common_utils import assert_equal, cpu_and_cuda -from torch.utils._pytree import tree_flatten, tree_unflatten -from torchvision import tv_tensors -from torchvision.ops.boxes import box_iou -from torchvision.transforms.functional import to_pil_image -from torchvision.transforms.v2 import functional as F -from torchvision.transforms.v2._utils 
import check_type, is_pure_tensor, query_chw -from transforms_v2_legacy_utils import ( - make_bounding_boxes, - make_detection_mask, - make_image, - make_images, - make_multiple_bounding_boxes, - make_segmentation_mask, - make_video, - make_videos, -) + def test_transform_error(self): + with pytest.raises(ValueError, match="transformation_matrix should be square"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 3), mean_vector=torch.rand(2)) + with pytest.raises(ValueError, match="mean_vector should have the same length"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(1)) -def make_vanilla_tensor_images(*args, **kwargs): - for image in make_images(*args, **kwargs): - if image.ndim > 3: - continue - yield image.data + for matrix_dtype, vector_dtype in [(torch.float32, torch.float64), (torch.float64, torch.float32)]: + with pytest.raises(ValueError, match="Input tensors should have the same dtype"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, dtype=matrix_dtype), + mean_vector=torch.rand(2, dtype=vector_dtype), + ) + + image = make_image() + transform = transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(2)) + with pytest.raises(ValueError, match="Input tensor and transformation matrix have incompatible shape"): + transform(image) + + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(image)) + with pytest.raises(TypeError, match="does not support PIL images"): + transform(F.to_pil_image(image)) + + @needs_cuda + def test_transform_error_cuda(self): + for matrix_device, vector_device in [("cuda", "cpu"), ("cpu", "cuda")]: + with pytest.raises(ValueError, match="Input tensors should be on the same device"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, device=matrix_device), + mean_vector=torch.rand(2, device=vector_device), + ) + + for input_device, param_device in [("cuda", "cpu"), ("cpu", "cuda")]: + input = make_image(device=input_device) + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(input, device=param_device)) + with pytest.raises( + ValueError, match="Input tensor should be on the same device as transformation matrix and mean vector" + ): + transform(input) -def make_pil_images(*args, **kwargs): - for image in make_vanilla_tensor_images(*args, **kwargs): - yield to_pil_image(image) +def make_image_numpy(*args, **kwargs): + image = make_image_tensor(*args, **kwargs) + return image.permute((1, 2, 0)).numpy() -def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_boxes in make_multiple_bounding_boxes(*args, **kwargs): - yield bounding_boxes.data +class TestToImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + @pytest.mark.parametrize("fn", [F.to_image, transform_cls_to_functional(transforms.ToImage)]) + def test_functional_and_transform(self, make_input, fn): + input = make_input() + output = fn(input) + assert isinstance(output, tv_tensors.Image) -def parametrize(transforms_with_inputs): - return pytest.mark.parametrize( - ("transform", "input"), - [ - pytest.param( - transform, - input, - id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", - ) - for transform, inputs in transforms_with_inputs - for idx, input in enumerate(inputs) - ], - ) + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert 
F.get_size(output) == input_size + if isinstance(input, torch.Tensor): + assert output.data_ptr() == input.data_ptr() -def auto_augment_adapter(transform, input, device): - adapted_input = {} - image_or_video_found = False - for key, value in input.items(): - if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)): - # AA transforms don't support bounding boxes or masks - continue - elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): - if image_or_video_found: - # AA transforms only support a single image or video - continue - image_or_video_found = True - adapted_input[key] = value - return adapted_input + def test_functional_error(self): + with pytest.raises(TypeError, match="Input can either be a pure Tensor, a numpy array, or a PIL image"): + F.to_image(object()) -def linear_transformation_adapter(transform, input, device): - flat_inputs = list(input.values()) - c, h, w = query_chw( - [ - item - for item, needs_transform in zip(flat_inputs, transforms.Transform()._needs_transform_list(flat_inputs)) - if needs_transform - ] - ) - num_elements = c * h * w - transform.transformation_matrix = torch.randn((num_elements, num_elements), device=device) - transform.mean_vector = torch.randn((num_elements,), device=device) - return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} +class TestToPILImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_numpy]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.to_pil_image, transform_cls_to_functional(transforms.ToPILImage)]) + def test_functional_and_transform(self, make_input, color_space, fn): + input = make_input(color_space=color_space) + output = fn(input) + assert isinstance(output, PIL.Image.Image) -def normalize_adapter(transform, input, device): - adapted_input = {} - for key, value in input.items(): - if isinstance(value, PIL.Image.Image): - # normalize doesn't support PIL images - continue - elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor)): - # normalize doesn't support integer images - value = F.to_dtype(value, torch.float32, scale=True) - adapted_input[key] = value - return adapted_input + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be Tensor or ndarray"): + F.to_pil_image(object()) -class TestSmoke: - @pytest.mark.parametrize( - ("transform", "adapter"), - [ - (transforms.RandomErasing(p=1.0), None), - (transforms.AugMix(), auto_augment_adapter), - (transforms.AutoAugment(), auto_augment_adapter), - (transforms.RandAugment(), auto_augment_adapter), - (transforms.TrivialAugmentWide(), auto_augment_adapter), - (transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None), - (transforms.Grayscale(), None), - (transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None), - (transforms.RandomAutocontrast(p=1.0), None), - (transforms.RandomEqualize(p=1.0), None), - (transforms.RandomGrayscale(p=1.0), None), - (transforms.RandomInvert(p=1.0), None), - (transforms.RandomChannelPermutation(), None), - (transforms.RandomPhotometricDistort(p=1.0), None), - (transforms.RandomPosterize(bits=4, p=1.0), None), - (transforms.RandomSolarize(threshold=0.5, p=1.0), None), - (transforms.CenterCrop([16, 16]), None), - 
(transforms.ElasticTransform(sigma=1.0), None), - (transforms.Pad(4), None), - (transforms.RandomAffine(degrees=30.0), None), - (transforms.RandomCrop([16, 16], pad_if_needed=True), None), - (transforms.RandomHorizontalFlip(p=1.0), None), - (transforms.RandomPerspective(p=1.0), None), - (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None), - (transforms.RandomResizedCrop([16, 16], antialias=True), None), - (transforms.RandomRotation(degrees=30), None), - (transforms.RandomShortestSize(min_size=10, antialias=True), None), - (transforms.RandomVerticalFlip(p=1.0), None), - (transforms.RandomZoomOut(p=1.0), None), - (transforms.Resize([16, 16], antialias=True), None), - (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), - (transforms.ClampBoundingBoxes(), None), - (transforms.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.CXCYWH), None), - (transforms.ConvertImageDtype(), None), - (transforms.GaussianBlur(kernel_size=3), None), - ( - transforms.LinearTransformation( - # These are just dummy values that will be filled by the adapter. We can't define them upfront, - # because for we neither know the spatial size nor the device at this point - transformation_matrix=torch.empty((1, 1)), - mean_vector=torch.empty((1,)), - ), - linear_transformation_adapter, - ), - (transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter), - (transforms.ToDtype(torch.float64), None), - (transforms.UniformTemporalSubsample(num_samples=2), None), - ], - ids=lambda transform: type(transform).__name__, - ) - @pytest.mark.parametrize("container_type", [dict, list, tuple]) - @pytest.mark.parametrize( - "image_or_video", - [ - make_image(), - make_video(), - next(make_pil_images(color_spaces=["RGB"])), - next(make_vanilla_tensor_images()), - ], - ) - @pytest.mark.parametrize("de_serialize", [lambda t: t, lambda t: pickle.loads(pickle.dumps(t))]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_common(self, transform, adapter, container_type, image_or_video, de_serialize, device): - transform = de_serialize(transform) + for ndim in [1, 4]: + with pytest.raises(ValueError, match="pic should be 2/3 dimensional"): + F.to_pil_image(torch.empty(*[1] * ndim)) - canvas_size = F.get_size(image_or_video) - input = dict( - image_or_video=image_or_video, - image_tv_tensor=make_image(size=canvas_size), - video_tv_tensor=make_video(size=canvas_size), - image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), - bounding_boxes_xyxy=make_bounding_boxes( - format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) - ), - bounding_boxes_xywh=make_bounding_boxes( - format=tv_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) - ), - bounding_boxes_cxcywh=make_bounding_boxes( - format=tv_tensors.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) - ), - bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( - [ - [0, 0, 0, 0], # no height or width - [0, 0, 0, 1], # no height - [0, 0, 1, 0], # no width - [2, 0, 1, 1], # x1 > x2, y1 < y2 - [0, 2, 1, 1], # x1 < x2, y1 > y2 - [2, 2, 1, 1], # x1 > x2, y1 > y2 - ], - format=tv_tensors.BoundingBoxFormat.XYXY, - canvas_size=canvas_size, - ), - bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( - [ - [0, 0, 0, 0], # no height or width - [0, 0, 0, 1], # no height - [0, 0, 1, 0], # no width - [0, 0, 1, -1], # negative height - [0, 0, -1, 1], # negative width - [0, 0, -1, -1], # negative height and width - ], 
- format=tv_tensors.BoundingBoxFormat.XYWH, - canvas_size=canvas_size, - ), - bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( - [ - [0, 0, 0, 0], # no height or width - [0, 0, 0, 1], # no height - [0, 0, 1, 0], # no width - [0, 0, 1, -1], # negative height - [0, 0, -1, 1], # negative width - [0, 0, -1, -1], # negative height and width - ], - format=tv_tensors.BoundingBoxFormat.CXCYWH, - canvas_size=canvas_size, - ), - detection_mask=make_detection_mask(size=canvas_size), - segmentation_mask=make_segmentation_mask(size=canvas_size), - int=0, - float=0.0, - bool=True, - none=None, - str="str", - path=pathlib.Path.cwd(), - object=object(), - tensor=torch.empty(5), - array=np.empty(5), - ) - if adapter is not None: - input = adapter(transform, input, device) - - if container_type in {tuple, list}: - input = container_type(input.values()) + with pytest.raises(ValueError, match="pic should not have > 4 channels"): + num_channels = 5 + F.to_pil_image(torch.empty(num_channels, 1, 1)) - input_flat, input_spec = tree_flatten(input) - input_flat = [item.to(device) if isinstance(item, torch.Tensor) else item for item in input_flat] - input = tree_unflatten(input_flat, input_spec) - torch.manual_seed(0) +class TestToTensor: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + def test_smoke(self, make_input): + with pytest.warns(UserWarning, match="deprecated and will be removed"): + transform = transforms.ToTensor() + + input = make_input() output = transform(input) - output_flat, output_spec = tree_flatten(output) - assert output_spec == input_spec + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size - for output_item, input_item, should_be_transformed in zip( - output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) - ): - if should_be_transformed: - assert type(output_item) is type(input_item) - else: - assert output_item is input_item - if isinstance(input_item, tv_tensors.BoundingBoxes) and not isinstance( - transform, transforms.ConvertBoundingBoxFormat - ): - assert output_item.format == input_item.format - - # Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future - # transform that does this), back into a valid one. 
- # TODO: we should test that against all degenerate boxes above - for format in list(tv_tensors.BoundingBoxFormat): - sample = dict( - boxes=tv_tensors.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), - labels=torch.tensor([3]), - ) - assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) +class TestPILToTensor: + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.pil_to_tensor, transform_cls_to_functional(transforms.PILToTensor)]) + def test_functional_and_transform(self, color_space, fn): + input = make_image_pil(color_space=color_space) + output = fn(input) - @parametrize( - [ - ( - transform, - itertools.chain.from_iterable( - fn( - color_spaces=[ - "GRAY", - "RGB", - ], - dtypes=[torch.uint8], - extra_dims=[(), (4,)], - **(dict(num_frames=[3]) if fn is make_videos else dict()), - ) - for fn in [ - make_images, - make_vanilla_tensor_images, - make_pil_images, - make_videos, - ] - ), - ) - for transform in ( - transforms.RandAugment(), - transforms.TrivialAugmentWide(), - transforms.AutoAugment(), - transforms.AugMix(), - ) - ] - ) - def test_auto_augment(self, transform, input): - transform(input) + assert isinstance(output, torch.Tensor) and not isinstance(output, tv_tensors.TVTensor) + assert F.get_size(output) == F.get_size(input) - @parametrize( - [ - ( - transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), - itertools.chain.from_iterable( - fn(color_spaces=["RGB"], dtypes=[torch.float32]) - for fn in [ - make_images, - make_vanilla_tensor_images, - make_videos, - ] - ), - ), + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be PIL Image"): + F.pil_to_tensor(object()) + + +class TestLambda: + @pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + @pytest.mark.parametrize("types", [(), (torch.Tensor, np.ndarray)]) + def test_transform(self, input, types): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + transform = transforms.Lambda(was_applied_fn, *types) + output = transform(input) + + assert output is input + assert was_applied is (not types or isinstance(input, types)) + + +@pytest.mark.parametrize( + ("alias", "target"), + [ + pytest.param(alias, target, id=alias.__name__) + for alias, target in [ + (F.hflip, F.horizontal_flip), + (F.vflip, F.vertical_flip), + (F.get_image_num_channels, F.get_num_channels), + (F.to_pil_image, F.to_pil_image), + (F.elastic_transform, F.elastic), + (F.to_grayscale, F.rgb_to_grayscale), ] - ) - def test_normalize(self, transform, input): - transform(input) + ], +) +def test_alias(alias, target): + assert alias is target @pytest.mark.parametrize( - "flat_inputs", + "make_inputs", itertools.permutations( [ - next(make_vanilla_tensor_images()), - next(make_vanilla_tensor_images()), - next(make_pil_images()), - make_image(), - next(make_videos()), + make_image_tensor, + make_image_tensor, + make_image_pil, + make_image, + make_video, ], 3, ), ) -def test_pure_tensor_heuristic(flat_inputs): +def test_pure_tensor_heuristic(make_inputs): + flat_inputs = [make_input() for make_input in make_inputs] + def split_on_pure_tensor(to_split): # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: # 1. The first pure tensor. 
If none is present, this will be `None` @@ -390,170 +5325,6 @@ def was_applied(output, inpt): assert transform.was_applied(output, input) -class TestRandomZoomOut: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomZoomOut(fill="abc") - - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomZoomOut(0, side_range=0) - - with pytest.raises(ValueError, match="Invalid canvas side range"): - transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) - - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range): - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - - h, w = size = (24, 32) - image = make_image(size) - - params = transform._get_params([image]) - - assert len(params["padding"]) == 4 - assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h - assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h - - -class TestElasticTransform: - def test_assertions(self): - - with pytest.raises(TypeError, match="alpha should be a number or a sequence of numbers"): - transforms.ElasticTransform({}) - - with pytest.raises(ValueError, match="alpha is a sequence its length should be 1 or 2"): - transforms.ElasticTransform([1.0, 2.0, 3.0]) - - with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): - transforms.ElasticTransform(1.0, {}) - - with pytest.raises(ValueError, match="sigma is a sequence its length should be 1 or 2"): - transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.ElasticTransform(1.0, 2.0, fill="abc") - - def test__get_params(self): - alpha = 2.0 - sigma = 3.0 - transform = transforms.ElasticTransform(alpha, sigma) - - h, w = size = (24, 32) - image = make_image(size) - - params = transform._get_params([image]) - - displacement = params["displacement"] - assert displacement.shape == (1, h, w, 2) - assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() - assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() - - -class TestTransform: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], - ) - def test_check_transformed_types(self, inpt_type, mocker): - # This test ensures that we correctly handle which types to transform and which to bypass - t = transforms.Transform() - inpt = mocker.MagicMock(spec=inpt_type) - - if inpt_type in (np.ndarray, str, int): - output = t(inpt) - assert output is inpt - else: - with pytest.raises(NotImplementedError): - t(inpt) - - -class TestToImage: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch( - "torchvision.transforms.v2.functional.to_image", - return_value=torch.rand(1, 3, 8, 8), - ) - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImage() - transform(inpt) - if inpt_type in (tv_tensors.BoundingBoxes, tv_tensors.Image, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class 
TestToPILImage: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.v2.functional.to_pil_image") - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToPILImage() - transform(inpt) - if inpt_type in (PIL.Image.Image, tv_tensors.BoundingBoxes, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_tensor") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToTensor() - transform(inpt) - if inpt_type in (tv_tensors.Image, torch.Tensor, tv_tensors.BoundingBoxes, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class TestContainers: - @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) - def test_assertions(self, transform_cls): - with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): - transform_cls(transforms.RandomCrop(28)) - - @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) - @pytest.mark.parametrize( - "trfms", - [ - [transforms.Pad(2), transforms.RandomCrop(28)], - [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], - [transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)], - ], - ) - def test_ctor(self, transform_cls, trfms): - c = transform_cls(trfms) - inpt = torch.rand(1, 3, 32, 32) - output = c(inpt) - assert isinstance(output, torch.Tensor) - assert output.ndim == 4 - - -class TestRandomChoice: - def test_assertions(self): - with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) - - class TestRandomIoUCrop: @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) @@ -617,8 +5388,8 @@ def test__transform(self, mocker): size = (32, 24) image = make_image(size) - bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, batch_dims=(6,)) - masks = make_detection_mask(size, num_objects=6) + bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, num_boxes=6) + masks = make_detection_masks(size, num_masks=6) sample = [image, bboxes, masks] @@ -637,34 +5408,6 @@ def test__transform(self, mocker): assert isinstance(output_masks, tv_tensors.Mask) -class TestScaleJitter: - def test__get_params(self): - canvas_size = (24, 32) - target_size = (16, 12) - scale_range = (0.5, 1.5) - - transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - - sample = make_image(canvas_size) - - n_samples = 5 - for _ in range(n_samples): - - params = transform._get_params([sample]) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - height, width = size - - r_min = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[0] - r_max = min(target_size[1] / canvas_size[0], target_size[0] / 
canvas_size[1]) * scale_range[1] - - assert int(canvas_size[0] * r_min) <= height <= int(canvas_size[0] * r_max) - assert int(canvas_size[1] * r_min) <= width <= int(canvas_size[1] * r_max) - - class TestRandomShortestSize: @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) def test__get_params(self, min_size, max_size): @@ -689,39 +5432,6 @@ def test__get_params(self, min_size, max_size): assert shorter in min_size -class TestLinearTransformation: - def test_assertions(self): - with pytest.raises(ValueError, match="transformation_matrix should be square"): - transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) - - with pytest.raises(ValueError, match="mean_vector should have the same length"): - transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) - - @pytest.mark.parametrize( - "inpt", - [ - 122 * torch.ones(1, 3, 8, 8), - 122.0 * torch.ones(1, 3, 8, 8), - tv_tensors.Image(122 * torch.ones(1, 3, 8, 8)), - PIL.Image.new("RGB", (8, 8), (122, 122, 122)), - ], - ) - def test__transform(self, inpt): - - v = 121 * torch.ones(3 * 8 * 8) - m = torch.ones(3 * 8 * 8, 3 * 8 * 8) - transform = transforms.LinearTransformation(m, v) - - if isinstance(inpt, PIL.Image.Image): - with pytest.raises(TypeError, match="does not support PIL images"): - transform(inpt) - else: - output = transform(inpt) - assert isinstance(output, torch.Tensor) - assert output.unique() == 3 * 8 * 8 - assert output.dtype == inpt.dtype - - class TestRandomResize: def test__get_params(self): min_size = 3 @@ -738,70 +5448,11 @@ def test__get_params(self): assert min_size <= size < max_size -class TestUniformTemporalSubsample: - @pytest.mark.parametrize( - "inpt", - [ - torch.zeros(10, 3, 8, 8), - torch.zeros(1, 10, 3, 8, 8), - tv_tensors.Video(torch.zeros(1, 10, 3, 8, 8)), - ], - ) - def test__transform(self, inpt): - num_samples = 5 - transform = transforms.UniformTemporalSubsample(num_samples) - - output = transform(inpt) - assert type(output) is type(inpt) - assert output.shape[-4] == num_samples - assert output.dtype == inpt.dtype - - -# TODO: remove this test in 0.17 when the default of antialias changes to True -def test_antialias_warning(): - pil_img = PIL.Image.new("RGB", size=(10, 10), color=127) - tensor_img = torch.randint(0, 256, size=(3, 10, 10), dtype=torch.uint8) - tensor_video = torch.randint(0, 256, size=(2, 3, 10, 10), dtype=torch.uint8) - - match = "The default value of the antialias parameter" - with pytest.warns(UserWarning, match=match): - transforms.RandomResizedCrop((20, 20))(tensor_img) - with pytest.warns(UserWarning, match=match): - transforms.ScaleJitter((20, 20))(tensor_img) - with pytest.warns(UserWarning, match=match): - transforms.RandomShortestSize((20, 20))(tensor_img) - with pytest.warns(UserWarning, match=match): - transforms.RandomResize(10, 20)(tensor_img) - - with pytest.warns(UserWarning, match=match): - F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20)) - - with pytest.warns(UserWarning, match=match): - F.resize(tv_tensors.Video(tensor_video), (20, 20)) - with pytest.warns(UserWarning, match=match): - F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20)) - - with warnings.catch_warnings(): - warnings.simplefilter("error") - transforms.RandomResizedCrop((20, 20))(pil_img) - transforms.ScaleJitter((20, 20))(pil_img) - transforms.RandomShortestSize((20, 20))(pil_img) - transforms.RandomResize(10, 20)(pil_img) - - transforms.RandomResizedCrop((20, 20), antialias=True)(tensor_img) - transforms.ScaleJitter((20, 
20), antialias=True)(tensor_img) - transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img) - transforms.RandomResize(10, 20, antialias=True)(tensor_img) - - F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20), antialias=True) - F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20), antialias=True) - - @pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) @pytest.mark.parametrize("label_type", (torch.Tensor, int)) @pytest.mark.parametrize("dataset_return_type", (dict, tuple)) @pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) -def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): +def test_classification_preset(image_type, label_type, dataset_return_type, to_tensor): image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) if image_type is PIL.Image: @@ -970,165 +5621,128 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected -@pytest.mark.parametrize("min_size", (1, 10)) -@pytest.mark.parametrize("labels_getter", ("default", lambda inputs: inputs["labels"], None, lambda inputs: None)) -@pytest.mark.parametrize("sample_type", (tuple, dict)) -def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): - - if sample_type is tuple and not isinstance(labels_getter, str): - # The "lambda inputs: inputs["labels"]" labels_getter used in this test - # doesn't work if the input is a tuple. - return - - H, W = 256, 128 - - boxes_and_validity = [ - ([0, 1, 10, 1], False), # Y1 == Y2 - ([0, 1, 0, 20], False), # X1 == X2 - ([0, 0, min_size - 1, 10], False), # H < min_size - ([0, 0, 10, min_size - 1], False), # W < min_size - ([0, 0, 10, H + 1], False), # Y2 > H - ([0, 0, W + 1, 10], False), # X2 > W - ([-1, 1, 10, 20], False), # any < 0 - ([0, 0, -1, 20], False), # any < 0 - ([0, 0, -10, -1], False), # any < 0 - ([0, 0, min_size, 10], True), # H < min_size - ([0, 0, 10, min_size], True), # W < min_size - ([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1? 
- ([1, 1, 30, 20], True), - ([0, 0, 10, 10], True), - ([1, 1, 30, 20], True), - ] - - random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases - boxes, is_valid_mask = zip(*boxes_and_validity) - valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid] - - boxes = torch.tensor(boxes) - labels = torch.arange(boxes.shape[0]) - - boxes = tv_tensors.BoundingBoxes( - boxes, - format=tv_tensors.BoundingBoxFormat.XYXY, - canvas_size=(H, W), - ) - - masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) - whatever = torch.rand(10) - input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) - sample = { - "image": input_img, - "labels": labels, - "boxes": boxes, - "whatever": whatever, - "None": None, - "masks": masks, - } - - if sample_type is tuple: - img = sample.pop("image") - sample = (img, sample) - - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) - - if sample_type is tuple: - out_image = out[0] - out_labels = out[1]["labels"] - out_boxes = out[1]["boxes"] - out_masks = out[1]["masks"] - out_whatever = out[1]["whatever"] - else: - out_image = out["image"] - out_labels = out["labels"] - out_boxes = out["boxes"] - out_masks = out["masks"] - out_whatever = out["whatever"] - - assert out_image is input_img - assert out_whatever is whatever - - assert isinstance(out_boxes, tv_tensors.BoundingBoxes) - assert isinstance(out_masks, tv_tensors.Mask) - - if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): - assert out_labels is labels - else: - assert isinstance(out_labels, torch.Tensor) - assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] - # This works because we conveniently set labels to arange(num_boxes) - assert out_labels.tolist() == valid_indices - - -def test_sanitize_bounding_boxes_no_label(): - # Non-regression test for https://github.com/pytorch/vision/issues/7878 - - img = make_image() - boxes = make_bounding_boxes() - - with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): - transforms.SanitizeBoundingBoxes()(img, boxes) - - out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) - assert isinstance(out_img, tv_tensors.Image) - assert isinstance(out_boxes, tv_tensors.BoundingBoxes) +class TestSanitizeBoundingBoxes: + @pytest.mark.parametrize("min_size", (1, 10)) + @pytest.mark.parametrize("labels_getter", ("default", lambda inputs: inputs["labels"], None, lambda inputs: None)) + @pytest.mark.parametrize("sample_type", (tuple, dict)) + def test_transform(self, min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + + H, W = 256, 128 + + boxes_and_validity = [ + ([0, 1, 10, 1], False), # Y1 == Y2 + ([0, 1, 0, 20], False), # X1 == X2 + ([0, 0, min_size - 1, 10], False), # H < min_size + ([0, 0, 10, min_size - 1], False), # W < min_size + ([0, 0, 10, H + 1], False), # Y2 > H + ([0, 0, W + 1, 10], False), # X2 > W + ([-1, 1, 10, 20], False), # any < 0 + ([0, 0, -1, 20], False), # any < 0 + ([0, 0, -10, -1], False), # any < 0 + ([0, 0, min_size, 10], True), # H < min_size + ([0, 0, 10, min_size], True), # W < min_size + ([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1? 
+ ([1, 1, 30, 20], True), + ([0, 0, 10, 10], True), + ([1, 1, 30, 20], True), + ] + random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases + boxes, is_valid_mask = zip(*boxes_and_validity) + valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid] -def test_sanitize_bounding_boxes_errors(): + boxes = torch.tensor(boxes) + labels = torch.arange(boxes.shape[0]) - good_bbox = tv_tensors.BoundingBoxes( - [[0, 0, 10, 10]], - format=tv_tensors.BoundingBoxFormat.XYXY, - canvas_size=(20, 20), - ) + boxes = tv_tensors.BoundingBoxes( + boxes, + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(H, W), + ) - with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) - with pytest.raises(ValueError, match="labels_getter should either be 'default'"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) + sample = { + "image": input_img, + "labels": labels, + "boxes": boxes, + "whatever": whatever, + "None": None, + "masks": masks, + } - with pytest.raises(ValueError, match="Could not infer where the labels are"): - bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) - with pytest.raises(ValueError, match="must be a tensor"): - not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) - with pytest.raises(ValueError, match="Number of boxes"): - different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + assert out_image is input_img + assert out_whatever is whatever -class TestLambda: - inputs = pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + assert isinstance(out_masks, tv_tensors.Mask) - @inputs - def test_default(self, input): - was_applied = False + if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): + assert out_labels is labels + else: + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] + # This works because we conveniently set labels to arange(num_boxes) + assert out_labels.tolist() == valid_indices - def was_applied_fn(input): - nonlocal was_applied - was_applied = True - return input + def test_no_label(self): + # Non-regression test for https://github.com/pytorch/vision/issues/7878 - transform = transforms.Lambda(was_applied_fn) + img = make_image() + boxes = make_bounding_boxes() - transform(input) + with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): + transforms.SanitizeBoundingBoxes()(img, boxes) - assert 
was_applied + out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) + assert isinstance(out_img, tv_tensors.Image) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) - @inputs - def test_with_types(self, input): - was_applied = False + def test_errors(self): + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) - def was_applied_fn(input): - nonlocal was_applied - was_applied = True - return input + with pytest.raises(ValueError, match="min_size must be >= 1"): + transforms.SanitizeBoundingBoxes(min_size=0) + with pytest.raises(ValueError, match="labels_getter should either be 'default'"): + transforms.SanitizeBoundingBoxes(labels_getter=12) - types = (torch.Tensor, np.ndarray) - transform = transforms.Lambda(was_applied_fn, *types) + with pytest.raises(ValueError, match="Could not infer where the labels are"): + bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} + transforms.SanitizeBoundingBoxes()(bad_labels_key) - transform(input) + with pytest.raises(ValueError, match="must be a tensor"): + not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} + transforms.SanitizeBoundingBoxes()(not_a_tensor) - assert was_applied is isinstance(input, types) + with pytest.raises(ValueError, match="Number of boxes"): + different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} + transforms.SanitizeBoundingBoxes()(different_sizes) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py deleted file mode 100644 index efeb673059f..00000000000 --- a/test/test_transforms_v2_consistency.py +++ /dev/null @@ -1,843 +0,0 @@ -import importlib.machinery -import importlib.util -import inspect -import random -import re -from pathlib import Path - -import numpy as np -import pytest - -import torch -import torchvision.transforms.v2 as v2_transforms -from common_utils import assert_close, assert_equal, set_rng_seed -from torch import nn -from torchvision import transforms as legacy_transforms, tv_tensors -from torchvision._utils import sequence_to_str - -from torchvision.transforms import functional as legacy_F -from torchvision.transforms.v2 import functional as prototype_F -from torchvision.transforms.v2._utils import _get_fill, query_size -from torchvision.transforms.v2.functional import to_pil_image -from transforms_v2_legacy_utils import ( - ArgsKwargs, - make_bounding_boxes, - make_detection_mask, - make_image, - make_images, - make_segmentation_mask, -) - -DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=["RGB"], extra_dims=[(4,)]) - - -@pytest.fixture(autouse=True) -def fix_rng_seed(): - set_rng_seed(0) - yield - - -class NotScriptableArgsKwargs(ArgsKwargs): - """ - This class is used to mark parameters that render the transform non-scriptable. They still work in eager mode and - thus will be tested there, but will be skipped by the JIT tests. 
- """ - - pass - - -class ConsistencyConfig: - def __init__( - self, - prototype_cls, - legacy_cls, - # If no args_kwargs is passed, only the signature will be checked - args_kwargs=(), - make_images_kwargs=None, - supports_pil=True, - removed_params=(), - closeness_kwargs=None, - ): - self.prototype_cls = prototype_cls - self.legacy_cls = legacy_cls - self.args_kwargs = args_kwargs - self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS - self.supports_pil = supports_pil - self.removed_params = removed_params - self.closeness_kwargs = closeness_kwargs or dict(rtol=0, atol=0) - - -# These are here since both the prototype and legacy transform need to be constructed with the same random parameters -LINEAR_TRANSFORMATION_MEAN = torch.rand(36) -LINEAR_TRANSFORMATION_MATRIX = torch.rand([LINEAR_TRANSFORMATION_MEAN.numel()] * 2) - -CONSISTENCY_CONFIGS = [ - ConsistencyConfig( - v2_transforms.Normalize, - legacy_transforms.Normalize, - [ - ArgsKwargs(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - ], - supports_pil=False, - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.float]), - ), - ConsistencyConfig( - v2_transforms.FiveCrop, - legacy_transforms.FiveCrop, - [ - ArgsKwargs(18), - ArgsKwargs((18, 13)), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), - ), - ConsistencyConfig( - v2_transforms.TenCrop, - legacy_transforms.TenCrop, - [ - ArgsKwargs(18), - ArgsKwargs((18, 13)), - ArgsKwargs(18, vertical_flip=True), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), - ), - *[ - ConsistencyConfig( - v2_transforms.LinearTransformation, - legacy_transforms.LinearTransformation, - [ - ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX.to(matrix_dtype), LINEAR_TRANSFORMATION_MEAN.to(matrix_dtype)), - ], - # Make sure that the product of the height, width and number of channels matches the number of elements in - # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. - make_images_kwargs=dict( - DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=["RGB"], dtypes=[image_dtype] - ), - supports_pil=False, - ) - for matrix_dtype, image_dtype in [ - (torch.float32, torch.float32), - (torch.float64, torch.float64), - (torch.float32, torch.uint8), - (torch.float64, torch.float32), - (torch.float32, torch.float64), - ] - ], - ConsistencyConfig( - v2_transforms.Grayscale, - legacy_transforms.Grayscale, - [ - ArgsKwargs(num_output_channels=1), - ArgsKwargs(num_output_channels=3), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), - # Use default tolerances of `torch.testing.assert_close` - closeness_kwargs=dict(rtol=None, atol=None), - ), - ConsistencyConfig( - v2_transforms.ToPILImage, - legacy_transforms.ToPILImage, - [NotScriptableArgsKwargs()], - make_images_kwargs=dict( - color_spaces=[ - "GRAY", - "GRAY_ALPHA", - "RGB", - "RGBA", - ], - extra_dims=[()], - ), - supports_pil=False, - ), - ConsistencyConfig( - v2_transforms.Lambda, - legacy_transforms.Lambda, - [ - NotScriptableArgsKwargs(lambda image: image / 2), - ], - # Technically, this also supports PIL, but it is overkill to write a function here that supports tensor and PIL - # images given that the transform does nothing but call it anyway. 
- supports_pil=False, - ), - ConsistencyConfig( - v2_transforms.RandomEqualize, - legacy_transforms.RandomEqualize, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), - ), - ConsistencyConfig( - v2_transforms.RandomInvert, - legacy_transforms.RandomInvert, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - ), - ConsistencyConfig( - v2_transforms.RandomPosterize, - legacy_transforms.RandomPosterize, - [ - ArgsKwargs(p=0, bits=5), - ArgsKwargs(p=1, bits=1), - ArgsKwargs(p=1, bits=3), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), - ), - ConsistencyConfig( - v2_transforms.RandomSolarize, - legacy_transforms.RandomSolarize, - [ - ArgsKwargs(p=0, threshold=0.5), - ArgsKwargs(p=1, threshold=0.3), - ArgsKwargs(p=1, threshold=0.99), - ], - ), - *[ - ConsistencyConfig( - v2_transforms.RandomAutocontrast, - legacy_transforms.RandomAutocontrast, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[dt]), - closeness_kwargs=ckw, - ) - for dt, ckw in [(torch.uint8, dict(atol=1, rtol=0)), (torch.float32, dict(rtol=None, atol=None))] - ], - ConsistencyConfig( - v2_transforms.RandomAdjustSharpness, - legacy_transforms.RandomAdjustSharpness, - [ - ArgsKwargs(p=0, sharpness_factor=0.5), - ArgsKwargs(p=1, sharpness_factor=0.2), - ArgsKwargs(p=1, sharpness_factor=0.99), - ], - closeness_kwargs={"atol": 1e-6, "rtol": 1e-6}, - ), - ConsistencyConfig( - v2_transforms.RandomGrayscale, - legacy_transforms.RandomGrayscale, - [ - ArgsKwargs(p=0), - ArgsKwargs(p=1), - ], - make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), - # Use default tolerances of `torch.testing.assert_close` - closeness_kwargs=dict(rtol=None, atol=None), - ), - ConsistencyConfig( - v2_transforms.PILToTensor, - legacy_transforms.PILToTensor, - ), - ConsistencyConfig( - v2_transforms.ToTensor, - legacy_transforms.ToTensor, - ), - ConsistencyConfig( - v2_transforms.Compose, - legacy_transforms.Compose, - ), - ConsistencyConfig( - v2_transforms.RandomApply, - legacy_transforms.RandomApply, - ), - ConsistencyConfig( - v2_transforms.RandomChoice, - legacy_transforms.RandomChoice, - ), - ConsistencyConfig( - v2_transforms.RandomOrder, - legacy_transforms.RandomOrder, - ), - ConsistencyConfig( - v2_transforms.AugMix, - legacy_transforms.AugMix, - ), - ConsistencyConfig( - v2_transforms.AutoAugment, - legacy_transforms.AutoAugment, - ), - ConsistencyConfig( - v2_transforms.RandAugment, - legacy_transforms.RandAugment, - ), - ConsistencyConfig( - v2_transforms.TrivialAugmentWide, - legacy_transforms.TrivialAugmentWide, - ), -] - - -@pytest.mark.parametrize("config", CONSISTENCY_CONFIGS, ids=lambda config: config.legacy_cls.__name__) -def test_signature_consistency(config): - legacy_params = dict(inspect.signature(config.legacy_cls).parameters) - prototype_params = dict(inspect.signature(config.prototype_cls).parameters) - - for param in config.removed_params: - legacy_params.pop(param, None) - - missing = legacy_params.keys() - prototype_params.keys() - if missing: - raise AssertionError( - f"The prototype transform does not support the parameters " - f"{sequence_to_str(sorted(missing), separate_last='and ')}, but the legacy transform does. " - f"If that is intentional, e.g. pending deprecation, please add the parameters to the `removed_params` on " - f"the `ConsistencyConfig`." 
- ) - - extra = prototype_params.keys() - legacy_params.keys() - extra_without_default = { - param - for param in extra - if prototype_params[param].default is inspect.Parameter.empty - and prototype_params[param].kind not in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD} - } - if extra_without_default: - raise AssertionError( - f"The prototype transform requires the parameters " - f"{sequence_to_str(sorted(extra_without_default), separate_last='and ')}, but the legacy transform does " - f"not. Please add a default value." - ) - - legacy_signature = list(legacy_params.keys()) - # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature - # to the same number of parameters as the legacy one - prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] - - assert prototype_signature == legacy_signature - - -def check_call_consistency( - prototype_transform, legacy_transform, images=None, supports_pil=True, closeness_kwargs=None -): - if images is None: - images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) - - closeness_kwargs = closeness_kwargs or dict() - - for image in images: - image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" - - image_tensor = torch.Tensor(image) - try: - torch.manual_seed(0) - output_legacy_tensor = legacy_transform(image_tensor) - except Exception as exc: - raise pytest.UsageError( - f"Transforming a tensor image {image_repr} failed in the legacy transform with the " - f"error above. This means that you need to specify the parameters passed to `make_images` through the " - "`make_images_kwargs` of the `ConsistencyConfig`." - ) from exc - - try: - torch.manual_seed(0) - output_prototype_tensor = prototype_transform(image_tensor) - except Exception as exc: - raise AssertionError( - f"Transforming a tensor image with shape {image_repr} failed in the prototype transform with " - f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`is_pure_tensor` path in `_transform`." - ) from exc - - assert_close( - output_prototype_tensor, - output_legacy_tensor, - msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", - **closeness_kwargs, - ) - - try: - torch.manual_seed(0) - output_prototype_image = prototype_transform(image) - except Exception as exc: - raise AssertionError( - f"Transforming a image tv_tensor with shape {image_repr} failed in the prototype transform with " - f"the error above. This means there is a consistency bug either in `_get_params` or in the " - f"`tv_tensors.Image` path in `_transform`." - ) from exc - - assert_close( - output_prototype_image, - output_prototype_tensor, - msg=lambda msg: f"Output for tv_tensor and tensor images is not equal: \n\n{msg}", - **closeness_kwargs, - ) - - if image.ndim == 3 and supports_pil: - image_pil = to_pil_image(image) - - try: - torch.manual_seed(0) - output_legacy_pil = legacy_transform(image_pil) - except Exception as exc: - raise pytest.UsageError( - f"Transforming a PIL image with shape {image_repr} failed in the legacy transform with the " - f"error above. If this transform does not support PIL images, set `supports_pil=False` on the " - "`ConsistencyConfig`. " - ) from exc - - try: - torch.manual_seed(0) - output_prototype_pil = prototype_transform(image_pil) - except Exception as exc: - raise AssertionError( - f"Transforming a PIL image with shape {image_repr} failed in the prototype transform with " - f"the error above. 
This means there is a consistency bug either in `_get_params` or in the " - f"`PIL.Image.Image` path in `_transform`." - ) from exc - - assert_close( - output_prototype_pil, - output_legacy_pil, - msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", - **closeness_kwargs, - ) - - -@pytest.mark.parametrize( - ("config", "args_kwargs"), - [ - pytest.param( - config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" - ) - for config in CONSISTENCY_CONFIGS - for idx, args_kwargs in enumerate(config.args_kwargs) - ], -) -@pytest.mark.filterwarnings("ignore") -def test_call_consistency(config, args_kwargs): - args, kwargs = args_kwargs - - try: - legacy_transform = config.legacy_cls(*args, **kwargs) - except Exception as exc: - raise pytest.UsageError( - f"Initializing the legacy transform failed with the error above. " - f"Please correct the `ArgsKwargs({args_kwargs})` in the `ConsistencyConfig`." - ) from exc - - try: - prototype_transform = config.prototype_cls(*args, **kwargs) - except Exception as exc: - raise AssertionError( - "Initializing the prototype transform failed with the error above. " - "This means there is a consistency bug in the constructor." - ) from exc - - check_call_consistency( - prototype_transform, - legacy_transform, - images=make_images(**config.make_images_kwargs), - supports_pil=config.supports_pil, - closeness_kwargs=config.closeness_kwargs, - ) - - -@pytest.mark.parametrize( - ("config", "args_kwargs"), - [ - pytest.param( - config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" - ) - for config in CONSISTENCY_CONFIGS - for idx, args_kwargs in enumerate(config.args_kwargs) - if not isinstance(args_kwargs, NotScriptableArgsKwargs) - ], -) -def test_jit_consistency(config, args_kwargs): - args, kwargs = args_kwargs - - prototype_transform_eager = config.prototype_cls(*args, **kwargs) - legacy_transform_eager = config.legacy_cls(*args, **kwargs) - - legacy_transform_scripted = torch.jit.script(legacy_transform_eager) - prototype_transform_scripted = torch.jit.script(prototype_transform_eager) - - for image in make_images(**config.make_images_kwargs): - image = image.as_subclass(torch.Tensor) - - torch.manual_seed(0) - output_legacy_scripted = legacy_transform_scripted(image) - - torch.manual_seed(0) - output_prototype_scripted = prototype_transform_scripted(image) - - assert_close(output_prototype_scripted, output_legacy_scripted, **config.closeness_kwargs) - - -class TestContainerTransforms: - """ - Since we are testing containers here, we also need some transforms to wrap. Thus, testing a container transform for - consistency automatically tests the wrapped transforms consistency. - - Instead of complicated mocking or creating custom transforms just for these tests, here we use deterministic ones - that were already tested for consistency above. 
- """ - - def test_compose(self): - prototype_transform = v2_transforms.Compose( - [ - v2_transforms.Resize(256), - v2_transforms.CenterCrop(224), - ] - ) - legacy_transform = legacy_transforms.Compose( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ] - ) - - # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes - check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) - - @pytest.mark.parametrize("p", [0, 0.1, 0.5, 0.9, 1]) - @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) - def test_random_apply(self, p, sequence_type): - prototype_transform = v2_transforms.RandomApply( - sequence_type( - [ - v2_transforms.Resize(256), - v2_transforms.CenterCrop(224), - ] - ), - p=p, - ) - legacy_transform = legacy_transforms.RandomApply( - sequence_type( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ] - ), - p=p, - ) - - # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes - check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) - - if sequence_type is nn.ModuleList: - # quick and dirty test that it is jit-scriptable - scripted = torch.jit.script(prototype_transform) - scripted(torch.rand(1, 3, 300, 300)) - - # We can't test other values for `p` since the random parameter generation is different - @pytest.mark.parametrize("probabilities", [(0, 1), (1, 0)]) - def test_random_choice(self, probabilities): - prototype_transform = v2_transforms.RandomChoice( - [ - v2_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=probabilities, - ) - legacy_transform = legacy_transforms.RandomChoice( - [ - legacy_transforms.Resize(256), - legacy_transforms.CenterCrop(224), - ], - p=probabilities, - ) - - # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes - check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) - - -class TestToTensorTransforms: - def test_pil_to_tensor(self): - prototype_transform = v2_transforms.PILToTensor() - legacy_transform = legacy_transforms.PILToTensor() - - for image in make_images(extra_dims=[()]): - image_pil = to_pil_image(image) - - assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) - - def test_to_tensor(self): - with pytest.warns(UserWarning, match=re.escape("The transform `ToTensor()` is deprecated")): - prototype_transform = v2_transforms.ToTensor() - legacy_transform = legacy_transforms.ToTensor() - - for image in make_images(extra_dims=[()]): - image_pil = to_pil_image(image) - image_numpy = np.array(image_pil) - - assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) - assert_equal(prototype_transform(image_numpy), legacy_transform(image_numpy)) - - -def import_transforms_from_references(reference): - HERE = Path(__file__).parent - PROJECT_ROOT = HERE.parent - - loader = importlib.machinery.SourceFileLoader( - "transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py") - ) - spec = importlib.util.spec_from_loader("transforms", loader) - module = importlib.util.module_from_spec(spec) - loader.exec_module(module) - return module - - -det_transforms = import_transforms_from_references("detection") - - -class TestRefDetTransforms: - def make_tv_tensors(self, with_mask=True): - size = (600, 800) - num_objects = 22 - - def make_label(extra_dims, categories): - return 
torch.randint(categories, extra_dims, dtype=torch.int64) - - pil_image = to_pil_image(make_image(size=size, color_space="RGB")) - target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (pil_image, target) - - tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) - target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (tensor_image, target) - - tv_tensor_image = make_image(size=size, color_space="RGB", dtype=torch.float32) - target = { - "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), - "labels": make_label(extra_dims=(num_objects,), categories=80), - } - if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) - - yield (tv_tensor_image, target) - - @pytest.mark.parametrize( - "t_ref, t, data_kwargs", - [ - (det_transforms.RandomHorizontalFlip(p=1.0), v2_transforms.RandomHorizontalFlip(p=1.0), {}), - ( - det_transforms.RandomIoUCrop(), - v2_transforms.Compose( - [ - v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), - ] - ), - {"with_mask": False}, - ), - (det_transforms.RandomZoomOut(), v2_transforms.RandomZoomOut(), {"with_mask": False}), - (det_transforms.ScaleJitter((1024, 1024)), v2_transforms.ScaleJitter((1024, 1024), antialias=True), {}), - ( - det_transforms.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - v2_transforms.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - {}, - ), - ], - ) - def test_transform(self, t_ref, t, data_kwargs): - for dp in self.make_tv_tensors(**data_kwargs): - - # We should use prototype transform first as reference transform performs inplace target update - torch.manual_seed(12) - output = t(dp) - - torch.manual_seed(12) - expected_output = t_ref(*dp) - - assert_equal(expected_output, output) - - -seg_transforms = import_transforms_from_references("segmentation") - - -# We need this transform for two reasons: -# 1. transforms.RandomCrop uses a different scheme to pad images and masks of insufficient size than its name -# counterpart in the detection references. Thus, we cannot use it with `pad_if_needed=True` -# 2. transforms.Pad only supports a fixed padding, but the segmentation datasets don't have a fixed image size. 
-class PadIfSmaller(v2_transforms.Transform): - def __init__(self, size, fill=0): - super().__init__() - self.size = size - self.fill = v2_transforms._geometry._setup_fill_arg(fill) - - def _get_params(self, sample): - height, width = query_size(sample) - padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] - needs_padding = any(padding) - return dict(padding=padding, needs_padding=needs_padding) - - def _transform(self, inpt, params): - if not params["needs_padding"]: - return inpt - - fill = _get_fill(self.fill, type(inpt)) - return prototype_F.pad(inpt, padding=params["padding"], fill=fill) - - -class TestRefSegTransforms: - def make_tv_tensors(self, supports_pil=True, image_dtype=torch.uint8): - size = (256, 460) - num_categories = 21 - - conv_fns = [] - if supports_pil: - conv_fns.append(to_pil_image) - conv_fns.extend([torch.Tensor, lambda x: x]) - - for conv_fn in conv_fns: - tv_tensor_image = make_image(size=size, color_space="RGB", dtype=image_dtype) - tv_tensor_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) - - dp = (conv_fn(tv_tensor_image), tv_tensor_mask) - dp_ref = ( - to_pil_image(tv_tensor_image) if supports_pil else tv_tensor_image.as_subclass(torch.Tensor), - to_pil_image(tv_tensor_mask), - ) - - yield dp, dp_ref - - def set_seed(self, seed=12): - torch.manual_seed(seed) - random.seed(seed) - - def check(self, t, t_ref, data_kwargs=None): - for dp, dp_ref in self.make_tv_tensors(**data_kwargs or dict()): - - self.set_seed() - actual = actual_image, actual_mask = t(dp) - - self.set_seed() - expected_image, expected_mask = t_ref(*dp_ref) - if isinstance(actual_image, torch.Tensor) and not isinstance(expected_image, torch.Tensor): - expected_image = legacy_F.pil_to_tensor(expected_image) - expected_mask = legacy_F.pil_to_tensor(expected_mask).squeeze(0) - expected = (expected_image, expected_mask) - - assert_equal(actual, expected) - - @pytest.mark.parametrize( - ("t_ref", "t", "data_kwargs"), - [ - ( - seg_transforms.RandomHorizontalFlip(flip_prob=1.0), - v2_transforms.RandomHorizontalFlip(p=1.0), - dict(), - ), - ( - seg_transforms.RandomHorizontalFlip(flip_prob=0.0), - v2_transforms.RandomHorizontalFlip(p=0.0), - dict(), - ), - ( - seg_transforms.RandomCrop(size=480), - v2_transforms.Compose( - [ - PadIfSmaller(size=480, fill={tv_tensors.Mask: 255, "others": 0}), - v2_transforms.RandomCrop(size=480), - ] - ), - dict(), - ), - ( - seg_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - v2_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - dict(supports_pil=False, image_dtype=torch.float), - ), - ], - ) - def test_common(self, t_ref, t, data_kwargs): - self.check(t, t_ref, data_kwargs) - - -@pytest.mark.parametrize( - ("legacy_dispatcher", "name_only_params"), - [ - (legacy_F.get_dimensions, {}), - (legacy_F.get_image_size, {}), - (legacy_F.get_image_num_channels, {}), - (legacy_F.to_tensor, {}), - (legacy_F.pil_to_tensor, {}), - (legacy_F.convert_image_dtype, {}), - (legacy_F.to_pil_image, {}), - (legacy_F.normalize, {}), - (legacy_F.resize, {"interpolation"}), - (legacy_F.pad, {"padding", "fill"}), - (legacy_F.crop, {}), - (legacy_F.center_crop, {}), - (legacy_F.resized_crop, {"interpolation"}), - (legacy_F.hflip, {}), - (legacy_F.perspective, {"startpoints", "endpoints", "fill", "interpolation"}), - (legacy_F.vflip, {}), - (legacy_F.five_crop, {}), - (legacy_F.ten_crop, {}), - (legacy_F.adjust_brightness, {}), - (legacy_F.adjust_contrast, {}), - 
(legacy_F.adjust_saturation, {}), - (legacy_F.adjust_hue, {}), - (legacy_F.adjust_gamma, {}), - (legacy_F.rotate, {"center", "fill", "interpolation"}), - (legacy_F.affine, {"angle", "translate", "center", "fill", "interpolation"}), - (legacy_F.to_grayscale, {}), - (legacy_F.rgb_to_grayscale, {}), - (legacy_F.to_tensor, {}), - (legacy_F.erase, {}), - (legacy_F.gaussian_blur, {}), - (legacy_F.invert, {}), - (legacy_F.posterize, {}), - (legacy_F.solarize, {}), - (legacy_F.adjust_sharpness, {}), - (legacy_F.autocontrast, {}), - (legacy_F.equalize, {}), - (legacy_F.elastic_transform, {"fill", "interpolation"}), - ], -) -def test_dispatcher_signature_consistency(legacy_dispatcher, name_only_params): - legacy_signature = inspect.signature(legacy_dispatcher) - legacy_params = list(legacy_signature.parameters.values())[1:] - - try: - prototype_dispatcher = getattr(prototype_F, legacy_dispatcher.__name__) - except AttributeError: - raise AssertionError( - f"Legacy dispatcher `F.{legacy_dispatcher.__name__}` has no prototype equivalent" - ) from None - - prototype_signature = inspect.signature(prototype_dispatcher) - prototype_params = list(prototype_signature.parameters.values())[1:] - - # Some dispatchers got extra parameters. This makes sure they have a default argument and thus are BC. We don't - # need to check if parameters were added in the middle rather than at the end, since that will be caught by the - # regular check below. - prototype_params, new_prototype_params = ( - prototype_params[: len(legacy_params)], - prototype_params[len(legacy_params) :], - ) - for param in new_prototype_params: - assert param.default is not param.empty - - # Some annotations were changed mostly to supersets of what was there before. Plus, some legacy dispatchers had no - # annotations. 
In these cases we simply drop the annotation and default argument from the comparison - for prototype_param, legacy_param in zip(prototype_params, legacy_params): - if legacy_param.name in name_only_params: - prototype_param._annotation = prototype_param._default = inspect.Parameter.empty - legacy_param._annotation = legacy_param._default = inspect.Parameter.empty - elif legacy_param.annotation is inspect.Parameter.empty: - prototype_param._annotation = inspect.Parameter.empty - - assert prototype_params == legacy_params diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py deleted file mode 100644 index be32c9c7d4c..00000000000 --- a/test/test_transforms_v2_functional.py +++ /dev/null @@ -1,572 +0,0 @@ -import inspect -import re - -import numpy as np -import PIL.Image -import pytest -import torch - -from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed -from torch.utils._pytree import tree_map -from torchvision import tv_tensors -from torchvision.transforms.v2 import functional as F -from torchvision.transforms.v2._utils import is_pure_tensor -from transforms_v2_dispatcher_infos import DISPATCHER_INFOS -from transforms_v2_kernel_infos import KERNEL_INFOS -from transforms_v2_legacy_utils import ( - DEFAULT_SQUARE_SPATIAL_SIZE, - make_multiple_bounding_boxes, - parametrized_error_message, -) - - -KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS} -DISPATCHER_INFOS_MAP = {info.dispatcher: info for info in DISPATCHER_INFOS} - - -@cache -def script(fn): - try: - return torch.jit.script(fn) - except Exception as error: - raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error - - -# Scripting a function often triggers a warning like -# `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` -# with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore -# them. 
-ignore_jit_warning_no_profile = pytest.mark.filterwarnings( - f"ignore:{re.escape('operator() profile_node %')}:UserWarning" -) - - -def make_info_args_kwargs_params(info, *, args_kwargs_fn, test_id=None): - args_kwargs = list(args_kwargs_fn(info)) - if not args_kwargs: - raise pytest.UsageError( - f"Couldn't collect a single `ArgsKwargs` for `{info.id}`{f' in {test_id}' if test_id else ''}" - ) - idx_field_len = len(str(len(args_kwargs))) - return [ - pytest.param( - info, - args_kwargs_, - marks=info.get_marks(test_id, args_kwargs_) if test_id else [], - id=f"{info.id}-{idx:0{idx_field_len}}", - ) - for idx, args_kwargs_ in enumerate(args_kwargs) - ] - - -def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn): - def decorator(test_fn): - parts = test_fn.__qualname__.split(".") - if len(parts) == 1: - test_class_name = None - test_function_name = parts[0] - elif len(parts) == 2: - test_class_name, test_function_name = parts - else: - raise pytest.UsageError("Unable to parse the test class name and test function name from test function") - test_id = (test_class_name, test_function_name) - - argnames = ("info", "args_kwargs") - argvalues = [] - for info in infos: - argvalues.extend(make_info_args_kwargs_params(info, args_kwargs_fn=args_kwargs_fn, test_id=test_id)) - - return pytest.mark.parametrize(argnames, argvalues)(test_fn) - - return decorator - - -@pytest.fixture(autouse=True) -def fix_rng_seed(): - set_rng_seed(0) - yield - - -@pytest.fixture() -def test_id(request): - test_class_name = request.cls.__name__ if request.cls is not None else None - test_function_name = request.node.originalname - return test_class_name, test_function_name - - -class TestKernels: - sample_inputs = make_info_args_kwargs_parametrization( - KERNEL_INFOS, - args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), - ) - reference_inputs = make_info_args_kwargs_parametrization( - [info for info in KERNEL_INFOS if info.reference_fn is not None], - args_kwargs_fn=lambda info: info.reference_inputs_fn(), - ) - - @make_info_args_kwargs_parametrization( - [info for info in KERNEL_INFOS if info.logs_usage], - args_kwargs_fn=lambda info: info.sample_inputs_fn(), - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_logging(self, spy_on, info, args_kwargs, device): - spy = spy_on(torch._C._log_api_usage_once) - - (input, *other_args), kwargs = args_kwargs.load(device) - info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs) - - spy.assert_any_call(f"{info.kernel.__module__}.{info.id}") - - @ignore_jit_warning_no_profile - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_scripted_vs_eager(self, test_id, info, args_kwargs, device): - kernel_eager = info.kernel - kernel_scripted = script(kernel_eager) - - (input, *other_args), kwargs = args_kwargs.load(device) - input = input.as_subclass(torch.Tensor) - - actual = kernel_scripted(input, *other_args, **kwargs) - expected = kernel_eager(input, *other_args, **kwargs) - - assert_close( - actual, - expected, - **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), - msg=parametrized_error_message(input, other_args, **kwargs), - ) - - def _unbatch(self, batch, *, data_dims): - if isinstance(batch, torch.Tensor): - batched_tensor = batch - metadata = () - else: - batched_tensor, *metadata = batch - - if batched_tensor.ndim == data_dims: - return batch - - return [ - self._unbatch(unbatched, data_dims=data_dims) - for unbatched in ( - batched_tensor.unbind(0) if not metadata 
else [(t, *metadata) for t in batched_tensor.unbind(0)] - ) - ] - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_batched_vs_single(self, test_id, info, args_kwargs, device): - (batched_input, *other_args), kwargs = args_kwargs.load(device) - - tv_tensor_type = tv_tensors.Image if is_pure_tensor(batched_input) else type(batched_input) - # This dictionary contains the number of rightmost dimensions that contain the actual data. - # Everything to the left is considered a batch dimension. - data_dims = { - tv_tensors.Image: 3, - tv_tensors.BoundingBoxes: 1, - # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks - # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one - # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as - # common ground. - tv_tensors.Mask: 2, - tv_tensors.Video: 4, - }.get(tv_tensor_type) - if data_dims is None: - raise pytest.UsageError( - f"The number of data dimensions cannot be determined for input of type {tv_tensor_type.__name__}." - ) from None - elif batched_input.ndim <= data_dims: - pytest.skip("Input is not batched.") - elif not all(batched_input.shape[:-data_dims]): - pytest.skip("Input has a degenerate batch shape.") - - batched_input = batched_input.as_subclass(torch.Tensor) - batched_output = info.kernel(batched_input, *other_args, **kwargs) - actual = self._unbatch(batched_output, data_dims=data_dims) - - single_inputs = self._unbatch(batched_input, data_dims=data_dims) - expected = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs) - - assert_close( - actual, - expected, - **info.get_closeness_kwargs(test_id, dtype=batched_input.dtype, device=batched_input.device), - msg=parametrized_error_message(batched_input, *other_args, **kwargs), - ) - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_no_inplace(self, info, args_kwargs, device): - (input, *other_args), kwargs = args_kwargs.load(device) - input = input.as_subclass(torch.Tensor) - - if input.numel() == 0: - pytest.skip("The input has a degenerate shape.") - - input_version = input._version - info.kernel(input, *other_args, **kwargs) - - assert input._version == input_version - - @sample_inputs - @needs_cuda - def test_cuda_vs_cpu(self, test_id, info, args_kwargs): - (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") - input_cpu = input_cpu.as_subclass(torch.Tensor) - input_cuda = input_cpu.to("cuda") - - output_cpu = info.kernel(input_cpu, *other_args, **kwargs) - output_cuda = info.kernel(input_cuda, *other_args, **kwargs) - - assert_close( - output_cuda, - output_cpu, - check_device=False, - **info.get_closeness_kwargs(test_id, dtype=input_cuda.dtype, device=input_cuda.device), - msg=parametrized_error_message(input_cpu, *other_args, **kwargs), - ) - - @sample_inputs - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_dtype_and_device_consistency(self, info, args_kwargs, device): - (input, *other_args), kwargs = args_kwargs.load(device) - input = input.as_subclass(torch.Tensor) - - output = info.kernel(input, *other_args, **kwargs) - # Most kernels just return a tensor, but some also return some additional metadata - if not isinstance(output, torch.Tensor): - output, *_ = output - - assert output.dtype == input.dtype - assert output.device == input.device - - @reference_inputs - def test_against_reference(self, test_id, 
info, args_kwargs):
-        (input, *other_args), kwargs = args_kwargs.load("cpu")
-
-        actual = info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs)
-        # We intentionally don't unwrap the input of the reference function in order for it to have access to all
-        # metadata regardless of whether the kernel takes it explicitly or not
-        expected = info.reference_fn(input, *other_args, **kwargs)
-
-        assert_close(
-            actual,
-            expected,
-            **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device),
-            msg=parametrized_error_message(input, *other_args, **kwargs),
-        )
-
-    @make_info_args_kwargs_parametrization(
-        [info for info in KERNEL_INFOS if info.float32_vs_uint8],
-        args_kwargs_fn=lambda info: info.reference_inputs_fn(),
-    )
-    def test_float32_vs_uint8(self, test_id, info, args_kwargs):
-        (input, *other_args), kwargs = args_kwargs.load("cpu")
-        input = input.as_subclass(torch.Tensor)
-
-        if input.dtype != torch.uint8:
-            pytest.skip(f"Input dtype is {input.dtype}.")
-
-        adapted_other_args, adapted_kwargs = info.float32_vs_uint8(other_args, kwargs)
-
-        actual = info.kernel(
-            F.to_dtype_image(input, dtype=torch.float32, scale=True),
-            *adapted_other_args,
-            **adapted_kwargs,
-        )
-
-        expected = F.to_dtype_image(info.kernel(input, *other_args, **kwargs), dtype=torch.float32, scale=True)
-
-        assert_close(
-            actual,
-            expected,
-            **info.get_closeness_kwargs(test_id, dtype=torch.float32, device=input.device),
-            msg=parametrized_error_message(input, *other_args, **kwargs),
-        )
-
-
-@pytest.fixture
-def spy_on(mocker):
-    def make_spy(fn, *, module=None, name=None):
-        # TODO: we can probably get rid of the non-default modules and names if we eliminate aliasing
-        module = module or fn.__module__
-        name = name or fn.__name__
-        spy = mocker.patch(f"{module}.{name}", wraps=fn)
-        return spy
-
-    return make_spy
-
-
-class TestDispatchers:
-    image_sample_inputs = make_info_args_kwargs_parametrization(
-        [info for info in DISPATCHER_INFOS if tv_tensors.Image in info.kernels],
-        args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image),
-    )
-
-    @make_info_args_kwargs_parametrization(
-        DISPATCHER_INFOS,
-        args_kwargs_fn=lambda info: info.sample_inputs(),
-    )
-    @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_logging(self, spy_on, info, args_kwargs, device):
-        spy = spy_on(torch._C._log_api_usage_once)
-
-        args, kwargs = args_kwargs.load(device)
-        info.dispatcher(*args, **kwargs)
-
-        spy.assert_any_call(f"{info.dispatcher.__module__}.{info.id}")
-
-    @ignore_jit_warning_no_profile
-    @image_sample_inputs
-    @pytest.mark.parametrize("device", cpu_and_cuda())
-    def test_scripted_smoke(self, info, args_kwargs, device):
-        dispatcher = script(info.dispatcher)
-
-        (image_tv_tensor, *other_args), kwargs = args_kwargs.load(device)
-        image_pure_tensor = torch.Tensor(image_tv_tensor)
-
-        dispatcher(image_pure_tensor, *other_args, **kwargs)
-
-    # TODO: We need this until the dispatchers below also have `DispatcherInfo`'s. If they do, `test_scripted_smoke`
-    # replaces this test for them. 
- @ignore_jit_warning_no_profile - @pytest.mark.parametrize( - "dispatcher", - [ - F.get_dimensions, - F.get_image_num_channels, - F.get_image_size, - F.get_num_channels, - F.get_num_frames, - F.get_size, - F.rgb_to_grayscale, - F.uniform_temporal_subsample, - ], - ids=lambda dispatcher: dispatcher.__name__, - ) - def test_scriptable(self, dispatcher): - script(dispatcher) - - @image_sample_inputs - def test_pure_tensor_output_type(self, info, args_kwargs): - (image_tv_tensor, *other_args), kwargs = args_kwargs.load() - image_pure_tensor = image_tv_tensor.as_subclass(torch.Tensor) - - output = info.dispatcher(image_pure_tensor, *other_args, **kwargs) - - # We cannot use `isinstance` here since all tv_tensors are instances of `torch.Tensor` as well - assert type(output) is torch.Tensor - - @make_info_args_kwargs_parametrization( - [info for info in DISPATCHER_INFOS if info.pil_kernel_info is not None], - args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image), - ) - def test_pil_output_type(self, info, args_kwargs): - (image_tv_tensor, *other_args), kwargs = args_kwargs.load() - - if image_tv_tensor.ndim > 3: - pytest.skip("Input is batched") - - image_pil = F.to_pil_image(image_tv_tensor) - - output = info.dispatcher(image_pil, *other_args, **kwargs) - - assert isinstance(output, PIL.Image.Image) - - @make_info_args_kwargs_parametrization( - DISPATCHER_INFOS, - args_kwargs_fn=lambda info: info.sample_inputs(), - ) - def test_tv_tensor_output_type(self, info, args_kwargs): - (tv_tensor, *other_args), kwargs = args_kwargs.load() - - output = info.dispatcher(tv_tensor, *other_args, **kwargs) - - assert isinstance(output, type(tv_tensor)) - - if isinstance(tv_tensor, tv_tensors.BoundingBoxes) and info.dispatcher is not F.convert_bounding_box_format: - assert output.format == tv_tensor.format - - @pytest.mark.parametrize( - ("dispatcher_info", "tv_tensor_type", "kernel_info"), - [ - pytest.param( - dispatcher_info, tv_tensor_type, kernel_info, id=f"{dispatcher_info.id}-{tv_tensor_type.__name__}" - ) - for dispatcher_info in DISPATCHER_INFOS - for tv_tensor_type, kernel_info in dispatcher_info.kernel_infos.items() - ], - ) - def test_dispatcher_kernel_signatures_consistency(self, dispatcher_info, tv_tensor_type, kernel_info): - dispatcher_signature = inspect.signature(dispatcher_info.dispatcher) - dispatcher_params = list(dispatcher_signature.parameters.values())[1:] - - kernel_signature = inspect.signature(kernel_info.kernel) - kernel_params = list(kernel_signature.parameters.values())[1:] - - # We filter out metadata that is implicitly passed to the dispatcher through the input tv_tensor, but has to be - # explicitly passed to the kernel. - input_type = {v: k for k, v in dispatcher_info.kernels.items()}.get(kernel_info.kernel) - explicit_metadata = { - tv_tensors.BoundingBoxes: {"format", "canvas_size"}, - } - kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] - - dispatcher_params = iter(dispatcher_params) - for dispatcher_param, kernel_param in zip(dispatcher_params, kernel_params): - try: - # In general, the dispatcher parameters are a superset of the kernel parameters. Thus, we filter out - # dispatcher parameters that have no kernel equivalent while keeping the order intact. 
- while dispatcher_param.name != kernel_param.name: - dispatcher_param = next(dispatcher_params) - except StopIteration: - raise AssertionError( - f"Parameter `{kernel_param.name}` of kernel `{kernel_info.id}` " - f"has no corresponding parameter on the dispatcher `{dispatcher_info.id}`." - ) from None - - assert dispatcher_param == kernel_param - - @pytest.mark.parametrize("info", DISPATCHER_INFOS, ids=lambda info: info.id) - def test_unkown_type(self, info): - unkown_input = object() - (_, *other_args), kwargs = next(iter(info.sample_inputs())).load("cpu") - - with pytest.raises(TypeError, match=re.escape(str(type(unkown_input)))): - info.dispatcher(unkown_input, *other_args, **kwargs) - - @make_info_args_kwargs_parametrization( - [ - info - for info in DISPATCHER_INFOS - if tv_tensors.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_bounding_box_format - ], - args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.BoundingBoxes), - ) - def test_bounding_boxes_format_consistency(self, info, args_kwargs): - (bounding_boxes, *other_args), kwargs = args_kwargs.load() - format = bounding_boxes.format - - output = info.dispatcher(bounding_boxes, *other_args, **kwargs) - - assert output.format == format - - -@pytest.mark.parametrize( - ("alias", "target"), - [ - pytest.param(alias, target, id=alias.__name__) - for alias, target in [ - (F.hflip, F.horizontal_flip), - (F.vflip, F.vertical_flip), - (F.get_image_num_channels, F.get_num_channels), - (F.to_pil_image, F.to_pil_image), - (F.elastic_transform, F.elastic), - (F.to_grayscale, F.rgb_to_grayscale), - ] - ], -) -def test_alias(alias, target): - assert alias is target - - -@pytest.mark.parametrize("device", cpu_and_cuda()) -@pytest.mark.parametrize("num_channels", [1, 3]) -def test_normalize_image_tensor_stats(device, num_channels): - stats = pytest.importorskip("scipy.stats", reason="SciPy is not available") - - def assert_samples_from_standard_normal(t): - p_value = stats.kstest(t.flatten(), cdf="norm", args=(0, 1)).pvalue - return p_value > 1e-4 - - image = torch.rand(num_channels, DEFAULT_SQUARE_SPATIAL_SIZE, DEFAULT_SQUARE_SPATIAL_SIZE) - mean = image.mean(dim=(1, 2)).tolist() - std = image.std(dim=(1, 2)).tolist() - - assert_samples_from_standard_normal(F.normalize_image(image, mean, std)) - - -class TestClampBoundingBoxes: - @pytest.mark.parametrize( - "metadata", - [ - dict(), - dict(format=tv_tensors.BoundingBoxFormat.XYXY), - dict(canvas_size=(1, 1)), - ], - ) - def test_pure_tensor_insufficient_metadata(self, metadata): - pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) - - with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): - F.clamp_bounding_boxes(pure_tensor, **metadata) - - @pytest.mark.parametrize( - "metadata", - [ - dict(format=tv_tensors.BoundingBoxFormat.XYXY), - dict(canvas_size=(1, 1)), - dict(format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), - ], - ) - def test_tv_tensor_explicit_metadata(self, metadata): - tv_tensor = next(make_multiple_bounding_boxes()) - - with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): - F.clamp_bounding_boxes(tv_tensor, **metadata) - - -# TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in -# `transforms_v2_kernel_infos.py` - - -@pytest.mark.parametrize( - "inpt", - [ - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -def test_to_image(inpt): - output = 
F.to_image(inpt) - assert isinstance(output, torch.Tensor) - assert output.shape == (3, 32, 32) - - assert np.asarray(inpt).sum() == output.sum().item() - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), - 127 * np.ones((32, 32, 3), dtype="uint8"), - ], -) -@pytest.mark.parametrize("mode", [None, "RGB"]) -def test_to_pil_image(inpt, mode): - output = F.to_pil_image(inpt, mode=mode) - assert isinstance(output, PIL.Image.Image) - - assert np.asarray(inpt).sum() == np.asarray(output).sum() - - -def test_equalize_image_tensor_edge_cases(): - inpt = torch.zeros(3, 200, 200, dtype=torch.uint8) - output = F.equalize_image(inpt) - torch.testing.assert_close(inpt, output) - - inpt = torch.zeros(5, 3, 200, 200, dtype=torch.uint8) - inpt[..., 100:, 100:] = 1 - output = F.equalize_image(inpt) - assert output.unique().tolist() == [0, 255] - - -@pytest.mark.parametrize("device", cpu_and_cuda()) -def test_correctness_uniform_temporal_subsample(device): - video = torch.arange(10, device=device)[:, None, None, None].expand(-1, 3, 8, 8) - out_video = F.uniform_temporal_subsample(video, 5) - assert out_video.unique().tolist() == [0, 2, 4, 6, 9] - - out_video = F.uniform_temporal_subsample(video, 8) - assert out_video.unique().tolist() == [0, 1, 2, 3, 5, 6, 7, 9] diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py deleted file mode 100644 index 59d30d482e2..00000000000 --- a/test/test_transforms_v2_refactored.py +++ /dev/null @@ -1,3947 +0,0 @@ -import contextlib -import decimal -import functools -import inspect -import itertools -import math -import pickle -import re -from pathlib import Path -from unittest import mock - -import numpy as np -import PIL.Image -import pytest - -import torch - -import torchvision.ops -import torchvision.transforms.v2 as transforms -from common_utils import ( - assert_equal, - assert_no_warnings, - cache, - cpu_and_cuda, - freeze_rng_state, - ignore_jit_no_profile_information_warning, - make_bounding_boxes, - make_detection_mask, - make_image, - make_image_pil, - make_image_tensor, - make_segmentation_mask, - make_video, - make_video_tensor, - needs_cuda, - set_rng_seed, -) - -from torch import nn -from torch.testing import assert_close -from torch.utils._pytree import tree_map -from torch.utils.data import DataLoader, default_collate -from torchvision import tv_tensors - -from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.functional import pil_modes_mapping -from torchvision.transforms.v2 import functional as F -from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs -from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal - - -@pytest.fixture(autouse=True) -def fix_rng_seed(): - set_rng_seed(0) - yield - - -def _to_tolerances(maybe_tolerance_dict): - if not isinstance(maybe_tolerance_dict, dict): - return dict(rtol=None, atol=None) - - tolerances = dict(rtol=0, atol=0) - tolerances.update(maybe_tolerance_dict) - return tolerances - - -def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): - """Checks if the kernel produces closes results for inputs on GPU and CPU.""" - if input.device.type != "cuda": - return - - input_cuda = input.as_subclass(torch.Tensor) - input_cpu = input_cuda.to("cpu") - - with freeze_rng_state(): - actual = kernel(input_cuda, *args, **kwargs) - with freeze_rng_state(): - expected = kernel(input_cpu, *args, **kwargs) 
- - assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) - - -@cache -def _script(obj): - try: - return torch.jit.script(obj) - except Exception as error: - name = getattr(obj, "__name__", obj.__class__.__name__) - raise AssertionError(f"Trying to `torch.jit.script` '{name}' raised the error above.") from error - - -def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): - """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" - if input.device.type != "cpu": - return - - kernel_scripted = _script(kernel) - - input = input.as_subclass(torch.Tensor) - with ignore_jit_no_profile_information_warning(): - actual = kernel_scripted(input, *args, **kwargs) - expected = kernel(input, *args, **kwargs) - - assert_close(actual, expected, rtol=rtol, atol=atol) - - -def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): - """Checks if the kernel produces close results for batched and unbatched inputs.""" - unbatched_input = input.as_subclass(torch.Tensor) - - for batch_dims in [(2,), (2, 1)]: - repeats = [*batch_dims, *[1] * input.ndim] - - actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) - - expected = kernel(unbatched_input, *args, **kwargs) - # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata - if isinstance(expected, torch.Tensor): - expected = expected.repeat(repeats) - else: - tensor, *metadata = expected - expected = (tensor.repeat(repeats), *metadata) - - assert_close(actual, expected, rtol=rtol, atol=atol) - - for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: - degenerate_batched_input = torch.empty( - degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device - ) - - output = kernel(degenerate_batched_input, *args, **kwargs) - # Most kernels just return a tensor, but some also return some additional metadata - if not isinstance(output, torch.Tensor): - output, *_ = output - - assert output.shape[: -input.ndim] == degenerate_batch_dims - - -def check_kernel( - kernel, - input, - *args, - check_cuda_vs_cpu=True, - check_scripted_vs_eager=True, - check_batched_vs_unbatched=True, - **kwargs, -): - initial_input_version = input._version - - output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) - # Most kernels just return a tensor, but some also return some additional metadata - if not isinstance(output, torch.Tensor): - output, *_ = output - - # check that no inplace operation happened - assert input._version == initial_input_version - - if kernel not in {F.to_dtype_image, F.to_dtype_video}: - assert output.dtype == input.dtype - assert output.device == input.device - - if check_cuda_vs_cpu: - _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) - - if check_scripted_vs_eager: - _check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) - - if check_batched_vs_unbatched: - _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) - - -def _check_functional_scripted_smoke(functional, input, *args, **kwargs): - """Checks if the functional can be scripted and the scripted version can be called without error.""" - if not isinstance(input, tv_tensors.Image): - return - - functional_scripted = _script(functional) - with ignore_jit_no_profile_information_warning(): - functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) - - -def 
check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): - unknown_input = object() - with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): - functional(unknown_input, *args, **kwargs) - - with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: - output = functional(input, *args, **kwargs) - - spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") - - assert isinstance(output, type(input)) - - if isinstance(input, tv_tensors.BoundingBoxes) and functional is not F.convert_bounding_box_format: - assert output.format == input.format - - if check_scripted_smoke: - _check_functional_scripted_smoke(functional, input, *args, **kwargs) - - -def check_functional_kernel_signature_match(functional, *, kernel, input_type): - """Checks if the signature of the functional matches the kernel signature.""" - functional_params = list(inspect.signature(functional).parameters.values())[1:] - kernel_params = list(inspect.signature(kernel).parameters.values())[1:] - - if issubclass(input_type, tv_tensors.TVTensor): - # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be - # explicitly passed to the kernel. - explicit_metadata = { - tv_tensors.BoundingBoxes: {"format", "canvas_size"}, - } - kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] - - functional_params = iter(functional_params) - for functional_param, kernel_param in zip(functional_params, kernel_params): - try: - # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out - # functional parameters that have no kernel equivalent while keeping the order intact. - while functional_param.name != kernel_param.name: - functional_param = next(functional_params) - except StopIteration: - raise AssertionError( - f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " - f"has no corresponding parameter on the functional `{functional.__name__}`." - ) from None - - if issubclass(input_type, PIL.Image.Image): - # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check - # them in the first place. 
- functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty - - assert functional_param == kernel_param - - -def _check_transform_v1_compatibility(transform, input, *, rtol, atol): - """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static - ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version - can be called without error.""" - if not (type(input) is torch.Tensor or isinstance(input, PIL.Image.Image)): - return - - v1_transform_cls = transform._v1_transform_cls - if v1_transform_cls is None: - return - - if hasattr(v1_transform_cls, "get_params"): - assert type(transform).get_params is v1_transform_cls.get_params - - v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) - - with freeze_rng_state(): - output_v2 = transform(input) - - with freeze_rng_state(): - output_v1 = v1_transform(input) - - assert_close(F.to_image(output_v2), F.to_image(output_v1), rtol=rtol, atol=atol) - - if isinstance(input, PIL.Image.Image): - return - - _script(v1_transform)(input) - - -def check_transform(transform, input, check_v1_compatibility=True): - pickle.loads(pickle.dumps(transform)) - - output = transform(input) - assert isinstance(output, type(input)) - - if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat): - assert output.format == input.format - - if check_v1_compatibility: - _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) - - -def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): - def wrapper(input, *args, **kwargs): - transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) - return transform(input) - - wrapper.__name__ = transform_cls.__name__ - - return wrapper - - -def param_value_parametrization(**kwargs): - """Helper function to turn - - @pytest.mark.parametrize( - ("param", "value"), - ("a", 1), - ("a", 2), - ("a", 3), - ("b", -1.0) - ("b", 1.0) - ) - - into - - @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) - """ - return pytest.mark.parametrize( - ("param", "value"), - [(param, value) for param, values in kwargs.items() for value in values], - ) - - -def adapt_fill(value, *, dtype): - """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" - if value is None: - return value - - max_value = get_max_value(dtype) - value_type = float if dtype.is_floating_point else int - - if isinstance(value, (int, float)): - return value_type(value * max_value) - elif isinstance(value, (list, tuple)): - return type(value)(value_type(v * max_value) for v in value) - else: - raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") - - -EXHAUSTIVE_TYPE_FILLS = [ - None, - 1, - 0.5, - [1], - [0.2], - (0,), - (0.7,), - [1, 0, 1], - [0.1, 0.2, 0.3], - (0, 1, 0), - (0.9, 0.234, 0.314), -] -CORRECTNESS_FILLS = [ - v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) -] - - -# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well -INTERPOLATION_MODES = [ - transforms.InterpolationMode.NEAREST, - transforms.InterpolationMode.NEAREST_EXACT, - transforms.InterpolationMode.BILINEAR, - transforms.InterpolationMode.BICUBIC, -] - - -@contextlib.contextmanager -def assert_warns_antialias_default_value(): - with 
pytest.warns(UserWarning, match="The default value of the antialias parameter of all the resizing transforms"): - yield - - -def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): - format = bounding_boxes.format - canvas_size = new_canvas_size or bounding_boxes.canvas_size - - def affine_bounding_boxes(bounding_boxes): - dtype = bounding_boxes.dtype - device = bounding_boxes.device - - # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 - input_xyxy = F.convert_bounding_box_format( - bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), - old_format=format, - new_format=tv_tensors.BoundingBoxFormat.XYXY, - inplace=True, - ) - x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() - - points = np.array( - [ - [x1, y1, 1.0], - [x2, y1, 1.0], - [x1, y2, 1.0], - [x2, y2, 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) - - output_xyxy = torch.Tensor( - [ - float(np.min(transformed_points[:, 0])), - float(np.min(transformed_points[:, 1])), - float(np.max(transformed_points[:, 0])), - float(np.max(transformed_points[:, 1])), - ] - ) - - output = F.convert_bounding_box_format( - output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format - ) - - if clamp: - # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - output = F.clamp_bounding_boxes( - output, - format=format, - canvas_size=canvas_size, - ) - else: - # We leave the bounding box as float64 so the caller gets the full precision to perform any additional - # operation - dtype = output.dtype - - return output.to(dtype=dtype, device=device) - - return tv_tensors.BoundingBoxes( - torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( - bounding_boxes.shape - ), - format=format, - canvas_size=canvas_size, - ) - - -# turns all warnings into errors for this module -pytestmark = pytest.mark.filterwarnings("error") - - -class TestResize: - INPUT_SIZE = (17, 11) - OUTPUT_SIZES = [17, [17], (17,), [12, 13], (12, 13)] - - def _make_max_size_kwarg(self, *, use_max_size, size): - if use_max_size: - if not (isinstance(size, int) or len(size) == 1): - # This would result in an `ValueError` - return None - - max_size = (size if isinstance(size, int) else size[0]) + 1 - else: - max_size = None - - return dict(max_size=max_size) - - def _compute_output_size(self, *, input_size, size, max_size): - if not (isinstance(size, int) or len(size) == 1): - return tuple(size) - - if not isinstance(size, int): - size = size[0] - - old_height, old_width = input_size - ratio = old_width / old_height - if ratio > 1: - new_height = size - new_width = int(ratio * new_height) - else: - new_width = size - new_height = int(new_width / ratio) - - if max_size is not None and max(new_height, new_width) > max_size: - # Need to recompute the aspect ratio, since it might have changed due to rounding - ratio = new_width / new_height - if ratio > 1: - new_width = max_size - new_height = int(new_width / ratio) - else: - new_height = max_size - new_width = int(new_height * ratio) - - return new_height, new_width - - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) - @pytest.mark.parametrize("use_max_size", [True, False]) - @pytest.mark.parametrize("antialias", [True, False]) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", 
cpu_and_cuda()) - def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): - if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): - return - - # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. - # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. - atol = 30 if transforms.InterpolationMode.BICUBIC and dtype is torch.uint8 else 1 - check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol) - - check_kernel( - F.resize_image, - make_image(self.INPUT_SIZE, dtype=dtype, device=device), - size=size, - interpolation=interpolation, - **max_size_kwarg, - antialias=antialias, - check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances, - check_scripted_vs_eager=not isinstance(size, int), - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize("use_max_size", [True, False]) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): - if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): - return - - bounding_boxes = make_bounding_boxes( - format=format, - canvas_size=self.INPUT_SIZE, - dtype=dtype, - device=device, - ) - check_kernel( - F.resize_bounding_boxes, - bounding_boxes, - canvas_size=bounding_boxes.canvas_size, - size=size, - **max_size_kwarg, - check_scripted_vs_eager=not isinstance(size, int), - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1]) - - def test_kernel_video(self): - check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True) - - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, size, make_input): - check_functional( - F.resize, - make_input(self.INPUT_SIZE), - size=size, - antialias=True, - check_scripted_smoke=not isinstance(size, int), - ) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.resize_image, torch.Tensor), - (F._resize_image_pil, PIL.Image.Image), - (F.resize_image, tv_tensors.Image), - (F.resize_bounding_boxes, tv_tensors.BoundingBoxes), - (F.resize_mask, tv_tensors.Mask), - (F.resize_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize( - "make_input", - [ - make_image_tensor, - make_image_pil, - make_image, - make_bounding_boxes, - make_segmentation_mask, - make_detection_mask, - make_video, - ], - ) - def test_transform(self, size, device, make_input): - check_transform( - transforms.Resize(size=size, antialias=True), - make_input(self.INPUT_SIZE, device=device), - # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes - check_v1_compatibility=dict(rtol=0, atol=1), - ) - 
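As a hand-worked illustration of the `_compute_output_size` helper above (example numbers chosen here, not part of the original suite): for `INPUT_SIZE = (17, 11)` the width is the smaller edge, so an integer `size` resizes the width and preserves the aspect ratio, rescaling again only if the longer edge exceeds `max_size`.

# Mirrors _compute_output_size for input_size=(17, 11), i.e. (height, width):
#   size=12, max_size=None -> ratio = 11 / 17, new_width = 12, new_height = int(12 * 17 / 11) = 18 -> (18, 12)
#   size=12, max_size=13   -> 18 > 13, so rescale: new_height = 13, new_width = int(13 * 12 / 18) = 8 -> (13, 8)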
- def _check_output_size(self, input, output, *, size, max_size): - assert tuple(F.get_size(output)) == self._compute_output_size( - input_size=F.get_size(input), size=size, max_size=max_size - ) - - @pytest.mark.parametrize("size", OUTPUT_SIZES) - # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. - # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` - @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) - @pytest.mark.parametrize("use_max_size", [True, False]) - @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) - def test_image_correctness(self, size, interpolation, use_max_size, fn): - if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): - return - - image = make_image(self.INPUT_SIZE, dtype=torch.uint8) - - actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) - expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) - - self._check_output_size(image, actual, size=size, **max_size_kwarg) - torch.testing.assert_close(actual, expected, atol=1, rtol=0) - - def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): - old_height, old_width = bounding_boxes.canvas_size - new_height, new_width = self._compute_output_size( - input_size=bounding_boxes.canvas_size, size=size, max_size=max_size - ) - - if (old_height, old_width) == (new_height, new_width): - return bounding_boxes - - affine_matrix = np.array( - [ - [new_width / old_width, 0, 0], - [0, new_height / old_height, 0], - ], - ) - - return reference_affine_bounding_boxes_helper( - bounding_boxes, - affine_matrix=affine_matrix, - new_canvas_size=(new_height, new_width), - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize("use_max_size", [True, False]) - @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) - def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): - if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): - return - - bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) - - actual = fn(bounding_boxes, size=size, **max_size_kwarg) - expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) - - self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) - torch.testing.assert_close(actual, expected) - - @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - def test_pil_interpolation_compat_smoke(self, interpolation, make_input): - input = make_input(self.INPUT_SIZE) - - with ( - contextlib.nullcontext() - if isinstance(input, PIL.Image.Image) - # This error is triggered in PyTorch core - else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") - ): - F.resize( - input, - size=self.OUTPUT_SIZES[0], - interpolation=interpolation, - ) - - def test_functional_pil_antialias_warning(self): - with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): - F.resize(make_image_pil(self.INPUT_SIZE), 
size=self.OUTPUT_SIZES[0], antialias=False) - - @pytest.mark.parametrize("size", OUTPUT_SIZES) - @pytest.mark.parametrize( - "make_input", - [ - make_image_tensor, - make_image_pil, - make_image, - make_bounding_boxes, - make_segmentation_mask, - make_detection_mask, - make_video, - ], - ) - def test_max_size_error(self, size, make_input): - if isinstance(size, int) or len(size) == 1: - max_size = (size if isinstance(size, int) else size[0]) - 1 - match = "must be strictly greater than the requested size" - else: - # value can be anything other than None - max_size = -1 - match = "size should be an int or a sequence of length 1" - - with pytest.raises(ValueError, match=match): - F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) - - @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image, make_video], - ) - def test_antialias_warning(self, interpolation, make_input): - with ( - assert_warns_antialias_default_value() - if interpolation in {transforms.InterpolationMode.BILINEAR, transforms.InterpolationMode.BICUBIC} - else assert_no_warnings() - ): - F.resize( - make_input(self.INPUT_SIZE), - size=self.OUTPUT_SIZES[0], - interpolation=interpolation, - ) - - @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - def test_interpolation_int(self, interpolation, make_input): - input = make_input(self.INPUT_SIZE) - - # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to - # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a - # difference and thus we don't test it here. - if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: - return - - expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) - actual = F.resize( - input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True - ) - - assert_equal(actual, expected) - - def test_transform_unknown_size_error(self): - with pytest.raises(ValueError, match="size can either be an integer or a list or tuple of one or two integers"): - transforms.Resize(size=object()) - - @pytest.mark.parametrize( - "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] - ) - @pytest.mark.parametrize( - "make_input", - [ - make_image_tensor, - make_image_pil, - make_image, - make_bounding_boxes, - make_segmentation_mask, - make_detection_mask, - make_video, - ], - ) - def test_noop(self, size, make_input): - input = make_input(self.INPUT_SIZE) - - output = F.resize(input, size=F.get_size(input), antialias=True) - - # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there - # is a good reason to break this, feel free to downgrade to an equality check. - if isinstance(input, tv_tensors.TVTensor): - # We can't test identity directly, since that checks for the identity of the Python object. Since all - # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check - # that the underlying storage is the same - assert output.data_ptr() == input.data_ptr() - else: - assert output is input - - @pytest.mark.parametrize( - "make_input", - [ - make_image_tensor, - make_image_pil, - make_image, - make_bounding_boxes, - make_segmentation_mask, - make_detection_mask, - make_video, - ], - ) - def test_no_regression_5405(self, make_input): - # Checks that `max_size` is not ignored if `size == small_edge_size` - # See https://github.com/pytorch/vision/issues/5405 - - input = make_input(self.INPUT_SIZE) - - size = min(F.get_size(input)) - max_size = size + 1 - output = F.resize(input, size=size, max_size=max_size, antialias=True) - - assert max(F.get_size(output)) == max_size - - def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs): - # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming - # from PIL or our own I/O functions do not have a batch dimensions and are thus 3D, i.e. (C, H, W). Still, the - # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create - # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels - # last although PyTorch doesn't recognizes it as such. - emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1 - - image = make_image( - *args, - batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims, - memory_format=memory_format, - **kwargs, - ) - - if emulate_channels_last: - image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) - - return image - - def _check_stride(self, image, *, memory_format): - C, H, W = F.get_dimensions(image) - if memory_format is torch.contiguous_format: - expected_stride = (H * W, W, 1) - elif memory_format is torch.channels_last: - expected_stride = (1, W * C, C) - else: - raise ValueError(f"Unknown memory_format: {memory_format}") - - assert image.stride() == expected_stride - - # TODO: We can remove this test and related torchvision workaround - # once we fixed related pytorch issue: https://github.com/pytorch/pytorch/issues/68430 - @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) - @pytest.mark.parametrize("antialias", [True, False]) - @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last]) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device): - size = self.OUTPUT_SIZES[0] - - input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format) - - # Smoke test to make sure we aren't starting with wrong assumptions - self._check_stride(input, memory_format=memory_format) - - output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias) - - self._check_stride(output, memory_format=memory_format) - - def test_float16_no_rounding(self): - # Make sure Resize() doesn't round float16 images - # Non-regression test for https://github.com/pytorch/vision/issues/7667 - - input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16) - output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True) - - assert output.dtype is torch.float16 - assert (output.round() - output).abs().sum() > 0 - - -class TestHorizontalFlip: - 
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, dtype, device): - check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - check_kernel( - F.horizontal_flip_bounding_boxes, - bounding_boxes, - format=format, - canvas_size=bounding_boxes.canvas_size, - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.horizontal_flip_mask, make_mask()) - - def test_kernel_video(self): - check_kernel(F.horizontal_flip_video, make_video()) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.horizontal_flip, make_input()) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.horizontal_flip_image, torch.Tensor), - (F._horizontal_flip_image_pil, PIL.Image.Image), - (F.horizontal_flip_image, tv_tensors.Image), - (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), - (F.horizontal_flip_mask, tv_tensors.Mask), - (F.horizontal_flip_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, device): - check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) - - @pytest.mark.parametrize( - "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] - ) - def test_image_correctness(self, fn): - image = make_image(dtype=torch.uint8, device="cpu") - - actual = fn(image) - expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) - - torch.testing.assert_close(actual, expected) - - def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): - affine_matrix = np.array( - [ - [-1, 0, bounding_boxes.canvas_size[1]], - [0, 1, 0], - ], - ) - - return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize( - "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] - ) - def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_boxes(format=format) - - actual = fn(bounding_boxes) - expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) - - torch.testing.assert_close(actual, expected) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform_noop(self, make_input, device): - input = make_input(device=device) - - transform = transforms.RandomHorizontalFlip(p=0) - - 
output = transform(input) - - assert_equal(output, input) - - -class TestAffine: - _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( - # float, int - angle=[-10.9, 18], - # two-list of float, two-list of int, two-tuple of float, two-tuple of int - translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], - # float - scale=[0.5], - # float, int, - # one-list of float, one-list of int, one-tuple of float, one-tuple of int - # two-list of float, two-list of int, two-tuple of float, two-tuple of int - shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], - # None - # two-list of float, two-list of int, two-tuple of float, two-tuple of int - center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], - ) - # The special case for shear makes sure we pick a value that is supported while JIT scripting - _MINIMAL_AFFINE_KWARGS = { - k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) - for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() - } - _CORRECTNESS_AFFINE_KWARGS = { - k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] - for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() - } - - _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( - degrees=[30, (-15, 20)], - translate=[None, (0.5, 0.5)], - scale=[None, (0.75, 1.25)], - shear=[None, (12, 30, -17, 5), 10, (-5, 12)], - ) - _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { - k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() - } - - def _check_kernel(self, kernel, input, *args, **kwargs): - kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() - kwargs_.update(kwargs) - check_kernel(kernel, input, *args, **kwargs_) - - @param_value_parametrization( - angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], - translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], - shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], - center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], - interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], - fill=EXHAUSTIVE_TYPE_FILLS, - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, param, value, dtype, device): - if param == "fill": - value = adapt_fill(value, dtype=dtype) - self._check_kernel( - F.affine_image, - make_image(dtype=dtype, device=device), - **{param: value}, - check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), - check_cuda_vs_cpu=dict(atol=1, rtol=0) - if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR - else True, - ) - - @param_value_parametrization( - angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], - translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], - shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], - center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], - ) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, param, value, format, dtype, device): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - self._check_kernel( - F.affine_bounding_boxes, - bounding_boxes, - format=format, - canvas_size=bounding_boxes.canvas_size, - **{param: value}, - check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), - ) - - 
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - self._check_kernel(F.affine_mask, make_mask()) - - def test_kernel_video(self): - self._check_kernel(F.affine_video, make_video()) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.affine_image, torch.Tensor), - (F._affine_image_pil, PIL.Image.Image), - (F.affine_image, tv_tensors.Image), - (F.affine_bounding_boxes, tv_tensors.BoundingBoxes), - (F.affine_mask, tv_tensors.Mask), - (F.affine_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, device): - input = make_input(device=device) - - check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input) - - @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) - @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) - @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) - @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - @pytest.mark.parametrize( - "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] - ) - @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) - def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill): - image = make_image(dtype=torch.uint8, device="cpu") - - fill = adapt_fill(fill, dtype=torch.uint8) - - actual = F.affine( - image, - angle=angle, - translate=translate, - scale=scale, - shear=shear, - center=center, - interpolation=interpolation, - fill=fill, - ) - expected = F.to_image( - F.affine( - F.to_pil_image(image), - angle=angle, - translate=translate, - scale=scale, - shear=shear, - center=center, - interpolation=interpolation, - fill=fill, - ) - ) - - mae = (actual.float() - expected.float()).abs().mean() - assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 - - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - @pytest.mark.parametrize( - "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] - ) - @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_image_correctness(self, center, interpolation, fill, seed): - image = make_image(dtype=torch.uint8, device="cpu") - - fill = adapt_fill(fill, dtype=torch.uint8) - - transform = transforms.RandomAffine( - **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill - ) - - torch.manual_seed(seed) - actual = transform(image) - - torch.manual_seed(seed) - expected = F.to_image(transform(F.to_pil_image(image))) - - mae = (actual.float() - expected.float()).abs().mean() - assert mae < 2 if interpolation is 
transforms.InterpolationMode.NEAREST else 8 - - def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): - rot = math.radians(angle) - cx, cy = center - tx, ty = translate - sx, sy = [math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)] - - c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) - t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) - c_matrix_inv = np.linalg.inv(c_matrix) - rs_matrix = np.array( - [ - [scale * math.cos(rot), -scale * math.sin(rot), 0], - [scale * math.sin(rot), scale * math.cos(rot), 0], - [0, 0, 1], - ] - ) - shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) - shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) - rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) - true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) - return true_matrix[:2, :] - - def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): - if center is None: - center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] - - return reference_affine_bounding_boxes_helper( - bounding_boxes, - affine_matrix=self._compute_affine_matrix( - angle=angle, translate=translate, scale=scale, shear=shear, center=center - ), - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) - @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) - @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) - @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): - bounding_boxes = make_bounding_boxes(format=format) - - actual = F.affine( - bounding_boxes, - angle=angle, - translate=translate, - scale=scale, - shear=shear, - center=center, - ) - expected = self._reference_affine_bounding_boxes( - bounding_boxes, - angle=angle, - translate=translate, - scale=scale, - shear=shear, - center=center, - ) - - torch.testing.assert_close(actual, expected) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_bounding_boxes_correctness(self, format, center, seed): - bounding_boxes = make_bounding_boxes(format=format) - - transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) - - torch.manual_seed(seed) - params = transform._get_params([bounding_boxes]) - - torch.manual_seed(seed) - actual = transform(bounding_boxes) - - expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center) - - torch.testing.assert_close(actual, expected) - - @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) - @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"]) - @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"]) - @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed): - image = make_image() - height, width = 
F.get_size(image) - - transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) - - torch.manual_seed(seed) - params = transform._get_params([image]) - - if isinstance(degrees, (int, float)): - assert -degrees <= params["angle"] <= degrees - else: - assert degrees[0] <= params["angle"] <= degrees[1] - - if translate is not None: - width_max = int(round(translate[0] * width)) - height_max = int(round(translate[1] * height)) - assert -width_max <= params["translate"][0] <= width_max - assert -height_max <= params["translate"][1] <= height_max - else: - assert params["translate"] == (0, 0) - - if scale is not None: - assert scale[0] <= params["scale"] <= scale[1] - else: - assert params["scale"] == 1.0 - - if shear is not None: - if isinstance(shear, (int, float)): - assert -shear <= params["shear"][0] <= shear - assert params["shear"][1] == 0.0 - elif len(shear) == 2: - assert shear[0] <= params["shear"][0] <= shear[1] - assert params["shear"][1] == 0.0 - elif len(shear) == 4: - assert shear[0] <= params["shear"][0] <= shear[1] - assert shear[2] <= params["shear"][1] <= shear[3] - else: - assert params["shear"] == (0, 0) - - @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) - @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) - def test_transform_sequence_len_errors(self, param, value): - if param in {"degrees", "shear"} and not isinstance(value, list): - return - - kwargs = {param: value} - if param != "degrees": - kwargs["degrees"] = 0 - - with pytest.raises( - ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" - ): - transforms.RandomAffine(**kwargs) - - def test_transform_negative_degrees_error(self): - with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): - transforms.RandomAffine(degrees=-1) - - @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) - def test_transform_translate_range_error(self, translate): - with pytest.raises(ValueError, match="translation values should be between 0 and 1"): - transforms.RandomAffine(degrees=0, translate=translate) - - @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) - def test_transform_scale_range_error(self, scale): - with pytest.raises(ValueError, match="scale values should be positive"): - transforms.RandomAffine(degrees=0, scale=scale) - - def test_transform_negative_shear_error(self): - with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): - transforms.RandomAffine(degrees=0, shear=-1) - - def test_transform_unknown_fill_error(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(degrees=0, fill="fill") - - -class TestVerticalFlip: - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, dtype, device): - check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - check_kernel( - F.vertical_flip_bounding_boxes, - bounding_boxes, - format=format, - canvas_size=bounding_boxes.canvas_size, - ) - - 
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.vertical_flip_mask, make_mask()) - - def test_kernel_video(self): - check_kernel(F.vertical_flip_video, make_video()) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.vertical_flip, make_input()) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.vertical_flip_image, torch.Tensor), - (F._vertical_flip_image_pil, PIL.Image.Image), - (F.vertical_flip_image, tv_tensors.Image), - (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), - (F.vertical_flip_mask, tv_tensors.Mask), - (F.vertical_flip_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, device): - check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) - - @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) - def test_image_correctness(self, fn): - image = make_image(dtype=torch.uint8, device="cpu") - - actual = fn(image) - expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) - - torch.testing.assert_close(actual, expected) - - def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): - affine_matrix = np.array( - [ - [1, 0, 0], - [0, -1, bounding_boxes.canvas_size[0]], - ], - ) - - return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) - def test_bounding_boxes_correctness(self, format, fn): - bounding_boxes = make_bounding_boxes(format=format) - - actual = fn(bounding_boxes) - expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) - - torch.testing.assert_close(actual, expected) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform_noop(self, make_input, device): - input = make_input(device=device) - - transform = transforms.RandomVerticalFlip(p=0) - - output = transform(input) - - assert_equal(output, input) - - -class TestRotate: - _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( - # float, int - angle=[-10.9, 18], - # None - # two-list of float, two-list of int, two-tuple of float, two-tuple of int - center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], - ) - _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} - _CORRECTNESS_AFFINE_KWARGS = { - k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] - for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() - } - - _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( - degrees=[30, (-15, 20)], - ) - _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} - 
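The vertical-flip reference above encodes the flip as the affine matrix [[1, 0, 0], [0, -1, H]], i.e. (x, y) -> (x, H - y). A small numeric sketch with illustrative values (H and the box coordinates are arbitrary, not from the original tests) of how a box's corners move:

import numpy as np

H = 10  # hypothetical canvas height
affine_matrix = np.array([[1, 0, 0], [0, -1, H]])
corners = np.array([[2.0, 1.0, 1.0], [5.0, 4.0, 1.0]])  # (x1, y1) and (x2, y2) of an XYXY box, homogeneous

flipped = corners @ affine_matrix.T  # [[2, 9], [5, 6]]
# Taking the per-axis min/max of the transformed corners gives the flipped XYXY box (2, 6, 5, 9),
# which matches (x1, H - y2, x2, H - y1).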
- @param_value_parametrization( - angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], - interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], - expand=[False, True], - center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], - fill=EXHAUSTIVE_TYPE_FILLS, - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, param, value, dtype, device): - kwargs = {param: value} - if param != "angle": - kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - check_kernel( - F.rotate_image, - make_image(dtype=dtype, device=device), - **kwargs, - check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), - ) - - @param_value_parametrization( - angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], - expand=[False, True], - center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], - ) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, param, value, format, dtype, device): - kwargs = {param: value} - if param != "angle": - kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - - check_kernel( - F.rotate_bounding_boxes, - bounding_boxes, - format=format, - canvas_size=bounding_boxes.canvas_size, - **kwargs, - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) - - def test_kernel_video(self): - check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.rotate_image, torch.Tensor), - (F._rotate_image_pil, PIL.Image.Image), - (F.rotate_image, tv_tensors.Image), - (F.rotate_bounding_boxes, tv_tensors.BoundingBoxes), - (F.rotate_mask, tv_tensors.Mask), - (F.rotate_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, device): - check_transform( - transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) - ) - - @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - @pytest.mark.parametrize( - "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] - ) - @pytest.mark.parametrize("expand", [False, True]) - @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) - def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): - image = make_image(dtype=torch.uint8, device="cpu") - - fill = adapt_fill(fill, dtype=torch.uint8) - - actual = F.rotate(image, 
-        expected = F.to_image(
-            F.rotate(
-                F.to_pil_image(image), angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill
-            )
-        )
-
-        mae = (actual.float() - expected.float()).abs().mean()
-        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
-
-    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
-    @pytest.mark.parametrize(
-        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
-    )
-    @pytest.mark.parametrize("expand", [False, True])
-    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
-    @pytest.mark.parametrize("seed", list(range(5)))
-    def test_transform_image_correctness(self, center, interpolation, expand, fill, seed):
-        image = make_image(dtype=torch.uint8, device="cpu")
-
-        fill = adapt_fill(fill, dtype=torch.uint8)
-
-        transform = transforms.RandomRotation(
-            **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES,
-            center=center,
-            interpolation=interpolation,
-            expand=expand,
-            fill=fill,
-        )
-
-        torch.manual_seed(seed)
-        actual = transform(image)
-
-        torch.manual_seed(seed)
-        expected = F.to_image(transform(F.to_pil_image(image)))
-
-        mae = (actual.float() - expected.float()).abs().mean()
-        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
-
-    def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix):
-        if not expand:
-            return canvas_size, (0.0, 0.0)
-
-        input_height, input_width = canvas_size
-
-        input_image_frame = np.array(
-            [
-                [0.0, 0.0, 1.0],
-                [0.0, input_height, 1.0],
-                [input_width, input_height, 1.0],
-                [input_width, 0.0, 1.0],
-            ],
-            dtype=np.float64,
-        )
-        output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T)
-
-        recenter_x = float(np.min(output_image_frame[:, 0]))
-        recenter_y = float(np.min(output_image_frame[:, 1]))
-
-        output_width = int(np.max(output_image_frame[:, 0]) - recenter_x)
-        output_height = int(np.max(output_image_frame[:, 1]) - recenter_y)
-
-        return (output_height, output_width), (recenter_x, recenter_y)
-
-    def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy):
-        x, y = recenter_xy
-        if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY:
-            translate = [x, y, x, y]
-        else:
-            translate = [x, y, 0.0, 0.0]
-        return tv_tensors.wrap(
-            (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes
-        )
-
-    def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center):
-        if center is None:
-            center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]
-        cx, cy = center
-
-        a = np.cos(angle * np.pi / 180.0)
-        b = np.sin(angle * np.pi / 180.0)
-        affine_matrix = np.array(
-            [
-                [a, b, cx - cx * a - b * cy],
-                [-b, a, cy + cx * b - a * cy],
-            ],
-        )
-
-        new_canvas_size, recenter_xy = self._compute_output_canvas_size(
-            expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix
-        )
-
-        output = reference_affine_bounding_boxes_helper(
-            bounding_boxes,
-            affine_matrix=affine_matrix,
-            new_canvas_size=new_canvas_size,
-            clamp=False,
-        )
-
-        return F.clamp_bounding_boxes(self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy)).to(
-            bounding_boxes
-        )
-
-    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
-    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
-    @pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): - bounding_boxes = make_bounding_boxes(format=format) - - actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) - expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) - - torch.testing.assert_close(actual, expected) - torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("expand", [False, True]) - @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): - bounding_boxes = make_bounding_boxes(format=format) - - transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) - - torch.manual_seed(seed) - params = transform._get_params([bounding_boxes]) - - torch.manual_seed(seed) - actual = transform(bounding_boxes) - - expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) - - torch.testing.assert_close(actual, expected) - torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) - - @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_transform_get_params_bounds(self, degrees, seed): - transform = transforms.RandomRotation(degrees=degrees) - - torch.manual_seed(seed) - params = transform._get_params([]) - - if isinstance(degrees, (int, float)): - assert -degrees <= params["angle"] <= degrees - else: - assert degrees[0] <= params["angle"] <= degrees[1] - - @pytest.mark.parametrize("param", ["degrees", "center"]) - @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) - def test_transform_sequence_len_errors(self, param, value): - if param == "degrees" and not isinstance(value, list): - return - - kwargs = {param: value} - if param != "degrees": - kwargs["degrees"] = 0 - - with pytest.raises( - ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" - ): - transforms.RandomRotation(**kwargs) - - def test_transform_negative_degrees_error(self): - with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): - transforms.RandomAffine(degrees=-1) - - def test_transform_unknown_fill_error(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(degrees=0, fill="fill") - - -class TestCompose: - class BuiltinTransform(transforms.Transform): - def _transform(self, inpt, params): - return inpt - - class PackedInputTransform(nn.Module): - def forward(self, sample): - assert len(sample) == 2 - return sample - - class UnpackedInputTransform(nn.Module): - def forward(self, image, label): - return image, label - - @pytest.mark.parametrize( - "transform_clss", - [ - [BuiltinTransform], - [PackedInputTransform], - [UnpackedInputTransform], - [BuiltinTransform, BuiltinTransform], - [PackedInputTransform, PackedInputTransform], - [UnpackedInputTransform, UnpackedInputTransform], - [BuiltinTransform, PackedInputTransform, BuiltinTransform], - [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], - 
[PackedInputTransform, BuiltinTransform, PackedInputTransform], - [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], - ], - ) - @pytest.mark.parametrize("unpack", [True, False]) - def test_packed_unpacked(self, transform_clss, unpack): - needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in transform_clss) - needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in transform_clss) - assert not (needs_packed_inputs and needs_unpacked_inputs) - - transform = transforms.Compose([cls() for cls in transform_clss]) - - image = make_image() - label = 3 - packed_input = (image, label) - - def call_transform(): - if unpack: - return transform(*packed_input) - else: - return transform(packed_input) - - if needs_unpacked_inputs and not unpack: - with pytest.raises(TypeError, match="missing 1 required positional argument"): - call_transform() - elif needs_packed_inputs and unpack: - with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): - call_transform() - else: - output = call_transform() - - assert isinstance(output, tuple) and len(output) == 2 - assert output[0] is image - assert output[1] is label - - -class TestToDtype: - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.to_dtype_image, make_image_tensor), - (F.to_dtype_image, make_image), - (F.to_dtype_video, make_video), - ], - ) - @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("scale", (True, False)) - def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): - check_kernel( - kernel, - make_input(dtype=input_dtype, device=device), - dtype=output_dtype, - scale=scale, - ) - - @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) - @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("scale", (True, False)) - def test_functional(self, make_input, input_dtype, output_dtype, device, scale): - check_functional( - F.to_dtype, - make_input(dtype=input_dtype, device=device), - dtype=output_dtype, - scale=scale, - ) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("scale", (True, False)) - @pytest.mark.parametrize("as_dict", (True, False)) - def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): - input = make_input(dtype=input_dtype, device=device) - if as_dict: - output_dtype = {type(input): output_dtype} - check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input) - - def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): - input_dtype = image.dtype - output_dtype = dtype - - if not scale: - return image.to(dtype) - - if output_dtype == input_dtype: - return image - - def fn(value): - if input_dtype.is_floating_point: - if output_dtype.is_floating_point: - 
return value - else: - return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) - else: - input_max_value = torch.iinfo(input_dtype).max - - if output_dtype.is_floating_point: - return float(decimal.Decimal(value) / input_max_value) - else: - output_max_value = torch.iinfo(output_dtype).max - - if input_max_value > output_max_value: - factor = (input_max_value + 1) // (output_max_value + 1) - return value / factor - else: - factor = (output_max_value + 1) // (input_max_value + 1) - return value * factor - - return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype, device=image.device) - - @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("scale", (True, False)) - def test_image_correctness(self, input_dtype, output_dtype, device, scale): - if input_dtype.is_floating_point and output_dtype == torch.int64: - pytest.xfail("float to int64 conversion is not supported") - - input = make_image(dtype=input_dtype, device=device) - - out = F.to_dtype(input, dtype=output_dtype, scale=scale) - expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) - - if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: - torch.testing.assert_close(out, expected, atol=1, rtol=0) - else: - torch.testing.assert_close(out, expected) - - def was_scaled(self, inpt): - # this assumes the target dtype is float - return inpt.max() <= 1 - - def make_inpt_with_bbox_and_mask(self, make_input): - H, W = 10, 10 - inpt_dtype = torch.uint8 - bbox_dtype = torch.float32 - mask_dtype = torch.bool - sample = { - "inpt": make_input(size=(H, W), dtype=inpt_dtype), - "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), - "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), - } - - return sample, inpt_dtype, bbox_dtype, mask_dtype - - @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) - @pytest.mark.parametrize("scale", (True, False)) - def test_dtype_not_a_dict(self, make_input, scale): - # assert only inpt gets transformed when dtype isn't a dict - - sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) - out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) - - assert out["inpt"].dtype != inpt_dtype - assert out["inpt"].dtype == torch.float32 - if scale: - assert self.was_scaled(out["inpt"]) - else: - assert not self.was_scaled(out["inpt"]) - assert out["bbox"].dtype == bbox_dtype - assert out["mask"].dtype == mask_dtype - - @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) - def test_others_catch_all_and_none(self, make_input): - # make sure "others" works as a catch-all and that None means no conversion - - sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) - out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) - assert out["inpt"].dtype == inpt_dtype - assert out["bbox"].dtype == bbox_dtype - assert out["mask"].dtype != mask_dtype - assert out["mask"].dtype == torch.int64 - - @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) - def test_typical_use_case(self, make_input): - # Typical use-case: want to convert dtype and scale for inpt and just dtype for masks. 
- # This just makes sure we now have a decent API for this - - sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) - out = transforms.ToDtype( - dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True - )(sample) - assert out["inpt"].dtype != inpt_dtype - assert out["inpt"].dtype == torch.float32 - assert self.was_scaled(out["inpt"]) - assert out["bbox"].dtype == bbox_dtype - assert out["mask"].dtype != mask_dtype - assert out["mask"].dtype == torch.int64 - - @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) - def test_errors_warnings(self, make_input): - sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) - - with pytest.raises(ValueError, match="No dtype was specified for"): - out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) - with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): - transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) - with pytest.warns(UserWarning, match="no scaling will be done"): - out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) - assert out["inpt"].dtype == inpt_dtype - assert out["bbox"].dtype == bbox_dtype - assert out["mask"].dtype == mask_dtype - - -class TestAdjustBrightness: - _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] - _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.adjust_brightness_image, make_image), - (F.adjust_brightness_video, make_video), - ], - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel(self, kernel, make_input, dtype, device): - check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) - - @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) - def test_functional(self, make_input): - check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.adjust_brightness_image, torch.Tensor), - (F._adjust_brightness_image_pil, PIL.Image.Image), - (F.adjust_brightness_image, tv_tensors.Image), - (F.adjust_brightness_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) - def test_image_correctness(self, brightness_factor): - image = make_image(dtype=torch.uint8, device="cpu") - - actual = F.adjust_brightness(image, brightness_factor=brightness_factor) - expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) - - torch.testing.assert_close(actual, expected) - - -class TestCutMixMixUp: - class DummyDataset: - def __init__(self, size, num_classes): - self.size = size - self.num_classes = num_classes - assert size < num_classes - - def __getitem__(self, idx): - img = torch.rand(3, 100, 100) - label = idx # This ensures all labels in a batch are unique and makes testing easier - return img, label - - def __len__(self): - return self.size - - @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) - def 
test_supported_input_structure(self, T): - - batch_size = 32 - num_classes = 100 - - dataset = self.DummyDataset(size=batch_size, num_classes=num_classes) - - cutmix_mixup = T(num_classes=num_classes) - - dl = DataLoader(dataset, batch_size=batch_size) - - # Input sanity checks - img, target = next(iter(dl)) - input_img_size = img.shape[-3:] - assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) - assert target.shape == (batch_size,) - - def check_output(img, target): - assert img.shape == (batch_size, *input_img_size) - assert target.shape == (batch_size, num_classes) - torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) - num_non_zero_labels = (target != 0).sum(axis=-1) - assert (num_non_zero_labels == 2).all() - - # After Dataloader, as unpacked input - img, target = next(iter(dl)) - assert target.shape == (batch_size,) - img, target = cutmix_mixup(img, target) - check_output(img, target) - - # After Dataloader, as packed input - packed_from_dl = next(iter(dl)) - assert isinstance(packed_from_dl, list) - img, target = cutmix_mixup(packed_from_dl) - check_output(img, target) - - # As collation function. We expect default_collate to be used by users. - def collate_fn_1(batch): - return cutmix_mixup(default_collate(batch)) - - def collate_fn_2(batch): - return cutmix_mixup(*default_collate(batch)) - - for collate_fn in (collate_fn_1, collate_fn_2): - dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) - img, target = next(iter(dl)) - check_output(img, target) - - @needs_cuda - @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) - def test_cpu_vs_gpu(self, T): - num_classes = 10 - batch_size = 3 - H, W = 12, 12 - - imgs = torch.rand(batch_size, 3, H, W) - labels = torch.randint(0, num_classes, (batch_size,)) - cutmix_mixup = T(alpha=0.5, num_classes=num_classes) - - _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) - - @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) - def test_error(self, T): - - num_classes = 10 - batch_size = 9 - - imgs = torch.rand(batch_size, 3, 12, 12) - cutmix_mixup = T(alpha=0.5, num_classes=num_classes) - - for input_with_bad_type in ( - F.to_pil_image(imgs[0]), - tv_tensors.Mask(torch.rand(12, 12)), - tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), - ): - with pytest.raises(ValueError, match="does not support PIL images, "): - cutmix_mixup(input_with_bad_type) - - with pytest.raises(ValueError, match="Could not infer where the labels are"): - cutmix_mixup({"img": imgs, "Nothing_else": 3}) - - with pytest.raises(ValueError, match="labels tensor should be of shape"): - # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label - # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently - cutmix_mixup(imgs) - - with pytest.raises(ValueError, match="When using the default labels_getter"): - cutmix_mixup(imgs, "not_a_tensor") - - with pytest.raises(ValueError, match="labels tensor should be of shape"): - cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3))) - - with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): - cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) - - with pytest.raises(ValueError, match="does not match the batch size of the labels"): - cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) - - with pytest.raises(ValueError, match="labels tensor should be of shape"): - # The purpose of this check is more about documenting the current - # behaviour of what happens on a Compose(), rather than actually - # asserting the expected behaviour. We may support Compose() in the - # future, e.g. for 2 consecutive CutMix? - labels = torch.randint(0, num_classes, size=(batch_size,)) - transforms.Compose([cutmix_mixup, cutmix_mixup])(imgs, labels) - - -@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) -@pytest.mark.parametrize("sample_type", (tuple, list, dict)) -def test_labels_getter_default_heuristic(key, sample_type): - labels = torch.arange(10) - sample = {key: labels, "another_key": "whatever"} - if sample_type is not dict: - sample = sample_type((None, sample, "whatever_again")) - assert transforms._utils._find_labels_default_heuristic(sample) is labels - - if key.lower() != "labels": - # If "labels" is in the dict (case-insensitive), - # it takes precedence over other keys which would otherwise be a match - d = {key: "something_else", "labels": labels} - assert transforms._utils._find_labels_default_heuristic(d) is labels - - -class TestShapeGetters: - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.get_dimensions_image, make_image_tensor), - (F._get_dimensions_image_pil, make_image_pil), - (F.get_dimensions_image, make_image), - (F.get_dimensions_video, make_video), - ], - ) - def test_get_dimensions(self, kernel, make_input): - size = (10, 10) - color_space, num_channels = "RGB", 3 - - input = make_input(size, color_space=color_space) - - assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.get_num_channels_image, make_image_tensor), - (F._get_num_channels_image_pil, make_image_pil), - (F.get_num_channels_image, make_image), - (F.get_num_channels_video, make_video), - ], - ) - def test_get_num_channels(self, kernel, make_input): - color_space, num_channels = "RGB", 3 - - input = make_input(color_space=color_space) - - assert kernel(input) == F.get_num_channels(input) == num_channels - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.get_size_image, make_image_tensor), - (F._get_size_image_pil, make_image_pil), - (F.get_size_image, make_image), - (F.get_size_bounding_boxes, make_bounding_boxes), - (F.get_size_mask, make_detection_mask), - (F.get_size_mask, make_segmentation_mask), - (F.get_size_video, make_video), - ], - ) - def test_get_size(self, kernel, make_input): - size = (10, 10) - - input = make_input(size) - - assert kernel(input) == F.get_size(input) == list(size) - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.get_num_frames_video, make_video_tensor), - (F.get_num_frames_video, make_video), - ], - ) - def 
test_get_num_frames(self, kernel, make_input): - num_frames = 4 - - input = make_input(num_frames=num_frames) - - assert kernel(input) == F.get_num_frames(input) == num_frames - - @pytest.mark.parametrize( - ("functional", "make_input"), - [ - (F.get_dimensions, make_bounding_boxes), - (F.get_dimensions, make_detection_mask), - (F.get_dimensions, make_segmentation_mask), - (F.get_num_channels, make_bounding_boxes), - (F.get_num_channels, make_detection_mask), - (F.get_num_channels, make_segmentation_mask), - (F.get_num_frames, make_image_pil), - (F.get_num_frames, make_image), - (F.get_num_frames, make_bounding_boxes), - (F.get_num_frames, make_detection_mask), - (F.get_num_frames, make_segmentation_mask), - ], - ) - def test_unsupported_types(self, functional, make_input): - input = make_input() - - with pytest.raises(TypeError, match=re.escape(str(type(input)))): - functional(input) - - -class TestRegisterKernel: - @pytest.mark.parametrize("functional", (F.resize, "resize")) - def test_register_kernel(self, functional): - class CustomTVTensor(tv_tensors.TVTensor): - pass - - kernel_was_called = False - - @F.register_kernel(functional, CustomTVTensor) - def new_resize(dp, *args, **kwargs): - nonlocal kernel_was_called - kernel_was_called = True - return dp - - t = transforms.Resize(size=(224, 224), antialias=True) - - my_dp = CustomTVTensor(torch.rand(3, 10, 10)) - out = t(my_dp) - assert out is my_dp - assert kernel_was_called - - # Sanity check to make sure we didn't override the kernel of other types - t(torch.rand(3, 10, 10)).shape == (3, 224, 224) - t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) - - def test_errors(self): - with pytest.raises(ValueError, match="Could not find functional with name"): - F.register_kernel("bad_name", tv_tensors.Image) - - with pytest.raises(ValueError, match="Kernels can only be registered on functionals"): - F.register_kernel(tv_tensors.Image, F.resize) - - with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"): - F.register_kernel(F.resize, object) - - with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"): - F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image) - - class CustomTVTensor(tv_tensors.TVTensor): - pass - - def resize_custom_tv_tensor(): - pass - - F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) - - with pytest.raises(ValueError, match="already has a kernel registered for type"): - F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) - - -class TestGetKernel: - # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination - # would also be fine - KERNELS = { - torch.Tensor: F.resize_image, - PIL.Image.Image: F._resize_image_pil, - tv_tensors.Image: F.resize_image, - tv_tensors.BoundingBoxes: F.resize_bounding_boxes, - tv_tensors.Mask: F.resize_mask, - tv_tensors.Video: F.resize_video, - } - - @pytest.mark.parametrize("input_type", [str, int, object]) - def test_unsupported_types(self, input_type): - with pytest.raises(TypeError, match="supports inputs of type"): - _get_kernel(F.resize, input_type) - - def test_exact_match(self): - # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the - # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional - # here, register the kernels without wrapper, and check the exact matching afterwards. 
- def resize_with_pure_kernels(): - pass - - for input_type, kernel in self.KERNELS.items(): - _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) - - assert _get_kernel(resize_with_pure_kernels, input_type) is kernel - - def test_builtin_tv_tensor_subclass(self): - # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the - # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional - # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched - # to the kernel of the corresponding superclass - def resize_with_pure_kernels(): - pass - - class MyImage(tv_tensors.Image): - pass - - class MyBoundingBoxes(tv_tensors.BoundingBoxes): - pass - - class MyMask(tv_tensors.Mask): - pass - - class MyVideo(tv_tensors.Video): - pass - - for custom_tv_tensor_subclass in [ - MyImage, - MyBoundingBoxes, - MyMask, - MyVideo, - ]: - builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] - builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] - _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( - builtin_tv_tensor_kernel - ) - - assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel - - def test_tv_tensor_subclass(self): - class MyTVTensor(tv_tensors.TVTensor): - pass - - with pytest.raises(TypeError, match="supports inputs of type"): - _get_kernel(F.resize, MyTVTensor) - - def resize_my_tv_tensor(): - pass - - _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) - - assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor - - def test_pil_image_subclass(self): - opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") - loaded_image = opened_image.convert("RGB") - - # check the assumptions - assert isinstance(opened_image, PIL.Image.Image) - assert type(opened_image) is not PIL.Image.Image - - assert type(loaded_image) is PIL.Image.Image - - size = [17, 11] - for image in [opened_image, loaded_image]: - kernel = _get_kernel(F.resize, type(image)) - - output = kernel(image, size=size) - - assert F.get_size(output) == size - - -class TestPermuteChannels: - _DEFAULT_PERMUTATION = [2, 0, 1] - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.permute_channels_image, make_image_tensor), - # FIXME - # check_kernel does not support PIL kernel, but it should - (F.permute_channels_image, make_image), - (F.permute_channels_video, make_video), - ], - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel(self, kernel, make_input, dtype, device): - check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) - - @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) - def test_functional(self, make_input): - check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.permute_channels_image, torch.Tensor), - (F._permute_channels_image_pil, PIL.Image.Image), - (F.permute_channels_image, tv_tensors.Image), - (F.permute_channels_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) - - def reference_image_correctness(self, image, permutation): - channel_images = image.split(1, dim=-3) - permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] - return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) - - @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) - @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) - def test_image_correctness(self, permutation, batch_dims): - image = make_image(batch_dims=batch_dims) - - actual = F.permute_channels(image, permutation=permutation) - expected = self.reference_image_correctness(image, permutation=permutation) - - torch.testing.assert_close(actual, expected) - - -class TestElastic: - def _make_displacement(self, inpt): - return torch.rand( - 1, - *F.get_size(inpt), - 2, - dtype=torch.float32, - device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", - ) - - @param_value_parametrization( - interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], - fill=EXHAUSTIVE_TYPE_FILLS, - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, param, value, dtype, device): - image = make_image_tensor(dtype=dtype, device=device) - - check_kernel( - F.elastic_image, - image, - displacement=self._make_displacement(image), - **{param: value}, - check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_boxes(self, format, dtype, device): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - - check_kernel( - F.elastic_bounding_boxes, - bounding_boxes, - format=bounding_boxes.format, - canvas_size=bounding_boxes.canvas_size, - displacement=self._make_displacement(bounding_boxes), - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - mask = make_mask() - check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) - - def test_kernel_video(self): - video = make_video() - check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - input = make_input() - check_functional(F.elastic, input, displacement=self._make_displacement(input)) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.elastic_image, torch.Tensor), - (F._elastic_image_pil, PIL.Image.Image), - (F.elastic_image, tv_tensors.Image), - (F.elastic_bounding_boxes, tv_tensors.BoundingBoxes), - (F.elastic_mask, tv_tensors.Mask), - (F.elastic_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_displacement_error(self, make_input): - input = make_input() - - with 
pytest.raises(TypeError, match="displacement should be a Tensor"): - F.elastic(input, displacement=None) - - with pytest.raises(ValueError, match="displacement shape should be"): - F.elastic(input, displacement=torch.rand(F.get_size(input))) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image - @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, size, device): - check_transform( - transforms.ElasticTransform(), - make_input(size, device=device), - # We updated gaussian blur kernel generation with a faster and numerically more stable version - check_v1_compatibility=dict(rtol=0, atol=1), - ) - - -class TestToPureTensor: - def test_correctness(self): - input = { - "img": make_image(), - "img_tensor": make_image_tensor(), - "img_pil": make_image_pil(), - "mask": make_detection_mask(), - "video": make_video(), - "bbox": make_bounding_boxes(), - "str": "str", - } - - out = transforms.ToPureTensor()(input) - - for input_value, out_value in zip(input.values(), out.values()): - if isinstance(input_value, tv_tensors.TVTensor): - assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) - else: - assert isinstance(out_value, type(input_value)) - - -class TestCrop: - INPUT_SIZE = (21, 11) - - CORRECTNESS_CROP_KWARGS = [ - # center - dict(top=5, left=5, height=10, width=5), - # larger than input, i.e. pad - dict(top=-5, left=-5, height=30, width=20), - # sides: left, right, top, bottom - dict(top=-5, left=-5, height=30, width=10), - dict(top=-5, left=5, height=30, width=10), - dict(top=-5, left=-5, height=20, width=20), - dict(top=5, left=-5, height=20, width=20), - # corners: top-left, top-right, bottom-left, bottom-right - dict(top=-5, left=-5, height=20, width=10), - dict(top=-5, left=5, height=20, width=10), - dict(top=5, left=-5, height=20, width=10), - dict(top=5, left=5, height=20, width=10), - ] - MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] - - @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, kwargs, dtype, device): - check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) - - @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_bounding_box(self, kwargs, format, dtype, device): - bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) - check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) - - def test_kernel_video(self): - check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, 
make_video], - ) - def test_functional(self, make_input): - check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.crop_image, torch.Tensor), - (F._crop_image_pil, PIL.Image.Image), - (F.crop_image, tv_tensors.Image), - (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), - (F.crop_mask, tv_tensors.Mask), - (F.crop_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) - def test_functional_image_correctness(self, kwargs): - image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") - - actual = F.crop(image, **kwargs) - expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) - - assert_equal(actual, expected) - - @param_value_parametrization( - size=[(10, 5), (25, 15), (25, 5), (10, 15)], - fill=EXHAUSTIVE_TYPE_FILLS, - ) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_transform(self, param, value, make_input): - input = make_input(self.INPUT_SIZE) - - if param == "fill": - if isinstance(input, tv_tensors.Mask) and isinstance(value, (tuple, list)): - pytest.skip("F.pad_mask doesn't support non-scalar fill.") - - kwargs = dict( - # 1. size is required - # 2. the fill parameter only has an affect if we need padding - size=[s + 4 for s in self.INPUT_SIZE], - fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), - ) - else: - kwargs = {param: value} - - check_transform( - transforms.RandomCrop(**kwargs, pad_if_needed=True), - input, - check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), - ) - - @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) - def test_transform_padding(self, padding): - inpt = make_image(self.INPUT_SIZE) - - output_size = [s + 2 for s in F.get_size(inpt)] - transform = transforms.RandomCrop(output_size, padding=padding) - - output = transform(inpt) - - assert F.get_size(output) == output_size - - @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) - def test_transform_insufficient_padding(self, padding): - inpt = make_image(self.INPUT_SIZE) - - output_size = [s + 3 for s in F.get_size(inpt)] - transform = transforms.RandomCrop(output_size, padding=padding) - - with pytest.raises(ValueError, match="larger than (padded )?input image size"): - transform(inpt) - - def test_transform_pad_if_needed(self): - inpt = make_image(self.INPUT_SIZE) - - output_size = [s * 2 for s in F.get_size(inpt)] - transform = transforms.RandomCrop(output_size, pad_if_needed=True) - - output = transform(inpt) - - assert F.get_size(output) == output_size - - @param_value_parametrization( - size=[(10, 5), (25, 15), (25, 5), (10, 15)], - fill=CORRECTNESS_FILLS, - padding_mode=["constant", "edge", "reflect", "symmetric"], - ) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_image_correctness(self, param, value, seed): - kwargs = {param: value} - if param != "size": - # 1. size is required - # 2. 
the fill / padding_mode parameters only have an affect if we need padding - kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] - if param == "fill": - kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) - - transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) - - image = make_image(self.INPUT_SIZE) - - with freeze_rng_state(): - torch.manual_seed(seed) - actual = transform(image) - - torch.manual_seed(seed) - expected = F.to_image(transform(F.to_pil_image(image))) - - assert_equal(actual, expected) - - def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): - affine_matrix = np.array( - [ - [1, 0, -left], - [0, 1, -top], - ], - ) - return reference_affine_bounding_boxes_helper( - bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) - ) - - @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): - bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) - - actual = F.crop(bounding_boxes, **kwargs) - expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) - - assert_equal(actual, expected, atol=1, rtol=0) - assert_equal(F.get_size(actual), F.get_size(expected)) - - @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): - input_size = [s * 2 for s in output_size] - bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) - - transform = transforms.RandomCrop(output_size) - - with freeze_rng_state(): - torch.manual_seed(seed) - params = transform._get_params([bounding_boxes]) - assert not params.pop("needs_pad") - del params["padding"] - assert params.pop("needs_crop") - - torch.manual_seed(seed) - actual = transform(bounding_boxes) - - expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) - - assert_equal(actual, expected) - assert_equal(F.get_size(actual), F.get_size(expected)) - - def test_errors(self): - with pytest.raises(ValueError, match="Please provide only two dimensions"): - transforms.RandomCrop([10, 12, 14]) - - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.RandomCrop([10, 12], padding="abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomCrop([10, 12], padding=1, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") - - -class TestErase: - INPUT_SIZE = (17, 11) - FUNCTIONAL_KWARGS = dict( - zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) - ) - - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, dtype, device): 
- check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) - - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image_inplace(self, dtype, device): - input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) - input_version = input._version - - output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) - assert output_out_of_place.data_ptr() != input.data_ptr() - assert output_out_of_place is not input - - output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) - assert output_inplace.data_ptr() == input.data_ptr() - assert output_inplace._version > input_version - assert output_inplace is input - - assert_equal(output_inplace, output_out_of_place) - - def test_kernel_video(self): - check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - def test_functional(self, make_input): - check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.erase_image, torch.Tensor), - (F._erase_image_pil, PIL.Image.Image), - (F.erase_image, tv_tensors.Image), - (F.erase_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, device): - input = make_input(device=device) - check_transform( - transforms.RandomErasing(p=1), input, check_v1_compatibility=not isinstance(input, PIL.Image.Image) - ) - - def _reference_erase_image(self, image, *, i, j, h, w, v): - mask = torch.zeros_like(image, dtype=torch.bool) - mask[..., i : i + h, j : j + w] = True - - # The broadcasting and type casting logic is handled automagically in the kernel through indexing - value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) - - erased_image = torch.empty_like(image) - erased_image[mask] = value.flatten() - erased_image[~mask] = image[~mask] - - return erased_image - - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_functional_image_correctness(self, dtype, device): - image = make_image(dtype=dtype, device=device) - - actual = F.erase(image, **self.FUNCTIONAL_KWARGS) - expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) - - assert_equal(actual, expected) - - @param_value_parametrization( - scale=[(0.1, 0.2), [0.0, 1.0]], - ratio=[(0.3, 0.7), [0.1, 5.0]], - value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("seed", list(range(5))) - def test_transform_image_correctness(self, param, value, dtype, device, seed): - transform = transforms.RandomErasing(**{param: value}, p=1) - - image = make_image(dtype=dtype, device=device) - - with freeze_rng_state(): - torch.manual_seed(seed) - # This emulates the random apply check that happens before _get_params is called - torch.rand(1) - params = transform._get_params([image]) - - torch.manual_seed(seed) - actual = 
transform(image) - - expected = self._reference_erase_image(image, **params) - - assert_equal(actual, expected) - - def test_transform_errors(self): - with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): - transforms.RandomErasing(value={}) - - with pytest.raises(ValueError, match="If value is str, it should be 'random'"): - transforms.RandomErasing(value="abc") - - with pytest.raises(TypeError, match="Scale should be a sequence"): - transforms.RandomErasing(scale=123) - - with pytest.raises(TypeError, match="Ratio should be a sequence"): - transforms.RandomErasing(ratio=123) - - with pytest.raises(ValueError, match="Scale should be between 0 and 1"): - transforms.RandomErasing(scale=[-1, 2]) - - transform = transforms.RandomErasing(value=[1, 2, 3, 4]) - - with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params([make_image()]) - - @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask]) - def test_transform_passthrough(self, make_input): - transform = transforms.RandomErasing(p=1) - - input = make_input(self.INPUT_SIZE) - - with pytest.warns(UserWarning, match="currently passing through inputs of type"): - # RandomErasing requires an image or video to be present - _, output = transform(make_image(self.INPUT_SIZE), input) - - assert output is input - - -class TestGaussianBlur: - @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]]) - @pytest.mark.parametrize("sigma", [None, 1.0, 1, (0.5,), [0.3], (0.3, 0.7), [0.9, 0.2]]) - def test_kernel_image(self, kernel_size, sigma): - check_kernel( - F.gaussian_blur_image, - make_image(), - kernel_size=kernel_size, - sigma=sigma, - check_scripted_vs_eager=not (isinstance(kernel_size, int) or isinstance(sigma, (float, int))), - ) - - def test_kernel_image_errors(self): - image = make_image_tensor() - - with pytest.raises(ValueError, match="kernel_size is a sequence its length should be 2"): - F.gaussian_blur_image(image, kernel_size=[1, 2, 3]) - - for kernel_size in [2, -1]: - with pytest.raises(ValueError, match="kernel_size should have odd and positive integers"): - F.gaussian_blur_image(image, kernel_size=kernel_size) - - with pytest.raises(ValueError, match="sigma is a sequence, its length should be 2"): - F.gaussian_blur_image(image, kernel_size=1, sigma=[1, 2, 3]) - - with pytest.raises(TypeError, match="sigma should be either float or sequence of floats"): - F.gaussian_blur_image(image, kernel_size=1, sigma=object()) - - with pytest.raises(ValueError, match="sigma should have positive values"): - F.gaussian_blur_image(image, kernel_size=1, sigma=-1) - - def test_kernel_video(self): - check_kernel(F.gaussian_blur_video, make_video(), kernel_size=(3, 3)) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - def test_functional(self, make_input): - check_functional(F.gaussian_blur, make_input(), kernel_size=(3, 3)) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.gaussian_blur_image, torch.Tensor), - (F._gaussian_blur_image_pil, PIL.Image.Image), - (F.gaussian_blur_image, tv_tensors.Image), - (F.gaussian_blur_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.gaussian_blur, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, 
make_segmentation_mask, make_video], - ) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("sigma", [5, 2.0, (0.5, 2), [1.3, 2.7]]) - def test_transform(self, make_input, device, sigma): - check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) - - def test_assertions(self): - with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): - transforms.GaussianBlur([10, 12, 14]) - - with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): - transforms.GaussianBlur(4) - - with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. Got 3"): - transforms.GaussianBlur(3, sigma=[1, 2, 3]) - - with pytest.raises(ValueError, match="sigma values should be positive and of the form"): - transforms.GaussianBlur(3, sigma=-1.0) - - with pytest.raises(ValueError, match="sigma values should be positive and of the form"): - transforms.GaussianBlur(3, sigma=[2.0, 1.0]) - - with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): - transforms.GaussianBlur(3, sigma={}) - - @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) - def test__get_params(self, sigma): - transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params([]) - - if isinstance(sigma, float): - assert params["sigma"][0] == params["sigma"][1] == sigma - elif isinstance(sigma, list) and len(sigma) == 1: - assert params["sigma"][0] == params["sigma"][1] == sigma[0] - else: - assert sigma[0] <= params["sigma"][0] <= sigma[1] - assert sigma[0] <= params["sigma"][1] <= sigma[1] - - # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) - # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) - # { - # "10_12_3__3_3_0.8": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8), - # "10_12_3__3_3_0.5": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5), - # "10_12_3__3_5_0.8": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8), - # "10_12_3__3_5_0.5": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5), - # "26_28_1__23_23_1.7": cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7), - # } - REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS = torch.load( - Path(__file__).parent / "assets" / "gaussian_blur_opencv_results.pt" - ) - - @pytest.mark.parametrize( - ("dimensions", "kernel_size", "sigma"), - [ - ((3, 10, 12), (3, 3), 0.8), - ((3, 10, 12), (3, 3), 0.5), - ((3, 10, 12), (3, 5), 0.8), - ((3, 10, 12), (3, 5), 0.5), - ((1, 26, 28), (23, 23), 1.7), - ], - ) - @pytest.mark.parametrize("dtype", [torch.float32, torch.float64, torch.float16]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_functional_image_correctness(self, dimensions, kernel_size, sigma, dtype, device): - if dtype is torch.float16 and device == "cpu": - pytest.skip("The CPU implementation of float16 on CPU differs from opencv") - - num_channels, height, width = dimensions - - reference_results_key = f"{height}_{width}_{num_channels}__{kernel_size[0]}_{kernel_size[1]}_{sigma}" - expected = ( - torch.tensor(self.REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS[reference_results_key]) - .reshape(height, width, num_channels) - .permute(2, 0, 1) - .to(dtype=dtype, device=device) - ) - - image = tv_tensors.Image( - torch.arange(num_channels * height * width, dtype=torch.uint8) - .reshape(height, width, num_channels) - .permute(2, 0, 1), - dtype=dtype, - device=device, - ) - - actual = F.gaussian_blur_image(image, kernel_size=kernel_size, 
sigma=sigma) - - torch.testing.assert_close(actual, expected, rtol=0, atol=1) - - -class TestAutoAugmentTransforms: - # These transforms have a lot of branches in their `forward()` passes which are conditioned on random sampling. - # It's typically very hard to test the effect on some parameters without heavy mocking logic. - # This class adds correctness tests for the kernels that are specific to those transforms. The rest of kernels, e.g. - # rotate, are tested in their respective classes. The rest of the tests here are mostly smoke tests. - - def _reference_shear_translate(self, image, *, transform_id, magnitude, interpolation, fill): - if isinstance(image, PIL.Image.Image): - input = image - else: - input = F.to_pil_image(image) - - matrix = { - "ShearX": (1, magnitude, 0, 0, 1, 0), - "ShearY": (1, 0, 0, magnitude, 1, 0), - "TranslateX": (1, 0, -int(magnitude), 0, 1, 0), - "TranslateY": (1, 0, 0, 0, 1, -int(magnitude)), - }[transform_id] - - output = input.transform( - input.size, PIL.Image.AFFINE, matrix, resample=pil_modes_mapping[interpolation], fill=fill - ) - - if isinstance(image, PIL.Image.Image): - return output - else: - return F.to_image(output) - - @pytest.mark.parametrize("transform_id", ["ShearX", "ShearY", "TranslateX", "TranslateY"]) - @pytest.mark.parametrize("magnitude", [0.3, -0.2, 0.0]) - @pytest.mark.parametrize( - "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] - ) - @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) - @pytest.mark.parametrize("input_type", ["Tensor", "PIL"]) - def test_correctness_shear_translate(self, transform_id, magnitude, interpolation, fill, input_type): - # ShearX/Y and TranslateX/Y are the only ops that are native to the AA transforms. They are modeled after the - # reference implementation: - # https://github.com/tensorflow/models/blob/885fda091c46c59d6c7bb5c7e760935eacc229da/research/autoaugment/augmentation_transforms.py#L273-L362 - # All other ops are checked in their respective dedicated tests. 
- - image = make_image(dtype=torch.uint8, device="cpu") - if input_type == "PIL": - image = F.to_pil_image(image) - - if "Translate" in transform_id: - # For TranslateX/Y magnitude is a value in pixels - magnitude *= min(F.get_size(image)) - - actual = transforms.AutoAugment()._apply_image_or_video_transform( - image, - transform_id=transform_id, - magnitude=magnitude, - interpolation=interpolation, - fill={type(image): fill}, - ) - expected = self._reference_shear_translate( - image, transform_id=transform_id, magnitude=magnitude, interpolation=interpolation, fill=fill - ) - - if input_type == "PIL": - actual, expected = F.to_image(actual), F.to_image(expected) - - if "Shear" in transform_id and input_type == "Tensor": - mae = (actual.float() - expected.float()).abs().mean() - assert mae < (12 if interpolation is transforms.InterpolationMode.NEAREST else 5) - else: - assert_close(actual, expected, rtol=0, atol=1) - - @pytest.mark.parametrize( - "transform", - [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()], - ) - @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform_smoke(self, transform, make_input, dtype, device): - if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): - pytest.skip( - "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " - "will degenerate to that anyway." - ) - input = make_input(dtype=dtype, device=device) - - with freeze_rng_state(): - # By default every test starts from the same random seed. This leads to minimal coverage of the sampling - # that happens inside forward(). To avoid calling the transform multiple times to achieve higher coverage, - # we build a reproducible random seed from the input type, dtype, and device. - torch.manual_seed(hash((make_input, dtype, device))) - - # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1 - # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks - # here and only check if we can script the v2 transform and subsequently call the result. 
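(A rough, self-contained sketch of the seeding idea described above; the transform, input, and params tuple are hypothetical stand-ins rather than the actual test fixtures.)

import torch

def call_with_param_dependent_seed(transform, inp, params):
    # Save the global RNG state, seed it from the parametrization so that every
    # test case walks different random branches of forward(), then restore it.
    rng_state = torch.random.get_rng_state()
    try:
        torch.manual_seed(hash(params) % (2**63))
        return transform(inp)
    finally:
        torch.random.set_rng_state(rng_state)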
- check_transform(transform, input, check_v1_compatibility=False) - - if type(input) is torch.Tensor and dtype is torch.uint8: - _script(transform)(input) - - def test_auto_augment_policy_error(self): - with pytest.raises(ValueError, match="provided policy"): - transforms.AutoAugment(policy=None) - - @pytest.mark.parametrize("severity", [0, 11]) - def test_aug_mix_severity_error(self, severity): - with pytest.raises(ValueError, match="severity must be between"): - transforms.AugMix(severity=severity) - - -class TestConvertBoundingBoxFormat: - old_new_formats = list(itertools.permutations(iter(tv_tensors.BoundingBoxFormat), 2)) - - @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) - def test_kernel(self, old_format, new_format): - check_kernel( - F.convert_bounding_box_format, - make_bounding_boxes(format=old_format), - new_format=new_format, - old_format=old_format, - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("inplace", [False, True]) - def test_kernel_noop(self, format, inplace): - input = make_bounding_boxes(format=format).as_subclass(torch.Tensor) - input_version = input._version - - output = F.convert_bounding_box_format(input, old_format=format, new_format=format, inplace=inplace) - - assert output is input - assert output.data_ptr() == input.data_ptr() - assert output._version == input_version - - @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) - def test_kernel_inplace(self, old_format, new_format): - input = make_bounding_boxes(format=old_format).as_subclass(torch.Tensor) - input_version = input._version - - output_out_of_place = F.convert_bounding_box_format(input, old_format=old_format, new_format=new_format) - assert output_out_of_place.data_ptr() != input.data_ptr() - assert output_out_of_place is not input - - output_inplace = F.convert_bounding_box_format( - input, old_format=old_format, new_format=new_format, inplace=True - ) - assert output_inplace.data_ptr() == input.data_ptr() - assert output_inplace._version > input_version - assert output_inplace is input - - assert_equal(output_inplace, output_out_of_place) - - @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) - def test_functional(self, old_format, new_format): - check_functional(F.convert_bounding_box_format, make_bounding_boxes(format=old_format), new_format=new_format) - - @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) - @pytest.mark.parametrize("format_type", ["enum", "str"]) - def test_transform(self, old_format, new_format, format_type): - check_transform( - transforms.ConvertBoundingBoxFormat(new_format.name if format_type == "str" else new_format), - make_bounding_boxes(format=old_format), - ) - - def _reference_convert_bounding_box_format(self, bounding_boxes, new_format): - return tv_tensors.wrap( - torchvision.ops.box_convert( - bounding_boxes.as_subclass(torch.Tensor), - in_fmt=bounding_boxes.format.name.lower(), - out_fmt=new_format.name.lower(), - ).to(bounding_boxes.dtype), - like=bounding_boxes, - format=new_format, - ) - - @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) - @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("fn_type", ["functional", "transform"]) - def test_correctness(self, old_format, new_format, dtype, device, fn_type): - bounding_boxes = make_bounding_boxes(format=old_format, dtype=dtype, device=device) - - if 
fn_type == "functional": - fn = functools.partial(F.convert_bounding_box_format, new_format=new_format) - else: - fn = transforms.ConvertBoundingBoxFormat(format=new_format) - - actual = fn(bounding_boxes) - expected = self._reference_convert_bounding_box_format(bounding_boxes, new_format) - - assert_equal(actual, expected) - - def test_errors(self): - input_tv_tensor = make_bounding_boxes() - input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) - - for input in [input_tv_tensor, input_pure_tensor]: - with pytest.raises(TypeError, match="missing 1 required argument: 'new_format'"): - F.convert_bounding_box_format(input) - - with pytest.raises(ValueError, match="`old_format` has to be passed"): - F.convert_bounding_box_format(input_pure_tensor, new_format=input_tv_tensor.format) - - with pytest.raises(ValueError, match="`old_format` must not be passed"): - F.convert_bounding_box_format( - input_tv_tensor, old_format=input_tv_tensor.format, new_format=input_tv_tensor.format - ) - - -class TestResizedCrop: - INPUT_SIZE = (17, 11) - CROP_KWARGS = dict(top=2, left=2, height=5, width=7) - OUTPUT_SIZE = (19, 32) - - @pytest.mark.parametrize( - ("kernel", "make_input"), - [ - (F.resized_crop_image, make_image), - (F.resized_crop_bounding_boxes, make_bounding_boxes), - (F.resized_crop_mask, make_segmentation_mask), - (F.resized_crop_mask, make_detection_mask), - (F.resized_crop_video, make_video), - ], - ) - def test_kernel(self, kernel, make_input): - input = make_input(self.INPUT_SIZE) - if isinstance(input, tv_tensors.BoundingBoxes): - extra_kwargs = dict(format=input.format) - elif isinstance(input, tv_tensors.Mask): - extra_kwargs = dict() - else: - extra_kwargs = dict(antialias=True) - - check_kernel(kernel, input, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, **extra_kwargs) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional( - F.resized_crop, make_input(self.INPUT_SIZE), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, antialias=True - ) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.resized_crop_image, torch.Tensor), - (F._resized_crop_image_pil, PIL.Image.Image), - (F.resized_crop_image, tv_tensors.Image), - (F.resized_crop_bounding_boxes, tv_tensors.BoundingBoxes), - (F.resized_crop_mask, tv_tensors.Mask), - (F.resized_crop_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type) - - @param_value_parametrization( - scale=[(0.1, 0.2), [0.0, 1.0]], - ratio=[(0.3, 0.7), [0.1, 5.0]], - ) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_transform(self, param, value, make_input): - check_transform( - transforms.RandomResizedCrop(size=self.OUTPUT_SIZE, **{param: value}, antialias=True), - make_input(self.INPUT_SIZE), - check_v1_compatibility=dict(rtol=0, atol=1), - ) - - # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. 
- # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` - @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) - def test_functional_image_correctness(self, interpolation): - image = make_image(self.INPUT_SIZE, dtype=torch.uint8) - - actual = F.resized_crop( - image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True - ) - expected = F.to_image( - F.resized_crop( - F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation - ) - ) - - torch.testing.assert_close(actual, expected, atol=1, rtol=0) - - def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size): - new_height, new_width = size - - crop_affine_matrix = np.array( - [ - [1, 0, -left], - [0, 1, -top], - [0, 0, 1], - ], - ) - resize_affine_matrix = np.array( - [ - [new_width / width, 0, 0], - [0, new_height / height, 0], - [0, 0, 1], - ], - ) - affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] - - return reference_affine_bounding_boxes_helper( - bounding_boxes, - affine_matrix=affine_matrix, - new_canvas_size=size, - ) - - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - def test_functional_bounding_boxes_correctness(self, format): - bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) - - actual = F.resized_crop(bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) - expected = self._reference_resized_crop_bounding_boxes( - bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE - ) - - assert_equal(actual, expected) - assert_equal(F.get_size(actual), F.get_size(expected)) - - def test_transform_errors_warnings(self): - with pytest.raises(ValueError, match="provide only two dimensions"): - transforms.RandomResizedCrop(size=(1, 2, 3)) - - with pytest.raises(TypeError, match="Scale should be a sequence"): - transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=123) - - with pytest.raises(TypeError, match="Ratio should be a sequence"): - transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=123) - - for param in ["scale", "ratio"]: - with pytest.warns(match="Scale and ratio should be of kind"): - transforms.RandomResizedCrop(size=self.INPUT_SIZE, **{param: [1, 0]}) - - -class TestPad: - EXHAUSTIVE_TYPE_PADDINGS = [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] - CORRECTNESS_PADDINGS = [ - padding - for padding in EXHAUSTIVE_TYPE_PADDINGS - if isinstance(padding, int) or isinstance(padding, list) and len(padding) > 1 - ] - PADDING_MODES = ["constant", "symmetric", "edge", "reflect"] - - @param_value_parametrization( - padding=EXHAUSTIVE_TYPE_PADDINGS, - fill=EXHAUSTIVE_TYPE_FILLS, - padding_mode=PADDING_MODES, - ) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, param, value, dtype, device): - if param == "fill": - value = adapt_fill(value, dtype=dtype) - kwargs = {param: value} - if param != "padding": - kwargs["padding"] = [1] - - image = make_image(dtype=dtype, device=device) - - check_kernel( - F.pad_image, - image, - **kwargs, - check_scripted_vs_eager=not ( - (param == "padding" and isinstance(value, int)) - # See https://github.com/pytorch/vision/pull/7252#issue-1585585521 for details - or ( - param == "fill" - and ( - isinstance(value, tuple) or (isinstance(value, list) and any(isinstance(v, int) for v in value)) - ) - ) - ), - ) - - 
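(For reference, the 1-, 2- and 4-element padding forms parametrized above expand as sketched below; the tensor size and padding values are arbitrary, and the expected output shapes assume torchvision's documented left/top/right/bottom padding convention.)

import torch
from torchvision.transforms.v2 import functional as F

image = torch.zeros(3, 10, 12, dtype=torch.uint8)  # (C, H, W)

# [1]          -> 1 px on every side                  -> (3, 12, 14)
# [1, 2]       -> 1 px left/right, 2 px top/bottom    -> (3, 14, 14)
# [1, 2, 3, 4] -> left=1, top=2, right=3, bottom=4    -> (3, 16, 16)
for padding in ([1], [1, 2], [1, 2, 3, 4]):
    print(padding, tuple(F.pad(image, padding=padding).shape))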
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - def test_kernel_bounding_boxes(self, format): - bounding_boxes = make_bounding_boxes(format=format) - check_kernel( - F.pad_bounding_boxes, - bounding_boxes, - format=bounding_boxes.format, - canvas_size=bounding_boxes.canvas_size, - padding=[1], - ) - - @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) - def test_kernel_bounding_boxes_errors(self, padding_mode): - bounding_boxes = make_bounding_boxes() - with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): - F.pad_bounding_boxes( - bounding_boxes, - format=bounding_boxes.format, - canvas_size=bounding_boxes.canvas_size, - padding=[1], - padding_mode=padding_mode, - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.pad_mask, make_mask(), padding=[1]) - - @pytest.mark.parametrize("fill", [[1], (0,), [1, 0, 1], (0, 1, 0)]) - def test_kernel_mask_errors(self, fill): - with pytest.raises(ValueError, match="Non-scalar fill value is not supported"): - check_kernel(F.pad_mask, make_segmentation_mask(), padding=[1], fill=fill) - - def test_kernel_video(self): - check_kernel(F.pad_video, make_video(), padding=[1]) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.pad, make_input(), padding=[1]) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.pad_image, torch.Tensor), - # The PIL kernel uses fill=0 as default rather than fill=None as all others. - # Since the whole fill story is already really inconsistent, we won't introduce yet another case to allow - # for this test to pass. - # See https://github.com/pytorch/vision/issues/6623 for a discussion. 
- # (F._pad_image_pil, PIL.Image.Image), - (F.pad_image, tv_tensors.Image), - (F.pad_bounding_boxes, tv_tensors.BoundingBoxes), - (F.pad_mask, tv_tensors.Mask), - (F.pad_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.pad, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_transform(self, make_input): - check_transform(transforms.Pad(padding=[1]), make_input()) - - def test_transform_errors(self): - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.Pad("abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.Pad([-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.Pad(12, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.Pad(12, padding_mode="abc") - - @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) - @pytest.mark.parametrize( - ("padding_mode", "fill"), - [ - *[("constant", fill) for fill in CORRECTNESS_FILLS], - *[(padding_mode, None) for padding_mode in ["symmetric", "edge", "reflect"]], - ], - ) - @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) - def test_image_correctness(self, padding, padding_mode, fill, fn): - image = make_image(dtype=torch.uint8, device="cpu") - - fill = adapt_fill(fill, dtype=torch.uint8) - - actual = fn(image, padding=padding, padding_mode=padding_mode, fill=fill) - expected = F.to_image(F.pad(F.to_pil_image(image), padding=padding, padding_mode=padding_mode, fill=fill)) - - assert_equal(actual, expected) - - def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding): - if isinstance(padding, int): - padding = [padding] - left, top, right, bottom = padding * (4 // len(padding)) - - affine_matrix = np.array( - [ - [1, 0, left], - [0, 1, top], - ], - ) - - height = bounding_boxes.canvas_size[0] + top + bottom - width = bounding_boxes.canvas_size[1] + left + right - - return reference_affine_bounding_boxes_helper( - bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) - ) - - @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) - def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - - actual = fn(bounding_boxes, padding=padding) - expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding) - - assert_equal(actual, expected) - - -class TestCenterCrop: - INPUT_SIZE = (17, 11) - OUTPUT_SIZES = [(3, 5), (5, 3), (4, 4), (21, 9), (13, 15), (19, 14), 3, (4,), [5], INPUT_SIZE] - - @pytest.mark.parametrize("output_size", OUTPUT_SIZES) - @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, output_size, dtype, device): - check_kernel( - F.center_crop_image, - make_image(self.INPUT_SIZE, dtype=dtype, device=device), - output_size=output_size, - check_scripted_vs_eager=not 
isinstance(output_size, int), - ) - - @pytest.mark.parametrize("output_size", OUTPUT_SIZES) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - def test_kernel_bounding_boxes(self, output_size, format): - bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) - check_kernel( - F.center_crop_bounding_boxes, - bounding_boxes, - format=bounding_boxes.format, - canvas_size=bounding_boxes.canvas_size, - output_size=output_size, - check_scripted_vs_eager=not isinstance(output_size, int), - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.center_crop_mask, make_mask(), output_size=self.OUTPUT_SIZES[0]) - - def test_kernel_video(self): - check_kernel(F.center_crop_video, make_video(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.center_crop, make_input(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.center_crop_image, torch.Tensor), - (F._center_crop_image_pil, PIL.Image.Image), - (F.center_crop_image, tv_tensors.Image), - (F.center_crop_bounding_boxes, tv_tensors.BoundingBoxes), - (F.center_crop_mask, tv_tensors.Mask), - (F.center_crop_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_transform(self, make_input): - check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE)) - - @pytest.mark.parametrize("output_size", OUTPUT_SIZES) - @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) - def test_image_correctness(self, output_size, fn): - image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") - - actual = fn(image, output_size) - expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size)) - - assert_equal(actual, expected) - - def _reference_center_crop_bounding_boxes(self, bounding_boxes, output_size): - image_height, image_width = bounding_boxes.canvas_size - if isinstance(output_size, int): - output_size = (output_size, output_size) - elif len(output_size) == 1: - output_size *= 2 - crop_height, crop_width = output_size - - top = int(round((image_height - crop_height) / 2)) - left = int(round((image_width - crop_width) / 2)) - - affine_matrix = np.array( - [ - [1, 0, -left], - [0, 1, -top], - ], - ) - return reference_affine_bounding_boxes_helper( - bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size - ) - - @pytest.mark.parametrize("output_size", OUTPUT_SIZES) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) - def test_bounding_boxes_correctness(self, output_size, format, dtype, device, fn): - bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) - - 
actual = fn(bounding_boxes, output_size) - expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size) - - assert_equal(actual, expected) - - -class TestPerspective: - COEFFICIENTS = [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ] - START_END_POINTS = [ - ([[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]), - ([[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]), - ([[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]), - ] - MINIMAL_KWARGS = dict(startpoints=None, endpoints=None, coefficients=COEFFICIENTS[0]) - - @param_value_parametrization( - coefficients=COEFFICIENTS, - start_end_points=START_END_POINTS, - fill=EXHAUSTIVE_TYPE_FILLS, - ) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_kernel_image(self, param, value, dtype, device): - if param == "start_end_points": - kwargs = dict(zip(["startpoints", "endpoints"], value)) - else: - kwargs = {"startpoints": None, "endpoints": None, param: value} - if param == "fill": - kwargs["coefficients"] = self.COEFFICIENTS[0] - - check_kernel( - F.perspective_image, - make_image(dtype=dtype, device=device), - **kwargs, - check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), - ) - - def test_kernel_image_error(self): - image = make_image_tensor() - - with pytest.raises(ValueError, match="startpoints/endpoints or the coefficients must have non `None` values"): - F.perspective_image(image, startpoints=None, endpoints=None) - - with pytest.raises( - ValueError, match="startpoints/endpoints and the coefficients shouldn't be defined concurrently" - ): - startpoints, endpoints = self.START_END_POINTS[0] - coefficients = self.COEFFICIENTS[0] - F.perspective_image(image, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients) - - with pytest.raises(ValueError, match="coefficients should have 8 float values"): - F.perspective_image(image, startpoints=None, endpoints=None, coefficients=list(range(7))) - - @param_value_parametrization( - coefficients=COEFFICIENTS, - start_end_points=START_END_POINTS, - ) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - def test_kernel_bounding_boxes(self, param, value, format): - if param == "start_end_points": - kwargs = dict(zip(["startpoints", "endpoints"], value)) - else: - kwargs = {"startpoints": None, "endpoints": None, param: value} - - bounding_boxes = make_bounding_boxes(format=format) - - check_kernel( - F.perspective_bounding_boxes, - bounding_boxes, - format=bounding_boxes.format, - canvas_size=bounding_boxes.canvas_size, - **kwargs, - ) - - def test_kernel_bounding_boxes_error(self): - bounding_boxes = make_bounding_boxes() - format, canvas_size = bounding_boxes.format, bounding_boxes.canvas_size - bounding_boxes = bounding_boxes.as_subclass(torch.Tensor) - - with pytest.raises(RuntimeError, match="Denominator is zero"): - F.perspective_bounding_boxes( - bounding_boxes, - format=format, - canvas_size=canvas_size, - startpoints=None, - endpoints=None, - coefficients=[0.0] * 8, - ) - - @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) - def test_kernel_mask(self, make_mask): - check_kernel(F.perspective_mask, make_mask(), **self.MINIMAL_KWARGS) - - def test_kernel_video(self): - check_kernel(F.perspective_video, make_video(), 
**self.MINIMAL_KWARGS) - - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_functional(self, make_input): - check_functional(F.perspective, make_input(), **self.MINIMAL_KWARGS) - - @pytest.mark.parametrize( - ("kernel", "input_type"), - [ - (F.perspective_image, torch.Tensor), - (F._perspective_image_pil, PIL.Image.Image), - (F.perspective_image, tv_tensors.Image), - (F.perspective_bounding_boxes, tv_tensors.BoundingBoxes), - (F.perspective_mask, tv_tensors.Mask), - (F.perspective_video, tv_tensors.Video), - ], - ) - def test_functional_signature(self, kernel, input_type): - check_functional_kernel_signature_match(F.perspective, kernel=kernel, input_type=input_type) - - @pytest.mark.parametrize("distortion_scale", [0.5, 0.0, 1.0]) - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], - ) - def test_transform(self, distortion_scale, make_input): - check_transform(transforms.RandomPerspective(distortion_scale=distortion_scale, p=1), make_input()) - - @pytest.mark.parametrize("distortion_scale", [-1, 2]) - def test_transform_error(self, distortion_scale): - with pytest.raises(ValueError, match="distortion_scale value should be between 0 and 1"): - transforms.RandomPerspective(distortion_scale=distortion_scale) - - @pytest.mark.parametrize("coefficients", COEFFICIENTS) - @pytest.mark.parametrize( - "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] - ) - @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) - def test_image_functional_correctness(self, coefficients, interpolation, fill): - image = make_image(dtype=torch.uint8, device="cpu") - - actual = F.perspective( - image, startpoints=None, endpoints=None, coefficients=coefficients, interpolation=interpolation, fill=fill - ) - expected = F.to_image( - F.perspective( - F.to_pil_image(image), - startpoints=None, - endpoints=None, - coefficients=coefficients, - interpolation=interpolation, - fill=fill, - ) - ) - - if interpolation is transforms.InterpolationMode.BILINEAR: - abs_diff = (actual.float() - expected.float()).abs() - assert (abs_diff > 1).float().mean() < 7e-2 - mae = abs_diff.mean() - assert mae < 3 - else: - assert_equal(actual, expected) - - def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints): - format = bounding_boxes.format - canvas_size = bounding_boxes.canvas_size - dtype = bounding_boxes.dtype - device = bounding_boxes.device - - coefficients = _get_perspective_coeffs(endpoints, startpoints) - - def perspective_bounding_boxes(bounding_boxes): - m1 = np.array( - [ - [coefficients[0], coefficients[1], coefficients[2]], - [coefficients[3], coefficients[4], coefficients[5]], - ] - ) - m2 = np.array( - [ - [coefficients[6], coefficients[7], 1.0], - [coefficients[6], coefficients[7], 1.0], - ] - ) - - # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 - input_xyxy = F.convert_bounding_box_format( - bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), - old_format=format, - new_format=tv_tensors.BoundingBoxFormat.XYXY, - inplace=True, - ) - x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() - - points = np.array( - [ - [x1, y1, 1.0], - [x2, y1, 1.0], - [x1, y2, 1.0], - [x2, y2, 1.0], - ] - ) - - numerator = points @ m1.T - denominator = points @ m2.T - transformed_points = numerator / 
denominator - - output_xyxy = torch.Tensor( - [ - float(np.min(transformed_points[:, 0])), - float(np.min(transformed_points[:, 1])), - float(np.max(transformed_points[:, 0])), - float(np.max(transformed_points[:, 1])), - ] - ) - - output = F.convert_bounding_box_format( - output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format - ) - - # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - return F.clamp_bounding_boxes( - output, - format=format, - canvas_size=canvas_size, - ).to(dtype=dtype, device=device) - - return tv_tensors.BoundingBoxes( - torch.cat([perspective_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( - bounding_boxes.shape - ), - format=format, - canvas_size=canvas_size, - ) - - @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS) - @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) - @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device): - bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) - - actual = F.perspective(bounding_boxes, startpoints=startpoints, endpoints=endpoints) - expected = self._reference_perspective_bounding_boxes( - bounding_boxes, startpoints=startpoints, endpoints=endpoints - ) - - assert_close(actual, expected, rtol=0, atol=1) - - -class TestColorJitter: - @pytest.mark.parametrize( - "make_input", - [make_image_tensor, make_image_pil, make_image, make_video], - ) - @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) - @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, make_input, dtype, device): - if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): - pytest.skip( - "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " - "will degenerate to that anyway." 
- ) - - check_transform( - transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25), - make_input(dtype=dtype, device=device), - ) - - def test_transform_noop(self): - input = make_image() - input_version = input._version - - transform = transforms.ColorJitter() - output = transform(input) - - assert output is input - assert output.data_ptr() == input.data_ptr() - assert output._version == input_version - - def test_transform_error(self): - with pytest.raises(ValueError, match="must be non negative"): - transforms.ColorJitter(brightness=-1) - - for brightness in [object(), [1, 2, 3]]: - with pytest.raises(TypeError, match="single number or a sequence with length 2"): - transforms.ColorJitter(brightness=brightness) - - with pytest.raises(ValueError, match="values should be between"): - transforms.ColorJitter(brightness=(-1, 0.5)) - - with pytest.raises(ValueError, match="values should be between"): - transforms.ColorJitter(hue=1) - - @pytest.mark.parametrize("brightness", [None, 0.1, (0.2, 0.3)]) - @pytest.mark.parametrize("contrast", [None, 0.4, (0.5, 0.6)]) - @pytest.mark.parametrize("saturation", [None, 0.7, (0.8, 0.9)]) - @pytest.mark.parametrize("hue", [None, 0.3, (-0.1, 0.2)]) - def test_transform_correctness(self, brightness, contrast, saturation, hue): - image = make_image(dtype=torch.uint8, device="cpu") - - transform = transforms.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) - - with freeze_rng_state(): - torch.manual_seed(0) - actual = transform(image) - - torch.manual_seed(0) - expected = F.to_image(transform(F.to_pil_image(image))) - - mae = (actual.float() - expected.float()).abs().mean() - assert mae < 2 diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 246e31485f3..53222c6a2c8 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,7 +4,7 @@ import torch import torchvision.transforms.v2._utils -from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_masks, make_image from torchvision import tv_tensors from torchvision.transforms.v2._utils import has_all, has_any @@ -13,7 +13,7 @@ IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=tv_tensors.BoundingBoxFormat.XYXY) -MASK = make_detection_mask(DEFAULT_SIZE) +MASK = make_detection_masks(DEFAULT_SIZE) @pytest.mark.parametrize( diff --git a/test/test_utils.py b/test/test_utils.py index b13bd0f0f5b..2999e84bf8a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -105,7 +105,7 @@ def test_draw_boxes(): res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy()) res.save(path) - if PILLOW_VERSION >= (8, 2): + if PILLOW_VERSION >= (10, 1): # The reference image is only valid for new PIL versions expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) assert_equal(result, expected) diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py deleted file mode 100644 index a49b75afdb4..00000000000 --- a/test/transforms_v2_dispatcher_infos.py +++ /dev/null @@ -1,256 +0,0 @@ -import pytest -import torchvision.transforms.v2.functional as F -from torchvision import tv_tensors -from transforms_v2_kernel_infos import KERNEL_INFOS -from transforms_v2_legacy_utils import InfoBase, TestMark - -__all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] - - -class PILKernelInfo(InfoBase): - def 
__init__( - self, - kernel, - *, - # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name - # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name=None, - ): - super().__init__(id=kernel_name or kernel.__name__) - self.kernel = kernel - - -class DispatcherInfo(InfoBase): - _KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} - - def __init__( - self, - dispatcher, - *, - # Dictionary of types that map to the kernel the dispatcher dispatches to. - kernels, - # If omitted, no PIL dispatch test will be performed. - pil_kernel_info=None, - # See InfoBase - test_marks=None, - # See InfoBase - closeness_kwargs=None, - ): - super().__init__(id=dispatcher.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) - self.dispatcher = dispatcher - self.kernels = kernels - self.pil_kernel_info = pil_kernel_info - - kernel_infos = {} - for tv_tensor_type, kernel in self.kernels.items(): - kernel_info = self._KERNEL_INFO_MAP.get(kernel) - if not kernel_info: - raise pytest.UsageError( - f"Can't register {kernel.__name__} for type {tv_tensor_type} since there is no `KernelInfo` for it. " - f"Please add a `KernelInfo` for it in `transforms_v2_kernel_infos.py`." - ) - kernel_infos[tv_tensor_type] = kernel_info - self.kernel_infos = kernel_infos - - def sample_inputs(self, *tv_tensor_types, filter_metadata=True): - for tv_tensor_type in tv_tensor_types or self.kernel_infos.keys(): - kernel_info = self.kernel_infos.get(tv_tensor_type) - if not kernel_info: - raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") - - sample_inputs = kernel_info.sample_inputs_fn() - - if not filter_metadata: - yield from sample_inputs - return - - import itertools - - for args_kwargs in sample_inputs: - if hasattr(tv_tensor_type, "__annotations__"): - for name in itertools.chain( - tv_tensor_type.__annotations__.keys(), - # FIXME: this seems ok for conversion dispatchers, but we should probably handle this on a - # per-dispatcher level. However, so far there is no option for that. 
- (f"old_{name}" for name in tv_tensor_type.__annotations__.keys()), - ): - if name in args_kwargs.kwargs: - del args_kwargs.kwargs[name] - - yield args_kwargs - - -def xfail_jit(reason, *, condition=None): - return TestMark( - ("TestDispatchers", "test_scripted_smoke"), - pytest.mark.xfail(reason=reason), - condition=condition, - ) - - -def xfail_jit_python_scalar_arg(name, *, reason=None): - return xfail_jit( - reason or f"Python scalar int or float for `{name}` is not supported when scripting", - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), - ) - - -skip_dispatch_tv_tensor = TestMark( - ("TestDispatchers", "test_dispatch_tv_tensor"), - pytest.mark.skip(reason="Dispatcher doesn't support arbitrary tv_tensor dispatch."), -) - -multi_crop_skips = [ - TestMark( - ("TestDispatchers", test_name), - pytest.mark.skip(reason="Multi-crop dispatchers return a sequence of items rather than a single one."), - ) - for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_tv_tensor_output_type"] -] -multi_crop_skips.append(skip_dispatch_tv_tensor) - - -DISPATCHER_INFOS = [ - DispatcherInfo( - F.elastic, - kernels={ - tv_tensors.Image: F.elastic_image, - tv_tensors.Video: F.elastic_video, - tv_tensors.BoundingBoxes: F.elastic_bounding_boxes, - tv_tensors.Mask: F.elastic_mask, - }, - pil_kernel_info=PILKernelInfo(F._elastic_image_pil), - test_marks=[xfail_jit_python_scalar_arg("fill")], - ), - DispatcherInfo( - F.equalize, - kernels={ - tv_tensors.Image: F.equalize_image, - tv_tensors.Video: F.equalize_video, - }, - pil_kernel_info=PILKernelInfo(F._equalize_image_pil, kernel_name="equalize_image_pil"), - ), - DispatcherInfo( - F.invert, - kernels={ - tv_tensors.Image: F.invert_image, - tv_tensors.Video: F.invert_video, - }, - pil_kernel_info=PILKernelInfo(F._invert_image_pil, kernel_name="invert_image_pil"), - ), - DispatcherInfo( - F.posterize, - kernels={ - tv_tensors.Image: F.posterize_image, - tv_tensors.Video: F.posterize_video, - }, - pil_kernel_info=PILKernelInfo(F._posterize_image_pil, kernel_name="posterize_image_pil"), - ), - DispatcherInfo( - F.solarize, - kernels={ - tv_tensors.Image: F.solarize_image, - tv_tensors.Video: F.solarize_video, - }, - pil_kernel_info=PILKernelInfo(F._solarize_image_pil, kernel_name="solarize_image_pil"), - ), - DispatcherInfo( - F.autocontrast, - kernels={ - tv_tensors.Image: F.autocontrast_image, - tv_tensors.Video: F.autocontrast_video, - }, - pil_kernel_info=PILKernelInfo(F._autocontrast_image_pil, kernel_name="autocontrast_image_pil"), - ), - DispatcherInfo( - F.adjust_sharpness, - kernels={ - tv_tensors.Image: F.adjust_sharpness_image, - tv_tensors.Video: F.adjust_sharpness_video, - }, - pil_kernel_info=PILKernelInfo(F._adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), - ), - DispatcherInfo( - F.adjust_contrast, - kernels={ - tv_tensors.Image: F.adjust_contrast_image, - tv_tensors.Video: F.adjust_contrast_video, - }, - pil_kernel_info=PILKernelInfo(F._adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), - ), - DispatcherInfo( - F.adjust_gamma, - kernels={ - tv_tensors.Image: F.adjust_gamma_image, - tv_tensors.Video: F.adjust_gamma_video, - }, - pil_kernel_info=PILKernelInfo(F._adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), - ), - DispatcherInfo( - F.adjust_hue, - kernels={ - tv_tensors.Image: F.adjust_hue_image, - tv_tensors.Video: F.adjust_hue_video, - }, - pil_kernel_info=PILKernelInfo(F._adjust_hue_image_pil, 
kernel_name="adjust_hue_image_pil"), - ), - DispatcherInfo( - F.adjust_saturation, - kernels={ - tv_tensors.Image: F.adjust_saturation_image, - tv_tensors.Video: F.adjust_saturation_video, - }, - pil_kernel_info=PILKernelInfo(F._adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), - ), - DispatcherInfo( - F.five_crop, - kernels={ - tv_tensors.Image: F.five_crop_image, - tv_tensors.Video: F.five_crop_video, - }, - pil_kernel_info=PILKernelInfo(F._five_crop_image_pil), - test_marks=[ - xfail_jit_python_scalar_arg("size"), - *multi_crop_skips, - ], - ), - DispatcherInfo( - F.ten_crop, - kernels={ - tv_tensors.Image: F.ten_crop_image, - tv_tensors.Video: F.ten_crop_video, - }, - test_marks=[ - xfail_jit_python_scalar_arg("size"), - *multi_crop_skips, - ], - pil_kernel_info=PILKernelInfo(F._ten_crop_image_pil), - ), - DispatcherInfo( - F.normalize, - kernels={ - tv_tensors.Image: F.normalize_image, - tv_tensors.Video: F.normalize_video, - }, - test_marks=[ - xfail_jit_python_scalar_arg("mean"), - xfail_jit_python_scalar_arg("std"), - ], - ), - DispatcherInfo( - F.uniform_temporal_subsample, - kernels={ - tv_tensors.Video: F.uniform_temporal_subsample_video, - }, - test_marks=[ - skip_dispatch_tv_tensor, - ], - ), - DispatcherInfo( - F.clamp_bounding_boxes, - kernels={tv_tensors.BoundingBoxes: F.clamp_bounding_boxes}, - test_marks=[ - skip_dispatch_tv_tensor, - ], - ), -] diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py deleted file mode 100644 index ed5e7e62220..00000000000 --- a/test/transforms_v2_kernel_infos.py +++ /dev/null @@ -1,935 +0,0 @@ -import functools -import itertools - -import PIL.Image -import pytest -import torch.testing -import torchvision.transforms.v2.functional as F -from torchvision.transforms._functional_tensor import _max_value as get_max_value -from transforms_v2_legacy_utils import ( - ArgsKwargs, - DEFAULT_PORTRAIT_SPATIAL_SIZE, - get_num_channels, - ImageLoader, - InfoBase, - make_bounding_box_loaders, - make_image_loader, - make_image_loaders, - make_image_loaders_for_interpolation, - make_mask_loaders, - make_video_loaders, - mark_framework_limitation, - TestMark, -) - -__all__ = ["KernelInfo", "KERNEL_INFOS"] - - -class KernelInfo(InfoBase): - def __init__( - self, - kernel, - *, - # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name - # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then - kernel_name=None, - # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but - # should not include extensive parameter combinations to keep to overall test count moderate. - sample_inputs_fn, - # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also - # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should - # happen inside the function. It should return a tensor or to be more precise an object that can be compared to - # a tensor by `assert_close`. If omitted, no reference test will be performed. - reference_fn=None, - # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter - # values to be tested. If not specified, `sample_inputs_fn` will be used. - reference_inputs_fn=None, - # If true-ish, triggers a test that checks the kernel for consistency between uint8 and float32 inputs with the - # reference inputs. 
This is usually used whenever we use a PIL kernel as reference. - # Can be a callable in which case it will be called with `other_args, kwargs`. It should return the same - # structure, but with adapted parameters. This is useful in case a parameter value is closely tied to the input - # dtype. - float32_vs_uint8=False, - # Some kernels don't have dispatchers that would handle logging the usage. Thus, the kernel has to do it - # manually. If set, triggers a test that makes sure this happens. - logs_usage=False, - # See InfoBase - test_marks=None, - # See InfoBase - closeness_kwargs=None, - ): - super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) - self.kernel = kernel - self.sample_inputs_fn = sample_inputs_fn - self.reference_fn = reference_fn - self.reference_inputs_fn = reference_inputs_fn - - if float32_vs_uint8 and not callable(float32_vs_uint8): - float32_vs_uint8 = lambda other_args, kwargs: (other_args, kwargs) # noqa: E731 - self.float32_vs_uint8 = float32_vs_uint8 - self.logs_usage = logs_usage - - -def pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=False): - return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae) - - -def cuda_vs_cpu_pixel_difference(atol=1): - return { - (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): pixel_difference_closeness_kwargs(atol, dtype=dtype) - for dtype in [torch.uint8, torch.float32] - } - - -def pil_reference_pixel_difference(atol=1, mae=False): - return { - (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs( - atol, mae=mae - ) - } - - -def float32_vs_uint8_pixel_difference(atol=1, mae=False): - return { - ( - ("TestKernels", "test_float32_vs_uint8"), - torch.float32, - "cpu", - ): pixel_difference_closeness_kwargs(atol, dtype=torch.float32, mae=mae) - } - - -def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6): - return { - (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False}, - } - - -def pil_reference_wrapper(pil_kernel): - @functools.wraps(pil_kernel) - def wrapper(input_tensor, *other_args, **kwargs): - if input_tensor.dtype != torch.uint8: - raise pytest.UsageError(f"Can only test uint8 tensor images against PIL, but input is {input_tensor.dtype}") - if input_tensor.ndim > 3: - raise pytest.UsageError( - f"Can only test single tensor images against PIL, but input has shape {input_tensor.shape}" - ) - - input_pil = F.to_pil_image(input_tensor) - output_pil = pil_kernel(input_pil, *other_args, **kwargs) - if not isinstance(output_pil, PIL.Image.Image): - return output_pil - - output_tensor = F.to_image(output_pil) - - # 2D mask shenanigans - if output_tensor.ndim == 2 and input_tensor.ndim == 3: - output_tensor = output_tensor.unsqueeze(0) - elif output_tensor.ndim == 3 and input_tensor.ndim == 2: - output_tensor = output_tensor.squeeze(0) - - return output_tensor - - return wrapper - - -def xfail_jit(reason, *, condition=None): - return TestMark(("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), condition=condition) - - -def xfail_jit_python_scalar_arg(name, *, reason=None): - return xfail_jit( - reason or f"Python scalar int or float for `{name}` is not supported when scripting", - condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), - ) - - -KERNEL_INFOS = [] - - -def get_fills(*, num_channels, dtype): - yield None - - int_value = get_max_value(dtype) - 
float_value = int_value / 2 - yield int_value - yield float_value - - for vector_type in [list, tuple]: - yield vector_type([int_value]) - yield vector_type([float_value]) - - if num_channels > 1: - yield vector_type(float_value * c / 10 for c in range(num_channels)) - yield vector_type(int_value if c % 2 == 0 else 0 for c in range(num_channels)) - - -def float32_vs_uint8_fill_adapter(other_args, kwargs): - fill = kwargs.get("fill") - if fill is None: - return other_args, kwargs - - if isinstance(fill, (int, float)): - fill /= 255 - else: - fill = type(fill)(fill_ / 255 for fill_ in fill) - - return other_args, dict(kwargs, fill=fill) - - -def _get_elastic_displacement(canvas_size): - return torch.rand(1, *canvas_size, 2) - - -def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): - displacement = _get_elastic_displacement(image_loader.canvas_size) - for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): - yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) - - -def reference_inputs_elastic_image_tensor(): - for image_loader, interpolation in itertools.product( - make_image_loaders_for_interpolation(), - [ - F.InterpolationMode.NEAREST, - F.InterpolationMode.BILINEAR, - F.InterpolationMode.BICUBIC, - ], - ): - displacement = _get_elastic_displacement(image_loader.canvas_size) - for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): - yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) - - -def sample_inputs_elastic_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_boxes_loader.canvas_size) - yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - canvas_size=bounding_boxes_loader.canvas_size, - displacement=displacement, - ) - - -def sample_inputs_elastic_mask(): - for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): - displacement = _get_elastic_displacement(mask_loader.shape[-2:]) - yield ArgsKwargs(mask_loader, displacement=displacement) - - -def sample_inputs_elastic_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - displacement = _get_elastic_displacement(video_loader.shape[-2:]) - yield ArgsKwargs(video_loader, displacement=displacement) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.elastic_image, - sample_inputs_fn=sample_inputs_elastic_image_tensor, - reference_inputs_fn=reference_inputs_elastic_image_tensor, - float32_vs_uint8=float32_vs_uint8_fill_adapter, - closeness_kwargs={ - **float32_vs_uint8_pixel_difference(6, mae=True), - **cuda_vs_cpu_pixel_difference(), - }, - test_marks=[xfail_jit_python_scalar_arg("fill")], - ), - KernelInfo( - F.elastic_bounding_boxes, - sample_inputs_fn=sample_inputs_elastic_bounding_boxes, - ), - KernelInfo( - F.elastic_mask, - sample_inputs_fn=sample_inputs_elastic_mask, - ), - KernelInfo( - F.elastic_video, - sample_inputs_fn=sample_inputs_elastic_video, - closeness_kwargs=cuda_vs_cpu_pixel_difference(), - ), - ] -) - - -def sample_inputs_equalize_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader) - - -def reference_inputs_equalize_image_tensor(): - # We are not using `make_image_loaders` here since that uniformly samples the values over the whole value 
range. - # Since the whole point of this kernel is to transform an arbitrary distribution of values into a uniform one, - # the information gain is low if we already provide something really close to the expected value. - def make_uniform_band_image(shape, dtype, device, *, low_factor, high_factor, memory_format): - if dtype.is_floating_point: - low = low_factor - high = high_factor - else: - max_value = torch.iinfo(dtype).max - low = int(low_factor * max_value) - high = int(high_factor * max_value) - return torch.testing.make_tensor(shape, dtype=dtype, device=device, low=low, high=high).to( - memory_format=memory_format, copy=True - ) - - def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_format): - image = torch.distributions.Beta(alpha, beta).sample(shape) - if not dtype.is_floating_point: - image.mul_(torch.iinfo(dtype).max).round_() - return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True) - - canvas_size = (256, 256) - for dtype, color_space, fn in itertools.product( - [torch.uint8], - ["GRAY", "RGB"], - [ - lambda shape, dtype, device, memory_format: torch.zeros(shape, dtype=dtype, device=device).to( - memory_format=memory_format, copy=True - ), - lambda shape, dtype, device, memory_format: torch.full( - shape, 1.0 if dtype.is_floating_point else torch.iinfo(dtype).max, dtype=dtype, device=device - ).to(memory_format=memory_format, copy=True), - *[ - functools.partial(make_uniform_band_image, low_factor=low_factor, high_factor=high_factor) - for low_factor, high_factor in [ - (0.0, 0.25), - (0.25, 0.75), - (0.75, 1.0), - ] - ], - *[ - functools.partial(make_beta_distributed_image, alpha=alpha, beta=beta) - for alpha, beta in [ - (0.5, 0.5), - (2, 2), - (2, 5), - (5, 2), - ] - ], - ], - ): - image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *canvas_size), dtype=dtype) - yield ArgsKwargs(image_loader) - - -def sample_inputs_equalize_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.equalize_image, - kernel_name="equalize_image_tensor", - sample_inputs_fn=sample_inputs_equalize_image_tensor, - reference_fn=pil_reference_wrapper(F._equalize_image_pil), - float32_vs_uint8=True, - reference_inputs_fn=reference_inputs_equalize_image_tensor, - ), - KernelInfo( - F.equalize_video, - sample_inputs_fn=sample_inputs_equalize_video, - ), - ] -) - - -def sample_inputs_invert_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader) - - -def reference_inputs_invert_image_tensor(): - for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): - yield ArgsKwargs(image_loader) - - -def sample_inputs_invert_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.invert_image, - kernel_name="invert_image_tensor", - sample_inputs_fn=sample_inputs_invert_image_tensor, - reference_fn=pil_reference_wrapper(F._invert_image_pil), - reference_inputs_fn=reference_inputs_invert_image_tensor, - float32_vs_uint8=True, - ), - KernelInfo( - F.invert_video, - sample_inputs_fn=sample_inputs_invert_video, - ), - ] -) - - -_POSTERIZE_BITS = [1, 4, 8] - - -def sample_inputs_posterize_image_tensor(): - for image_loader in 
make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0]) - - -def reference_inputs_posterize_image_tensor(): - for image_loader, bits in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _POSTERIZE_BITS, - ): - yield ArgsKwargs(image_loader, bits=bits) - - -def sample_inputs_posterize_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.posterize_image, - kernel_name="posterize_image_tensor", - sample_inputs_fn=sample_inputs_posterize_image_tensor, - reference_fn=pil_reference_wrapper(F._posterize_image_pil), - reference_inputs_fn=reference_inputs_posterize_image_tensor, - float32_vs_uint8=True, - closeness_kwargs=float32_vs_uint8_pixel_difference(), - ), - KernelInfo( - F.posterize_video, - sample_inputs_fn=sample_inputs_posterize_video, - ), - ] -) - - -def _get_solarize_thresholds(dtype): - for factor in [0.1, 0.5]: - max_value = get_max_value(dtype) - yield (float if dtype.is_floating_point else int)(max_value * factor) - - -def sample_inputs_solarize_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype))) - - -def reference_inputs_solarize_image_tensor(): - for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): - for threshold in _get_solarize_thresholds(image_loader.dtype): - yield ArgsKwargs(image_loader, threshold=threshold) - - -def uint8_to_float32_threshold_adapter(other_args, kwargs): - return other_args, dict(threshold=kwargs["threshold"] / 255) - - -def sample_inputs_solarize_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype))) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.solarize_image, - kernel_name="solarize_image_tensor", - sample_inputs_fn=sample_inputs_solarize_image_tensor, - reference_fn=pil_reference_wrapper(F._solarize_image_pil), - reference_inputs_fn=reference_inputs_solarize_image_tensor, - float32_vs_uint8=uint8_to_float32_threshold_adapter, - closeness_kwargs=float32_vs_uint8_pixel_difference(), - ), - KernelInfo( - F.solarize_video, - sample_inputs_fn=sample_inputs_solarize_video, - ), - ] -) - - -def sample_inputs_autocontrast_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader) - - -def reference_inputs_autocontrast_image_tensor(): - for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): - yield ArgsKwargs(image_loader) - - -def sample_inputs_autocontrast_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.autocontrast_image, - kernel_name="autocontrast_image_tensor", - sample_inputs_fn=sample_inputs_autocontrast_image_tensor, - reference_fn=pil_reference_wrapper(F._autocontrast_image_pil), - reference_inputs_fn=reference_inputs_autocontrast_image_tensor, - float32_vs_uint8=True, - 
closeness_kwargs={ - **pil_reference_pixel_difference(), - **float32_vs_uint8_pixel_difference(), - }, - ), - KernelInfo( - F.autocontrast_video, - sample_inputs_fn=sample_inputs_autocontrast_video, - ), - ] -) - -_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_sharpness_image_tensor(): - for image_loader in make_image_loaders( - sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE, (2, 2)], - color_spaces=("GRAY", "RGB"), - ): - yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) - - -def reference_inputs_adjust_sharpness_image_tensor(): - for image_loader, sharpness_factor in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _ADJUST_SHARPNESS_FACTORS, - ): - yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) - - -def sample_inputs_adjust_sharpness_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.adjust_sharpness_image, - kernel_name="adjust_sharpness_image_tensor", - sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, - reference_fn=pil_reference_wrapper(F._adjust_sharpness_image_pil), - reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, - float32_vs_uint8=True, - closeness_kwargs=float32_vs_uint8_pixel_difference(2), - ), - KernelInfo( - F.adjust_sharpness_video, - sample_inputs_fn=sample_inputs_adjust_sharpness_video, - ), - ] -) - - -_ADJUST_CONTRAST_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_contrast_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) - - -def reference_inputs_adjust_contrast_image_tensor(): - for image_loader, contrast_factor in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _ADJUST_CONTRAST_FACTORS, - ): - yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) - - -def sample_inputs_adjust_contrast_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.adjust_contrast_image, - kernel_name="adjust_contrast_image_tensor", - sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, - reference_fn=pil_reference_wrapper(F._adjust_contrast_image_pil), - reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, - float32_vs_uint8=True, - closeness_kwargs={ - **pil_reference_pixel_difference(), - **float32_vs_uint8_pixel_difference(2), - **cuda_vs_cpu_pixel_difference(), - (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), - }, - ), - KernelInfo( - F.adjust_contrast_video, - sample_inputs_fn=sample_inputs_adjust_contrast_video, - closeness_kwargs={ - **cuda_vs_cpu_pixel_difference(), - (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), - }, - ), - ] -) - -_ADJUST_GAMMA_GAMMAS_GAINS = [ - (0.5, 2.0), - (0.0, 1.0), -] - - -def sample_inputs_adjust_gamma_image_tensor(): - gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield 
ArgsKwargs(image_loader, gamma=gamma, gain=gain) - - -def reference_inputs_adjust_gamma_image_tensor(): - for image_loader, (gamma, gain) in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _ADJUST_GAMMA_GAMMAS_GAINS, - ): - yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) - - -def sample_inputs_adjust_gamma_video(): - gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.adjust_gamma_image, - kernel_name="adjust_gamma_image_tensor", - sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, - reference_fn=pil_reference_wrapper(F._adjust_gamma_image_pil), - reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, - float32_vs_uint8=True, - closeness_kwargs={ - **pil_reference_pixel_difference(), - **float32_vs_uint8_pixel_difference(), - }, - ), - KernelInfo( - F.adjust_gamma_video, - sample_inputs_fn=sample_inputs_adjust_gamma_video, - ), - ] -) - - -_ADJUST_HUE_FACTORS = [-0.1, 0.5] - - -def sample_inputs_adjust_hue_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) - - -def reference_inputs_adjust_hue_image_tensor(): - for image_loader, hue_factor in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _ADJUST_HUE_FACTORS, - ): - yield ArgsKwargs(image_loader, hue_factor=hue_factor) - - -def sample_inputs_adjust_hue_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.adjust_hue_image, - kernel_name="adjust_hue_image_tensor", - sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, - reference_fn=pil_reference_wrapper(F._adjust_hue_image_pil), - reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, - float32_vs_uint8=True, - closeness_kwargs={ - **pil_reference_pixel_difference(2, mae=True), - **float32_vs_uint8_pixel_difference(), - }, - ), - KernelInfo( - F.adjust_hue_video, - sample_inputs_fn=sample_inputs_adjust_hue_video, - ), - ] -) - -_ADJUST_SATURATION_FACTORS = [0.1, 0.5] - - -def sample_inputs_adjust_saturation_image_tensor(): - for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): - yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) - - -def reference_inputs_adjust_saturation_image_tensor(): - for image_loader, saturation_factor in itertools.product( - make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), - _ADJUST_SATURATION_FACTORS, - ): - yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) - - -def sample_inputs_adjust_saturation_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): - yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.adjust_saturation_image, - kernel_name="adjust_saturation_image_tensor", - sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, - reference_fn=pil_reference_wrapper(F._adjust_saturation_image_pil), - 
reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, - float32_vs_uint8=True, - closeness_kwargs={ - **pil_reference_pixel_difference(), - **float32_vs_uint8_pixel_difference(2), - **cuda_vs_cpu_pixel_difference(), - }, - ), - KernelInfo( - F.adjust_saturation_video, - sample_inputs_fn=sample_inputs_adjust_saturation_video, - closeness_kwargs=cuda_vs_cpu_pixel_difference(), - ), - ] -) - - -def sample_inputs_clamp_bounding_boxes(): - for bounding_boxes_loader in make_bounding_box_loaders(): - yield ArgsKwargs( - bounding_boxes_loader, - format=bounding_boxes_loader.format, - canvas_size=bounding_boxes_loader.canvas_size, - ) - - -KERNEL_INFOS.append( - KernelInfo( - F.clamp_bounding_boxes, - sample_inputs_fn=sample_inputs_clamp_bounding_boxes, - logs_usage=True, - ) -) - -_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] - - -def _get_five_ten_crop_canvas_size(size): - if isinstance(size, int): - crop_height = crop_width = size - elif len(size) == 1: - crop_height = crop_width = size[0] - else: - crop_height, crop_width = size - return 2 * crop_height, 2 * crop_width - - -def sample_inputs_five_crop_image_tensor(): - for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_canvas_size(size)], - color_spaces=["RGB"], - dtypes=[torch.float32], - ): - yield ArgsKwargs(image_loader, size=size) - - -def reference_inputs_five_crop_image_tensor(): - for size in _FIVE_TEN_CROP_SIZES: - for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] - ): - yield ArgsKwargs(image_loader, size=size) - - -def sample_inputs_five_crop_video(): - size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): - yield ArgsKwargs(video_loader, size=size) - - -def sample_inputs_ten_crop_image_tensor(): - for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_canvas_size(size)], - color_spaces=["RGB"], - dtypes=[torch.float32], - ): - yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) - - -def reference_inputs_ten_crop_image_tensor(): - for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): - for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] - ): - yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) - - -def sample_inputs_ten_crop_video(): - size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): - yield ArgsKwargs(video_loader, size=size) - - -def multi_crop_pil_reference_wrapper(pil_kernel): - def wrapper(input_tensor, *other_args, **kwargs): - output = pil_reference_wrapper(pil_kernel)(input_tensor, *other_args, **kwargs) - return type(output)( - F.to_dtype_image(F.to_image(output_pil), dtype=input_tensor.dtype, scale=True) for output_pil in output - ) - - return wrapper - - -_common_five_ten_crop_marks = [ - xfail_jit_python_scalar_arg("size"), - mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), -] - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.five_crop_image, - sample_inputs_fn=sample_inputs_five_crop_image_tensor, - reference_fn=multi_crop_pil_reference_wrapper(F._five_crop_image_pil), - reference_inputs_fn=reference_inputs_five_crop_image_tensor, - 
test_marks=_common_five_ten_crop_marks, - ), - KernelInfo( - F.five_crop_video, - sample_inputs_fn=sample_inputs_five_crop_video, - test_marks=_common_five_ten_crop_marks, - ), - KernelInfo( - F.ten_crop_image, - sample_inputs_fn=sample_inputs_ten_crop_image_tensor, - reference_fn=multi_crop_pil_reference_wrapper(F._ten_crop_image_pil), - reference_inputs_fn=reference_inputs_ten_crop_image_tensor, - test_marks=_common_five_ten_crop_marks, - ), - KernelInfo( - F.ten_crop_video, - sample_inputs_fn=sample_inputs_ten_crop_video, - test_marks=_common_five_ten_crop_marks, - ), - ] -) - -_NORMALIZE_MEANS_STDS = [ - ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), - ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), - (0.5, 2.0), -] - - -def sample_inputs_normalize_image_tensor(): - for image_loader, (mean, std) in itertools.product( - make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32]), - _NORMALIZE_MEANS_STDS, - ): - yield ArgsKwargs(image_loader, mean=mean, std=std) - - -def reference_normalize_image_tensor(image, mean, std, inplace=False): - mean = torch.tensor(mean).view(-1, 1, 1) - std = torch.tensor(std).view(-1, 1, 1) - - sub = torch.Tensor.sub_ if inplace else torch.Tensor.sub - return sub(image, mean).div_(std) - - -def reference_inputs_normalize_image_tensor(): - yield ArgsKwargs( - make_image_loader(size=(32, 32), color_space="RGB", extra_dims=[1]), - mean=[0.5, 0.5, 0.5], - std=[1.0, 1.0, 1.0], - ) - - -def sample_inputs_normalize_video(): - mean, std = _NORMALIZE_MEANS_STDS[0] - for video_loader in make_video_loaders( - sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[3], dtypes=[torch.float32] - ): - yield ArgsKwargs(video_loader, mean=mean, std=std) - - -KERNEL_INFOS.extend( - [ - KernelInfo( - F.normalize_image, - kernel_name="normalize_image_tensor", - sample_inputs_fn=sample_inputs_normalize_image_tensor, - reference_fn=reference_normalize_image_tensor, - reference_inputs_fn=reference_inputs_normalize_image_tensor, - test_marks=[ - xfail_jit_python_scalar_arg("mean"), - xfail_jit_python_scalar_arg("std"), - ], - ), - KernelInfo( - F.normalize_video, - sample_inputs_fn=sample_inputs_normalize_video, - ), - ] -) - - -def sample_inputs_uniform_temporal_subsample_video(): - for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[4]): - yield ArgsKwargs(video_loader, num_samples=2) - - -def reference_uniform_temporal_subsample_video(x, num_samples): - # Copy-pasted from - # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19 - t = x.shape[-4] - assert num_samples > 0 and t > 0 - # Sample by nearest neighbor interpolation if num_samples > t. 
- indices = torch.linspace(0, t - 1, num_samples) - indices = torch.clamp(indices, 0, t - 1).long() - return torch.index_select(x, -4, indices) - - -def reference_inputs_uniform_temporal_subsample_video(): - for video_loader in make_video_loaders( - sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[10] - ): - for num_samples in range(1, video_loader.shape[-4] + 1): - yield ArgsKwargs(video_loader, num_samples) - - -KERNEL_INFOS.append( - KernelInfo( - F.uniform_temporal_subsample_video, - sample_inputs_fn=sample_inputs_uniform_temporal_subsample_video, - reference_fn=reference_uniform_temporal_subsample_video, - reference_inputs_fn=reference_inputs_uniform_temporal_subsample_video, - ) -) diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py deleted file mode 100644 index 9dead793422..00000000000 --- a/test/transforms_v2_legacy_utils.py +++ /dev/null @@ -1,633 +0,0 @@ -""" -As the name implies, these are legacy utilities that are hopefully removed soon. The future of -transforms v2 testing is in test/test_transforms_v2_refactored.py. All new test should be -implemented there and must not use any of the utilities here. - -The following legacy modules depend on this module - -- transforms_v2_kernel_infos.py -- transforms_v2_dispatcher_infos.py -- test_transforms_v2_functional.py -- test_transforms_v2_consistency.py -- test_transforms.py - -When all the logic is ported from the files above to test_transforms_v2_refactored.py, delete -all the legacy modules including this one and drop the _refactored prefix from the name. -""" - -import collections.abc -import dataclasses -import enum -import itertools -import pathlib -from collections import defaultdict -from typing import Callable, Sequence, Tuple, Union - -import PIL.Image -import pytest -import torch - -from torchvision import tv_tensors -from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image - - -def combinations_grid(**kwargs): - """Creates a grid of input combinations. - - Each element in the returned sequence is a dictionary containing one possible combination as values. 
- - Example: - >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) - [ - {'foo': 'bar', 'spam': 'eggs'}, - {'foo': 'bar', 'spam': 'ham'}, - {'foo': 'baz', 'spam': 'eggs'}, - {'foo': 'baz', 'spam': 'ham'} - ] - """ - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - -DEFAULT_SIZE = (17, 11) - -NUM_CHANNELS_MAP = { - "GRAY": 1, - "GRAY_ALPHA": 2, - "RGB": 3, - "RGBA": 4, -} - - -def make_image( - size=DEFAULT_SIZE, - *, - color_space="RGB", - batch_dims=(), - dtype=None, - device="cpu", - memory_format=torch.contiguous_format, -): - num_channels = NUM_CHANNELS_MAP[color_space] - dtype = dtype or torch.uint8 - max_value = get_max_value(dtype) - data = torch.testing.make_tensor( - (*batch_dims, num_channels, *size), - low=0, - high=max_value, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - if color_space in {"GRAY_ALPHA", "RGBA"}: - data[..., -1, :, :] = max_value - - return tv_tensors.Image(data) - - -def make_image_tensor(*args, **kwargs): - return make_image(*args, **kwargs).as_subclass(torch.Tensor) - - -def make_image_pil(*args, **kwargs): - return to_pil_image(make_image(*args, **kwargs)) - - -def make_bounding_boxes( - canvas_size=DEFAULT_SIZE, - *, - format=tv_tensors.BoundingBoxFormat.XYXY, - batch_dims=(), - dtype=None, - device="cpu", -): - def sample_position(values, max_value): - # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. - # However, if we have batch_dims, we need tensors as limits. - return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) - - if isinstance(format, str): - format = tv_tensors.BoundingBoxFormat[format] - - dtype = dtype or torch.float32 - - if any(dim == 0 for dim in batch_dims): - return tv_tensors.BoundingBoxes( - torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size - ) - - h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] - y = sample_position(h, canvas_size[0]) - x = sample_position(w, canvas_size[1]) - - if format is tv_tensors.BoundingBoxFormat.XYWH: - parts = (x, y, w, h) - elif format is tv_tensors.BoundingBoxFormat.XYXY: - x1, y1 = x, y - x2 = x1 + w - y2 = y1 + h - parts = (x1, y1, x2, y2) - elif format is tv_tensors.BoundingBoxFormat.CXCYWH: - cx = x + w / 2 - cy = y + h / 2 - parts = (cx, cy, w, h) - else: - raise ValueError(f"Format {format} is not supported") - - return tv_tensors.BoundingBoxes( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size - ) - - -def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): - """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" - return tv_tensors.Mask( - torch.testing.make_tensor( - (*batch_dims, num_objects, *size), - low=0, - high=2, - dtype=dtype or torch.bool, - device=device, - ) - ) - - -def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): - """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" - return tv_tensors.Mask( - torch.testing.make_tensor( - (*batch_dims, *size), - low=0, - high=num_categories, - dtype=dtype or torch.uint8, - device=device, - ) - ) - - -def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): - return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) - - -def make_video_tensor(*args, **kwargs): - return make_video(*args, **kwargs).as_subclass(torch.Tensor) - - -DEFAULT_SQUARE_SPATIAL_SIZE = 15 -DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) -DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) -DEFAULT_SPATIAL_SIZES = ( - DEFAULT_LANDSCAPE_SPATIAL_SIZE, - DEFAULT_PORTRAIT_SPATIAL_SIZE, - DEFAULT_SQUARE_SPATIAL_SIZE, -) - - -def _parse_size(size, *, name="size"): - if size == "random": - raise ValueError("This should never happen") - elif isinstance(size, int) and size > 0: - return (size, size) - elif ( - isinstance(size, collections.abc.Sequence) - and len(size) == 2 - and all(isinstance(length, int) and length > 0 for length in size) - ): - return tuple(size) - else: - raise pytest.UsageError( - f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," - f"but got {size} instead." - ) - - -def get_num_channels(color_space): - num_channels = NUM_CHANNELS_MAP.get(color_space) - if not num_channels: - raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") - return num_channels - - -VALID_EXTRA_DIMS = ((), (4,), (2, 3)) -DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) - -DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) - - -def from_loader(loader_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loader = loader_fn(*args, **kwargs) - return loader.load(device) - - return wrapper - - -def from_loaders(loaders_fn): - def wrapper(*args, **kwargs): - device = kwargs.pop("device", "cpu") - loaders = loaders_fn(*args, **kwargs) - for loader in loaders: - yield loader.load(device) - - return wrapper - - -@dataclasses.dataclass -class TensorLoader: - fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] - shape: Sequence[int] - dtype: torch.dtype - - def load(self, device): - return self.fn(self.shape, self.dtype, device) - - -@dataclasses.dataclass -class ImageLoader(TensorLoader): - spatial_size: Tuple[int, int] = dataclasses.field(init=False) - num_channels: int = dataclasses.field(init=False) - memory_format: torch.memory_format = torch.contiguous_format - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.spatial_size = self.canvas_size = self.shape[-2:] - self.num_channels = self.shape[-3] - - def load(self, device): - return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) - - -def make_image_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - extra_dims=(), - dtype=torch.float32, - constant_alpha=True, - memory_format=torch.contiguous_format, -): - if not constant_alpha: - raise ValueError("This should never happen") - size = _parse_size(size) - num_channels = get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - *batch_dims, _, height, width = shape - return make_image( - (height, width), - color_space=color_space, - batch_dims=batch_dims, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, 
memory_format=memory_format) - - -def make_image_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "GRAY_ALPHA", - "RGB", - "RGBA", - ), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.float32, torch.float64, torch.uint8), - constant_alpha=True, -): - for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): - yield make_image_loader(**params, constant_alpha=constant_alpha) - - -make_images = from_loaders(make_image_loaders) - - -def make_image_loader_for_interpolation( - size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format -): - size = _parse_size(size) - num_channels = get_num_channels(color_space) - - def fn(shape, dtype, device, memory_format): - height, width = shape[-2:] - - image_pil = ( - PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") - .resize((width, height)) - .convert( - { - "GRAY": "L", - "GRAY_ALPHA": "LA", - "RGB": "RGB", - "RGBA": "RGBA", - }[color_space] - ) - ) - - image_tensor = to_image(image_pil) - if memory_format == torch.contiguous_format: - image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) - else: - image_tensor = image_tensor.to(device=device) - image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) - - return tv_tensors.Image(image_tensor) - - return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) - - -def make_image_loaders_for_interpolation( - sizes=((233, 147),), - color_spaces=("RGB",), - dtypes=(torch.uint8,), - memory_formats=(torch.contiguous_format, torch.channels_last), -): - for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): - yield make_image_loader_for_interpolation(**params) - - -@dataclasses.dataclass -class BoundingBoxesLoader(TensorLoader): - format: tv_tensors.BoundingBoxFormat - spatial_size: Tuple[int, int] - canvas_size: Tuple[int, int] = dataclasses.field(init=False) - - def __post_init__(self): - self.canvas_size = self.spatial_size - - -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): - if isinstance(format, str): - format = tv_tensors.BoundingBoxFormat[format] - - spatial_size = _parse_size(spatial_size, name="spatial_size") - - def fn(shape, dtype, device): - *batch_dims, num_coordinates = shape - if num_coordinates != 4: - raise pytest.UsageError() - - return make_bounding_boxes( - format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) - - -def make_bounding_box_loaders( - *, - extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), - formats=tuple(tv_tensors.BoundingBoxFormat), - spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - dtypes=(torch.float32, torch.float64, torch.int64), -): - for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, spatial_size=spatial_size) - - -make_multiple_bounding_boxes = from_loaders(make_bounding_box_loaders) - - -class MaskLoader(TensorLoader): - pass - - -def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): - # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, num_objects, height, width = shape - return make_detection_mask( - (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) - - -def make_detection_mask_loaders( - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): - yield make_detection_mask_loader(**params) - - -make_detection_masks = from_loaders(make_detection_mask_loaders) - - -def make_segmentation_mask_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 -): - # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - size = _parse_size(size) - - def fn(shape, dtype, device): - *batch_dims, height, width = shape - return make_segmentation_mask( - (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device - ) - - return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) - - -def make_segmentation_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): - yield make_segmentation_mask_loader(**params) - - -make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) - - -def make_mask_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, 5), - num_categories=(1, 2, 10), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8,), -): - yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) - yield from make_segmentation_mask_loaders( - sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes - ) - - -make_masks = from_loaders(make_mask_loaders) - - -class VideoLoader(ImageLoader): - pass - - -def make_video_loader( - size=DEFAULT_PORTRAIT_SPATIAL_SIZE, - *, - color_space="RGB", - num_frames=3, - extra_dims=(), - dtype=torch.uint8, -): - size = _parse_size(size) - - def fn(shape, dtype, device, memory_format): - *batch_dims, num_frames, _, height, width = shape - return make_video( - (height, width), - num_frames=num_frames, - batch_dims=batch_dims, - color_space=color_space, - dtype=dtype, - device=device, - memory_format=memory_format, - ) - - return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) - - -def make_video_loaders( - *, - sizes=DEFAULT_SPATIAL_SIZES, - color_spaces=( - "GRAY", - "RGB", - ), - num_frames=(1, 0, 3), - extra_dims=DEFAULT_EXTRA_DIMS, - dtypes=(torch.uint8, torch.float32, torch.float64), -): - for params in combinations_grid( - size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes - ): - yield make_video_loader(**params) - - -make_videos = from_loaders(make_video_loaders) - - -class TestMark: - def __init__( - self, - # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is - # no test class, i.e. a standalone test function, use `None`. - test_id, - # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` - mark, - *, - # Callable, that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be - # applied. If omitted, defaults to always apply. - condition=None, - ): - self.test_id = test_id - self.mark = mark - self.condition = condition or (lambda args_kwargs: True) - - -def mark_framework_limitation(test_id, reason, condition=None): - # The purpose of this function is to have a single entry point for skip marks that are only there, because the test - # framework cannot handle the kernel in general or a specific parameter combination. - # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is - # still justified. - # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, - # we are wasting CI resources for no reason for most of the time - return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) - - -class InfoBase: - def __init__( - self, - *, - # Identifier if the info that shows up the parametrization. - id, - # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. - # See the `TestMark` class for details - test_marks=None, - # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see - # `TestMark`), the dtype, and the device. - closeness_kwargs=None, - ): - self.id = id - - self.test_marks = test_marks or [] - test_marks_map = defaultdict(list) - for test_mark in self.test_marks: - test_marks_map[test_mark.test_id].append(test_mark) - self._test_marks_map = dict(test_marks_map) - - self.closeness_kwargs = closeness_kwargs or dict() - - def get_marks(self, test_id, args_kwargs): - return [ - test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) - ] - - def get_closeness_kwargs(self, test_id, *, dtype, device): - if not (isinstance(test_id, tuple) and len(test_id) == 2): - msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" - if callable(test_id): - msg += ". Did you forget to add the `test_id` fixture to parameters of the test?" - else: - msg += f", but got {test_id} instead." 
- raise pytest.UsageError(msg) - if isinstance(device, torch.device): - device = device.type - return self.closeness_kwargs.get((test_id, dtype, device), dict()) - - -class ArgsKwargs: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - def __iter__(self): - yield self.args - yield self.kwargs - - def load(self, device="cpu"): - return ArgsKwargs( - *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), - **{ - keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg - for keyword, arg in self.kwargs.items() - }, - ) - - -def parametrized_error_message(*args, **kwargs): - def to_str(obj): - if isinstance(obj, torch.Tensor) and obj.numel() > 30: - return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" - elif isinstance(obj, enum.Enum): - return f"{type(obj).__name__}.{obj.name}" - else: - return repr(obj) - - if args or kwargs: - postfix = "\n".join( - [ - "", - "Failure happened for the following parameters:", - "", - *[to_str(arg) for arg in args], - *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], - ] - ) - else: - postfix = "" - - def wrapper(msg): - return msg + postfix - - return wrapper diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index d04028a3bbc..65f0b2fef44 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -439,7 +439,7 @@ def _efficientnet_conf( class EfficientNet_B0_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/rwightman/pytorch-image-models/ - url="https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", + url="https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth", transforms=partial( ImageClassification, crop_size=224, resize_size=256, interpolation=InterpolationMode.BICUBIC ), @@ -511,7 +511,7 @@ class EfficientNet_B1_Weights(WeightsEnum): class EfficientNet_B2_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/rwightman/pytorch-image-models/ - url="https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", + url="https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth", transforms=partial( ImageClassification, crop_size=288, resize_size=288, interpolation=InterpolationMode.BICUBIC ), @@ -535,7 +535,7 @@ class EfficientNet_B2_Weights(WeightsEnum): class EfficientNet_B3_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/rwightman/pytorch-image-models/ - url="https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", + url="https://download.pytorch.org/models/efficientnet_b3_rwightman-b3899882.pth", transforms=partial( ImageClassification, crop_size=300, resize_size=320, interpolation=InterpolationMode.BICUBIC ), @@ -559,7 +559,7 @@ class EfficientNet_B3_Weights(WeightsEnum): class EfficientNet_B4_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/rwightman/pytorch-image-models/ - url="https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", + url="https://download.pytorch.org/models/efficientnet_b4_rwightman-23ab8bcd.pth", transforms=partial( ImageClassification, crop_size=380, resize_size=384, interpolation=InterpolationMode.BICUBIC ), @@ -583,7 +583,7 @@ class EfficientNet_B4_Weights(WeightsEnum): class EfficientNet_B5_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from 
https://github.com/lukemelas/EfficientNet-PyTorch/ - url="https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", + url="https://download.pytorch.org/models/efficientnet_b5_lukemelas-1a07897c.pth", transforms=partial( ImageClassification, crop_size=456, resize_size=456, interpolation=InterpolationMode.BICUBIC ), @@ -607,7 +607,7 @@ class EfficientNet_B5_Weights(WeightsEnum): class EfficientNet_B6_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/ - url="https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", + url="https://download.pytorch.org/models/efficientnet_b6_lukemelas-24a108a5.pth", transforms=partial( ImageClassification, crop_size=528, resize_size=528, interpolation=InterpolationMode.BICUBIC ), @@ -631,7 +631,7 @@ class EfficientNet_B6_Weights(WeightsEnum): class EfficientNet_B7_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # Weights ported from https://github.com/lukemelas/EfficientNet-PyTorch/ - url="https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", + url="https://download.pytorch.org/models/efficientnet_b7_lukemelas-c5b4e57e.pth", transforms=partial( ImageClassification, crop_size=600, resize_size=600, interpolation=InterpolationMode.BICUBIC ), diff --git a/torchvision/models/segmentation/deeplabv3.py b/torchvision/models/segmentation/deeplabv3.py index f58c5d26a66..a92ddfe3b7a 100644 --- a/torchvision/models/segmentation/deeplabv3.py +++ b/torchvision/models/segmentation/deeplabv3.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Any, List, Optional +from typing import Any, Optional, Sequence import torch from torch import nn @@ -46,9 +46,9 @@ class DeepLabV3(_SimpleSegmentationModel): class DeepLabHead(nn.Sequential): - def __init__(self, in_channels: int, num_classes: int) -> None: + def __init__(self, in_channels: int, num_classes: int, atrous_rates: Sequence[int] = (12, 24, 36)) -> None: super().__init__( - ASPP(in_channels, [12, 24, 36]), + ASPP(in_channels, atrous_rates), nn.Conv2d(256, 256, 3, padding=1, bias=False), nn.BatchNorm2d(256), nn.ReLU(), @@ -83,7 +83,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class ASPP(nn.Module): - def __init__(self, in_channels: int, atrous_rates: List[int], out_channels: int = 256) -> None: + def __init__(self, in_channels: int, atrous_rates: Sequence[int], out_channels: int = 256) -> None: super().__init__() modules = [] modules.append( diff --git a/torchvision/transforms/_functional_tensor.py b/torchvision/transforms/_functional_tensor.py index d0e7c17882b..88dc9ca21cc 100644 --- a/torchvision/transforms/_functional_tensor.py +++ b/torchvision/transforms/_functional_tensor.py @@ -440,9 +440,7 @@ def resize( img: Tensor, size: List[int], interpolation: str = "bilinear", - # TODO: in v0.17, change the default to True. This will a private function - # by then, so we don't care about warning here. - antialias: Optional[bool] = None, + antialias: Optional[bool] = True, ) -> Tensor: _assert_image_tensor(img) diff --git a/torchvision/transforms/_presets.py b/torchvision/transforms/_presets.py index ccbe425f2ac..d7f88bdb992 100644 --- a/torchvision/transforms/_presets.py +++ b/torchvision/transforms/_presets.py @@ -2,7 +2,7 @@ This file is part of the private API. Please do not use directly these classes as they will be modified on future versions without warning. The classes should be accessed only via the transforms argument of Weights. 
""" -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch from torch import nn, Tensor @@ -44,7 +44,7 @@ def __init__( mean: Tuple[float, ...] = (0.485, 0.456, 0.406), std: Tuple[float, ...] = (0.229, 0.224, 0.225), interpolation: InterpolationMode = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> None: super().__init__() self.crop_size = [crop_size] @@ -151,7 +151,7 @@ def __init__( mean: Tuple[float, ...] = (0.485, 0.456, 0.406), std: Tuple[float, ...] = (0.229, 0.224, 0.225), interpolation: InterpolationMode = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> None: super().__init__() self.resize_size = [resize_size] if resize_size is not None else None diff --git a/torchvision/transforms/functional.py b/torchvision/transforms/functional.py index 8517cf42599..d176e00a8da 100644 --- a/torchvision/transforms/functional.py +++ b/torchvision/transforms/functional.py @@ -393,19 +393,12 @@ def resize( size: List[int], interpolation: InterpolationMode = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> Tensor: r"""Resize the input image to the given size. If the image is torch Tensor, it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions - .. warning:: - The output image might be different depending on its type: when downsampling, the interpolation of PIL images - and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences - in the performance of a network. Therefore, it is preferable to train and serve a model with the same input - types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors - closer. - Args: img (PIL Image or Tensor): Image to be resized. size (sequence or int): Desired output size. If size is a sequence like @@ -437,7 +430,7 @@ def resize( tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -446,8 +439,8 @@ def resize( PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. Returns: PIL Image or Tensor: Resized image. @@ -481,8 +474,6 @@ def resize( if [image_height, image_width] == output_size: return img - antialias = _check_antialias(img, antialias, interpolation) - if not isinstance(img, torch.Tensor): if antialias is False: warnings.warn("Anti-alias option is always applied for PIL Image input. 
Argument antialias is ignored.") @@ -615,7 +606,7 @@ def resized_crop( width: int, size: List[int], interpolation: InterpolationMode = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> Tensor: """Crop the given image and resize it to desired size. If the image is torch Tensor, it is expected @@ -643,7 +634,7 @@ def resized_crop( tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -652,8 +643,8 @@ def resized_crop( PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. Returns: PIL Image or Tensor: Cropped image. """ @@ -1279,7 +1270,7 @@ def rgb_to_grayscale(img: Tensor, num_output_channels: int = 1) -> Tensor: Note: Please, note that this method supports only RGB images as input. For inputs in other color spaces, - please, consider using meth:`~torchvision.transforms.functional.to_grayscale` with PIL Image. + please, consider using :meth:`~torchvision.transforms.functional.to_grayscale` with PIL Image. Args: img (PIL Image or Tensor): RGB Image to be converted to grayscale. @@ -1590,28 +1581,3 @@ def elastic_transform( if not isinstance(img, torch.Tensor): output = to_pil_image(output, mode=img.mode) return output - - -# TODO in v0.17: remove this helper and change default of antialias to True everywhere -def _check_antialias( - img: Tensor, antialias: Optional[Union[str, bool]], interpolation: InterpolationMode -) -> Optional[bool]: - if isinstance(antialias, str): # it should be "warn", but we don't bother checking against that - if isinstance(img, Tensor) and ( - interpolation == InterpolationMode.BILINEAR or interpolation == InterpolationMode.BICUBIC - ): - warnings.warn( - "The default value of the antialias parameter of all the resizing transforms " - "(Resize(), RandomResizedCrop(), etc.) " - "will change from None to True in v0.17, " - "in order to be consistent across the PIL and Tensor backends. " - "To suppress this warning, directly pass " - "antialias=True (recommended, future default), antialias=None (current default, " - "which means False for Tensors and True for PIL), " - "or antialias=False (only works on Tensors - PIL will still use antialiasing). " - "This also applies if you are using the inference transforms from the models weights: " - "update the call to weights.transforms(antialias=True)." - ) - antialias = None - - return antialias diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 54e49321fa6..2a6e0ce12c0 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -285,13 +285,6 @@ class Resize(torch.nn.Module): If the image is torch Tensor, it is expected to have [..., H, W] shape, where ... means a maximum of two leading dimensions - .. 
warning:: - The output image might be different depending on its type: when downsampling, the interpolation of PIL images - and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences - in the performance of a network. Therefore, it is preferable to train and serve a model with the same input - types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors - closer. - Args: size (sequence or int): Desired output size. If size is a sequence like (h, w), output size will be matched to this. If size is an int, @@ -321,7 +314,7 @@ class Resize(torch.nn.Module): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -330,11 +323,11 @@ class Resize(torch.nn.Module): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. """ - def __init__(self, size, interpolation=InterpolationMode.BILINEAR, max_size=None, antialias="warn"): + def __init__(self, size, interpolation=InterpolationMode.BILINEAR, max_size=None, antialias=True): super().__init__() _log_api_usage_once(self) if not isinstance(size, (int, Sequence)): @@ -884,7 +877,7 @@ class RandomResizedCrop(torch.nn.Module): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -893,8 +886,8 @@ class RandomResizedCrop(torch.nn.Module): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. 
""" def __init__( @@ -903,7 +896,7 @@ def __init__( scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0), interpolation=InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ): super().__init__() _log_api_usage_once(self) diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index efe731b5ec9..2715eefa21c 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -328,6 +328,11 @@ class RandomSolarize(_RandomApplyTransform): _v1_transform_cls = _transforms.RandomSolarize + def _extract_params_for_v1_transform(self) -> Dict[str, Any]: + params = super()._extract_params_for_v1_transform() + params["threshold"] = float(params["threshold"]) + return params + def __init__(self, threshold: float, p: float = 0.5) -> None: super().__init__(p=p) self.threshold = threshold diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 8f591c49707..d57c2a72009 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -100,14 +100,15 @@ def _extract_params_for_v1_transform(self) -> Dict[str, Any]: return {"transforms": self.transforms, "p": self.p} def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] + needs_unpacking = len(inputs) > 1 if torch.rand(1) >= self.p: - return sample + return inputs if needs_unpacking else inputs[0] for transform in self.transforms: - sample = transform(sample) - return sample + outputs = transform(*inputs) + inputs = outputs if needs_unpacking else (outputs,) + return outputs def extra_repr(self) -> str: format_string = [] @@ -173,8 +174,9 @@ def __init__(self, transforms: Sequence[Callable]) -> None: self.transforms = transforms def forward(self, *inputs: Any) -> Any: - sample = inputs if len(inputs) > 1 else inputs[0] + needs_unpacking = len(inputs) > 1 for idx in torch.randperm(len(self.transforms)): transform = self.transforms[idx] - sample = transform(sample) - return sample + outputs = transform(*inputs) + inputs = outputs if needs_unpacking else (outputs,) + return outputs diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index ce98a1ee091..4d3f3fc7fc5 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -10,7 +10,6 @@ from torchvision.ops.boxes import box_iou from torchvision.transforms.functional import _get_perspective_coeffs from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform -from torchvision.transforms.v2.functional._geometry import _check_interpolation from torchvision.transforms.v2.functional._utils import _FillType from ._transform import _RandomApplyTransform @@ -81,13 +80,6 @@ class Resize(Transform): it can have arbitrary number of leading batch dimensions. For example, the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. - .. warning:: - The output image might be different depending on its type: when downsampling, the interpolation of PIL images - and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences - in the performance of a network. Therefore, it is preferable to train and serve a model with the same input - types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors - closer. - Args: size (sequence or int): Desired output size. 
If size is a sequence like (h, w), output size will be matched to this. If size is an int, @@ -117,7 +109,7 @@ class Resize(Transform): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -126,8 +118,8 @@ class Resize(Transform): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. """ _v1_transform_cls = _transforms.Resize @@ -137,21 +129,21 @@ def __init__( size: Union[int, Sequence[int]], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> None: super().__init__() if isinstance(size, int): size = [size] - elif isinstance(size, (list, tuple)) and len(size) in {1, 2}: + elif isinstance(size, Sequence) and len(size) in {1, 2}: size = list(size) else: raise ValueError( - f"size can either be an integer or a list or tuple of one or two integers, " f"but got {size} instead." + f"size can either be an integer or a sequence of one or two integers, but got {size} instead." ) self.size = size - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.max_size = max_size self.antialias = antialias @@ -231,7 +223,7 @@ class RandomResizedCrop(Transform): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -240,8 +232,8 @@ class RandomResizedCrop(Transform): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. 
""" _v1_transform_cls = _transforms.RandomResizedCrop @@ -252,7 +244,7 @@ def __init__( scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0), interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> None: super().__init__() self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") @@ -268,7 +260,7 @@ def __init__( self.scale = scale self.ratio = ratio - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.antialias = antialias self._log_ratio = torch.log(torch.tensor(self.ratio)) @@ -546,7 +538,7 @@ def __init__( self.side_range = side_range if side_range[0] < 1.0 or side_range[0] > side_range[1]: - raise ValueError(f"Invalid canvas side range provided {side_range}.") + raise ValueError(f"Invalid side range provided {side_range}.") def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: orig_h, orig_w = query_size(flat_inputs) @@ -622,7 +614,7 @@ def __init__( ) -> None: super().__init__() self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.expand = expand self.fill = fill @@ -724,7 +716,7 @@ def __init__( else: self.shear = shear - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.fill = fill self._fill = _setup_fill_arg(fill) @@ -969,7 +961,7 @@ def __init__( raise ValueError("Argument distortion_scale value should be between 0 and 1") self.distortion_scale = distortion_scale - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.fill = fill self._fill = _setup_fill_arg(fill) @@ -1070,7 +1062,7 @@ def __init__( self.alpha = _setup_number_or_seq(alpha, "alpha") self.sigma = _setup_number_or_seq(sigma, "sigma") - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.fill = fill self._fill = _setup_fill_arg(fill) @@ -1263,7 +1255,7 @@ class ScaleJitter(Transform): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -1272,8 +1264,8 @@ class ScaleJitter(Transform): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. 
""" def __init__( @@ -1281,12 +1273,12 @@ def __init__( target_size: Tuple[int, int], scale_range: Tuple[float, float] = (0.1, 2.0), interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ): super().__init__() self.target_size = target_size self.scale_range = scale_range - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: @@ -1330,7 +1322,7 @@ class RandomShortestSize(Transform): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -1339,8 +1331,8 @@ class RandomShortestSize(Transform): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. """ def __init__( @@ -1348,12 +1340,12 @@ def __init__( min_size: Union[List[int], Tuple[int], int], max_size: Optional[int] = None, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ): super().__init__() self.min_size = [min_size] if isinstance(min_size, int) else list(min_size) self.max_size = max_size - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: @@ -1411,7 +1403,7 @@ class RandomResize(Transform): tensors), antialiasing makes no sense and this parameter is ignored. Possible values are: - - ``True``: will apply antialiasing for bilinear or bicubic modes. + - ``True`` (default): will apply antialiasing for bilinear or bicubic modes. Other mode aren't affected. This is probably what you want to use. - ``False``: will not apply antialiasing for tensors on any mode. PIL images are still antialiased on bilinear or bicubic modes, because @@ -1420,8 +1412,8 @@ class RandomResize(Transform): PIL images. This value exists for legacy reasons and you probably don't want to use it unless you really know what you are doing. - The current default is ``None`` **but will change to** ``True`` **in - v0.17** for the PIL and Tensor backends to be consistent. + The default value changed from ``None`` to ``True`` in + v0.17, for the PIL and Tensor backends to be consistent. 
""" def __init__( @@ -1429,12 +1421,12 @@ def __init__( min_size: int, max_size: int, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> None: super().__init__() self.min_size = min_size self.max_size = max_size - self.interpolation = _check_interpolation(interpolation) + self.interpolation = interpolation self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index c909a17becc..9888fb2a476 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -79,7 +79,7 @@ def _transform( class ToPureTensor(Transform): - """[BETA] Convert all tv_tensors to pure tensors, removing associated metadata (if any). + """[BETA] Convert all TVTensors to pure tensors, removing associated metadata (if any). .. v2betastatus:: ToPureTensor transform diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index c41b910c586..d6d42344fcb 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -11,7 +11,6 @@ from torchvision.transforms import _functional_pil as _FP from torchvision.transforms._functional_tensor import _pad_symmetric from torchvision.transforms.functional import ( - _check_antialias, _compute_resized_output_size as __compute_resized_output_size, _get_perspective_coeffs, _interpolation_modes_from_int, @@ -177,7 +176,7 @@ def resize( size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: """[BETA] See :class:`~torchvision.transforms.v2.Resize` for details.""" if torch.jit.is_scripting(): @@ -196,17 +195,15 @@ def resize_image( size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: interpolation = _check_interpolation(interpolation) - antialias = _check_antialias(img=image, antialias=antialias, interpolation=interpolation) - assert not isinstance(antialias, str) antialias = False if antialias is None else antialias align_corners: Optional[bool] = None if interpolation == InterpolationMode.BILINEAR or interpolation == InterpolationMode.BICUBIC: align_corners = False else: - # The default of antialias should be True from 0.17, so we don't warn or + # The default of antialias is True from 0.17, so we don't warn or # error if other interpolation modes are used. This is documented. antialias = False @@ -297,7 +294,7 @@ def __resize_image_pil_dispatch( size: Union[Sequence[int], int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> PIL.Image.Image: if antialias is False: warnings.warn("Anti-alias option is always applied for PIL Image input. 
Argument antialias is ignored.") @@ -361,7 +358,7 @@ def resize_video( size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, max_size: Optional[int] = None, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: return resize_image(video, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) @@ -554,19 +551,30 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in def _apply_grid_transform(img: torch.Tensor, grid: torch.Tensor, mode: str, fill: _FillTypeJIT) -> torch.Tensor: + input_shape = img.shape + output_height, output_width = grid.shape[1], grid.shape[2] + num_channels, input_height, input_width = input_shape[-3:] + output_shape = input_shape[:-3] + (num_channels, output_height, output_width) + + if img.numel() == 0: + return img.reshape(output_shape) + + img = img.reshape(-1, num_channels, input_height, input_width) + squashed_batch_size = img.shape[0] # We are using context knowledge that grid should have float dtype fp = img.dtype == grid.dtype float_img = img if fp else img.to(grid.dtype) - shape = float_img.shape - if shape[0] > 1: + if squashed_batch_size > 1: # Apply same grid to a batch of images - grid = grid.expand(shape[0], -1, -1, -1) + grid = grid.expand(squashed_batch_size, -1, -1, -1) # Append a dummy mask for customized fill colors, should be faster than grid_sample() twice if fill is not None: - mask = torch.ones((shape[0], 1, shape[2], shape[3]), dtype=float_img.dtype, device=float_img.device) + mask = torch.ones( + (squashed_batch_size, 1, input_height, input_width), dtype=float_img.dtype, device=float_img.device + ) float_img = torch.cat((float_img, mask), dim=1) float_img = grid_sample(float_img, grid, mode=mode, padding_mode="zeros", align_corners=False) @@ -587,7 +595,7 @@ def _apply_grid_transform(img: torch.Tensor, grid: torch.Tensor, mode: str, fill img = float_img.round_().to(img.dtype) if not fp else float_img - return img + return img.reshape(output_shape) def _assert_grid_transform_inputs( @@ -664,24 +672,10 @@ def affine_image( ) -> torch.Tensor: interpolation = _check_interpolation(interpolation) - if image.numel() == 0: - return image - - shape = image.shape - ndim = image.ndim - - if ndim > 4: - image = image.reshape((-1,) + shape[-3:]) - needs_unsquash = True - elif ndim == 3: - image = image.unsqueeze(0) - needs_unsquash = True - else: - needs_unsquash = False - - height, width = shape[-2:] angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center) + height, width = image.shape[-2:] + center_f = [0.0, 0.0] if center is not None: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. 
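The hunks above move the batch-dimension handling out of the individual kernels and into _apply_grid_transform itself. As a minimal illustrative sketch (not the torchvision code; the helper name apply_grid_sketch and its exact shapes are assumptions made here for illustration), the pattern is: collapse all leading dimensions into one batch dimension, run grid_sample once, then reshape back around the new spatial size.

import torch
from torch.nn.functional import grid_sample

def apply_grid_sketch(img: torch.Tensor, grid: torch.Tensor, mode: str = "bilinear") -> torch.Tensor:
    # img: (..., C, H, W); grid: (1, H_out, W_out, 2) with a float dtype.
    input_shape = img.shape
    out_h, out_w = grid.shape[1], grid.shape[2]
    channels, in_h, in_w = input_shape[-3:]
    output_shape = input_shape[:-3] + (channels, out_h, out_w)

    # Degenerate (empty) inputs only need their shape adjusted.
    if img.numel() == 0:
        return img.reshape(output_shape)

    # Collapse every leading dimension into a single batch dimension.
    batch = img.reshape(-1, channels, in_h, in_w)
    needs_round = not batch.is_floating_point()
    batch = batch.to(grid.dtype)
    grid = grid.expand(batch.shape[0], -1, -1, -1)

    out = grid_sample(batch, grid, mode=mode, padding_mode="zeros", align_corners=False)
    if needs_round:
        out = out.round_().to(img.dtype)

    # Restore the original leading dimensions around the new spatial size.
    return out.reshape(output_shape)

Because the restore step reuses the original leading dimensions, kernels such as affine_image, rotate_image, perspective_image, and elastic_image no longer need the per-kernel squash/unsquash branches that the surrounding hunks delete.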
@@ -695,12 +689,7 @@ def affine_image( dtype = image.dtype if torch.is_floating_point(image) else torch.float32 theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3) grid = _affine_grid(theta, w=width, h=height, ow=width, oh=height) - output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) - - if needs_unsquash: - output = output.reshape(shape) - - return output + return _apply_grid_transform(image, grid, interpolation.value, fill=fill) @_register_kernel_internal(affine, PIL.Image.Image) @@ -972,35 +961,26 @@ def rotate_image( ) -> torch.Tensor: interpolation = _check_interpolation(interpolation) - shape = image.shape - num_channels, height, width = shape[-3:] + input_height, input_width = image.shape[-2:] center_f = [0.0, 0.0] if center is not None: # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. - center_f = [(c - s * 0.5) for c, s in zip(center, [width, height])] + center_f = [(c - s * 0.5) for c, s in zip(center, [input_width, input_height])] # due to current incoherence of rotation angle direction between affine and rotate implementations # we need to set -angle. matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0]) - if image.numel() > 0: - image = image.reshape(-1, num_channels, height, width) - - _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"]) - - ow, oh = _compute_affine_output_size(matrix, width, height) if expand else (width, height) - dtype = image.dtype if torch.is_floating_point(image) else torch.float32 - theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3) - grid = _affine_grid(theta, w=width, h=height, ow=ow, oh=oh) - output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) - - new_height, new_width = output.shape[-2:] - else: - output = image - new_width, new_height = _compute_affine_output_size(matrix, width, height) if expand else (width, height) + _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"]) - return output.reshape(shape[:-3] + (num_channels, new_height, new_width)) + output_width, output_height = ( + _compute_affine_output_size(matrix, input_width, input_height) if expand else (input_width, input_height) + ) + dtype = image.dtype if torch.is_floating_point(image) else torch.float32 + theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3) + grid = _affine_grid(theta, w=input_width, h=input_height, ow=output_width, oh=output_height) + return _apply_grid_transform(image, grid, interpolation.value, fill=fill) @_register_kernel_internal(rotate, PIL.Image.Image) @@ -1512,21 +1492,6 @@ def perspective_image( perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients) interpolation = _check_interpolation(interpolation) - if image.numel() == 0: - return image - - shape = image.shape - ndim = image.ndim - - if ndim > 4: - image = image.reshape((-1,) + shape[-3:]) - needs_unsquash = True - elif ndim == 3: - image = image.unsqueeze(0) - needs_unsquash = True - else: - needs_unsquash = False - _assert_grid_transform_inputs( image, matrix=None, @@ -1536,15 +1501,10 @@ def perspective_image( coeffs=perspective_coeffs, ) - oh, ow = shape[-2:] + oh, ow = image.shape[-2:] dtype = image.dtype if torch.is_floating_point(image) else torch.float32 grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device) - output = 
_apply_grid_transform(image, grid, interpolation.value, fill=fill) - - if needs_unsquash: - output = output.reshape(shape) - - return output + return _apply_grid_transform(image, grid, interpolation.value, fill=fill) @_register_kernel_internal(perspective, PIL.Image.Image) @@ -1762,12 +1722,7 @@ def elastic_image( interpolation = _check_interpolation(interpolation) - if image.numel() == 0: - return image - - shape = image.shape - ndim = image.ndim - + height, width = image.shape[-2:] device = image.device dtype = image.dtype if torch.is_floating_point(image) else torch.float32 @@ -1778,32 +1733,18 @@ def elastic_image( dtype = torch.float32 # We are aware that if input image dtype is uint8 and displacement is float64 then - # displacement will be casted to float32 and all computations will be done with float32 + # displacement will be cast to float32 and all computations will be done with float32 # We can fix this later if needed - expected_shape = (1,) + shape[-2:] + (2,) + expected_shape = (1, height, width, 2) if expected_shape != displacement.shape: raise ValueError(f"Argument displacement shape should be {expected_shape}, but given {displacement.shape}") - if ndim > 4: - image = image.reshape((-1,) + shape[-3:]) - needs_unsquash = True - elif ndim == 3: - image = image.unsqueeze(0) - needs_unsquash = True - else: - needs_unsquash = False - - if displacement.dtype != dtype or displacement.device != device: - displacement = displacement.to(dtype=dtype, device=device) - - image_height, image_width = shape[-2:] - grid = _create_identity_grid((image_height, image_width), device=device, dtype=dtype).add_(displacement) + grid = _create_identity_grid((height, width), device=device, dtype=dtype).add_( + displacement.to(dtype=dtype, device=device) + ) output = _apply_grid_transform(image, grid, interpolation.value, fill=fill) - if needs_unsquash: - output = output.reshape(shape) - if is_cpu_half: output = output.to(torch.float16) @@ -2066,7 +2007,7 @@ def resized_crop( width: int, size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: """[BETA] See :class:`~torchvision.transforms.v2.RandomResizedCrop` for details.""" if torch.jit.is_scripting(): @@ -2106,7 +2047,7 @@ def resized_crop_image( width: int, size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: image = crop_image(image, top, left, height, width) return resize_image(image, size, interpolation=interpolation, antialias=antialias) @@ -2134,7 +2075,7 @@ def _resized_crop_image_pil_dispatch( width: int, size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> PIL.Image.Image: if antialias is False: warnings.warn("Anti-alias option is always applied for PIL Image input. 
Argument antialias is ignored.") @@ -2203,7 +2144,7 @@ def resized_crop_video( width: int, size: List[int], interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, - antialias: Optional[Union[str, bool]] = "warn", + antialias: Optional[bool] = True, ) -> torch.Tensor: return resized_crop_image( video, top, left, height, width, antialias=antialias, size=size, interpolation=interpolation diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 61e21ef8175..5e045391630 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -261,7 +261,7 @@ def clamp_bounding_boxes( if torch.jit.is_scripting() or is_pure_tensor(inpt): if format is None or canvas_size is None: - raise ValueError("For pure tensor inputs, `format` and `canvas_size` has to be passed.") + raise ValueError("For pure tensor inputs, `format` and `canvas_size` have to be passed.") return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size) elif isinstance(inpt, tv_tensors.BoundingBoxes): if format is not None or canvas_size is not None: diff --git a/torchvision/transforms/v2/functional/_type_conversion.py b/torchvision/transforms/v2/functional/_type_conversion.py index 062f85198ee..02aeda83df3 100644 --- a/torchvision/transforms/v2/functional/_type_conversion.py +++ b/torchvision/transforms/v2/functional/_type_conversion.py @@ -17,7 +17,9 @@ def to_image(inpt: Union[torch.Tensor, PIL.Image.Image, np.ndarray]) -> tv_tenso elif isinstance(inpt, torch.Tensor): output = inpt else: - raise TypeError(f"Input can either be a numpy array or a PIL image, but got {type(inpt)} instead.") + raise TypeError( + f"Input can either be a pure Tensor, a numpy array, or a PIL image, but got {type(inpt)} instead." + ) return tv_tensors.Image(output)
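For reference, a short usage sketch of the behavior these changes converge on (illustrative only, assuming a torchvision build where the v2 transforms are available): antialias now simply defaults to True for the resize-style transforms and kernels, and to_image accepts pure tensors in addition to numpy arrays and PIL images.

import numpy as np
import torch
from torchvision.transforms import v2
from torchvision.transforms.v2 import functional as F

img = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)

# antialias=True is now the default, so these two calls should match exactly.
out_default = v2.Resize(size=32)(img)
out_explicit = v2.Resize(size=32, antialias=True)(img)
assert torch.equal(out_default, out_explicit)

# The functional kernel follows the same default.
out_fn = F.resize(img, size=[32], antialias=True)

# to_image now also accepts a pure tensor, besides numpy arrays and PIL images.
img_from_array = F.to_image(np.zeros((64, 64, 3), dtype=np.uint8))
img_from_tensor = F.to_image(img)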