From d87475fbb390a6b6321e63f7041ca3482520ddb7 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 16 Dec 2024 10:33:33 -0500 Subject: [PATCH 1/2] implement fused kernels for all delta_e color calculation functions --- .../cucim/src/cucim/skimage/color/delta_e.py | 305 +++++++++++------- .../cucim/skimage/color/tests/test_delta_e.py | 9 +- 2 files changed, 195 insertions(+), 119 deletions(-) diff --git a/python/cucim/src/cucim/skimage/color/delta_e.py b/python/cucim/src/cucim/skimage/color/delta_e.py index f5ad592a9..a580dac84 100644 --- a/python/cucim/src/cucim/skimage/color/delta_e.py +++ b/python/cucim/src/cucim/skimage/color/delta_e.py @@ -24,7 +24,7 @@ import numpy as np from .._shared.utils import _supported_float_type -from .colorconv import _cart2polar_2pi, lab2lch +from .colorconv import lab2lch def _float_inputs(lab1, lab2, allow_float32=True): @@ -37,6 +37,20 @@ def _float_inputs(lab1, lab2, allow_float32=True): return lab1, lab2 +_cie76_kernel = cp.ElementwiseKernel( + "X L1, X a1, X b1, X L2, X a2, X b2", + "X out", + """ +// accumulate the squared channel differences in the input precision X +X tmp = (L2 - L1) * (L2 - L1); +tmp += (a2 - a1) * (a2 - a1); +tmp += (b2 - b1) * (b2 - b1); +out = sqrt(tmp); +""", + name="cie76_internal", +) + + def deltaE_cie76(lab1, lab2, channel_axis=-1): """Euclidean distance between two points in Lab color space @@ -64,10 +78,31 @@ def deltaE_cie76(lab1, lab2, channel_axis=-1): lab1, lab2 = _float_inputs(lab1, lab2, allow_float32=True) L1, a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[:3] L2, a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[:3] - out = (L2 - L1) * (L2 - L1) - out += (a2 - a1) * (a2 - a1) - out += (b2 - b1) * (b2 - b1) - return cp.sqrt(out, out=out) + out = _cie76_kernel(L1, a1, b1, L2, a2, b2) + return out + + +_ciede94_kernel = cp.ElementwiseKernel( + "X dH2, X L1, X C1, X L2, X C2, float64 k1, float64 k2, float64 kL, float64 kH, float64 kC", # noqa: E501 + "X dE2", + 
""" +X dL = L1 - L2; +X dC = C1 - C2; +X SL = 1; +X SC = 1 + k1 * C1; +X SH = 1 + k2 * C1; +dE2 = dL / (kL * SL); +dE2 *= dE2; +X tmp = dC / (kC * SC); +tmp *= tmp; +dE2 += tmp; +tmp = kH * SH; +tmp *= tmp; +dE2 += dH2 / tmp; +dE2 = sqrt(max(dE2, 0.0)); +""", + name="ciede94_internal", +) def deltaE_ciede94( @@ -132,23 +167,114 @@ def deltaE_ciede94( L1, C1 = lab2lch(lab1, channel_axis=0)[:2] L2, C2 = lab2lch(lab2, channel_axis=0)[:2] - dL = L1 - L2 - dC = C1 - C2 dH2 = get_dH2(lab1, lab2, channel_axis=0) + return _ciede94_kernel(dH2, L1, C1, L2, C2, k1, k2, kL, kH, kC) + - SL = 1 - SC = 1 + k1 * C1 - SH = 1 + k2 * C1 +_ciede2000_kernel = cp.ElementwiseKernel( + "X a1, X b1, X a2, X b2, X L1, X L2, float64 kL, float64 kC, float64 kH", + "X dE2", + """ - dE2 = dL / (kL * SL) - dE2 *= dE2 - tmp = dC / (kC * SC) - tmp *= tmp - dE2 += tmp - tmp = kH * SH - tmp *= tmp - dE2 += dH2 / tmp - return cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2) +// distort `a` based on average chroma +// then convert to lch coordinates from distorted `a` +// all subsequence calculations are in the new coordinates +// (often denoted "prime" in the literature) + +// cide2000 has four terms to delta_e: +// 1) Luminance term +// 2) Hue term +// 3) Chroma term +// 4) hue Rotation term + +// use double for the intermediate calculation of G to preserve accuracy +double G = 0.5 * (hypot((double)a1, (double)b1) + hypot((double)a2, (double)b2)); +G = pow(G, 7.0); +G = sqrt(G / (G + 6103515625)); +X scale = 1.0 + 0.5 * (1.0 - G); + +X C1 = hypot(a1 * scale, b1); +X h1 = atan2(b1, a1 * scale); +if (h1 < 0) { + h1 += 2 * M_PI; +} + +X C2 = hypot(a2 * scale, b2); +X h2 = atan2(b2, a2 * scale); +if (h2 < 0) { + h2 += 2 * M_PI; +} + +// lightness term +X Lbar = 0.5 * (L1 + L2); +X tmp = Lbar - 50; +tmp *= tmp; +X SL = 1.0 + 0.015 * tmp / sqrt(20.0 + tmp); +X L_term = (L2 - L1) / (kL * SL); + +// chroma term +X Cbar = 0.5 * (C1 + C2); +X SC = 1.0 + 0.045 * Cbar; +X C_term = (C2 - C1) / (kC * SC); + +X 
h_diff = h2 - h1; +X h_sum = h1 + h2; +X CC = C1 * C2; +X dH = h_diff; +if (CC == 0.0) { + dH = 0; +} else { + if (h_diff > M_PI) { + dH -= 2 * M_PI; + } + if (h_diff < -M_PI) { + dH += 2 * M_PI; + } +} +X dH_term = 2 * sqrt(CC) * sin(dH / 2.0); + +X Hbar = h_sum; +if (CC != 0.0) { + if (abs(h_diff) > M_PI) { + if (h_sum < 2.0 * M_PI) { + Hbar += 2.0 * M_PI; + } + if (h_sum >= 2.0 * M_PI) { + Hbar -= 2 * M_PI; + } + } +} +if (CC == 0.0) { + Hbar *= 2; +} +Hbar *= 0.5; + +X T = 1.0 - 0.17 * cos(Hbar - 30.0 / 180.0 * M_PI) + + 0.24 * cos(2 * Hbar) + + 0.32 * cos(3 * Hbar + 6.0 / 180.0 * M_PI) + - 0.20 * cos(4 * Hbar - 63.0 / 180.0 * M_PI); +X SH = 1 + 0.015 * Cbar * T; +X H_term = dH_term / (kH * SH); + +X c7 = pow(Cbar, (X)7.0); +X c7_term = sqrt(c7 / (c7 + 6103515625)); +X Rc = 2.0 * c7_term; + +// hue rotation +tmp = (Hbar- 4.799655442984406) / 0.4363323129985824; +tmp *= tmp; +X dtheta = 0.5235987755982988 * exp(-tmp); +X R_term = -sin(2 * dtheta) * Rc * C_term * H_term; + +// put it all together +dE2 = L_term * L_term; +dE2 += C_term * C_term; +dE2 += H_term * H_term; +dE2 += R_term; +dE2 = sqrt(max(dE2, 0.0)); +""", # noqa: E501 + name="deltaE_ciede2000_internal", +) def deltaE_ciede2000(lab1, lab2, kL=1, kC=1, kH=1, *, channel_axis=-1): @@ -210,84 +336,43 @@ def deltaE_ciede2000(lab1, lab2, kL=1, kC=1, kH=1, *, channel_axis=-1): L1, a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[:3] L2, a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[:3] - # distort `a` based on average chroma - # then convert to lch coordinates from distorted `a` - # all subsequence calculations are in the new coordinates - # (often denoted "prime" in the literature) - Cbar = 0.5 * (cp.hypot(a1, b1) + cp.hypot(a2, b2)) - c7 = Cbar**7 - G = 0.5 * (1 - cp.sqrt(c7 / (c7 + 25**7))) - scale = 1 + G - C1, h1 = _cart2polar_2pi(a1 * scale, b1) - C2, h2 = _cart2polar_2pi(a2 * scale, b2) - # recall that c, h are polar coordinates. 
c==r, h==theta - - # cide2000 has four terms to delta_e: - # 1) Luminance term - # 2) Hue term - # 3) Chroma term - # 4) hue Rotation term - - # lightness term - Lbar = 0.5 * (L1 + L2) - tmp = Lbar - 50 - tmp *= tmp - SL = 1 + 0.015 * tmp / cp.sqrt(20 + tmp) - L_term = (L2 - L1) / (kL * SL) - - # chroma term - Cbar = 0.5 * (C1 + C2) # new coordinates - SC = 1 + 0.045 * Cbar - C_term = (C2 - C1) / (kC * SC) - - # hue term - h_diff = h2 - h1 - h_sum = h1 + h2 - CC = C1 * C2 - - dH = h_diff.copy() - dH[h_diff > np.pi] -= 2 * np.pi - dH[h_diff < -np.pi] += 2 * np.pi - dH[CC == 0.0] = 0.0 # if r == 0, dtheta == 0 - dH_term = 2 * cp.sqrt(CC) * cp.sin(dH / 2) - - Hbar = h_sum.copy() - mask = cp.logical_and(CC != 0.0, cp.abs(h_diff) > np.pi) - Hbar[mask * (h_sum < 2 * np.pi)] += 2 * np.pi - Hbar[mask * (h_sum >= 2 * np.pi)] -= 2 * np.pi - Hbar[CC == 0.0] *= 2 - Hbar *= 0.5 - - T = ( - 1 - - 0.17 * cp.cos(Hbar - np.deg2rad(30)) - + 0.24 * cp.cos(2 * Hbar) - + 0.32 * cp.cos(3 * Hbar + np.deg2rad(6)) - - 0.20 * cp.cos(4 * Hbar - np.deg2rad(63)) - ) - SH = 1 + 0.015 * Cbar * T - - H_term = dH_term / (kH * SH) - - # hue rotation - c7 = Cbar**7 - Rc = 2 * cp.sqrt(c7 / (c7 + 25**7)) - tmp = (cp.rad2deg(Hbar) - 275) / 25 - tmp *= tmp - dtheta = np.deg2rad(30) * cp.exp(-tmp) - R_term = -cp.sin(2 * dtheta) * Rc * C_term * H_term - - # put it all together - dE2 = L_term * L_term - dE2 += C_term * C_term - dE2 += H_term * H_term - dE2 += R_term - cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2) + dE2 = _ciede2000_kernel(a1, b1, a2, b2, L1, L2, kL, kC, kH) if unroll: dE2 = dE2[0] return dE2 +_cmc_kernel = cp.ElementwiseKernel( + "X C1, X L1, X h1, X C2, X L2, X dH2, float64 kL, float64 kC", + "X dE2", + """ + +X dC = C1 - C2; +X dL = L1 - L2; + +X T = (h1 >= 2.8623399732707004 && h1 <= 6.021385919380437) ? + 0.56 + 0.2 * abs(cos(h1 + 2.9321531433504737)) : + 0.36 + 0.4 * abs(cos(h1 + 0.6108652381980153)); + +X tmp = pow(C1, (X)4.0); +X F = sqrt(tmp / (tmp + 1900)); + +X SL = (L1 < 16) ? 
0.511 : 0.040975 * L1 / (1.0 + 0.01765 * L1); +X SC = 0.638 + 0.0638 * C1 / (1.0 + 0.0131 * C1); +X SH = SC * (F * T + 1 - F); + +dE2 = dL / (kL * SL); +dE2 *= dE2; +tmp = dC / (kC * SC); +tmp *= tmp; +dE2 += tmp; +dE2 += dH2 / (SH * SH); +dE2 = sqrt(max(dE2, 0.0)); +""", + name="deltaE_cmc_internal", +) + + def deltaE_cmc(lab1, lab2, kL=1, kC=1, *, channel_axis=-1): """Color difference from the CMC l:c standard. @@ -336,26 +421,21 @@ def deltaE_cmc(lab1, lab2, kL=1, kC=1, *, channel_axis=-1): L1, C1, h1 = lab2lch(lab1, channel_axis=0)[:3] L2, C2, h2 = lab2lch(lab2, channel_axis=0)[:3] - dC = C1 - C2 - dL = L1 - L2 dH2 = get_dH2(lab1, lab2, channel_axis=0) + return _cmc_kernel(C1, L1, h1, C2, L2, dH2, kL, kC) - T = cp.where( - cp.logical_and(cp.rad2deg(h1) >= 164, cp.rad2deg(h1) <= 345), - 0.56 + 0.2 * cp.abs(np.cos(h1 + cp.deg2rad(168))), - 0.36 + 0.4 * cp.abs(np.cos(h1 + cp.deg2rad(35))), - ) - c1_4 = C1**4 - F = cp.sqrt(c1_4 / (c1_4 + 1900)) - - SL = cp.where(L1 < 16, 0.511, 0.040975 * L1 / (1.0 + 0.01765 * L1)) - SC = 0.638 + 0.0638 * C1 / (1.0 + 0.0131 * C1) - SH = SC * (F * T + 1 - F) - dE2 = (dL / (kL * SL)) ** 2 - dE2 += (dC / (kC * SC)) ** 2 - dE2 += dH2 / (SH**2) - return cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2) +_dH2_kernel = cp.ElementwiseKernel( + "X a1, X b1, X a2, X b2", + "X out", + """ +// use double for the chroma magnitudes C1 and C2 to preserve accuracy +double C1 = hypot(a1, b1); +double C2 = hypot(a2, b2); +out = 2.0 * ((C1 * C2) - (a1 * a2 + b1 * b2)); +""", + name="dH2_internal", +) def get_dH2(lab1, lab2, *, channel_axis=-1): @@ -384,12 +464,7 @@ def get_dH2(lab1, lab2, *, channel_axis=-1): a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[1:3] a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[1:3] - # magnitude of (a, b) is the chroma - C1 = cp.hypot(a1, b1) - C2 = cp.hypot(a2, b2) - - term = (C1 * C2) - (a1 * a2 + b1 * b2) - out = 2 * term + out = _dH2_kernel(a1, b1, a2, b2) if input_is_float_32: out = 
out.astype(np.float32) return out diff --git a/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py b/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py index 95af3a506..a2d37b30c 100644 --- a/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py +++ b/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py @@ -6,7 +6,6 @@ from cupy.testing import ( assert_allclose, assert_array_almost_equal, - assert_array_equal, ) from cucim.skimage._shared.testing import expected_warnings, fetch @@ -36,7 +35,9 @@ def test_ciede2000_dE(dtype, channel_axis): lab1 = cp.moveaxis(cp.asarray(lab1), source=-1, destination=channel_axis) lab2 = cp.moveaxis(cp.asarray(lab2), source=-1, destination=channel_axis) - dE2 = deltaE_ciede2000(lab1, lab2, channel_axis=channel_axis) + msg = "numerical accuracy of this function on the GPU is reduced" + with pytest.warns(UserWarning, match=msg): + dE2 = deltaE_ciede2000(lab1, lab2, channel_axis=channel_axis) assert dE2.dtype == _supported_float_type(dtype) # Note: lower float64 accuracy than scikit-image @@ -199,10 +200,10 @@ def test_cmc(dtype, channel_axis): def test_cmc_single_item(): # Single item case: lab1 = lab2 = cp.array([0.0, 1.59607713, 0.87755709]) - assert_array_equal(deltaE_cmc(lab1, lab2), 0) + assert_array_almost_equal(deltaE_cmc(lab1, lab2), 0) lab2[0] += cp.finfo(float).eps - assert_array_equal(deltaE_cmc(lab1, lab2), 0) + assert_array_almost_equal(deltaE_cmc(lab1, lab2), 0) def test_single_color_cie76(): From fc954072bd132739c2ba0cbe8df8d0e1e4df4c35 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Thu, 9 Jan 2025 14:08:53 -0500 Subject: [PATCH 2/2] add benchmark cases for the deltaE color distance functions --- benchmarks/skimage/cucim_color_bench.py | 48 ++++++++++++++++++++++++ benchmarks/skimage/run-nv-bench-color.sh | 11 ++++++ 2 files changed, 59 insertions(+) diff --git a/benchmarks/skimage/cucim_color_bench.py b/benchmarks/skimage/cucim_color_bench.py index e4fe9fe95..13df14c80 100644 --- 
a/benchmarks/skimage/cucim_color_bench.py +++ b/benchmarks/skimage/cucim_color_bench.py @@ -1,4 +1,5 @@ import argparse +import math import os import pickle @@ -25,6 +26,10 @@ "lab2xyz", "rgba2rgb", "label2rgb", + "deltaE_cie76", + "deltaE_ciede94", + "deltaE_ciede2000", + "deltaE_cmc", ] @@ -40,6 +45,32 @@ def set_args(self, dtype): self.args_gpu = (imaged,) +class DeltaEBench(ImageBench): + def set_args(self, dtype): + from skimage import color, data + + # create synthetic lab image pair + rgb1 = data.astronaut() + lab1 = color.rgb2lab(rgb1) + lab2 = color.rgb2lab(np.roll(rgb1, (1, 1), axis=(0, 1))) + + # change to desired dtype + lab1 = lab1.astype(dtype, copy=False) + lab2 = lab2.astype(dtype, copy=False) + + # tile then crop as needed to get the expected size + n_tile0 = math.ceil(self.shape[0] / lab1.shape[0]) + n_tile1 = math.ceil(self.shape[1] / lab1.shape[1]) + lab1 = np.tile(lab1, (n_tile0, n_tile1, 1)) + lab1 = lab1[: self.shape[0], : self.shape[1], :] + lab2 = np.tile(lab2, (n_tile0, n_tile1, 1)) + lab2 = lab2[: self.shape[0], : self.shape[1], :] + + print(f"{lab1.shape=}") + self.args_cpu = (lab1, lab2) + self.args_gpu = (cp.asarray(lab1), cp.asarray(lab2)) + + class RGBABench(ImageBench): def set_args(self, dtype): if self.shape[-1] != 4: @@ -162,6 +193,23 @@ def main(args): results = B.run_benchmark(duration=args.duration) all_results = pd.concat([all_results, results["full"]]) + elif function_name.startswith("deltaE"): + # only run these functions for floating point data types + float_dtypes = [t for t in dtypes if np.dtype(t).kind == "f"] + + B = DeltaEBench( + function_name=function_name, + shape=shape + (3,), + dtypes=float_dtypes, + fixed_kwargs={}, + var_kwargs={}, + # index_str=f"{fromspace.lower()}2{tospace.lower()}", + module_cpu=skimage.color, + module_gpu=cucim.skimage.color, + run_cpu=run_cpu, + ) + results = B.run_benchmark(duration=args.duration) + all_results = pd.concat([all_results, results["full"]]) elif function_name == "rgba2rgb": 
B = RGBABench( function_name="rgba2rgb", diff --git a/benchmarks/skimage/run-nv-bench-color.sh b/benchmarks/skimage/run-nv-bench-color.sh index 14a0809ec..4d88af9cc 100755 --- a/benchmarks/skimage/run-nv-bench-color.sh +++ b/benchmarks/skimage/run-nv-bench-color.sh @@ -9,3 +9,14 @@ for shape in "${param_shape[@]}"; do done done done + +param_shape=(512,512 3840,2160) +param_filt=(deltaE_cie76 deltaE_ciede94 deltaE_ciede2000 deltaE_cmc) +param_dt=(float32 float64) +for shape in "${param_shape[@]}"; do + for filt in "${param_filt[@]}"; do + for dt in "${param_dt[@]}"; do + python cucim_color_bench.py -f $filt -i $shape -d $dt -t 10 + done + done +done