From d87475fbb390a6b6321e63f7041ca3482520ddb7 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Mon, 16 Dec 2024 10:33:33 -0500
Subject: [PATCH 1/2] implement fused kernels for all delta_e color calculation
 functions

---
 .../cucim/src/cucim/skimage/color/delta_e.py  | 305 +++++++++++-------
 .../cucim/skimage/color/tests/test_delta_e.py |   9 +-
 2 files changed, 195 insertions(+), 119 deletions(-)

diff --git a/python/cucim/src/cucim/skimage/color/delta_e.py b/python/cucim/src/cucim/skimage/color/delta_e.py
index f5ad592a9..a580dac84 100644
--- a/python/cucim/src/cucim/skimage/color/delta_e.py
+++ b/python/cucim/src/cucim/skimage/color/delta_e.py
@@ -24,7 +24,7 @@
 import numpy as np
 
 from .._shared.utils import _supported_float_type
-from .colorconv import _cart2polar_2pi, lab2lch
+from .colorconv import lab2lch
 
 
 def _float_inputs(lab1, lab2, allow_float32=True):
@@ -37,6 +37,20 @@ def _float_inputs(lab1, lab2, allow_float32=True):
     return lab1, lab2
 
 
+_cie76_kernel = cp.ElementwiseKernel(
+    "X L1, X a1, X b1, X L2, X a2, X b2",
+    "X out",
+    """
+// accumulate the squared Euclidean distance in the input precision (X)
+X tmp = (L2 - L1) * (L2 - L1);
+tmp += (a2 - a1) * (a2 - a1);
+tmp += (b2 - b1) * (b2 - b1);
+out = sqrt(tmp);
+""",
+    name="cie76_internal",
+)
+
+
 def deltaE_cie76(lab1, lab2, channel_axis=-1):
     """Euclidean distance between two points in Lab color space
 
@@ -64,10 +78,31 @@ def deltaE_cie76(lab1, lab2, channel_axis=-1):
     lab1, lab2 = _float_inputs(lab1, lab2, allow_float32=True)
     L1, a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[:3]
     L2, a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[:3]
-    out = (L2 - L1) * (L2 - L1)
-    out += (a2 - a1) * (a2 - a1)
-    out += (b2 - b1) * (b2 - b1)
-    return cp.sqrt(out, out=out)
+    out = _cie76_kernel(L1, a1, b1, L2, a2, b2)
+    return out
+
+
+_ciede94_kernel = cp.ElementwiseKernel(
+    "X dH2, X L1, X C1, X L2, X C2, float64 k1, float64 k2, float64 kL, float64 kH, float64 kC",  # noqa: E501
+    "X dE2",  # holds the final distance: the sqrt is applied in-kernel
+    """
+X dL = L1 - L2;
+X dC = C1 - C2;
+X SL = 1;
+X SC = 1 + k1 * C1;
+X SH = 1 + k2 * C1;
+dE2 = dL / (kL * SL);
+dE2 *= dE2;
+X tmp = dC / (kC * SC);
+tmp *= tmp;
+dE2 += tmp;
+tmp = kH * SH;
+tmp *= tmp;
+dE2 += dH2 / tmp;
+dE2 = sqrt(max(dE2, 0.0));
+""",
+    name="ciede94_internal",
+)
 
 
 def deltaE_ciede94(
@@ -132,23 +167,114 @@ def deltaE_ciede94(
     L1, C1 = lab2lch(lab1, channel_axis=0)[:2]
     L2, C2 = lab2lch(lab2, channel_axis=0)[:2]
 
-    dL = L1 - L2
-    dC = C1 - C2
     dH2 = get_dH2(lab1, lab2, channel_axis=0)
+    return _ciede94_kernel(dH2, L1, C1, L2, C2, k1, k2, kL, kH, kC)
+
 
-    SL = 1
-    SC = 1 + k1 * C1
-    SH = 1 + k2 * C1
+_ciede2000_kernel = cp.ElementwiseKernel(
+    "X a1, X b1, X a2, X b2, X L1, X L2, float64 kL, float64 kC, float64 kH",
+    "X dE2",
+    """
 
-    dE2 = dL / (kL * SL)
-    dE2 *= dE2
-    tmp = dC / (kC * SC)
-    tmp *= tmp
-    dE2 += tmp
-    tmp = kH * SH
-    tmp *= tmp
-    dE2 += dH2 / tmp
-    return cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2)
+// distort `a` based on average chroma
+// then convert to lch coordinates from distorted `a`
+// all subsequent calculations are in the new coordinates
+// (often denoted "prime" in the literature)
+
+// CIEDE2000 has four terms to delta_e:
+// 1) Luminance term
+// 2) Hue term
+// 3) Chroma term
+// 4) hue Rotation term
+
+// use double for the intermediate calculation of G to preserve accuracy
+double G = 0.5 * (hypot((double)a1, (double)b1) + hypot((double)a2, (double)b2));
+G = pow(G, 7.0);
+G = sqrt(G / (G + 6103515625));  // 6103515625 == 25^7
+X scale = 1.0 + 0.5 * (1.0 - G);
+
+X C1 = hypot(a1 * scale, b1);
+X h1 = atan2(b1, a1 * scale);
+if (h1 < 0) {
+    h1 += 2 * M_PI;
+}
+
+X C2 = hypot(a2 * scale, b2);
+X h2 = atan2(b2, a2 * scale);
+if (h2 < 0) {
+    h2 += 2 * M_PI;
+}
+
+// lightness term
+X Lbar = 0.5 * (L1 + L2);
+X tmp = Lbar - 50;
+tmp *= tmp;
+X SL = 1.0 + 0.015 * tmp / sqrt(20.0 + tmp);
+X L_term = (L2 - L1) / (kL * SL);
+
+// chroma term
+X Cbar = 0.5 * (C1 + C2);
+X SC = 1.0 + 0.045 * Cbar;
+X C_term = (C2 - C1) / (kC * SC);
+
+X h_diff = h2 - h1;
+X h_sum = h1 + h2;
+X CC = C1 * C2;
+X dH = h_diff;
+if (CC == 0.0) {
+    dH = 0;
+} else {
+    if (h_diff > M_PI) {
+      dH -= 2 * M_PI;
+    }
+    if (h_diff < -M_PI) {
+      dH += 2 * M_PI;
+    }
+}
+X dH_term = 2 * sqrt(CC) * sin(dH / 2.0);
+
+X Hbar = h_sum;
+if (CC != 0.0) {
+  if (abs(h_diff) > M_PI) {
+    if (h_sum < 2.0 * M_PI) {
+      Hbar += 2.0 * M_PI;
+    }
+    if (h_sum >= 2.0 * M_PI) {
+      Hbar -= 2 * M_PI;
+    }
+  }
+}
+if (CC == 0.0) {
+  Hbar *= 2;
+}
+Hbar *= 0.5;
+
+X T = 1.0 - 0.17 * cos(Hbar - 30.0 / 180.0 * M_PI)
+    + 0.24 * cos(2 * Hbar)
+    + 0.32 * cos(3 * Hbar + 6.0 / 180.0 * M_PI)
+    - 0.20 * cos(4 * Hbar - 63.0 / 180.0 * M_PI);
+X SH = 1 + 0.015 * Cbar * T;
+X H_term = dH_term / (kH * SH);
+
+X c7 = pow(Cbar, (X)7.0);
+X c7_term = sqrt(c7 / (c7 + 6103515625));  // 6103515625 == 25^7
+X Rc = 2.0 * c7_term;
+
+// hue rotation (constants below are 275, 25 and 30 degrees in radians)
+tmp = (Hbar - 4.799655442984406) / 0.4363323129985824;
+tmp *= tmp;
+X dtheta = 0.5235987755982988 * exp(-tmp);
+X R_term = -sin(2 * dtheta) * Rc * C_term * H_term;
+
+// put it all together
+dE2 = L_term * L_term;
+dE2 += C_term * C_term;
+dE2 += H_term * H_term;
+dE2 += R_term;
+dE2 = sqrt(max(dE2, 0.0));
+""",  # noqa: E501
+    name="deltaE_ciede2000_internal",
+)
 
 
 def deltaE_ciede2000(lab1, lab2, kL=1, kC=1, kH=1, *, channel_axis=-1):
@@ -210,84 +336,43 @@ def deltaE_ciede2000(lab1, lab2, kL=1, kC=1, kH=1, *, channel_axis=-1):
     L1, a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[:3]
     L2, a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[:3]
 
-    # distort `a` based on average chroma
-    # then convert to lch coordinates from distorted `a`
-    # all subsequence calculations are in the new coordinates
-    # (often denoted "prime" in the literature)
-    Cbar = 0.5 * (cp.hypot(a1, b1) + cp.hypot(a2, b2))
-    c7 = Cbar**7
-    G = 0.5 * (1 - cp.sqrt(c7 / (c7 + 25**7)))
-    scale = 1 + G
-    C1, h1 = _cart2polar_2pi(a1 * scale, b1)
-    C2, h2 = _cart2polar_2pi(a2 * scale, b2)
-    # recall that c, h are polar coordinates.  c==r, h==theta
-
-    # cide2000 has four terms to delta_e:
-    # 1) Luminance term
-    # 2) Hue term
-    # 3) Chroma term
-    # 4) hue Rotation term
-
-    # lightness term
-    Lbar = 0.5 * (L1 + L2)
-    tmp = Lbar - 50
-    tmp *= tmp
-    SL = 1 + 0.015 * tmp / cp.sqrt(20 + tmp)
-    L_term = (L2 - L1) / (kL * SL)
-
-    # chroma term
-    Cbar = 0.5 * (C1 + C2)  # new coordinates
-    SC = 1 + 0.045 * Cbar
-    C_term = (C2 - C1) / (kC * SC)
-
-    # hue term
-    h_diff = h2 - h1
-    h_sum = h1 + h2
-    CC = C1 * C2
-
-    dH = h_diff.copy()
-    dH[h_diff > np.pi] -= 2 * np.pi
-    dH[h_diff < -np.pi] += 2 * np.pi
-    dH[CC == 0.0] = 0.0  # if r == 0, dtheta == 0
-    dH_term = 2 * cp.sqrt(CC) * cp.sin(dH / 2)
-
-    Hbar = h_sum.copy()
-    mask = cp.logical_and(CC != 0.0, cp.abs(h_diff) > np.pi)
-    Hbar[mask * (h_sum < 2 * np.pi)] += 2 * np.pi
-    Hbar[mask * (h_sum >= 2 * np.pi)] -= 2 * np.pi
-    Hbar[CC == 0.0] *= 2
-    Hbar *= 0.5
-
-    T = (
-        1
-        - 0.17 * cp.cos(Hbar - np.deg2rad(30))
-        + 0.24 * cp.cos(2 * Hbar)
-        + 0.32 * cp.cos(3 * Hbar + np.deg2rad(6))
-        - 0.20 * cp.cos(4 * Hbar - np.deg2rad(63))
-    )
-    SH = 1 + 0.015 * Cbar * T
-
-    H_term = dH_term / (kH * SH)
-
-    # hue rotation
-    c7 = Cbar**7
-    Rc = 2 * cp.sqrt(c7 / (c7 + 25**7))
-    tmp = (cp.rad2deg(Hbar) - 275) / 25
-    tmp *= tmp
-    dtheta = np.deg2rad(30) * cp.exp(-tmp)
-    R_term = -cp.sin(2 * dtheta) * Rc * C_term * H_term
-
-    # put it all together
-    dE2 = L_term * L_term
-    dE2 += C_term * C_term
-    dE2 += H_term * H_term
-    dE2 += R_term
-    cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2)
+    dE2 = _ciede2000_kernel(a1, b1, a2, b2, L1, L2, kL, kC, kH)
     if unroll:
         dE2 = dE2[0]
     return dE2
 
 
+_cmc_kernel = cp.ElementwiseKernel(
+    "X C1, X L1, X h1, X C2, X L2, X dH2, float64 kL, float64 kC",
+    "X dE2",
+    """
+
+X dC = C1 - C2;
+X dL = L1 - L2;
+// hue limits 164 and 345 deg, offsets 168 and 35 deg, expressed in radians
+X T = (h1 >= 2.8623399732707004 && h1 <= 6.021385919380437) ?
+    0.56 + 0.2 * abs(cos(h1 + 2.9321531433504737)) :
+    0.36 + 0.4 * abs(cos(h1 + 0.6108652381980153));
+
+X tmp = pow(C1, (X)4.0);
+X F = sqrt(tmp / (tmp + 1900));
+
+X SL = (L1 < 16) ? 0.511 : 0.040975 * L1 / (1.0 + 0.01765 * L1);
+X SC = 0.638 + 0.0638 * C1 / (1.0 + 0.0131 * C1);
+X SH = SC * (F * T + 1 - F);
+
+dE2 = dL / (kL * SL);
+dE2 *= dE2;
+tmp = dC / (kC * SC);
+tmp *= tmp;
+dE2 += tmp;
+dE2 += dH2 / (SH * SH);
+dE2 = sqrt(max(dE2, 0.0));
+""",
+    name="deltaE_cmc_internal",
+)
+
+
 def deltaE_cmc(lab1, lab2, kL=1, kC=1, *, channel_axis=-1):
     """Color difference from the  CMC l:c standard.
 
@@ -336,26 +421,21 @@ def deltaE_cmc(lab1, lab2, kL=1, kC=1, *, channel_axis=-1):
     L1, C1, h1 = lab2lch(lab1, channel_axis=0)[:3]
     L2, C2, h2 = lab2lch(lab2, channel_axis=0)[:3]
 
-    dC = C1 - C2
-    dL = L1 - L2
     dH2 = get_dH2(lab1, lab2, channel_axis=0)
+    return _cmc_kernel(C1, L1, h1, C2, L2, dH2, kL, kC)
 
-    T = cp.where(
-        cp.logical_and(cp.rad2deg(h1) >= 164, cp.rad2deg(h1) <= 345),
-        0.56 + 0.2 * cp.abs(np.cos(h1 + cp.deg2rad(168))),
-        0.36 + 0.4 * cp.abs(np.cos(h1 + cp.deg2rad(35))),
-    )
-    c1_4 = C1**4
-    F = cp.sqrt(c1_4 / (c1_4 + 1900))
-
-    SL = cp.where(L1 < 16, 0.511, 0.040975 * L1 / (1.0 + 0.01765 * L1))
-    SC = 0.638 + 0.0638 * C1 / (1.0 + 0.0131 * C1)
-    SH = SC * (F * T + 1 - F)
 
-    dE2 = (dL / (kL * SL)) ** 2
-    dE2 += (dC / (kC * SC)) ** 2
-    dE2 += dH2 / (SH**2)
-    return cp.sqrt(cp.maximum(dE2, 0, out=dE2), out=dE2)
+_dH2_kernel = cp.ElementwiseKernel(
+    "X a1, X b1, X a2, X b2",
+    "X out",
+    """
+// compute the chroma and dot-product terms in double to preserve accuracy
+double C1 = hypot((double)a1, (double)b1);
+double C2 = hypot((double)a2, (double)b2);
+out = 2.0 * ((C1 * C2) - ((double)a1 * a2 + (double)b1 * b2));
+""",
+    name="dH2_internal",
+)
 
 
 def get_dH2(lab1, lab2, *, channel_axis=-1):
@@ -384,12 +464,7 @@ def get_dH2(lab1, lab2, *, channel_axis=-1):
     a1, b1 = cp.moveaxis(lab1, source=channel_axis, destination=0)[1:3]
     a2, b2 = cp.moveaxis(lab2, source=channel_axis, destination=0)[1:3]
 
-    # magnitude of (a, b) is the chroma
-    C1 = cp.hypot(a1, b1)
-    C2 = cp.hypot(a2, b2)
-
-    term = (C1 * C2) - (a1 * a2 + b1 * b2)
-    out = 2 * term
+    out = _dH2_kernel(a1, b1, a2, b2)
     if input_is_float_32:
         out = out.astype(np.float32)
     return out
diff --git a/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py b/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py
index 95af3a506..a2d37b30c 100644
--- a/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py
+++ b/python/cucim/src/cucim/skimage/color/tests/test_delta_e.py
@@ -6,7 +6,6 @@
 from cupy.testing import (
     assert_allclose,
     assert_array_almost_equal,
-    assert_array_equal,
 )
 
 from cucim.skimage._shared.testing import expected_warnings, fetch
@@ -36,7 +35,9 @@ def test_ciede2000_dE(dtype, channel_axis):
 
     lab1 = cp.moveaxis(cp.asarray(lab1), source=-1, destination=channel_axis)
     lab2 = cp.moveaxis(cp.asarray(lab2), source=-1, destination=channel_axis)
-    dE2 = deltaE_ciede2000(lab1, lab2, channel_axis=channel_axis)
+    msg = "numerical accuracy of this function on the GPU is reduced"
+    with pytest.warns(UserWarning, match=msg):
+        dE2 = deltaE_ciede2000(lab1, lab2, channel_axis=channel_axis)
     assert dE2.dtype == _supported_float_type(dtype)
 
     # Note: lower float64 accuracy than scikit-image
@@ -199,10 +200,10 @@ def test_cmc(dtype, channel_axis):
 def test_cmc_single_item():
     # Single item case:
     lab1 = lab2 = cp.array([0.0, 1.59607713, 0.87755709])
-    assert_array_equal(deltaE_cmc(lab1, lab2), 0)
+    assert_array_almost_equal(deltaE_cmc(lab1, lab2), 0)
 
     lab2[0] += cp.finfo(float).eps
-    assert_array_equal(deltaE_cmc(lab1, lab2), 0)
+    assert_array_almost_equal(deltaE_cmc(lab1, lab2), 0)
 
 
 def test_single_color_cie76():

From fc954072bd132739c2ba0cbe8df8d0e1e4df4c35 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Thu, 9 Jan 2025 14:08:53 -0500
Subject: [PATCH 2/2] add benchmark cases for the deltaE color distance
 functions

---
 benchmarks/skimage/cucim_color_bench.py  | 48 ++++++++++++++++++++++++
 benchmarks/skimage/run-nv-bench-color.sh | 11 ++++++
 2 files changed, 59 insertions(+)

diff --git a/benchmarks/skimage/cucim_color_bench.py b/benchmarks/skimage/cucim_color_bench.py
index e4fe9fe95..13df14c80 100644
--- a/benchmarks/skimage/cucim_color_bench.py
+++ b/benchmarks/skimage/cucim_color_bench.py
@@ -1,4 +1,5 @@
 import argparse
+import math
 import os
 import pickle
 
@@ -25,6 +26,10 @@
     "lab2xyz",
     "rgba2rgb",
     "label2rgb",
+    "deltaE_cie76",
+    "deltaE_ciede94",
+    "deltaE_ciede2000",
+    "deltaE_cmc",
 ]
 
 
@@ -40,6 +45,32 @@ def set_args(self, dtype):
         self.args_gpu = (imaged,)
 
 
+class DeltaEBench(ImageBench):
+    def set_args(self, dtype):
+        """Create the Lab image pair stored in args_cpu and args_gpu."""
+        from skimage import color, data
+
+        # synthetic lab image pair: the second is a 1-pixel shifted copy
+        rgb1 = data.astronaut()
+        lab1 = color.rgb2lab(rgb1)
+        lab2 = color.rgb2lab(np.roll(rgb1, (1, 1), axis=(0, 1)))
+
+        # change to desired dtype
+        lab1 = lab1.astype(dtype, copy=False)
+        lab2 = lab2.astype(dtype, copy=False)
+
+        # tile then crop as needed to get the expected size
+        n_tile0 = math.ceil(self.shape[0] / lab1.shape[0])
+        n_tile1 = math.ceil(self.shape[1] / lab1.shape[1])
+        lab1 = np.tile(lab1, (n_tile0, n_tile1, 1))
+        lab1 = lab1[: self.shape[0], : self.shape[1], :]
+        lab2 = np.tile(lab2, (n_tile0, n_tile1, 1))
+        lab2 = lab2[: self.shape[0], : self.shape[1], :]
+
+        self.args_cpu = (lab1, lab2)
+        self.args_gpu = (cp.asarray(lab1), cp.asarray(lab2))
+
+
 class RGBABench(ImageBench):
     def set_args(self, dtype):
         if self.shape[-1] != 4:
@@ -162,6 +193,23 @@ def main(args):
                     results = B.run_benchmark(duration=args.duration)
                     all_results = pd.concat([all_results, results["full"]])
 
+        elif function_name.startswith("deltaE"):
+            # only run these functions for floating point data types
+            float_dtypes = [t for t in dtypes if np.dtype(t).kind == "f"]
+
+            B = DeltaEBench(
+                function_name=function_name,
+                shape=shape + (3,),
+                dtypes=float_dtypes,
+                fixed_kwargs={},
+                var_kwargs={},
+                # index_str=f"{fromspace.lower()}2{tospace.lower()}",
+                module_cpu=skimage.color,
+                module_gpu=cucim.skimage.color,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
         elif function_name == "rgba2rgb":
             B = RGBABench(
                 function_name="rgba2rgb",
diff --git a/benchmarks/skimage/run-nv-bench-color.sh b/benchmarks/skimage/run-nv-bench-color.sh
index 14a0809ec..4d88af9cc 100755
--- a/benchmarks/skimage/run-nv-bench-color.sh
+++ b/benchmarks/skimage/run-nv-bench-color.sh
@@ -9,3 +9,14 @@ for shape in "${param_shape[@]}"; do
         done
     done
 done
+
+param_shape=(512,512 3840,2160)
+param_filt=(deltaE_cie76 deltaE_ciede94 deltaE_ciede2000 deltaE_cmc)
+param_dt=(float32 float64)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_color_bench.py -f "$filt" -i "$shape" -d "$dt" -t 10
+        done
+    done
+done