From 3c185e880483aeb26ef46381b3a1e2164534cb31 Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@mldsai.com>
Date: Sat, 27 Apr 2024 16:41:25 -0400
Subject: [PATCH] feat(VisualReplayStrategy): compute image similarity to avoid
 unnecessary segmentation

* remove sct_image from Screenshot; fix typo

* add Image.cropped_image

* add experiments/imagesimilarity.py

* bugfix: sct_image -> image

* find_similar_image_segmentation

* fix test_crop_active_window
---
 README.md                      |   1 -
 experiments/imagesimilarity.py | 351 +++++++++++++++++++++++++++++++++
 openadapt/events.py            |   6 +-
 openadapt/models.py            |  74 ++++---
 openadapt/strategies/visual.py | 106 +++++++++-
 openadapt/utils.py             |   2 +-
 poetry.lock                    |  84 +++++++-
 pyproject.toml                 |   1 +
 tests/openadapt/test_crop.py   |   6 +-
 9 files changed, 572 insertions(+), 59 deletions(-)
 create mode 100644 experiments/imagesimilarity.py

diff --git a/README.md b/README.md
index 3db408af1..1eeda7eaa 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,6 @@ poetry install
 poetry shell
 alembic upgrade head
 poetry run install-dashbaord
-
 pytest
 ```
 
diff --git a/experiments/imagesimilarity.py b/experiments/imagesimilarity.py
new file mode 100644
index 000000000..28361592d
--- /dev/null
+++ b/experiments/imagesimilarity.py
@@ -0,0 +1,351 @@
+"""This module calculates image similarities using various methods."""
+
+from typing import Callable
+import time
+
+from matplotlib.offsetbox import OffsetImage, AnnotationBbox
+from PIL import Image, ImageOps
+from skimage.metrics import structural_similarity as ssim
+from sklearn.manifold import MDS
+import imagehash
+import matplotlib.gridspec as gridspec
+import matplotlib.pyplot as plt
+import numpy as np
+
+from openadapt.db import crud
+
+
+SHOW_SSIM = False  # set to True to plot each pair's SSIM gradient and SSIM image
+
+
+def calculate_ssim(im1: Image.Image, im2: Image.Image) -> float:
+    """Calculate the structural dissimilarity (1 - mean SSIM) between two images.
+
+    Args:
+        im1 (Image.Image): The first image.
+        im2 (Image.Image): The second image.
+
+    Returns:
+        float: 1 minus the mean SSIM, so 0.0 means the resized images are identical.
+    """
+    # Calculate aspect ratios
+    aspect_ratio1 = im1.width / im1.height
+    aspect_ratio2 = im2.width / im2.height
+    # Use the smaller image as the base for resizing to maintain the aspect ratio
+    if aspect_ratio1 < aspect_ratio2:
+        base_width = min(im1.width, im2.width)
+        base_height = int(base_width / aspect_ratio1)
+    else:
+        base_height = min(im1.height, im2.height)
+        base_width = int(base_height * aspect_ratio2)
+
+    # Resize both images to the same (width, height) so SSIM can compare them
+    im1 = im1.resize((base_width, base_height), Image.LANCZOS)
+    im2 = im2.resize((base_width, base_height), Image.LANCZOS)
+
+    # Convert images to 8-bit grayscale arrays
+    im1_gray = np.array(im1.convert("L"))
+    im2_gray = np.array(im2.convert("L"))
+
+    mssim, grad, S = ssim(
+        im1_gray,
+        im2_gray,
+        data_range=255,  # full 8-bit range: symmetric in im1/im2 and never zero
+        gradient=True,
+        full=True,
+    )
+
+    if SHOW_SSIM:
+        # Normalize the gradient for visualization
+        grad_normalized = (grad - grad.min()) / (grad.max() - grad.min())
+        im_grad = Image.fromarray((grad_normalized * 255).astype(np.uint8))
+
+        # Convert full SSIM image to uint8
+        im_S = Image.fromarray((S * 255).astype(np.uint8))
+
+        # Create a figure to display the images
+        fig, axs = plt.subplots(1, 4, figsize=(20, 5))  # 1 row, 4 columns
+
+        # Display each image in the subplot
+        axs[0].imshow(im1, cmap="gray")
+        axs[0].set_title("Image 1")
+        axs[0].axis("off")
+
+        axs[1].imshow(im2, cmap="gray")
+        axs[1].set_title("Image 2")
+        axs[1].axis("off")
+
+        axs[2].imshow(im_grad, cmap="gray")
+        axs[2].set_title("Gradient of SSIM")
+        axs[2].axis("off")
+
+        axs[3].imshow(im_S, cmap="gray")
+        axs[3].set_title("SSIM Image")
+        axs[3].axis("off")
+
+        plt.show(block=False)
+
+    return 1 - mssim
+
+
+def calculate_dynamic_threshold(
+    im1: Image.Image,
+    im2: Image.Image,
+    k: float = 1.0,
+) -> float:
+    """Calculate a dynamic threshold for image difference.
+
+    Based on the mean and standard deviation of the absolute pixel differences.
+
+    Args:
+        im1 (Image.Image): The first image.
+        im2 (Image.Image): The second image (same size and mode as the first).
+        k (float): The multiplier for the standard deviation to set the threshold.
+
+    Returns:
+        float: The threshold, i.e. mean + k * std of the absolute differences.
+    """
+    # Convert to float arrays so the subtraction cannot wrap around (uint8 pixels)
+    arr1 = np.asarray(im1, dtype=float)
+    arr2 = np.asarray(im2, dtype=float)
+
+    # Calculate the absolute difference between the images
+    diff = np.abs(arr1 - arr2)
+
+    # Calculate mean and standard deviation of the differences
+    mean_diff = np.mean(diff)
+    std_diff = np.std(diff)
+
+    # Calculate the threshold as mean plus k times the standard deviation
+    threshold = mean_diff + k * std_diff
+
+    return threshold
+
+
+def thresholded_difference(im1: Image.Image, im2: Image.Image, k: float = 1.0) -> int:
+    """Return number of pixels differing by at least a dynamically calculated threshold.
+
+    Args:
+        im1 (Image.Image): The first image.
+        im2 (Image.Image): The second image.
+        k (float): Multiplier for the standard deviation to set the dynamic threshold.
+
+    Returns:
+        int: The number of array elements (pixel channel values) differing by at
+        least the dynamically calculated threshold.
+    """
+    common_size = (min(im1.width, im2.width), min(im1.height, im2.height))
+    im1 = im1.resize(common_size)
+    im2 = im2.resize(common_size)
+
+    # Calculate the dynamic threshold
+    difference_threshold = calculate_dynamic_threshold(im1, im2, k)
+
+    # Convert to float arrays so the subtraction cannot wrap around (uint8 pixels)
+    arr1 = np.asarray(im1, dtype=float)
+    arr2 = np.asarray(im2, dtype=float)
+
+    # Calculate the absolute difference between the images
+    diff = np.abs(arr1 - arr2)
+
+    # Count differences at or above the threshold; cast so a plain int is returned
+    count = int(np.sum(diff >= difference_threshold))
+
+    return count
+
+
+def prepare_image(
+    img: Image.Image,
+    size: tuple[int, int] = (128, 128),
+    border: int = 2,
+    color: str = "red",
+) -> Image.Image:
+    """Resize an image to a common size, add a border to it.
+
+    Args:
+        img (Image.Image): The original image to prepare.
+        size (tuple[int, int]): The size to which the images should be resized.
+        border (int): The width of the border around the image.
+        color (str): The color of the border.
+
+    Returns:
+        Image.Image: The processed image.
+    """
+    # Resize image (LANCZOS is the filter formerly aliased as Image.ANTIALIAS)
+    img = img.resize(size, Image.LANCZOS)
+
+    # Add border to the image
+    img_with_border = ImageOps.expand(img, border=border, fill=color)
+
+    return img_with_border
+
+
+def plot_images_with_mds(
+    images: list[Image.Image],
+    distance_matrix: np.ndarray,
+    title: str,
+    hash_func: Callable | None,
+) -> None:
+    """Plot images on a scatter plot based on the provided distance matrix.
+
+    Args:
+        images (list[Image.Image]): list of images to plot.
+        distance_matrix (np.ndarray): A distance matrix of image differences.
+        title (str): Title of the plot.
+        hash_func (Callable | None): Hash function for image labels; None for no label.
+
+    Returns:
+        None
+    """
+    # Prepare images by resizing and adding a border
+    prepared_images = [prepare_image(img) for img in images]
+
+    # Compute hash labels from the original images (empty when hash_func is falsy)
+    hash_values = [str(hash_func(img)) if hash_func else "" for img in images]
+
+    # Initialize MDS and fit the distance matrix to get the 2D embedding
+    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
+    positions = mds.fit_transform(distance_matrix)
+
+    # Scatter the MDS positions; alpha=0 keeps the markers themselves invisible
+    fig, ax = plt.subplots(figsize=(15, 10))
+    ax.scatter(positions[:, 0], positions[:, 1], alpha=0)
+
+    # Define properties for the bounding box
+    bbox_props = dict(boxstyle="round,pad=0.3", ec="b", lw=2, fc="white", alpha=0.7)
+
+    # Loop through images, positions, and hash values to create annotations
+    for img, hash_val, (x, y) in zip(prepared_images, hash_values, positions):
+        im = OffsetImage(np.array(img), zoom=0.5)
+        ab = AnnotationBbox(
+            im,
+            (x, y),
+            xycoords="data",
+            frameon=True,
+            bboxprops=bbox_props,
+        )
+        ax.add_artist(ab)
+        # Display the hash value centered at a small data-space offset (y - 0.05)
+        ax.text(x, y - 0.05, hash_val, fontsize=9, ha="center")
+
+    # Remove the x and y ticks
+    ax.set_xticks([])
+    ax.set_yticks([])
+
+    plt.title(title)
+    plt.show()
+
+
+def display_distance_matrix_with_images(
+    distance_matrix: np.ndarray,
+    images: list[Image.Image],
+    func_name: str,
+    thumbnail_size: tuple[int, int] = (32, 32),
+) -> None:
+    """Display the distance matrix as an image with thumbnails along the top and left.
+
+    Each cell is annotated with its distance value formatted to four decimals.
+
+    Args:
+        distance_matrix (np.ndarray): A square matrix with distance values.
+        images (list[Image.Image]): list of images corresponding to matrix rows/cols.
+        func_name (str): Name of the similarity function; used as the figure title.
+        thumbnail_size (tuple[int, int]): Size to which thumbnails will be resized.
+    """
+    # Number of images
+    n = len(images)
+    # Create a figure with subplots
+    fig = plt.figure(figsize=(10, 10))
+    # GridSpec layout for the thumbnails and the distance matrix
+    gs = gridspec.GridSpec(n + 1, n + 1, figure=fig)
+
+    # Place the distance matrix
+    ax_matrix = fig.add_subplot(gs[1:, 1:])
+    ax_matrix.imshow(distance_matrix, cmap="viridis")
+    ax_matrix.set_xticks([])
+    ax_matrix.set_yticks([])
+
+    # Annotate each cell with the distance value
+    for (i, j), val in np.ndenumerate(distance_matrix):
+        ax_matrix.text(j, i, f"{val:.4f}", ha="center", va="center", color="white")
+
+    # Resize images to thumbnails (Image.ANTIALIAS no longer exists in Pillow >= 10)
+    thumbnails = [img.resize(thumbnail_size, Image.LANCZOS) for img in images]
+
+    # Plot images on the top row
+    for i, img in enumerate(thumbnails):
+        ax_img_top = fig.add_subplot(gs[0, i + 1])
+        ax_img_top.imshow(np.array(img))
+        ax_img_top.axis("off")  # Hide axes
+
+    # Plot images on the left column
+    for i, img in enumerate(thumbnails):
+        ax_img_left = fig.add_subplot(gs[i + 1, 0])
+        ax_img_left.imshow(np.array(img))
+        ax_img_left.axis("off")  # Hide axes
+
+    plt.suptitle(func_name)
+    plt.show()
+
+
+def main() -> None:
+    """Compare the latest recording's cropped screenshots with each similarity metric."""
+    recording = crud.get_latest_recording()
+    action_events = recording.processed_action_events
+    images = [action_event.screenshot.cropped_image for action_event in action_events]
+    if len(images) < 2:  # no pairs: the mean duration below would divide by zero
+        raise SystemExit("need at least two screenshots to compare")
+
+    similarity_funcs = {
+        "ssim": calculate_ssim,
+        "thresholded_difference": thresholded_difference,
+        "average_hash": lambda im1, im2: (
+            imagehash.average_hash(im1) - imagehash.average_hash(im2)
+        ),
+        "dhash": lambda im1, im2: (imagehash.dhash(im1) - imagehash.dhash(im2)),
+        "phash": lambda im1, im2: (imagehash.phash(im1) - imagehash.phash(im2)),
+        "crop_resistant_hash": lambda im1, im2: (
+            imagehash.crop_resistant_hash(im1) - imagehash.crop_resistant_hash(im2)
+        ),
+        "colorhash": lambda im1, im2: (
+            imagehash.colorhash(im1) - imagehash.colorhash(im2)
+        ),
+        "whash": lambda im1, im2: imagehash.whash(im1) - imagehash.whash(im2),
+    }
+
+    # Process each similarity function
+    for func_name, func in similarity_funcs.items():
+        hash_func = {
+            "average_hash": imagehash.average_hash,
+            "dhash": imagehash.dhash,
+            "phash": imagehash.phash,
+            "crop_resistant_hash": imagehash.crop_resistant_hash,
+            "colorhash": imagehash.colorhash,
+            "whash": imagehash.whash,
+        }.get(func_name, None)
+
+        # Fill a symmetric matrix of pairwise distances, timing each comparison
+        n = len(images)
+        distance_matrix = np.zeros((n, n))
+        durations = []
+        for i in range(n):
+            for j in range(i + 1, n):
+                start_time = time.perf_counter()
+                distance = abs(func(images[i], images[j]))
+                durations.append(time.perf_counter() - start_time)
+                distance_matrix[i, j] = distance_matrix[j, i] = distance
+        mean_duration = sum(durations) / len(durations)
+        print(f"{func_name=}")
+        print(f"distance_matrix=\n{distance_matrix}")
+        print(f"{mean_duration=}")
+        display_distance_matrix_with_images(distance_matrix, images, func_name)
+        plot_images_with_mds(
+            images,
+            distance_matrix,
+            f"Image layout based on {func_name} ({mean_duration=:.4f}s)",
+            hash_func,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openadapt/events.py b/openadapt/events.py
index 19caaa065..09bd77b30 100644
--- a/openadapt/events.py
+++ b/openadapt/events.py
@@ -357,13 +357,13 @@ def get_timestamp_mappings(
             "double_click_distance_pixels",
             utils.get_double_click_distance_pixels,
         )
-        logger.info(f"{double_click_distance=}")
+        logger.debug(f"{double_click_distance=}")
         double_click_interval = get_recording_attr(
             to_merge[0],
             "double_click_interval_seconds",
             utils.get_double_click_interval_seconds,
         )
-        logger.info(f"{double_click_interval=}")
+        logger.debug(f"{double_click_interval=}")
         press_to_press_t = {}
         press_to_release_t = {}
         prev_pressed_event = None
@@ -770,7 +770,7 @@ def discard_unused_events(
     ]
     num_referred_events_after = len(referred_events)
     num_referred_events_removed = num_referred_events_before - num_referred_events_after
-    logger.info(f"{referred_timestamp_key=} {num_referred_events_removed=}")
+    logger.debug(f"{referred_timestamp_key=} {num_referred_events_removed=}")
     return referred_events
 
 
diff --git a/openadapt/models.py b/openadapt/models.py
index 54f26a7e7..9110de076 100644
--- a/openadapt/models.py
+++ b/openadapt/models.py
@@ -6,7 +6,6 @@
 from loguru import logger
 from oa_pynput import keyboard
 from PIL import Image, ImageChops
-import mss.base
 import numpy as np
 import sqlalchemy as sa
 
@@ -368,6 +367,7 @@ class Screenshot(db.Base):
     png_data = sa.Column(sa.LargeBinary)
     png_diff_data = sa.Column(sa.LargeBinary, nullable=True)
     png_diff_mask_data = sa.Column(sa.LargeBinary, nullable=True)
+    # cropped_png_data = sa.Column(sa.LargeBinary, nullable=True)
 
     recording = sa.orm.relationship("Recording", back_populates="screenshots")
     action_event = sa.orm.relationship("ActionEvent", back_populates="screenshot")
@@ -375,55 +375,58 @@ class Screenshot(db.Base):
     def __init__(
         self,
         *args: tuple,
-        sct_img: mss.base.ScreenShot | None = None,
+        image: Image.Image | None = None,
         **kwargs: dict,
     ) -> None:
         """Initialize."""
         super().__init__(*args, **kwargs)
         self.initialize_instance_attributes()
-        self.sct_img = sct_img
+        self._image = image
 
     @sa.orm.reconstructor
     def initialize_instance_attributes(self) -> None:
         """Initialize attributes for both new and loaded objects."""
         # TODO: convert to png_data on save
-        self.sct_img = None
-
         # TODO: replace prev with prev_timestamp?
         self.prev = None
         self._image = None
-        self._image_history = []
+        self._cropped_image = None
         self._diff = None
         self._diff_mask = None
         self._base64 = None
 
     @property
-    def image(self) -> Image:
+    def image(self) -> Image.Image:
         """Get the image associated with the screenshot."""
         if not self._image:
-            if self.sct_img:
-                self._image = Image.frombytes(
-                    "RGB",
-                    self.sct_img.size,
-                    self.sct_img.bgra,
-                    "raw",
-                    "BGRX",
-                )
-            else:
-                self._image = self.convert_binary_to_png(self.png_data)
+            self._image = self.convert_binary_to_png(self.png_data)
         return self._image
 
+    @property
+    def cropped_image(self) -> Image.Image:
+        """Return the screenshot cropped to its action's active window (cached)."""
+        if not self._cropped_image:
+            # if events have been merged, the last event will be the parent, e.g.
+            #   ipdb> [(action.name, action.timestamp) for action in self.action_event]
+            #   [('move', 1714142176.1630979), ('click', 1714142174.4848516),
+            #   ('singleclick', 1714142174.4537418)]
+            # TODO: verify (e.g. assert)
+            # TODO: rename action_event -> action_events?
+            action_event = self.action_event[-1]
+            self._cropped_image = self.crop_active_window(action_event)
+            # TODO: save?
+            # self.cropped_png_data = self.convert_png_to_binary(self._cropped_image)
+        return self._cropped_image
+
     @property
     def base64(self) -> str:
         """Return data URI of JPEG encoded base64."""
         if not self._base64:
-            from openadapt import utils
-
             self._base64 = utils.image2utf8(self.image)
         return self._base64
 
     @property
-    def diff(self) -> Image:
+    def diff(self) -> Image.Image:
         """Get the difference between the current and previous screenshot."""
         if self.png_diff_data:
             return self.convert_binary_to_png(self.png_diff_data)
@@ -433,7 +436,7 @@ def diff(self) -> Image:
         return self._diff
 
     @property
-    def diff_mask(self) -> Image:
+    def diff_mask(self) -> Image.Image:
         """Get the difference mask between the current and previous screenshot."""
         if self.png_diff_mask_data:
             return self.convert_binary_to_png(self.png_diff_mask_data)
@@ -450,18 +453,12 @@ def array(self) -> np.ndarray:
     @classmethod
     def take_screenshot(cls: "Screenshot") -> "Screenshot":
         """Capture a screenshot."""
-        # avoid circular import
-        from openadapt import utils
-
-        sct_img = utils.take_screenshot()
-        screenshot = Screenshot(sct_img=sct_img)
+        image = utils.take_screenshot()
+        screenshot = Screenshot(image=image)
         return screenshot
 
     def crop_active_window(self, action_event: ActionEvent) -> None:
         """Crop the screenshot to the active window defined by the action event."""
-        # avoid circular import
-        from openadapt import utils
-
         window_event = action_event.window_event
         width_ratio, height_ratio = utils.get_scale_ratios(action_event)
 
@@ -471,17 +468,10 @@ def crop_active_window(self, action_event: ActionEvent) -> None:
         y1 = y0 + window_event.height * height_ratio
 
         box = (x0, y0, x1, y1)
-        self._image_history.append(self.image)
-        self._image = self._image.crop(box)
-
-    @property
-    def original_image(self) -> Image:
-        """Get the original image (before any cropping)."""
-        if self._image_history:
-            return self._image_history[0]
-        return self.image
+        cropped_image = self._image.crop(box)
+        return cropped_image
 
-    def convert_binary_to_png(self, image_binary: bytes) -> Image:
+    def convert_binary_to_png(self, image_binary: bytes) -> Image.Image:
         """Convert a binary image to a PNG image.
 
         Args:
@@ -493,7 +483,7 @@ def convert_binary_to_png(self, image_binary: bytes) -> Image:
         buffer = io.BytesIO(image_binary)
         return Image.open(buffer)
 
-    def convert_png_to_binary(self, image: Image) -> bytes:
+    def convert_png_to_binary(self, image: Image.Image) -> bytes:
         """Convert a PNG image to binary image data.
 
         Args:
@@ -529,3 +519,7 @@ class MemoryStat(db.Base):
     recording_timestamp = sa.Column(sa.Integer)
     memory_usage_bytes = sa.Column(ForceFloat)
     timestamp = sa.Column(ForceFloat)
+
+
+# avoid circular import
+from openadapt import utils  # noqa
diff --git a/openadapt/strategies/visual.py b/openadapt/strategies/visual.py
index fb37674c8..a4421b89c 100644
--- a/openadapt/strategies/visual.py
+++ b/openadapt/strategies/visual.py
@@ -47,12 +47,16 @@
 
 from loguru import logger
 from PIL import Image, ImageDraw
+from skimage.metrics import structural_similarity as ssim
+import numpy as np
 
 from openadapt import adapters, common, models, strategies, utils, vision
 
 
 DEBUG = False
 DEBUG_REPLAY = False
+SEGMENTATIONS = []  # in-memory cache of computed Segmentations; TODO: store to db
+MAX_SSIM = 0.9  # SSIM must exceed this for a cached image to count as similar
 
 
 @dataclass
@@ -60,6 +64,7 @@ class Segmentation:
     """A data class to encapsulate segmentation data of images.
 
     Attributes:
+        image: The original image used to generate segments.
         masked_images: A list of PIL Image objects that have been masked based on
             segmentation.
         descriptions: Descriptions of each segmented region, correlating with each
@@ -72,6 +77,7 @@ class Segmentation:
             centroid of each segmented region.
     """
 
+    image: Image.Image
     masked_images: list[Image.Image]
     descriptions: list[str]
     bounding_boxes: list[dict[str, float]]  # "top", "left", "height", "width"
@@ -362,6 +368,78 @@ def get_active_segment(
     return active_index
 
 
+def get_image_similarity(
+    im1: Image.Image, im2: Image.Image
+) -> tuple[float, np.ndarray]:
+    """Calculate the structural similarity index (SSIM) between two images.
+
+    Both images are resized to a single common size, converted to 8-bit grayscale,
+    and compared with skimage's structural_similarity over the full 0-255 range.
+
+    Args:
+        im1 (Image.Image): The first image to compare.
+        im2 (Image.Image): The second image to compare.
+
+    Returns:
+        tuple[float, np.ndarray]: The mean SSIM and the full (per-pixel) SSIM image.
+    """
+    # Calculate aspect ratios
+    aspect_ratio1 = im1.width / im1.height
+    aspect_ratio2 = im2.width / im2.height
+    # Use the smaller image as the base for resizing to maintain the aspect ratio
+    if aspect_ratio1 < aspect_ratio2:
+        base_width = min(im1.width, im2.width)
+        base_height = int(base_width / aspect_ratio1)
+    else:
+        base_height = min(im1.height, im2.height)
+        base_width = int(base_height * aspect_ratio2)
+
+    # Resize both images to the same (width, height) so SSIM can compare them
+    im1 = im1.resize((base_width, base_height), Image.LANCZOS)
+    im2 = im2.resize((base_width, base_height), Image.LANCZOS)
+
+    # Convert images to 8-bit grayscale arrays
+    im1_gray = np.array(im1.convert("L"))
+    im2_gray = np.array(im2.convert("L"))
+
+    # Fixed 8-bit data_range keeps the score symmetric in im1/im2 and never zero
+    return ssim(im1_gray, im2_gray, data_range=255, full=True)
+
+
+def find_similar_image_segmentation(
+    image: Image.Image,
+    max_ssim: float = MAX_SSIM,
+) -> tuple[Segmentation, np.ndarray] | tuple[None, None]:
+    """Identify a similar image in the cache based on the SSIM comparison.
+
+    Iterates through the module-level SEGMENTATIONS list, scoring each cached
+    segmentation's image against the given image with get_image_similarity, and
+    keeps the highest-scoring one whose SSIM is strictly greater than max_ssim.
+
+    Args:
+        image (Image.Image): The image to compare against the cache.
+        max_ssim (float): Lower bound (despite the name); a match must score above it.
+
+    Returns:
+        tuple[Segmentation, np.ndarray] | tuple[None, None]: The best matching
+        segmentation and the second value get_image_similarity returned for it
+        if a match is found; otherwise, None for both.
+    """
+    similar_segmentation = None
+    similar_segmentation_diff = None
+
+    for segmentation in SEGMENTATIONS:
+        similarity_index, ssim_image = get_image_similarity(image, segmentation.image)
+        if similarity_index > max_ssim:
+            logger.info(f"{similarity_index=}")
+            # raise the bar so only a strictly better match replaces this one
+            max_ssim = similarity_index
+            similar_segmentation = segmentation
+            similar_segmentation_diff = ssim_image
+
+    return similar_segmentation, similar_segmentation_diff
+
+
 def get_window_segmentation(
     action_event: models.ActionEvent,
     exceptions: list[Exception] | None = None,
@@ -373,23 +451,35 @@ def get_window_segmentation(
         exceptions: list of exceptions previously raised, added to prompt.
 
     Returns:
-        Segmnetation object containing detailed segmentation information.
+        Segmentation object containing detailed segmentation information.
     """
     screenshot = action_event.screenshot
-    screenshot.crop_active_window(action_event)
-    original_image = screenshot.image
+    original_image = screenshot.cropped_image
     if DEBUG:
         original_image.show()
+
     segmentation_adapter = adapters.get_default_segmentation_adapter()
     segmented_image = segmentation_adapter.fetch_segmented_image(original_image)
     if DEBUG:
         segmented_image.show()
+
+    similar_segmentation, similar_segmentation_diff = find_similar_image_segmentation(
+        original_image,
+    )
+    if similar_segmentation:
+        # TODO XXX: create copy of similar_segmentation, but overwrite with segments of
+        # regions of new image where segments of similar_segmentation overlap non-zero
+        # regions of similar_segmentation_diff
+        return similar_segmentation
+
     masks = vision.process_image_for_masks(segmented_image)
     if DEBUG:
         vision.display_binary_images_grid(masks)
+
     refined_masks = vision.refine_masks(masks)
     if DEBUG:
         vision.display_binary_images_grid(refined_masks)
+
     masked_images = vision.extract_masked_images(original_image, refined_masks)
 
     original_image_base64 = screenshot.base64
@@ -408,9 +498,17 @@ def get_window_segmentation(
         len(descriptions),
         len(centroids),
     )
-    segmentation = Segmentation(masked_images, descriptions, bounding_boxes, centroids)
+    segmentation = Segmentation(
+        original_image,
+        masked_images,
+        descriptions,
+        bounding_boxes,
+        centroids,
+    )
     if DEBUG:
         vision.display_images_table_with_titles(masked_images, descriptions)
+
+    SEGMENTATIONS.append(segmentation)
     return segmentation
 
 
diff --git a/openadapt/utils.py b/openadapt/utils.py
index aa40c574a..5380d2def 100644
--- a/openadapt/utils.py
+++ b/openadapt/utils.py
@@ -479,7 +479,7 @@ def get_scale_ratios(action_event: ActionEvent) -> tuple[float, float]:
         float: The height ratio.
     """
     recording = action_event.recording
-    image = action_event.screenshot.original_image
+    image = action_event.screenshot.image
     width_ratio = image.width / recording.monitor_width
     height_ratio = image.height / recording.monitor_height
     return width_ratio, height_ratio
diff --git a/poetry.lock b/poetry.lock
index 78db08024..91d5d3a6c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -2259,6 +2259,23 @@ files = [
     {file = "ifaddr-0.2.0.tar.gz", hash = "sha256:cc0cbfcaabf765d44595825fb96a99bb12c79716b73b44330ea38ee2b0c4aed4"},
 ]
 
+[[package]]
+name = "imagehash"
+version = "4.3.1"
+description = "Image Hashing library"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ImageHash-4.3.1-py2.py3-none-any.whl", hash = "sha256:5ad9a5cde14fe255745a8245677293ac0d67f09c330986a351f34b614ba62fb5"},
+    {file = "ImageHash-4.3.1.tar.gz", hash = "sha256:7038d1b7f9e0585beb3dd8c0a956f02b95a346c0b5f24a9e8cc03ebadaf0aa70"},
+]
+
+[package.dependencies]
+numpy = "*"
+pillow = "*"
+PyWavelets = "*"
+scipy = "*"
+
 [[package]]
 name = "imageio"
 version = "2.34.0"
@@ -3195,7 +3212,10 @@ files = [
 decorator = ">=4.0.2,<5.0"
 imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""}
 imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""}
-numpy = {version = ">=1.17.3", markers = "python_version > \"2.7\""}
+numpy = [
+    {version = ">=1.17.3", markers = "python_version != \"2.7\""},
+    {version = "*", markers = "python_version >= \"2.7\""},
+]
 proglog = "<=1.0.0"
 requests = ">=2.8.1,<3.0"
 tqdm = ">=4.11.2,<5.0"
@@ -3680,8 +3700,11 @@ files = [
 
 [package.dependencies]
 numpy = [
+    {version = ">=1.21.2", markers = "python_version >= \"3.10\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""},
-    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""},
+    {version = ">=1.19.3", markers = "python_version >= \"3.6\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" or python_version >= \"3.9\""},
+    {version = ">=1.17.0", markers = "python_version >= \"3.7\""},
+    {version = ">=1.17.3", markers = "python_version >= \"3.8\""},
 ]
 
 [[package]]
@@ -3702,8 +3725,11 @@ files = [
 
 [package.dependencies]
 numpy = [
+    {version = ">=1.21.2", markers = "python_version >= \"3.10\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\""},
-    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\""},
+    {version = ">=1.19.3", markers = "python_version >= \"3.6\" and platform_system == \"Linux\" and platform_machine == \"aarch64\" or python_version >= \"3.9\""},
+    {version = ">=1.17.0", markers = "python_version >= \"3.7\""},
+    {version = ">=1.17.3", markers = "python_version >= \"3.8\""},
 ]
 
 [[package]]
@@ -5320,6 +5346,51 @@ files = [
     {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
 ]
 
+[[package]]
+name = "pywavelets"
+version = "1.6.0"
+description = "PyWavelets, wavelet transform module"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pywavelets-1.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ddc1ff5ad706313d930f857f9656f565dfb81b85bbe58a9db16ad8fa7d1537c5"},
+    {file = "pywavelets-1.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:78feab4e0c25fa32034b6b64cb854c6ce15663b4f0ffb25d8f0ee58915300f9b"},
+    {file = "pywavelets-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be36f08efe9bc3abf40cf40cd2ee0aa0db26e4894e13ce5ac178442864161e8c"},
+    {file = "pywavelets-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0595c51472c9c5724fe087cb73e2797053fd25c788d6553fdad6ff61abc60e91"},
+    {file = "pywavelets-1.6.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:058a750477dde633ac53b8806f835af3559d52db6532fb2b93c1f4b5441365b8"},
+    {file = "pywavelets-1.6.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:538795d9c4181152b414285b5a7f72ac52581ecdcdce74b6cca3fa0b8a5ab0aa"},
+    {file = "pywavelets-1.6.0-cp310-cp310-win32.whl", hash = "sha256:47de024ba4f9df97e98b5f540340e1a9edd82d2c477450bef8c9b5381487128e"},
+    {file = "pywavelets-1.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:e2c44760c0906ddf2176920a2613287f6eea947f166ce7eee9546081b06a6835"},
+    {file = "pywavelets-1.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d91aaaf6de53b758bcdc96c81cdb5a8607758602be49f691188c0e108cf1e738"},
+    {file = "pywavelets-1.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3b5302edb6d1d1ff6636d37c9ff29c4892f2a3648d736cc1df01f3f36e25c8cf"},
+    {file = "pywavelets-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5e655446e37a3c87213d5c6386b86f65c4d61736b4432d720171e7dd6523d6a"},
+    {file = "pywavelets-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ec7d69b746a0eaa327b829a3252a63619f2345e263177be5dd9bf30d7933c8d"},
+    {file = "pywavelets-1.6.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:97ea9613bd6b7108ebb44b709060adc7e2d5fac73be7152342bdd5513d75f84e"},
+    {file = "pywavelets-1.6.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:48b3813c6d1a7a8194f37dbb5dbbdf2fe1112152c91445ea2e54f64ff6350c36"},
+    {file = "pywavelets-1.6.0-cp311-cp311-win32.whl", hash = "sha256:4ffb484d096a5eb10af7121e0203546a03e1369328df321a33ef91f67bac40cf"},
+    {file = "pywavelets-1.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:274bc47b289585383aa65519b3fcae5b4dee5e31db3d4198d4fad701a70e59f7"},
+    {file = "pywavelets-1.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d6ec113386a432e04103f95e351d2657b42145bd1e1ed26513423391bcb5f011"},
+    {file = "pywavelets-1.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab652112d3932d21f020e281e06926a751354c2b5629fb716f5eb9d0104b84e5"},
+    {file = "pywavelets-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47b0314a22616c5f3f08760f0e00b4a15b7c7dadca5e39bb701cf7869a4207c5"},
+    {file = "pywavelets-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:138471513bc0a4cd2ddc4e50c7ec04e3468c268e101a0d02f698f6aedd1d5e79"},
+    {file = "pywavelets-1.6.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:67936491ae3e5f957c428e34fdaed21f131535b8d60c7c729a1b539ce8864837"},
+    {file = "pywavelets-1.6.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:dd798cee3d28fb3d32a26a00d9831a20bf316c36d685e4ced01b4e4a8f36f5ce"},
+    {file = "pywavelets-1.6.0-cp312-cp312-win32.whl", hash = "sha256:e772f7f0c16bfc3be8ac3cd10d29a9920bb7a39781358856223c491b899e6e79"},
+    {file = "pywavelets-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:4ef15a63a72afa67ae9f4f3b06c95c5382730fb3075e668d49a880e65f2f089c"},
+    {file = "pywavelets-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:627df378e63e9c789b6f2e7060cb4264ebae6f6b0efc1da287a2c060de454a1f"},
+    {file = "pywavelets-1.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a413b51dc19e05243fe0b0864a8e8a16b5ca9bf2e4713da00a95b1b5747a5367"},
+    {file = "pywavelets-1.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be615c6c1873e189c265d4a76d1751ec49b17e29725e6dd2e9c74f1868f590b7"},
+    {file = "pywavelets-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4021ef69ec9f3862f66580fc4417be728bd78722914394594b48212fd1fcaf21"},
+    {file = "pywavelets-1.6.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8fbf7b61b28b5457693c034e58a01622756d1fd60a80ae13ac5888b1d3e57e80"},
+    {file = "pywavelets-1.6.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f58ddbb0a6cd243928876edfc463b990763a24fb94498607d6fea690e32cca4c"},
+    {file = "pywavelets-1.6.0-cp39-cp39-win32.whl", hash = "sha256:42a22e68e345b6de7d387ef752111ab4530c98048d2b4bdac8ceefb078b4ead6"},
+    {file = "pywavelets-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:32198de321892743c1a3d1957fe1cd8a8ecc078bfbba6b8f3982518e897271d7"},
+    {file = "pywavelets-1.6.0.tar.gz", hash = "sha256:ea027c70977122c5fc27b2510f0a0d9528f9c3df6ea3e4c577ca55fd00325a5b"},
+]
+
+[package.dependencies]
+numpy = ">=1.22.4,<3"
+
 [[package]]
 name = "pywebview"
 version = "4.4.1"
@@ -5424,7 +5495,6 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -6631,7 +6701,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\")"}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -7968,4 +8038,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.x"
-content-hash = "c89aabf0d879d19b441855c57f81dbe688f69547969ec1c2655a0d92c866a029"
+content-hash = "890771696b49b9e034aa81375222ac315c7e97bb2ebf99d87d296b5cc01f70da"
diff --git a/pyproject.toml b/pyproject.toml
index d48d4b2a3..078729527 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -97,6 +97,7 @@ gradio-client = "0.15.0"
 google-generativeai = "^0.5.0"
 uvicorn = {extras = ["standard"], version = "^0.22"}
 ultralytics = "^8.1.47"
+imagehash = "^4.3.1"
 
 [tool.pytest.ini_options]
 filterwarnings = [
diff --git a/tests/openadapt/test_crop.py b/tests/openadapt/test_crop.py
index 7ce80b179..31f7cd12d 100644
--- a/tests/openadapt/test_crop.py
+++ b/tests/openadapt/test_crop.py
@@ -41,9 +41,9 @@ def test_crop_active_window() -> None:
         original_size = screenshot._image.size
 
         # Perform the cropping operation
-        screenshot.crop_active_window(action_event=action_event_mock)
+        cropped_image = screenshot.crop_active_window(action_event=action_event_mock)
 
         # Verify that the image size has been reduced
-        assert (screenshot._image.size[0] < original_size[0]) or (
-            screenshot._image.size[1] < original_size[1]
+        assert (cropped_image.size[0] < original_size[0]) or (
+            cropped_image.size[1] < original_size[1]
         )