From c80f22991829cc2318b08181ddb645976da57f0a Mon Sep 17 00:00:00 2001 From: Francesca Drummer <36380336+FrancescaDr@users.noreply.github.com> Date: Thu, 31 Oct 2024 21:47:10 +0100 Subject: [PATCH] Add function to assign observations to a sliding window (#859) * read nanostring, optional CellComposite and CellLabels * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * comment out test * pin pydantic (#767) * rebase with squidpy/main * sliding window function * sliding window tests * sliding window implementation * sliding window conflict * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * resolve commit errors * resolve commit errors * PR requested changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pytest errors and warnings resolved * remove statistics * remove stats * grid_len calculation * refactored * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixed test * fixed test * fixed bug * fixed test * updated changelog --------- Co-authored-by: Francesca Drummer Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Francesca Drummer Co-authored-by: Giovanni Palla <25887487+giovp@users.noreply.github.com> Co-authored-by: FrancescaDr Co-authored-by: LLehner <64135338+LLehner@users.noreply.github.com> Co-authored-by: Tim Treis Co-authored-by: Tim Treis --- docs/release/notes-dev.rst | 8 + src/squidpy/tl/__init__.py | 1 + src/squidpy/tl/_sliding_window.py | 240 +++++++++++++++++++++++++++++ tests/tools/test_sliding_window.py | 180 ++++++++++++++++++++++ tox.ini | 2 +- 5 files changed, 430 insertions(+), 1 deletion(-) create mode 100644 src/squidpy/tl/_sliding_window.py create mode 100644 tests/tools/test_sliding_window.py diff --git a/docs/release/notes-dev.rst b/docs/release/notes-dev.rst index 86ce6e4c..cc2b0a79 100644 --- a/docs/release/notes-dev.rst +++ b/docs/release/notes-dev.rst @@ -1,2 +1,10 @@ Squidpy dev (the-future) ======================== + +Features +-------- + +- New function :func:`squidpy.tl.sliding_window` for creating sliding window assignments + `@FrancescaDr `__ + `@timtreis `__ + `#842 `__ diff --git a/src/squidpy/tl/__init__.py b/src/squidpy/tl/__init__.py index edaa3182..6d5abe98 100644 --- a/src/squidpy/tl/__init__.py +++ b/src/squidpy/tl/__init__.py @@ -2,4 +2,5 @@ from __future__ import annotations +from squidpy.tl._sliding_window import _calculate_window_corners, sliding_window from squidpy.tl._var_by_distance import var_by_distance diff --git a/src/squidpy/tl/_sliding_window.py b/src/squidpy/tl/_sliding_window.py new file mode 100644 index 00000000..431b3103 --- /dev/null +++ b/src/squidpy/tl/_sliding_window.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +from collections import defaultdict +from itertools import product + +import numpy as np +import pandas as pd +from anndata import AnnData +from scanpy import logging as logg +from spatialdata import SpatialData + +from squidpy._docs import d +from squidpy.gr._utils import _save_data + +__all__ = ["sliding_window"] + + +@d.dedent +def sliding_window( + adata: AnnData | SpatialData, + library_key: str | None = None, + window_size: int | None = None, + overlap: int = 0, + coord_columns: tuple[str, str] = ("globalX", "globalY"), + sliding_window_key: str = "sliding_window_assignment", + spatial_key: str = "spatial", + drop_partial_windows: bool = False, + copy: bool = False, +) -> pd.DataFrame | None: + """ + Divide a tissue slice into regulary shaped spatially contiguous regions (windows). + + Parameters + ---------- + %(adata)s + window_size: int + Size of the sliding window. + %(library_key)s + coord_columns: Tuple[str, str] + Tuple of column names in `adata.obs` that specify the coordinates (x, y), e.i. ('globalX', 'globalY') + sliding_window_key: str + Base name for sliding window columns. + overlap: int + Overlap size between consecutive windows. (0 = no overlap) + %(spatial_key)s + drop_partial_windows: bool + If True, drop windows that are smaller than the window size at the borders. + copy: bool + If True, return the result, otherwise save it to the adata object. + + Returns + ------- + If ``copy = True``, returns the sliding window annotation(s) as pandas dataframe + Otherwise, stores the sliding window annotation(s) in .obs. + """ + if overlap < 0: + raise ValueError("Overlap must be non-negative.") + + if isinstance(adata, SpatialData): + adata = adata.table + + # we don't want to modify the original adata in case of copy=True + if copy: + adata = adata.copy() + + # extract coordinates of observations + x_col, y_col = coord_columns + if x_col in adata.obs and y_col in adata.obs: + coords = adata.obs[[x_col, y_col]].copy() + elif spatial_key in adata.obsm: + coords = pd.DataFrame( + adata.obsm[spatial_key][:, :2], + index=adata.obs.index, + columns=[x_col, y_col], + ) + else: + raise ValueError( + f"Coordinates not found. Provide `{coord_columns}` in `adata.obs` or specify a suitable `spatial_key` in `adata.obsm`." + ) + + # infer window size if not provided + if window_size is None: + coord_range = max( + coords[x_col].max() - coords[x_col].min(), + coords[y_col].max() - coords[y_col].min(), + ) + # mostly arbitrary choice, except that full integers usually generate windows with 1-2 cells at the borders + window_size = max(int(np.floor(coord_range // 3.95)), 1) + + if window_size <= 0: + raise ValueError("Window size must be larger than 0.") + + if library_key is not None and library_key not in adata.obs: + raise ValueError(f"Library key '{library_key}' not found in adata.obs") + + libraries = [None] if library_key is None else adata.obs[library_key].unique() + + # Create a DataFrame to store the sliding window assignments + sliding_window_df = pd.DataFrame(index=adata.obs.index) + + if sliding_window_key in adata.obs: + logg.warning(f"Overwriting existing column '{sliding_window_key}' in adata.obs.") + + for lib in libraries: + if lib is not None: + lib_mask = adata.obs[library_key] == lib + lib_coords = coords.loc[lib_mask] + else: + lib_mask = np.ones(len(adata), dtype=bool) + lib_coords = coords + + min_x, max_x = lib_coords[x_col].min(), lib_coords[x_col].max() + min_y, max_y = lib_coords[y_col].min(), lib_coords[y_col].max() + + # precalculate windows + windows = _calculate_window_corners( + min_x=min_x, + max_x=max_x, + min_y=min_y, + max_y=max_y, + window_size=window_size, + overlap=overlap, + drop_partial_windows=drop_partial_windows, + ) + + lib_key = f"{lib}_" if lib is not None else "" + + # assign observations to windows + for idx, window in windows.iterrows(): + x_start = window["x_start"] + x_end = window["x_end"] + y_start = window["y_start"] + y_end = window["y_end"] + + mask = ( + (lib_coords[x_col] >= x_start) + & (lib_coords[x_col] <= x_end) + & (lib_coords[y_col] >= y_start) + & (lib_coords[y_col] <= y_end) + ) + obs_indices = lib_coords.index[mask] + + if overlap == 0: + mask = ( + (lib_coords[x_col] >= x_start) + & (lib_coords[x_col] <= x_end) + & (lib_coords[y_col] >= y_start) + & (lib_coords[y_col] <= y_end) + ) + obs_indices = lib_coords.index[mask] + sliding_window_df.loc[obs_indices, sliding_window_key] = f"{lib_key}window_{idx}" + + else: + col_name = f"{sliding_window_key}_{lib_key}window_{idx}" + sliding_window_df.loc[obs_indices, col_name] = True + sliding_window_df.loc[:, col_name].fillna(False, inplace=True) + + if overlap == 0: + # create categorical variable for ordered windows + sliding_window_df[sliding_window_key] = pd.Categorical( + sliding_window_df[sliding_window_key], + ordered=True, + categories=sorted( + sliding_window_df[sliding_window_key].unique(), + key=lambda x: int(x.split("_")[-1]), + ), + ) + + sliding_window_df[x_col] = coords[x_col] + sliding_window_df[y_col] = coords[y_col] + + if copy: + return sliding_window_df + for col_name, col_data in sliding_window_df.items(): + _save_data(adata, attr="obs", key=col_name, data=col_data) + + +def _calculate_window_corners( + min_x: int, + max_x: int, + min_y: int, + max_y: int, + window_size: int, + overlap: int = 0, + drop_partial_windows: bool = False, +) -> pd.DataFrame: + """ + Calculate the corner points of all windows covering the area from min_x to max_x and min_y to max_y, + with specified window_size and overlap. + + Parameters + ---------- + min_x: float + minimum X coordinate + max_x: float + maximum X coordinate + min_y: float + minimum Y coordinate + max_y: float + maximum Y coordinate + window_size: float + size of each window + overlap: float + overlap between consecutive windows (must be less than window_size) + drop_partial_windows: bool + if True, drop border windows that are smaller than window_size; + if False, create smaller windows at the borders to cover the remaining space. + + Returns + ------- + windows: pandas DataFrame with columns ['x_start', 'x_end', 'y_start', 'y_end'] + """ + if overlap < 0: + raise ValueError("Overlap must be non-negative.") + if overlap >= window_size: + raise ValueError("Overlap must be less than the window size.") + + x_step = window_size - overlap + y_step = window_size - overlap + + # Generate starting points + x_starts = np.arange(min_x, max_x, x_step) + y_starts = np.arange(min_y, max_y, y_step) + + # Create all combinations of x and y starting points + starts = list(product(x_starts, y_starts)) + windows = pd.DataFrame(starts, columns=["x_start", "y_start"]) + windows["x_end"] = windows["x_start"] + window_size + windows["y_end"] = windows["y_start"] + window_size + + # Adjust windows that extend beyond the bounds + if not drop_partial_windows: + windows["x_end"] = windows["x_end"].clip(upper=max_x) + windows["y_end"] = windows["y_end"].clip(upper=max_y) + else: + valid_windows = (windows["x_end"] <= max_x) & (windows["y_end"] <= max_y) + windows = windows[valid_windows] + + windows = windows.reset_index(drop=True) + return windows[["x_start", "x_end", "y_start", "y_end"]] diff --git a/tests/tools/test_sliding_window.py b/tests/tools/test_sliding_window.py new file mode 100644 index 00000000..51bb99b9 --- /dev/null +++ b/tests/tools/test_sliding_window.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +from typing import Any, Literal + +import numpy as np +import pytest +from anndata import AnnData + +from squidpy.tl import _calculate_window_corners, sliding_window + + +class TestSlidingWindow: + @pytest.mark.parametrize( + "windowsize_overlap_drop", + [ + (300, 0, False), + (300, 50, False), + (300, 50, True), + ], + ) + def test_sliding_window_several_slices( + self, + adata_mibitof: AnnData, + windowsize_overlap_drop: tuple[int, int, bool], + sliding_window_key: str = "sliding_window_key", + library_key: str = "library_id", + ): + def _count_total_assignments(): + total_cells = 0 + for lib_key in ["point8", "point16", "point23"]: + cols_in_lib = df.columns[df.columns.str.contains(lib_key)] + for col in cols_in_lib: + total_cells += df[col].sum() + return total_cells + + window_size, overlap, drop_partial_windows = windowsize_overlap_drop + df = sliding_window( + adata_mibitof, + library_key=library_key, + window_size=window_size, + overlap=overlap, + coord_columns=("globalX", "globalY"), + sliding_window_key=sliding_window_key, + copy=True, + drop_partial_windows=drop_partial_windows, + ) + + if overlap == 0: + sliding_window_columns = [col for col in df.columns if sliding_window_key in col] + assert len(sliding_window_columns) == 1 # only one sliding window + assert df[sliding_window_key].isnull().sum() == 0 # no unassigned cells + assert len(df) == adata_mibitof.n_obs # correct amount of rows + else: + sliding_window_cols = df.columns[df.columns.str.contains("sliding_window")] + + if drop_partial_windows: + assert len(sliding_window_cols) == 27 + assert _count_total_assignments() == 2536 + else: + assert len(sliding_window_cols) == 70 + assert _count_total_assignments() == 4569 + + @pytest.mark.parametrize("overlap", [0, 2]) + def test_sliding_window_square_grid( + self, + adata_squaregrid: AnnData, + overlap: int, + sliding_window_key: str = "sliding_window_key", + window_size: int = 5, + ): + df = sliding_window( + adata_squaregrid, + window_size=window_size, + overlap=overlap, + coord_columns=("globalX", "globalY"), + sliding_window_key=sliding_window_key, + copy=True, + ) + + assert len(df) == adata_squaregrid.n_obs # correct amount of rows + + if overlap == 0: + sliding_window_columns = [col for col in df.columns if sliding_window_key in col] + assert len(sliding_window_columns) == 1 # only one sliding window + assert df[sliding_window_key].isnull().sum() == 0 # no unassigned cells + else: + for i in range(9): # we expect 9 windows + assert ( + f"{sliding_window_key}_window_{i}" in df.columns + ) # correct number of columns; multiple sliding windows + + def test_sliding_window_invalid_window_size( + self, + adata_squaregrid: AnnData, + ): + with pytest.raises(ValueError, match="Window size must be larger than 0."): + sliding_window( + adata_squaregrid, + window_size=-10, + overlap=0, + coord_columns=("globalX", "globalY"), + sliding_window_key="sliding_window", + copy=True, + ) + + with pytest.raises(ValueError, match="Overlap must be non-negative."): + sliding_window( + adata_squaregrid, + window_size=10, + overlap=-10, + coord_columns=("globalX", "globalY"), + sliding_window_key="sliding_window", + copy=True, + ) + + def test_calculate_window_corners_overlap(self): + min_x = 0 + max_x = 200 + min_y = 0 + max_y = 200 + window_size = 100 + overlap = 20 + + windows = _calculate_window_corners( + min_x=min_x, + max_x=max_x, + min_y=min_y, + max_y=max_y, + window_size=window_size, + overlap=overlap, + drop_partial_windows=False, + ) + + assert windows.shape == (9, 4) + assert windows.iloc[0].values.tolist() == [0, 100, 0, 100] + assert windows.iloc[-1].values.tolist() == [160, 200, 160, 200] + + def test_calculate_window_corners_no_overlap(self): + min_x = 0 + max_x = 200 + min_y = 0 + max_y = 200 + window_size = 100 + overlap = 0 + + windows = _calculate_window_corners( + min_x=min_x, + max_x=max_x, + min_y=min_y, + max_y=max_y, + window_size=window_size, + overlap=overlap, + drop_partial_windows=False, + ) + + assert windows.shape == (4, 4) + assert windows.iloc[0].values.tolist() == [0, 100, 0, 100] + assert windows.iloc[-1].values.tolist() == [100, 200, 100, 200] + + def test_calculate_window_corners_drop_partial_windows(self): + min_x = 0 + max_x = 200 + min_y = 0 + max_y = 200 + window_size = 100 + overlap = 20 + + windows = _calculate_window_corners( + min_x=min_x, + max_x=max_x, + min_y=min_y, + max_y=max_y, + window_size=window_size, + overlap=overlap, + drop_partial_windows=True, + ) + + assert windows.shape == (4, 4) + assert windows.iloc[0].values.tolist() == [0, 100, 0, 100] + assert windows.iloc[-1].values.tolist() == [80, 180, 80, 180] diff --git a/tox.ini b/tox.ini index c50f43b9..470c4ccf 100644 --- a/tox.ini +++ b/tox.ini @@ -79,7 +79,7 @@ setenv = linux: PYTEST_FLAGS=--test-napari passenv = TOXENV,CI,CODECOV_*,GITHUB_ACTIONS,PYTEST_FLAGS,DISPLAY,XAUTHORITY,MPLBACKEND usedevelop = true commands = - python -m pytest --cov --cov-append --cov-report=xml --cov-config={toxinidir}/tox.ini --ignore docs/ {posargs:-vv} {env:PYTEST_FLAGS:} + python -m pytest --color=yes --cov --cov-append --cov-report=xml --cov-config={toxinidir}/tox.ini --ignore docs/ {posargs:-vv} {env:PYTEST_FLAGS:} [testenv:covclean] description = Clean coverage files.