Commit
[python] Add and use new Visium datasets (#3712)
* Remove cleaning steps from Makefile target `data` and create new target `clean_data` that handles removing existing test data.
* Add scripts `scripts/prepare-test-data.sh` and `scripts/clean-test-data.sh` for preparing and cleaning data.
* Add Visium v2 dataset to the `data` and `clean_data` targets.
* Add the first `from_visium` test case, using the new Visium v2 test data.
* Add `make data` to appropriate GitHub CI actions.

---------

Co-authored-by: John Kerl <[email protected]>
jp-dark and johnkerl authored Feb 25, 2025
1 parent 47ad4c2 commit 967fc04
Showing 14 changed files with 227 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/python-ci-single.yml
@@ -87,6 +87,9 @@ jobs:
with:
fetch-depth: 0 # ensure we get all tags to inform package version determination

- name: Set Up Test Data
run: make data

- name: Set up Python ${{ inputs.python_version }}
uses: actions/setup-python@v5
with:
6 changes: 1 addition & 5 deletions .github/workflows/python-dependency-variation.yml
@@ -113,11 +113,7 @@ jobs:

- name: Obtain test data
shell: bash
run: |
cd test
rm -rf soco
tar zxf soco.tgz
cd ..
run: make data

- name: Run pytests for Python
shell: bash
6 changes: 5 additions & 1 deletion Makefile
@@ -46,7 +46,11 @@ ctest_update:

.PHONY: data
data:
cd test && rm -rf soco && tar zxf soco.tgz && cd ..
@./scripts/prepare-test-data.sh

.PHONY: clean_data
clean_data:
@./scripts/clean-test-data.sh

# format
# -------------------------------------------------------------------
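For context, both new targets are meant to be run from the repository root; a minimal usage sketch based on the targets above:

```bash
# From the TileDB-SOMA repository root: prepare the shared test data,
# or remove it again when it is no longer needed.
make data        # runs scripts/prepare-test-data.sh
make clean_data  # runs scripts/clean-test-data.sh
```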
3 changes: 2 additions & 1 deletion apis/python/pyproject.toml
@@ -35,4 +35,5 @@ no-lines-before = ["tiledb"]


[tool.pytest.ini_options]
filterwarnings = ['ignore:Support for spatial types is experimental']
filterwarnings = ["ignore:Support for spatial types is experimental"]
markers = ["slow: mark test as slow"]
3 changes: 3 additions & 0 deletions apis/python/tests/_util.py
@@ -134,8 +134,11 @@ def make_pd_df(index_str: str | None = None, **cols) -> pd.DataFrame:

HERE = Path(__file__).parent
PY_ROOT = HERE.parent
PROJECT_ROOT = PY_ROOT.parent.parent
TESTDATA = PY_ROOT / "testdata"

ROOT_DATA_DIR = PROJECT_ROOT / "data"


@contextmanager
def raises_no_typeguard(exc: Type[Exception], *args: Any, **kwargs: Any):
2 changes: 1 addition & 1 deletion apis/python/tests/test_aaa_setup.py
@@ -3,7 +3,7 @@
import os

TEST_DIR = os.path.dirname(__file__)
SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed"
SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed"

if not os.path.exists(SOMA_URI):
raise RuntimeError("Please run `make data` in the repo base directory")
112 changes: 112 additions & 0 deletions apis/python/tests/test_from_visium.py
@@ -0,0 +1,112 @@
import os

import numpy as np
import pytest

import tiledbsoma as soma

from ._util import ROOT_DATA_DIR

spatial_io = pytest.importorskip("tiledbsoma.io.spatial")


@pytest.fixture(scope="module")
def visium_v2_path():
"""Fixture that checks the example Visium v2 dataset exists."""
visium_path = ROOT_DATA_DIR / "example-visium-v2"
if not os.path.isdir(visium_path):
raise RuntimeError(
"Missing 'data/example-visium-v2' directory. Try running `make data` "
"from the TileDB-SOMA project root directory."
)
for filename in [
"filtered_feature_bc_matrix.h5",
"raw_feature_bc_matrix.h5",
"spatial/tissue_positions.csv",
"spatial/scalefactors_json.json",
"spatial/tissue_hires_image.png",
"spatial/tissue_lowres_image.png",
]:
if not os.path.isfile(visium_path / filename):
raise RuntimeError(
f"Missing file 'data/example-visium-v2/{filename}'. Try removing "
f"the directory 'data/example-visium-v2' and re-running `make data'"
f"from the project root directory."
)

return visium_path


def test_visium_paths_v2(visium_v2_path):
"""Test ``VisiumPaths`` for Visium v2 in standard structure."""
visium_paths = spatial_io.VisiumPaths.from_base_folder(visium_v2_path)
assert os.path.isfile(visium_paths.gene_expression)
assert os.path.isfile(visium_paths.tissue_positions)
assert visium_paths.fullres_image is None
assert os.path.isfile(visium_paths.hires_image)
assert os.path.isfile(visium_paths.lowres_image)
assert visium_paths.version == (2, 0, 0)
assert visium_paths.has_image
assert visium_paths.major_version == 2


@pytest.mark.slow
def test_from_visium_for_visium_v2(tmp_path, visium_v2_path):
"""Test `from_visium` runs without error."""
PIL = pytest.importorskip("PIL")
uri = f"{tmp_path.as_uri()}/from_visium_for_visium_v2"
exp_uri = spatial_io.from_visium(
uri,
visium_v2_path,
"RNA",
"fresh_frozen_mouse_brain",
write_obs_spatial_presence=True,
write_var_spatial_presence=True,
)
with soma.Experiment.open(exp_uri) as exp:

# Check for the existence of obs, RNA/X, and RNA/var.
assert isinstance(exp.obs, soma.DataFrame)
assert isinstance(exp.ms["RNA"].X["data"], soma.SparseNDArray)
assert isinstance(exp.ms["RNA"].var, soma.DataFrame)

# Check for the existence of the presence matrices.
assert isinstance(exp.obs_spatial_presence, soma.DataFrame)
assert isinstance(exp.ms["RNA"].var_spatial_presence, soma.DataFrame)

# Check for scene.
assert isinstance(exp.spatial["fresh_frozen_mouse_brain"], soma.Scene)

# Check expected datatypes in scene.
scene = exp.spatial["fresh_frozen_mouse_brain"]
assert isinstance(scene.obsl["loc"], soma.PointCloudDataFrame)
assert len(scene.varl.items()) == 0
assert isinstance(scene.img["tissue"], soma.MultiscaleImage)

# Check point cloud dataframe data.
output_points_df = scene.obsl["loc"].read().concat().to_pandas()
assert output_points_df.columns.tolist() == [
"x",
"y",
"soma_joinid",
"in_tissue",
"array_row",
"array_col",
"spot_diameter_fullres",
]
assert len(output_points_df) == 2797

# Check image.
image = scene.img["tissue"]
hires_data = np.moveaxis(image["hires"].read().to_numpy(), 0, -1)
with PIL.Image.open(
visium_v2_path / "spatial" / "tissue_hires_image.png"
) as input_hires:
expected = np.array(input_hires)
np.testing.assert_equal(expected, hires_data)
lowres_data = np.moveaxis(image["lowres"].read().to_numpy(), 0, -1)
with PIL.Image.open(
visium_v2_path / "spatial" / "tissue_lowres_image.png"
) as input_lowres:
expected = np.array(input_lowres)
np.testing.assert_equal(expected, lowres_data)
2 changes: 1 addition & 1 deletion apis/python/tests/test_query_condition.py
@@ -12,7 +12,7 @@
VERBOSE = False

TEST_DIR = os.path.dirname(__file__)
SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed"
SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed"

if VERBOSE:
clib.config_logging("debug")
2 changes: 1 addition & 1 deletion apis/python/tests/test_soma_array.py
@@ -10,7 +10,7 @@
VERBOSE = False

TEST_DIR = os.path.dirname(__file__)
SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed"
SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed"

if VERBOSE:
clib.config_logging("debug")
8 changes: 8 additions & 0 deletions data/.gitignore
@@ -0,0 +1,8 @@
# Ignore everything by default.
*

# Explicitly track the following files and directories.
!.gitignore
!README.md
!10x-pbmc-multiome-v1.0
!pbmc3k_processed.h5ad
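This is an ignore-everything, allow-list pattern: any dataset placed in `data/` stays untracked unless explicitly listed. One way to confirm a prepared dataset is ignored, using the dataset added in this PR:

```bash
# Should report a match against the catch-all "*" rule in data/.gitignore.
git check-ignore -v data/example-visium-v2
```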
34 changes: 34 additions & 0 deletions data/README.md
@@ -0,0 +1,34 @@
# TileDB-SOMA Data

This folder contains data for use in tests and examples.

## How to add new data

### Check new data and upload to GitHub for hosting

Temporarily add a copy of your data to this directory in the way you will eventually want to use it. Create your new tests or examples and check that the data serves your purpose.

Once you have verified the data works as intended, add it to a [TileDB-SOMA-Test-Data](https://github.com/single-cell-data/TileDB-SOMA-Test-Data) release as described in that project's [README](https://github.com/single-cell-data/TileDB-SOMA-Test-Data/blob/main/README.md).
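One possible way to attach files to an existing TileDB-SOMA-Test-Data release is the GitHub CLI; the release tag and archive name below are placeholders, and that repository's README remains the authoritative process:

```bash
# Hypothetical upload of a new dataset archive to an existing release tag.
gh release upload dataset-YYYY-MM-DD example-new-dataset.tar.gz \
    --repo single-cell-data/TileDB-SOMA-Test-Data
```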

### Update `make data` to include new dataset

The Makefile `data` target calls [scripts/prepare-test-data.sh](../scripts/prepare-test-data.sh). You will need to update the script with a section that prepares your new data. Your update must do the following:

1. Create a new file or folder in the `data/` directory (this directory) containing the desired data, if it has not already been prepared.
2. Gracefully skip preparing data that has already been added.

If you are including data from TileDB-SOMA-Test-Data, the easiest way to access the data is to download all necessary files to a new directory using `wget`, then extract any data that is compressed; a sketch of such a section follows.
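A minimal sketch, modeled on the Visium v2 block in `scripts/prepare-test-data.sh`; the dataset name and release tag are placeholders:

```bash
# Hypothetical new section for scripts/prepare-test-data.sh
# (the script has already changed its working directory to data/).
if [ -d example-new-dataset ]; then
    echo "-- Skipping dataset 'data/example-new-dataset'; directory already exists."
else
    echo "-- Preparing dataset 'data/example-new-dataset' ..."
    mkdir example-new-dataset && cd example-new-dataset
    wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/<release-tag>/example-new-dataset.tar.gz
    tar zxf example-new-dataset.tar.gz
    cd ..
    echo "   ... finished preparing dataset 'data/example-new-dataset'."
fi
```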

### Update `make clean_data` to remove new dataset

The Makefile `clean_data` target calls [scripts/clean-test-data.sh](../scripts/clean-test-data.sh). You will need to update the script to include a section that removes your new data.
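The matching removal in `scripts/clean-test-data.sh` can be a single line, using the same placeholder name:

```bash
# Remove the hypothetical dataset prepared above (the script already cd's into data/).
rm -rf example-new-dataset
```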

### Checks

Before committing your changes to `scripts/prepare-test-data.sh`, verify the following (a command sketch appears after this section):

1. The command `make data` fully prepares your data so that it can be easily accessed in your new test and/or example.
2. The command `make clean_data` fully removes your data.
3. The data is being ignored by Git.

It is a good idea to include your new test or example in the same PR that adds the new data, so you can check that it has been successfully prepared.
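A rough sketch of the checks above, run from the repository root with the same placeholder dataset name:

```bash
# (1) data is fully prepared, (2) data is fully removed,
# (3) the re-prepared data is ignored by Git.
make data
make clean_data
make data
git check-ignore -v data/example-new-dataset
```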
2 changes: 1 addition & 1 deletion libtiledbsoma/src/cli/cli.cc
@@ -96,7 +96,7 @@ int main(int argc, char** argv) {

if (argc < 2) {
printf("Run with CI test SOMA:\n\n");
printf(" %s test/soco/pbmc3k_processed\n", argv[0]);
printf(" %s data/soco/pbmc3k_processed\n", argv[0]);
return 0;
}

16 changes: 16 additions & 0 deletions scripts/clean-test-data.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# A script to remove data generated by `prepare-test-data.sh`. Make sure to update
# this script after adding new data to `prepare-test-data.sh`.
#
# See ../data/README.md for instructions on updating this script with new data.
#

set -euo pipefail

# Change directory to the `data` folder.
cd "$(dirname "$0")/../data"

# Remove prepared test data.
rm -rf soco
rm -rf example-visium-v2
39 changes: 39 additions & 0 deletions scripts/prepare-test-data.sh
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
#
# A script to download and extract test data. Skips data that is already present in the
# directory. Make sure to update `clean-test-data.sh` after modifying this script.
#
# See ../data/README.md for instructions on updating this script with new data.
#

set -euo pipefail

echo "Begin preparing data."

# Change directory to the `data` folder.
cd "$(dirname "$0")/../data"


# Extract the soco dataset.
if [ -d soco ]; then
echo "-- Skipping dataset 'data/soco'; directory 'data/soco' already exists."
else
echo "-- Preparing dataset 'data/soco' ..."
tar zxf ../test/soco.tgz
echo " ... finished preparing 'test/soco.tgz'."
fi


# Download and extract Visium v2 dataset.
if [ -d example-visium-v2 ]; then
echo "-- Skipping dataset 'data/example-visium-vs'; directory 'data/example-visium-v2' already exists."
else
echo "-- Preparing dataset 'data/example-visium-v2' ..."
mkdir example-visium-v2 && cd example-visium-v2
wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/filtered_feature_bc_matrix.h5
wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/raw_feature_bc_matrix.h5
wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/spatial.tar.gz
tar zxf spatial.tar.gz
cd ..
echo " ... finished preparing dataset 'data/example-visium-v2'."
fi
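Because each dataset block guards on an existing directory, the script is intended to be idempotent; a quick sanity check of that assumed behavior:

```bash
# A second invocation should only print "-- Skipping ..." lines and download nothing.
./scripts/prepare-test-data.sh
./scripts/prepare-test-data.sh
```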
