diff --git a/.github/workflows/python-ci-single.yml b/.github/workflows/python-ci-single.yml index cec049f173..88bfd28395 100644 --- a/.github/workflows/python-ci-single.yml +++ b/.github/workflows/python-ci-single.yml @@ -87,6 +87,9 @@ jobs: with: fetch-depth: 0 # ensure we get all tags to inform package version determination + - name: Set Up Test Data + run: make data + - name: Set up Python ${{ inputs.python_version }} uses: actions/setup-python@v5 with: diff --git a/.github/workflows/python-dependency-variation.yml b/.github/workflows/python-dependency-variation.yml index 59d9cf7b4f..3867d81759 100644 --- a/.github/workflows/python-dependency-variation.yml +++ b/.github/workflows/python-dependency-variation.yml @@ -113,11 +113,7 @@ jobs: - name: Obtain test data shell: bash - run: | - cd test - rm -rf soco - tar zxf soco.tgz - cd .. + run: make data - name: Run pytests for Python shell: bash diff --git a/Makefile b/Makefile index 34e1167efa..e8d4a76379 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,11 @@ ctest_update: .PHONY: data data: - cd test && rm -rf soco && tar zxf soco.tgz && cd .. 
+ @./scripts/prepare-test-data.sh + +.PHONY: clean_data +clean_data: + @./scripts/clean-test-data.sh # format # ------------------------------------------------------------------- diff --git a/apis/python/pyproject.toml b/apis/python/pyproject.toml index 2b6e0e08c4..d0529f130c 100644 --- a/apis/python/pyproject.toml +++ b/apis/python/pyproject.toml @@ -35,4 +35,5 @@ no-lines-before = ["tiledb"] [tool.pytest.ini_options] -filterwarnings = ['ignore:Support for spatial types is experimental'] +filterwarnings = ["ignore:Support for spatial types is experimental"] +markers = ["slow: mark test as slow"] diff --git a/apis/python/tests/_util.py b/apis/python/tests/_util.py index 391206a8cc..64e7edda0f 100644 --- a/apis/python/tests/_util.py +++ b/apis/python/tests/_util.py @@ -134,8 +134,11 @@ def make_pd_df(index_str: str | None = None, **cols) -> pd.DataFrame: HERE = Path(__file__).parent PY_ROOT = HERE.parent +PROJECT_ROOT = PY_ROOT.parent.parent TESTDATA = PY_ROOT / "testdata" +ROOT_DATA_DIR = PROJECT_ROOT / "data" + @contextmanager def raises_no_typeguard(exc: Type[Exception], *args: Any, **kwargs: Any): diff --git a/apis/python/tests/test_aaa_setup.py b/apis/python/tests/test_aaa_setup.py index b8e2a63ad4..4de49444f0 100644 --- a/apis/python/tests/test_aaa_setup.py +++ b/apis/python/tests/test_aaa_setup.py @@ -3,7 +3,7 @@ import os TEST_DIR = os.path.dirname(__file__) -SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed" +SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed" if not os.path.exists(SOMA_URI): raise RuntimeError("Please run `make data` in the repo base directory") diff --git a/apis/python/tests/test_from_visium.py b/apis/python/tests/test_from_visium.py new file mode 100644 index 0000000000..1ac1bf6151 --- /dev/null +++ b/apis/python/tests/test_from_visium.py @@ -0,0 +1,112 @@ +import os + +import numpy as np +import pytest + +import tiledbsoma as soma + +from ._util import ROOT_DATA_DIR + +spatial_io = 
pytest.importorskip("tiledbsoma.io.spatial") + + +@pytest.fixture(scope="module") +def visium_v2_path(): + """Fixture that checks the example Visium v2 dataset exists.""" + visium_path = ROOT_DATA_DIR / "example-visium-v2" + if not os.path.isdir(visium_path): + raise RuntimeError( + "Missing 'data/example-visium-v2' directory. Try running `make data` " + "from the TileDB-SOMA project root directory." + ) + for filename in [ + "filtered_feature_bc_matrix.h5", + "raw_feature_bc_matrix.h5", + "spatial/tissue_positions.csv", + "spatial/scalefactors_json.json", + "spatial/tissue_hires_image.png", + "spatial/tissue_lowres_image.png", + ]: + if not os.path.isfile(visium_path / filename): + raise RuntimeError( + f"Missing file 'data/example-visium-v2/{filename}'. Try removing " + f"the directory 'data/example-visium-v2' and re-running `make data` " + f"from the project root directory." + ) + + return visium_path + + +def test_visium_paths_v2(visium_v2_path): + """Test ``VisiumPaths`` for Visium v2 in standard structure.""" + visium_paths = spatial_io.VisiumPaths.from_base_folder(visium_v2_path) + assert os.path.isfile(visium_paths.gene_expression) + assert os.path.isfile(visium_paths.tissue_positions) + assert visium_paths.fullres_image is None + assert os.path.isfile(visium_paths.hires_image) + assert os.path.isfile(visium_paths.lowres_image) + assert visium_paths.version == (2, 0, 0) + assert visium_paths.has_image + assert visium_paths.major_version == 2 + + +@pytest.mark.slow +def test_from_visium_for_visium_v2(tmp_path, visium_v2_path): + """Test `from_visium` runs without error.""" + PIL = pytest.importorskip("PIL") + uri = f"{tmp_path.as_uri()}/from_visium_for_visium_v2" + exp_uri = spatial_io.from_visium( + uri, + visium_v2_path, + "RNA", + "fresh_frozen_mouse_brain", + write_obs_spatial_presence=True, + write_var_spatial_presence=True, + ) + with soma.Experiment.open(exp_uri) as exp: + + # Check for the existence of obs, RNA/X, and RNA/var + assert
isinstance(exp.obs, soma.DataFrame) + assert isinstance(exp.ms["RNA"].X["data"], soma.SparseNDArray) + assert isinstance(exp.ms["RNA"].var, soma.DataFrame) + + # Check for the existence of the presence matrices. + assert isinstance(exp.obs_spatial_presence, soma.DataFrame) + assert isinstance(exp.ms["RNA"].var_spatial_presence, soma.DataFrame) + + # Check for scene. + assert isinstance(exp.spatial["fresh_frozen_mouse_brain"], soma.Scene) + + # Check expected datatypes in scene. + scene = exp.spatial["fresh_frozen_mouse_brain"] + assert isinstance(scene.obsl["loc"], soma.PointCloudDataFrame) + assert len(scene.varl.items()) == 0 + assert isinstance(scene.img["tissue"], soma.MultiscaleImage) + + # Check point cloud dataframe data. + output_points_df = scene.obsl["loc"].read().concat().to_pandas() + assert output_points_df.columns.tolist() == [ + "x", + "y", + "soma_joinid", + "in_tissue", + "array_row", + "array_col", + "spot_diameter_fullres", + ] + assert len(output_points_df) == 2797 + + # Check image.
+ image = scene.img["tissue"] + hires_data = np.moveaxis(image["hires"].read().to_numpy(), 0, -1) + with PIL.Image.open( + visium_v2_path / "spatial" / "tissue_hires_image.png" + ) as input_hires: + expected = np.array(input_hires) + np.testing.assert_equal(expected, hires_data) + lowres_data = np.moveaxis(image["lowres"].read().to_numpy(), 0, -1) + with PIL.Image.open( + visium_v2_path / "spatial" / "tissue_lowres_image.png" + ) as input_lowres: + expected = np.array(input_lowres) + np.testing.assert_equal(expected, lowres_data) diff --git a/apis/python/tests/test_query_condition.py b/apis/python/tests/test_query_condition.py index c2be25dc19..0bf4522627 100644 --- a/apis/python/tests/test_query_condition.py +++ b/apis/python/tests/test_query_condition.py @@ -12,7 +12,7 @@ VERBOSE = False TEST_DIR = os.path.dirname(__file__) -SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed" +SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed" if VERBOSE: clib.config_logging("debug") diff --git a/apis/python/tests/test_soma_array.py b/apis/python/tests/test_soma_array.py index f08b3b093c..3e09ce0a4d 100644 --- a/apis/python/tests/test_soma_array.py +++ b/apis/python/tests/test_soma_array.py @@ -10,7 +10,7 @@ VERBOSE = False TEST_DIR = os.path.dirname(__file__) -SOMA_URI = f"{TEST_DIR}/../../../test/soco/pbmc3k_processed" +SOMA_URI = f"{TEST_DIR}/../../../data/soco/pbmc3k_processed" if VERBOSE: clib.config_logging("debug") diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000000..201b09c4bb --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,8 @@ +# Ignore everything by default. +* + +# Explicitly track the following files and directories. +!.gitignore +!README.md +!10x-pbmc-multiome-v1.0 +!pbmc3k_processed.h5ad diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000000..b58e2cf831 --- /dev/null +++ b/data/README.md @@ -0,0 +1,34 @@ +# TileDB-SOMA Data + +This folder contains data for use in tests and examples. 
+ +## How to add new data + +### Check new data and upload to GitHub for hosting + +Temporarily add a copy of your data to this directory in the way you will eventually want to use it. Create your new tests or examples and check that the data serves your purpose. + +Once you have verified the data works as intended, add it to a [TileDB-SOMA-Test-Data](https://github.com/single-cell-data/TileDB-SOMA-Test-Data) release as described in that project's [README](https://github.com/single-cell-data/TileDB-SOMA-Test-Data/blob/main/README.md). + +### Update `make data` to include new dataset + +The Makefile `data` target calls [scripts/prepare-test-data.sh](../scripts/prepare-test-data.sh). You will need to update the script to include a section to prepare your new data. Your update must do the following: + +1. Create a new file or folder in the `data/` directory (this directory) that includes the desired data if the data has not already been prepared. +2. Gracefully skip preparing data that has already been added. + +If you are including data from TileDB-SOMA-Test-Data, the easiest way to access the data is to download all necessary files to a new directory using "wget". Extract any data that is compressed. + +### Update `make clean_data` to remove new dataset + +The Makefile `clean_data` target calls [scripts/clean-test-data.sh](../scripts/clean-test-data.sh). You will need to update the script to include a section that removes your new data. + +### Checks + +Before committing your changes to `scripts/prepare-test-data.sh` and `scripts/clean-test-data.sh`, verify the following: + +1. The command `make data` fully prepares your data so that it can be easily accessed in your new test and/or example. +2. The command `make clean_data` fully removes your data. +3. The data is being ignored by Git. + +It is a good idea to include your new test or example in the same PR that adds the new data to help ensure you can check it has been successfully prepared.
diff --git a/libtiledbsoma/src/cli/cli.cc b/libtiledbsoma/src/cli/cli.cc index 4c49462381..5850074cf2 100644 --- a/libtiledbsoma/src/cli/cli.cc +++ b/libtiledbsoma/src/cli/cli.cc @@ -96,7 +96,7 @@ int main(int argc, char** argv) { if (argc < 2) { printf("Run with CI test SOMA:\n\n"); - printf(" %s test/soco/pbmc3k_processed\n", argv[0]); + printf(" %s data/soco/pbmc3k_processed\n", argv[0]); return 0; } diff --git a/scripts/clean-test-data.sh b/scripts/clean-test-data.sh new file mode 100755 index 0000000000..8a47e6b888 --- /dev/null +++ b/scripts/clean-test-data.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# A script to remove data generated by `prepare-test-data.sh`. Make sure to update +# this script after adding new data to `prepare-test-data.sh`. +# +# See ../data/README.md for instructions on updating this script with new data. +# + +set -euo pipefail + +# Change directory to the `data` folder. +cd "$(dirname "$0")/../data" + +# Remove prepared test data. +rm -rf soco +rm -rf example-visium-v2 diff --git a/scripts/prepare-test-data.sh b/scripts/prepare-test-data.sh new file mode 100755 index 0000000000..6cf1955af0 --- /dev/null +++ b/scripts/prepare-test-data.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# +# A script to download and extract test data. Skips data that is already present in the +# directory. Make sure to update `clean-test-data.sh` after modifying this script. +# +# See ../data/README.md for instructions on updating this script with new data. +# + +set -euo pipefail + +echo "Begin preparing data." + +# Change directory to the `data` folder. +cd "$(dirname "$0")/../data" + + +# Extract soco dataset. +if [ -d soco ]; then + echo "-- Skipping dataset 'data/soco'; directory 'data/soco' already exists." +else + echo "-- Preparing dataset 'data/soco' ..." + tar zxf ../test/soco.tgz + echo " ... finished preparing dataset 'data/soco'." +fi + + +# Download and extract Visium v2 dataset.
+if [ -d example-visium-v2 ]; then + echo "-- Skipping dataset 'data/example-visium-v2'; directory 'data/example-visium-v2' already exists." +else + echo "-- Preparing dataset 'data/example-visium-v2' ..." + mkdir example-visium-v2 && cd example-visium-v2 + wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/filtered_feature_bc_matrix.h5 + wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/raw_feature_bc_matrix.h5 + wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/spatial.tar.gz + tar zxf spatial.tar.gz + cd .. + echo " ... finished preparing dataset 'data/example-visium-v2'." +fi