[python] Add and use new Visium datasets (#3712)
* Remove cleaning steps from the Makefile target `data` and create a new target `clean_data` that handles removing existing test data.
* Add scripts `scripts/prepare-test-data.sh` and `scripts/clean-test-data.sh` for preparing and cleaning data.
* Add the Visium v2 dataset to the `data` and `clean_data` targets.
* Add a first `from_visium` test case that uses the Visium v2 test data.
* Add `make data` to the appropriate GitHub CI actions.

---------

Co-authored-by: John Kerl <[email protected]>
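A minimal usage sketch of the new targets, assuming they simply wrap the two new scripts and are run from the TileDB-SOMA project root:

```bash
make data        # runs scripts/prepare-test-data.sh to download/extract the test datasets into data/
make clean_data  # runs scripts/clean-test-data.sh to remove the prepared test data
```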
Showing 14 changed files with 227 additions and 11 deletions.
@@ -0,0 +1,112 @@
import os

import numpy as np
import pytest

import tiledbsoma as soma

from ._util import ROOT_DATA_DIR

spatial_io = pytest.importorskip("tiledbsoma.io.spatial")


@pytest.fixture(scope="module")
def visium_v2_path():
    """Fixture that checks the example Visium v2 dataset exists."""
    visium_path = ROOT_DATA_DIR / "example-visium-v2"
    if not os.path.isdir(visium_path):
        raise RuntimeError(
            "Missing 'data/example-visium-v2' directory. Try running `make data` "
            "from the TileDB-SOMA project root directory."
        )
    for filename in [
        "filtered_feature_bc_matrix.h5",
        "raw_feature_bc_matrix.h5",
        "spatial/tissue_positions.csv",
        "spatial/scalefactors_json.json",
        "spatial/tissue_hires_image.png",
        "spatial/tissue_lowres_image.png",
    ]:
        if not os.path.isfile(visium_path / filename):
            raise RuntimeError(
                f"Missing file 'data/example-visium-v2/{filename}'. Try removing "
                f"the directory 'data/example-visium-v2' and re-running `make data` "
                f"from the project root directory."
            )

    return visium_path


def test_visium_paths_v2(visium_v2_path):
    """Test ``VisiumPaths`` for Visium v2 in standard structure."""
    visium_paths = spatial_io.VisiumPaths.from_base_folder(visium_v2_path)
    assert os.path.isfile(visium_paths.gene_expression)
    assert os.path.isfile(visium_paths.tissue_positions)
    assert visium_paths.fullres_image is None
    assert os.path.isfile(visium_paths.hires_image)
    assert os.path.isfile(visium_paths.lowres_image)
    assert visium_paths.version == (2, 0, 0)
    assert visium_paths.has_image
    assert visium_paths.major_version == 2


@pytest.mark.slow
def test_from_visium_for_visium_v2(tmp_path, visium_v2_path):
    """Test `from_visium` runs without error."""
    PIL = pytest.importorskip("PIL")
    uri = f"{tmp_path.as_uri()}/from_visium_for_visium_v2"
    exp_uri = spatial_io.from_visium(
        uri,
        visium_v2_path,
        "RNA",
        "fresh_frozen_mouse_brain",
        write_obs_spatial_presence=True,
        write_var_spatial_presence=True,
    )
    with soma.Experiment.open(exp_uri) as exp:

        # Check for the existence of obs, RNA/X, and RNA/var.
        assert isinstance(exp.obs, soma.DataFrame)
        assert isinstance(exp.ms["RNA"].X["data"], soma.SparseNDArray)
        assert isinstance(exp.ms["RNA"].var, soma.DataFrame)

        # Check for the existence of the presence matrices.
        assert isinstance(exp.obs_spatial_presence, soma.DataFrame)
        assert isinstance(exp.ms["RNA"].var_spatial_presence, soma.DataFrame)

        # Check for the scene.
        assert isinstance(exp.spatial["fresh_frozen_mouse_brain"], soma.Scene)

        # Check expected datatypes in the scene.
        scene = exp.spatial["fresh_frozen_mouse_brain"]
        assert isinstance(scene.obsl["loc"], soma.PointCloudDataFrame)
        assert len(scene.varl.items()) == 0
        assert isinstance(scene.img["tissue"], soma.MultiscaleImage)

        # Check point cloud dataframe data.
        output_points_df = scene.obsl["loc"].read().concat().to_pandas()
        assert output_points_df.columns.tolist() == [
            "x",
            "y",
            "soma_joinid",
            "in_tissue",
            "array_row",
            "array_col",
            "spot_diameter_fullres",
        ]
        assert len(output_points_df) == 2797

        # Check the images against the original PNG files.
        image = scene.img["tissue"]
        hires_data = np.moveaxis(image["hires"].read().to_numpy(), 0, -1)
        with PIL.Image.open(
            visium_v2_path / "spatial" / "tissue_hires_image.png"
        ) as input_hires:
            expected = np.array(input_hires)
        np.testing.assert_equal(expected, hires_data)
        lowres_data = np.moveaxis(image["lowres"].read().to_numpy(), 0, -1)
        with PIL.Image.open(
            visium_v2_path / "spatial" / "tissue_lowres_image.png"
        ) as input_lowres:
            expected = np.array(input_lowres)
        np.testing.assert_equal(expected, lowres_data)
@@ -0,0 +1,8 @@
# Ignore everything by default.
*

# Explicitly track the following files and directories.
!.gitignore
!README.md
!10x-pbmc-multiome-v1.0
!pbmc3k_processed.h5ad
@@ -0,0 +1,34 @@
# TileDB-SOMA Data

This folder contains data for use in tests and examples.

## How to add new data

### Check new data and upload to GitHub for hosting

Temporarily add a copy of your data to this directory in the way you will eventually want to use it. Create your new tests or examples and check that the data serves your purpose.

Once you have verified the data works as intended, add it to a [TileDB-SOMA-Test-Data](https://github.com/single-cell-data/TileDB-SOMA-Test-Data) release as described in that project's [README](https://github.com/single-cell-data/TileDB-SOMA-Test-Data/blob/main/README.md).

### Update `make data` to include new dataset

The Makefile `data` target calls [scripts/prepare-test-data.sh](../scripts/prepare-test-data.sh). You will need to update the script to include a section that prepares your new data. Your update must do the following:

1. Create a new file or folder in the `data/` directory (this directory) containing the desired data, if the data has not already been prepared.
2. Gracefully skip preparing data that has already been added.

If you are including data from TileDB-SOMA-Test-Data, the easiest way to access the data is to download all necessary files to a new directory using `wget`, then extract any data that is compressed.
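For example, a new section of `scripts/prepare-test-data.sh` for a hypothetical dataset named `example-new-dataset` (the dataset name and release tag below are placeholders, not real assets) might look like this minimal sketch:

```bash
# Download and extract the (hypothetical) example-new-dataset.
if [ -d example-new-dataset ]; then
    echo "-- Skipping dataset 'data/example-new-dataset'; directory already exists."
else
    echo "-- Preparing dataset 'data/example-new-dataset' ..."
    mkdir example-new-dataset && cd example-new-dataset
    wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/<release-tag>/example-new-dataset.tar.gz
    tar zxf example-new-dataset.tar.gz
    cd ..
    echo "   ... finished preparing dataset 'data/example-new-dataset'."
fi
```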
### Update `make clean_data` to remove new dataset

The Makefile `clean_data` target calls [scripts/clean-test-data.sh](../scripts/clean-test-data.sh). You will need to update the script to include a section that removes your new data.
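Continuing the hypothetical `example-new-dataset` example, the matching entry in `scripts/clean-test-data.sh` would typically be a single removal line:

```bash
# Remove the (hypothetical) example-new-dataset prepared by prepare-test-data.sh.
rm -rf example-new-dataset
```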
### Checks

Before committing your changes to `scripts/prepare-test-data.sh`, verify the following (example commands for these checks are sketched after the list):

1. The command `make data` fully prepares your data so that it can be easily accessed in your new test and/or example.
2. The command `make clean_data` fully removes your data.
3. The data is being ignored by Git.
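A minimal sketch of running these checks locally, reusing the hypothetical `example-new-dataset` name from above:

```bash
make data                        # should create data/example-new-dataset
ls data/example-new-dataset      # confirm the files your test needs are present
git status --porcelain data/     # should print nothing: the new data must be ignored by Git
make clean_data                  # should remove data/example-new-dataset
```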
It is a good idea to include your new test or example in the same PR that adds the new data, so you can confirm the data has been successfully prepared.
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# A script to remove data generated by `prepare-test-data.sh`. Make sure to update
# this script after adding new data to `prepare-test-data.sh`.
#
# See ../data/README.md for instructions on updating this script with new data.
#

set -euo pipefail

# Change directory to the `data` folder.
cd "$(dirname "$0")/../data"

# Remove prepared test data.
rm -rf soco
rm -rf example-visium-v2
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
#
# A script to download and extract test data. Skips data that is already present in the
# directory. Make sure to update `clean-test-data.sh` after modifying this script.
#
# See ../data/README.md for instructions on updating this script with new data.
#

set -euo pipefail

echo "Begin preparing data."

# Change directory to the `data` folder.
cd "$(dirname "$0")/../data"


# Extract the soco dataset.
if [ -d soco ]; then
    echo "-- Skipping dataset 'data/soco'; directory 'data/soco' already exists."
else
    echo "-- Preparing dataset 'data/soco' ..."
    tar zxf ../test/soco.tgz
    echo "   ... finished preparing dataset 'data/soco'."
fi


# Download and extract the Visium v2 dataset.
if [ -d example-visium-v2 ]; then
    echo "-- Skipping dataset 'data/example-visium-v2'; directory 'data/example-visium-v2' already exists."
else
    echo "-- Preparing dataset 'data/example-visium-v2' ..."
    mkdir example-visium-v2 && cd example-visium-v2
    wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/filtered_feature_bc_matrix.h5
    wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/raw_feature_bc_matrix.h5
    wget https://github.com/single-cell-data/TileDB-SOMA-Test-Data/releases/download/dataset-2025-02-19/spatial.tar.gz
    tar zxf spatial.tar.gz
    cd ..
    echo "   ... finished preparing dataset 'data/example-visium-v2'."
fi