Cell type annotation: scGPT workflow (#832)
Co-authored-by: DriesSchaumont <[email protected]>
Co-authored-by: Sarah <[email protected]>
Co-authored-by: Vladimir Shitov <[email protected]>
Co-authored-by: Jakub Majercik <[email protected]>
Co-authored-by: Robrecht Cannoodt <[email protected]>
6 people authored Dec 5, 2024
1 parent 963ee42 commit e11f4fd
Showing 19 changed files with 857 additions and 323 deletions.
18 changes: 17 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,25 @@
# openpipelines x.x.x

# MINOR CHANGES
## BREAKING CHANGES

* Several components under `src/scgpt` (`cross_check_genes`, `pad_tokenize`, `binning`) now process the input (query) datasets differently. Instead of subsetting datasets based on genes in the model vocabulary and/or highly variable genes, these components require an input .var column with a boolean mask specifying this information. The results are written back to the original input data, preserving the dataset structure (PR #832). A minimal sketch of the mask convention follows this change list.

## NEW FUNCTIONALITY

* `scgpt/cell_type_annotation` component update: Added support for multi-processing (PR #832).

## MINOR CHANGES

* Several components (cleanup): removed the workaround for being able to use shared utility functions with Nextflow Fusion (PR #920).

* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).

* `scgpt/cross_check_genes` component update: Highly variable genes are now cross-checked based on the boolean mask in `var_input`. The filtering information is stored in the `--output_var_filter` .var field instead of subsetting the dataset (PR #832).

* `scgpt/binning` component update: This component now requires the `--var_input` parameter to provide gene filtering information. Binned data is written to the `--output_obsm_binned_counts` .obsm field in the original input data (PR #832).

* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832).
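
A minimal sketch of the boolean-mask convention referenced above (illustrative only: file paths are hypothetical, `id_in_vocab` is the components' default `--var_input` column, and `vocab.json` is assumed to map gene names to token ids):

```python
import json

import mudata as mu

# Load the model vocabulary (assumed: gene name -> token id).
with open("resources_test/scgpt/source/vocab.json") as f:
    vocab = json.load(f)

mdata = mu.read_h5mu("query.h5mu")  # hypothetical query dataset
rna = mdata.mod["rna"]

# Mark vocabulary genes with a boolean mask instead of subsetting the dataset.
rna.var["id_in_vocab"] = rna.var_names.isin(list(vocab))
mdata.write_h5mu("query_with_mask.h5mu")
```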

# openpipelines 2.0.0-rc.2

## BUG FIXES
32 changes: 32 additions & 0 deletions resources_test_scripts/scgpt.sh
@@ -11,6 +11,12 @@ OUT=resources_test/$ID
# create foundational model directory
foundation_model_dir="$OUT/source"
mkdir -p "$foundation_model_dir"
export foundation_model_dir

# create finetuned model directory
finetuned_model_dir="$OUT/finetuned_model"
mkdir -p "$finetuned_model_dir"
export finetuned_model_dir

# Check whether gdown is available
@@ -19,13 +25,39 @@ if ! command -v gdown &> /dev/null; then
exit 1
fi

# Check whether the torch Python package is available (torch ships no CLI binary)
if ! python -c "import torch" &> /dev/null; then
  echo "This script requires PyTorch. Please make sure it is installed in the Python environment on your PATH."
  exit 1
fi

echo "> Downloading scGPT foundation model (full_human)"
# download foundational model files (full_human)
# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y
gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json"
gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json"
gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt"

echo "> Converting to finetuned model format"
python <<HEREDOC
import os

import torch

# Paths exported by the shell script above
foundation_model_dir = os.environ.get("foundation_model_dir")
finetuned_model_dir = os.environ.get("finetuned_model_dir")
found_model_path = f"{foundation_model_dir}/best_model.pt"
ft_model_path = f"{finetuned_model_dir}/best_model.pt"

# Wrap the foundation weights in the checkpoint structure expected of a
# finetuned model: weights under "model_state_dict" plus a label mapping.
f_model_dict = torch.load(found_model_path, map_location="cpu")
model_dict = {
    "model_state_dict": f_model_dict,
    "id_to_class": {k: str(k) for k in range(15)},  # placeholder class labels
}
torch.save(model_dict, ft_model_path)
HEREDOC
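
# Optional sanity check (illustrative addition, not required by the pipeline):
# reload the converted checkpoint and confirm its structure.
python <<HEREDOC
import os

import torch

ckpt = torch.load(f"{os.environ['finetuned_model_dir']}/best_model.pt", map_location="cpu")
assert set(ckpt.keys()) == {"model_state_dict", "id_to_class"}
assert len(ckpt["id_to_class"]) == 15  # placeholder label mapping
HEREDOC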

# create test data dir
test_resources_dir="$OUT/test_resources"
mkdir -p "$test_resources_dir"
16 changes: 10 additions & 6 deletions src/scgpt/binning/config.vsh.yaml
@@ -29,6 +29,11 @@ argument_groups:
required: False
description: |
Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used.
- name: "--var_input"
type: string
default: "id_in_vocab"
description: |
The name of the adata .var column containing the boolean mask that flags vocabulary-cross-checked and/or highly variable genes.
- name: "--n_input_bins"
type: integer
default: 51
@@ -53,9 +58,9 @@ argument_groups:
choices: ["gzip", "lzf"]
description: |
The compression algorithm to use for the output h5mu file.
- name: "--binned_layer"
- name: "--output_obsm_binned_counts"
type: string
default: "binned"
default: "binned_counts"
description: |
The name of the adata .obsm field to write the binned count data to.
- name: "--seed"
@@ -67,10 +72,11 @@ resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/subset_vars.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu

engines:
- type: docker
@@ -81,9 +87,7 @@ engines:
- procps
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml ]
runners:
- type: executable
- type: nextflow
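
The script below imports a shared `subset_vars` helper from `/src/utils/subset_vars.py`, whose source is not shown in this diff. A plausible minimal implementation, stated as an assumption rather than the actual code:

```python
import anndata as ad

def subset_vars(adata: ad.AnnData, var_column: str) -> ad.AnnData:
    """Return a copy of adata restricted to genes flagged True in .var[var_column]."""
    if var_column not in adata.var.columns:
        raise ValueError(f"Column '{var_column}' not found in .var")
    mask = adata.var[var_column].to_numpy(dtype=bool)
    return adata[:, mask].copy()
```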
36 changes: 22 additions & 14 deletions src/scgpt/binning/script.py
@@ -6,21 +6,27 @@

## VIASH START
par = {
"input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu",
"output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned_sparse.h5mu",
"input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_genes_cross_checked.h5mu",
"output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu",
"modality": "rna",
"input_layer": None,
"binned_layer": "binned",
"output_obsm_binned_counts": "binned_counts",
"n_input_bins": 51,
"output_compression": None,
"var_input": "id_in_vocab",
"seed": 0
}
meta = {
"resources_dir": "src/utils"
}
## VIASH END

if par["seed"]:
np.random.seed(par["seed"])

sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from subset_vars import subset_vars
logger = setup_logger()

logger.info("Reading in data")
@@ -29,6 +35,9 @@
input_adata = mdata.mod[par["modality"]]
adata = input_adata.copy()

logger.info("Subsetting data based on highly variable gene and/or cross-checked genes")
adata = subset_vars(adata, par["var_input"])

logger.info("Converting the input layer into a CSR matrix")
if not par['input_layer'] or par["input_layer"] == "X":
layer_data = adata.X
@@ -40,7 +49,7 @@
raise ValueError(
f"Assuming non-negative data, but got min value {layer_data.min()}."
)

n_bins = par["n_input_bins"]  # NOTE: the first bin is always reserved for zero values
logger.info(f"Binning data into {par['n_input_bins']} bins.")

@@ -57,7 +66,7 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
digits = np.ceil(digits)
smallest_dtype = np.min_scalar_type(digits.max().astype(np.uint)) # Already checked for non-negative values
digits = digits.astype(smallest_dtype)

return digits


@@ -78,32 +87,31 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
"this is expected. You can use the `filter_cell_by_counts` "
"arg to filter out all zero rows."
)

# Add binned_rows and bin_edges as all 0
# np.stack will upcast the dtype later
binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8))
bin_edges.append(np.array([0] * n_bins))
continue

# Binning of non-zero values
bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1))
non_zero_digits = _digitize(non_zero_row, bins)
assert non_zero_digits.min() >= 1
assert non_zero_digits.max() <= n_bins - 1
binned_rows.append(non_zero_digits)

bin_edges.append(np.concatenate([[0], bins]))

# Create new CSR matrix
logger.info("Creating a new CSR matrix of the binned count values")
binned_layer = csr_matrix((np.concatenate(binned_rows, casting="same_kind"),
binned_counts = csr_matrix((np.concatenate(binned_rows, casting="same_kind"),
layer_data.indices, layer_data.indptr), shape=layer_data.shape)

# Set binned values and bin edges layers to adata object
adata.layers[par["binned_layer"]] = binned_layer
adata.obsm["bin_edges"] = np.stack(bin_edges)
input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts
input_adata.obsm["bin_edges"] = np.stack(bin_edges)

# Write mudata output
logger.info("Writing output data")
mdata.mod[par["modality"]] = adata
mdata.write(par["output"], compression=par["output_compression"])
mdata.write_h5mu(par["output"], compression=par["output_compression"])
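
To make the binning scheme concrete, here is a self-contained toy version of the per-row quantile binning (a simplification: it uses `np.digitize` directly, whereas the component's `_digitize` helper additionally post-processes the raw bin indices):

```python
import numpy as np

n_bins = 6                                   # bin 0 is reserved for zeros
row = np.array([0.0, 1.0, 2.0, 4.0, 8.0, 0.0])
non_zero = row[row > 0]

# Bin edges are quantiles of the non-zero values, as in the component.
bins = np.quantile(non_zero, np.linspace(0, 1, n_bins - 1))
digits = np.digitize(non_zero, bins)         # values land in 1..n_bins-1

binned = np.zeros_like(row, dtype=np.int8)   # zeros stay in bin 0
binned[row > 0] = digits
print(binned)                                # [0 1 2 3 5 0]
```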
25 changes: 12 additions & 13 deletions src/scgpt/binning/test.py
@@ -14,15 +14,16 @@


def test_binning(run_component, tmp_path):
input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset.h5mu"

input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu"
output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu"

run_component([
"--input", input_file_path,
"--modality", "rna",
"--binned_layer", "binned",
"--output_obsm_binned_counts", "binned_counts",
"--n_input_bins", "51",
"--var_input", "filter_with_hvg",
"--output", output_file_path
])

@@ -31,21 +32,19 @@ def test_binning(run_component, tmp_path):
output_adata = output_mdata.mod["rna"]

# Check presence of binning layers
assert "bin_edges" in output_adata.obsm.keys()
assert "binned" in output_adata.layers.keys()

assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), "Binning obsm fields were not added."

# Check bin edges
bin_edges = output_adata.obsm["bin_edges"]
assert all(bin_edges[:, 0] == 0)
assert bin_edges.shape[1] == 51
assert all(all(i>=0) for i in bin_edges)
assert all(all(i >= 0) for i in bin_edges)

# Check binned values
binned_values = output_adata.layers["binned"]
binned_values = output_adata.obsm["binned_counts"]
assert issparse(binned_values)
assert binned_values.shape == output_adata.X.shape
assert (binned_values.data <= 51).all(axis=None)


if __name__ == '__main__':
sys.exit(pytest.main([__file__]))