From aa4cd62adef6d84ed32823819bb2be3760c5fd0b Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:14:10 +0100 Subject: [PATCH] Update scgpt test resources (#926) --- CHANGELOG.md | 2 + resources_test_scripts/scgpt.sh | 47 +++--- .../cell_type_annotation/config.vsh.yaml | 2 +- src/scgpt/cell_type_annotation/test.py | 17 +-- src/scgpt/embedding/config.vsh.yaml | 3 +- src/scgpt/embedding/script.py | 3 + src/scgpt/embedding/test.py | 143 +++--------------- src/scgpt/pad_tokenize/test.py | 49 ++---- 8 files changed, 63 insertions(+), 203 deletions(-) mode change 100644 => 100755 resources_test_scripts/scgpt.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbe1b823c2..9c7811a1f5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ * `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832). +* `resources_test_scripts/scGPT.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926). + # openpipelines 2.0.0-rc.2 ## BUG FIXES diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh old mode 100644 new mode 100755 index f6cd89a14e1..3af4d21b437 --- a/resources_test_scripts/scgpt.sh +++ b/resources_test_scripts/scgpt.sh @@ -27,8 +27,8 @@ fi # install torch if necessary # Check whether torch is available -if ! command -v torch &> /dev/null; then - echo "This script requires torch. Please make sure the binary is added to your PATH." +if ! python -c "import torch"; then + echo "This script requires torch. Please make sure it is available in your python environment." exit 1 fi @@ -77,12 +77,13 @@ input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu") HEREDOC echo "> Subsetting datasets" -viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \ +viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung.h5mu" \ --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ --number_of_observations 4000 rm "${test_resources_dir}/Kim2020_Lung.h5ad" +rm "${test_resources_dir}/Kim2020_Lung.h5mu" echo "> Preprocessing datasets" nextflow \ @@ -95,46 +96,38 @@ nextflow \ --publish_dir "${test_resources_dir}" echo "> Filtering highly variable features" -viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/iKim2020_Lung_subset_preprocessed.h5mu" \ +viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \ --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ --layer "log_normalized" \ - --var_name_filter "filter_with_hvg" \ + --var_name_filter "scgpt_filter_with_hvg" \ --n_top_features 1200 \ --flavor "seurat_v3" - -viash run src/filter/do_filter/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \ - --var_filter "filter_with_hvg" echo "> Running scGPT cross check genes" -viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \ +viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ --output 
"${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --vocab_file "${foundation_model_dir}/vocab.json" + --vocab_file "${foundation_model_dir}/vocab.json" \ + --var_input "scgpt_filter_with_hvg" \ + --output_var_filter "scgpt_cross_checked_genes" echo "> Running scGPT binning" -viash run src/scgpt/binning/config.vsh.yaml -p docker -- \ +viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ --input_layer "log_normalized" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" + --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ + --output_obsm_binned_counts "binned_counts" \ + --var_input "scgpt_cross_checked_genes" echo "> Running scGPT tokenizing" -viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \ +viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --input_layer "binned" \ + --input_obsm_binned_counts "binned_counts" \ --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --model_vocab "${foundation_model_dir}/vocab.json" - -echo "> Running scGPT integration" -viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \ - --model "${foundation_model_dir}/best_model.pt" \ --model_vocab "${foundation_model_dir}/vocab.json" \ - --model_config "${foundation_model_dir}/args.json" \ - --obs_batch_label "sample" + --var_input "scgpt_cross_checked_genes" \ + echo "> Removing unnecessary files in test resources dir" find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! 
-name "*.h5mu" \) -delete diff --git a/src/scgpt/cell_type_annotation/config.vsh.yaml b/src/scgpt/cell_type_annotation/config.vsh.yaml index 0f2b54b4ada..59e2e2a1a00 100644 --- a/src/scgpt/cell_type_annotation/config.vsh.yaml +++ b/src/scgpt/cell_type_annotation/config.vsh.yaml @@ -143,9 +143,9 @@ test_resources: - type: python_script path: test.py - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - - path: /resources_test/scgpt/source/best_model.pt - path: /resources_test/scgpt/source/args.json - path: /resources_test/scgpt/source/vocab.json + - path: /resources_test/scgpt/finetuned_model/best_model.pt engines: - type: docker diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py index f0cb97846f7..50662b90765 100644 --- a/src/scgpt/cell_type_annotation/test.py +++ b/src/scgpt/cell_type_annotation/test.py @@ -1,31 +1,16 @@ import pytest from mudata import read_h5mu import sys -import torch import subprocess import re input_path = f'{meta["resources_dir"]}/Kim2020_Lung_subset_tokenized.h5mu' -model = f'{meta["resources_dir"]}/best_model.pt' -ft_model = f'{meta["resources_dir"]}/ft_best_model.pt' +ft_model = f'{meta["resources_dir"]}/best_model.pt' model_config = f'{meta["resources_dir"]}/args.json' model_vocab = f'{meta["resources_dir"]}/vocab.json' -def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): - f_model_dict = torch.load(scgpt_path, map_location="cpu") - model_dict = {} - model_dict[state_dict_key] = f_model_dict - model_dict[mapper_key] = {k: str(k) for k in range(15)} - torch.save(model_dict, ft_scgpt_path) - - -# Convert foundation model into fine-tuned model architecture: -# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary -scgpt_to_ft_scgpt(model, ft_model, "model_state_dict", "id_to_class") - - def test_cell_type_inference(run_component, tmp_path): output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" diff --git a/src/scgpt/embedding/config.vsh.yaml b/src/scgpt/embedding/config.vsh.yaml index 20de97b5fd7..56a4b69098d 100644 --- a/src/scgpt/embedding/config.vsh.yaml +++ b/src/scgpt/embedding/config.vsh.yaml @@ -133,7 +133,8 @@ test_resources: - type: python_script path: test.py - path: /resources_test/scgpt/source - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu + - path: /resources_test/scgpt/finetuned_model + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu engines: - type: docker diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py index dd5e3036f9f..b78d42c3f61 100644 --- a/src/scgpt/embedding/script.py +++ b/src/scgpt/embedding/script.py @@ -27,6 +27,9 @@ "dsbn": True, "n_input_bins": 51, } +meta = { + "resources_dir": "src/utils", +} ## VIASH END sys.path.append(meta["resources_dir"]) diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py index 4f140aae944..c27f77d144f 100644 --- a/src/scgpt/embedding/test.py +++ b/src/scgpt/embedding/test.py @@ -1,142 +1,32 @@ import pytest import subprocess -import torch import re import sys import mudata as mu import numpy as np -from scipy.sparse import issparse -from scgpt.tokenizer import tokenize_and_pad_batch -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.preprocess import Preprocessor + ## VIASH START meta = { "resources_dir": "resources_test", - "executable": "./target/docker/scgpt/integration_embedding/integration_embedding", - 
"temp_dir": "tmp", - "config": "./target/docker/scgpt/integration_embedding/.config.vsh.yaml", } ## VIASH END -input = f"{meta['resources_dir']}/Kim2020_Lung_subset.h5mu" +input = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" model_file = f"{meta['resources_dir']}/source/best_model.pt" -ft_model = f'{meta["resources_dir"]}/ft_best_model.pt' +ft_model_file = f'{meta["resources_dir"]}/finetuned_model/best_model.pt' vocab_file = f"{meta['resources_dir']}/source/vocab.json" model_config_file = f"{meta['resources_dir']}/source/args.json" input_file = mu.read(input) -def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): - f_model_dict = torch.load(scgpt_path, map_location="cpu") - model_dict = {} - model_dict[state_dict_key] = f_model_dict - model_dict[mapper_key] = {k: str(k) for k in range(15)} - torch.save(model_dict, ft_scgpt_path) - - -# Convert foundation model into fine-tuned model architecture: -# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary -scgpt_to_ft_scgpt(model_file, ft_model, "model_state_dict", "id_to_class") - - -## START TEMPORARY WORKAROUND DATA PREPROCESSING -# TODO: Remove this workaround once full scGPT preprocessing workflow is implemented -# Read in data -adata = input_file.mod["rna"] - -# Set tokens for integration -pad_token = "" -special_tokens = [pad_token, "", ""] - -# Make batch a category column -adata.obs["str_batch"] = adata.obs["sample"].astype(str) -batch_id_labels = adata.obs["str_batch"].astype("category").cat.codes.values -adata.obs["batch_id"] = batch_id_labels -adata.var["gene_name"] = adata.var.index.tolist() - -# Load model vocab -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -# Cross-check genes with pre-trained model -genes = adata.var["gene_name"].tolist() -adata.var["id_in_vocab"] = [ - 1 if gene in vocab else -1 for gene in adata.var["gene_name"] -] -gene_ids_in_vocab = np.array(adata.var["id_in_vocab"]) -adata = adata[:, adata.var["id_in_vocab"] >= 0] - -# Preprocess data -preprocessor = Preprocessor( - use_key="X", - filter_gene_by_counts=3, - filter_cell_by_counts=False, - normalize_total=10000, - result_normed_key="X_normed", - log1p=True, - result_log1p_key="X_log1p", - subset_hvg=100, - hvg_flavor="seurat_v3", - binning=51, - result_binned_key="X_binned", -) - -preprocessor(adata, batch_key="str_batch") - -all_counts = ( - adata.layers["X_binned"].A - if issparse(adata.layers["X_binned"]) - else adata.layers["X_binned"] -) - -# Fetch gene names and look up tokens in vocab -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -genes = adata.var["gene_name"].tolist() -gene_ids = np.array(vocab(genes), dtype=int) - -# Fetch number of subset hvg -n_hvg = adata.var.shape[0] - -# Tokenize and pad data -tokenized_data = tokenize_and_pad_batch( - all_counts, - gene_ids, - max_len=n_hvg + 1, - vocab=vocab, - pad_token=pad_token, - pad_value=-2, - append_cls=True, # append token at the beginning, - include_zero_gene=False, - return_pt=True, - mod_type=None, - vocab_mod=None, -) - -all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] -padding_mask = all_gene_ids.eq(vocab[pad_token]) - -adata.obsm["gene_id_tokens"] = all_gene_ids.numpy() -adata.obsm["values_tokenized"] = all_values.numpy() -adata.obsm["padding_mask"] = padding_mask.numpy() - -tokenized_data = mu.MuData({"rna": adata}) -tokenized_data_path = 
f"{meta['resources_dir']}/Kim2020_Lung_tokenized.h5mu" -tokenized_data.write_h5mu(tokenized_data_path) - -## END TEMPORARY WORKAROUND DATA PREPROCESSING - - def test_integration_embedding(run_component, tmp_path): output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -157,6 +47,8 @@ def test_integration_embedding(run_component, tmp_path): "padding_mask", "--output", output_embedding_file, + "--batch_size", + "4", ] ) @@ -184,10 +76,11 @@ def test_integration_embedding(run_component, tmp_path): # Run embeddings without dsbn output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -206,6 +99,8 @@ def test_integration_embedding(run_component, tmp_path): "padding_mask", "--output", output_embedding_file_without_dsbn, + "--batch_size", + "4", ] ) @@ -224,7 +119,7 @@ def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path args = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -259,7 +154,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing gene names key args_1 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -293,7 +188,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing batch label key args_2 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -325,7 +220,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing tokenized values key args_3 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -362,11 +257,11 @@ def test_finetuned_model(run_component, tmp_path): run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", - ft_model, + ft_model_file, "--model_vocab", vocab_file, "--model_config", @@ -385,6 +280,8 @@ def test_finetuned_model(run_component, tmp_path): "model_state_dict", "--output", output_embedding_file, + "--batch_size", + "4", ] ) @@ -416,11 +313,11 @@ def test_finetuned_model_architecture(run_component, tmp_path): args = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", - ft_model, + ft_model_file, "--model_vocab", vocab_file, "--model_config", diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py index 0e3d161317d..61a4fc4ab10 100644 --- a/src/scgpt/pad_tokenize/test.py +++ b/src/scgpt/pad_tokenize/test.py @@ -19,43 +19,22 @@ vocab = GeneVocab.from_file(vocab_file) -@pytest.fixture -def binned_h5mu(random_h5mu_path): - binned_h5mu_path = random_h5mu_path() - mdata = mu.read(input_file) - adata = mdata.mod["rna"] - adata.obsm["binned_counts"] = adata.layers["binned"] - mdata.write(binned_h5mu_path) - return binned_h5mu_path - - -def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu): +def test_integration_pad_tokenize(run_component, tmp_path): output = tmp_path / "Kim2020_Lung_tokenized.h5mu" - run_component( - [ - "--input", - binned_h5mu, - "--output", - output, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--pad_token", - "", - "--pad_value", - "-2", - "--input_obsm_binned_counts", - "binned_counts", - "--model_vocab", - vocab_file, - ] 
-    )
+    run_component([
+        "--input", input_file,
+        "--output", output,
+        "--modality", "rna",
+        "--var_input", "scgpt_cross_checked_genes",
+        "--obsm_gene_tokens", "gene_id_tokens",
+        "--obsm_tokenized_values", "values_tokenized",
+        "--obsm_padding_mask", "padding_mask",
+        "--pad_token", "<pad>",
+        "--pad_value", "-2",
+        "--input_obsm_binned_counts", "binned_counts",
+        "--model_vocab", vocab_file
+    ])
 
     output_file = mu.read(output)
     output_adata = output_file.mod["rna"]
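
Note (not part of the patch): a minimal sketch of how the regenerated Kim2020_Lung_subset_tokenized.h5mu resource could be sanity-checked after running resources_test_scripts/scgpt.sh, assuming the obsm keys written by scgpt/pad_tokenize above (gene_id_tokens, values_tokenized, padding_mask) and a path relative to the repository's resources_test directory; the exact layout is an assumption.

    import mudata as mu

    # Path assumed from the resources_test layout used by the updated script above (hypothetical).
    tokenized_path = "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu"

    mdata = mu.read(tokenized_path)
    adata = mdata.mod["rna"]

    # obsm keys that the scgpt/embedding test in this patch passes via
    # --obsm_gene_tokens / --obsm_tokenized_values / --obsm_padding_mask.
    token_keys = ("gene_id_tokens", "values_tokenized", "padding_mask")
    for key in token_keys:
        assert key in adata.obsm, f"missing obsm key: {key}"

    # Each token matrix should have one row per cell and matching sequence length.
    shapes = {key: adata.obsm[key].shape for key in token_keys}
    assert all(shape[0] == adata.n_obs for shape in shapes.values()), shapes
    assert len({shape[1] for shape in shapes.values()}) == 1, shapes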