Update scgpt test resources (#926)

dorien-er authored Dec 10, 2024
1 parent d9628ee commit aa4cd62
Showing 8 changed files with 63 additions and 203 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -26,6 +26,8 @@

* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832).

+* `resources_test_scripts/scGPT.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926).

# openpipelines 2.0.0-rc.2

## BUG FIXES
47 changes: 20 additions & 27 deletions resources_test_scripts/scgpt.sh
file mode changed: 100644 → 100755
@@ -27,8 +27,8 @@ fi

# install torch if necessary
# Check whether torch is available
-if ! command -v torch &> /dev/null; then
-  echo "This script requires torch. Please make sure the binary is added to your PATH."
+if ! python -c "import torch"; then
+  echo "This script requires torch. Please make sure it is available in your python environment."
exit 1
fi
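
The rewritten check probes the Python environment instead of the shell PATH, which matches how torch actually ships (a library, not a binary). A minimal standalone sketch of the same pattern; the version print and the stderr silencing are illustrative extras, not part of this commit:

    # Probe for torch via the same interpreter that runs the scripts below.
    if ! python -c "import torch; print('torch', torch.__version__)" 2>/dev/null; then
      echo "This script requires torch. Please make sure it is available in your python environment."
      exit 1
    fi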

@@ -77,12 +77,13 @@ input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu")
HEREDOC

echo "> Subsetting datasets"
-viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \
   --number_of_observations 4000

rm "${test_resources_dir}/Kim2020_Lung.h5ad"
rm "${test_resources_dir}/Kim2020_Lung.h5mu"

echo "> Preprocessing datasets"
nextflow \
@@ -95,46 +95,38 @@ nextflow \
--publish_dir "${test_resources_dir}"

echo "> Filtering highly variable features"
-viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/iKim2020_Lung_subset_preprocessed.h5mu" \
+viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --layer "log_normalized" \
-  --var_name_filter "filter_with_hvg" \
+  --var_name_filter "scgpt_filter_with_hvg" \
   --n_top_features 1200 \
   --flavor "seurat_v3"

-viash run src/filter/do_filter/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
-  --var_filter "filter_with_hvg"

echo "> Running scGPT cross check genes"
-viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
+viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
-  --vocab_file "${foundation_model_dir}/vocab.json"
+  --vocab_file "${foundation_model_dir}/vocab.json" \
+  --var_input "scgpt_filter_with_hvg" \
+  --output_var_filter "scgpt_cross_checked_genes"

echo "> Running scGPT binning"
-viash run src/scgpt/binning/config.vsh.yaml -p docker -- \
+viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
   --input_layer "log_normalized" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu"
+  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
+  --output_obsm_binned_counts "binned_counts" \
+  --var_input "scgpt_cross_checked_genes"

echo "> Running scGPT tokenizing"
-viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \
+viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
-  --input_layer "binned" \
+  --input_obsm_binned_counts "binned_counts" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
   --model_vocab "${foundation_model_dir}/vocab.json" \
+  --var_input "scgpt_cross_checked_genes"

-echo "> Running scGPT integration"
-viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \
-  --model "${foundation_model_dir}/best_model.pt" \
-  --model_vocab "${foundation_model_dir}/vocab.json" \
-  --model_config "${foundation_model_dir}/args.json" \
-  --obs_batch_label "sample"


echo "> Removing unnecessary files in test resources dir"
find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete
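
Taken together, the scgpt.sh changes stop deriving a physically filtered dataset and instead thread filter annotations through the chain, while also migrating the viash CLI from the older -p flag to --engine (my reading of the flag rename; the diff shows the migration but not the rationale). A comment-only sketch of the resulting data flow, with every name taken from the diff above:

    # highly_variable_features_scanpy  writes var column "scgpt_filter_with_hvg"
    # cross_check_genes   reads  --var_input "scgpt_filter_with_hvg"
    #                     writes --output_var_filter "scgpt_cross_checked_genes"
    # binning             reads  --var_input "scgpt_cross_checked_genes"
    #                     writes --output_obsm_binned_counts "binned_counts"
    # pad_tokenize        reads  --input_obsm_binned_counts "binned_counts"
    #                     and    --var_input "scgpt_cross_checked_genes"
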
2 changes: 1 addition & 1 deletion src/scgpt/cell_type_annotation/config.vsh.yaml
@@ -143,9 +143,9 @@ test_resources:
- type: python_script
path: test.py
   - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu
-  - path: /resources_test/scgpt/source/best_model.pt
   - path: /resources_test/scgpt/source/args.json
   - path: /resources_test/scgpt/source/vocab.json
+  - path: /resources_test/scgpt/finetuned_model/best_model.pt

engines:
- type: docker
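
The component now lists the shipped fine-tuned checkpoint instead of the foundation checkpoint it previously converted in the test (see the next file). Reconstructing from the diff above, the test_resources block after this change should read:

    test_resources:
      - type: python_script
        path: test.py
      - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu
      - path: /resources_test/scgpt/source/args.json
      - path: /resources_test/scgpt/source/vocab.json
      - path: /resources_test/scgpt/finetuned_model/best_model.pt
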
17 changes: 1 addition & 16 deletions src/scgpt/cell_type_annotation/test.py
@@ -1,31 +1,16 @@
import pytest
from mudata import read_h5mu
import sys
-import torch
-import subprocess
-import re


input_path = f'{meta["resources_dir"]}/Kim2020_Lung_subset_tokenized.h5mu'
model = f'{meta["resources_dir"]}/best_model.pt'
-ft_model = f'{meta["resources_dir"]}/ft_best_model.pt'
+ft_model = f'{meta["resources_dir"]}/best_model.pt'
model_config = f'{meta["resources_dir"]}/args.json'
model_vocab = f'{meta["resources_dir"]}/vocab.json'


-def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key):
-    f_model_dict = torch.load(scgpt_path, map_location="cpu")
-    model_dict = {}
-    model_dict[state_dict_key] = f_model_dict
-    model_dict[mapper_key] = {k: str(k) for k in range(15)}
-    torch.save(model_dict, ft_scgpt_path)
-
-
-# Convert foundation model into fine-tuned model architecture:
-# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary
-scgpt_to_ft_scgpt(model, ft_model, "model_state_dict", "id_to_class")


def test_cell_type_inference(run_component, tmp_path):
output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu"

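
The conversion shim is dropped because the test resources now ship a real fine-tuned checkpoint (finetuned_model/best_model.pt), so the test no longer fabricates one. For reference, a sketch of the checkpoint layout the removed helper produced; the two keys come from the removed call, while the loading code itself is illustrative:

    import torch

    # A fine-tuned scGPT checkpoint bundles the weights with a label mapper.
    ckpt = torch.load("best_model.pt", map_location="cpu")
    state_dict = ckpt["model_state_dict"]  # the foundation model weights
    id_to_class = ckpt["id_to_class"]      # class index -> cell-type label
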
3 changes: 2 additions & 1 deletion src/scgpt/embedding/config.vsh.yaml
@@ -133,7 +133,8 @@ test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/source
-  - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu
+  - path: /resources_test/scgpt/finetuned_model
+  - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu

engines:
- type: docker
3 changes: 3 additions & 0 deletions src/scgpt/embedding/script.py
@@ -27,6 +27,9 @@
"dsbn": True,
"n_input_bins": 51,
}
meta = {
"resources_dir": "src/utils",
}
## VIASH END

sys.path.append(meta["resources_dir"])
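
The added meta block follows the viash placeholder convention visible above: code between the VIASH START and VIASH END markers supplies defaults for running the script outside viash, and is replaced with real values at build time (my reading of the convention; the diff only shows the END marker). A minimal sketch, with the par dict as an illustrative stand-in for the component's real arguments:

    import sys

    ## VIASH START
    # Local-run defaults; viash overwrites this block at build time.
    par = {"input": "input.h5mu"}          # illustrative parameter
    meta = {"resources_dir": "src/utils"}  # value added by this commit
    ## VIASH END

    sys.path.append(meta["resources_dir"])  # make shared utils importable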
(Diffs for the remaining two changed files did not load.)
