From aa4cd62adef6d84ed32823819bb2be3760c5fd0b Mon Sep 17 00:00:00 2001 From: Dorien <41797896+dorien-er@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:14:10 +0100 Subject: [PATCH] Update scgpt test resources (#926) --- CHANGELOG.md | 2 + resources_test_scripts/scgpt.sh | 47 +++--- .../cell_type_annotation/config.vsh.yaml | 2 +- src/scgpt/cell_type_annotation/test.py | 17 +-- src/scgpt/embedding/config.vsh.yaml | 3 +- src/scgpt/embedding/script.py | 3 + src/scgpt/embedding/test.py | 143 +++--------------- src/scgpt/pad_tokenize/test.py | 49 ++---- 8 files changed, 63 insertions(+), 203 deletions(-) mode change 100644 => 100755 resources_test_scripts/scgpt.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbe1b823c2..9c7811a1f5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ * `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832). +* `resources_test_scripts/scGPT.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926). + # openpipelines 2.0.0-rc.2 ## BUG FIXES diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh old mode 100644 new mode 100755 index f6cd89a14e1..3af4d21b437 --- a/resources_test_scripts/scgpt.sh +++ b/resources_test_scripts/scgpt.sh @@ -27,8 +27,8 @@ fi # install torch if necessary # Check whether torch is available -if ! command -v torch &> /dev/null; then - echo "This script requires torch. Please make sure the binary is added to your PATH." +if ! python -c "import torch"; then + echo "This script requires torch. Please make sure it is available in your python environment." exit 1 fi @@ -77,12 +77,13 @@ input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu") HEREDOC echo "> Subsetting datasets" -viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \ +viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung.h5mu" \ --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \ --number_of_observations 4000 rm "${test_resources_dir}/Kim2020_Lung.h5ad" +rm "${test_resources_dir}/Kim2020_Lung.h5mu" echo "> Preprocessing datasets" nextflow \ @@ -95,46 +96,38 @@ nextflow \ --publish_dir "${test_resources_dir}" echo "> Filtering highly variable features" -viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/iKim2020_Lung_subset_preprocessed.h5mu" \ +viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \ --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ --layer "log_normalized" \ - --var_name_filter "filter_with_hvg" \ + --var_name_filter "scgpt_filter_with_hvg" \ --n_top_features 1200 \ --flavor "seurat_v3" - -viash run src/filter/do_filter/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \ - --var_filter "filter_with_hvg" echo "> Running scGPT cross check genes" -viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \ +viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \ + --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \ --output 
"${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ - --vocab_file "${foundation_model_dir}/vocab.json" + --vocab_file "${foundation_model_dir}/vocab.json" \ + --var_input "scgpt_filter_with_hvg" \ + --output_var_filter "scgpt_cross_checked_genes" echo "> Running scGPT binning" -viash run src/scgpt/binning/config.vsh.yaml -p docker -- \ +viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \ --input_layer "log_normalized" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" + --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ + --output_obsm_binned_counts "binned_counts" \ + --var_input "scgpt_cross_checked_genes" echo "> Running scGPT tokenizing" -viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \ +viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \ --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \ - --input_layer "binned" \ + --input_obsm_binned_counts "binned_counts" \ --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --model_vocab "${foundation_model_dir}/vocab.json" - -echo "> Running scGPT integration" -viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \ - --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \ - --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \ - --model "${foundation_model_dir}/best_model.pt" \ --model_vocab "${foundation_model_dir}/vocab.json" \ - --model_config "${foundation_model_dir}/args.json" \ - --obs_batch_label "sample" + --var_input "scgpt_cross_checked_genes" \ + echo "> Removing unnecessary files in test resources dir" find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! 
-name "*.h5mu" \) -delete diff --git a/src/scgpt/cell_type_annotation/config.vsh.yaml b/src/scgpt/cell_type_annotation/config.vsh.yaml index 0f2b54b4ada..59e2e2a1a00 100644 --- a/src/scgpt/cell_type_annotation/config.vsh.yaml +++ b/src/scgpt/cell_type_annotation/config.vsh.yaml @@ -143,9 +143,9 @@ test_resources: - type: python_script path: test.py - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu - - path: /resources_test/scgpt/source/best_model.pt - path: /resources_test/scgpt/source/args.json - path: /resources_test/scgpt/source/vocab.json + - path: /resources_test/scgpt/finetuned_model/best_model.pt engines: - type: docker diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py index f0cb97846f7..50662b90765 100644 --- a/src/scgpt/cell_type_annotation/test.py +++ b/src/scgpt/cell_type_annotation/test.py @@ -1,31 +1,16 @@ import pytest from mudata import read_h5mu import sys -import torch import subprocess import re input_path = f'{meta["resources_dir"]}/Kim2020_Lung_subset_tokenized.h5mu' -model = f'{meta["resources_dir"]}/best_model.pt' -ft_model = f'{meta["resources_dir"]}/ft_best_model.pt' +ft_model = f'{meta["resources_dir"]}/best_model.pt' model_config = f'{meta["resources_dir"]}/args.json' model_vocab = f'{meta["resources_dir"]}/vocab.json' -def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): - f_model_dict = torch.load(scgpt_path, map_location="cpu") - model_dict = {} - model_dict[state_dict_key] = f_model_dict - model_dict[mapper_key] = {k: str(k) for k in range(15)} - torch.save(model_dict, ft_scgpt_path) - - -# Convert foundation model into fine-tuned model architecture: -# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary -scgpt_to_ft_scgpt(model, ft_model, "model_state_dict", "id_to_class") - - def test_cell_type_inference(run_component, tmp_path): output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" diff --git a/src/scgpt/embedding/config.vsh.yaml b/src/scgpt/embedding/config.vsh.yaml index 20de97b5fd7..56a4b69098d 100644 --- a/src/scgpt/embedding/config.vsh.yaml +++ b/src/scgpt/embedding/config.vsh.yaml @@ -133,7 +133,8 @@ test_resources: - type: python_script path: test.py - path: /resources_test/scgpt/source - - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu + - path: /resources_test/scgpt/finetuned_model + - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu engines: - type: docker diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py index dd5e3036f9f..b78d42c3f61 100644 --- a/src/scgpt/embedding/script.py +++ b/src/scgpt/embedding/script.py @@ -27,6 +27,9 @@ "dsbn": True, "n_input_bins": 51, } +meta = { + "resources_dir": "src/utils", +} ## VIASH END sys.path.append(meta["resources_dir"]) diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py index 4f140aae944..c27f77d144f 100644 --- a/src/scgpt/embedding/test.py +++ b/src/scgpt/embedding/test.py @@ -1,142 +1,32 @@ import pytest import subprocess -import torch import re import sys import mudata as mu import numpy as np -from scipy.sparse import issparse -from scgpt.tokenizer import tokenize_and_pad_batch -from scgpt.tokenizer.gene_tokenizer import GeneVocab -from scgpt.preprocess import Preprocessor + ## VIASH START meta = { "resources_dir": "resources_test", - "executable": "./target/docker/scgpt/integration_embedding/integration_embedding", - 
"temp_dir": "tmp", - "config": "./target/docker/scgpt/integration_embedding/.config.vsh.yaml", } ## VIASH END -input = f"{meta['resources_dir']}/Kim2020_Lung_subset.h5mu" +input = f"{meta['resources_dir']}/Kim2020_Lung_subset_tokenized.h5mu" model_file = f"{meta['resources_dir']}/source/best_model.pt" -ft_model = f'{meta["resources_dir"]}/ft_best_model.pt' +ft_model_file = f'{meta["resources_dir"]}/finetuned_model/best_model.pt' vocab_file = f"{meta['resources_dir']}/source/vocab.json" model_config_file = f"{meta['resources_dir']}/source/args.json" input_file = mu.read(input) -def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): - f_model_dict = torch.load(scgpt_path, map_location="cpu") - model_dict = {} - model_dict[state_dict_key] = f_model_dict - model_dict[mapper_key] = {k: str(k) for k in range(15)} - torch.save(model_dict, ft_scgpt_path) - - -# Convert foundation model into fine-tuned model architecture: -# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary -scgpt_to_ft_scgpt(model_file, ft_model, "model_state_dict", "id_to_class") - - -## START TEMPORARY WORKAROUND DATA PREPROCESSING -# TODO: Remove this workaround once full scGPT preprocessing workflow is implemented -# Read in data -adata = input_file.mod["rna"] - -# Set tokens for integration -pad_token = "" -special_tokens = [pad_token, "", ""] - -# Make batch a category column -adata.obs["str_batch"] = adata.obs["sample"].astype(str) -batch_id_labels = adata.obs["str_batch"].astype("category").cat.codes.values -adata.obs["batch_id"] = batch_id_labels -adata.var["gene_name"] = adata.var.index.tolist() - -# Load model vocab -vocab = GeneVocab.from_file(vocab_file) -for s in special_tokens: - if s not in vocab: - vocab.append_token(s) - -# Cross-check genes with pre-trained model -genes = adata.var["gene_name"].tolist() -adata.var["id_in_vocab"] = [ - 1 if gene in vocab else -1 for gene in adata.var["gene_name"] -] -gene_ids_in_vocab = np.array(adata.var["id_in_vocab"]) -adata = adata[:, adata.var["id_in_vocab"] >= 0] - -# Preprocess data -preprocessor = Preprocessor( - use_key="X", - filter_gene_by_counts=3, - filter_cell_by_counts=False, - normalize_total=10000, - result_normed_key="X_normed", - log1p=True, - result_log1p_key="X_log1p", - subset_hvg=100, - hvg_flavor="seurat_v3", - binning=51, - result_binned_key="X_binned", -) - -preprocessor(adata, batch_key="str_batch") - -all_counts = ( - adata.layers["X_binned"].A - if issparse(adata.layers["X_binned"]) - else adata.layers["X_binned"] -) - -# Fetch gene names and look up tokens in vocab -vocab.set_default_index(vocab[""]) -ntokens = len(vocab) -genes = adata.var["gene_name"].tolist() -gene_ids = np.array(vocab(genes), dtype=int) - -# Fetch number of subset hvg -n_hvg = adata.var.shape[0] - -# Tokenize and pad data -tokenized_data = tokenize_and_pad_batch( - all_counts, - gene_ids, - max_len=n_hvg + 1, - vocab=vocab, - pad_token=pad_token, - pad_value=-2, - append_cls=True, # append token at the beginning, - include_zero_gene=False, - return_pt=True, - mod_type=None, - vocab_mod=None, -) - -all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] -padding_mask = all_gene_ids.eq(vocab[pad_token]) - -adata.obsm["gene_id_tokens"] = all_gene_ids.numpy() -adata.obsm["values_tokenized"] = all_values.numpy() -adata.obsm["padding_mask"] = padding_mask.numpy() - -tokenized_data = mu.MuData({"rna": adata}) -tokenized_data_path = 
f"{meta['resources_dir']}/Kim2020_Lung_tokenized.h5mu" -tokenized_data.write_h5mu(tokenized_data_path) - -## END TEMPORARY WORKAROUND DATA PREPROCESSING - - def test_integration_embedding(run_component, tmp_path): output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -157,6 +47,8 @@ def test_integration_embedding(run_component, tmp_path): "padding_mask", "--output", output_embedding_file, + "--batch_size", + "4", ] ) @@ -184,10 +76,11 @@ def test_integration_embedding(run_component, tmp_path): # Run embeddings without dsbn output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" + run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -206,6 +99,8 @@ def test_integration_embedding(run_component, tmp_path): "padding_mask", "--output", output_embedding_file_without_dsbn, + "--batch_size", + "4", ] ) @@ -224,7 +119,7 @@ def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path args = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -259,7 +154,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing gene names key args_1 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -293,7 +188,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing batch label key args_2 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -325,7 +220,7 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing tokenized values key args_3 = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", @@ -362,11 +257,11 @@ def test_finetuned_model(run_component, tmp_path): run_component( [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", - ft_model, + ft_model_file, "--model_vocab", vocab_file, "--model_config", @@ -385,6 +280,8 @@ def test_finetuned_model(run_component, tmp_path): "model_state_dict", "--output", output_embedding_file, + "--batch_size", + "4", ] ) @@ -416,11 +313,11 @@ def test_finetuned_model_architecture(run_component, tmp_path): args = [ "--input", - tokenized_data_path, + input, "--modality", "rna", "--model", - ft_model, + ft_model_file, "--model_vocab", vocab_file, "--model_config", diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py index 0e3d161317d..61a4fc4ab10 100644 --- a/src/scgpt/pad_tokenize/test.py +++ b/src/scgpt/pad_tokenize/test.py @@ -19,43 +19,22 @@ vocab = GeneVocab.from_file(vocab_file) -@pytest.fixture -def binned_h5mu(random_h5mu_path): - binned_h5mu_path = random_h5mu_path() - mdata = mu.read(input_file) - adata = mdata.mod["rna"] - adata.obsm["binned_counts"] = adata.layers["binned"] - mdata.write(binned_h5mu_path) - return binned_h5mu_path - - -def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu): +def test_integration_pad_tokenize(run_component, tmp_path): output = tmp_path / "Kim2020_Lung_tokenized.h5mu" - run_component( - [ - "--input", - binned_h5mu, - "--output", - output, - "--modality", - "rna", - "--obsm_gene_tokens", - "gene_id_tokens", - "--obsm_tokenized_values", - "values_tokenized", - "--obsm_padding_mask", - "padding_mask", - "--pad_token", - "", - "--pad_value", - "-2", - "--input_obsm_binned_counts", - "binned_counts", - "--model_vocab", - vocab_file, - ] 
-    )
+    run_component([
+        "--input", input_file,
+        "--output", output,
+        "--modality", "rna",
+        "--var_input", "scgpt_cross_checked_genes",
+        "--obsm_gene_tokens", "gene_id_tokens",
+        "--obsm_tokenized_values", "values_tokenized",
+        "--obsm_padding_mask", "padding_mask",
+        "--pad_token", "<pad>",
+        "--pad_value", "-2",
+        "--input_obsm_binned_counts", "binned_counts",
+        "--model_vocab", vocab_file
+    ])
 
     output_file = mu.read(output)
     output_adata = output_file.mod["rna"]
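
Note (not part of the patch): a minimal sketch of how the regenerated Kim2020_Lung_subset_tokenized.h5mu resource could be sanity-checked after running resources_test_scripts/scgpt.sh, assuming the obsm keys written by scgpt/pad_tokenize above (gene_id_tokens, values_tokenized, padding_mask) and a path relative to the repository's resources_test directory; the exact layout is an assumption.

    import mudata as mu

    # Path assumed from the resources_test layout used by the updated script above (hypothetical).
    tokenized_path = "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu"

    mdata = mu.read(tokenized_path)
    adata = mdata.mod["rna"]

    # obsm keys that the scgpt/embedding test in this patch passes via
    # --obsm_gene_tokens / --obsm_tokenized_values / --obsm_padding_mask.
    token_keys = ("gene_id_tokens", "values_tokenized", "padding_mask")
    for key in token_keys:
        assert key in adata.obsm, f"missing obsm key: {key}"

    # Each token matrix should have one row per cell and matching sequence length.
    shapes = {key: adata.obsm[key].shape for key in token_keys}
    assert all(shape[0] == adata.n_obs for shape in shapes.values()), shapes
    assert len({shape[1] for shape in shapes.values()}) == 1, shapes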