Cell type annotation: scGPT workflow (#832)
Co-authored-by: DriesSchaumont <[email protected]>
Co-authored-by: Sarah <[email protected]>
Co-authored-by: Vladimir Shitov <[email protected]>
Co-authored-by: Jakub Majercik <[email protected]>
Co-authored-by: Robrecht Cannoodt <[email protected]>
6 people authored Dec 5, 2024
1 parent 963ee42 commit e11f4fd
Showing 19 changed files with 857 additions and 323 deletions.
18 changes: 17 additions & 1 deletion CHANGELOG.md
@@ -1,9 +1,25 @@
# openpipelines x.x.x

# MINOR CHANGES
## BREAKING CHANGES

* Several components under `src/scgpt` (`cross_check_genes`, `pad_tokenize`, `binning`) now process the input (query) datasets differently. Instead of subsetting datasets based on genes in the model vocabulary and/or highly variable genes, these components require an input .var column with a boolean mask specifying this information. The results are written back to the original input data, preserving the dataset structure (PR #832). A minimal sketch of the mask convention follows this change list.

## NEW FUNCTIONALITY

* `scgpt/cell_type_annotation` component update: Added support for multi-processing (PR #832).

## MINOR CHANGES

* Several components (cleanup): removed the workaround for being able to use shared utility functions with Nextflow Fusion (PR #920).

* `workflows/annotation/scgpt_annotation` workflow: Added a scGPT transformer-based cell type annotation workflow (PR #832).

* `scgpt/cross_check_genes` component update: Highly variable genes are now cross-checked based on the boolean mask in `var_input`. The filtering information is stored in the `--output_var_filter` .var field instead of subsetting the dataset (PR #832).

* `scgpt/binning` component update: This component now requires the `--var_input` parameter to provide gene filtering information. Binned data is written to the `--output_obsm_binned_counts` .obsm field in the original input data (PR #832).

* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832).
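
A minimal sketch of the boolean-mask convention referenced above (illustrative only: file paths are hypothetical, `id_in_vocab` is the components' default `--var_input` column, and `vocab.json` is assumed to map gene names to token ids):

```python
import json

import mudata as mu

# Load the model vocabulary (assumed: gene name -> token id).
with open("resources_test/scgpt/source/vocab.json") as f:
    vocab = json.load(f)

mdata = mu.read_h5mu("query.h5mu")  # hypothetical query dataset
rna = mdata.mod["rna"]

# Mark vocabulary genes with a boolean mask instead of subsetting the dataset.
rna.var["id_in_vocab"] = rna.var_names.isin(list(vocab))
mdata.write_h5mu("query_with_mask.h5mu")
```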

# openpipelines 2.0.0-rc.2

## BUG FIXES
32 changes: 32 additions & 0 deletions resources_test_scripts/scgpt.sh
@@ -11,6 +11,12 @@ OUT=resources_test/$ID
# create foundational model directory
foundation_model_dir="$OUT/source"
mkdir -p "$foundation_model_dir"
export foundation_model_dir

# create finetuned model directory
finetuned_model_dir="$OUT/finetuned_model"
mkdir -p "$finetuned_model_dir"
export finetuned_model_dir

# Check whether gdown is available
@@ -19,13 +25,39 @@ if ! command -v gdown &> /dev/null; then
exit 1
fi

# Check whether the torch Python package is available (torch ships no CLI binary)
if ! python -c "import torch" &> /dev/null; then
  echo "This script requires PyTorch. Please make sure it is installed in the Python environment on your PATH."
  exit 1
fi

echo "> Downloading scGPT foundation model (full_human)"
# download foundational model files (full_human)
# https://drive.google.com/drive/folders/1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y
gdown '1H3E_MJ-Dl36AQV6jLbna2EdvgPaqvqcC' -O "${foundation_model_dir}/vocab.json"
gdown '1hh2zGKyWAx3DyovD30GStZ3QlzmSqdk1' -O "${foundation_model_dir}/args.json"
gdown '14AebJfGOUF047Eg40hk57HCtrb0fyDTm' -O "${foundation_model_dir}/best_model.pt"

echo "> Converting to finetuned model format"
python <<HEREDOC
import os

import torch

# Paths exported by the shell script above
foundation_model_dir = os.environ.get("foundation_model_dir")
finetuned_model_dir = os.environ.get("finetuned_model_dir")
found_model_path = f"{foundation_model_dir}/best_model.pt"
ft_model_path = f"{finetuned_model_dir}/best_model.pt"

# Wrap the foundation weights in the checkpoint structure expected of a
# finetuned model: weights under "model_state_dict" plus a label mapping.
f_model_dict = torch.load(found_model_path, map_location="cpu")
model_dict = {
    "model_state_dict": f_model_dict,
    "id_to_class": {k: str(k) for k in range(15)},  # placeholder class labels
}
torch.save(model_dict, ft_model_path)
HEREDOC
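
# Optional sanity check (illustrative addition, not required by the pipeline):
# reload the converted checkpoint and confirm its structure.
python <<HEREDOC
import os

import torch

ckpt = torch.load(f"{os.environ['finetuned_model_dir']}/best_model.pt", map_location="cpu")
assert set(ckpt.keys()) == {"model_state_dict", "id_to_class"}
assert len(ckpt["id_to_class"]) == 15  # placeholder label mapping
HEREDOC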

# create test data dir
test_resources_dir="$OUT/test_resources"
mkdir -p "$test_resources_dir"
16 changes: 10 additions & 6 deletions src/scgpt/binning/config.vsh.yaml
@@ -29,6 +29,11 @@ argument_groups:
required: False
description: |
Mudata layer (key from .layers) to use as input data for binning. If not specified, .X is used.
- name: "--var_input"
type: string
default: "id_in_vocab"
description: |
The name of the adata .var column containing the boolean mask that flags vocabulary-cross-checked and/or highly variable genes.
- name: "--n_input_bins"
type: integer
default: 51
@@ -53,9 +58,9 @@ argument_groups:
choices: ["gzip", "lzf"]
description: |
The compression algorithm to use for the output h5mu file.
- name: "--binned_layer"
- name: "--output_obsm_binned_counts"
type: string
default: "binned"
default: "binned_counts"
description: |
The name of the adata .obsm field to write the binned count data to.
- name: "--seed"
@@ -67,10 +72,11 @@ resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
- path: /src/utils/subset_vars.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_preprocessed.h5mu

engines:
- type: docker
@@ -81,9 +87,7 @@ engines:
- procps
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml ]
__merge__: [ /src/base/requirements/python_test_setup.yaml ]
runners:
- type: executable
- type: nextflow
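
The script below imports a shared `subset_vars` helper from `/src/utils/subset_vars.py`, whose source is not shown in this diff. A plausible minimal implementation, stated as an assumption rather than the actual code:

```python
import anndata as ad

def subset_vars(adata: ad.AnnData, var_column: str) -> ad.AnnData:
    """Return a copy of adata restricted to genes flagged True in .var[var_column]."""
    if var_column not in adata.var.columns:
        raise ValueError(f"Column '{var_column}' not found in .var")
    mask = adata.var[var_column].to_numpy(dtype=bool)
    return adata[:, mask].copy()
```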
36 changes: 22 additions & 14 deletions src/scgpt/binning/script.py
@@ -6,21 +6,27 @@

## VIASH START
par = {
"input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu",
"output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned_sparse.h5mu",
"input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_genes_cross_checked.h5mu",
"output": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu",
"modality": "rna",
"input_layer": None,
"binned_layer": "binned",
"output_obsm_binned_counts": "binned_counts",
"n_input_bins": 51,
"output_compression": None,
"var_input": "id_in_vocab",
"seed": 0
}
meta = {
"resources_dir": "src/utils"
}
## VIASH END

if par["seed"]:
np.random.seed(par["seed"])

sys.path.append(meta["resources_dir"])
from setup_logger import setup_logger
from subset_vars import subset_vars
logger = setup_logger()

logger.info("Reading in data")
@@ -29,6 +35,9 @@
input_adata = mdata.mod[par["modality"]]
adata = input_adata.copy()

logger.info("Subsetting data based on highly variable gene and/or cross-checked genes")
adata = subset_vars(adata, par["var_input"])

logger.info("Converting the input layer into a CSR matrix")
if not par['input_layer'] or par["input_layer"] == "X":
layer_data = adata.X
@@ -40,7 +49,7 @@
raise ValueError(
f"Assuming non-negative data, but got min value {layer_data.min()}."
)

n_bins = par["n_input_bins"]  # NOTE: the first bin is always reserved for zero values
logger.info(f"Binning data into {par['n_input_bins']} bins.")

@@ -57,7 +66,7 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
digits = np.ceil(digits)
smallest_dtype = np.min_scalar_type(digits.max().astype(np.uint)) # Already checked for non-negative values
digits = digits.astype(smallest_dtype)

return digits


@@ -78,32 +87,31 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
"this is expected. You can use the `filter_cell_by_counts` "
"arg to filter out all zero rows."
)

# Add binned_rows and bin_edges as all 0
# np.stack will upcast the dtype later
binned_rows.append(np.zeros_like(non_zero_row, dtype=np.int8))
bin_edges.append(np.array([0] * n_bins))
continue

# Binning of non-zero values
bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1))
non_zero_digits = _digitize(non_zero_row, bins)
assert non_zero_digits.min() >= 1
assert non_zero_digits.max() <= n_bins - 1
binned_rows.append(non_zero_digits)

bin_edges.append(np.concatenate([[0], bins]))

# Create new CSR matrix
logger.info("Creating a new CSR matrix of the binned count values")
binned_layer = csr_matrix((np.concatenate(binned_rows, casting="same_kind"),
binned_counts = csr_matrix((np.concatenate(binned_rows, casting="same_kind"),
layer_data.indices, layer_data.indptr), shape=layer_data.shape)

# Set binned values and bin edges layers to adata object
adata.layers[par["binned_layer"]] = binned_layer
adata.obsm["bin_edges"] = np.stack(bin_edges)
input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts
input_adata.obsm["bin_edges"] = np.stack(bin_edges)

# Write mudata output
logger.info("Writing output data")
mdata.mod[par["modality"]] = adata
mdata.write(par["output"], compression=par["output_compression"])
mdata.write_h5mu(par["output"], compression=par["output_compression"])
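
To make the binning scheme concrete, here is a self-contained toy version of the per-row quantile binning (a simplification: it uses `np.digitize` directly, whereas the component's `_digitize` helper additionally post-processes the raw bin indices):

```python
import numpy as np

n_bins = 6                                   # bin 0 is reserved for zeros
row = np.array([0.0, 1.0, 2.0, 4.0, 8.0, 0.0])
non_zero = row[row > 0]

# Bin edges are quantiles of the non-zero values, as in the component.
bins = np.quantile(non_zero, np.linspace(0, 1, n_bins - 1))
digits = np.digitize(non_zero, bins)         # values land in 1..n_bins-1

binned = np.zeros_like(row, dtype=np.int8)   # zeros stay in bin 0
binned[row > 0] = digits
print(binned)                                # [0 1 2 3 5 0]
```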
25 changes: 12 additions & 13 deletions src/scgpt/binning/test.py
@@ -14,15 +14,16 @@


def test_binning(run_component, tmp_path):
input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset.h5mu"

input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu"
output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu"

run_component([
"--input", input_file_path,
"--modality", "rna",
"--binned_layer", "binned",
"--output_obsm_binned_counts", "binned_counts",
"--n_input_bins", "51",
"--var_input", "filter_with_hvg",
"--output", output_file_path
])

@@ -31,21 +32,19 @@ def test_binning(run_component, tmp_path):
output_adata = output_mdata.mod["rna"]

# Check presence of binning layers
assert "bin_edges" in output_adata.obsm.keys()
assert "binned" in output_adata.layers.keys()

assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), "Binning obsm fields were not added."

# Check bin edges
bin_edges = output_adata.obsm["bin_edges"]
assert all(bin_edges[:, 0] == 0)
assert bin_edges.shape[1] == 51
assert all(all(i>=0) for i in bin_edges)
assert all(all(i >= 0) for i in bin_edges)

# Check binned values
binned_values = output_adata.layers["binned"]
binned_values = output_adata.obsm["binned_counts"]
assert issparse(binned_values)
assert binned_values.shape == output_adata.X.shape
assert (binned_values.data <= 51).all(axis=None)


if __name__ == '__main__':
sys.exit(pytest.main([__file__]))