Update scgpt test resources (#926)

dorien-er authored Dec 10, 2024
1 parent d9628ee commit aa4cd62
Showing 8 changed files with 63 additions and 203 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -26,6 +26,8 @@

* `scgpt/pad_tokenize` component update: Genes are padded and tokenized based on filtering information in `--var_input` and `--input_obsm_binned_counts` (PR #832).

+* `resources_test_scripts/scGPT.sh`: Update scGPT test resources to avoid subsetting of datasets (PR #926).

# openpipelines 2.0.0-rc.2

## BUG FIXES
47 changes: 20 additions & 27 deletions resources_test_scripts/scgpt.sh
file mode changed: 100644 → 100755
@@ -27,8 +27,8 @@ fi

# install torch if necessary
# Check whether torch is available
-if ! command -v torch &> /dev/null; then
-  echo "This script requires torch. Please make sure the binary is added to your PATH."
+if ! python -c "import torch"; then
+  echo "This script requires torch. Please make sure it is available in your python environment."
exit 1
fi
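
The rewritten check probes the Python environment instead of the shell PATH, which matches how torch actually ships (a library, not a binary). A minimal standalone sketch of the same pattern; the version print and the stderr silencing are illustrative extras, not part of this commit:

    # Probe for torch via the same interpreter that runs the scripts below.
    if ! python -c "import torch; print('torch', torch.__version__)" 2>/dev/null; then
      echo "This script requires torch. Please make sure it is available in your python environment."
      exit 1
    fi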

@@ -77,12 +77,13 @@ input_mdata.write_h5mu("${test_resources_dir}/Kim2020_Lung.h5mu")
HEREDOC

echo "> Subsetting datasets"
-viash run src/filter/subset_h5mu/config.vsh.yaml -p docker -- \
+viash run src/filter/subset_h5mu/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset.h5mu" \
   --number_of_observations 4000

rm "${test_resources_dir}/Kim2020_Lung.h5ad"
rm "${test_resources_dir}/Kim2020_Lung.h5mu"

echo "> Preprocessing datasets"
nextflow \
@@ -95,46 +95,38 @@ nextflow \
--publish_dir "${test_resources_dir}"

echo "> Filtering highly variable features"
-viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/iKim2020_Lung_subset_preprocessed.h5mu" \
+viash run src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_preprocessed.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --layer "log_normalized" \
-  --var_name_filter "filter_with_hvg" \
+  --var_name_filter "scgpt_filter_with_hvg" \
   --n_top_features 1200 \
   --flavor "seurat_v3"

-viash run src/filter/do_filter/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
-  --var_filter "filter_with_hvg"

echo "> Running scGPT cross check genes"
-viash run src/scgpt/cross_check_genes/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg_filtered.h5mu" \
+viash run src/scgpt/cross_check_genes/config.vsh.yaml --engine docker -- \
+  --input "${test_resources_dir}/Kim2020_Lung_subset_hvg.h5mu" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
-  --vocab_file "${foundation_model_dir}/vocab.json"
+  --vocab_file "${foundation_model_dir}/vocab.json" \
+  --var_input "scgpt_filter_with_hvg" \
+  --output_var_filter "scgpt_cross_checked_genes"

echo "> Running scGPT binning"
-viash run src/scgpt/binning/config.vsh.yaml -p docker -- \
+viash run src/scgpt/binning/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_genes_cross_checked.h5mu" \
   --input_layer "log_normalized" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu"
+  --output "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
+  --output_obsm_binned_counts "binned_counts" \
+  --var_input "scgpt_cross_checked_genes"

echo "> Running scGPT tokenizing"
-viash run src/scgpt/pad_tokenize/config.vsh.yaml -p docker -- \
+viash run src/scgpt/pad_tokenize/config.vsh.yaml --engine docker -- \
   --input "${test_resources_dir}/Kim2020_Lung_subset_binned.h5mu" \
-  --input_layer "binned" \
+  --input_obsm_binned_counts "binned_counts" \
   --output "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
   --model_vocab "${foundation_model_dir}/vocab.json" \
+  --var_input "scgpt_cross_checked_genes"

-echo "> Running scGPT integration"
-viash run src/scgpt/embedding/config.vsh.yaml -p docker -- \
-  --input "${test_resources_dir}/Kim2020_Lung_subset_tokenized.h5mu" \
-  --output "${test_resources_dir}/Kim2020_Lung_subset_scgpt_integrated.h5mu" \
-  --model "${foundation_model_dir}/best_model.pt" \
-  --model_vocab "${foundation_model_dir}/vocab.json" \
-  --model_config "${foundation_model_dir}/args.json" \
-  --obs_batch_label "sample"


echo "> Removing unnecessary files in test resources dir"
find "${test_resources_dir}" -type f \( ! -name "Kim2020_*" -o ! -name "*.h5mu" \) -delete
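
Taken together, the scgpt.sh changes stop deriving a physically filtered dataset and instead thread filter annotations through the chain, while also migrating the viash CLI from the older -p flag to --engine (my reading of the flag rename; the diff shows the migration but not the rationale). A comment-only sketch of the resulting data flow, with every name taken from the diff above:

    # highly_variable_features_scanpy  writes var column "scgpt_filter_with_hvg"
    # cross_check_genes   reads  --var_input "scgpt_filter_with_hvg"
    #                     writes --output_var_filter "scgpt_cross_checked_genes"
    # binning             reads  --var_input "scgpt_cross_checked_genes"
    #                     writes --output_obsm_binned_counts "binned_counts"
    # pad_tokenize        reads  --input_obsm_binned_counts "binned_counts"
    #                     and    --var_input "scgpt_cross_checked_genes"
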
2 changes: 1 addition & 1 deletion src/scgpt/cell_type_annotation/config.vsh.yaml
@@ -143,9 +143,9 @@ test_resources:
- type: python_script
path: test.py
   - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu
-  - path: /resources_test/scgpt/source/best_model.pt
   - path: /resources_test/scgpt/source/args.json
   - path: /resources_test/scgpt/source/vocab.json
+  - path: /resources_test/scgpt/finetuned_model/best_model.pt

engines:
- type: docker
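
The component now lists the shipped fine-tuned checkpoint instead of the foundation checkpoint it previously converted in the test (see the next file). Reconstructing from the diff above, the test_resources block after this change should read:

    test_resources:
      - type: python_script
        path: test.py
      - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu
      - path: /resources_test/scgpt/source/args.json
      - path: /resources_test/scgpt/source/vocab.json
      - path: /resources_test/scgpt/finetuned_model/best_model.pt
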
17 changes: 1 addition & 16 deletions src/scgpt/cell_type_annotation/test.py
@@ -1,31 +1,16 @@
import pytest
from mudata import read_h5mu
import sys
-import torch
-import subprocess
-import re


input_path = f'{meta["resources_dir"]}/Kim2020_Lung_subset_tokenized.h5mu'
model = f'{meta["resources_dir"]}/best_model.pt'
-ft_model = f'{meta["resources_dir"]}/ft_best_model.pt'
+ft_model = f'{meta["resources_dir"]}/best_model.pt'
model_config = f'{meta["resources_dir"]}/args.json'
model_vocab = f'{meta["resources_dir"]}/vocab.json'


-def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key):
-    f_model_dict = torch.load(scgpt_path, map_location="cpu")
-    model_dict = {}
-    model_dict[state_dict_key] = f_model_dict
-    model_dict[mapper_key] = {k: str(k) for k in range(15)}
-    torch.save(model_dict, ft_scgpt_path)
-
-
-# Convert foundation model into fine-tuned model architecture:
-# To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary
-scgpt_to_ft_scgpt(model, ft_model, "model_state_dict", "id_to_class")


def test_cell_type_inference(run_component, tmp_path):
output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu"

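
The conversion shim is dropped because the test resources now ship a real fine-tuned checkpoint (finetuned_model/best_model.pt), so the test no longer fabricates one. For reference, a sketch of the checkpoint layout the removed helper produced; the two keys come from the removed call, while the loading code itself is illustrative:

    import torch

    # A fine-tuned scGPT checkpoint bundles the weights with a label mapper.
    ckpt = torch.load("best_model.pt", map_location="cpu")
    state_dict = ckpt["model_state_dict"]  # the foundation model weights
    id_to_class = ckpt["id_to_class"]      # class index -> cell-type label
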
3 changes: 2 additions & 1 deletion src/scgpt/embedding/config.vsh.yaml
@@ -133,7 +133,8 @@ test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/source
-  - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu
+  - path: /resources_test/scgpt/finetuned_model
+  - path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu

engines:
- type: docker
3 changes: 3 additions & 0 deletions src/scgpt/embedding/script.py
@@ -27,6 +27,9 @@
"dsbn": True,
"n_input_bins": 51,
}
meta = {
"resources_dir": "src/utils",
}
## VIASH END

sys.path.append(meta["resources_dir"])
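
The added meta block follows the viash placeholder convention visible above: code between the VIASH START and VIASH END markers supplies defaults for running the script outside viash, and is replaced with real values at build time (my reading of the convention; the diff only shows the END marker). A minimal sketch, with the par dict as an illustrative stand-in for the component's real arguments:

    import sys

    ## VIASH START
    # Local-run defaults; viash overwrites this block at build time.
    par = {"input": "input.h5mu"}          # illustrative parameter
    meta = {"resources_dir": "src/utils"}  # value added by this commit
    ## VIASH END

    sys.path.append(meta["resources_dir"])  # make shared utils importable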
(Diffs for the remaining two changed files did not load.)
