From 9e87b864f699c371b444b592a19e610a3c9d3286 Mon Sep 17 00:00:00 2001
From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
Date: Thu, 5 Dec 2024 15:33:43 +0100
Subject: [PATCH] Add python autoformatting using ruff (#921)

---
 .github/workflows/viash-test.yml | 19 +
 .pre-commit-config.yaml | 7 +
 ruff.toml | 43 +
 src/annotate/celltypist/script.py | 77 +-
 src/annotate/celltypist/test.py | 195 +-
 src/annotate/onclass/script.py | 95 +-
 src/annotate/onclass/test.py | 185 +-
 src/annotate/popv/script.py | 75 +-
 src/annotate/popv/test.py | 78 +-
 src/annotate/random_forest_annotation/script.py | 69 +-
 src/annotate/random_forest_annotation/test.py | 199 +-
 src/annotate/scanvi/script.py | 89 +-
 src/annotate/scanvi/test.py | 248 ++-
 src/annotate/svm_annotation/script.py | 71 +-
 src/annotate/svm_annotation/test.py | 136 +-
 src/base/openpipelinetestutils/asserters.py | 229 ++-
 src/base/openpipelinetestutils/conftest.py | 7 +-
 src/base/openpipelinetestutils/fixtures.py | 30 +-
 src/base/openpipelinetestutils/typing.py | 2 +-
 src/base/openpipelinetestutils/utils.py | 81 +-
 src/cluster/leiden/script.py | 188 +-
 src/cluster/leiden/test.py | 99 +-
 src/compression/compress_h5mu/run_test.py | 47 +-
 src/compression/compress_h5mu/script.py | 5 +-
 src/convert/from_10xh5_to_h5mu/script.py | 24 +-
 src/convert/from_10xh5_to_h5mu/test.py | 43 +-
 src/convert/from_10xmtx_to_h5mu/run_test.py | 25 +-
 src/convert/from_10xmtx_to_h5mu/script.py | 8 +-
 src/convert/from_bdrhap_to_h5mu/script.py | 24 +-
 src/convert/from_bdrhap_to_h5mu/test.py | 64 +-
 .../from_cellranger_multi_to_h5mu/script.py | 371 ++--
 .../from_cellranger_multi_to_h5mu/test.py | 165 +-
 src/convert/from_h5ad_to_h5mu/script.py | 24 +-
 src/convert/from_h5ad_to_h5mu/test.py | 30 +-
 src/convert/from_h5mu_to_h5ad/script.py | 5 +-
 src/convert/from_h5mu_to_h5ad/test.py | 19 +-
 .../cellbender_remove_background/script.py | 110 +-
 .../cellbender_remove_background/test.py | 22 +-
 .../helper.py | 109 +-
 .../script.py | 101 +-
 .../cellbender_remove_background_v0_2/test.py | 22 +-
 src/dataflow/concatenate_h5mu/script.py | 267 ++-
 src/dataflow/concatenate_h5mu/test.py | 1099 ++++++----
 src/dataflow/merge/script.py | 43 +-
 src/dataflow/merge/test.py | 224 ++-
 src/dataflow/split_h5mu/script.py | 50 +-
 src/dataflow/split_h5mu/test.py | 221 ++-
 src/dataflow/split_h5mu_train_test/script.py | 38 +-
 src/dataflow/split_h5mu_train_test/test.py | 203 +-
 src/dataflow/split_modalities/script.py | 34 +-
 src/dataflow/split_modalities/test.py | 65 +-
 src/demux/cellranger_atac_mkfastq/test.py | 17 +-
 src/demux/cellranger_mkfastq/test.py | 17 +-
 src/dimred/densmap/script.py | 102 +-
 src/dimred/densmap/test.py | 99 +-
 src/dimred/lsi/script.py | 92 +-
 src/dimred/lsi/test.py | 203 +-
 src/dimred/pca/script.py | 43 +-
 src/dimred/pca/test.py | 231 ++-
 src/dimred/tsne/script.py | 54 +-
 src/dimred/tsne/test.py | 135 +-
 src/dimred/umap/script.py | 66 +-
 src/dimred/umap/test.py | 65 +-
 .../highly_variable_features_scanpy/script.py | 147 +-
 .../highly_variable_features_scanpy/test.py | 181 +-
 .../score_genes_cell_cycle_scanpy/script.py | 26 +-
 .../score_genes_cell_cycle_scanpy/test.py | 161 +-
 .../score_genes_scanpy/helper.py | 16 +-
 .../score_genes_scanpy/script.py | 22 +-
 .../score_genes_scanpy/test.py | 140 +-
 src/filter/delimit_fraction/script.py | 49 +-
 src/filter/delimit_fraction/test.py | 129 +-
 src/filter/do_filter/script.py | 29 +-
 src/filter/do_filter/test.py | 168 +-
 src/filter/filter_with_counts/script.py | 86 +-
 src/filter/filter_with_counts/test.py | 224 ++-
 src/filter/filter_with_scrublet/script.py | 33 +-
 src/filter/filter_with_scrublet/test.py | 240 ++-
 src/filter/intersect_obs/script.py | 45 +-
 src/filter/intersect_obs/test.py | 67 +-
 src/filter/remove_modality/script.py | 18 +-
 src/filter/remove_modality/test.py | 29 +-
 src/filter/subset_h5mu/script.py | 10 +-
 src/filter/subset_h5mu/test.py | 62 +-
 src/filter/subset_obsp/script.py | 26 +-
 src/filter/subset_obsp/test.py | 32 +-
 src/genetic_demux/demuxlet/config.vsh.yaml | 7 +-
 src/genetic_demux/demuxlet/demuxlet.patch | 12 +
 src/genetic_demux/freemuxlet/config.vsh.yaml | 7 +-
 src/genetic_demux/freemuxlet/freemuxlet.patch | 12 +
 src/integrate/harmonypy/script.py | 11 +-
 src/integrate/harmonypy/test.py | 49 +-
 src/integrate/scanorama/script.py | 25 +-
 src/integrate/scanorama/test.py | 62 +-
 src/integrate/scarches/script.py | 76 +-
 src/integrate/scarches/test.py | 44 +-
 src/integrate/scvi/script.py | 89 +-
 src/integrate/scvi/test.py | 83 +-
 src/integrate/totalvi/script.py | 115 +-
 src/integrate/totalvi/test.py | 42 +-
 src/interpret/lianapy/script.py | 50 +-
 src/interpret/lianapy/test.py | 69 +-
 src/labels_transfer/knn/script.py | 65 +-
 src/labels_transfer/knn/test.py | 157 +-
 src/labels_transfer/utils/helper.py | 20 +-
 src/labels_transfer/xgboost/script.py | 215 +-
 src/labels_transfer/xgboost/test.py | 235 ++-
 .../bd_rhapsody/rhapsody_cell_label.py | 1768 ++++++++++++++---
 src/mapping/bd_rhapsody/script.py | 270 +--
 src/mapping/bd_rhapsody/test.py | 42 +-
 src/mapping/cellranger_atac_count/test.py | 35 +-
 src/mapping/cellranger_count/test.py | 199 +-
 src/mapping/cellranger_multi/script.py | 366 ++--
 src/mapping/cellranger_multi/test.py | 709 ++++---
 src/mapping/htseq_count/script.py | 60 +-
 src/mapping/htseq_count/test.py | 25 +-
 src/mapping/htseq_count_to_h5mu/script.py | 68 +-
 src/mapping/htseq_count_to_h5mu/test.py | 29 +-
 src/mapping/multi_star/script.py | 106 +-
 src/mapping/multi_star/test.py | 79 +-
 src/mapping/multi_star_to_h5mu/script.py | 36 +-
 src/mapping/multi_star_to_h5mu/test.py | 18 +-
 src/mapping/samtools_sort/script.py | 35 +-
 src/mapping/samtools_sort/test.py | 33 +-
 src/mapping/star_align/script.py | 90 +-
 src/mapping/star_align/test.py | 47 +-
 src/mapping/star_align_v273a/test.py | 43 +-
 src/metadata/add_id/script.py | 41 +-
 src/metadata/add_id/test.py | 118 +-
 src/metadata/duplicate_obs/script.py | 25 +-
 src/metadata/duplicate_obs/test.py | 93 +-
 src/metadata/duplicate_var/script.py | 23 +-
 src/metadata/duplicate_var/test.py | 99 +-
 src/metadata/grep_annotation_column/script.py | 122 +-
 src/metadata/grep_annotation_column/test.py | 617 +++---
 src/metadata/join_csv/script.py | 32 +-
 src/metadata/join_csv/test.py | 157 +-
 src/metadata/join_uns_to_obs/script.py | 14 +-
 src/metadata/join_uns_to_obs/test.py | 66 +-
 src/metadata/move_obsm_to_obs/script.py | 46 +-
 src/metadata/move_obsm_to_obs/test.py | 206 +-
 src/neighbors/bbknn/script.py | 30 +-
 src/neighbors/bbknn/test.py | 78 +-
 src/neighbors/find_neighbors/script.py | 32 +-
 src/neighbors/find_neighbors/test.py | 35 +-
 src/qc/calculate_qc_metrics/script.py | 122 +-
 src/qc/calculate_qc_metrics/test.py | 348 ++--
 src/qc/multiqc/script.py | 9 +-
 src/qc/multiqc/test.py | 18 +-
 src/query/cellxgene_census/script.py | 81 +-
 src/query/cellxgene_census/test.py | 91 +-
 .../build_bdrhap_reference/script.py | 69 +-
 src/reference/build_star_reference/script.py | 68 +-
 src/reference/cellranger_mkgtf/test.py | 81 +-
 src/scgpt/binning/script.py | 44 +-
 src/scgpt/binning/test.py | 33 +-
 src/scgpt/cell_type_annotation/script.py | 79 +-
 src/scgpt/cell_type_annotation/test.py | 219 +-
 src/scgpt/cross_check_genes/script.py | 23 +-
 src/scgpt/cross_check_genes/test.py | 77 +-
 src/scgpt/embedding/script.py | 55 +-
 src/scgpt/embedding/test.py | 366 ++--
 src/scgpt/pad_tokenize/script.py | 15 +-
 src/scgpt/pad_tokenize/test.py | 85 +-
 src/transform/bpcells_regress_out/test.py | 60 +-
 src/transform/clr/script.py | 25 +-
 src/transform/clr/test.py | 137 +-
 src/transform/delete_layer/script.py | 45 +-
 src/transform/delete_layer/test.py | 115 +-
 src/transform/log1p/run_test.py | 72 +-
 src/transform/log1p/script.py | 18 +-
 src/transform/move_layer/script.py | 22 +-
 src/transform/move_layer/test.py | 98 +-
 src/transform/normalize_total/script.py | 21 +-
 src/transform/normalize_total/test.py | 55 +-
 src/transform/regress_out/script.py | 10 +-
 src/transform/regress_out/test.py | 44 +-
 src/transform/scale/script.py | 34 +-
 src/transform/scale/test.py | 115 +-
 src/transform/tfidf/script.py | 3 +-
 src/transform/tfidf/test.py | 100 +-
 src/utils/compress_h5mu.py | 54 +-
 src/utils/cross_check_genes.py | 8 +-
 src/utils/setup_logger.py | 2 +-
 src/utils/subset_vars.py | 8 +-
 src/velocity/scvelo/script.py | 84 +-
 src/velocity/scvelo/test.py | 26 +-
 src/velocity/velocyto/test.py | 89 +-
 src/velocity/velocyto_to_h5mu/script.py | 11 +-
 src/velocity/velocyto_to_h5mu/test.py | 21 +-
 .../test_workflows/annotation/scgpt/script.py | 40 +-
 .../ingestion/bd_rhapsody/script.py | 46 +-
 .../ingestion/cellranger_mapping/script.py | 32 +-
 .../ingestion/cellranger_multi/script.py | 35 +-
 .../cellranger_postprocessing/script.py | 45 +-
 .../ingestion/conversion/script.py | 21 +-
 .../dimensionality_reduction/script.py | 18 +-
 .../process_batches/workflow_test/script.py | 28 +-
 .../process_batches/workflow_test2/script.py | 44 +-
 .../multiomics/split_modalities/script.py | 18 +-
 src/workflows/test_workflows/qc/script.py | 52 +-
 201 files changed, 12877 insertions(+), 6747 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 ruff.toml
 create mode 100644 src/genetic_demux/demuxlet/demuxlet.patch
 create mode 100644 src/genetic_demux/freemuxlet/freemuxlet.patch

diff --git a/.github/workflows/viash-test.yml b/.github/workflows/viash-test.yml
index db1426747d0..2e4f549f7d5 100644
--- a/.github/workflows/viash-test.yml
+++ b/.github/workflows/viash-test.yml
@@ -10,6 +10,25 @@ concurrency:
   cancel-in-progress: ${{ !contains(github.ref, 'main')}}
 
 jobs:
+  linting:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff
+      - name: Run Ruff
+        run: ruff check --output-format=github .
+
+
   # phase 1
   list:
     env:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000000..acaaaafa703
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.8.1
+  hooks:
+    - id: ruff
+    - id: ruff-format
\ No newline at end of file
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 00000000000..0b94ce246fb
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,43 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".git",
+    ".pyenv",
+    ".pytest_cache",
+    ".ruff_cache",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+]
+
+builtins = ["meta"]
+
+
+
+
+[format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+[lint.flake8-pytest-style]
+fixture-parentheses = false
+mark-parentheses = false
+
+[lint]
+ignore = [
+    # module level import not at top of file
+    "E402"
+]
\ No newline at end of file
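The rest of the patch appears to be the mechanical result of running `ruff format` under the configuration above. As a quick illustration of what that configuration enforces, here is a hedged before/after sketch (a made-up snippet, not a file from this repository):

    # Before formatting: single quotes, one-line dict with a magic trailing comma.
    par = {'input': 'in.h5mu', 'output': 'out.h5mu',}

    # After `ruff format`: double quotes (quote-style = "double") and, because
    # skip-magic-trailing-comma = false, the trailing comma forces one item per line.
    par = {
        "input": "in.h5mu",
        "output": "out.h5mu",
    }

The `builtins = ["meta"]` entry keeps the linter from flagging the `meta` variable that Viash injects into component scripts, and ignoring E402 accommodates the `sys.path.append(meta["resources_dir"])` pattern followed by imports that recurs throughout the diffs below.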
diff --git a/src/annotate/celltypist/script.py b/src/annotate/celltypist/script.py
index 7838df7f769..8f4c7dd1ede 100644
--- a/src/annotate/celltypist/script.py
+++ b/src/annotate/celltypist/script.py
@@ -25,8 +25,7 @@
     "output_obs_predictions": "celltypist_pred",
     "output_obs_probabilities": "celltypist_probability",
 }
-meta = {
-}
+meta = {}
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
@@ -37,16 +36,20 @@
 logger = setup_logger()
 
+
 def check_celltypist_format(indata):
-    if np.abs(np.expm1(indata[0]).sum()-10000) > 1:
+    if np.abs(np.expm1(indata[0]).sum() - 10000) > 1:
         return False
     return True
 
 
 def main(par):
-
-    if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]):
-        raise ValueError("Make sure to provide either 'model' or 'reference', but not both.")
+    if (not par["model"] and not par["reference"]) or (
+        par["model"] and par["reference"]
+    ):
+        raise ValueError(
+            "Make sure to provide either 'model' or 'reference', but not both."
+        )
 
     input_mudata = mu.read_h5mu(par["input"])
     input_adata = input_mudata.mod[par["modality"]]
@@ -59,29 +62,53 @@ def main(par):
     if par["model"]:
         logger.info("Loading CellTypist model")
         model = celltypist.models.Model.load(par["model"])
-        cross_check_genes(input_modality.var.index, model.features, min_gene_overlap=par["input_reference_gene_overlap"])
+        cross_check_genes(
+            input_modality.var.index,
+            model.features,
+            min_gene_overlap=par["input_reference_gene_overlap"],
+        )
 
     elif par["reference"]:
         reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]
 
         # subset to HVG if required
         if par["reference_var_input"]:
-            reference_modality = subset_vars(reference_modality, par["reference_var_input"])
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
 
-        # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) 
+        # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
         # CellTypist requires query gene names to be in index
-        reference_modality = set_var_index(reference_modality, par["reference_var_gene_names"])
+        reference_modality = set_var_index(
+            reference_modality, par["reference_var_gene_names"]
+        )
 
         # Ensure enough overlap between genes in query and reference
-        cross_check_genes(input_modality.var.index, reference_modality.var.index, min_gene_overlap=par["input_reference_gene_overlap"])
+        cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            min_gene_overlap=par["input_reference_gene_overlap"],
+        )
 
-        input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X
-        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X
+        input_matrix = (
+            input_modality.layers[par["input_layer"]]
+            if par["input_layer"]
+            else input_modality.X
+        )
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
 
         if not check_celltypist_format(input_matrix):
-            logger.warning("Input data is not in the reccommended format for CellTypist.")
+            logger.warning(
+                "Input data is not in the reccommended format for CellTypist."
+            )
         if not check_celltypist_format(reference_matrix):
-            logger.warning("Reference data is not in the reccommended format for CellTypist.")
+            logger.warning(
+                "Reference data is not in the reccommended format for CellTypist."
+            )
 
         labels = reference_modality.obs[par["reference_obs_target"]]
@@ -94,21 +121,23 @@ def main(par):
             max_iter=par["max_iter"],
             use_SGD=par["use_SGD"],
             feature_selection=par["feature_selection"],
-            check_expression=par["check_expression"]
-            )
+            check_expression=par["check_expression"],
+        )
 
     logger.info("Predicting CellTypist annotations")
     predictions = celltypist.annotate(
-        input_modality,
-        model,
-        majority_voting=par["majority_voting"]
-        )
-    input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels["predicted_labels"]
-    input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(axis=1).values
+        input_modality, model, majority_voting=par["majority_voting"]
+    )
+    input_adata.obs[par["output_obs_predictions"]] = predictions.predicted_labels[
+        "predicted_labels"
+    ]
+    input_adata.obs[par["output_obs_probability"]] = predictions.probability_matrix.max(
+        axis=1
+    ).values
 
     # copy observations back to input data (with full set of features)
     input_mudata.write_h5mu(par["output"], compression=par["output_compression"])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main(par)
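For context on `check_celltypist_format` above: CellTypist expects log1p-normalized expression scaled to 10,000 counts per cell, which is why the check tests whether `expm1` of a row sums back to roughly 10,000. A minimal sketch of preparing a query AnnData that passes this check (standard scanpy calls; the function name and variables are illustrative, not part of this repository):

    import anndata as ad
    import scanpy as sc

    def normalize_for_celltypist(adata: ad.AnnData) -> ad.AnnData:
        # Scale each cell to 10,000 total counts, then log1p-transform,
        # matching the `expm1(row).sum() ~= 10000` check in the script above.
        out = adata.copy()
        sc.pp.normalize_total(out, target_sum=1e4)
        sc.pp.log1p(out)
        return out

The `normalize_log_transform` fixture in the test file that follows performs the same transformation before invoking the component.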
diff --git a/src/annotate/celltypist/test.py b/src/annotate/celltypist/test.py
index fcdf23fb283..ff64dbb921b 100644
--- a/src/annotate/celltypist/test.py
+++ b/src/annotate/celltypist/test.py
@@ -7,16 +7,19 @@
 import scanpy as sc
 import anndata as ad
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
+
 ## VIASH START
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ## VIASH END
 
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
 reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
-model_file = f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
-celltypist_input_file = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
+model_file = (
+    f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
+)
+celltypist_input_file = (
+    f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
+)
 
 
 @pytest.fixture
@@ -28,16 +31,19 @@ def wrapper(input_mudata_file, modality, target_sum=1e4):
         input_layer = adata.X
         data_for_scanpy = ad.AnnData(X=input_layer.copy())
         sc.pp.normalize_total(data_for_scanpy, target_sum=target_sum)
-        sc.pp.log1p(data_for_scanpy,
-                    base=None,
-                    layer=None,  # use X
-                    copy=False)  # allow overwrites in the copy that was made
+        sc.pp.log1p(
+            data_for_scanpy,
+            base=None,
+            layer=None,  # use X
+            copy=False,
+        )  # allow overwrites in the copy that was made
         adata.X = data_for_scanpy.X
-        adata.uns['log1p'] = data_for_scanpy.uns['log1p'].copy()
+        adata.uns["log1p"] = data_for_scanpy.uns["log1p"].copy()
         input_mudata.mod[modality] = adata
         transformed_input_mudata_file = random_h5mu_path()
         input_mudata.write_h5mu(transformed_input_mudata_file)
         return transformed_input_mudata_file
+
     return wrapper
 
 
@@ -45,110 +51,163 @@ def test_simple_execution(run_component, random_h5mu_path, normalize_log_transfo
     output_file = random_h5mu_path()
     input_file_transformed = normalize_log_transform(input_file, "rna")
 
-    run_component([
-        "--input", input_file_transformed,
-        "--reference", reference_file,
-        "--reference_obs_targets", "cell_ontology_class",
-        "--reference_var_gene_names", "ensemblid",
-        "--output", output_file
-    ])
+    run_component(
+        [
+            "--input",
+            input_file_transformed,
+            "--reference",
+            reference_file,
+            "--reference_obs_targets",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--output",
+            output_file,
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     input_mudata = mu.read_h5mu(input_file_transformed)
     output_mudata = mu.read_h5mu(output_file)
 
-    assert_annotation_objects_equal(input_mudata.mod["prot"],
-                                    output_mudata.mod["prot"])
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
 
-    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
 
     obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at celltypist_probability has values outside the range [0, 1]"
 
 
 def test_set_params(run_component, random_h5mu_path, normalize_log_transform):
     output_file = random_h5mu_path()
     input_file_transformed = normalize_log_transform(input_file, "rna")
 
-    run_component([
-        "--input", input_file_transformed,
-        "--reference", reference_file,
-        "--reference_obs_target", "cell_ontology_class",
-        "--reference_var_gene_names", "ensemblid",
-        "--feature_selection", "True",
-        "--majority_voting", "True",
-        "--C", "0.5",
-        "--max_iter", "100",
-        "--use_SGD",
-        "--min_prop", "0.1",
-        "--input_layer", "log_normalized",
-        "--output", output_file,
-        "--output_compression", "gzip",
-    ])
+    run_component(
+        [
+            "--input",
+            input_file_transformed,
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
+            "--feature_selection",
+            "True",
+            "--majority_voting",
+            "True",
+            "--C",
+            "0.5",
+            "--max_iter",
+            "100",
+            "--use_SGD",
+            "--min_prop",
+            "0.1",
+            "--input_layer",
+            "log_normalized",
+            "--output",
+            output_file,
+            "--output_compression",
+            "gzip",
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     input_mudata = mu.read_h5mu(input_file_transformed)
     output_mudata = mu.read_h5mu(output_file)
 
-    assert_annotation_objects_equal(input_mudata.mod["prot"],
-                                    output_mudata.mod["prot"])
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
 
-    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
 
     obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at celltypist_probability has values outside the range [0, 1]"
 
 
 def test_with_model(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
-    run_component([
-        "--input", celltypist_input_file,
-        "--model", model_file,
-        "--reference_obs_targets", "cell_type",
-        "--output", output_file
-    ])
+    run_component(
+        [
+            "--input",
+            celltypist_input_file,
+            "--model",
+            model_file,
+            "--reference_obs_targets",
+            "cell_type",
+            "--output",
+            output_file,
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     output_mudata = mu.read_h5mu(output_file)
 
-    assert {'celltypist_pred', 'celltypist_probability'}.issubset(output_mudata.mod["rna"].obs.keys()), "Required keys not found in .obs"
+    assert {"celltypist_pred", "celltypist_probability"}.issubset(
+        output_mudata.mod["rna"].obs.keys()
+    ), "Required keys not found in .obs"
 
     obs_values = output_mudata.mod["rna"].obs["celltypist_probability"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at celltypist_probability has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at celltypist_probability has values outside the range [0, 1]"
 
 
 def test_fail_check_reference_expression(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
     with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input", input_file,
-            "--reference", reference_file,
-            "--reference_var_gene_names", "ensemblid",
-            "--output", output_file,
-            "--check_expression"
-        ])
-    assert re.search(r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell",
-                     err.value.stdout.decode('utf-8'))
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--reference",
+                reference_file,
+                "--reference_var_gene_names",
+                "ensemblid",
+                "--output",
+                output_file,
+                "--check_expression",
+            ]
+        )
    assert re.search(
        r"Invalid expression matrix, expect log1p normalized expression to 10000 counts per cell",
        err.value.stdout.decode("utf-8"),
    )
 
 
 def test_fail_invalid_input_expression(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
     with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input", input_file,
-            "--reference", reference_file,
-            "--reference_var_gene_names", "ensemblid",
-            "--output", output_file
-        ])
-    assert re.search(r"Invalid expression matrix in `.X`, expect log1p normalized expression to 10000 counts per cell",
-                     err.value.stdout.decode('utf-8'))
-
-
-if __name__ == '__main__':
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--reference",
+                reference_file,
+                "--reference_var_gene_names",
+                "ensemblid",
+ "--output", + output_file, + ] + ) + assert re.search( + r"Invalid expression matrix in `.X`, expect log1p normalized expression to 10000 counts per cell", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/annotate/onclass/script.py b/src/annotate/onclass/script.py index 100169d602d..0306c988f9b 100644 --- a/src/annotate/onclass/script.py +++ b/src/annotate/onclass/script.py @@ -29,7 +29,7 @@ "input_reference_gene_overlap": 100, "reference_var_input": None, "reference_var_gene_names": None, - "unkown_celltype": "Unknown" + "unkown_celltype": "Unknown", } meta = {"resources_dir": "src/utils"} ## VIASH END @@ -43,7 +43,9 @@ logger = setup_logger() -def map_celltype_to_ontology_id(cl_obo_file: str) -> Tuple[Dict[str, str], Dict[str, str]]: +def map_celltype_to_ontology_id( + cl_obo_file: str, +) -> Tuple[Dict[str, str], Dict[str, str]]: """ Map cell type names to ontology IDs and vice versa. @@ -66,11 +68,11 @@ def map_celltype_to_ontology_id(cl_obo_file: str) -> Tuple[Dict[str, str], Dict[ def cell_type_prediction( - model: OnClassModel, - input_matrix: np.array, - input_features: List[str], - id_to_name: dict - ) -> Tuple[List[str], List[float]]: + model: OnClassModel, + input_matrix: np.array, + input_features: List[str], + id_to_name: dict, +) -> Tuple[List[str], List[float]]: """ Predict cell types for input data and save results to Anndata obj. @@ -97,7 +99,9 @@ def cell_type_prediction( test_genes=input_features, log_transform=False, ) - onclass_pred = model.Predict(corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0) + onclass_pred = model.Predict( + corr_test_feature, use_normalize=False, refine=True, unseen_ratio=-1.0 + ) pred_label = [model.i2co[ind] for ind in onclass_pred[2]] pred_cell_type_label = [id_to_name[id] for id in pred_label] prob_cell_type_label = np.max(onclass_pred[1], axis=1) / onclass_pred[1].sum(1) @@ -106,9 +110,12 @@ def cell_type_prediction( def main(): - - if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]): - raise ValueError("Make sure to provide either 'model' or 'reference', but not both.") + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) logger.info("Reading input data") input_mudata = mu.read_h5mu(par["input"]) @@ -117,45 +124,68 @@ def main(): # Set var names to the desired gene name format (gene symbol, ensembl id, etc.) 
     input_modality = set_var_index(input_modality, par["input_var_gene_names"])
-    input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X
+    input_matrix = (
+        input_modality.layers[par["input_layer"]]
+        if par["input_layer"]
+        else input_modality.X
+    )
 
     # Onclass needs dense matrix format
     input_matrix = input_matrix.toarray()
-    
+
     id_to_name, name_to_id = map_celltype_to_ontology_id(par["cl_obo_file"])
 
     if par["model"]:
         logger.info("Predicting cell types using pre-trained model")
         model = OnClassModel(
             cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
-            cell_type_network_file=par["cl_ontology_file"]
-            )
+            cell_type_network_file=par["cl_ontology_file"],
+        )
         model.BuildModel(use_pretrain=par["model"], ngene=None)
-        cross_check_genes(model.genes, input_modality.var.index, par["input_reference_gene_overlap"])
+        cross_check_genes(
+            model.genes, input_modality.var.index, par["input_reference_gene_overlap"]
+        )
 
     elif par["reference"]:
         logger.info("Reading reference data")
-        model = OnClassModel(cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
-                             cell_type_network_file=par["cl_ontology_file"])
+        model = OnClassModel(
+            cell_type_nlp_emb_file=par["cl_nlp_emb_file"],
+            cell_type_network_file=par["cl_ontology_file"],
+        )
 
         reference_mudata = mu.read_h5mu(par["reference"])
         reference_modality = reference_mudata.mod[par["modality"]].copy()
-        reference_modality = set_var_index(reference_modality, par["reference_var_gene_names"])
+        reference_modality = set_var_index(
+            reference_modality, par["reference_var_gene_names"]
+        )
 
         # subset to HVG if required
         if par["reference_var_input"]:
-            reference_modality = subset_vars(reference_modality, par["reference_var_input"])
+            reference_modality = subset_vars(
+                reference_modality, par["reference_var_input"]
+            )
 
-        cross_check_genes(input_modality.var.index, reference_modality.var.index, par["input_reference_gene_overlap"])
+        cross_check_genes(
+            input_modality.var.index,
+            reference_modality.var.index,
+            par["input_reference_gene_overlap"],
+        )
 
-        reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X
+        reference_matrix = (
+            reference_modality.layers[par["reference_layer"]]
+            if par["reference_layer"]
+            else reference_modality.X
+        )
 
         # Onclass needs dense matrix format
         reference_matrix = reference_matrix.toarray()
 
         logger.info("Training a model from reference...")
         labels = reference_modality.obs[par["reference_obs_target"]].tolist()
-        labels_cl = [name_to_id[label] if label in name_to_id else par["unknown_celltype"] for label in labels]
+        labels_cl = [
+            name_to_id[label] if label in name_to_id else par["unknown_celltype"]
+            for label in labels
+        ]
 
         _ = model.EmbedCellTypes(labels_cl)
         corr_train_feature, _, corr_train_genes, _ = model.ProcessTrainFeature(
@@ -165,23 +195,14 @@ def main():
             test_feature=input_matrix,
             test_genes=input_modality.var.index,
             log_transform=False,
-            )
-        model.BuildModel(
-            ngene=len(corr_train_genes)
-        )
-        model.Train(
-            corr_train_feature,
-            labels_cl,
-            max_iter=par["max_iter"]
-        )
+        )
+        model.BuildModel(ngene=len(corr_train_genes))
+        model.Train(corr_train_feature, labels_cl, max_iter=par["max_iter"])
 
     logger.info("Predicting cell types")
     predictions, probabilities = cell_type_prediction(
-        model,
-        input_matrix,
-        input_modality.var.index,
-        id_to_name
-        )
+        model, input_matrix, input_modality.var.index, id_to_name
+    )
 
     logger.info("Writing output data")
     input_adata.obs[par["output_obs_predictions"]] = predictions
diff --git a/src/annotate/onclass/test.py b/src/annotate/onclass/test.py
index e699e097285..9c95b009e49 100644
--- a/src/annotate/onclass/test.py
+++ b/src/annotate/onclass/test.py
@@ -4,135 +4,176 @@
 import subprocess
 import re
 import mudata as mu
-import anndata as ad
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
-import os
+
 ## VIASH START
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ## VIASH END
 
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
-cl_nlp_emb_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
+cl_nlp_emb_file = (
+    f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology.nlp.emb"
+)
 cl_ontology_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.ontology"
 cl_obo_file = f"{meta['resources_dir']}/annotation_test_data/ontology/cl.obo"
-model_file = f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
+model_file = (
+    f"{meta['resources_dir']}/annotation_test_data/onclass_model/example_file_model"
+)
 
 
 def test_simple_execution(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
-    run_component([
-        "--input", input_file,
-        "--input_var_gene_names", "gene_symbol",
-        "--reference", reference_file,
-        "--reference_obs_target", "cell_ontology_class",
-        "--cl_nlp_emb_file", cl_nlp_emb_file,
-        "--cl_ontology_file", cl_ontology_file,
-        "--cl_obo_file", cl_obo_file,
-        "--max_iter", "10",
-        "--output", output_file
-    ])
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--input_var_gene_names",
+            "gene_symbol",
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--max_iter",
+            "10",
+            "--output",
+            output_file,
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     input_mudata = mu.read_h5mu(input_file)
     output_mudata = mu.read_h5mu(output_file)
 
-    assert_annotation_objects_equal(
-        input_mudata.mod["prot"],
-        output_mudata.mod["prot"]
-        )
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
 
-    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred', 'onclass_prob']
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
 
     obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
 
 
 def test_custom_obs(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
-    run_component([
-        "--input", input_file,
-        "--input_var_gene_names", "gene_symbol",
-        "--reference", reference_file,
-        "--reference_obs_target", "cell_ontology_class",
-        "--output_obs_predictions", "dummy_pred_1",
-        "--output_obs_probability", "dummy_prob_1",
-        "--cl_nlp_emb_file", cl_nlp_emb_file,
-        "--cl_ontology_file", cl_ontology_file,
-        "--cl_obo_file", cl_obo_file,
-        "--max_iter", "10",
-        "--output", output_file
-    ])
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--input_var_gene_names",
+            "gene_symbol",
+            "--reference",
+            reference_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--output_obs_predictions",
+            "dummy_pred_1",
+            "--output_obs_probability",
+            "dummy_prob_1",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--max_iter",
+            "10",
+            "--output",
+            output_file,
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     input_mudata = mu.read_h5mu(input_file)
     output_mudata = mu.read_h5mu(output_file)
 
-    assert_annotation_objects_equal(
-        input_mudata.mod["prot"],
-        output_mudata.mod["prot"]
-        )
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
 
-    assert set(output_mudata.mod["rna"].obs.keys()) == {'dummy_pred_1', 'dummy_prob_1'}
+    assert set(output_mudata.mod["rna"].obs.keys()) == {"dummy_pred_1", "dummy_prob_1"}
 
     obs_values = output_mudata.mod["rna"].obs["dummy_prob_1"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at dummy_prob_1 has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at dummy_prob_1 has values outside the range [0, 1]"
 
 
 def test_no_model_no_reference_error(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
     with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input", input_file,
-            "--input_var_gene_names", "gene_symbol",
-            "--output", output_file,
-            "--cl_nlp_emb_file", cl_nlp_emb_file,
-            "--cl_ontology_file", cl_ontology_file,
-            "--cl_obo_file", cl_obo_file,
-            "--reference_obs_target", "cell_ontology_class"
-        ])
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--input_var_gene_names",
+                "gene_symbol",
+                "--output",
+                output_file,
+                "--cl_nlp_emb_file",
+                cl_nlp_emb_file,
+                "--cl_ontology_file",
+                cl_ontology_file,
+                "--cl_obo_file",
+                cl_obo_file,
+                "--reference_obs_target",
+                "cell_ontology_class",
+            ]
+        )
     assert re.search(
         r"ValueError: Make sure to provide either 'model' or 'reference', but not both.",
-        err.value.stdout.decode('utf-8')
-        )
+        err.value.stdout.decode("utf-8"),
+    )
 
 
 def test_pretrained_model(run_component, random_h5mu_path):
     output_file = random_h5mu_path()
 
-    run_component([
-        "--input", input_file,
-        "--input_var_gene_names", "gene_symbol",
-        "--cl_nlp_emb_file", cl_nlp_emb_file,
-        "--cl_ontology_file", cl_ontology_file,
-        "--cl_obo_file", cl_obo_file,
-        "--reference_obs_target", "cell_ontology_class",
-        "--model", model_file,
-        "--output", output_file
-    ])
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--input_var_gene_names",
+            "gene_symbol",
+            "--cl_nlp_emb_file",
+            cl_nlp_emb_file,
+            "--cl_ontology_file",
+            cl_ontology_file,
+            "--cl_obo_file",
+            cl_obo_file,
+            "--reference_obs_target",
+            "cell_ontology_class",
+            "--model",
+            model_file,
+            "--output",
+            output_file,
+        ]
+    )
 
     assert os.path.exists(output_file), "Output file does not exist"
 
     input_mudata = mu.read_h5mu(input_file)
     output_mudata = mu.read_h5mu(output_file)
 
-    assert_annotation_objects_equal(
-        input_mudata.mod["prot"],
-        output_mudata.mod["prot"]
-        )
+    assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"])
 
-    assert list(output_mudata.mod["rna"].obs.keys()) == ['onclass_pred', 'onclass_prob']
+    assert list(output_mudata.mod["rna"].obs.keys()) == ["onclass_pred", "onclass_prob"]
 
     obs_values = output_mudata.mod["rna"].obs["onclass_prob"]
-    assert all(0 <= value <= 1 for value in obs_values), ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
+    assert all(
+        0 <= value <= 1 for value in obs_values
+    ), ".obs at cell_ontology_class_prob has values outside the range [0, 1]"
 
 
-if __name__ == '__main__':
-    sys.exit(pytest.main([__file__]))
\ No newline at end of file
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
diff --git a/src/annotate/popv/script.py b/src/annotate/popv/script.py
index d3d8b5e0ad6..0c7f629b4bd 100644
--- a/src/annotate/popv/script.py
+++ b/src/annotate/popv/script.py
@@ -9,6 +9,7 @@
 
 # todo: is this still needed?
 from torch.cuda import is_available as cuda_is_available
+
 try:
     from torch.backends.mps import is_available as mps_is_available
 except ModuleNotFoundError:
@@ -17,6 +18,7 @@ def mps_is_available():
         return False
 
+
 # where to find the obo files
 cl_obo_folder = "/opt/PopV/resources/ontology/"
 
@@ -44,7 +46,7 @@ def mps_is_available():
         "rf",
         # "scanvi",
         "svm",
-    ]
+    ],
 }
 meta = {}
 # for debugging the obo folder can be somewhere local
@@ -53,13 +55,17 @@ def mps_is_available():
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 use_gpu = cuda_is_available()
 logger.info("GPU enabled? %s", use_gpu)
 
+
 # Helper functions
-def get_X(adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]):
+def get_X(
+    adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]
+):
     """Fetch the counts data from X or a layer. Subset columns by var_index if so desired."""
     if var_index:
         adata = adata[:, var_index]
@@ -67,28 +73,37 @@ def get_X(adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Opti
         return adata.layers[layer]
     else:
         return adata.X
+
+
 def get_obs(adata: ad.AnnData, obs_par_names):
     """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch."""
     obs_columns = [par[x] for x in obs_par_names if par[x]]
     return adata.obs[obs_columns]
+
+
 def get_var(adata: ad.AnnData, var_index: list[str]):
     """Fetch the var dataframe. Subset rows by var_index if so desired."""
     return adata.var.loc[var_index]
 
+
 def main(par, meta):
-    assert len(par["methods"]) >= 1, "Please, specify at least one method for cell typing."
+    assert (
+        len(par["methods"]) >= 1
+    ), "Please, specify at least one method for cell typing."
     logger.info("Cell typing methods: {}".format(par["methods"]))
 
     ### PREPROCESSING REFERENCE ###
     logger.info("### PREPROCESSING REFERENCE ###")
-    
+
     # take a look at reference data
     logger.info("Reading reference data '%s'", par["reference"])
     reference = ad.read_h5ad(par["reference"])
-    
+
     logger.info("Setting reference var index to Ensembl IDs")
     reference.var["gene_symbol"] = list(reference.var.index)
-    reference.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]]
+    reference.var.index = [
+        re.sub("\\.[0-9]+$", "", s) for s in reference.var["ensemblid"]
+    ]
 
     logger.info("Detect number of samples per label")
     min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
@@ -103,15 +118,19 @@ def main(par, meta):
     # subset with var column
     if par["input_var_subset"]:
         logger.info("Subset input with .var['%s']", par["input_var_subset"])
-        assert par["input_var_subset"] in input_modality.var, f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
-        input_modality = input_modality[:,input_modality.var[par["input_var_subset"]]]
+        assert (
+            par["input_var_subset"] in input_modality.var
+        ), f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
+        input_modality = input_modality[:, input_modality.var[par["input_var_subset"]]]
 
     ### ALIGN REFERENCE AND INPUT ###
     logger.info("### ALIGN REFERENCE AND INPUT ###")
 
     logger.info("Detecting common vars based on ensembl ids")
-    common_ens_ids = list(set(reference.var.index).intersection(set(input_modality.var.index)))
-    
+    common_ens_ids = list(
+        set(reference.var.index).intersection(set(input_modality.var.index))
+    )
+
     logger.info("  reference n_vars: %i", reference.n_vars)
     logger.info("  input n_vars: %i", input_modality.n_vars)
     logger.info("  intersect n_vars: %i", len(common_ens_ids))
@@ -119,18 +138,18 @@ def main(par, meta):
 
     # subset input objects to make sure popv is using the data we expect
     input_modality = ad.AnnData(
-        X = get_X(input_modality, par["input_layer"], common_ens_ids),
-        obs = get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
-        var = get_var(input_modality, common_ens_ids)
+        X=get_X(input_modality, par["input_layer"], common_ens_ids),
+        obs=get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
+        var=get_var(input_modality, common_ens_ids),
     )
     reference = ad.AnnData(
-        X = get_X(reference, par["reference_layer"], common_ens_ids),
-        obs = get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
-        var = get_var(reference, common_ens_ids)
+        X=get_X(reference, par["reference_layer"], common_ens_ids),
+        obs=get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
+        var=get_var(reference, common_ens_ids),
     )
-    # remove layers that 
-    
+    # remove layers that
+
     ### ALIGN REFERENCE AND INPUT ###
     logger.info("### ALIGN REFERENCE AND INPUT ###")
 
@@ -141,7 +160,7 @@ def main(par, meta):
         query_adata=input_modality,
         query_labels_key=par["input_obs_label"],
         query_batch_key=par["input_obs_batch"],
-        query_layers_key=None, # this is taken care of by subset
+        query_layers_key=None,  # this is taken care of by subset
         # reference
         ref_adata=reference,
         ref_labels_key=par["reference_obs_label"],
@@ -158,22 +177,22 @@ def main(par, meta):
         save_path_trained_models=temp_dir,
         # hardcoded values
         cl_obo_folder=cl_obo_folder,
-        accelerator='cuda' if use_gpu else 'cpu'
+        accelerator="cuda" if use_gpu else "cpu",
     )
 
     method_kwargs = {}
-    if 'scanorama' in par['methods']:
-        method_kwargs['scanorama'] = {'approx': False}
+    if "scanorama" in par["methods"]:
+        method_kwargs["scanorama"] = {"approx": False}
 
     logger.info("Annotate data")
     popv.annotation.annotate_data(
-        adata=pq.adata,
-        methods=par["methods"],
-        methods_kwargs=method_kwargs
+        adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs
     )
 
     popv_input = pq.adata[input_modality.obs_names]
 
     # select columns starting with "popv_"
-    popv_obs_cols = popv_input.obs.columns[popv_input.obs.columns.str.startswith("popv_")]
+    popv_obs_cols = popv_input.obs.columns[
+        popv_input.obs.columns.str.startswith("popv_")
+    ]
 
     # create new data frame with selected columns
     df_popv = popv_input.obs[popv_obs_cols]
@@ -198,11 +217,11 @@ def main(par, meta):
     #         print(f"{attr}:", flush=True)
     #         for key in diff_keys:
     #             print(f"  {key}", flush=True)
-    
+
     # write output
     logger.info("Writing %s", par["output"])
     input.write_h5mu(par["output"], compression=par["output_compression"])
 
+
 if __name__ == "__main__":
     main(par, meta)
-
par["methods"]: + method_kwargs["scanorama"] = {"approx": False} logger.info("Annotate data") popv.annotation.annotate_data( - adata=pq.adata, - methods=par["methods"], - methods_kwargs=method_kwargs + adata=pq.adata, methods=par["methods"], methods_kwargs=method_kwargs ) popv_input = pq.adata[input_modality.obs_names] # select columns starting with "popv_" - popv_obs_cols = popv_input.obs.columns[popv_input.obs.columns.str.startswith("popv_")] + popv_obs_cols = popv_input.obs.columns[ + popv_input.obs.columns.str.startswith("popv_") + ] # create new data frame with selected columns df_popv = popv_input.obs[popv_obs_cols] @@ -198,11 +217,11 @@ def main(par, meta): # print(f"{attr}:", flush=True) # for key in diff_keys: # print(f" {key}", flush=True) - + # write output logger.info("Writing %s", par["output"]) input.write_h5mu(par["output"], compression=par["output_compression"]) + if __name__ == "__main__": main(par, meta) - diff --git a/src/annotate/popv/test.py b/src/annotate/popv/test.py index de18509274d..872e75409ca 100644 --- a/src/annotate/popv/test.py +++ b/src/annotate/popv/test.py @@ -4,51 +4,66 @@ import mudata as mu ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" + def test_simple_execution(run_component): output_file = "output.h5mu" - run_component([ - "--input", input_file, - "--reference", reference_file, - "--output", "output.h5mu", - "--methods", "rf;svm" - ]) - + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm", + ] + ) + # check whether file exists assert os.path.exists(output_file), "Output file does not exist" - + # read output mudata output = mu.read_h5mu(output_file) # check output expected_rna_obs_cols = ["popv_prediction"] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, f"could not find columns .mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns .mod['rna'].obs['{col}']" print(f"output: {output}", flush=True) + def test_popv_with_other_layer(run_component, tmp_path): input_h5mu = mu.read(input_file) - input_h5mu.mod['rna'].layers['test'] = input_h5mu.mod['rna'].X.copy() + input_h5mu.mod["rna"].layers["test"] = input_h5mu.mod["rna"].X.copy() input_h5mu.write_h5mu(tmp_path / "input.h5mu") - run_component([ - "--input", tmp_path / "input.h5mu", - "--reference", reference_file, - "--output", "output.h5mu", - "--methods", "rf;svm;knn_on_scanorama;knn_on_scvi" - ]) + run_component( + [ + "--input", + tmp_path / "input.h5mu", + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm;knn_on_scanorama;knn_on_scvi", + ] + ) + def test_popv_with_non_overlapping_cells(run_component, tmp_path): input_h5mu = mu.read(input_file) - + # copy previous modalities rna_ad = input_h5mu.mod["rna"].copy() prot_ad = input_h5mu.mod["prot"].copy() @@ -62,12 +77,19 @@ def test_popv_with_non_overlapping_cells(run_component, tmp_path): new_h5mu.write_h5mu(tmp_path / "input.h5mu") # run component - run_component([ - "--input", tmp_path / "input.h5mu", - "--reference", reference_file, - "--output", "output.h5mu", - "--methods", "rf;svm;knn_on_scanorama" - ]) - -if __name__ == '__main__': + run_component( + [ + "--input", + 
tmp_path / "input.h5mu", + "--reference", + reference_file, + "--output", + "output.h5mu", + "--methods", + "rf;svm;knn_on_scanorama", + ] + ) + + +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/annotate/random_forest_annotation/script.py b/src/annotate/random_forest_annotation/script.py index f1279fd9b4c..df401315b62 100644 --- a/src/annotate/random_forest_annotation/script.py +++ b/src/annotate/random_forest_annotation/script.py @@ -27,9 +27,8 @@ "class_weight": None, "max_features": 200, "output_compression": "gzip", - "reference_layer": None, "output_obs_predictions": "random_forest_pred", - "output_obs_probability": "random_forest_probability" + "output_obs_probability": "random_forest_probability", } meta = {"resources_dir": "src/utils"} ## VIASH END @@ -54,47 +53,81 @@ def main(): max_features_conversion = { "all": None, "sqrt": "sqrt", - "log2": "log2", + "log2": "log2", } try: - max_features = max_features_conversion.get(par["max_features"], int(par["max_features"])) + max_features = max_features_conversion.get( + par["max_features"], int(par["max_features"]) + ) except ValueError: - raise ValueError(f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of \'sqrt\', \'log2\' or \'all\'") + raise ValueError( + f"Invaldid value {par['max_features']} for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'" + ) - if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]): - raise ValueError("Make sure to provide either 'model' or 'reference', but not both.") + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) if par["model"]: logger.info("Loading a pre-trained model") model = pickle.load(open(par["model"], "rb")) if hasattr(model, "_feature_names_in"): - common_genes = cross_check_genes(input_modality.var.index, model._feature_names_in, par["input_reference_gene_overlap"]) + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) if not len(common_genes) == len(model._feature_names_in): raise ValueError("Input dataset does not contain all model features.") input_modality = input_modality[:, common_genes] - input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) else: - logger.warning("Model does not have feature names saved. Could not check overlap of model's features with query genes.") + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." 
+ ) elif par["reference"]: logger.info("Reading reference data") reference_mudata = mu.read_h5mu(par["reference"]) reference_modality = reference_mudata.mod[par["modality"]].copy() - reference_modality = set_var_index(reference_modality, par["reference_var_gene_names"]) + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) # subset to HVG if required if par["reference_var_input"]: - reference_modality = subset_vars(reference_modality, par["reference_var_input"]) + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) # Query and input require the exact same features - common_genes = cross_check_genes(input_modality.var.index, reference_modality.var.index, par["input_reference_gene_overlap"]) + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) reference_modality = reference_modality[:, common_genes] input_modality = input_modality[:, common_genes] - reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X - input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) logger.info("Training a model...") labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() @@ -102,8 +135,10 @@ def main(): n_estimators=par["n_estimators"], criterion=par["criterion"], max_depth=par["max_depth"], - class_weight=par["class_weight"] if not par["class_weight"] == "uniform" else None, - max_features=max_features + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + max_features=max_features, ) model.fit(reference_matrix, labels) model._feature_names_in = reference_modality.var.index diff --git a/src/annotate/random_forest_annotation/test.py b/src/annotate/random_forest_annotation/test.py index d62ca40e031..d1a20df0e5b 100644 --- a/src/annotate/random_forest_annotation/test.py +++ b/src/annotate/random_forest_annotation/test.py @@ -5,18 +5,17 @@ import re import mudata as mu from openpipelinetestutils.asserters import assert_annotation_objects_equal -import os from sklearn.ensemble import RandomForestClassifier import pickle ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} sys.path.append("src/utils") ## VIASH END -input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +input_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu" sys.path.append(meta["resources_dir"]) @@ -32,7 +31,9 @@ def dummy_model(tmp_path): input_modality = mu.read_h5mu(input_file).mod["rna"].copy() input_modality = set_var_index(input_modality, None) - common_genes = cross_check_genes(input_modality.var.index, reference_modality.var.index) + common_genes = cross_check_genes( + input_modality.var.index, reference_modality.var.index + ) reference_modality = reference_modality[:, common_genes] labels = reference_modality.obs["cell_ontology_class"].to_numpy() @@ -50,13 +51,20 @@ def dummy_model(tmp_path): def test_simple_execution(run_component, random_h5mu_path): output_file = 
random_h5mu_path() - run_component([ - "--input", input_file, - "--reference", reference_file, - "--reference_obs_target", "cell_ontology_class", - "--reference_var_gene_names", "ensemblid", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" @@ -65,28 +73,46 @@ def test_simple_execution(run_component, random_h5mu_path): assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert list(output_mudata.mod["rna"].obs.keys()) == ['random_forest_pred','random_forest_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "random_forest_pred", + "random_forest_probability", + ] obs_values = output_mudata.mod["rna"].obs["random_forest_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_custom_out_obs_model_params(run_component, random_h5mu_path): output_file = random_h5mu_path() - run_component([ - "--input", input_file, - "--reference", reference_file, - "--reference_obs_target", "cell_ontology_class", - "--reference_var_gene_names", "ensemblid", - "--output_obs_predictions", "dummy_pred", - "--output_obs_probability", "dummy_probability", - "--n_estimators", "10", - "--criterion", "entropy", - "--max_depth", "5", - "--class_weight", "balanced", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output_obs_predictions", + "dummy_pred", + "--output_obs_probability", + "dummy_probability", + "--n_estimators", + "10", + "--criterion", + "entropy", + "--max_depth", + "5", + "--class_weight", + "balanced", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" @@ -95,85 +121,120 @@ def test_custom_out_obs_model_params(run_component, random_h5mu_path): assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert list(output_mudata.mod["rna"].obs.keys()) == ['dummy_pred', 'dummy_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "dummy_pred", + "dummy_probability", + ] obs_values = output_mudata.mod["rna"].obs["dummy_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_with_model(run_component, random_h5mu_path, dummy_model): output_file = random_h5mu_path() - run_component([ - "--input", input_file, - "--model", dummy_model, - "--output", output_file, - "--reference_obs_target", "cell_ontology_class" - ]) + run_component( + [ + "--input", + input_file, + "--model", + dummy_model, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class", + ] + ) assert os.path.exists(output_file), "Output file does not exist" input_mudata = mu.read_h5mu(input_file) output_mudata = mu.read_h5mu(output_file) - assert_annotation_objects_equal(input_mudata.mod["prot"], - output_mudata.mod["prot"]) + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert 
list(output_mudata.mod["rna"].obs.keys()) == ['random_forest_pred', 'random_forest_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "random_forest_pred", + "random_forest_probability", + ] obs_values = output_mudata.mod["rna"].obs["random_forest_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_no_model_no_reference_error(run_component, random_h5mu_path): output_file = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file, - "--output", output_file, - "--reference_obs_target", "cell_ontology_class" - "--reference_var_gene_names", "ensemblid" - ]) + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class" "--reference_var_gene_names", + "ensemblid", + ] + ) assert re.search( r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) def test_model_and_reference_error(run_component, random_h5mu_path, dummy_model): output_file = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file, - "--output", output_file, - "--reference", reference_file, - "--reference_obs_target", "cell_ontology_class", - "--reference_var_gene_names", "ensemblid", - "--model", dummy_model, - ]) + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--model", + dummy_model, + ] + ) assert re.search( r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) def test_invalid_max_features(run_component, random_h5mu_path): output_file = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file, - "--output", output_file, - "--reference_obs_target", "cell_ontology_class", - "--reference_var_gene_names", "ensemblid", - "--max_features", "invalid_value" - ]) + run_component( + [ + "--input", + input_file, + "--output", + output_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--max_features", + "invalid_value", + ] + ) assert re.search( r"Invaldid value invalid_value for --max_features: must either be an integer or one of 'sqrt', 'log2' or 'all'", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/annotate/scanvi/script.py b/src/annotate/scanvi/script.py index 4efd9f3d655..b82ca85cef7 100644 --- a/src/annotate/scanvi/script.py +++ b/src/annotate/scanvi/script.py @@ -32,7 +32,7 @@ "query_lr_patience": 25, "query_lr_factor": 0.5, "query_early_stopping": True, - "query_early_stopping_patience": 50 + "query_early_stopping_patience": 50, } meta = {"resources_dir": "src/annotate/utils"} ## VIASH END @@ -41,10 +41,17 @@ from setup_logger import setup_logger from cross_check_genes import cross_check_genes from set_var_index import set_var_index + logger = setup_logger() -if 
(not par["scvi_reference_model"]) and not (par["scanvi_reference_model"]) or (par["scvi_reference_model"] and par["scanvi_reference_model"]): - raise ValueError("Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.") +if ( + (not par["scvi_reference_model"]) + and not (par["scanvi_reference_model"]) + or (par["scvi_reference_model"] and par["scanvi_reference_model"]) +): + raise ValueError( + "Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both." + ) def main(): @@ -53,86 +60,100 @@ def main(): input_mdata = mu.read_h5mu(par["input"]) input_adata = input_mdata.mod[par["modality"]] input_modality = input_adata.copy() - # scANVI requires query and reference gene names to be equivalent + # scANVI requires query and reference gene names to be equivalent input_modality = set_var_index(input_modality, par["var_input_gene_names"]) if par["scanvi_reference_model"]: - - logger.info(f"Loading the pretrained scANVI model from {par['scanvi_reference_model']} and updating it with the query data {par['input']}") + logger.info( + f"Loading the pretrained scANVI model from {par['scanvi_reference_model']} and updating it with the query data {par['input']}" + ) scanvi_query = scvi.model.SCANVI.load_query_data( input_modality, par["scanvi_reference_model"], freeze_classifier=True, - inplace_subset_query_vars=True - ) + inplace_subset_query_vars=True, + ) elif par["scvi_reference_model"]: - logger.info("Reading in the reference model and associated reference data") scvi_reference_model = scvi.model.SCVI.load(par["scvi_reference_model"]) reference = scvi_reference_model.adata logger.info("Alligning genes in reference and query dataset") - # scANVI requires query and reference gene names to be equivalent + # scANVI requires query and reference gene names to be equivalent reference = set_var_index(reference) # Subset query dataset based on genes present in reference - common_ens_ids = cross_check_genes(input_modality.var.index, reference.var.index, min_gene_overlap=par["input_reference_gene_overlap"]) + common_ens_ids = cross_check_genes( + input_modality.var.index, + reference.var.index, + min_gene_overlap=par["input_reference_gene_overlap"], + ) input_modality = input_modality[:, common_ens_ids] logger.info("Instantiating scANVI model from the scVI model") scanvi_ref = scvi.model.SCANVI.from_scvi_model( scvi_reference_model, unlabeled_category=par["unknown_celltype"], - labels_key=scvi_reference_model.adata_manager._registry["setup_args"]["labels_key"], - ) + labels_key=scvi_reference_model.adata_manager._registry["setup_args"][ + "labels_key" + ], + ) reference_plan_kwargs = { "lr": par["reference_learning_rate"], - "reduce_lr_on_plateau": par['reference_reduce_lr_on_plateau'], - "lr_patience": par['reference_lr_patience'], - "lr_factor": par['reference_lr_factor'] - } + "reduce_lr_on_plateau": par["reference_reduce_lr_on_plateau"], + "lr_patience": par["reference_lr_patience"], + "lr_factor": par["reference_lr_factor"], + } logger.info("Training scANVI model on reference data with celltype labels") scanvi_ref.train( train_size=par["reference_train_size"], - max_epochs=par['reference_max_epochs'], - early_stopping=par['reference_early_stopping'], - early_stopping_patience=par['reference_early_stopping_patience'], + max_epochs=par["reference_max_epochs"], + early_stopping=par["reference_early_stopping"], + early_stopping_patience=par["reference_early_stopping_patience"], plan_kwargs=reference_plan_kwargs, 
check_val_every_n_epoch=1, - accelerator="auto" + accelerator="auto", ) logger.info(f"Updating scANVI model with query data {par['input']}") - scvi.model.SCANVI.prepare_query_anndata(input_modality, scanvi_ref, inplace=True) + scvi.model.SCANVI.prepare_query_anndata( + input_modality, scanvi_ref, inplace=True + ) scanvi_query = scvi.model.SCANVI.load_query_data(input_modality, scanvi_ref) logger.info("Training scANVI model with query data") query_plan_kwargs = { "lr": par["query_learning_rate"], - "reduce_lr_on_plateau": par['query_reduce_lr_on_plateau'], - "lr_patience": par['query_lr_patience'], - "lr_factor": par['query_lr_factor'] - } + "reduce_lr_on_plateau": par["query_reduce_lr_on_plateau"], + "lr_patience": par["query_lr_patience"], + "lr_factor": par["query_lr_factor"], + } scanvi_query.train( train_size=par["query_train_size"], - max_epochs=par['query_max_epochs'], - early_stopping=par['query_early_stopping'], - early_stopping_patience=par['query_early_stopping_patience'], + max_epochs=par["query_max_epochs"], + early_stopping=par["query_early_stopping"], + early_stopping_patience=par["query_early_stopping_patience"], plan_kwargs=query_plan_kwargs, check_val_every_n_epoch=1, - accelerator="auto" + accelerator="auto", ) logger.info("Adding latent representation to query data") - input_adata.obsm[par["output_obsm_scanvi_embedding"]] = scanvi_query.get_latent_representation() + input_adata.obsm[par["output_obsm_scanvi_embedding"]] = ( + scanvi_query.get_latent_representation() + ) logger.info("Running predictions on query data") - input_adata.obs[par["output_obs_predictions"]] = scanvi_query.predict(input_modality) - input_adata.obs[par["output_obs_probability"]] = np.max(scanvi_query.predict(input_modality, soft=True), axis=1) + input_adata.obs[par["output_obs_predictions"]] = scanvi_query.predict( + input_modality + ) + input_adata.obs[par["output_obs_probability"]] = np.max( + scanvi_query.predict(input_modality, soft=True), axis=1 + ) logger.info("Saving output and model") input_mdata.write_h5mu(par["output"], compression=par["output_compression"]) @@ -141,5 +162,5 @@ def main(): scanvi_query.save(par["output_model"], overwrite=True) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/annotate/scanvi/test.py b/src/annotate/scanvi/test.py index 33d38767f54..7501b76ff81 100644 --- a/src/annotate/scanvi/test.py +++ b/src/annotate/scanvi/test.py @@ -6,11 +6,9 @@ import mudata as mu from openpipelinetestutils.asserters import assert_annotation_objects_equal import scvi -import os + ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" @@ -26,18 +24,24 @@ def wrapper(input_file, reference_file): reference_modality = reference_data.mod["rna"] reference_data.var["gene_symbol"] = list(reference_data.var.index) - reference_data.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_data.var["ensemblid"]] + reference_data.var.index = [ + re.sub("\\.[0-9]+$", "", s) for s in reference_data.var["ensemblid"] + ] reference_modality.var["gene_symbol"] = list(reference_modality.var.index) - reference_modality.var.index = [re.sub("\\.[0-9]+$", "", s) for s in reference_modality.var["ensemblid"]] + reference_modality.var.index = [ + re.sub("\\.[0-9]+$", "", s) for s in reference_modality.var["ensemblid"] + ] - common_ens_ids = list(set(reference_modality.var.index).intersection(set(input_modality.var.index))) + 
common_ens_ids = list( + set(reference_modality.var.index).intersection( + set(input_modality.var.index) + ) + ) reference = reference_modality[:, common_ens_ids].copy() query = input_modality[:, common_ens_ids].copy() - scvi.model.SCVI.setup_anndata(reference, - labels_key="cell_ontology_class" - ) + scvi.model.SCVI.setup_anndata(reference, labels_key="cell_ontology_class") scvi_model = scvi.model.SCVI( reference, @@ -46,7 +50,7 @@ def wrapper(input_file, reference_file): encode_covariates=True, dropout_rate=0.2, n_layers=1, - ) + ) scvi_model.train(max_epochs=10) input_data.mod["rna"] = query @@ -61,6 +65,7 @@ def wrapper(input_file, reference_file): scvi_model.save(scvi_model_file, save_anndata=True, overwrite=True) return scvi_model_file, input_data_file + return wrapper @@ -74,13 +79,14 @@ def scanvi_wrapper(): scvi_model, unlabeled_category="Unkown", labels_key="cell_ontology_class", - ) + ) scanvi_model.train(max_epochs=10) scanvi_model_file = tmp_path scanvi_model.save(scanvi_model_file, save_anndata=True, overwrite=True) return scanvi_model_file, input_data_file + return scanvi_wrapper @@ -88,58 +94,99 @@ def test_simple_execution(run_component, random_h5mu_path, create_scvi_model): scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file) output_file = random_h5mu_path() - run_component([ - "--input", input_file_scvi, - "--scvi_reference_model", scvi_model_file, - "--reference_max_epochs", "10", - "--query_max_epochs", "10", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file_scvi, + "--scvi_reference_model", + scvi_model_file, + "--reference_max_epochs", + "10", + "--query_max_epochs", + "10", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" input_mudata = mu.read_h5mu(input_file_scvi) output_mudata = mu.read_h5mu(output_file) - assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed" - assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed" - assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added" + assert ( + input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs + ), "Number of observations changed" + assert ( + input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars + ), "Number of variables changed" + assert ( + "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys() + ), "Latent representation not added" assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added" - assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added" + assert ( + "scanvi_probability" in output_mudata.mod["rna"].obs.keys() + ), "Probabilities not added" - assert_annotation_objects_equal(input_mudata.mod["prot"], - output_mudata.mod["prot"]) + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) -def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, tmp_path): +def test_multiple_arguments( + run_component, random_h5mu_path, create_scvi_model, tmp_path +): scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file) output_file = random_h5mu_path() - run_component([ - "--input", input_file_scvi, - "--scvi_reference_model", scvi_model_file, - "--output", output_file, - "--reference_max_epochs", "10", - "--reference_reduce_lr_on_plateau", "True", - "--reference_lr_patience", "5", - "--reference_lr_factor", 
"0.5", - "--reference_train_size", "0.8", - "--reference_early_stopping", "True", - "--reference_early_stopping_patience", "5", - "--reference_early_stopping_min_delta", "0.01", - "--query_max_epochs", "10", - "--query_reduce_lr_on_plateau", "True", - "--query_lr_patience", "5", - "--query_lr_factor", "0.5", - "--query_train_size", "0.8", - "--query_early_stopping", "True", - "--query_early_stopping_patience", "5", - "--query_early_stopping_min_delta", "0.01", - "--output_obs_predictions", "scanvi_pred", - "--output_obs_probabilities", "scanvi_probabilitity", - "--output_compression", "gzip", - "--output_model", tmp_path - ]) + run_component( + [ + "--input", + input_file_scvi, + "--scvi_reference_model", + scvi_model_file, + "--output", + output_file, + "--reference_max_epochs", + "10", + "--reference_reduce_lr_on_plateau", + "True", + "--reference_lr_patience", + "5", + "--reference_lr_factor", + "0.5", + "--reference_train_size", + "0.8", + "--reference_early_stopping", + "True", + "--reference_early_stopping_patience", + "5", + "--reference_early_stopping_min_delta", + "0.01", + "--query_max_epochs", + "10", + "--query_reduce_lr_on_plateau", + "True", + "--query_lr_patience", + "5", + "--query_lr_factor", + "0.5", + "--query_train_size", + "0.8", + "--query_early_stopping", + "True", + "--query_early_stopping_patience", + "5", + "--query_early_stopping_min_delta", + "0.01", + "--output_obs_predictions", + "scanvi_pred", + "--output_obs_probabilities", + "scanvi_probabilitity", + "--output_compression", + "gzip", + "--output_model", + tmp_path, + ] + ) assert os.path.exists(output_file), "Output file does not exist" assert os.path.exists(tmp_path / "model.pt"), "Model file does not exist" @@ -147,64 +194,97 @@ def test_multiple_arguments(run_component, random_h5mu_path, create_scvi_model, input_mudata = mu.read_h5mu(input_file_scvi) output_mudata = mu.read_h5mu(output_file) - assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed" - assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed" - assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added" + assert ( + input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs + ), "Number of observations changed" + assert ( + input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars + ), "Number of variables changed" + assert ( + "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys() + ), "Latent representation not added" assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added" - assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added" + assert ( + "scanvi_probability" in output_mudata.mod["rna"].obs.keys() + ), "Probabilities not added" - assert_annotation_objects_equal(input_mudata.mod["prot"], - output_mudata.mod["prot"]) + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) def test_pretrained_scanvi(run_component, random_h5mu_path, create_scanvi_model): scanvi_model_file, input_file_scanvi = create_scanvi_model() output_file = random_h5mu_path() - run_component([ - "--input", input_file_scanvi, - "--scanvi_reference_model", scanvi_model_file, - "--reference_obs_label", "cell_ontology_class", - "--reference_max_epochs", "10", - "--query_max_epochs", "10", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file_scanvi, + "--scanvi_reference_model", + scanvi_model_file, 
+ "--reference_obs_label", + "cell_ontology_class", + "--reference_max_epochs", + "10", + "--query_max_epochs", + "10", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" input_mudata = mu.read_h5mu(input_file_scanvi) output_mudata = mu.read_h5mu(output_file) - assert input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs, f"Number of observations changed" - assert input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars, f"Number of variables changed" - assert "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys(), "Latent representation not added" + assert ( + input_mudata.mod["rna"].n_obs == output_mudata.mod["rna"].n_obs + ), "Number of observations changed" + assert ( + input_mudata.mod["rna"].n_vars == output_mudata.mod["rna"].n_vars + ), "Number of variables changed" + assert ( + "scanvi_embedding" in output_mudata.mod["rna"].obsm.keys() + ), "Latent representation not added" assert "scanvi_pred" in output_mudata.mod["rna"].obs.keys(), "Predictions not added" - assert "scanvi_probability" in output_mudata.mod["rna"].obs.keys(), "Probabilities not added" + assert ( + "scanvi_probability" in output_mudata.mod["rna"].obs.keys() + ), "Probabilities not added" - assert_annotation_objects_equal(input_mudata.mod["prot"], - output_mudata.mod["prot"]) + assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) -def test_raises(run_component, random_h5mu_path, create_scvi_model, create_scanvi_model): +def test_raises( + run_component, random_h5mu_path, create_scvi_model, create_scanvi_model +): scvi_model_file, input_file_scvi = create_scvi_model(input_file, reference_file) scanvi_model_file, input_file_scanvi = create_scanvi_model() output_file = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file_scanvi, - "--scanvi_reference_model", scanvi_model_file, - "--scvi_reference_model", scvi_model_file, - "--reference_obs_label", "cell_ontology_class", - "--reference_max_epochs", "10", - "--query_max_epochs", "10", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file_scanvi, + "--scanvi_reference_model", + scanvi_model_file, + "--scvi_reference_model", + scvi_model_file, + "--reference_obs_label", + "cell_ontology_class", + "--reference_max_epochs", + "10", + "--query_max_epochs", + "10", + "--output", + output_file, + ] + ) assert re.search( r"ValueError: Make sure to provide either an '--scvi_reference_model' or a '--scanvi_reference_model', but not both.", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/annotate/svm_annotation/script.py b/src/annotate/svm_annotation/script.py index 8a8d078df9a..ab4f8f69799 100644 --- a/src/annotate/svm_annotation/script.py +++ b/src/annotate/svm_annotation/script.py @@ -24,10 +24,9 @@ "output_compression": "gzip", "input_var_gene_names": None, "reference_var_gene_names": "ensemblid", - "reference_layer": None, "output_obs_prediction": "svm_pred", "output_obs_probability": "svm_probability", - "input_reference_gene_overlap": 100 + "input_reference_gene_overlap": 100, } meta = {"resources_dir": "src/utils"} ## VIASH END @@ -40,10 +39,14 @@ logger = setup_logger() -def main(): - if (not par["model"] and not par["reference"]) or (par["model"] and par["reference"]): - raise ValueError("Make sure to provide either 'model' or 'reference', but 
not both.") +def main(): + if (not par["model"] and not par["reference"]) or ( + par["model"] and par["reference"] + ): + raise ValueError( + "Make sure to provide either 'model' or 'reference', but not both." + ) logger.info("Reading input data") input_mudata = mu.read_h5mu(par["input"]) input_adata = input_mudata.mod[par["modality"]] @@ -54,42 +57,72 @@ def main(): logger.info("Loading a pre-trained model") model = pickle.load(open(par["model"], "rb")) if hasattr(model, "_feature_names_in"): - common_genes = cross_check_genes(input_modality.var.index, model._feature_names_in, par["input_reference_gene_overlap"]) + common_genes = cross_check_genes( + input_modality.var.index, + model._feature_names_in, + par["input_reference_gene_overlap"], + ) if not len(common_genes) == len(model._feature_names_in): raise ValueError("Input dataset does not contain all model features.") input_modality = input_modality[:, common_genes] - input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) else: - logger.warning("Model does not have feature names saved. Could not check overlap of model's features with query genes.") + logger.warning( + "Model does not have feature names saved. Could not check overlap of model's features with query genes." + ) elif par["reference"]: logger.info("Reading reference data") reference_mudata = mu.read_h5mu(par["reference"]) reference_modality = reference_mudata.mod[par["modality"]].copy() - reference_modality = set_var_index(reference_modality, par["reference_var_gene_names"]) + reference_modality = set_var_index( + reference_modality, par["reference_var_gene_names"] + ) # subset to HVG if required if par["reference_var_input"]: - reference_modality = subset_vars(reference_modality, par["reference_var_input"]) + reference_modality = subset_vars( + reference_modality, par["reference_var_input"] + ) # Query and input require the exact same features - common_genes = cross_check_genes(input_modality.var.index, reference_modality.var.index, par["input_reference_gene_overlap"]) + common_genes = cross_check_genes( + input_modality.var.index, + reference_modality.var.index, + par["input_reference_gene_overlap"], + ) reference_modality = reference_modality[:, common_genes] input_modality = input_modality[:, common_genes] - reference_matrix = reference_modality.layers[par["reference_layer"]] if par["reference_layer"] else reference_modality.X - input_matrix = input_modality.layers[par["input_layer"]] if par["input_layer"] else input_modality.X + reference_matrix = ( + reference_modality.layers[par["reference_layer"]] + if par["reference_layer"] + else reference_modality.X + ) + input_matrix = ( + input_modality.layers[par["input_layer"]] + if par["input_layer"] + else input_modality.X + ) logger.info("Training a model...") labels = reference_modality.obs[par["reference_obs_target"]].to_numpy() - model = CalibratedClassifierCV(svm.LinearSVC( - C=par["c_reg"], - max_iter=par["max_iter"], - class_weight=par["class_weight"] if not par["class_weight"] == "uniform" else None, - dual="auto", - )) + model = CalibratedClassifierCV( + svm.LinearSVC( + C=par["c_reg"], + max_iter=par["max_iter"], + class_weight=par["class_weight"] + if not par["class_weight"] == "uniform" + else None, + dual="auto", + ) + ) model.fit(reference_matrix, labels) model._feature_names_in = reference_modality.var.index diff --git 
a/src/annotate/svm_annotation/test.py b/src/annotate/svm_annotation/test.py index 5b717fe867d..4c766d2424c 100644 --- a/src/annotate/svm_annotation/test.py +++ b/src/annotate/svm_annotation/test.py @@ -5,15 +5,12 @@ import re import mudata as mu from openpipelinetestutils.asserters import assert_annotation_objects_equal -import os from sklearn import svm from sklearn.calibration import CalibratedClassifierCV import pickle ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} sys.path.append("src/utils") ## VIASH END @@ -33,14 +30,18 @@ def dummy_model(tmp_path): input_modality = mu.read_h5mu(input_file).mod["rna"].copy() input_modality = set_var_index(input_modality, None) - common_genes = cross_check_genes(input_modality.var.index, reference_modality.var.index) + common_genes = cross_check_genes( + input_modality.var.index, reference_modality.var.index + ) reference_modality = reference_modality[:, common_genes] labels = reference_modality.obs["cell_ontology_class"].to_numpy() - model = CalibratedClassifierCV(svm.LinearSVC( - max_iter=10, - dual="auto", - )) + model = CalibratedClassifierCV( + svm.LinearSVC( + max_iter=10, + dual="auto", + ) + ) model.fit(reference_modality.X, labels) model._feature_names_in = reference_modality.var.index @@ -54,13 +55,20 @@ def dummy_model(tmp_path): def test_simple_execution(run_component, random_h5mu_path): output_file = random_h5mu_path() - run_component([ - "--input", input_file, - "--reference", reference_file, - "--reference_obs_target", "cell_ontology_class", - "--reference_var_gene_names", "ensemblid", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_obs_target", + "cell_ontology_class", + "--reference_var_gene_names", + "ensemblid", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" @@ -69,26 +77,39 @@ def test_simple_execution(run_component, random_h5mu_path): assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert list(output_mudata.mod["rna"].obs.keys()) == ['svm_pred', 'svm_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"] obs_values = output_mudata.mod["rna"].obs["svm_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_custom_out_obs_model_params(run_component, random_h5mu_path): output_file = random_h5mu_path() - run_component([ - "--input", input_file, - "--reference", reference_file, - "--reference_var_gene_names", "ensemblid", - "--reference_obs_target", "cell_ontology_class", - "--output_obs_prediction", "dummy_pred", - "--output_obs_probability", "dummy_probability", - "--max_iter", "1000", - "--c_reg", "0.1", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--reference", + reference_file, + "--reference_var_gene_names", + "ensemblid", + "--reference_obs_target", + "cell_ontology_class", + "--output_obs_prediction", + "dummy_pred", + "--output_obs_probability", + "dummy_probability", + "--max_iter", + "1000", + "--c_reg", + "0.1", + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" @@ -97,21 +118,32 @@ def test_custom_out_obs_model_params(run_component, random_h5mu_path): 
assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert list(output_mudata.mod["rna"].obs.keys()) == ['dummy_pred', 'dummy_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == [ + "dummy_pred", + "dummy_probability", + ] obs_values = output_mudata.mod["rna"].obs["dummy_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_with_model(run_component, random_h5mu_path, dummy_model): output_file = random_h5mu_path() - run_component([ - "--input", input_file, - "--reference_obs_target", "cell_ontology_class", - "--model", dummy_model, - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--reference_obs_target", + "cell_ontology_class", + "--model", + dummy_model, + "--output", + output_file, + ] + ) assert os.path.exists(output_file), "Output file does not exist" @@ -120,27 +152,33 @@ def test_with_model(run_component, random_h5mu_path, dummy_model): input_mudata = mu.read_h5mu(input_file) output_mudata = mu.read_h5mu(output_file) assert_annotation_objects_equal(input_mudata.mod["prot"], output_mudata.mod["prot"]) - assert list(output_mudata.mod["rna"].obs.keys()) == ['svm_pred', - 'svm_probability'] + assert list(output_mudata.mod["rna"].obs.keys()) == ["svm_pred", "svm_probability"] obs_values = output_mudata.mod["rna"].obs["svm_probability"] - assert all(0 <= value <= 1 for value in obs_values), "probabilities outside the range [0, 1]" + assert all( + 0 <= value <= 1 for value in obs_values + ), "probabilities outside the range [0, 1]" def test_no_model_no_reference_error(run_component, random_h5mu_path): output_file = random_h5mu_path() with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file, - "--reference_obs_target", "cell_ontology_class", - "--output", output_file, - ]) + run_component( + [ + "--input", + input_file, + "--reference_obs_target", + "cell_ontology_class", + "--output", + output_file, + ] + ) assert re.search( r"ValueError: Make sure to provide either 'model' or 'reference', but not both.", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/base/openpipelinetestutils/asserters.py b/src/base/openpipelinetestutils/asserters.py index 17255b69991..d87edc35b36 100644 --- a/src/base/openpipelinetestutils/asserters.py +++ b/src/base/openpipelinetestutils/asserters.py @@ -13,25 +13,32 @@ def _read_if_needed(anndata_mudata_path_or_obj): if isinstance(anndata_mudata_path_or_obj, (str, Path)): - return mudata.read(str(anndata_mudata_path_or_obj)) # TODO: remove when mudata fixes PAth bug + return mudata.read( + str(anndata_mudata_path_or_obj) + ) # TODO: remove when mudata fixes Path bug if isinstance(anndata_mudata_path_or_obj, (mudata.MuData, anndata.AnnData)): return anndata_mudata_path_or_obj.copy() - raise AssertionError("Expected 'Path', 'str' to MuData/AnnData " - "file or MuData/AnnData object.") + raise AssertionError( + "Expected 'Path', 'str' to MuData/AnnData " "file or MuData/AnnData object." 
+ ) + def _assert_same_annotation_object_class(left, right): - assert type(left) == type(right), (f"Two objects are not of the same class:" - f"\n[Left]:{type(left)}\n[right]:{type(right)}") - + assert type(left) is type(right), ( + f"Two objects are not of the same class:" + f"\n[Left]:{type(left)}\n[right]:{type(right)}" + ) + + def _promote_dtypes(left, right): # Create new DataFrames to avoid modifying the original ones left_aligned = left.copy() right_aligned = right.copy() - + for column in left.columns: l_dtype = left[column].dtype r_dtype = right[column].dtype - + if l_dtype == r_dtype: # No need to modify dtypes that are already the same continue @@ -42,8 +49,8 @@ def _promote_dtypes(left, right): if is_extension and not pd.api.types.is_extension_array_dtype(r_dtype): continue numpy_dtype_l = l_dtype.type if is_extension else l_dtype - numpy_dtype_r = r_dtype.type if is_extension else r_dtype - # At this point we should have only integer or float dtypes + numpy_dtype_r = r_dtype.type if is_extension else r_dtype + # At this point we should have only integer or float dtypes common_dtype = np.promote_types(numpy_dtype_l, numpy_dtype_r) if is_extension: left_aligned[column] = pd.array(left[column], dtype=common_dtype) @@ -51,101 +58,148 @@ def _promote_dtypes(left, right): else: left_aligned[column] = left[column].astype(common_dtype) right_aligned[column] = right[column].astype(common_dtype) - + return left_aligned, right_aligned def assert_mudata_modality_keys_equal(left, right): left_keys = set(left.mod.keys()) right_keys = set(right.mod.keys()) - if left_keys!= right_keys: - raise AssertionError("MuData modalities differ:" - f"\n[left]:{left_keys}\n[right]:{right_keys}") + if left_keys != right_keys: + raise AssertionError( + "MuData modalities differ:" f"\n[left]:{left_keys}\n[right]:{right_keys}" + ) + -def assert_shape_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike): +def assert_shape_equal( + left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike +): left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) if left.shape != right.shape: - raise AssertionError(f"{type(left).__name__} shapes differ:" - f"\n[left]:{left.shape}\n[right]:{right.shape}") + raise AssertionError( + f"{type(left).__name__} shapes differ:" + f"\n[left]:{left.shape}\n[right]:{right.shape}" + ) if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): + for mod_name, modality in left.mod.items(): assert_shape_equal(modality, right[mod_name]) - -def assert_obs_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, - *args, **kwargs): + +def assert_obs_names_equal( + left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, *args, **kwargs +): left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) pd.testing.assert_index_equal(left.obs_names, right.obs_names, *args, **kwargs) if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): + for mod_name, modality in left.mod.items(): assert_obs_names_equal(modality, right[mod_name]) -def assert_var_names_equal(left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, - *args, **kwargs): +def assert_var_names_equal( + left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, *args, **kwargs +): left, right = _read_if_needed(left), 
_read_if_needed(right) _assert_same_annotation_object_class(left, right) pd.testing.assert_index_equal(left.var_names, right.var_names, *args, **kwargs) if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): + for mod_name, modality in left.mod.items(): assert_var_names_equal(modality, right[mod_name]) -def _assert_frame_equal(left, right, sort=False, promote_precicion=False, *args, **kwargs): +def _assert_frame_equal( + left, right, sort=False, promote_precicion=False, *args, **kwargs +): if sort: left, right = left.sort_index(inplace=False), right.sort_index(inplace=False) - left, right = left.sort_index(axis=1, inplace=False), right.sort_index(axis=1, inplace=False) - + left, right = ( + left.sort_index(axis=1, inplace=False), + right.sort_index(axis=1, inplace=False), + ) + if promote_precicion: left, right = _promote_dtypes(left, right) assert_frame_equal(left, right, check_exact=False, atol=1e-3, *args, **kwargs) else: assert_frame_equal(left, right, *args, **kwargs) -def assert_annotation_frame_equal(annotation_attr: Literal["obs", "var"], - left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike, - sort=False, - promote_precicion=False, - *args, **kwargs): - if not annotation_attr in ("obs", "var"): + +def assert_annotation_frame_equal( + annotation_attr: Literal["obs", "var"], + left: AnnotationObjectOrPathLike, + right: AnnotationObjectOrPathLike, + sort=False, + promote_precicion=False, + *args, + **kwargs, +): + if annotation_attr not in ("obs", "var"): raise ValueError("annotation_attr should be 'obs', or 'var'") left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) - left_frame, right_frame = getattr(left, annotation_attr), getattr(right, annotation_attr) - _assert_frame_equal(left_frame, right_frame, sort=sort, promote_precicion=promote_precicion, *args, **kwargs) + left_frame, right_frame = ( + getattr(left, annotation_attr), + getattr(right, annotation_attr), + ) + _assert_frame_equal( + left_frame, + right_frame, + sort=sort, + promote_precicion=promote_precicion, + *args, + **kwargs, + ) if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): - assert_annotation_frame_equal(annotation_attr, modality, - right[mod_name], sort=sort, promote_precicion=promote_precicion, *args, **kwargs) + for mod_name, modality in left.mod.items(): + assert_annotation_frame_equal( + annotation_attr, + modality, + right[mod_name], + sort=sort, + promote_precicion=promote_precicion, + *args, + **kwargs, + ) + def _assert_layer_equal(left, right): if issparse(left): if not issparse(right): - raise AssertionError("Layers differ:\n[left]: sparse\n[right]: not sparse") + raise AssertionError("Layers differ:\n[left]: sparse\n[right]: not sparse") if left.getformat() != right.getformat(): - raise AssertionError("Layers format differ:" - f"\n[left]:{left.getformat()}\n[right]: {right.getformat()}") - assert np.all(left.indices == right.indices), "Layers differ: indices are not the same" - assert np.all(left.indptr == right.indptr), "Layers differ: index pointers are not the same" - np.testing.assert_allclose(left.data, right.data, rtol=1e-5, - err_msg="Layers data differs.", equal_nan=True) + raise AssertionError( + "Layers format differ:" + f"\n[left]:{left.getformat()}\n[right]: {right.getformat()}" + ) + assert np.all( + left.indices == right.indices + ), "Layers differ: indices are not the same" + 
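+    # hedged aside: asserting on .indices/.indptr directly assumes both matrices
+    # share the same canonical sparse layout; a looser, hypothetical alternative
+    # would be to compare densified values instead, e.g.
+    #   np.testing.assert_allclose(left.toarray(), right.toarray(), rtol=1e-5)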
assert np.all( + left.indptr == right.indptr + ), "Layers differ: index pointers are not the same" + np.testing.assert_allclose( + left.data, + right.data, + rtol=1e-5, + err_msg="Layers data differs.", + equal_nan=True, + ) else: if issparse(right): raise AssertionError("Layers differ:\n[left]: not sparse\n[right]: sparse") - np.testing.assert_allclose(left, right, - rtol=1e-5, - err_msg="Layers data differs.", - equal_nan=True) - - -def assert_layers_equal(left: AnnotationObjectOrPathLike, - right: AnnotationObjectOrPathLike): + np.testing.assert_allclose( + left, right, rtol=1e-5, err_msg="Layers data differs.", equal_nan=True + ) + + +def assert_layers_equal( + left: AnnotationObjectOrPathLike, right: AnnotationObjectOrPathLike +): left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) if left.raw is not None: @@ -156,8 +210,9 @@ def assert_layers_equal(left: AnnotationObjectOrPathLike, raise else: if right.raw: - raise AssertionError("Layer .raw differs: " - f"\n[left]:{left.raw}\n[right]:{right}") + raise AssertionError( + "Layer .raw differs: " f"\n[left]:{left.raw}\n[right]:{right.raw}" + ) if left.X is not None: try: _assert_layer_equal(left.X, right.X) @@ -165,9 +220,9 @@ def assert_layers_equal(left: AnnotationObjectOrPathLike, e.add_note("X is different.") raise if left.layers: - assert right.layers and (left.layers.keys() == right.layers.keys()), \ - "Avaiable layers differ:" \ - f"\n[left]:{left.layers}\n[right]{right.layers}" + assert right.layers and (left.layers.keys() == right.layers.keys()), ( + "Available layers differ:" f"\n[left]:{left.layers}\n[right]:{right.layers}" + ) for layer_name, layer in left.layers.items(): try: _assert_layer_equal(layer, right.layers[layer_name]) @@ -176,14 +231,14 @@ raise if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): + for mod_name, modality in left.mod.items(): assert_layers_equal(modality, right[mod_name]) - -def assert_multidimensional_annotation_equal(annotation_attr: Literal["obsm", "varm"], - left, right, sort=False): - if not annotation_attr in ("obsm", "varm"): +def assert_multidimensional_annotation_equal( + annotation_attr: Literal["obsm", "varm"], left, right, sort=False +): + if annotation_attr not in ("obsm", "varm"): raise ValueError("annotation_attr should be 'obsm', or 'varm'") left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) @@ -191,45 +246,61 @@ def assert_multidimensional_annotation_equal(annotation_attr: Literal["obsm", "v @singledispatch def _assert_multidimensional_value_equal(left, right, **kwargs): raise NotImplementedError("Unregistered type found while asserting") - + @_assert_multidimensional_value_equal.register def _(left: pd.DataFrame, right, **kwargs): _assert_frame_equal(left, right, **kwargs) - + @_assert_multidimensional_value_equal.register(np.ndarray) @_assert_multidimensional_value_equal.register(spmatrix) def _(left, right, **kwargs): # Cannot sort sparse and dense matrices so ignore sort param _assert_layer_equal(left, right) left_dict, right_dict = ( getattr(left, annotation_attr), getattr(right, annotation_attr), ) left_keys, right_keys = left_dict.keys(), right_dict.keys() - assert left_keys == right_keys, f"Keys of {annotation_attr} 
differ:\n[left]:{left_keys}\n[right]:{right_keys}" + assert ( + left_keys == right_keys + ), f"Keys of {annotation_attr} differ:\n[left]:{left_keys}\n[right]:{right_keys}" for left_key, left_value in left_dict.items(): try: - _assert_multidimensional_value_equal(left_value, right_dict[left_key], sort=sort) + _assert_multidimensional_value_equal( + left_value, right_dict[left_key], sort=sort + ) except AssertionError as e: e.add_note(f"Failing key: {left_key}") raise if isinstance(left, MuData): assert_mudata_modality_keys_equal(left, right) - for mod_name, modality in left.mod.items(): + for mod_name, modality in left.mod.items(): try: - assert_multidimensional_annotation_equal(annotation_attr ,modality, right[mod_name], sort=sort) + assert_multidimensional_annotation_equal( + annotation_attr, modality, right[mod_name], sort=sort + ) except AssertionError as e: e.add_note(f"Failing modality: {mod_name}") raise -def assert_annotation_objects_equal(left: AnnotationObjectOrPathLike, - right: AnnotationObjectOrPathLike, - check_data=True, - sort=True, - promote_precision=False): + +def assert_annotation_objects_equal( + left: AnnotationObjectOrPathLike, + right: AnnotationObjectOrPathLike, + check_data=True, + sort=True, + promote_precision=False, +): left, right = _read_if_needed(left), _read_if_needed(right) _assert_same_annotation_object_class(left, right) assert_shape_equal(left, right) - assert_annotation_frame_equal("obs", left, right, sort=sort, promote_precicion=promote_precision) - assert_annotation_frame_equal("var", left, right, sort=sort, promote_precicion=promote_precision) + assert_annotation_frame_equal( + "obs", left, right, sort=sort, promote_precicion=promote_precision + ) + assert_annotation_frame_equal( + "var", left, right, sort=sort, promote_precicion=promote_precision + ) for slot in ("varm", "obsm"): try: assert_multidimensional_annotation_equal(slot, left, right, sort=sort) @@ -237,4 +308,4 @@ def assert_annotation_objects_equal(left: AnnotationObjectOrPathLike, e.add_note(f"Failing multidimensional slot: {slot}") raise if check_data: - assert_layers_equal(left, right) \ No newline at end of file + assert_layers_equal(left, right) diff --git a/src/base/openpipelinetestutils/conftest.py b/src/base/openpipelinetestutils/conftest.py index ca3948eae0e..a14f8ded901 100644 --- a/src/base/openpipelinetestutils/conftest.py +++ b/src/base/openpipelinetestutils/conftest.py @@ -2,12 +2,13 @@ import pytest from pathlib import Path + def pytest_collect_file(file_path: Path, parent): - if (file_path.name == ".viash_script.sh"): + if file_path.name == ".viash_script.sh": # Allow file ending in .sh to be imported - importlib.machinery.SOURCE_SUFFIXES.append('.viash_script.sh') + importlib.machinery.SOURCE_SUFFIXES.append(".viash_script.sh") return pytest.Module.from_parent(parent, path=file_path) def pytest_collection_finish(session): - importlib.machinery.SOURCE_SUFFIXES.remove('.viash_script.sh') \ No newline at end of file + importlib.machinery.SOURCE_SUFFIXES.remove(".viash_script.sh") diff --git a/src/base/openpipelinetestutils/fixtures.py b/src/base/openpipelinetestutils/fixtures.py index 4069a1a78f0..e425ad0d813 100644 --- a/src/base/openpipelinetestutils/fixtures.py +++ b/src/base/openpipelinetestutils/fixtures.py @@ -4,60 +4,76 @@ import anndata as ad import mudata as md + @pytest.fixture def random_path(tmp_path): def wrapper(extension=None): extension = "" if not extension else f".{extension}" return tmp_path / f"{uuid4()}{extension}" - return wrapper + + return wrapper + 
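+# Usage sketch (hypothetical test, not part of this module): the fixtures here
+# return factories, so a single test can request several unique paths, e.g.
+#
+#   def test_example(random_path, random_h5mu_path):
+#       csv_path = random_path(extension="csv")  # tmp_path / "<uuid>.csv"
+#       h5mu_path = random_h5mu_path()           # tmp_path / "<uuid>.h5mu"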
@pytest.fixture def random_h5mu_path(random_path): def wrapper(): return random_path(extension="h5mu") + return wrapper + @pytest.fixture def write_mudata_to_file(random_h5mu_path): def wrapper(mudata_obj): output_path = random_h5mu_path() mudata_obj.write(output_path) return output_path + return wrapper + @pytest.fixture def small_anndata_1(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = ad.AnnData(df, obs=obs, var=var) return ad1 + @pytest.fixture def small_anndata_2(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var4", "var5", "var6"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var4", "var5", "var6"] + ) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) ad2 = ad.AnnData(df, obs=obs2, var=var2) return ad2 + @pytest.fixture def small_mudata(small_anndata_1, small_anndata_2): - return md.MuData({'mod1': small_anndata_1, 'mod2': small_anndata_2}) + return md.MuData({"mod1": small_anndata_1, "mod2": small_anndata_2}) + @pytest.fixture def small_mudata_path(small_mudata, write_mudata_to_file): return write_mudata_to_file(small_mudata) + @pytest.fixture def split_small_mudata_path(small_mudata_mod1_path, small_mudata_mod2_path): return small_mudata_mod1_path, small_mudata_mod2_path + @pytest.fixture def small_mudata_mod1_path(small_mudata, write_mudata_to_file): - return write_mudata_to_file(md.MuData({'mod1': small_mudata.mod['mod1']})) + return write_mudata_to_file(md.MuData({"mod1": small_mudata.mod["mod1"]})) + @pytest.fixture def small_mudata_mod2_path(small_mudata, write_mudata_to_file): - return write_mudata_to_file(md.MuData({'mod2': small_mudata.mod['mod2']})) - + return write_mudata_to_file(md.MuData({"mod2": small_mudata.mod["mod2"]})) diff --git a/src/base/openpipelinetestutils/typing.py b/src/base/openpipelinetestutils/typing.py index 75707364321..e8f0a16c363 100644 --- a/src/base/openpipelinetestutils/typing.py +++ b/src/base/openpipelinetestutils/typing.py @@ -4,4 +4,4 @@ from pathlib import Path AnnotationObject = Union[MuData, AnnData] -AnnotationObjectOrPathLike = Union[AnnotationObject, str, Path] \ No newline at end of file +AnnotationObjectOrPathLike = Union[AnnotationObject, str, Path] diff --git a/src/base/openpipelinetestutils/utils.py b/src/base/openpipelinetestutils/utils.py index ec3ecbe5096..4772e173db1 100644 --- a/src/base/openpipelinetestutils/utils.py +++ b/src/base/openpipelinetestutils/utils.py @@ -7,54 +7,73 @@ from itertools import product -def remove_annotation_column(annotation_object: AnnotationObject, - column_names: list[str] | str, - axis: Union[Literal["obs"], Literal["var"], 0, 1], - modality_name: str | None = None): +def remove_annotation_column( + annotation_object: AnnotationObject, + column_names: list[str] | str, + axis: Union[Literal["obs"], Literal["var"], 0, 1], + modality_name: str | None = None, +): if isinstance(annotation_object, AnnData) and modality_name is not None: raise ValueError("Cannot specify modality when object is of type AnnData.") if isinstance(column_names, str): - column_names = [str(column_names)] # str to make a copy - axis_strings = { - "var": "var", - 
"obs": "obs", - 0: "obs", - 1: "var" - } + column_names = [str(column_names)] # str to make a copy + axis_strings = {"var": "var", "obs": "obs", 0: "obs", 1: "var"} axis_string = axis_strings[axis] axis_getter = attrgetter(axis_string) - axis_setter = lambda obj, value: setattr(obj, axis_string, value) + + def axis_setter(obj, value): + setattr(obj, axis_string, value) + if not modality_name: - axis_setter(annotation_object, axis_getter(annotation_object).drop(column_names, - axis="columns", - inplace=False)) + axis_setter( + annotation_object, + axis_getter(annotation_object).drop( + column_names, axis="columns", inplace=False + ), + ) def _get_columns_in_all_modalities(annotation_object, axis_string: str): return reduce( lambda a, b: a.intersection(b), - [getattr(annotation_object.mod[mod], axis_string).columns - for mod in annotation_object.mod], + [ + getattr(annotation_object.mod[mod], axis_string).columns + for mod in annotation_object.mod + ], ).to_list() if isinstance(annotation_object, MuData): if not annotation_object.axis == 0: - raise ValueError("This function was designed for mudata objects with .axis=0") - modality_names = [modality_name] if modality_name else list(annotation_object.mod.keys()) - global_columns = _get_columns_in_all_modalities(annotation_object, axis_string) \ - if axis_string == "var" else [] - extra_cols_to_remove = [f"{mod_name}:{column_name}" for mod_name, column_name - in product(modality_names, column_names) - if column_name not in global_columns] - extra_cols_to_remove += [column_name for column_name in column_names - if column_name in global_columns] + raise ValueError( + "This function was designed for mudata objects with .axis=0" + ) + modality_names = ( + [modality_name] if modality_name else list(annotation_object.mod.keys()) + ) + global_columns = ( + _get_columns_in_all_modalities(annotation_object, axis_string) + if axis_string == "var" + else [] + ) + extra_cols_to_remove = [ + f"{mod_name}:{column_name}" + for mod_name, column_name in product(modality_names, column_names) + if column_name not in global_columns + ] + extra_cols_to_remove += [ + column_name for column_name in column_names if column_name in global_columns + ] if modality_name: - axis_setter(annotation_object, axis_getter(annotation_object).drop(extra_cols_to_remove, - axis="columns", - inplace=False)) + axis_setter( + annotation_object, + axis_getter(annotation_object).drop( + extra_cols_to_remove, axis="columns", inplace=False + ), + ) for mod_name in modality_names: modality = annotation_object.mod[mod_name] - new_modality = remove_annotation_column(modality, column_names, - axis=axis, modality_name=None) + new_modality = remove_annotation_column( + modality, column_names, axis=axis, modality_name=None + ) annotation_object.mod[mod_name] = new_modality return annotation_object diff --git a/src/cluster/leiden/script.py b/src/cluster/leiden/script.py index 90e35f7a5a1..03a39be5802 100644 --- a/src/cluster/leiden/script.py +++ b/src/cluster/leiden/script.py @@ -24,15 +24,28 @@ "modality": "rna", "output_format": "h5mu", "obsm_name": "leiden", - "resolution": [1, 0.25, 0.10, 0.05, 0.01, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9, 0.7, 0.3, 0.35, 0.95], + "resolution": [ + 1, + 0.25, + 0.10, + 0.05, + 0.01, + 0.2, + 0.4, + 0.5, + 0.6, + 0.8, + 0.9, + 0.7, + 0.3, + 0.35, + 0.95, + ], "obsp_connectivities": "connectivities", "uns_name": "leiden", - "output_compression": "gzip" -} -meta = { - "cpus": 8, - "resources_dir": '.' 
+ "output_compression": "gzip", } +meta = {"cpus": 8, "resources_dir": "."} ## VIASH END sys.path.append(meta["resources_dir"]) @@ -41,51 +54,66 @@ _shared_logger_name = "leiden" -class SharedNumpyMatrix(): - def __init__(self, shared_memory: shared_memory.SharedMemory, dtype: npt.DTypeLike, shape: tuple[int, int]) -> None: + +class SharedNumpyMatrix: + def __init__( + self, + shared_memory: shared_memory.SharedMemory, + dtype: npt.DTypeLike, + shape: tuple[int, int], + ) -> None: self._memory = shared_memory self._dtype = dtype self._shape = shape - + @classmethod - def from_numpy(cls, memory_manager: managers.SharedMemoryManager, array: npt.ArrayLike): + def from_numpy( + cls, memory_manager: managers.SharedMemoryManager, array: npt.ArrayLike + ): shm = memory_manager.SharedMemory(size=array.nbytes) - array_in_shared_memory = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf) + array_in_shared_memory = np.ndarray( + array.shape, dtype=array.dtype, buffer=shm.buf + ) # Copy the data into shared memory array_in_shared_memory[:] = array[:] return cls(shm, array.dtype, array.shape) def to_numpy(self): - return np.ndarray(self._shape, dtype=self._dtype, buffer=self._memory.buf) - + return np.ndarray(self._shape, dtype=self._dtype, buffer=self._memory.buf) + def close(self): self._memory.close() -class SharedCsrMatrix(): - def __init__(self, - data: SharedNumpyMatrix, - indices: SharedNumpyMatrix, - indptr: SharedNumpyMatrix, - shape: npt.DTypeLike): + +class SharedCsrMatrix: + def __init__( + self, + data: SharedNumpyMatrix, + indices: SharedNumpyMatrix, + indptr: SharedNumpyMatrix, + shape: npt.DTypeLike, + ): self._data = data self._indices = indices self._indptr = indptr self._shape = shape @classmethod - def from_csr_matrix(cls, memory_manager: managers.SharedMemoryManager, csr_matrix_obj: csr_matrix): + def from_csr_matrix( + cls, memory_manager: managers.SharedMemoryManager, csr_matrix_obj: csr_matrix + ): return cls( SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.data), SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indices), SharedNumpyMatrix.from_numpy(memory_manager, csr_matrix_obj.indptr), csr_matrix_obj.shape, ) - + def to_csr_matrix(self): return csr_matrix( (self._data.to_numpy(), self._indices.to_numpy(), self._indptr.to_numpy()), - shape=self._shape, - copy=False + shape=self._shape, + copy=False, ) def close(self): @@ -93,40 +121,47 @@ def close(self): self._indices.close() self._indptr.close() + def create_empty_anndata_with_connectivities(connectivities, obs_names): - empty_anndata = ad.AnnData(np.zeros((connectivities.shape[0], 1)), - obs=pd.DataFrame(index=list(obs_names))) - empty_anndata.obsp['connectivities'] = connectivities + empty_anndata = ad.AnnData( + np.zeros((connectivities.shape[0], 1)), obs=pd.DataFrame(index=list(obs_names)) + ) + empty_anndata.obsp["connectivities"] = connectivities return empty_anndata + def run_single_resolution(shared_csr_matrix, obs_names, resolution): logger = logging.getLogger(_shared_logger_name) - logger.info("Process with PID '%s' for resolution '%s' started", os.getpid(), resolution) + logger.info( + "Process with PID '%s' for resolution '%s' started", os.getpid(), resolution + ) try: connectivities = shared_csr_matrix.to_csr_matrix() adata = create_empty_anndata_with_connectivities(connectivities, obs_names) with warnings.catch_warnings(): # In the future, the default backend for leiden will be igraph instead of leidenalg. 
- warnings.simplefilter(action='ignore', category=FutureWarning) + warnings.simplefilter(action="ignore", category=FutureWarning) adata_out = sc.tl.leiden( adata, resolution=resolution, key_added=str(resolution), obsp="connectivities", - copy=True - ) + copy=True, + ) logger.info(f"Returning result for resolution {resolution}") return adata_out.obs[str(resolution)] finally: obs_names.shm.close() - shared_csr_matrix.close() + shared_csr_matrix.close() + def init_worker(parent_process_id, exit_event, log_queue, log_level): import os import threading import time + pid = os.getpid() - + logger = logging.getLogger(_shared_logger_name) logger.setLevel(log_level) @@ -134,12 +169,17 @@ def init_worker(parent_process_id, exit_event, log_queue, log_level): logger.addHandler(handler) logger.info("Initializing process %s", pid) + def exit_if_orphaned(): - logger.info("Starting orphanned process checker for process %s, parent process %s.", pid, parent_process_id) + logger.info( + "Starting orphaned process checker for process %s, parent process %s.", + pid, + parent_process_id, + ) while True: # Check if parent process is gone try: - # If sig is 0, then no signal is sent, but error checking is still performed; + # If sig is 0, then no signal is sent, but error checking is still performed; # this can be used to check for the existence of a process ID os.kill(parent_process_id, 0) except ProcessLookupError: @@ -151,16 +191,24 @@ def exit_if_orphaned(): try: exit_event_set = exit_event.wait(timeout=1) except BrokenPipeError: - logger.info("Checking for shutdown resulted in BrokenPipeError, " - "parent process is most likely gone. Shutting down %s", pid) - os.kill(pid, signal.SIGTERM) + logger.info( + "Checking for shutdown resulted in BrokenPipeError, " + "parent process is most likely gone. 
Shutting down %s", + pid, + ) + os.kill(pid, signal.SIGTERM) else: if exit_event_set: - logger.info("Exit event set, shutting down %s", pid) + logger.info("Exit event set, shutting down %s", pid) os.kill(pid, signal.SIGTERM) time.sleep(1) + threading.Thread(target=exit_if_orphaned, daemon=True).start() - logger.info("Initialization of process %s is complete, process is now waiting for work.", pid) + logger.info( + "Initialization of process %s is complete, process is now waiting for work.", + pid, + ) + def main(): with managers.SyncManager() as syncm: @@ -172,21 +220,22 @@ def main(): log_queue = syncm.Queue() log_listener = logging.handlers.QueueListener(log_queue, console_handler) log_listener.start() - + logger = logging.getLogger(_shared_logger_name) logger.setLevel(log_level) handler = logging.handlers.QueueHandler(log_queue) logger.addHandler(handler) logger.info("Reading %s.", par["input"]) - adata = mu.read_h5ad(par["input"], mod=par['modality'], backed='r') - logger.info("Processing modality '%s'.", par['modality']) + adata = mu.read_h5ad(par["input"], mod=par["modality"], backed="r") + logger.info("Processing modality '%s'.", par["modality"]) try: - connectivities = adata.obsp[par['obsp_connectivities']] + connectivities = adata.obsp[par["obsp_connectivities"]] except KeyError: - raise ValueError(f"Could not find .obsp key \"{par['obsp_connectivities']}\" " - "in modality {par['modality']}") - + raise ValueError( + f"Could not find .obsp key \"{par['obsp_connectivities']}\" " + "in modality {par['modality']}" + ) # An event that, when triggered, will kill the child processes that are still running exit_early_event = syncm.Event() @@ -199,15 +248,23 @@ def main(): shared_csr_matrix = SharedCsrMatrix.from_csr_matrix(smm, connectivities) results = {} - n_workers = meta['cpus'] - 2 if (meta['cpus'] and (meta['cpus'] - 2) > 0) else 1 + n_workers = ( + meta["cpus"] - 2 if (meta["cpus"] and (meta["cpus"] - 2) > 0) else 1 + ) logger.info(f"Requesting {n_workers} workers") - executor = ProcessPoolExecutor(max_workers=n_workers, - max_tasks_per_child=1, - mp_context=get_context('spawn'), - initializer=init_worker, - initargs=((os.getpid(), exit_early_event, log_queue, log_level))) - pending_futures = {executor.submit(run_single_resolution, shared_csr_matrix, obs_names, resolution): resolution - for resolution in par["resolution"]} + executor = ProcessPoolExecutor( + max_workers=n_workers, + max_tasks_per_child=1, + mp_context=get_context("spawn"), + initializer=init_worker, + initargs=((os.getpid(), exit_early_event, log_queue, log_level)), + ) + pending_futures = { + executor.submit( + run_single_resolution, shared_csr_matrix, obs_names, resolution + ): resolution + for resolution in par["resolution"] + } try: logger.info("All futures sheduled") for done_future in as_completed(pending_futures): @@ -242,13 +299,22 @@ def main(): adata.obsm[par["obsm_name"]] = pd.DataFrame(results) output_file = Path(par["output"]) - logger.info('Writing output to %s.', par['output']) - output_file_uncompressed = output_file.with_name(output_file.stem + "_uncompressed.h5mu") \ - if par["output_compression"] else output_file - shutil.copyfile(par['input'], output_file_uncompressed) - mu.write_h5ad(filename=output_file_uncompressed, mod=par['modality'], data=adata) + logger.info("Writing output to %s.", par["output"]) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if par["output_compression"] + else output_file + ) + shutil.copyfile(par["input"], 
output_file_uncompressed) + mu.write_h5ad( + filename=output_file_uncompressed, mod=par["modality"], data=adata + ) if par["output_compression"]: - compress_h5mu(output_file_uncompressed, output_file, compression=par["output_compression"]) + compress_h5mu( + output_file_uncompressed, + output_file, + compression=par["output_compression"], + ) output_file_uncompressed.unlink() logger.info("Finished.") log_listener.enqueue_sentinel() @@ -256,4 +322,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/cluster/leiden/test.py b/src/cluster/leiden/test.py index c31113b6743..2af84271070 100644 --- a/src/cluster/leiden/test.py +++ b/src/cluster/leiden/test.py @@ -1,33 +1,35 @@ -import subprocess -from os import path import mudata as mu import pytest import sys -import uuid ## VIASH START meta = { - 'name': 'foo', - 'resources_dir': 'resources_test/', - 'cpus': 2, - 'config': './src/cluster/leiden/config.vsh.yaml', - 'executable': './target/executable/cluster/leiden/leiden', + "name": "foo", + "resources_dir": "resources_test/", + "cpus": 2, + "config": "./src/cluster/leiden/config.vsh.yaml", + "executable": "./target/executable/cluster/leiden/leiden", } ## VIASH END + @pytest.fixture() def input_path(): return meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture() def input_data(input_path): return mu.read_h5mu(input_path) + @pytest.fixture() def mudata_custom_connectivities_key(input_data, random_h5mu_path): result = input_data.copy() - result.mod['rna'].obsp["custom_connectivities"] = result.mod['rna'].obsp['connectivities'].copy() - del result.mod['rna'].obsp['connectivities'] + result.mod["rna"].obsp["custom_connectivities"] = ( + result.mod["rna"].obsp["connectivities"].copy() + ) + del result.mod["rna"].obsp["connectivities"] output_path = random_h5mu_path() result.write(output_path) return output_path @@ -41,14 +43,21 @@ def mudata_custom_connectivities_key(input_data, random_h5mu_path): # return temp_file # return wrapper + @pytest.mark.parametrize("compression", ["gzip", ""]) -@pytest.mark.parametrize("output_key,expected_output_key", [("fooleiden", "fooleiden"), ("", "leiden")]) -def test_leiden(input_path, run_component, random_h5mu_path, compression, output_key, expected_output_key): +@pytest.mark.parametrize( + "output_key,expected_output_key", [("fooleiden", "fooleiden"), ("", "leiden")] +) +def test_leiden( + input_path, + run_component, + random_h5mu_path, + compression, + output_key, + expected_output_key, +): output_path = random_h5mu_path() - args = [ - "--input", input_path, - "--resolution", "1;0.25", - "--output", output_path] + args = ["--input", input_path, "--resolution", "1;0.25", "--output", output_path] if compression: args.extend(["--output_compression", compression]) if output_key: @@ -57,26 +66,48 @@ def test_leiden(input_path, run_component, random_h5mu_path, compression, output run_component(args) assert output_path.exists(), "No output was created." data = mu.read_h5mu(output_path) - assert expected_output_key in data.mod["rna"].obsm, f"Expected to find key '{expected_output_key}' in .obsm" + assert ( + expected_output_key in data.mod["rna"].obsm + ), f"Expected to find key '{expected_output_key}' in .obsm" # check whether leiden.custom.resolution was found - assert "1.0" in data.mod["rna"].obsm[expected_output_key].columns, 'Output should contain resolution 1.0.' - assert "0.25" in data.mod["rna"].obsm[expected_output_key].columns, 'Output should contain resolution 0.25.' 
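For context on what these assertions rely on: script.py collects one pandas Series of
cluster labels per resolution and stores them as the columns of a single DataFrame under
.obsm. A minimal sketch of that contract follows; the toy labels are illustrative only
and are not taken from the test data.

import pandas as pd

# one Series per requested resolution, keyed by the resolution rendered as a string
results = {
    "1.0": pd.Series(["0", "1", "0"], name="1.0"),
    "0.25": pd.Series(["0", "0", "0"], name="0.25"),
}

# main() effectively does: adata.obsm[par["obsm_name"]] = pd.DataFrame(results)
obsm_df = pd.DataFrame(results)

# which is exactly the shape the tests above check for
assert "1.0" in obsm_df.columns and "0.25" in obsm_df.columns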
- -def test_leiden_custom_connectivities_key(mudata_custom_connectivities_key, run_component, random_h5mu_path): - output_path = random_h5mu_path() - run_component([ - "--input", mudata_custom_connectivities_key, - "--obsm_name", "fooleiden", - "--resolution", "1;0.25", - "--output", output_path, - "--obsp_connectivities", "custom_connectivities", - "--output_compression", "gzip" - ]) + assert ( + "1.0" in data.mod["rna"].obsm[expected_output_key].columns + ), "Output should contain resolution 1.0." + assert ( + "0.25" in data.mod["rna"].obsm[expected_output_key].columns + ), "Output should contain resolution 0.25." + + +def test_leiden_custom_connectivities_key( + mudata_custom_connectivities_key, run_component, random_h5mu_path +): + output_path = random_h5mu_path() + run_component( + [ + "--input", + mudata_custom_connectivities_key, + "--obsm_name", + "fooleiden", + "--resolution", + "1;0.25", + "--output", + output_path, + "--obsp_connectivities", + "custom_connectivities", + "--output_compression", + "gzip", + ] + ) assert output_path.exists(), "No output was created." data = mu.read_h5mu(output_path) # check whether leiden.custom.resolution was found - assert "1.0" in data.mod["rna"].obsm["fooleiden"].columns, 'Output should contain resolution 1.0.' - assert "0.25" in data.mod["rna"].obsm["fooleiden"].columns, 'Output should contain resolution 0.25.' + assert ( + "1.0" in data.mod["rna"].obsm["fooleiden"].columns + ), "Output should contain resolution 1.0." + assert ( + "0.25" in data.mod["rna"].obsm["fooleiden"].columns + ), "Output should contain resolution 0.25." + -if __name__ == '__main__': - sys.exit(pytest.main([__file__, "-v"])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v"])) diff --git a/src/compression/compress_h5mu/run_test.py b/src/compression/compress_h5mu/run_test.py index ee4b6b8fd85..09f864130c9 100644 --- a/src/compression/compress_h5mu/run_test.py +++ b/src/compression/compress_h5mu/run_test.py @@ -1,4 +1,3 @@ - import sys import pytest import mudata as mu @@ -7,45 +6,59 @@ ## VIASH START meta = { - 'executable': './target/executable/compression/compress_h5mu/compress_h5mu', - 'resources_dir': 'resources_test/concat_test_data/', - 'config': 'src/compression/compress_h5mu/config.vsh.yaml' + "executable": "./target/executable/compression/compress_h5mu/compress_h5mu", + "resources_dir": "resources_test/concat_test_data/", + "config": "src/compression/compress_h5mu/config.vsh.yaml", } ## VIASH END -input_file = Path(f"{meta['resources_dir']}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu") +input_file = Path( + f"{meta['resources_dir']}/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" +) + def compare_anndata(first, second): for attr_name in ("obs", "var"): - pd.testing.assert_frame_equal(getattr(first, attr_name), - getattr(second, attr_name)) + pd.testing.assert_frame_equal( + getattr(first, attr_name), getattr(second, attr_name) + ) + @pytest.mark.parametrize("compression_type", ["gzip", "lzf"]) def test_compress_h5mu(run_component, tmp_path, compression_type): output_file = tmp_path / "output.h5mu" - run_component([ - "--input", str(input_file), - "--output", str(output_file), - "--compression", compression_type - ]) - + run_component( + [ + "--input", + str(input_file), + "--output", + str(output_file), + "--compression", + compression_type, + ] + ) + # check whether file exists assert output_file.is_file(), "Output file does not exist" - + # read output mudata 
output = mu.read_h5mu(output_file) uncompressed_h5mu = mu.read_h5mu(input_file) for attr_name in ("obs", "var"): - pd.testing.assert_frame_equal(getattr(output, attr_name), getattr(uncompressed_h5mu, attr_name)) + pd.testing.assert_frame_equal( + getattr(output, attr_name), getattr(uncompressed_h5mu, attr_name) + ) for mod_name in uncompressed_h5mu.mod: - assert mod_name in output.mod, f"{mod_name} found in uncompressed file, but not in compressed output file." + assert ( + mod_name in output.mod + ), f"{mod_name} found in uncompressed file, but not in compressed output file." mod_compressed = output.mod[mod_name] mod_uncompressed = uncompressed_h5mu.mod[mod_name] compare_anndata(mod_compressed, mod_uncompressed) assert output_file.stat().st_size < input_file.stat().st_size -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/compression/compress_h5mu/script.py b/src/compression/compress_h5mu/script.py index 99b9840ff87..7e6292d9e78 100644 --- a/src/compression/compress_h5mu/script.py +++ b/src/compression/compress_h5mu/script.py @@ -1,9 +1,10 @@ import sys + ### VIASH START par = { "input": "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu", "output": "test.h5mu", - "compression": "gzip" + "compression": "gzip", } meta = {} ### VIASH END @@ -12,4 +13,4 @@ from compress_h5mu import compress_h5mu if __name__ == "__main__": - compress_h5mu(par["input"], par["output"], compression=par["compression"]) \ No newline at end of file + compress_h5mu(par["input"], par["output"], compression=par["compression"]) diff --git a/src/convert/from_10xh5_to_h5mu/script.py b/src/convert/from_10xh5_to_h5mu/script.py index 83e9a174c24..d83b55f73b4 100755 --- a/src/convert/from_10xh5_to_h5mu/script.py +++ b/src/convert/from_10xh5_to_h5mu/script.py @@ -17,6 +17,7 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s.", par["input"]) @@ -24,37 +25,38 @@ # set the gene ids as var_names logger.info("Renaming var columns") -adata.var = adata.var\ - .rename_axis("gene_symbol")\ - .reset_index()\ - .set_index("gene_ids") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") # parse metrics summary file and store in .uns if par["input_metrics_summary"] and par["uns_metrics"]: - logger.info("Reading metrics summary file '%s'", par['input_metrics_summary']) + logger.info("Reading metrics summary file '%s'", par["input_metrics_summary"]) def read_percentage(val): try: - return float(val.strip('%')) / 100 + return float(val.strip("%")) / 100 except AttributeError: return val - metrics_summary = pd.read_csv(par["input_metrics_summary"], decimal=".", quotechar='"', thousands=",").applymap(read_percentage) + metrics_summary = pd.read_csv( + par["input_metrics_summary"], decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) - logger.info("Storing metrics summary in .uns['%s']", par['uns_metrics']) + logger.info("Storing metrics summary in .uns['%s']", par["uns_metrics"]) adata.uns[par["uns_metrics"]] = metrics_summary else: - is_none = "input_metrics_summary" if not par["input_metrics_summary"] else "uns_metrics" + is_none = ( + "input_metrics_summary" if not par["input_metrics_summary"] else "uns_metrics" + ) logger.info("Not storing metrics summary because par['%s'] is None", is_none) # might perform basic filtering to get rid of some data # applicable when starting from the raw counts if 
par["min_genes"]: - logger.info("Filtering with min_genes=%d", par['min_genes']) + logger.info("Filtering with min_genes=%d", par["min_genes"]) sc.pp.filter_cells(adata, min_genes=par["min_genes"]) if par["min_counts"]: - logger.info("Filtering with min_counts=%d", par['min_counts']) + logger.info("Filtering with min_counts=%d", par["min_counts"]) sc.pp.filter_cells(adata, min_counts=par["min_counts"]) # generate output diff --git a/src/convert/from_10xh5_to_h5mu/test.py b/src/convert/from_10xh5_to_h5mu/test.py index 166a390a482..d44e9894d59 100644 --- a/src/convert/from_10xh5_to_h5mu/test.py +++ b/src/convert/from_10xh5_to_h5mu/test.py @@ -5,21 +5,30 @@ ## VIASH START meta = { - 'resources_dir': 'resources_test/', - 'config': './src/convert/from_10xh5_to_h5mu/config.vsh.yaml', - 'executable': './target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu', + "resources_dir": "resources_test/", + "config": "./src/convert/from_10xh5_to_h5mu/config.vsh.yaml", + "executable": "./target/executable/convert/from_10xh5_to_h5mu/from_10xh5_to_h5mu", } ## VIASH END -input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" -metrics = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv" +input = ( + meta["resources_dir"] + + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5" +) +metrics = ( + meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_metrics_summary.csv" +) + def test_run(run_component, random_h5mu_path): output = random_h5mu_path() cmd_pars = [ - "--input", input, - "--output", output, - "--output_compression", "gzip", + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -42,13 +51,18 @@ def test_run(run_component, random_h5mu_path): "CD3" in data.mod["prot"].var_names ), 'Output should contain antibody column "CD3".' + def test_run_with_metrics(run_component, random_h5mu_path): output = random_h5mu_path() cmd_pars = [ - "--input", input, - "--output", output, - "--input_metrics_summary", metrics, - "--output_compression", "gzip", + "--input", + input, + "--output", + output, + "--input_metrics_summary", + metrics, + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -59,7 +73,10 @@ def test_run_with_metrics(run_component, random_h5mu_path): data = read_h5mu(output) # check whether uns slot was found - assert "metrics_cellranger" in data.uns, "Output mudata object should contain an .uns slot with cellranger metrics." + assert ( + "metrics_cellranger" in data.uns + ), "Output mudata object should contain an .uns slot with cellranger metrics." 
+ if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_10xmtx_to_h5mu/run_test.py b/src/convert/from_10xmtx_to_h5mu/run_test.py index bde5a287763..622101f6a72 100644 --- a/src/convert/from_10xmtx_to_h5mu/run_test.py +++ b/src/convert/from_10xmtx_to_h5mu/run_test.py @@ -5,20 +5,27 @@ ## VIASH START meta = { - 'resources_dir': 'resources_test/', - 'config': './src/convert/from_10xmtx_to_h5mu/config.vsh.yaml', - 'executable': './target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu', + "resources_dir": "resources_test/", + "config": "./src/convert/from_10xmtx_to_h5mu/config.vsh.yaml", + "executable": "./target/executable/convert/from_10xmtx_to_h5mu/from_10xmtx_to_h5mu", } ## VIASH END -input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix" +input = ( + meta["resources_dir"] + + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix" +) + def test_run(run_component, random_h5mu_path): output = random_h5mu_path() cmd_pars = [ - "--input", input, - "--output", output, - "--output_compression", "gzip", + "--input", + input, + "--output", + output, + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -40,7 +47,7 @@ def test_run(run_component, random_h5mu_path): assert ( "CD3" in data.mod["prot"].var_names ), 'Output should contain antibody column "CD3".' - - + + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_10xmtx_to_h5mu/script.py b/src/convert/from_10xmtx_to_h5mu/script.py index 6677852ce83..b4fe5e38891 100755 --- a/src/convert/from_10xmtx_to_h5mu/script.py +++ b/src/convert/from_10xmtx_to_h5mu/script.py @@ -11,16 +11,14 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s.", par["input"]) adata = sc.read_10x_mtx(par["input"], gex_only=False) logger.info("Renaming keys.") -adata.var = adata.var\ - .rename_axis("gene_symbol")\ - .reset_index()\ - .set_index("gene_ids") +adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") # generate output logger.info("Convert to mudata") @@ -31,4 +29,4 @@ # write output logger.info("Writing %s", par["output"]) -mdata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/convert/from_bdrhap_to_h5mu/script.py b/src/convert/from_bdrhap_to_h5mu/script.py index bdfcf964275..a1d9b917525 100644 --- a/src/convert/from_bdrhap_to_h5mu/script.py +++ b/src/convert/from_bdrhap_to_h5mu/script.py @@ -5,7 +5,7 @@ "id": "sample", "input": "resources_test/bdrhap_5kjrt/processed/output_raw/sample.h5mu", "output": "bd_rhap_to_h5mu_test.h5mu", - "output_compression": None + "output_compression": None, } ## VIASH END @@ -16,33 +16,35 @@ modalities = list(mdata.mod.keys()) assert len(modalities) > 0, "No modalities found in input data" + def process_modality_inline(adata, modality): adata.obs["library_id"] = " & ".join(adata.uns["Pipeline_Inputs"]["Libraries"]) adata.obs["cell_id"] = adata.obs.index adata.obs["run_id"] = par["id"] - + adata.obs.rename( - columns={ - "Sample_Tag": "sample_tag", - "Sample_Name": "sample_id"}, - inplace=True) + columns={"Sample_Tag": "sample_tag", "Sample_Name": "sample_id"}, inplace=True + ) adata.var["gene_ids"] = adata.var.index adata.var["gene_name"] = adata.var.index - + if modality == "rna": adata.var["feature_type"] = "Gene Expression" 
adata.var["reference_file"] = adata.uns["Pipeline_Inputs"]["Reference_Archive"] - + elif modality == "prot": adata.var["feature_type"] = "Antibody Capture" - adata.var["reference_file"] = " & ".join(adata.uns["Pipeline_Inputs"]["AbSeq_Reference"]) - + adata.var["reference_file"] = " & ".join( + adata.uns["Pipeline_Inputs"]["AbSeq_Reference"] + ) + # TODO: add other modalities + for key, value in mdata.mod.items(): print(">> Processing modality:", key, flush=True) process_modality_inline(value, key) print(">> Writing output file", flush=True) -mdata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/convert/from_bdrhap_to_h5mu/test.py b/src/convert/from_bdrhap_to_h5mu/test.py index 24c3cdaf8d8..4350c4a4775 100644 --- a/src/convert/from_bdrhap_to_h5mu/test.py +++ b/src/convert/from_bdrhap_to_h5mu/test.py @@ -6,7 +6,7 @@ ## VIASH START meta = { "executable": "target/docker/convert/from_bdrhap_to_h5mu/from_bdrhap_to_h5mu", - "resources_dir": "resources_test/bdrhap_5kjrt/processed/output_raw/" + "resources_dir": "resources_test/bdrhap_5kjrt/processed/output_raw/", } ## VIASH END @@ -15,10 +15,14 @@ cmd_pars = [ meta["executable"], - "--input", input, - "--output", output, - "--id", "foo", - "--output_compression", "gzip", + "--input", + input, + "--output", + output, + "--id", + "foo", + "--output_compression", + "gzip", ] out = subprocess.check_output(cmd_pars).decode("utf-8") @@ -32,22 +36,46 @@ prot_adata = data.mod["prot"] # check whether correct feature types are detected -assert np.array_equal(rna_adata.var["feature_type"].unique(), ["Gene Expression"]), "RNA expression should only contain Gene Expression vars." -assert np.array_equal(rna_adata.var["reference_file"].unique(), ["reference_bd_rhapsody.tar.gz"]), "Wrong reference file detected for Gene Expression vars." +assert np.array_equal( + rna_adata.var["feature_type"].unique(), ["Gene Expression"] +), "RNA expression should only contain Gene Expression vars." +assert np.array_equal( + rna_adata.var["reference_file"].unique(), ["reference_bd_rhapsody.tar.gz"] +), "Wrong reference file detected for Gene Expression vars." assert "ADAMTSL4" in rna_adata.var_names, 'RNA modality should contain gene "ADAMTS4".' -assert np.array_equal(rna_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"]), "Gene Expression .obs library_id should equal '12ABC & 12WTA." -assert "sample_tag" in rna_adata.obs.keys(), "RNA modality should contain column 'sample_id'." -assert "sample_id" in rna_adata.obs.keys(), "RNA modality should contain column 'sample_name'." +assert np.array_equal( + rna_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"] +), "Gene Expression .obs library_id should equal '12ABC & 12WTA." +assert ( + "sample_tag" in rna_adata.obs.keys() +), "RNA modality should contain column 'sample_id'." +assert ( + "sample_id" in rna_adata.obs.keys() +), "RNA modality should contain column 'sample_name'." -assert np.array_equal(prot_adata.var["feature_type"].unique(), ["Antibody Capture"]), "RNA expression should only contain Antibody Capture vars." -assert np.array_equal(prot_adata.var["reference_file"].unique(), ["BDAbSeq_ImmuneDiscoveryPanel.fasta"]), "Wrong reference file detected for Antibody Capture vars." -assert "CD279:EH12-1|PDCD1|AHS0014|pAbO" in prot_adata.var_names, 'Protein modality should contain protein "CD279:EH12-1|PDCD1|AHS0014|pAbO".' 
-assert np.array_equal(prot_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"]), "Antibody Capture .obs library_id should equal '12ABC & 12WTA."
-assert "sample_tag" in prot_adata.obs.keys(), "Protein modality should contain column 'sample_id'."
-assert "sample_id" in prot_adata.obs.keys(), "Protein modality should contain column 'sample_name'."
+assert np.array_equal(
+    prot_adata.var["feature_type"].unique(), ["Antibody Capture"]
+), "Protein modality should only contain Antibody Capture vars."
+assert np.array_equal(
+    prot_adata.var["reference_file"].unique(), ["BDAbSeq_ImmuneDiscoveryPanel.fasta"]
+), "Wrong reference file detected for Antibody Capture vars."
+assert (
+    "CD279:EH12-1|PDCD1|AHS0014|pAbO" in prot_adata.var_names
+), 'Protein modality should contain protein "CD279:EH12-1|PDCD1|AHS0014|pAbO".'
+assert np.array_equal(
+    prot_adata.obs["library_id"].unique(), ["12ABC & 12SMK & 12WTA"]
+), "Antibody Capture .obs library_id should equal '12ABC & 12SMK & 12WTA'."
+assert (
+    "sample_tag" in prot_adata.obs.keys()
+), "Protein modality should contain column 'sample_tag'."
+assert (
+    "sample_id" in prot_adata.obs.keys()
+), "Protein modality should contain column 'sample_id'."

 # check whether gene was found
 assert "PDE4DIP" in data.var_names, 'Output should contain gex column "PDE4DIP".'
-assert "CD279:EH12-1|PDCD1|AHS0014|pAbO" in data.var_names, 'Output should contain abc column "CD279:EH12-1|PDCD1|AHS0014|pAbO".'
+assert (
+    "CD279:EH12-1|PDCD1|AHS0014|pAbO" in data.var_names
+), 'Output should contain abc column "CD279:EH12-1|PDCD1|AHS0014|pAbO".'

-print("> Test successful", flush=True)
\ No newline at end of file
+print("> Test successful", flush=True)
diff --git a/src/convert/from_cellranger_multi_to_h5mu/script.py b/src/convert/from_cellranger_multi_to_h5mu/script.py
index 71c2a76fddc..ea222ea6976 100644
--- a/src/convert/from_cellranger_multi_to_h5mu/script.py
+++ b/src/convert/from_cellranger_multi_to_h5mu/script.py
@@ -17,53 +17,67 @@
     "input": "resources_test/10x_5k_beam/processed/10x_5k_beam.cellranger_multi.output",
     "output": "foo.h5mu",
     "uns_metrics": "metrics_cellranger",
-    "output_compression": "gzip"
-}
-meta = {
-    "resources_dir": "."
+    "output_compression": "gzip",
 }
+meta = {"resources_dir": "."}
 ## VIASH END

 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()

-POSSIBLE_LIBRARY_TYPES = ('vdj_t', 'vdj_b', 'vdj_t_gd', 'count', 'antigen_analysis', 'multiplexing_analysis')
+POSSIBLE_LIBRARY_TYPES = (
+    "vdj_t",
+    "vdj_b",
+    "vdj_t_gd",
+    "count",
+    "antigen_analysis",
+    "multiplexing_analysis",
+)

 FEATURE_TYPES_NAMES = {
-        "Gene Expression": "rna",
-        "Peaks": "atac",
-        "Antibody Capture": "prot",
-        "VDJ": "vdj",
-        "VDJ-T": "vdj_t",
-        "VDJ-B": "vdj_b",
-        "CRISPR Guide Capture": "gdo",
-        "Multiplexing Capture": "hto",
-        "Antigen Capture": "antigen",
-        }
+    "Gene Expression": "rna",
+    "Peaks": "atac",
+    "Antibody Capture": "prot",
+    "VDJ": "vdj",
+    "VDJ-T": "vdj_t",
+    "VDJ-B": "vdj_b",
+    "CRISPR Guide Capture": "gdo",
+    "Multiplexing Capture": "hto",
+    "Antigen Capture": "antigen",
+}
+

 def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame:
     """
     Cast the dataframe to dtypes that can be written by mudata.
-    """
+    """
     # dtype inference works better with np.nan
     result = result.replace({pd.NA: np.nan})

     # MuData supports nullable booleans and ints
     # ie. `IntegerArray` and `BooleanArray`
-    result = result.convert_dtypes(infer_objects=True,
-                                   convert_integer=True,
-                                   convert_string=False,
-                                   convert_boolean=True,
-                                   convert_floating=False)
-
+    result = result.convert_dtypes(
+        infer_objects=True,
+        convert_integer=True,
+        convert_string=False,
+        convert_boolean=True,
+        convert_floating=False,
+    )
+
     # Convert leftover 'object' columns to string
     # However, na values are supported, so convert all values except NA's to string
-    object_cols = result.select_dtypes(include='object').columns.values
+    object_cols = result.select_dtypes(include="object").columns.values
     for obj_col in object_cols:
-        result[obj_col] = result[obj_col].where(result[obj_col].isna(), result[obj_col].astype(str)).astype('category')
+        result[obj_col] = (
+            result[obj_col]
+            .where(result[obj_col].isna(), result[obj_col].astype(str))
+            .astype("category")
+        )
     return result

+
 def gather_input_data(dir: Path):
     # /
     # +-- multi
@@ -87,7 +101,7 @@ def gather_input_data(dir: Path):
     # |           +-- antibody_analysis
     # |           +-- crispr_analysis
     # |               +-- perturbation_efficiencies_by_feature.csv
-    # |               +-- perturbation_efficiencies_by_target.csv
+    # |               +-- perturbation_efficiencies_by_target.csv
     # +-- vdj_t (unused)
     # +-- vdj_b (unused)
     # +-- vdj_t_gd (unused)
@@ -96,69 +110,103 @@ def gather_input_data(dir: Path):
     if not dir.is_dir():
         raise ValueError("Specified input is not a directory.")
     folder_contents = list(dir.iterdir())
-    config = dir / 'config.csv'
+    config = dir / "config.csv"
     if config not in folder_contents:
-        logger.warning('Config.csv not found in input directory, this folder might not be a valid cellranger multi output.')
+        logger.warning(
+            "Config.csv not found in input directory; this folder might not be a valid cellranger multi output."
+        )

-    required_subfolders = [dir / subfolder_name for subfolder_name in ('multi', 'per_sample_outs')]
+    required_subfolders = [
+        dir / subfolder_name for subfolder_name in ("multi", "per_sample_outs")
+    ]
     found_input = {key_: {} for key_ in POSSIBLE_LIBRARY_TYPES}
     for required_subfolder in required_subfolders:
-        if not required_subfolder in folder_contents:
-            raise ValueError(f"Input folder must contain the subfolder {required_subfolder} please make "
-                             "sure that the specified input folder is a valid cellranger multi output.")
+        if required_subfolder not in folder_contents:
+            raise ValueError(
+                f"Input folder must contain the subfolder {required_subfolder}; please make "
+                "sure that the specified input folder is a valid cellranger multi output."
+            )

-    multi_dir = dir / 'multi'
+    multi_dir = dir / "multi"
     for library_type in multi_dir.iterdir():
         if not library_type.is_dir():
-            logger.warning("%s is not a directory. Contents of the multi folder "
-                           "must be directories to be recognized as valid input data",
-                           library_type)
+            logger.warning(
+                "%s is not a directory. Contents of the multi folder "
+                "must be directories to be recognized as valid input data",
+                library_type,
+            )
             continue
         if library_type.name not in POSSIBLE_LIBRARY_TYPES:
-            raise ValueError(f"Contents of the 'multi' folder must be found one of the following: {','.join(POSSIBLE_LIBRARY_TYPES)}.")
+            raise ValueError(
+                f"Contents of the 'multi' folder must be one of the following: {','.join(POSSIBLE_LIBRARY_TYPES)}."
+ ) found_input[library_type.name] = library_type - per_sample_outs_dir = dir / 'per_sample_outs' - samples_dirs = [samplepath for samplepath in per_sample_outs_dir.iterdir() if samplepath.is_dir()] + per_sample_outs_dir = dir / "per_sample_outs" + samples_dirs = [ + samplepath + for samplepath in per_sample_outs_dir.iterdir() + if samplepath.is_dir() + ] for samples_dir in samples_dirs: - for file_part in ('metrics_summary.csv', 'count/feature_reference.csv', - 'count/crispr_analysis/perturbation_efficiencies_by_feature.csv', - 'count/crispr_analysis/perturbation_efficiencies_by_target.csv', - 'antigen_analysis', - ): + for file_part in ( + "metrics_summary.csv", + "count/feature_reference.csv", + "count/crispr_analysis/perturbation_efficiencies_by_feature.csv", + "count/crispr_analysis/perturbation_efficiencies_by_target.csv", + "antigen_analysis", + ): found_file = samples_dir / file_part if found_file.exists(): - file_name = found_file.name.removesuffix('.csv') + file_name = found_file.name.removesuffix(".csv") found_input.setdefault(file_name, {})[samples_dir.name] = found_file return found_input -def proces_perturbation(key_name: str, mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path]): +def proces_perturbation( + key_name: str, mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): for sample_name, mudata_obj in mudatas.items(): efficiency_file = efficiency_files[sample_name] - assert 'gdo' in mudata_obj.mod - eff_df = pd.read_csv(efficiency_file, index_col="Perturbation", sep=",", decimal=".", quotechar='"') - mudata_obj.mod['gdo'].uns[key_name] = eff_df + assert "gdo" in mudata_obj.mod + eff_df = pd.read_csv( + efficiency_file, + index_col="Perturbation", + sep=",", + decimal=".", + quotechar='"', + ) + mudata_obj.mod["gdo"].uns[key_name] = eff_df return mudatas -def process_feature_reference(mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path]): + +def process_feature_reference( + mudatas: dict[str, mudata.MuData], efficiency_files: dict[str, Path] +): for sample, mudata_obj in mudatas.items(): efficiency_file = efficiency_files[sample] - df = pd.read_csv(efficiency_file, index_col="id", sep=",", decimal=".", quotechar='"') - assert 'feature_type' in df.columns, "Columns 'feature_type' should be present in features_reference file." - feature_types = df['feature_type'] + df = pd.read_csv( + efficiency_file, index_col="id", sep=",", decimal=".", quotechar='"' + ) + assert ( + "feature_type" in df.columns + ), "Columns 'feature_type' should be present in features_reference file." + feature_types = df["feature_type"] missing_features = set(feature_types) - set(FEATURE_TYPES_NAMES) if missing_features: - raise ValueError("Not all feature types present in the features_reference file are supported by this component.\n" - f"Missing support for features: {','.join(missing_features)}.") + raise ValueError( + "Not all feature types present in the features_reference file are supported by this component.\n" + f"Missing support for features: {','.join(missing_features)}." 
+ ) for feature_type in feature_types: modality = FEATURE_TYPES_NAMES[feature_type] - subset_df = df.loc[df['feature_type'] == feature_type] - mudata_obj.mod[modality].uns['feature_reference'] = subset_df + subset_df = df.loc[df["feature_type"] == feature_type] + mudata_obj.mod[modality].uns["feature_reference"] = subset_df return mudatas + def process_counts(counts_folder: Path, multiplexing_info, metrics_files): counts_matrix_file = counts_folder / "raw_feature_bc_matrix.h5" logger.info("Reading %s.", counts_matrix_file) @@ -166,10 +214,7 @@ def process_counts(counts_folder: Path, multiplexing_info, metrics_files): # set the gene ids as var_names logger.info("Renaming var columns") - adata.var = adata.var\ - .rename_axis("gene_symbol")\ - .reset_index()\ - .set_index("gene_ids") + adata.var = adata.var.rename_axis("gene_symbol").reset_index().set_index("gene_ids") # generate output logger.info("Convert to mudata") @@ -181,18 +226,29 @@ def modality_name_factory(library_type): mudata_all_samples = mudata.MuData(adata, feature_types_names=feature_types) if multiplexing_info: # Get the mapping between the barcode and the sample ID from one of the metrics files - metrics_file = pd.read_csv(list(metrics_files.values())[0], - decimal=".", quotechar='"', thousands=",") - sample_ids = metrics_file[(metrics_file['Metric Name'] == "Sample ID") & - (metrics_file['Grouped By'] == "Probe barcode ID")] - barcode_sample_mapping = sample_ids.loc[:,['Group Name', 'Metric Value']].set_index('Group Name').squeeze().to_dict() - return split_samples(mudata_all_samples, multiplexing_info, barcode_sample_mapping) + metrics_file = pd.read_csv( + list(metrics_files.values())[0], decimal=".", quotechar='"', thousands="," + ) + sample_ids = metrics_file[ + (metrics_file["Metric Name"] == "Sample ID") + & (metrics_file["Grouped By"] == "Probe barcode ID") + ] + barcode_sample_mapping = ( + sample_ids.loc[:, ["Group Name", "Metric Value"]] + .set_index("Group Name") + .squeeze() + .to_dict() + ) + return split_samples( + mudata_all_samples, multiplexing_info, barcode_sample_mapping + ) return {"run": mudata_all_samples} + def split_samples(mudata_obj, multiplexing_analysis_folder, barcode_sample_mapping): result = {} cells_per_tag_file = multiplexing_analysis_folder / "cells_per_tag.json" - with cells_per_tag_file.open('r') as open_json: + with cells_per_tag_file.open("r") as open_json: sample_cell_mapping = json.load(open_json) for barcode, indices in sample_cell_mapping.items(): @@ -201,70 +257,96 @@ def split_samples(mudata_obj, multiplexing_analysis_folder, barcode_sample_mappi result[barcode_sample_mapping[barcode]] = sample_mudata.copy() return result -def process_metrics_summary(mudatas: dict[str, mudata.MuData], metrics_files: dict[str, Path]): + +def process_metrics_summary( + mudatas: dict[str, mudata.MuData], metrics_files: dict[str, Path] +): def read_percentage(val): try: - if str(val).endswith('%'): - return float(val.strip('%')) / 100 + if str(val).endswith("%"): + return float(val.strip("%")) / 100 else: return val except (AttributeError, ValueError): return val + for sample, mudata_obj in mudatas.items(): metrics_file = metrics_files[sample] - metrics_summary = pd.read_csv(metrics_file, - decimal=".", - quotechar='"', - thousands=",").applymap(read_percentage) + metrics_summary = pd.read_csv( + metrics_file, decimal=".", quotechar='"', thousands="," + ).applymap(read_percentage) mudata_obj.uns[par["uns_metrics"]] = metrics_summary for colname, coldata in metrics_summary.items(): try: - new_column = 
coldata.astype(str, copy=True).astype({colname: "category"}) + new_column = coldata.astype(str, copy=True).astype( + {colname: "category"} + ) metrics_summary[colname] = new_column except (ValueError, TypeError): logger.warning(f"Could not store column {colname} from metrics.") pass return mudatas -def process_antigen_analysis(mudatas: dict[str, mudata.MuData], antigen_analysis_folder_paths: dict[str, Path]): + +def process_antigen_analysis( + mudatas: dict[str, mudata.MuData], antigen_analysis_folder_paths: dict[str, Path] +): for sample_id, mudata_obj in mudatas.items(): antigen_analysis_folder_path = antigen_analysis_folder_paths[sample_id] - assert 'antigen' in mudata_obj.mod + assert "antigen" in mudata_obj.mod per_barcodes_file = antigen_analysis_folder_path / "per_barcode.csv" - assert per_barcodes_file.is_file(), "Expected a per_barcode.csv file to be present." - per_barcodes_df = pd.read_csv(per_barcodes_file, index_col="barcode", - sep=",", decimal=".", quotechar='"') - is_gex_cell = per_barcodes_df['is_gex_cell'] - assert len(set(is_gex_cell.unique().tolist()) - set([False, True])) == 0, \ - "Expected 'is_gex_cell' column to be boolean. Please report this as a bug." + assert ( + per_barcodes_file.is_file() + ), "Expected a per_barcode.csv file to be present." + per_barcodes_df = pd.read_csv( + per_barcodes_file, index_col="barcode", sep=",", decimal=".", quotechar='"' + ) + is_gex_cell = per_barcodes_df["is_gex_cell"] + assert ( + len(set(is_gex_cell.unique().tolist()) - set([False, True])) == 0 + ), "Expected 'is_gex_cell' column to be boolean. Please report this as a bug." barcodes_in_gex = per_barcodes_df[is_gex_cell] # All of the barcodes listed in the per_barcode.csv with is_gex_cell set to 'True' # must be in the 'rna' (an thus also 'antigen') modality - assert barcodes_in_gex.index.difference(mudata_obj['rna'].obs_names).empty - orig_obs_names = mudata_obj['antigen'].obs_names.copy() - mudata_obj['antigen'].obs = cast_to_writeable_dtype(pd.concat([mudata_obj['antigen'].obs, barcodes_in_gex], - axis='columns', - join='outer', - verify_integrity=True, - sort=False)) - assert orig_obs_names.equals(mudata_obj['antigen'].obs_names) + assert barcodes_in_gex.index.difference(mudata_obj["rna"].obs_names).empty + orig_obs_names = mudata_obj["antigen"].obs_names.copy() + mudata_obj["antigen"].obs = cast_to_writeable_dtype( + pd.concat( + [mudata_obj["antigen"].obs, barcodes_in_gex], + axis="columns", + join="outer", + verify_integrity=True, + sort=False, + ) + ) + assert orig_obs_names.equals(mudata_obj["antigen"].obs_names) del orig_obs_names - # The antigen_specificity_scores.csv file is only present when cellranger + # The antigen_specificity_scores.csv file is only present when cellranger # multi was run with a [antigen-specificity] section in config - specificity_file = antigen_analysis_folder_path / "antigen_specificity_scores.csv" + specificity_file = ( + antigen_analysis_folder_path / "antigen_specificity_scores.csv" + ) if specificity_file.is_file(): - antigen_scores_df = pd.read_csv(specificity_file, - index_col=["barcode", "antigen"], sep=",", - decimal=".", quotechar='"') + antigen_scores_df = pd.read_csv( + specificity_file, + index_col=["barcode", "antigen"], + sep=",", + decimal=".", + quotechar='"', + ) score = antigen_scores_df.unstack() - assert score.index.difference(mudata_obj['rna'].obs_names).empty - antigens = score.columns.unique(level='antigen') + assert score.index.difference(mudata_obj["rna"].obs_names).empty + antigens = 
score.columns.unique(level="antigen") for antigen in antigens: - score_antigen = score.loc[:, (slice(None), antigen)].droplevel("antigen", axis=1) - score_antigen = score_antigen.reindex(mudata_obj['rna'].obs_names) - mudata_obj['antigen'].obsm[f'antigen_specificity_scores_{antigen}'] = cast_to_writeable_dtype(score_antigen) + score_antigen = score.loc[:, (slice(None), antigen)].droplevel( + "antigen", axis=1 + ) + score_antigen = score_antigen.reindex(mudata_obj["rna"].obs_names) + mudata_obj["antigen"].obsm[f"antigen_specificity_scores_{antigen}"] = ( + cast_to_writeable_dtype(score_antigen) + ) return mudatas @@ -273,63 +355,92 @@ def process_vdj(mudatas: dict[str, mudata.MuData], vdj_folder_path: str): # According to docs, using the json is preferred as this file includes intron info. all_config_json_file = vdj_folder_path / "all_contig_annotations.json" vdj_type = vdj_folder_path.name - with all_config_json_file.open('r') as open_json: + with all_config_json_file.open("r") as open_json: json_obj = json.load(open_json) for _, mudata_obj in mudatas.items(): - json_for_sample = [entry for entry in json_obj if entry['barcode'] in mudata_obj.obs_names] - with tempfile.NamedTemporaryFile(mode="w", suffix='.json') as tfile: + json_for_sample = [ + entry for entry in json_obj if entry["barcode"] in mudata_obj.obs_names + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as tfile: json.dump(json_for_sample, tfile, indent=4) tfile.flush() vdj_anndata = read_10x_vdj(tfile.name) mudata_obj.mod[vdj_type] = vdj_anndata return mudatas + def get_modalities(input_data): dispatcher = { - 'multiplexing_analysis': split_samples, - 'vdj_t': process_vdj, - 'vdj_b': process_vdj, - 'vdj_t_gd': process_vdj, - 'metrics_summary': process_metrics_summary, - 'feature_reference': process_feature_reference, - 'perturbation_efficiencies_by_feature': partial(proces_perturbation, 'perturbation_efficiencies_by_feature'), - 'perturbation_efficiencies_by_target': partial(proces_perturbation, 'perturbation_efficiencies_by_target'), - 'antigen_analysis': process_antigen_analysis, + "multiplexing_analysis": split_samples, + "vdj_t": process_vdj, + "vdj_b": process_vdj, + "vdj_t_gd": process_vdj, + "metrics_summary": process_metrics_summary, + "feature_reference": process_feature_reference, + "perturbation_efficiencies_by_feature": partial( + proces_perturbation, "perturbation_efficiencies_by_feature" + ), + "perturbation_efficiencies_by_target": partial( + proces_perturbation, "perturbation_efficiencies_by_target" + ), + "antigen_analysis": process_antigen_analysis, } - mudata_per_sample = process_counts(input_data['count'], - input_data["multiplexing_analysis"], - input_data['metrics_summary']) + mudata_per_sample = process_counts( + input_data["count"], + input_data["multiplexing_analysis"], + input_data["metrics_summary"], + ) for modality_name, modality_data_path in input_data.items(): - if modality_name in ("count", "multiplexing_analysis") or not modality_data_path: + if ( + modality_name in ("count", "multiplexing_analysis") + or not modality_data_path + ): continue try: parser_function = dispatcher[modality_name] except KeyError as e: - raise ValueError("This component does not support the " - f"parsing of the '{modality_name}' yet.") from e + raise ValueError( + "This component does not support the " + f"parsing of the '{modality_name}' yet." 
+ ) from e mudata_per_sample = parser_function(mudata_per_sample, modality_data_path) return mudata_per_sample + def main(): cellranger_multi_dir = Path(par["input"]) # TODO: remove when issue https://github.com/viash-io/viash/issues/706 is resolved. if isinstance(par["output"], (list, set, tuple)): - assert len(par["output"]) == 1, "A single output file template should have been provided." + assert ( + len(par["output"]) == 1 + ), "A single output file template should have been provided." par["output"] = par["output"][0] - assert par["output"].count('*') == 1, (f"Expected exactly one wildcard character (*) in output " - f"files template ({par['output']}). Found {par['output'].count('*')}") + assert par["output"].count("*") == 1, ( + f"Expected exactly one wildcard character (*) in output " + f"files template ({par['output']}). Found {par['output'].count('*')}" + ) input_data = gather_input_data(cellranger_multi_dir) result = get_modalities(input_data) - output_files = {par["output"].replace("*", sample_name) for sample_name in result.keys()} - assert len(output_files) == len(result.keys()), ("Replacing the wildcard in the output files " - "template did not produce unique file paths.") - logger.info("Writing output for samples: '%s' to '%s'", "".join(result.keys()), par["output"]) - with Path(par["sample_csv"]).open("w", newline='') as open_csv: + output_files = { + par["output"].replace("*", sample_name) for sample_name in result.keys() + } + assert len(output_files) == len(result.keys()), ( + "Replacing the wildcard in the output files " + "template did not produce unique file paths." + ) + logger.info( + "Writing output for samples: '%s' to '%s'", + "".join(result.keys()), + par["output"], + ) + with Path(par["sample_csv"]).open("w", newline="") as open_csv: csvwriter = csv.DictWriter(open_csv, fieldnames=["sample_name", "file"]) csvwriter.writeheader() for sample_name, mudata_obj in result.items(): - output_file = Path(par["output"].replace('*', sample_name)) + output_file = Path(par["output"].replace("*", sample_name)) mudata_obj.write_h5mu(output_file, compression=par["output_compression"]) - csvwriter.writerow({"sample_name": sample_name, "file": output_file.name}) + csvwriter.writerow({"sample_name": sample_name, "file": output_file.name}) + + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/convert/from_cellranger_multi_to_h5mu/test.py b/src/convert/from_cellranger_multi_to_h5mu/test.py index b6e866cdc01..bcf8d29613f 100644 --- a/src/convert/from_cellranger_multi_to_h5mu/test.py +++ b/src/convert/from_cellranger_multi_to_h5mu/test.py @@ -4,28 +4,37 @@ ## VIASH START meta = { - 'executable': './target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu', - 'resources_dir': 'resources_test/', - 'config': 'src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml' + "executable": "./target/executable/convert/from_cellranger_multi_to_h5mu/from_cellranger_multi_to_h5mu", + "resources_dir": "resources_test/", + "config": "src/convert/from_cellranger_multi_to_h5mu/config.vsh.yaml", } ## VIASH END input_anticmv = f"{meta['resources_dir']}/10x_5k_anticmv/processed/10x_5k_anticmv.cellranger_multi.output.output" input_lung_crispr = f"{meta['resources_dir']}/10x_5k_lung_crispr/processed/10x_5k_lung_crispr.cellranger_multi.output.output" -input_beam = f"{meta['resources_dir']}/10x_5k_beam/processed/10x_5k_beam.cellranger_multi.output" +input_beam = ( + 
f"{meta['resources_dir']}/10x_5k_beam/processed/10x_5k_beam.cellranger_multi.output" +) input_fixed_rna = f"{meta['resources_dir']}/10x_5k_fixed/processed/10x_5k_fixed.cellranger_multi.output" + def test_cellranger_multi_basic(run_component, tmp_path): - output_dir = tmp_path / "converted" + output_dir = tmp_path / "converted" output_path_template = output_dir / "*.h5mu" samples_csv = tmp_path / "samples.csv" # run component - run_component([ - "--input", input_anticmv, - "--output", str(output_path_template), - "--output_compression", "gzip", - "--sample_csv", samples_csv, - ]) + run_component( + [ + "--input", + input_anticmv, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) assert output_dir.is_dir() # check output @@ -33,34 +42,59 @@ def test_cellranger_multi_basic(run_component, tmp_path): assert len(samples) == 1 output_path = samples[0] converted_data = read_h5mu(output_path) - assert list(converted_data.mod.keys()) == ['rna', 'prot', 'vdj_t'] - assert list(converted_data.uns.keys()) == ['metrics_cellranger'] - expected_metrics = ['Category', 'Library Type', 'Grouped By', 'Group Name', 'Metric Name', 'Metric Value'] - assert converted_data.uns['metrics_cellranger'].columns.to_list() == expected_metrics + assert list(converted_data.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(converted_data.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + converted_data.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) # Check that a metric that is stored as percentage (e.g "85.69%") is correctly represented # as a floating point number - metrics_df_with_index = converted_data.uns['metrics_cellranger'].set_index(["Metric Name", "Library Type", "Category"]) - percentage = metrics_df_with_index.loc[("Confidently mapped reads in cells", "Gene Expression", "Cells"), "Metric Value"] + metrics_df_with_index = converted_data.uns["metrics_cellranger"].set_index( + ["Metric Name", "Library Type", "Category"] + ) + percentage = metrics_df_with_index.loc[ + ("Confidently mapped reads in cells", "Gene Expression", "Cells"), + "Metric Value", + ] assert percentage.iloc[0] == "0.8569" - thousand_delimited_number = metrics_df_with_index.loc[("Cells", "Gene Expression", "Cells"), "Metric Value"] - thousand_delimited_number == "3,798" + thousand_delimited_number = metrics_df_with_index.loc[ + ("Cells", "Gene Expression", "Cells"), "Metric Value" + ] + thousand_delimited_number == "3,798" - smaller_number = metrics_df_with_index.loc[("Median genes per cell", "Gene Expression", "Cells"), "Metric Value"] + smaller_number = metrics_df_with_index.loc[ + ("Median genes per cell", "Gene Expression", "Cells"), "Metric Value" + ] smaller_number == "6" - + + def test_cellranger_multi_to_h5mu_crispr(run_component, tmp_path): - output_dir = tmp_path / "converted" + output_dir = tmp_path / "converted" output_path_template = output_dir / "*.h5mu" samples_csv = tmp_path / "samples.csv" # run component - run_component([ - "--input", input_lung_crispr, - "--output", str(output_path_template), - "--output_compression", "gzip", - "--sample_csv", samples_csv, - ]) + run_component( + [ + "--input", + input_lung_crispr, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) assert output_dir.is_dir() # check output @@ -68,25 +102,32 @@ def 
test_cellranger_multi_to_h5mu_crispr(run_component, tmp_path): assert len(samples) == 1 output_path = samples[0] converted_data = read_h5mu(output_path) - assert list(converted_data.mod.keys()) == ['rna', 'gdo'] - assert list(converted_data.uns.keys()) == ['metrics_cellranger'] - assert 'perturbation_efficiencies_by_feature' in converted_data.mod['gdo'].uns - assert 'perturbation_efficiencies_by_target' in converted_data.mod['gdo'].uns - assert 'feature_reference' not in converted_data.mod['rna'].uns - assert 'feature_reference' in converted_data.mod['gdo'].uns + assert list(converted_data.mod.keys()) == ["rna", "gdo"] + assert list(converted_data.uns.keys()) == ["metrics_cellranger"] + assert "perturbation_efficiencies_by_feature" in converted_data.mod["gdo"].uns + assert "perturbation_efficiencies_by_target" in converted_data.mod["gdo"].uns + assert "feature_reference" not in converted_data.mod["rna"].uns + assert "feature_reference" in converted_data.mod["gdo"].uns + def test_cellranger_multi_to_h5mu_beam(run_component, tmp_path): - output_dir = tmp_path / "converted" + output_dir = tmp_path / "converted" output_path_template = output_dir / "*.h5mu" samples_csv = tmp_path / "samples.csv" # run component - run_component([ - "--input", input_beam, - "--output", str(output_path_template), - "--output_compression", "gzip", - "--sample_csv", samples_csv, - ]) + run_component( + [ + "--input", + input_beam, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) assert output_dir.is_dir() # check output @@ -94,32 +135,44 @@ def test_cellranger_multi_to_h5mu_beam(run_component, tmp_path): assert len(samples) == 1 output_path = samples[0] converted_data = read_h5mu(output_path) - assert list(converted_data.mod.keys()) == ['rna', 'antigen', 'vdj_t'] - assert 'antigen_specificity_scores_CMV_B0702' in converted_data['antigen'].obsm - assert 'antigen_specificity_scores_Flu_A0201' in converted_data['antigen'].obsm + assert list(converted_data.mod.keys()) == ["rna", "antigen", "vdj_t"] + assert "antigen_specificity_scores_CMV_B0702" in converted_data["antigen"].obsm + assert "antigen_specificity_scores_Flu_A0201" in converted_data["antigen"].obsm def test_cellranger_multi_to_h5mu_fixed_rna(run_component, tmp_path): - output_dir = tmp_path / "converted" + output_dir = tmp_path / "converted" output_path_template = output_dir / "*.h5mu" samples_csv = tmp_path / "samples.csv" # run component - run_component([ - "--input", input_fixed_rna, - "--output", str(output_path_template), - "--output_compression", "gzip", - "--sample_csv", samples_csv, - ]) + run_component( + [ + "--input", + input_fixed_rna, + "--output", + str(output_path_template), + "--output_compression", + "gzip", + "--sample_csv", + samples_csv, + ] + ) assert output_dir.is_dir() # check output samples = [item for item in output_dir.iterdir() if item.is_file()] - sample_names = {item.name.removesuffix('.h5mu') for item in samples} - assert sample_names == {"Colorectal_BC3", "Liver_BC1", "Ovarian_BC2", "Pancreas_BC4"} + sample_names = {item.name.removesuffix(".h5mu") for item in samples} + assert sample_names == { + "Colorectal_BC3", + "Liver_BC1", + "Ovarian_BC2", + "Pancreas_BC4", + } for output_path in samples: converted_data = read_h5mu(output_path) - assert list(converted_data.mod.keys()) == ['rna', 'prot'] + assert list(converted_data.mod.keys()) == ["rna", "prot"] + -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == 
"__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5ad_to_h5mu/script.py b/src/convert/from_h5ad_to_h5mu/script.py index 8ed51c78ab5..31986d9d67b 100755 --- a/src/convert/from_h5ad_to_h5mu/script.py +++ b/src/convert/from_h5ad_to_h5mu/script.py @@ -4,7 +4,9 @@ ## VIASH START par = { - "input": ["resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5ad"], + "input": [ + "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5ad" + ], "modality": ["rna"], "output": "output.h5mu", "output_compression": "gzip", @@ -14,27 +16,27 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() -assert len(par["input"]) == len(par["modality"]), "Number of input files should be the same length as the number of modalities" +assert len(par["input"]) == len( + par["modality"] +), "Number of input files should be the same length as the number of modalities" logger.info("Reading input files") -data = { key: anndata.read_h5ad(path) for key, path in zip(par["modality"], par["input"]) } - -try: - data.var_names_make_unique() -except: - pass +data = { + key: anndata.read_h5ad(path) for key, path in zip(par["modality"], par["input"]) +} logger.info("Converting to mudata") mudata = mu.MuData(data) try: mudata.var_names_make_unique() -except: +except (TypeError, ValueError): pass -logger.info("Writing to %s.", par['output']) +logger.info("Writing to %s.", par["output"]) mudata.write_h5mu(par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/convert/from_h5ad_to_h5mu/test.py b/src/convert/from_h5ad_to_h5mu/test.py index 2b94000a62c..56e5b4b62e9 100644 --- a/src/convert/from_h5ad_to_h5mu/test.py +++ b/src/convert/from_h5ad_to_h5mu/test.py @@ -5,14 +5,15 @@ ## VIASH START meta = { - 'resources_dir': 'resources_test', - 'executable': './target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu', - 'config': './src/convert/from_h5ad_to_h5mu/config.vsh.yaml' + "resources_dir": "resources_test", + "executable": "./target/executable/convert/from_h5ad_to_h5mu/from_h5ad_to_h5mu", + "config": "./src/convert/from_h5ad_to_h5mu/config.vsh.yaml", } ## VIASH END input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_run(run_component, random_h5mu_path, random_path): mdata = mu.read_h5mu(input) tmp_rna = random_path(extension="h5ad") @@ -23,12 +24,18 @@ def test_run(run_component, random_h5mu_path, random_path): tmp_output = random_h5mu_path() cmd_pars = [ - "--modality", "rna", - "--input", tmp_rna, - "--modality", "prot", - "--input", tmp_prot, - "--output", tmp_output, - "--output_compression", "gzip" + "--modality", + "rna", + "--input", + tmp_rna, + "--modality", + "prot", + "--input", + tmp_prot, + "--output", + tmp_output, + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -37,9 +44,10 @@ def test_run(run_component, random_h5mu_path, random_path): mdata2 = mu.read_h5mu(tmp_output) assert list(mdata2.mod.keys()) == ["rna", "prot"] - + assert_annotation_objects_equal(mdata2.mod["rna"], tmp_rna) assert_annotation_objects_equal(mdata2.mod["prot"], tmp_prot) + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/convert/from_h5mu_to_h5ad/script.py b/src/convert/from_h5mu_to_h5ad/script.py index 968a9ec38d0..215966e3090 100755 --- 
a/src/convert/from_h5mu_to_h5ad/script.py +++ b/src/convert/from_h5mu_to_h5ad/script.py @@ -12,6 +12,7 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() # TODO: Merge modalities into one layer @@ -22,7 +23,7 @@ logger.info("Converting to h5ad") adat = dat.mod[par["modality"]] -logger.info("Writing to %s.", par['output']) +logger.info("Writing to %s.", par["output"]) adat.write_h5ad(par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/convert/from_h5mu_to_h5ad/test.py b/src/convert/from_h5mu_to_h5ad/test.py index d7f6031ac63..f58e2e6697c 100644 --- a/src/convert/from_h5mu_to_h5ad/test.py +++ b/src/convert/from_h5mu_to_h5ad/test.py @@ -6,21 +6,26 @@ ## VIASH START meta = { - 'executable': 'target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad', - 'resources_dir': 'resources_test' + "executable": "target/executable/convert/from_h5mu_to_h5ad/from_h5mu_to_h5ad", + "resources_dir": "resources_test", } ## VIASH END input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_run(run_component, tmp_path): output = tmp_path / "output.h5ad" cmd_pars = [ - "--modality", "rna", - "--input", input, - "--output", str(output), - "--output_compression", "gzip" + "--modality", + "rna", + "--input", + input, + "--output", + str(output), + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -34,4 +39,4 @@ def test_run(run_component, tmp_path): if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/correction/cellbender_remove_background/script.py b/src/correction/cellbender_remove_background/script.py index a0ebd6ce97f..317d50f103b 100644 --- a/src/correction/cellbender_remove_background/script.py +++ b/src/correction/cellbender_remove_background/script.py @@ -6,6 +6,7 @@ import numpy as np from scipy.sparse import csr_matrix from cellbender.remove_background.downstream import anndata_from_h5 + ## VIASH START file_input = "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" @@ -40,7 +41,7 @@ "fpr": [0.01], "exclude_feature_types": [], "projected_ambient_count_threshold": 0.1, - "learning_rate": 1.0E-4, + "learning_rate": 1.0e-4, "final_elbo_fail_fraction": None, "epoch_elbo_fail_fraction": None, "num_training_tries": 1, @@ -53,16 +54,17 @@ "estimator_multiple_cpu": False, "constant_learning_rate": True, "debug": False, - "cuda": False + "cuda": False, } meta = { "temp_dir": os.getenv("VIASH_TEMP"), - "resources_dir": "src/correction/cellbender_remove_background" + "resources_dir": "src/correction/cellbender_remove_background", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() @@ -76,7 +78,9 @@ # import pathlib # with pathlib.Path(os.path.dirname(par["output"])) / "cellbender" as temp_dir: # os.mkdir(temp_dir) -with tempfile.TemporaryDirectory(prefix="cellbender-", dir=meta["temp_dir"]) as temp_dir: +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: # construct paths within tempdir input_file = os.path.join(temp_dir, "input.h5ad") output_file = os.path.join(temp_dir, "output.h5") @@ -86,11 +90,15 @@ logger.info("Constructing CellBender command") cmd_pars = [ - "cellbender", "remove-background", - "--input", input_file, - "--output", output_file, + "cellbender", + 
"remove-background", + "--input", + input_file, + "--output", + output_file, # don't create checkpoints because they're not used / returned anyways - "--checkpoint-mins", "99999999" + "--checkpoint-mins", + "99999999", ] if meta.get("cpus") is not None: @@ -111,7 +119,11 @@ ("--ignore-features", "ignore_features", True), ("--fpr", "fpr", True), ("--exclude-feature-types", "exclude_feature_types", True), - ("--projected-ambient-count-threshold", "projected_ambient_count_threshold", True), + ( + "--projected-ambient-count-threshold", + "projected_ambient_count_threshold", + True, + ), ("--learning-rate", "learning_rate", True), ("--final-elbo-fail-fraction", "final_elbo_fail_fraction", True), ("--epoch-elbo-fail-fraction", "epoch_elbo_fail_fraction", True), @@ -127,22 +139,37 @@ ("--debug", "debug", False), ("--cuda", "cuda", False), ] - for (flag, name, is_kwarg) in extra_args: + for flag, name, is_kwarg in extra_args: if par[name]: values = par[name] if isinstance(par[name], list) else [par[name]] cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: - assert par["expected_cells"] is None, "If min_counts is defined, expected_cells should be undefined" - assert par["total_droplets_included"] is None, "If min_counts is defined, expected_cells should be undefined" + assert ( + par["expected_cells"] is None + ), "If min_counts is defined, expected_cells should be undefined" + assert ( + par["total_droplets_included"] is None + ), "If min_counts is defined, expected_cells should be undefined" met = data.uns["metrics_cellranger"] col_name = "Estimated Number of Cells" - assert col_name in met.columns, "%s should be a column in .obs[metrics_cellranger]" + assert ( + col_name in met.columns + ), "%s should be a column in .obs[metrics_cellranger]" est_cells = met[col_name].values[0] - logger.info("Selecting --expected-cells %d and --total-droplets-included %d", est_cells, est_cells * 5) - cmd_pars += ["--expected-cells", str(est_cells), "--total-droplets-included", str(5*est_cells)] - - logger.info("Running CellBender: '%s'", ' '.join(cmd_pars)) + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) out = subprocess.check_output(cmd_pars).decode("utf-8") logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) @@ -153,10 +180,10 @@ # AnnData object with n_obs x n_vars = 6794880 x 33538 # obs: 'cellbender_analyzed' # var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed' - # uns: 'background_fraction', 'barcode_indices_for_latents', 'cell_probability', 'cell_size', 'droplet_efficiency', 'gene_expression_encoding', - # 'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', - # 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', - # 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', + # uns: 'background_fraction', 'barcode_indices_for_latents', 'cell_probability', 'cell_size', 'droplet_efficiency', 'gene_expression_encoding', + # 
'cell_size_lognormal_std', 'empty_droplet_size_lognormal_loc', 'empty_droplet_size_lognormal_scale', 'swapping_fraction_dist_params', + # 'barcodes_analyzed', 'barcodes_analyzed_inds', 'estimator', 'features_analyzed_inds', 'fraction_data_used_for_testing', 'learning_curve_learning_rate_epoch', + # 'learning_curve_learning_rate_value', 'learning_curve_test_elbo', 'learning_curve_test_epoch', 'learning_curve_train_elbo', 'learning_curve_train_epoch', # 'target_false_positive_rate' logger.info("Copying X output to MuData") @@ -168,41 +195,56 @@ "obs_cell_probability": "cell_probability", "obs_cell_size": "cell_size", "obs_droplet_efficiency": "droplet_efficiency", - "obs_latent_scale": "latent_scale" + "obs_latent_scale": "latent_scale", } for to_name, from_name in obs_store.items(): if par[to_name]: if from_name in adata_out.obs: data.obs[par[to_name]] = adata_out.obs[from_name] # when using unfiltered data, the values will be in uns instead of obs - elif from_name in adata_out.uns and "barcode_indices_for_latents" in adata_out.uns: + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): vec = np.zeros(data.n_obs) - vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[from_name] + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] data.obs[par[to_name]] = vec logger.info("Copying .var output to MuData") - var_store = { "var_ambient_expression": "ambient_expression" } + var_store = {"var_ambient_expression": "ambient_expression"} for to_name, from_name in var_store.items(): if par[to_name]: data.var[par[to_name]] = adata_out.var[from_name] logger.info("Copying obsm_gene_expression_encoding output to MuData") - obsm_store = { "obsm_gene_expression_encoding": "gene_expression_encoding" } + obsm_store = {"obsm_gene_expression_encoding": "gene_expression_encoding"} for to_name, from_name in obsm_store.items(): if par[to_name]: if from_name in adata_out.obsm: - data.obsm[par[to_name]] = adata_out.obsm[from_name] - elif from_name in adata_out.uns and "barcode_indices_for_latents" in adata_out.uns: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): matrix_to_store = adata_out.uns[from_name] number_of_obs = data.X.shape[0] - latent_space_sparse = csr_matrix((number_of_obs, par["z_dim"]), - dtype=adata_out.uns[from_name].dtype) - obs_rows_in_space_representation = adata_out.uns["barcode_indices_for_latents"] - latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[from_name] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = adata_out.uns[ + "barcode_indices_for_latents" + ] + latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[ + from_name + ] data.obsm[par[to_name]] = latent_space_sparse else: - raise RuntimeError("Requested to save latent gene encoding, but the data is either missing " - "from cellbender output or in an incorrect format.") + raise RuntimeError( + "Requested to save latent gene encoding, but the data is either missing " + "from cellbender output or in an incorrect format." 
+        )

     logger.info("Writing to file %s", par["output"])
diff --git a/src/correction/cellbender_remove_background/test.py b/src/correction/cellbender_remove_background/test.py
index a6dee50c67a..bed927091da 100644
--- a/src/correction/cellbender_remove_background/test.py
+++ b/src/correction/cellbender_remove_background/test.py
@@ -4,12 +4,14 @@

 ## VIASH START
 meta = {
-    'executable': 'target/executable/correction/cellbender_remove_background/cellbender_remove_background',
-    'resources_dir': 'resources_test/pbmc_1k_protein_v3'
+    "executable": "target/executable/correction/cellbender_remove_background/cellbender_remove_background",
+    "resources_dir": "resources_test/pbmc_1k_protein_v3",
 }
 ## VIASH END

-file_input = meta["resources_dir"] + "/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+file_input = (
+    meta["resources_dir"] + "/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
+)
 file_output = "output.h5mu"

 print("> Check whether cellbender works when it should be working")
@@ -17,10 +19,14 @@
 # run cellbender
 cmd_pars = [
     meta["executable"],
-    "--input", file_input,
-    "--output", file_output,
-    "--epochs", "5",
-    "--output_compression", "gzip"
+    "--input",
+    file_input,
+    "--output",
+    file_output,
+    "--epochs",
+    "5",
+    "--output_compression",
+    "gzip",
 ]
 # todo: if cuda is available, add --cuda
 out = subprocess.check_output(cmd_pars).decode("utf-8")
@@ -36,4 +42,4 @@
 ], "Output X should only contain Gene Expression vars."

 # check whether ab counts were found
-assert "prot" in data.mod, 'Output should contain data.mod["rna"].'
\ No newline at end of file
+assert "prot" in data.mod, 'Output should contain data.mod["prot"].'
diff --git a/src/correction/cellbender_remove_background_v0_2/helper.py b/src/correction/cellbender_remove_background_v0_2/helper.py
index 479dd56f596..55e0cd36340 100644
--- a/src/correction/cellbender_remove_background_v0_2/helper.py
+++ b/src/correction/cellbender_remove_background_v0_2/helper.py
@@ -11,8 +11,9 @@
 from typing import Dict


-def anndata_from_h5(file: str,
-                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
+def anndata_from_h5(
+    file: str, analyzed_barcodes_only: bool = True
+) -> "anndata.AnnData":
     """Load an output h5 file into an AnnData object for downstream work.
Args: @@ -30,14 +31,19 @@ def anndata_from_h5(file: str, """ d = dict_from_h5(file) - X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')), - shape=d.pop('shape')).transpose().tocsr() + X = ( + sp.csc_matrix( + (d.pop("data"), d.pop("indices"), d.pop("indptr")), shape=d.pop("shape") + ) + .transpose() + .tocsr() + ) # check and see if we have barcode index annotations, and if the file is filtered - barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))] + barcode_key = [k for k in d.keys() if (("barcode" in k) and ("ind" in k))] if len(barcode_key) > 0: max_barcode_ind = d[barcode_key[0]].max() - filtered_file = (max_barcode_ind >= X.shape[0]) + filtered_file = max_barcode_ind >= X.shape[0] else: filtered_file = True @@ -46,63 +52,80 @@ def anndata_from_h5(file: str, # filtered file being read, so we don't need to subset print('Assuming we are loading a "filtered" file that contains only cells.') pass - elif 'barcode_indices_for_latents' in d.keys(): - X = X[d['barcode_indices_for_latents'], :] - d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']] - elif 'barcodes_analyzed_inds' in d.keys(): - X = X[d['barcodes_analyzed_inds'], :] - d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']] + elif "barcode_indices_for_latents" in d.keys(): + X = X[d["barcode_indices_for_latents"], :] + d["barcodes"] = d["barcodes"][d["barcode_indices_for_latents"]] + elif "barcodes_analyzed_inds" in d.keys(): + X = X[d["barcodes_analyzed_inds"], :] + d["barcodes"] = d["barcodes"][d["barcodes_analyzed_inds"]] else: - print('Warning: analyzed_barcodes_only=True, but the key ' - '"barcodes_analyzed_inds" or "barcode_indices_for_latents" ' - 'is missing from the h5 file. ' - 'Will output all barcodes, and proceed as if ' - 'analyzed_barcodes_only=False') + print( + "Warning: analyzed_barcodes_only=True, but the key " + '"barcodes_analyzed_inds" or "barcode_indices_for_latents" ' + "is missing from the h5 file. " + "Will output all barcodes, and proceed as if " + "analyzed_barcodes_only=False" + ) # Construct the anndata object. - adata = anndata.AnnData(X=X, - obs={'barcode': d.pop('barcodes').astype(str)}, - var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys() - else d.pop('name')).astype(str)}, - dtype=X.dtype) - adata.obs.set_index('barcode', inplace=True) - adata.var.set_index('gene_name', inplace=True) + adata = anndata.AnnData( + X=X, + obs={"barcode": d.pop("barcodes").astype(str)}, + var={ + "gene_name": ( + d.pop("gene_names") if "gene_names" in d.keys() else d.pop("name") + ).astype(str) + }, + dtype=X.dtype, + ) + adata.obs.set_index("barcode", inplace=True) + adata.var.set_index("gene_name", inplace=True) # For CellRanger v2 legacy format, "gene_ids" was called "genes"... 
rename this - if 'genes' in d.keys(): - d['id'] = d.pop('genes') + if "genes" in d.keys(): + d["id"] = d.pop("genes") # For purely aesthetic purposes, rename "id" to "gene_id" - if 'id' in d.keys(): - d['gene_id'] = d.pop('id') + if "id" in d.keys(): + d["gene_id"] = d.pop("id") # If genomes are empty, try to guess them based on gene_id - if 'genome' in d.keys(): - if np.array([s.decode() == '' for s in d['genome']]).all(): - if '_' in d['gene_id'][0].decode(): - print('Genome field blank, so attempting to guess genomes based on gene_id prefixes') - d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str) + if "genome" in d.keys(): + if np.array([s.decode() == "" for s in d["genome"]]).all(): + if "_" in d["gene_id"][0].decode(): + print( + "Genome field blank, so attempting to guess genomes based on gene_id prefixes" + ) + d["genome"] = np.array( + [s.decode().split("_")[0] for s in d["gene_id"]], dtype=str + ) # Add other information to the anndata object in the appropriate slot. _fill_adata_slots_automatically(adata, d) # Add a special additional field to .var if it exists. - if 'features_analyzed_inds' in adata.uns.keys(): - adata.var['cellbender_analyzed'] = [True if (i in adata.uns['features_analyzed_inds']) - else False for i in range(adata.shape[1])] + if "features_analyzed_inds" in adata.uns.keys(): + adata.var["cellbender_analyzed"] = [ + True if (i in adata.uns["features_analyzed_inds"]) else False + for i in range(adata.shape[1]) + ] if analyzed_barcodes_only: - for col in adata.obs.columns[adata.obs.columns.str.startswith('barcodes_analyzed') - | adata.obs.columns.str.startswith('barcode_indices')]: + for col in adata.obs.columns[ + adata.obs.columns.str.startswith("barcodes_analyzed") + | adata.obs.columns.str.startswith("barcode_indices") + ]: try: del adata.obs[col] except Exception: pass else: # Add a special additional field to .obs if all barcodes are included. 
- if 'barcodes_analyzed_inds' in adata.uns.keys(): - adata.obs['cellbender_analyzed'] = [True if (i in adata.uns['barcodes_analyzed_inds']) - else False for i in range(adata.shape[0])] + if "barcodes_analyzed_inds" in adata.uns.keys(): + adata.obs["cellbender_analyzed"] = [ + True if (i in adata.uns["barcodes_analyzed_inds"]) else False + for i in range(adata.shape[0]) + ] return adata @@ -133,11 +156,11 @@ def _fill_adata_slots_automatically(adata, d): else: adata.obsm[key] = value elif value.shape[0] == adata.shape[1]: - if value.dtype.name.startswith('bytes'): + if value.dtype.name.startswith("bytes"): adata.var[key] = value.astype(str) else: adata.var[key] = value else: adata.uns[key] = value except Exception: - print('Unable to load data into AnnData: ', key, value, type(value)) \ No newline at end of file + print("Unable to load data into AnnData: ", key, value, type(value)) diff --git a/src/correction/cellbender_remove_background_v0_2/script.py b/src/correction/cellbender_remove_background_v0_2/script.py index 4b23074af65..c6b0bc7d3ee 100644 --- a/src/correction/cellbender_remove_background_v0_2/script.py +++ b/src/correction/cellbender_remove_background_v0_2/script.py @@ -8,9 +8,12 @@ ## VIASH START import muon -file_raw = "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" + +file_raw = ( + "./resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5" +) mdat = muon.read_10x_h5(file_raw) -mdat = mdat[0:100000,] # subsample to reduce computational time +mdat = mdat[0:100000,] # subsample to reduce computational time file_input = "cellbender_remove_background_input.h5mu" mdat.write_h5mu(file_input) @@ -44,16 +47,17 @@ "empty_drop_training_fraction": 0.5, "expected_cells_from_qc": True, "output_compression": "gzip", - "obsm_latent_gene_encoding": "cellbender_latent_gene_encoding" + "obsm_latent_gene_encoding": "cellbender_latent_gene_encoding", } meta = { - 'temp_dir': os.getenv("VIASH_TEMP"), - 'resources_dir': 'src/correction/cellbender_remove_background' + "temp_dir": os.getenv("VIASH_TEMP"), + "resources_dir": "src/correction/cellbender_remove_background", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() from helper import anndata_from_h5 @@ -67,7 +71,9 @@ # with pathlib.Path(meta["temp_dir"]) / "cellbender" as temp_dir: # os.mkdir(temp_dir) -with tempfile.TemporaryDirectory(prefix="cellbender-", dir=meta["temp_dir"]) as temp_dir: +with tempfile.TemporaryDirectory( + prefix="cellbender-", dir=meta["temp_dir"] +) as temp_dir: # construct paths within tempdir input_file = os.path.join(temp_dir, "input.h5ad") output_file = os.path.join(temp_dir, "output.h5") @@ -77,9 +83,12 @@ logger.info("Constructing CellBender command") cmd_pars = [ - "cellbender", "remove-background", - "--input", input_file, - "--output", output_file + "cellbender", + "remove-background", + "--input", + input_file, + "--output", + output_file, ] extra_args = [ @@ -96,22 +105,37 @@ ("--learning-rate", "learning_rate", True), ("--empty-drop-training-fraction", "empty_drop_training_fraction", True), ] - for (flag, name, is_kwarg) in extra_args: + for flag, name, is_kwarg in extra_args: if par[name]: values = par[name] if isinstance(par[name], list) else [par[name]] cmd_pars += [flag] + [str(val) for val in values] if is_kwarg else [flag] if par["expected_cells_from_qc"] and "metrics_cellranger" in data.uns: - assert par["expected_cells"] is None, "If min_counts is defined, expected_cells should be 
undefined" - assert par["total_droplets_included"] is None, "If min_counts is defined, expected_cells should be undefined" + assert ( + par["expected_cells"] is None + ), "If min_counts is defined, expected_cells should be undefined" + assert ( + par["total_droplets_included"] is None + ), "If min_counts is defined, expected_cells should be undefined" met = data.uns["metrics_cellranger"] col_name = "Estimated Number of Cells" - assert col_name in met.columns, "%s should be a column in .obs[metrics_cellranger]" + assert ( + col_name in met.columns + ), "%s should be a column in .obs[metrics_cellranger]" est_cells = met[col_name].values[0] - logger.info("Selecting --expected-cells %d and --total-droplets-included %d", est_cells, est_cells * 5) - cmd_pars += ["--expected-cells", str(est_cells), "--total-droplets-included", str(5*est_cells)] - - logger.info("Running CellBender: '%s'", ' '.join(cmd_pars)) + logger.info( + "Selecting --expected-cells %d and --total-droplets-included %d", + est_cells, + est_cells * 5, + ) + cmd_pars += [ + "--expected-cells", + str(est_cells), + "--total-droplets-included", + str(5 * est_cells), + ] + + logger.info("Running CellBender: '%s'", " ".join(cmd_pars)) out = subprocess.check_output(cmd_pars).decode("utf-8") logger.info("Reading CellBender 10xh5 output file: '%s'", output_file) @@ -127,42 +151,57 @@ obs_store = { "obs_latent_rt_efficiency": "latent_RT_efficiency", "obs_latent_cell_probability": "latent_cell_probability", - "obs_latent_scale": "latent_scale" + "obs_latent_scale": "latent_scale", } for to_name, from_name in obs_store.items(): if par[to_name]: if from_name in adata_out.obs: data.obs[par[to_name]] = adata_out.obs[from_name] # when using unfiltered data, the values will be in uns instead of obs - elif from_name in adata_out.uns and 'barcode_indices_for_latents' in adata_out.uns: + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): vec = np.zeros(data.n_obs) - vec[adata_out.uns['barcode_indices_for_latents']] = adata_out.uns[from_name] + vec[adata_out.uns["barcode_indices_for_latents"]] = adata_out.uns[ + from_name + ] data.obs[par[to_name]] = vec logger.info("Copying .var output to MuData") - var_store = { "var_ambient_expression": "ambient_expression" } + var_store = {"var_ambient_expression": "ambient_expression"} for to_name, from_name in var_store.items(): if par[to_name]: data.var[par[to_name]] = adata_out.var[from_name] logger.info("Copying obsm_latent_gene_encoding output to MuData") - obsm_store = { "obsm_latent_gene_encoding": "latent_gene_encoding" } + obsm_store = {"obsm_latent_gene_encoding": "latent_gene_encoding"} for to_name, from_name in obsm_store.items(): if par[to_name]: if from_name in adata_out.obsm: - data.obsm[par[to_name]] = adata_out.obsm[from_name] - elif from_name in adata_out.uns and 'barcode_indices_for_latents' in adata_out.uns: + data.obsm[par[to_name]] = adata_out.obsm[from_name] + elif ( + from_name in adata_out.uns + and "barcode_indices_for_latents" in adata_out.uns + ): matrix_to_store = adata_out.uns[from_name] number_of_obs = data.X.shape[0] - latent_space_sparse = csr_matrix((number_of_obs, par['z_dim']), - dtype=adata_out.uns[from_name].dtype) - obs_rows_in_space_representation = adata_out.uns['barcode_indices_for_latents'] - latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[from_name] + latent_space_sparse = csr_matrix( + (number_of_obs, par["z_dim"]), dtype=adata_out.uns[from_name].dtype + ) + obs_rows_in_space_representation = 
adata_out.uns[
+                    "barcode_indices_for_latents"
+                ]
+                latent_space_sparse[obs_rows_in_space_representation] = adata_out.uns[
+                    from_name
+                ]
                 data.obsm[par[to_name]] = latent_space_sparse
             else:
-                raise RuntimeError("Requested to save latent gene encoding, but the data is either missing "
-                                   "from cellbender output or in an incorrect format.")
+                raise RuntimeError(
+                    "Requested to save latent gene encoding, but the data is either missing "
+                    "from cellbender output or in an incorrect format."
+                )

 logger.info("Writing to file %s", par["output"])
-mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
\ No newline at end of file
+mdata.write_h5mu(filename=par["output"], compression=par["output_compression"])
diff --git a/src/correction/cellbender_remove_background_v0_2/test.py b/src/correction/cellbender_remove_background_v0_2/test.py
index 045d1b80cf4..4bd05063d9c 100644
--- a/src/correction/cellbender_remove_background_v0_2/test.py
+++ b/src/correction/cellbender_remove_background_v0_2/test.py
@@ -1,34 +1,38 @@
-import subprocess
 from os import path
 import muon as mu
 import pytest

 ## VIASH START
 meta = {
-    'executable': 'target/executable/correction/cellbender_remove_background/cellbender_remove_background',
-    'resources_dir': 'resources_test/pbmc_1k_protein_v3'
+    "executable": "target/executable/correction/cellbender_remove_background/cellbender_remove_background",
+    "resources_dir": "resources_test/pbmc_1k_protein_v3",
 }
 ## VIASH END

 file_raw = meta["resources_dir"] + "/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"

+
 @pytest.fixture
 def subsampled_input(write_mudata_to_file):
     mdat = mu.read_10x_h5(file_raw)
     mdat = mdat[0:100000,]
     return write_mudata_to_file(mdat)

-def test_run(run_component, random_h5mu_path, subsampled_input): 

+def test_run(run_component, random_h5mu_path, subsampled_input):
     print("> Check whether cellbender works when it should be working")

     # run cellbender
     output_file = random_h5mu_path()
     cmd_pars = [
-        "--input", subsampled_input,
-        "--output", output_file,
-        "--epochs", "5",
-        "--output_compression", "gzip"
+        "--input",
+        subsampled_input,
+        "--output",
+        output_file,
+        "--epochs",
+        "5",
+        "--output_compression",
+        "gzip",
     ]
     # todo: if cuda is available, add --cuda
     run_component(cmd_pars)
@@ -45,4 +49,4 @@
     ], "Output X should only contain Gene Expression vars."

     # check whether ab counts were found
-    assert "prot" in data.mod, 'Output should contain data.mod["rna"].'
\ No newline at end of file
+    assert "prot" in data.mod, 'Output should contain data.mod["prot"].'
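Both cellbender scripts above rely on the same scatter pattern: per-cell values that CellBender stores in .uns (aligned to uns["barcode_indices_for_latents"]) are projected back onto the full set of barcodes. A minimal standalone sketch of that pattern, not part of the patch and with made-up dimensions:

import numpy as np
from scipy.sparse import lil_matrix

n_obs, z_dim = 6, 3  # hypothetical: total barcode count and latent dimension
analyzed = np.array([1, 3, 4])  # stand-in for uns["barcode_indices_for_latents"]
latents = np.random.rand(len(analyzed), z_dim)  # stand-in for the latent encoding

# scatter a per-cell scalar (e.g. a cell probability) onto all barcodes
vec = np.zeros(n_obs)
vec[analyzed] = np.random.rand(len(analyzed))

# scatter the latent matrix into a sparse (n_obs x z_dim) matrix; building it
# as lil_matrix first avoids scipy's SparseEfficiencyWarning on row assignment
latent_space = lil_matrix((n_obs, z_dim), dtype=latents.dtype)
latent_space[analyzed] = latents
latent_space = latent_space.tocsr()

The scripts assign rows into a csr_matrix directly, which also works; the lil_matrix detour shown here is an alternative, not what the patch does.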
diff --git a/src/dataflow/concatenate_h5mu/script.py b/src/dataflow/concatenate_h5mu/script.py
index c74d2681122..9b59eaaa74e 100644
--- a/src/dataflow/concatenate_h5mu/script.py
+++ b/src/dataflow/concatenate_h5mu/script.py
@@ -13,18 +13,17 @@

 ### VIASH START
 par = {
-    "input": ["resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
-              "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu"],
+    "input": [
+        "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+        "resources_test/concat_test_data/human_brain_3k_filtered_feature_bc_matrix_subset_unique_obs.h5mu",
+    ],
     "output": "foo.h5mu",
     "input_id": ["mouse", "human"],
     "other_axis_mode": "move",
     "output_compression": "gzip",
     "uns_merge_mode": "make_unique",
 }
-meta = {
-    "cpus": 10,
-    "resources_dir": "resources_test/"
-}
+meta = {"cpus": 10, "resources_dir": "resources_test/"}
 ### VIASH END

 sys.path.append(meta["resources_dir"])
@@ -33,11 +32,13 @@

 logger = setup_logger()

+
 def nunique(row):
     unique = pd.unique(row)
     unique_without_na = pd.core.dtypes.missing.remove_na_arraylike(unique)
     return len(unique_without_na) > 1

+
 def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) -> bool:
     """
     Check if any row contains duplicate values that are not NA.
@@ -47,27 +48,32 @@ def any_row_contains_duplicate_values(n_processes: int, frame: pd.DataFrame) ->
         is_duplicated = pool.map(nunique, iter(numpy_array))
     return any(is_duplicated)

-def concatenate_matrices(n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index) \
-    -> tuple[dict[str, pd.DataFrame], pd.DataFrame | None, dict[str, pd.core.dtypes.dtypes.Dtype]]:
+
+def concatenate_matrices(
+    n_processes: int, matrices: dict[str, pd.DataFrame], align_to: pd.Index
+) -> tuple[dict[str, pd.DataFrame], pd.DataFrame]:
     """
     Merge matrices by combining columns that have the same name.
     Columns that contain conflicting values (i.e. the columns have different
     values) are not merged, but instead moved to a new dataframe.
""" column_names = set(column_name for var in matrices.values() for column_name in var) - logger.debug('Trying to concatenate columns: %s.', ",".join(column_names)) + logger.debug("Trying to concatenate columns: %s.", ",".join(column_names)) if not column_names: return {}, pd.DataFrame(index=align_to) - conflicts, concatenated_matrix = \ - split_conflicts_and_concatenated_columns(n_processes, - matrices, - column_names, - align_to) + conflicts, concatenated_matrix = split_conflicts_and_concatenated_columns( + n_processes, matrices, column_names, align_to + ) concatenated_matrix = cast_to_writeable_dtype(concatenated_matrix) - conflicts = {conflict_name: cast_to_writeable_dtype(conflict_df) - for conflict_name, conflict_df in conflicts.items()} + conflicts = { + conflict_name: cast_to_writeable_dtype(conflict_df) + for conflict_name, conflict_df in conflicts.items() + } return conflicts, concatenated_matrix + def get_first_non_na_value_vector(df): numpy_arr = df.to_numpy() n_rows, n_cols = numpy_arr.shape @@ -75,6 +81,7 @@ def get_first_non_na_value_vector(df): flat_index = n_cols * np.arange(n_rows) + col_index return pd.Series(numpy_arr.ravel()[flat_index], index=df.index, name=df.columns[0]) + def make_uns_keys_unique(mod_data, concatenated_data): """ Check if the uns keys across samples are unique before adding them @@ -90,18 +97,20 @@ def make_uns_keys_unique(mod_data, concatenated_data): if len(samples_ids) == 1: sample_id = samples_ids[0] concatenated_data.uns[uns_key] = mod_data[sample_id].uns[uns_key] - else: + else: for sample_id in samples_ids: - concatenated_data.uns[f"{sample_id}_{uns_key}"] = \ - mod_data[sample_id].uns[uns_key] + concatenated_data.uns[f"{sample_id}_{uns_key}"] = mod_data[ + sample_id + ].uns[uns_key] return concatenated_data -def split_conflicts_and_concatenated_columns(n_processes: int, - matrices: dict[str, pd.DataFrame], - column_names: Iterable[str], - align_to: pd.Index) -> \ - tuple[dict[str, pd.DataFrame], pd.DataFrame]: +def split_conflicts_and_concatenated_columns( + n_processes: int, + matrices: dict[str, pd.DataFrame], + column_names: Iterable[str], + align_to: pd.Index, +) -> tuple[dict[str, pd.DataFrame], pd.DataFrame]: """ Retrieve columns with the same name from a list of dataframes which are identical across all the frames (ignoring NA values). @@ -112,50 +121,65 @@ def split_conflicts_and_concatenated_columns(n_processes: int, conflicts = {} concatenated_matrix = [] for column_name in column_names: - columns = {input_id: var[column_name] - for input_id, var in matrices.items() - if column_name in var} + columns = { + input_id: var[column_name] + for input_id, var in matrices.items() + if column_name in var + } assert columns, "Some columns should have been found." 
-        concatenated_columns = pd.concat(columns.values(), axis=1,
-                                         join="outer", sort=False)
+        concatenated_columns = pd.concat(
+            columns.values(), axis=1, join="outer", sort=False
+        )
         if any_row_contains_duplicate_values(n_processes, concatenated_columns):
-            concatenated_columns.columns = columns.keys() # Use the sample id as column name
+            concatenated_columns.columns = (
+                columns.keys()
+            )  # Use the sample id as column name
             concatenated_columns = concatenated_columns.reindex(align_to, copy=False)
-            conflicts[f'conflict_{column_name}'] = concatenated_columns
+            conflicts[f"conflict_{column_name}"] = concatenated_columns
         else:
             unique_values = get_first_non_na_value_vector(concatenated_columns)
             concatenated_matrix.append(unique_values)
     if not concatenated_matrix:
         return conflicts, pd.DataFrame(index=align_to)
-    concatenated_matrix = pd.concat(concatenated_matrix, join="outer",
-                                    axis=1, sort=False)
+    concatenated_matrix = pd.concat(
+        concatenated_matrix, join="outer", axis=1, sort=False
+    )
     concatenated_matrix = concatenated_matrix.reindex(align_to, copy=False)
     return conflicts, concatenated_matrix

+
 def cast_to_writeable_dtype(result: pd.DataFrame) -> pd.DataFrame:
     """
     Cast the dataframe to dtypes that can be written by mudata.
-    """ 
+    """
     # dtype inference works better with np.nan
     result = result.replace({pd.NA: np.nan})

     # MuData supports nullable booleans and ints
     # i.e. `IntegerArray` and `BooleanArray`
-    result = result.convert_dtypes(infer_objects=True,
-                                   convert_integer=True,
-                                   convert_string=False,
-                                   convert_boolean=True,
-                                   convert_floating=False)
-
+    result = result.convert_dtypes(
+        infer_objects=True,
+        convert_integer=True,
+        convert_string=False,
+        convert_boolean=True,
+        convert_floating=False,
+    )
+
     # Convert leftover 'object' columns to string
     # However, NA values are supported, so convert all values except NAs to string
-    object_cols = result.select_dtypes(include='object').columns.values
+    object_cols = result.select_dtypes(include="object").columns.values
     for obj_col in object_cols:
-        result[obj_col] = result[obj_col].where(result[obj_col].isna(), result[obj_col].astype(str)).astype('category')
+        result[obj_col] = (
+            result[obj_col]
+            .where(result[obj_col].isna(), result[obj_col].astype(str))
+            .astype("category")
+        )
     return result

-def split_conflicts_modalities(n_processes: int, samples: dict[str, anndata.AnnData], output: anndata.AnnData) \
-    -> anndata.AnnData:
+
+def split_conflicts_modalities(
+    n_processes: int, samples: dict[str, anndata.AnnData], output: anndata.AnnData
+) -> anndata.AnnData:
     """
     Merge .var and .obs matrices of the anndata objects. Columns are merged
     when the values (excl NA) are the same in each of the matrices.
@@ -164,17 +188,26 @@ def split_conflicts_modalities(n_processes: int, samples: dict[str, anndata.AnnD
     """
     matrices_to_parse = ("var", "obs")
     for matrix_name in matrices_to_parse:
-        matrices = {sample_id: getattr(sample, matrix_name) for sample_id, sample in samples.items()}
-        output_index = getattr(output, matrix_name).index
-        conflicts, concatenated_matrix = concatenate_matrices(n_processes, matrices, output_index)
+        matrices = {
+            sample_id: getattr(sample, matrix_name)
+            for sample_id, sample in samples.items()
+        }
+        output_index = getattr(output, matrix_name).index
+        conflicts, concatenated_matrix = concatenate_matrices(
+            n_processes, matrices, output_index
+        )
         if concatenated_matrix.empty:
-            concatenated_matrix.index = output_index
+            concatenated_matrix.index = output_index

         # Even though we did not touch the varm and obsm matrices that were already present,
         # the joining of observations might have caused a dtype change in these matrices as well
         # so these also need to be cast to a writable dtype...
         for multidim_name, multidim_data in getattr(output, f"{matrix_name}m").items():
-            new_data = cast_to_writeable_dtype(multidim_data) if isinstance(multidim_data, pd.DataFrame) else multidim_data
+            new_data = (
+                cast_to_writeable_dtype(multidim_data)
+                if isinstance(multidim_data, pd.DataFrame)
+                else multidim_data
+            )
             getattr(output, f"{matrix_name}m")[multidim_name] = new_data

         # Write the conflicts to the output
@@ -187,19 +220,22 @@ def split_conflicts_modalities(n_processes: int, samples: dict[str, anndata.AnnD
     return output

-def concatenate_modality(n_processes: int, mod: str | None, input_files: Iterable[str | Path],
-                         other_axis_mode: str, uns_merge_mode: str, input_ids: tuple[str]) -> anndata.AnnData:
-
+
+def concatenate_modality(
+    n_processes: int,
+    mod: str | None,
+    input_files: Iterable[str | Path],
+    other_axis_mode: str,
+    uns_merge_mode: str,
+    input_ids: tuple[str],
+) -> anndata.AnnData:
     concat_modes = {
         "move": "unique",
     }
     other_axis_mode_to_apply = concat_modes.get(other_axis_mode, other_axis_mode)
-    uns_merge_modes = {
-        "make_unique": None
-    }
+    uns_merge_modes = {"make_unique": None}
     uns_merge_mode_to_apply = uns_merge_modes.get(uns_merge_mode, uns_merge_mode)
-    
+
     mod_data = {}
     mod_indices_combined = pd.Index([])
     for input_id, input_file in zip(input_ids, input_files):
@@ -208,64 +244,97 @@ def concatenate_modality(n_processes: int, mod: str | None, input_files: Iterabl
             data = mu.read_h5ad(input_file, mod=mod)
             mod_data[input_id] = data
             mod_indices_combined = mod_indices_combined.append(data.obs.index)
-        except KeyError as e: # Modality does not exist for this sample, skip it
-            if f"Unable to synchronously open object (object '{mod}' doesn't exist)" not in str(e):
+        except KeyError as e:  # Modality does not exist for this sample, skip it
+            if (
+                f"Unable to synchronously open object (object '{mod}' doesn't exist)"
+                not in str(e)
+            ):
                 raise e
             pass
-        else: # When mod=None, process the 'global' h5mu state
-            with H5File(input_file, 'r') as input_h5:
-                if "uns" in input_h5.keys():
-                    uns_data = anndata.experimental.read_elem(input_h5['uns'])
+        else:  # When mod=None, process the 'global' h5mu state
+            with H5File(input_file, "r") as input_h5:
+                if "uns" in input_h5.keys():
+                    uns_data = anndata.experimental.read_elem(input_h5["uns"])
                     if uns_data:
                         mod_data[input_id] = anndata.AnnData(uns=uns_data)
-    
+
     if not mod_indices_combined.is_unique:
         raise ValueError("Observations are not unique across samples.")
-    
+
     if not mod_data:
         return anndata.AnnData()
-    
-    concatenated_data = 
anndata.concat(mod_data.values(), join='outer', - merge=other_axis_mode_to_apply, - uns_merge=uns_merge_mode_to_apply) + + concatenated_data = anndata.concat( + mod_data.values(), + join="outer", + merge=other_axis_mode_to_apply, + uns_merge=uns_merge_mode_to_apply, + ) if other_axis_mode == "move": - concatenated_data = split_conflicts_modalities(n_processes, mod_data, concatenated_data) + concatenated_data = split_conflicts_modalities( + n_processes, mod_data, concatenated_data + ) if uns_merge_mode == "make_unique": concatenated_data = make_uns_keys_unique(mod_data, concatenated_data) return concatenated_data -def concatenate_modalities(n_processes: int, modalities: list[str], input_files: Path | str, - other_axis_mode: str, uns_merge_mode: str, output_file: Path | str, - compression: Literal['gzip'] | Literal['lzf'], - input_ids: tuple[str] | None = None) -> None: + +def concatenate_modalities( + n_processes: int, + modalities: list[str], + input_files: Path | str, + other_axis_mode: str, + uns_merge_mode: str, + output_file: Path | str, + compression: Literal["gzip"] | Literal["lzf"], + input_ids: tuple[str] | None = None, +) -> None: """ Join the modalities together into a single multimodal sample. """ - logger.info('Concatenating samples.') - output_file, input_files = Path(output_file), [Path(input_file) for input_file in input_files] - output_file_uncompressed = output_file.with_name(output_file.stem + "_uncompressed.h5mu") + logger.info("Concatenating samples.") + output_file, input_files = ( + Path(output_file), + [Path(input_file) for input_file in input_files], + ) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) output_file_uncompressed.touch() # Create empty mudata file mdata = mu.MuData({modality: anndata.AnnData() for modality in modalities}) mdata.write(output_file_uncompressed, compression=compression) # Use "None" for the global slots (not assigned to any modality) - for mod_name in modalities + [None,]: - new_mod = concatenate_modality(n_processes, mod_name, - input_files, other_axis_mode, - uns_merge_mode, input_ids) + for mod_name in modalities + [ + None, + ]: + new_mod = concatenate_modality( + n_processes, + mod_name, + input_files, + other_axis_mode, + uns_merge_mode, + input_ids, + ) if mod_name is None: if new_mod.uns: - with H5File(output_file_uncompressed, 'r+') as open_h5mu_file: - anndata.experimental.write_elem(open_h5mu_file, "uns", dict(new_mod.uns)) + with H5File(output_file_uncompressed, "r+") as open_h5mu_file: + anndata.experimental.write_elem( + open_h5mu_file, "uns", dict(new_mod.uns) + ) continue - logger.info("Writing out modality '%s' to '%s' with compression '%s'.", - mod_name, output_file_uncompressed, compression) + logger.info( + "Writing out modality '%s' to '%s' with compression '%s'.", + mod_name, + output_file_uncompressed, + compression, + ) mu.write_h5ad(output_file_uncompressed, data=new_mod, mod=mod_name) - + if compression: compress_h5mu(output_file_uncompressed, output_file, compression=compression) output_file_uncompressed.unlink() @@ -274,12 +343,13 @@ def concatenate_modalities(n_processes: int, modalities: list[str], input_files: logger.info("Concatenation successful.") + def main() -> None: # Get a list of all possible modalities mods = set() for path in par["input"]: try: - with H5File(path, 'r') as f_root: + with H5File(path, "r") as f_root: mods = mods | set(f_root["mod"].keys()) except OSError: raise OSError(f"Failed to load {path}. 
Is it a valid h5 file?") @@ -288,26 +358,29 @@ def main() -> None: if par["input_id"]: input_ids: tuple[str] = tuple(i.strip() for i in par["input_id"]) if len(input_ids) != len(par["input"]): - raise ValueError("The number of sample names must match the number of sample files.") + raise ValueError( + "The number of sample names must match the number of sample files." + ) if len(set(input_ids)) != len(input_ids): raise ValueError("The sample names should be unique.") - logger.info("\nConcatenating data from paths:\n\t%s", - "\n\t".join(par["input"])) + logger.info("\nConcatenating data from paths:\n\t%s", "\n\t".join(par["input"])) if par["other_axis_mode"] == "move" and not input_ids: raise ValueError("--mode 'move' requires --input_ids.") n_processes = meta["cpus"] if meta["cpus"] else 1 - concatenate_modalities(n_processes, - list(mods), - par["input"], - par["other_axis_mode"], - par["uns_merge_mode"], - par["output"], - par["output_compression"], - input_ids=input_ids) + concatenate_modalities( + n_processes, + list(mods), + par["input"], + par["other_axis_mode"], + par["uns_merge_mode"], + par["output"], + par["output_compression"], + input_ids=input_ids, + ) if __name__ == "__main__": diff --git a/src/dataflow/concatenate_h5mu/test.py b/src/dataflow/concatenate_h5mu/test.py index 3f7635670d1..19ef59d44fe 100644 --- a/src/dataflow/concatenate_h5mu/test.py +++ b/src/dataflow/concatenate_h5mu/test.py @@ -12,14 +12,14 @@ ## VIASH START meta = { - 'executable': './target/docker/dataflow/concatenate_h5mu/concatenate_h5mu', - 'resources_dir': './resources_test/concat_test_data/', - 'cpus': 2, - 'config': './src/dataflow/concatenate_h5mu/config.vsh.yaml' + "executable": "./target/docker/dataflow/concatenate_h5mu/concatenate_h5mu", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/dataflow/concatenate_h5mu/config.vsh.yaml", } ## VIASH END -meta['cpus'] = 1 if not meta['cpus'] else meta['cpus'] +meta["cpus"] = 1 if not meta["cpus"] else meta["cpus"] @pytest.fixture @@ -41,19 +41,38 @@ def sample_1_modality_1(): [4, 5, 6]]) """ - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], - columns=["var1", "var2", "overlapping_var_mod1"]) - obs = pd.DataFrame([["A", "B"], ["C", "D"]], index=df.index, - columns=["Obs1", "Shared_obs"]) - var = pd.DataFrame([["a", "b"], ["c", "d"], ["e", "f"]], - index=df.columns, columns=["Feat1", "Shared_feat"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["obs1", "obs2"], + columns=["var1", "var2", "overlapping_var_mod1"], + ) + obs = pd.DataFrame( + [["A", "B"], ["C", "D"]], index=df.index, columns=["Obs1", "Shared_obs"] + ) + var = pd.DataFrame( + [["a", "b"], ["c", "d"], ["e", "f"]], + index=df.columns, + columns=["Feat1", "Shared_feat"], + ) varm = np.random.rand(df.columns.size, 5) - ad1 = ad.AnnData(df, obs=obs, var=var, varm={"random_vals_mod1": varm}, - uns={"uns_unique_to_sample1": pd.DataFrame(["foo"], index=["bar"], columns=["col1"]), - "overlapping_uns_key": pd.DataFrame(["jing"], index=["jang"], columns=["col2"])}) - #ad1 = ad.AnnData(df, obs=obs, var=var) + ad1 = ad.AnnData( + df, + obs=obs, + var=var, + varm={"random_vals_mod1": varm}, + uns={ + "uns_unique_to_sample1": pd.DataFrame( + ["foo"], index=["bar"], columns=["col1"] + ), + "overlapping_uns_key": pd.DataFrame( + ["jing"], index=["jang"], columns=["col2"] + ), + }, + ) + # ad1 = ad.AnnData(df, obs=obs, var=var) return ad1 + @pytest.fixture def sample_1_input_modality_2(): """ @@ -72,23 +91,33 @@ def sample_1_input_modality_2(): 
Feat2 Shared_feat var4 d e var5 f g - - """ - df = pd.DataFrame([[7, 8], [9, 10], [11, 12]], index=["obs3", "obs4", "obs5"], - columns=["var3", "var4"]) - obs = pd.DataFrame([["E", "F", "G"], ["H", "I", "J"], ["K", "L", "M"]], - index=df.index, columns=["Obs2", "Obs3", "Shared_obs"]) - var = pd.DataFrame([["d", "e"], ["f", "g"]], index=df.columns, - columns=["Feat2", "Shared_feat"]) + + """ + df = pd.DataFrame( + [[7, 8], [9, 10], [11, 12]], + index=["obs3", "obs4", "obs5"], + columns=["var3", "var4"], + ) + obs = pd.DataFrame( + [["E", "F", "G"], ["H", "I", "J"], ["K", "L", "M"]], + index=df.index, + columns=["Obs2", "Obs3", "Shared_obs"], + ) + var = pd.DataFrame( + [["d", "e"], ["f", "g"]], index=df.columns, columns=["Feat2", "Shared_feat"] + ) ad2 = ad.AnnData(df, obs=obs, var=var) return ad2 + @pytest.fixture def sample_1_h5mu(sample_1_modality_1, sample_1_input_modality_2): - tmp_mudata = md.MuData({'mod1': sample_1_modality_1, - 'mod2': sample_1_input_modality_2}) + tmp_mudata = md.MuData( + {"mod1": sample_1_modality_1, "mod2": sample_1_input_modality_2} + ) return tmp_mudata + @pytest.fixture def sample_2_modality_1(): """ @@ -108,25 +137,42 @@ def sample_2_modality_1(): obs7 R S T obs8 U V W """ - df = pd.DataFrame([[13, 14], [15, 16], [17, 18]], - index=["obs6", "obs7", "obs8"], - columns=["var5", "overlapping_var_mod1"]) - obs = pd.DataFrame([["O", "P", "Q"], ["R", "S", "T"], ["U", "V", "W"]], - index=df.index, columns=["Obs4", "Obs5", "Shared_obs"]) - var = pd.DataFrame([["h", "i"], ["j", "k"]], index=df.columns, - columns=["Feat3", "Shared_feat"]) - ad3 = ad.AnnData(df, obs=obs, var=var, - uns={"uns_unique_to_sample2": pd.DataFrame(["baz"], index=["qux"], columns=["col3"]), - "overlapping_uns_key": pd.DataFrame(["ping"], index=["pong"], columns=["col4"])}) + df = pd.DataFrame( + [[13, 14], [15, 16], [17, 18]], + index=["obs6", "obs7", "obs8"], + columns=["var5", "overlapping_var_mod1"], + ) + obs = pd.DataFrame( + [["O", "P", "Q"], ["R", "S", "T"], ["U", "V", "W"]], + index=df.index, + columns=["Obs4", "Obs5", "Shared_obs"], + ) + var = pd.DataFrame( + [["h", "i"], ["j", "k"]], index=df.columns, columns=["Feat3", "Shared_feat"] + ) + ad3 = ad.AnnData( + df, + obs=obs, + var=var, + uns={ + "uns_unique_to_sample2": pd.DataFrame( + ["baz"], index=["qux"], columns=["col3"] + ), + "overlapping_uns_key": pd.DataFrame( + ["ping"], index=["pong"], columns=["col4"] + ), + }, + ) return ad3 + @pytest.fixture def sample_2_modality_2(): """ >>> ad4.X array([[19, 20, 21], [22, 23, 24]]) - + >>> ad4.obs Obs6 Shared_obs obs8 X Y @@ -138,12 +184,19 @@ def sample_2_modality_2(): var7 n o var8 p q """ - df = pd.DataFrame([[19, 20, 21], [22, 23, 24]], index=["obs8", "obs9"], - columns=["var6", "var7", "var8"]) - obs = pd.DataFrame([["X", "Y"], ["Z", "AA"]], index=df.index, - columns=["Obs6", "Shared_obs"]) - var = pd.DataFrame([["l", "m"], ["n", "o"], ["p", "q"]], - index=df.columns, columns=["Feat4", "Shared_feat"]) + df = pd.DataFrame( + [[19, 20, 21], [22, 23, 24]], + index=["obs8", "obs9"], + columns=["var6", "var7", "var8"], + ) + obs = pd.DataFrame( + [["X", "Y"], ["Z", "AA"]], index=df.index, columns=["Obs6", "Shared_obs"] + ) + var = pd.DataFrame( + [["l", "m"], ["n", "o"], ["p", "q"]], + index=df.columns, + columns=["Feat4", "Shared_feat"], + ) varm = np.random.rand(df.columns.size, 3) ad4 = ad.AnnData(df, obs=obs, var=var, varm={"random_vals_mod2": varm}) # ad4 = ad.AnnData(df, obs=obs, var=var) @@ -152,9 +205,10 @@ def sample_2_modality_2(): @pytest.fixture def 
sample_2_h5mu(sample_2_modality_1, sample_2_modality_2): - tmp_mudata = md.MuData({'mod1': sample_2_modality_1, 'mod2': sample_2_modality_2}) + tmp_mudata = md.MuData({"mod1": sample_2_modality_1, "mod2": sample_2_modality_2}) return tmp_mudata + @pytest.fixture def sample_3_modality_1(): """ @@ -176,13 +230,14 @@ def sample_3_modality_1(): ad5 = ad.AnnData(df, obs=obs, var=var) return ad5 + @pytest.fixture def sample_3_modality_3(): """ >>> ad6.X array([[ 26, 32, 33, 453], [ 34, 35, 36, 543]]) - + >>> ad6.var Feat5 Feat6 Feat7 Feat8 var10 s t u v @@ -195,28 +250,41 @@ def sample_3_modality_3(): obs11 AC AD AE AF obs12 AG AH AI AJ """ - df = pd.DataFrame([[26, 32, 33, 453], [34, 35, 36, 543]], - index=["obs11", "obs12"], - columns=["var10", "var11", "var12", "var13"]) - obs = pd.DataFrame([["AC", "AD", "AE", "AF"], ["AG", "AH", "AI", "AJ"]], - index=df.index, columns=["Obs8", "Obs9", "obs10", "obs11"]) - var = pd.DataFrame([["s", "t", "u", "v"], - ["w", "x", "y", "z"], - ["aa", "ab", "ac", "ad"], - ["ae", "af", "ag", "ah"]], - index=df.columns, columns=["Feat5", "Feat6", "Feat7", "Feat8"]) + df = pd.DataFrame( + [[26, 32, 33, 453], [34, 35, 36, 543]], + index=["obs11", "obs12"], + columns=["var10", "var11", "var12", "var13"], + ) + obs = pd.DataFrame( + [["AC", "AD", "AE", "AF"], ["AG", "AH", "AI", "AJ"]], + index=df.index, + columns=["Obs8", "Obs9", "obs10", "obs11"], + ) + var = pd.DataFrame( + [ + ["s", "t", "u", "v"], + ["w", "x", "y", "z"], + ["aa", "ab", "ac", "ad"], + ["ae", "af", "ag", "ah"], + ], + index=df.columns, + columns=["Feat5", "Feat6", "Feat7", "Feat8"], + ) ad6 = ad.AnnData(df, obs=obs, var=var) return ad6 + @pytest.fixture def sample_3_h5mu(sample_3_modality_1, sample_3_modality_3): - tmp_mudata = md.MuData({'mod1': sample_3_modality_1, 'mod3': sample_3_modality_3}) + tmp_mudata = md.MuData({"mod1": sample_3_modality_1, "mod3": sample_3_modality_3}) return tmp_mudata + @pytest.fixture def wrap_anndata_to_mudata(): def wrapper(anndata_obj, mod_name="mod"): return md.MuData({mod_name: anndata_obj}) + return wrapper @@ -234,53 +302,74 @@ def wrapper(mudata_obj, annotation_frame_name, column_name, values_per_modality) mudata_obj.update() global_annotation_frame = get_frame(mudata_obj) if column_name in global_annotation_frame.columns: - updated_global_column = pd.concat(modality_columns, copy=True, join='inner') - no_duplicates = updated_global_column.reset_index().drop_duplicates(subset=['index']) - no_duplicates = no_duplicates.set_index('index') - global_annotation_frame[column_name] = no_duplicates - setattr(mudata_obj, annotation_frame_name, - global_annotation_frame.convert_dtypes(infer_objects=True, - convert_integer=True, - convert_string=False, - convert_boolean=True, - convert_floating=False) - ) + updated_global_column = pd.concat(modality_columns, copy=True, join="inner") + no_duplicates = updated_global_column.reset_index().drop_duplicates( + subset=["index"] + ) + no_duplicates = no_duplicates.set_index("index") + global_annotation_frame[column_name] = no_duplicates + setattr( + mudata_obj, + annotation_frame_name, + global_annotation_frame.convert_dtypes( + infer_objects=True, + convert_integer=True, + convert_string=False, + convert_boolean=True, + convert_floating=False, + ), + ) return wrapper -def test_concatenate_samples_with_same_observation_ids_raises(run_component, wrap_anndata_to_mudata, - write_mudata_to_file, sample_1_modality_1, - sample_2_modality_1, random_h5mu_path): +def test_concatenate_samples_with_same_observation_ids_raises( + 
run_component, + wrap_anndata_to_mudata, + write_mudata_to_file, + sample_1_modality_1, + sample_2_modality_1, + random_h5mu_path, +): """ Test how concat handles overlapping observation IDs. This should raise. """ # introduce an overlapping observation input_1_mudata = wrap_anndata_to_mudata(sample_1_modality_1) - old_obs_names = sample_2_modality_1.obs_names - new_obs_names = old_obs_names.where(old_obs_names.isin([old_obs_names[0]]), - sample_1_modality_1.obs.index[0]) + old_obs_names = sample_2_modality_1.obs_names + new_obs_names = old_obs_names.where( + old_obs_names.isin([old_obs_names[0]]), sample_1_modality_1.obs.index[0] + ) sample_2_modality_1.obs_names = new_obs_names input_2_mudata = wrap_anndata_to_mudata(sample_2_modality_1) - + with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input_id", "foo;bar", - "--input", write_mudata_to_file(input_1_mudata), - "--input", write_mudata_to_file(input_2_mudata), - "--output", random_h5mu_path(), - "--other_axis_mode", "move", - "--output_compression", "gzip" - ]) - assert "ValueError: Observations are not unique across samples." in \ - err.value.stdout.decode('utf-8') - -def test_concat_different_var_columns_per_sample(run_component, - sample_1_h5mu, - sample_2_h5mu, - random_h5mu_path, - write_mudata_to_file): + run_component( + [ + "--input_id", + "foo;bar", + "--input", + write_mudata_to_file(input_1_mudata), + "--input", + write_mudata_to_file(input_2_mudata), + "--output", + random_h5mu_path(), + "--other_axis_mode", + "move", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: Observations are not unique across samples." + in err.value.stdout.decode("utf-8") + ) + + +def test_concat_different_var_columns_per_sample( + run_component, sample_1_h5mu, sample_2_h5mu, random_h5mu_path, write_mudata_to_file +): """ Test what happens when concatenating samples with differing auxiliary (like in .var) columns (present in 1 sample, absent in other). 
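The remaining tests in this file exercise the merge-or-conflict rule implemented by split_conflicts_and_concatenated_columns above. As a reading aid, the rule reduces to a few lines of pandas; this sketch is not part of the patch, and the sample ids and values are hypothetical:

import pandas as pd

s1 = pd.Series({"var1": "a", "var2": "b"}, name="Shared_feat")  # sample 1
s2 = pd.Series({"var2": "b", "var3": "c"}, name="Shared_feat")  # sample 2

# outer-join the same-named columns, one column per sample
combined = pd.concat({"sample1": s1, "sample2": s2}, axis=1, join="outer")

# a row with more than one distinct non-NA value means the samples disagree
has_conflict = (combined.apply(lambda row: row.dropna().nunique(), axis=1) > 1).any()

if has_conflict:
    conflicts = combined  # kept per sample, like varm["conflict_Shared_feat"]
else:
    merged = combined.bfill(axis=1).iloc[:, 0]  # first non-NA value per row

With the values above there is no conflict (the samples agree on "var2" and otherwise only differ by missing values), so the columns merge into a single Series whose entries come from whichever sample provides them.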
@@ -297,59 +386,72 @@ def test_concat_different_var_columns_per_sample(run_component,
     output_path = random_h5mu_path()
     # Before removing the 'Shared_feat' column from one of the samples,
     # check that it is present in both
-    assert 'Shared_feat' in sample_1_h5mu.var_keys()
-    assert 'Shared_feat' in sample_2_h5mu.var_keys()
+    assert "Shared_feat" in sample_1_h5mu.var_keys()
+    assert "Shared_feat" in sample_2_h5mu.var_keys()

-    sample_2_h5mu = remove_annotation_column(sample_2_h5mu, ['Shared_feat'], axis="var")
-    assert 'Shared_feat' in sample_1_h5mu.var_keys()
-    assert 'Shared_feat' not in sample_2_h5mu.var_keys()
+    sample_2_h5mu = remove_annotation_column(sample_2_h5mu, ["Shared_feat"], axis="var")
+    assert "Shared_feat" in sample_1_h5mu.var_keys()
+    assert "Shared_feat" not in sample_2_h5mu.var_keys()
     # 'Shared_feat' column is now missing from sample2, which is what this test is about

     input_sample1_path = write_mudata_to_file(sample_1_h5mu)
     input_sample2_path = write_mudata_to_file(sample_2_h5mu)

-    run_component([
-        "--input_id", "sample1;sample2",
-        "--input", input_sample1_path,
-        "--input", input_sample2_path,
-        "--output", output_path,
-        "--other_axis_mode", "move"
-    ])
+    run_component(
+        [
+            "--input_id",
+            "sample1;sample2",
+            "--input",
+            input_sample1_path,
+            "--input",
+            input_sample2_path,
+            "--output",
+            output_path,
+            "--other_axis_mode",
+            "move",
+        ]
+    )
     assert Path(output_path).is_file()

     concatenated_data = md.read(output_path)
     data_sample1 = md.read(input_sample1_path)
     data_sample2 = md.read(input_sample2_path)
-    
-    assert concatenated_data.n_vars == data_sample1.var.index.union(data_sample2.var.index).size
+
+    assert (
+        concatenated_data.n_vars
+        == data_sample1.var.index.union(data_sample2.var.index).size
+    )
     for mod_name in ("mod1", "mod2"):
         # Check if all features are present
         concatenated_mod = concatenated_data.mod[mod_name]
         sample1_original_mod = data_sample1.mod[mod_name]
         sample2_original_mod = data_sample2.mod[mod_name]
-        
-        original_var_keys = set(sample1_original_mod.var_keys() +
-                                sample2_original_mod.var_keys() +
-                                list(sample1_original_mod.varm.keys()) +
-                                list(sample2_original_mod.varm.keys()))
-        
-        assert original_var_keys == set(concatenated_mod.varm.keys()) | \
-            set(concatenated_mod.var.columns.tolist())
-        
+
+        original_var_keys = set(
+            sample1_original_mod.var_keys()
+            + sample2_original_mod.var_keys()
+            + list(sample1_original_mod.varm.keys())
+            + list(sample2_original_mod.varm.keys())
+        )
+
+        assert original_var_keys == set(concatenated_mod.varm.keys()) | set(
+            concatenated_mod.var.columns.tolist()
+        )
+
     # Values from sample2 (which are also not in sample1) should have NA
     non_shared_features = data_sample2.var_names.difference(data_sample1.var_names)
-    assert concatenated_data.var['Shared_feat'].loc[non_shared_features].isna().all()
-    
+    assert concatenated_data.var["Shared_feat"].loc[non_shared_features].isna().all()
+
     # Values from sample1 should not have NA, and should be equal to the original values
-    var_values = concatenated_data.var['Shared_feat'].loc[data_sample1.var_names]
-    data_sample1.var['Shared_feat'].equals(var_values)
+    var_values = concatenated_data.var["Shared_feat"].loc[data_sample1.var_names]
+    assert data_sample1.var["Shared_feat"].equals(var_values)

-def test_concat_different_columns_per_modality(run_component, sample_1_h5mu,
-                                               sample_2_h5mu, write_mudata_to_file,
-                                               random_h5mu_path):
+
+def test_concat_different_columns_per_modality(
+    run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path
+):
     """
     Test what 
happens when concatenating samples that have an auxiliary column that is
    missing in one modality compared to the other, but where the column
@@ -361,22 +463,31 @@ def test_concat_different_columns_per_modality(run_component, sample_1_h5mu,
     sample 1  x     present
     sample 2  x     present
     """
-    sample_2_h5mu = remove_annotation_column(sample_2_h5mu, ['Shared_feat'],
-                                             axis="var", modality_name='mod1')
-    sample_1_h5mu = remove_annotation_column(sample_1_h5mu, ['Shared_feat'],
-                                             axis="var", modality_name='mod1')
-
+    sample_2_h5mu = remove_annotation_column(
+        sample_2_h5mu, ["Shared_feat"], axis="var", modality_name="mod1"
+    )
+    sample_1_h5mu = remove_annotation_column(
+        sample_1_h5mu, ["Shared_feat"], axis="var", modality_name="mod1"
+    )
+
     input_sample1_path = write_mudata_to_file(sample_1_h5mu)
     input_sample2_path = write_mudata_to_file(sample_2_h5mu)
-    output_path = random_h5mu_path()
-    run_component([
-        "--input_id", "sample1;sample2",
-        "--input", input_sample1_path,
-        "--input", input_sample2_path,
-        "--output", output_path,
-        "--other_axis_mode", "move"
-    ])
+    output_path = random_h5mu_path()
+    run_component(
+        [
+            "--input_id",
+            "sample1;sample2",
+            "--input",
+            input_sample1_path,
+            "--input",
+            input_sample2_path,
+            "--output",
+            output_path,
+            "--other_axis_mode",
+            "move",
+        ]
+    )
     assert Path(output_path).is_file() is True

     concatenated_data = md.read(output_path)
@@ -385,42 +496,58 @@ def test_concat_different_columns_per_modality(run_component, sample_1_h5mu,
     data_sample2 = md.read(str(input_sample2_path))

     # Check if all features are present
-    assert concatenated_data.n_vars == \
-        data_sample1.var.index.union(data_sample2.var.index).size
+    assert (
+        concatenated_data.n_vars
+        == data_sample1.var.index.union(data_sample2.var.index).size
+    )
     for mod_name in ("mod1", "mod2"):
         concatenated_mod = concatenated_data.mod[mod_name]
         data_sample1_mod = data_sample1.mod[mod_name]
-        data_sample2_mod = data_sample2.mod[mod_name]
-        original_var_keys = set(data_sample1_mod.var_keys() +
-                                data_sample2_mod.var_keys() +
-                                list(data_sample2_mod.varm.keys()) +
-                                list(data_sample1_mod.varm.keys()))
-
-        assert original_var_keys == \
-            set(concatenated_mod.varm.keys()) | \
-            set(concatenated_mod.var.columns.tolist())
+        data_sample2_mod = data_sample2.mod[mod_name]
+        original_var_keys = set(
+            data_sample1_mod.var_keys()
+            + data_sample2_mod.var_keys()
+            + list(data_sample2_mod.varm.keys())
+            + list(data_sample1_mod.varm.keys())
+        )
+
+        assert original_var_keys == set(concatenated_mod.varm.keys()) | set(
+            concatenated_mod.var.columns.tolist()
+        )

     # Check if the shared column stays removed from modality
-    assert 'Shared_feat' not in concatenated_data.mod['mod1'].var.columns
+    assert "Shared_feat" not in concatenated_data.mod["mod1"].var.columns

     # Values from modality 1 have NA
-    mod_1_features = data_sample1['mod1'].var_names.union(data_sample2['mod1'].var_names)
-    assert concatenated_data.var.loc[mod_1_features, 'mod2:Shared_feat'].isna().all()
-    
+    mod_1_features = data_sample1["mod1"].var_names.union(
+        data_sample2["mod1"].var_names
+    )
+    assert concatenated_data.var.loc[mod_1_features, "mod2:Shared_feat"].isna().all()
+
     # Values from modality 2 should not have NA, and should be equal to the original values
-    mod2_data = pd.concat([data_sample2['mod2'].var['Shared_feat'], data_sample1['mod2'].var['Shared_feat']])
+    mod2_data = pd.concat(
+        [
+            data_sample2["mod2"].var["Shared_feat"],
+            data_sample1["mod2"].var["Shared_feat"],
+        ]
+    )
     mod2_features = mod2_data.index
-    assert concatenated_data.var.loc[mod2_features, 
'mod2:Shared_feat'].astype(str).equals(mod2_data) + assert ( + concatenated_data.var.loc[mod2_features, "mod2:Shared_feat"] + .astype(str) + .equals(mod2_data) + ) + -def test_concat_different_columns_per_modality_and_per_sample(run_component, sample_1_h5mu, - sample_2_h5mu, write_mudata_to_file, - random_h5mu_path): +def test_concat_different_columns_per_modality_and_per_sample( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): """ Test what happens when concatenating samples that have auxiliary columns that differ between the modalities and also between samples - + Looking at 'Feat4' from sample 2 here: mod1 mod2 sample 1 x x @@ -431,13 +558,20 @@ def test_concat_different_columns_per_modality_and_per_sample(run_component, sam input_sample2_path = write_mudata_to_file(sample_2_h5mu) output_path = random_h5mu_path() - run_component([ - "--input_id", "mouse;human", - "--input", input_sample1_path, - "--input", input_sample2_path, - "--output", output_path, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "mouse;human", + "--input", + input_sample1_path, + "--input", + input_sample2_path, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) assert Path(output_path).is_file() concatenated_data = md.read(output_path) @@ -446,127 +580,191 @@ def test_concat_different_columns_per_modality_and_per_sample(run_component, sam data_sample2 = md.read(input_sample2_path) # Check if all features are present - assert concatenated_data.n_vars == \ - data_sample1.var_names.union(data_sample2.var_names).size + assert ( + concatenated_data.n_vars + == data_sample1.var_names.union(data_sample2.var_names).size + ) # Check if all features are present for mod_name in ("mod1", "mod2"): concatenated_mod = concatenated_data.mod[mod_name] data_sample1_mod = data_sample1.mod[mod_name] - data_sample2_mod = data_sample2.mod[mod_name] - original_var_keys = set(data_sample1_mod.var_keys() + - data_sample2_mod.var_keys() + - list(data_sample2_mod.varm.keys()) + - list(data_sample1_mod.varm.keys())) - - assert original_var_keys == \ - set(column_name.removeprefix('conflict_') - for column_name in concatenated_mod.varm.keys()) | \ - set(concatenated_mod.var.columns.tolist()) + data_sample2_mod = data_sample2.mod[mod_name] + original_var_keys = set( + data_sample1_mod.var_keys() + + data_sample2_mod.var_keys() + + list(data_sample2_mod.varm.keys()) + + list(data_sample1_mod.varm.keys()) + ) + assert original_var_keys == set( + column_name.removeprefix("conflict_") + for column_name in concatenated_mod.varm.keys() + ) | set(concatenated_mod.var.columns.tolist()) - assert 'Shared_feat' in concatenated_data.mod['mod2'].var.columns + assert "Shared_feat" in concatenated_data.mod["mod2"].var.columns # Values from modality 1 have NA - mod_1_features = data_sample1['mod1'].var_names.union(data_sample2['mod1'].var_names) - assert concatenated_data.var.loc[mod_1_features, 'mod2:Feat4'].isna().all() + mod_1_features = data_sample1["mod1"].var_names.union( + data_sample2["mod1"].var_names + ) + assert concatenated_data.var.loc[mod_1_features, "mod2:Feat4"].isna().all() # Values from modality 2 should not have NA if they originate from sample2 # These values should be equal to the original values - mod2_data = data_sample2['mod2'].var['Feat4'].rename('mod2:Feat4') + mod2_data = data_sample2["mod2"].var["Feat4"].rename("mod2:Feat4") mod2_features = mod2_data.index - assert concatenated_data.var.loc[mod2_features, 'mod2:Feat4'].astype(str).equals(mod2_data) 
+ assert ( + concatenated_data.var.loc[mod2_features, "mod2:Feat4"] + .astype(str) + .equals(mod2_data) + ) # Values from modality2 should have NA if they originate from sample1 (and only from sample1) non_shared_features = data_sample1.var_names.difference(data_sample2.var_names) - assert concatenated_data.var.loc[non_shared_features, 'mod2:Feat4'].isna().all() - -@pytest.mark.parametrize("test_value,test_value_dtype,expected", [("bar", "str", "bar"), - (True, pd.BooleanDtype(), True), - (1, pd.Int16Dtype(), 1), - (0.1, float, 0.1), - (0.1, np.float64, 0.1), - (np.nan, np.float64, pd.NA)]) -def test_concat_remove_na(run_component, sample_1_h5mu, sample_2_h5mu, - write_mudata_to_file, random_h5mu_path, test_value, test_value_dtype, expected, - change_column_contents): + assert concatenated_data.var.loc[non_shared_features, "mod2:Feat4"].isna().all() + + +@pytest.mark.parametrize( + "test_value,test_value_dtype,expected", + [ + ("bar", "str", "bar"), + (True, pd.BooleanDtype(), True), + (1, pd.Int16Dtype(), 1), + (0.1, float, 0.1), + (0.1, np.float64, 0.1), + (np.nan, np.float64, pd.NA), + ], +) +def test_concat_remove_na( + run_component, + sample_1_h5mu, + sample_2_h5mu, + write_mudata_to_file, + random_h5mu_path, + test_value, + test_value_dtype, + expected, + change_column_contents, +): """ Test concatenation of samples where the column from one sample contains NA values NA values should be removed from the concatenated result mod1 mod2 - sample 1 NA NA + sample 1 NA NA sample 2 test_value NA """ - change_column_contents(sample_1_h5mu, 'var', 'Shared_feat', {'mod1': np.nan, 'mod2': np.nan}) - change_column_contents(sample_2_h5mu, 'var', 'Shared_feat', {'mod1': test_value, 'mod2': np.nan}) - sample_2_h5mu.var['Shared_feat'] = sample_2_h5mu.var['Shared_feat'].astype(test_value_dtype) + change_column_contents( + sample_1_h5mu, "var", "Shared_feat", {"mod1": np.nan, "mod2": np.nan} + ) + change_column_contents( + sample_2_h5mu, "var", "Shared_feat", {"mod1": test_value, "mod2": np.nan} + ) + sample_2_h5mu.var["Shared_feat"] = sample_2_h5mu.var["Shared_feat"].astype( + test_value_dtype + ) output_path = random_h5mu_path() - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_path, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) assert Path(output_path).is_file() concatenated_data = md.read(output_path) # Values from modality 2 have NA - mod_2_features = sample_1_h5mu['mod2'].var_names.union(sample_2_h5mu['mod2'].var_names) - assert concatenated_data.var.loc[mod_2_features, 'Shared_feat'].isna().all() + mod_2_features = sample_1_h5mu["mod2"].var_names.union( + sample_2_h5mu["mod2"].var_names + ) + assert concatenated_data.var.loc[mod_2_features, "Shared_feat"].isna().all() # Values from modality 1 should not have NA if they originate from sample 1 # These values should be equal to the original values - assert sample_1_h5mu['mod1'].var['Shared_feat'].isna().all() + assert sample_1_h5mu["mod1"].var["Shared_feat"].isna().all() # Values from modality 1 should hold a value if they originate from sample 2 - mod1_features = sample_2_h5mu['mod1'].var_names.difference(sample_1_h5mu.var_names) + mod1_features = 
sample_2_h5mu["mod1"].var_names.difference(sample_1_h5mu.var_names)
    if not pd.isna(expected):
-        assert (concatenated_data.var.loc[mod1_features, 'Shared_feat'] == expected).all()
+        assert (
+            concatenated_data.var.loc[mod1_features, "Shared_feat"] == expected
+        ).all()
    else:
-        assert concatenated_data.var.loc[mod1_features, 'Shared_feat'].isna().all()
+        assert concatenated_data.var.loc[mod1_features, "Shared_feat"].isna().all()

-    # The 'Shared_feat' column for mod1 contains an overlapping feature. 
+    # The 'Shared_feat' column for mod1 contains an overlapping feature.
    # For sample 1, it is NA, for sample 2 it is filled with the test value.
    # The concat component should choose the test-value over NA
    shared_features = sample_2_h5mu.var_names.intersection(sample_1_h5mu.var_names)
    if not pd.isna(expected):
-        assert (concatenated_data.var.loc[shared_features, 'Shared_feat'] == expected).all()
+        assert (
+            concatenated_data.var.loc[shared_features, "Shared_feat"] == expected
+        ).all()
    else:
-        assert concatenated_data.var.loc[shared_features, 'Shared_feat'].isna().all()
+        assert concatenated_data.var.loc[shared_features, "Shared_feat"].isna().all()


-def test_concat_invalid_h5_error_includes_path(run_component, tmp_path,
-                                               sample_1_h5mu, write_mudata_to_file):
+def test_concat_invalid_h5_error_includes_path(
+    run_component, tmp_path, sample_1_h5mu, write_mudata_to_file
+):
    empty_file = tmp_path / "empty.h5mu"
    empty_file.touch()

    with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input_id", "mouse;empty",
-            "--input", write_mudata_to_file(sample_1_h5mu),
-            "--input", empty_file,
-            "--output", "concat.h5mu",
-            "--other_axis_mode", "move"
-        ])
-    assert re.search(rf"OSError: Failed to load .*{str(empty_file)}\. Is it a valid h5 file?",
-                     err.value.stdout.decode('utf-8'))
-
-
-@pytest.mark.parametrize("test_value_1,value_1_dtype,test_value_2,value_2_dtype,expected",
-                         [(1, float, "1", str, pd.CategoricalDtype(categories=['1.0', '1'])),
-                          (1, np.float64, "1", str, pd.CategoricalDtype(categories=['1.0', '1'])),
-                          (1, pd.Int16Dtype(), 2.0, pd.Int16Dtype(), pd.Int64Dtype()),
-                          (True, bool, False, bool, pd.BooleanDtype()),
-                          (True, pd.BooleanDtype(), False, bool, pd.BooleanDtype()),
-                          ("foo", str, "bar", str, pd.CategoricalDtype(categories=['bar', 'foo'])),
-                          ]
-                         )
-def test_concat_dtypes_per_modality(run_component, write_mudata_to_file, change_column_contents,
-                                    sample_1_h5mu, sample_2_h5mu, test_value_1, value_1_dtype, test_value_2, value_2_dtype,
-                                    expected, random_h5mu_path):
+        run_component(
+            [
+                "--input_id",
+                "mouse;empty",
+                "--input",
+                write_mudata_to_file(sample_1_h5mu),
+                "--input",
+                empty_file,
+                "--output",
+                "concat.h5mu",
+                "--other_axis_mode",
+                "move",
+            ]
+        )
+    assert re.search(
+        rf"OSError: Failed to load .*{str(empty_file)}\.
Is it a valid h5 file?", + err.value.stdout.decode("utf-8"), + ) + + +@pytest.mark.parametrize( + "test_value_1,value_1_dtype,test_value_2,value_2_dtype,expected", + [ + (1, float, "1", str, pd.CategoricalDtype(categories=["1.0", "1"])), + (1, np.float64, "1", str, pd.CategoricalDtype(categories=["1.0", "1"])), + (1, pd.Int16Dtype(), 2.0, pd.Int16Dtype(), pd.Int64Dtype()), + (True, bool, False, bool, pd.BooleanDtype()), + (True, pd.BooleanDtype(), False, bool, pd.BooleanDtype()), + ("foo", str, "bar", str, pd.CategoricalDtype(categories=["bar", "foo"])), + ], +) +def test_concat_dtypes_per_modality( + run_component, + write_mudata_to_file, + change_column_contents, + sample_1_h5mu, + sample_2_h5mu, + test_value_1, + value_1_dtype, + test_value_2, + value_2_dtype, + expected, + random_h5mu_path, +): """ Test joining column with different dtypes to make sure that they are writable. The default path is to convert all non-na values to strings and wrap the column into a categorical dtype. @@ -576,112 +774,183 @@ def test_concat_dtypes_per_modality(run_component, write_mudata_to_file, change_ to .varm, but for mod1 only. The column is concatenated for mod2 as planned. Here we check if the results for the test column in mod2 is still writable. """ - change_column_contents(sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1}) - sample_1_h5mu.var['test_col'] = sample_1_h5mu.var['test_col'].astype(value_1_dtype) - change_column_contents(sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2}) - sample_2_h5mu.var['test_col'] = sample_2_h5mu.var['test_col'].astype(value_2_dtype) + change_column_contents( + sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1} + ) + sample_1_h5mu.var["test_col"] = sample_1_h5mu.var["test_col"].astype(value_1_dtype) + change_column_contents( + sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2} + ) + sample_2_h5mu.var["test_col"] = sample_2_h5mu.var["test_col"].astype(value_2_dtype) output_file = random_h5mu_path() - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_file, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_file, + "--other_axis_mode", + "move", + ] + ) concatenated_data = md.read(output_file) - assert concatenated_data['mod2'].var['test_col'].dtype == expected - - -@pytest.mark.parametrize("test_value,value_dtype,expected", - [(1, float, pd.Int64Dtype()), - (1, np.float64, pd.Int64Dtype()), - (1, pd.Int16Dtype(), pd.Int16Dtype()), - (True, bool, pd.BooleanDtype()), - (True, pd.BooleanDtype(), pd.BooleanDtype()), - ("foo", str, pd.CategoricalDtype(categories=['foo'])), - ] - ) -def test_concat_dtypes_per_modality_multidim(run_component, write_mudata_to_file, - sample_1_h5mu, sample_2_h5mu, test_value, value_dtype, - expected, random_h5mu_path): - """ - Test if the result of concatenation is still writable when the input already contain + assert concatenated_data["mod2"].var["test_col"].dtype == expected + + +@pytest.mark.parametrize( + "test_value,value_dtype,expected", + [ + (1, float, pd.Int64Dtype()), + (1, np.float64, pd.Int64Dtype()), + (1, pd.Int16Dtype(), pd.Int16Dtype()), + (True, bool, pd.BooleanDtype()), + (True, pd.BooleanDtype(), pd.BooleanDtype()), + ("foo", 
str, pd.CategoricalDtype(categories=["foo"])),
+    ],
+)
+def test_concat_dtypes_per_modality_multidim(
+    run_component,
+    write_mudata_to_file,
+    sample_1_h5mu,
+    sample_2_h5mu,
+    test_value,
+    value_dtype,
+    expected,
+    random_h5mu_path,
+):
+    """
+    Test if the result of concatenation is still writable when the input already contains
    data in .varm and this data is kept. Because we are joining observations, the dtype of this
    data may change and the result might not be writable anymore
    """
-
-    sample_1_h5mu['mod1'].varm['test_df'] = pd.DataFrame(index=sample_1_h5mu['mod1'].var_names)
-    sample_1_h5mu['mod1'].varm['test_df']['test_col'] = test_value
-    sample_1_h5mu['mod1'].varm['test_df']['test_col'] = sample_1_h5mu['mod1'].varm['test_df']['test_col'].astype(value_dtype)
+
+    sample_1_h5mu["mod1"].varm["test_df"] = pd.DataFrame(
+        index=sample_1_h5mu["mod1"].var_names
+    )
+    sample_1_h5mu["mod1"].varm["test_df"]["test_col"] = test_value
+    sample_1_h5mu["mod1"].varm["test_df"]["test_col"] = (
+        sample_1_h5mu["mod1"].varm["test_df"]["test_col"].astype(value_dtype)
+    )
    output_file = random_h5mu_path()
-    run_component([
-        "--input_id", "sample1;sample2",
-        "--input", write_mudata_to_file(sample_1_h5mu),
-        "--input", write_mudata_to_file(sample_2_h5mu),
-        "--output", output_file,
-        "--other_axis_mode", "move"
-    ])
+    run_component(
+        [
+            "--input_id",
+            "sample1;sample2",
+            "--input",
+            write_mudata_to_file(sample_1_h5mu),
+            "--input",
+            write_mudata_to_file(sample_2_h5mu),
+            "--output",
+            output_file,
+            "--other_axis_mode",
+            "move",
+        ]
+    )
    concatenated_data = md.read(output_file)
-    assert concatenated_data['mod1'].varm['test_df']['test_col'].dtype == expected
-
-@pytest.mark.parametrize("test_value_1,test_value_2,expected", [(1, "1", pd.CategoricalDtype(categories=['1.0', '1']))])
-def test_concat_dtypes_global(run_component, write_mudata_to_file, change_column_contents,
-                              sample_1_h5mu, sample_2_h5mu, test_value_1, test_value_2,
-                              expected, random_h5mu_path):
+    assert concatenated_data["mod1"].varm["test_df"]["test_col"].dtype == expected
+
+
+@pytest.mark.parametrize(
+    "test_value_1,test_value_2,expected",
+    [(1, "1", pd.CategoricalDtype(categories=["1.0", "1"]))],
+)
+def test_concat_dtypes_global(
+    run_component,
+    write_mudata_to_file,
+    change_column_contents,
+    sample_1_h5mu,
+    sample_2_h5mu,
+    test_value_1,
+    test_value_2,
+    expected,
+    random_h5mu_path,
+):
    """
    Test joining column with different dtypes to make sure that they are writable.
    The default path is to convert all non-na values to strings and wrap the column into
    a categorical dtype. Here, we test on the level of a column that is added to
    a global annotation matrix.
    """
-    change_column_contents(sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1})
-    change_column_contents(sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2})
-    sample1_mod1_names = sample_2_h5mu['mod1'].var_names
+    change_column_contents(
+        sample_1_h5mu, "var", "test_col", {"mod1": test_value_1, "mod2": test_value_1}
+    )
+    change_column_contents(
+        sample_2_h5mu, "var", "test_col", {"mod1": test_value_2, "mod2": test_value_2}
+    )
+    sample1_mod1_names = sample_2_h5mu["mod1"].var_names
    # Here, we avoid a conflict between sample 1 and sample 2 by making sure there is no overlap in features
    # between sample 1 and sample 2 (no shared var_names).
If this change would not be done, a different - # value for sample 1 and sample 2 would be found by the concat component for the var feature + # value for sample 1 and sample 2 would be found by the concat component for the var feature # 'overlapping_var_mod1' for modality 'mod1'. The concat component would move the column for mod1 to # .varm because of this conflict, and in the global .var column of the concatenated object, only # a 'mod2:test_col' column would be present. But here, we want to test the column that is populated by - # both 'mod1' and 'mod2' - assert 'overlapping_var_mod1' in sample1_mod1_names - new_names = sample1_mod1_names.where(~sample1_mod1_names.isin(['overlapping_var_mod1']), 'non_overlapping') - sample_2_h5mu['mod1'].var_names = new_names + # both 'mod1' and 'mod2' + assert "overlapping_var_mod1" in sample1_mod1_names + new_names = sample1_mod1_names.where( + ~sample1_mod1_names.isin(["overlapping_var_mod1"]), "non_overlapping" + ) + sample_2_h5mu["mod1"].var_names = new_names sample_2_h5mu.update() output_file = random_h5mu_path() - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_file, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_file, + "--other_axis_mode", + "move", + ] + ) concatenated_data = md.read(output_file) - assert concatenated_data.var['test_col'].dtype == expected + assert concatenated_data.var["test_col"].dtype == expected + -def test_non_overlapping_modalities(run_component, sample_2_h5mu, sample_3_h5mu, random_h5mu_path, write_mudata_to_file): +def test_non_overlapping_modalities( + run_component, sample_2_h5mu, sample_3_h5mu, random_h5mu_path, write_mudata_to_file +): """ Test that the component does not fail when the modalities are not shared between samples. """ output_path = random_h5mu_path() input_file_2 = write_mudata_to_file(sample_2_h5mu) input_file_3 = write_mudata_to_file(sample_3_h5mu) - - run_component([ - "--input_id", "sample2;sample3", - "--input", input_file_2, - "--input", input_file_3, - "--output", output_path, - "--other_axis_mode", "move" - ]) + + run_component( + [ + "--input_id", + "sample2;sample3", + "--input", + input_file_2, + "--input", + input_file_3, + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) output_data = md.read(output_path) assert set(output_data.mod.keys()) == {"mod1", "mod2", "mod3"} -def test_resolve_annotation_conflict_missing_column(run_component, sample_1_h5mu, - sample_2_h5mu, sample_3_h5mu, - write_mudata_to_file, random_h5mu_path): +def test_resolve_annotation_conflict_missing_column( + run_component, + sample_1_h5mu, + sample_2_h5mu, + sample_3_h5mu, + write_mudata_to_file, + random_h5mu_path, +): """ Test using mode 'move' and resolving a conflict in metadata between the samples, but the metadata column is missing in one of the samples. 
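The dtype handling exercised by the tests above comes down to pandas behaviour: when two samples disagree on a column's dtype, the non-NA values are stringified and the result is wrapped in a categorical so the column stays writable. A minimal standalone pandas sketch of that unification (the toy series and feature names are illustrative, not part of the test suite):

import pandas as pd

# One sample stores the shared column as a float, the other as a string.
sample1_col = pd.Series([1.0], index=["feat_a"], name="test_col")
sample2_col = pd.Series(["1"], index=["feat_b"], name="test_col")

# Stringify non-NA values, then wrap in a categorical so the column stays writable.
unified = pd.concat([sample1_col, sample2_col]).astype(str).astype("category")
assert set(unified.dtype.categories) == {"1.0", "1"}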
@@ -691,83 +960,113 @@ def test_resolve_annotation_conflict_missing_column(run_component, sample_1_h5mu
    input_file_2 = write_mudata_to_file(sample_2_h5mu)
    input_file_3 = write_mudata_to_file(sample_3_h5mu)
-
-    run_component([
-        "--input_id", "sample1;sample2;sample3",
-        "--input", input_file_1,
-        "--input", input_file_2,
-        "--input", input_file_3,
-        "--output", output_path,
-        "--other_axis_mode", "move"
-    ])
+    run_component(
+        [
+            "--input_id",
+            "sample1;sample2;sample3",
+            "--input",
+            input_file_1,
+            "--input",
+            input_file_2,
+            "--input",
+            input_file_3,
+            "--output",
+            output_path,
+            "--other_axis_mode",
+            "move",
+        ]
+    )
    concatenated_data = md.read(output_path)

    # 'Shared_feat' is defined for mod1 in sample 1 and 2 and there is a conflict
-    assert 'conflict_Shared_feat' in concatenated_data['mod1'].varm
+    assert "conflict_Shared_feat" in concatenated_data["mod1"].varm
    # 'Shared_feat' is defined for mod2 in sample 1 and 2 and there is no conflict
-    assert 'Shared_feat' in concatenated_data['mod2'].var.columns
+    assert "Shared_feat" in concatenated_data["mod2"].var.columns
    # 'Shared_feat' is not defined in any of the samples for modality 3
-    assert 'Shared_feat' not in concatenated_data['mod3'].var.columns
-    assert 'Shared_feat' not in concatenated_data['mod3'].varm
+    assert "Shared_feat" not in concatenated_data["mod3"].var.columns
+    assert "Shared_feat" not in concatenated_data["mod3"].varm
+

-def test_mode_move(run_component, sample_1_h5mu, sample_2_h5mu,
-                   random_h5mu_path, write_mudata_to_file):
+def test_mode_move(
+    run_component, sample_1_h5mu, sample_2_h5mu, random_h5mu_path, write_mudata_to_file
+):
    """
-    Test that in case of a conflict, the conflicting columns are move to the multidimensional annotation slot
-    (.varm and .obsm). The key of the datafame in the slot should start with 'conflict_' followed by the name
+    Test that in case of a conflict, the conflicting columns are moved to the multidimensional annotation slot
+    (.varm and .obsm). The key of the dataframe in the slot should start with 'conflict_' followed by the name
    of the column and the columns of the dataframe should contain the sample names.
""" output_path = random_h5mu_path() - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_path, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) assert output_path.is_file() concatenated_data = md.read(output_path) # Check if observations from all of the samples are present - assert (concatenated_data.n_obs == sample_1_h5mu.n_obs + sample_2_h5mu.n_obs) + assert concatenated_data.n_obs == sample_1_h5mu.n_obs + sample_2_h5mu.n_obs # Check if all modalities are present - sample1_mods, sample2_mods = set(sample_1_h5mu.mod.keys()), set(sample_2_h5mu.mod.keys()) + sample1_mods, sample2_mods = ( + set(sample_1_h5mu.mod.keys()), + set(sample_2_h5mu.mod.keys()), + ) concatentated_mods = set(concatenated_data.mod.keys()) assert (sample1_mods | sample2_mods) == concatentated_mods varm_check = { "mod1": ({"conflict_Shared_feat": ("sample1", "sample2")}), - "mod2": {} + "mod2": {}, } # Check if all features are present for mod_name in ("mod1", "mod2"): concatenated_mod = concatenated_data.mod[mod_name] sample_1_mod = sample_1_h5mu.mod[mod_name] - sample_2_mod = sample_2_h5mu.mod[mod_name] - original_varm_keys = set(list(sample_1_mod.varm.keys()) + - list(sample_2_mod.varm.keys())) - original_var_keys = set(sample_1_mod.var_keys() + - sample_2_mod.var_keys()) | original_varm_keys - - assert original_var_keys == \ - set(column_name.removeprefix('conflict_') - for column_name in concatenated_mod.varm.keys()) | \ - set(concatenated_mod.var.columns.tolist()) + sample_2_mod = sample_2_h5mu.mod[mod_name] + original_varm_keys = set( + list(sample_1_mod.varm.keys()) + list(sample_2_mod.varm.keys()) + ) + original_var_keys = ( + set(sample_1_mod.var_keys() + sample_2_mod.var_keys()) | original_varm_keys + ) + + assert original_var_keys == set( + column_name.removeprefix("conflict_") + for column_name in concatenated_mod.varm.keys() + ) | set(concatenated_mod.var.columns.tolist()) varm_expected = varm_check[mod_name] - assert set(concatenated_mod.varm.keys()) == set(varm_expected.keys() | original_varm_keys) + assert set(concatenated_mod.varm.keys()) == set( + varm_expected.keys() | original_varm_keys + ) for varm_key, expected_columns in varm_expected.items(): assert tuple(concatenated_mod.varm[varm_key].columns) == expected_columns if not varm_expected: - assert set(concatenated_mod.varm.keys()) == original_varm_keys + assert set(concatenated_mod.varm.keys()) == original_varm_keys assert concatenated_mod.obsm == {} + # Execute this test multiple times, anndata.concat sometimes returns the observations in a different order -@pytest.mark.parametrize('_', range(10)) -def test_concat_var_obs_names_order(run_component, sample_1_h5mu, sample_2_h5mu, - write_mudata_to_file, random_h5mu_path, _): +@pytest.mark.parametrize("_", range(10)) +def test_concat_var_obs_names_order( + run_component, + sample_1_h5mu, + sample_2_h5mu, + write_mudata_to_file, + random_h5mu_path, + _, +): """ Test that the var_names and obs_names are still linked to the correct count data. 
""" @@ -776,53 +1075,93 @@ def test_concat_var_obs_names_order(run_component, sample_1_h5mu, sample_2_h5mu, sample_1_h5mu["mod2"].obs["sample_id"] = "sample1" sample_2_h5mu["mod1"].obs["sample_id"] = "sample2" sample_2_h5mu["mod2"].obs["sample_id"] = "sample2" - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_path, - "--other_axis_mode", "move" - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + ] + ) assert output_path.is_file() - for sample_name, sample_h5mu in {"sample1": sample_1_h5mu, - "sample2": sample_2_h5mu}.items(): + for sample_name, sample_h5mu in { + "sample1": sample_1_h5mu, + "sample2": sample_2_h5mu, + }.items(): for mod_name in ["mod1", "mod2"]: data_sample = sample_h5mu[mod_name].copy() processed_data_ad = md.read_h5ad(output_path, mod=mod_name) - processed_data_ad = processed_data_ad[processed_data_ad.obs["sample_id"] == sample_name] + processed_data_ad = processed_data_ad[ + processed_data_ad.obs["sample_id"] == sample_name + ] processed_data_ad = processed_data_ad[:, data_sample.var_names] - processed_data = pd.DataFrame(processed_data_ad.X, index=processed_data_ad.obs_names, - columns=processed_data_ad.var_names) - data_sample = pd.DataFrame(data_sample.X, index=data_sample.obs_names, - columns=data_sample.var_names).reindex_like(processed_data) - pd.testing.assert_frame_equal(processed_data, data_sample, check_dtype=False) - - -def test_keep_uns(run_component, sample_1_h5mu, sample_2_h5mu, - write_mudata_to_file, random_h5mu_path): + processed_data = pd.DataFrame( + processed_data_ad.X, + index=processed_data_ad.obs_names, + columns=processed_data_ad.var_names, + ) + data_sample = pd.DataFrame( + data_sample.X, + index=data_sample.obs_names, + columns=data_sample.var_names, + ).reindex_like(processed_data) + pd.testing.assert_frame_equal( + processed_data, data_sample, check_dtype=False + ) + + +def test_keep_uns( + run_component, sample_1_h5mu, sample_2_h5mu, write_mudata_to_file, random_h5mu_path +): sample_1_h5mu.uns["global_uns_sample1"] = "dolor" sample_1_h5mu.uns["overlapping_global"] = "sed" sample_2_h5mu.uns["global_uns_sample2"] = "amet" sample_2_h5mu.uns["overlapping_global"] = "elit" output_path = random_h5mu_path() - run_component([ - "--input_id", "sample1;sample2", - "--input", write_mudata_to_file(sample_1_h5mu), - "--input", write_mudata_to_file(sample_2_h5mu), - "--output", output_path, - "--other_axis_mode", "move", - "--uns_merge_mode", "make_unique", - ]) + run_component( + [ + "--input_id", + "sample1;sample2", + "--input", + write_mudata_to_file(sample_1_h5mu), + "--input", + write_mudata_to_file(sample_2_h5mu), + "--output", + output_path, + "--other_axis_mode", + "move", + "--uns_merge_mode", + "make_unique", + ] + ) assert output_path.is_file() concatenated_data = md.read(output_path) mod1 = concatenated_data.mod["mod1"] mod2 = concatenated_data.mod["mod2"] - assert set(concatenated_data.uns.keys()) == set(["global_uns_sample1", "global_uns_sample2", - "sample1_overlapping_global", "sample2_overlapping_global"]) - assert set(mod1.uns.keys()) == set(["sample1_overlapping_uns_key", "uns_unique_to_sample1", - "sample2_overlapping_uns_key", "uns_unique_to_sample2"]) + assert set(concatenated_data.uns.keys()) == set( + [ + "global_uns_sample1", + 
"global_uns_sample2", + "sample1_overlapping_global", + "sample2_overlapping_global", + ] + ) + assert set(mod1.uns.keys()) == set( + [ + "sample1_overlapping_uns_key", + "uns_unique_to_sample1", + "sample2_overlapping_uns_key", + "uns_unique_to_sample2", + ] + ) assert set(mod2.uns.keys()) == set() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__, "-v"])) diff --git a/src/dataflow/merge/script.py b/src/dataflow/merge/script.py index 454ae8be4a4..87e1a24e25e 100644 --- a/src/dataflow/merge/script.py +++ b/src/dataflow/merge/script.py @@ -7,27 +7,32 @@ ### VIASH START par = { - "input": ["./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu", - "./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu"], - "output": "foo.h5mu" - + "input": [ + "./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_rna.h5mu", + "./resources_test/merge/pbmc_1k_protein_v3_filtered_feature_bc_matrix_prot.h5mu", + ], + "output": "foo.h5mu", } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def main(): - logger.info('Reading input files %s', ",".join(par["input"])) + logger.info("Reading input files %s", ",".join(par["input"])) input_samples = [md.read_h5mu(path) for path in par["input"]] - logger.info('Merging into single object.') + logger.info("Merging into single object.") sample_modalities = {} for input_sample in input_samples: for mod_name, mod_data in input_sample.mod.items(): if mod_name in sample_modalities: - raise ValueError(f"Modality '{mod_name}' was found in more than 1 sample.") + raise ValueError( + f"Modality '{mod_name}' was found in more than 1 sample." + ) sample_modalities[mod_name] = mod_data merged = md.MuData(sample_modalities) @@ -35,24 +40,26 @@ def main(): for df_attr in ("var", "obs"): df = getattr(merged, df_attr) df = df.replace({pd.NA: np.nan}, inplace=False) - + # MuData supports nullable booleans and ints # ie. 
`IntegerArray` and `BooleanArray`
-        df = df.convert_dtypes(infer_objects=True,
-                               convert_integer=True,
-                               convert_string=False,
-                               convert_boolean=True,
-                               convert_floating=False)
+        df = df.convert_dtypes(
+            infer_objects=True,
+            convert_integer=True,
+            convert_string=False,
+            convert_boolean=True,
+            convert_floating=False,
+        )

        # Convert leftover 'object' columns to string
-        object_cols = df.select_dtypes(include='object').columns.values
+        object_cols = df.select_dtypes(include="object").columns.values
        for obj_col in object_cols:
-            df[obj_col].astype(str).astype('category')
+            df[obj_col] = df[obj_col].astype(str).astype("category")
        setattr(merged, df_attr, df)

    merged.write_h5mu(par["output"], compression=par["output_compression"])
-    logger.info('Finished')
+    logger.info("Finished")

-if __name__ == '__main__':
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/src/dataflow/merge/test.py b/src/dataflow/merge/test.py
index f2963ca8c4a..f31a5e61ceb 100644
--- a/src/dataflow/merge/test.py
+++ b/src/dataflow/merge/test.py
@@ -1,19 +1,17 @@
 import sys
 import pytest
-from pathlib import Path
 import subprocess
 from mudata import MuData, read_h5mu
 import pandas as pd
 import numpy as np
 import re
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
-import os

 ## VIASH START
 meta = {
-    'executable': './target/executable/dataflow/merge/merge',
-    'resources_dir': './resources_test/merge_test_data/',
-    'config': './src/dataflow/merge/config.vsh.yml'
+    "executable": "./target/executable/dataflow/merge/merge",
+    "resources_dir": "./resources_test/merge_test_data/",
+    "config": "./src/dataflow/merge/config.vsh.yml",
 }
 ## VIASH END

@@ -26,50 +24,66 @@ def mudata_non_overlapping_observations(request, random_h5mu_path):
    mudata_to_change = read_h5mu(mudata_to_change_path)
    # Remove 1 observation
    removed_observation_name = mudata_to_change.obs.index[-1]
-    mudata_to_change = mudata_to_change[:mudata_to_change.n_obs-1]
+    mudata_to_change = mudata_to_change[: mudata_to_change.n_obs - 1]
    mudata_to_change.write(temp_h5mu, compression="gzip")
    return temp_h5mu, removed_observation_name

+
@pytest.fixture
def extra_var_column_value():
    return "bar"

+
@pytest.fixture
def extra_var_column_name():
    return "test"

+
@pytest.fixture
-def mudata_with_extra_var_column(random_h5mu_path, request, extra_var_column_value, extra_var_column_name):
+def mudata_with_extra_var_column(
+    random_h5mu_path, request, extra_var_column_value, extra_var_column_name
+):
    [sample1_path, sample2_path] = request.getfixturevalue(request.param)
    result = []
-    for sample_path, column_value in ((sample1_path, extra_var_column_value), (sample2_path, np.nan)):
+    for sample_path, column_value in (
+        (sample1_path, extra_var_column_value),
+        (sample2_path, np.nan),
+    ):
        sample = read_h5mu(sample_path)
        mod_names = list(sample.mod.keys())
        assert len(mod_names) == 1
        mod_name = mod_names[0]
        sample.mod[mod_name].var[extra_var_column_name] = column_value
-        sample.var = sample.var.convert_dtypes(infer_objects=True,
-                                               convert_integer=True,
-                                               convert_string=False,
-                                               convert_boolean=True,
-                                               convert_floating=False)
+        sample.var = sample.var.convert_dtypes(
+            infer_objects=True,
+            convert_integer=True,
+            convert_string=False,
+            convert_boolean=True,
+            convert_floating=False,
+        )
        new_path = random_h5mu_path()
        sample.write(new_path)
        result.append(new_path)
    return result


-def test_merge(run_component, random_h5mu_path, small_mudata_mod1_path, small_mudata_mod2_path):
+def test_merge(
+    run_component, random_h5mu_path,
small_mudata_mod1_path, small_mudata_mod2_path +): """ Test a simple merge with fully overlapping observations """ output_path = random_h5mu_path() # input_sample1_path, input_sample2_path = split_small_mudata_path args = [ - "--input", small_mudata_mod1_path, - "--input", small_mudata_mod2_path, - "--output", output_path, - "--output_compression", "gzip" + "--input", + small_mudata_mod1_path, + "--input", + small_mudata_mod2_path, + "--output", + output_path, + "--output_compression", + "gzip", ] run_component(args) @@ -77,77 +91,135 @@ def test_merge(run_component, random_h5mu_path, small_mudata_mod1_path, small_mu concatenated_data = read_h5mu(output_path) data_sample1 = read_h5mu(small_mudata_mod1_path) data_sample2 = read_h5mu(small_mudata_mod2_path) - - expected_concatenated_data = MuData({'mod1': data_sample1.mod['mod1'], 'mod2': data_sample2.mod['mod2']}) + + expected_concatenated_data = MuData( + {"mod1": data_sample1.mod["mod1"], "mod2": data_sample2.mod["mod2"]} + ) assert_annotation_objects_equal(concatenated_data, expected_concatenated_data) -@pytest.mark.parametrize("mudata_non_overlapping_observations", ["small_mudata_mod1_path"], indirect=["mudata_non_overlapping_observations"]) -def test_merge_non_overlapping_observations(run_component, mudata_non_overlapping_observations, random_h5mu_path, small_mudata_mod2_path): +@pytest.mark.parametrize( + "mudata_non_overlapping_observations", + ["small_mudata_mod1_path"], + indirect=["mudata_non_overlapping_observations"], +) +def test_merge_non_overlapping_observations( + run_component, + mudata_non_overlapping_observations, + random_h5mu_path, + small_mudata_mod2_path, +): """ Merge with differing observations in the samples """ edited_h5mu_path, removed_observation_name = mudata_non_overlapping_observations output_path = random_h5mu_path() # Remove 1 observation - run_component([ - "--input", edited_h5mu_path, - "--input", small_mudata_mod2_path, - "--output", output_path]) - - assert output_path.is_file() + run_component( + [ + "--input", + edited_h5mu_path, + "--input", + small_mudata_mod2_path, + "--output", + output_path, + ] + ) + + assert output_path.is_file() concatenated_data = read_h5mu(output_path, backed=False) data_sample1 = read_h5mu(edited_h5mu_path, backed=False) data_sample2 = read_h5mu(small_mudata_mod2_path, backed=False) - - expected_concatenated_data = MuData({'mod1': data_sample1.mod['mod1'], 'mod2': data_sample2.mod['mod2']}) - - assert set(concatenated_data.obs_names) == (set(data_sample1.obs_names) | set(data_sample2.obs_names)) - assert concatenated_data[removed_observation_name:]['mod1'].n_obs == 0 - assert concatenated_data[removed_observation_name:]['mod2'].n_obs == 1 - - np.testing.assert_equal(concatenated_data.copy()[removed_observation_name:]['mod2'].X.data, - data_sample2.copy()[removed_observation_name:]['mod2'].X.data) - + + expected_concatenated_data = MuData( + {"mod1": data_sample1.mod["mod1"], "mod2": data_sample2.mod["mod2"]} + ) + + assert set(concatenated_data.obs_names) == ( + set(data_sample1.obs_names) | set(data_sample2.obs_names) + ) + assert concatenated_data[removed_observation_name:]["mod1"].n_obs == 0 + assert concatenated_data[removed_observation_name:]["mod2"].n_obs == 1 + + np.testing.assert_equal( + concatenated_data.copy()[removed_observation_name:]["mod2"].X.data, + data_sample2.copy()[removed_observation_name:]["mod2"].X.data, + ) + assert_annotation_objects_equal(concatenated_data, expected_concatenated_data) - - 
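For reference, the core of the merge logic shown in script.py above reduces to collecting each input's modalities into one dictionary and rejecting duplicates. A standalone sketch with toy AnnData objects (shapes and modality names are illustrative, not taken from the test data):

import anndata as ad
import mudata as md
import numpy as np

samples = [
    {"mod1": ad.AnnData(np.ones((2, 3)))},
    {"mod2": ad.AnnData(np.zeros((2, 2)))},
]

# Each input sample contributes distinct modalities; a duplicate name is an error.
sample_modalities = {}
for sample in samples:
    for mod_name, mod_data in sample.items():
        if mod_name in sample_modalities:
            raise ValueError(f"Modality '{mod_name}' was found in more than 1 sample.")
        sample_modalities[mod_name] = mod_data

merged = md.MuData(sample_modalities)
assert set(merged.mod.keys()) == {"mod1", "mod2"}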
-@pytest.mark.parametrize("extra_var_column_name,extra_var_column_value,expected", [("test", "bar", "bar"), ("test", True, True), ("test", 0.1, 0.1), ("test", np.nan, pd.NA)])
-@pytest.mark.parametrize("mudata_with_extra_var_column",
-                         ["split_small_mudata_path"],
-                         indirect=["mudata_with_extra_var_column"])
-def test_boolean_and_na_types(run_component, mudata_with_extra_var_column, extra_var_column_name, expected, random_h5mu_path):
+
+
+@pytest.mark.parametrize(
+    "extra_var_column_name,extra_var_column_value,expected",
+    [
+        ("test", "bar", "bar"),
+        ("test", True, True),
+        ("test", 0.1, 0.1),
+        ("test", np.nan, pd.NA),
+    ],
+)
+@pytest.mark.parametrize(
+    "mudata_with_extra_var_column",
+    ["split_small_mudata_path"],
+    indirect=["mudata_with_extra_var_column"],
+)
+def test_boolean_and_na_types(
+    run_component,
+    mudata_with_extra_var_column,
+    extra_var_column_name,
+    expected,
+    random_h5mu_path,
+):
    """
    Test if merging booleans or NAs results in the .var/.obs column being writable
    """
    input_sample1_path, input_sample2_path = mudata_with_extra_var_column
    output_path = random_h5mu_path()
-    run_component([
-        "--input", input_sample1_path,
-        "--input", input_sample2_path,
-        "--output", output_path])
+    run_component(
+        [
+            "--input",
+            input_sample1_path,
+            "--input",
+            input_sample2_path,
+            "--output",
+            output_path,
+        ]
+    )
    assert output_path.is_file()
    merged_data = read_h5mu(output_path, backed=False)
    first_sample_mod = list(read_h5mu(input_sample1_path).mod)[0]
    second_sample_mod = list(read_h5mu(input_sample2_path).mod)[0]
-
-    expected_merged_data = MuData({'mod1': read_h5mu(input_sample1_path).mod['mod1'],
-                                   'mod2': read_h5mu(input_sample2_path).mod['mod2']})
-
+
+    expected_merged_data = MuData(
+        {
+            "mod1": read_h5mu(input_sample1_path).mod["mod1"],
+            "mod2": read_h5mu(input_sample2_path).mod["mod2"],
+        }
+    )
+
    if not pd.isna(expected):
-        assert merged_data.var.loc['var1'][extra_var_column_name] == expected
-        assert merged_data.mod[first_sample_mod].var.loc['var1'][extra_var_column_name] == expected
+        assert merged_data.var.loc["var1"][extra_var_column_name] == expected
+        assert (
+            merged_data.mod[first_sample_mod].var.loc["var1"][extra_var_column_name]
+            == expected
+        )
    else:
-        assert pd.isna(merged_data.var.loc['var1'][extra_var_column_name])
-        assert pd.isna(merged_data.mod[first_sample_mod].var.loc['var1'][extra_var_column_name])
-        assert pd.isna(merged_data.var.loc['var4'][extra_var_column_name])
-        assert pd.isna(merged_data.mod[second_sample_mod].var.loc['var4'][extra_var_column_name])
+        assert pd.isna(merged_data.var.loc["var1"][extra_var_column_name])
+        assert pd.isna(
+            merged_data.mod[first_sample_mod].var.loc["var1"][extra_var_column_name]
+        )
+        assert pd.isna(merged_data.var.loc["var4"][extra_var_column_name])
+        assert pd.isna(
+            merged_data.mod[second_sample_mod].var.loc["var4"][extra_var_column_name]
+        )

    assert_annotation_objects_equal(merged_data, expected_merged_data)

-
-def test_same_modalities_raises(run_component, random_h5mu_path, split_small_mudata_path):
+
+def test_same_modalities_raises(
+    run_component, random_h5mu_path, split_small_mudata_path
+):
    """
    Raise when trying to merge modalities with the same name.
""" @@ -155,17 +227,25 @@ def test_same_modalities_raises(run_component, random_h5mu_path, split_small_mud input_sample2_edited_path = random_h5mu_path() output_path = random_h5mu_path() data_sample2 = read_h5mu(input_sample2_path) - data_sample2 = MuData({'mod1': data_sample2.mod['mod2']}) + data_sample2 = MuData({"mod1": data_sample2.mod["mod2"]}) data_sample2.write(input_sample2_edited_path, compression="gzip") - + with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_sample1_path, - "--input", input_sample2_edited_path, - "--output", output_path]) - assert re.search(r"ValueError: Modality 'mod1' was found in more than 1 sample\.", - err.value.stdout.decode('utf-8')) - - -if __name__ == '__main__': + run_component( + [ + "--input", + input_sample1_path, + "--input", + input_sample2_edited_path, + "--output", + output_path, + ] + ) + assert re.search( + r"ValueError: Modality 'mod1' was found in more than 1 sample\.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/dataflow/split_h5mu/script.py b/src/dataflow/split_h5mu/script.py index 1a42ee0a544..607c002cb4b 100644 --- a/src/dataflow/split_h5mu/script.py +++ b/src/dataflow/split_h5mu/script.py @@ -8,21 +8,24 @@ ### VIASH START par = { - 'input': 'harmony_knn/integrated.pynndescent_knn.output', - 'modality': 'rna', - 'obs_feature': 'dataset', - 'output': 'reference_download/sample_split', - 'drop_obs_nan': "true", - 'output_compression': None, - 'output_files': 'reference_download/sample_files.csv', - 'ensure_unique_filenames': True + "input": "harmony_knn/integrated.pynndescent_knn.output", + "modality": "rna", + "obs_feature": "dataset", + "output": "reference_download/sample_split", + "drop_obs_nan": "true", + "output_compression": None, + "output_files": "reference_download/sample_files.csv", + "ensure_unique_filenames": True, } import anndata as ad -df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + +df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] +) var3 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs3 = pd.DataFrame(["C C", "C_C"], index=df.index, columns=["Obs"]) ad3 = ad.AnnData(df, obs=obs3, var=var3) -mdata = mu.MuData({'rna': ad3}) +mdata = mu.MuData({"rna": ad3}) mdata.write_h5mu("test_san.h5mu") par["input"] = "test_san.h5mu" par["obs_feature"] = "Obs" @@ -30,6 +33,7 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() @@ -44,13 +48,15 @@ def main(): obs_features = adata.obs[par["obs_feature"]].unique().tolist() # sanitize --obs_feature values - obs_features_s = [re.sub(r'[-\s]', "_", str(s).strip()) for s in obs_features] - obs_features_s = [re.sub(r'[^A-Za-z0-9_]', "", s) for s in obs_features_s] + obs_features_s = [re.sub(r"[-\s]", "_", str(s).strip()) for s in obs_features] + obs_features_s = [re.sub(r"[^A-Za-z0-9_]", "", s) for s in obs_features_s] # ensure that names are unique, if not raise or append number as suffix if not len(obs_features_s) == len(set(obs_features_s)): if not par["ensure_unique_filenames"]: - raise ValueError(f"File names are not unique after sanitizing the --obs_feature {par['obs_feature']} values") + raise ValueError( + f"File names are not unique after sanitizing the --obs_feature {par['obs_feature']} values" + ) logger.info("Ensuring unique names for par['obs_feature']") counts = defaultdict(lambda: 
-1)
@@ -69,7 +75,9 @@ def main():
    obs_files = []
    for obs_name, file_name in zip(obs_features, obs_features_s):
-        logger.info(f"Filtering modality '{par['modality']}' observations by .obs['{par['obs_feature']}'] == {obs_name}")
+        logger.info(
+            f"Filtering modality '{par['modality']}' observations by .obs['{par['obs_feature']}'] == {obs_name}"
+        )
        mdata_obs = mdata.copy()
        adata_obs = mdata_obs.mod[par["modality"]]
@@ -80,13 +88,17 @@ def main():

        # Dropping columns that only have nan values after splitting
        if par["drop_obs_nan"]:
-            logger.info(f"Dropping all .obs columns with NaN values")
-            adata_obs.obs.dropna(axis=1, how='all', inplace=True)
+            logger.info("Dropping all .obs columns with NaN values")
+            adata_obs.obs.dropna(axis=1, how="all", inplace=True)

        # replace mdata file with modality adata containing split samples
-        logger.info(f"Writing h5mu filtered for {par['obs_feature']} {obs_name} to file {output_dir / mdata_obs_name}")
+        logger.info(
+            f"Writing h5mu filtered for {par['obs_feature']} {obs_name} to file {output_dir / mdata_obs_name}"
+        )
        mdata_obs.mod[par["modality"]] = adata_obs
-        mdata_obs.write_h5mu(output_dir / mdata_obs_name, compression=par["output_compression"])
+        mdata_obs.write_h5mu(
+            output_dir / mdata_obs_name, compression=par["output_compression"]
+        )

        # avoid keeping files in memory
        del mdata_obs
@@ -98,5 +110,5 @@ def main():
    df.to_csv(par["output_files"], index=False)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
diff --git a/src/dataflow/split_h5mu/test.py b/src/dataflow/split_h5mu/test.py
index e7774de09dc..0451be26b07 100644
--- a/src/dataflow/split_h5mu/test.py
+++ b/src/dataflow/split_h5mu/test.py
@@ -11,8 +11,10 @@

@pytest.fixture
def input_modality_1():
-    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
-    obs = pd.DataFrame({'Obs': ["A", "B"], "Obs_nan": [np.nan, np.nan]}, index=df.index)
+    df = pd.DataFrame(
+        [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]
+    )
+    obs = pd.DataFrame({"Obs": ["A", "B"], "Obs_nan": [np.nan, np.nan]}, index=df.index)
    var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"])
    ad1 = ad.AnnData(df, obs=obs, var=var)
    return ad1
@@ -20,7 +22,9 @@ def input_modality_1():

@pytest.fixture
def input_modality_2():
-    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
+    df = pd.DataFrame(
+        [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]
+    )
    var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"])
    obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"])
    ad2 = ad.AnnData(df, obs=obs2, var=var2)
@@ -29,7 +33,9 @@ def input_modality_2():

@pytest.fixture
def input_modality_3():
-    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"])
+    df = pd.DataFrame(
+        [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]
+    )
    var3 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"])
    obs3 = pd.DataFrame(["C C", "C_C"], index=df.index, columns=["Obs"])
    ad3 = ad.AnnData(df, obs=obs3, var=var3)
@@ -38,7 +44,7 @@ def input_modality_3():

@pytest.fixture
def input_h5mu(input_modality_1, input_modality_2):
-    tmp_mudata = mu.MuData({'mod1': input_modality_1, 'mod2': input_modality_2})
+    tmp_mudata = mu.MuData({"mod1": input_modality_1, "mod2": input_modality_2})
    return tmp_mudata

@@ -49,12 +55,14 @@ def input_h5mu_path(write_mudata_to_file, input_h5mu):
@pytest.fixture def input_h5mu_non_unique_filenames(input_modality_3): - tmp_mudata = mu.MuData({'mod3': input_modality_3}) + tmp_mudata = mu.MuData({"mod3": input_modality_3}) return tmp_mudata @pytest.fixture -def input_h5mu_path_non_unique_filenames(write_mudata_to_file, input_h5mu_non_unique_filenames): +def input_h5mu_path_non_unique_filenames( + write_mudata_to_file, input_h5mu_non_unique_filenames +): return write_mudata_to_file(input_h5mu_non_unique_filenames) @@ -62,11 +70,16 @@ def test_sample_split(run_component, random_path, input_h5mu, input_h5mu_path): output_dir = random_path() output_files = random_path(extension="csv") args = [ - "--input", input_h5mu_path, - "--output", str(output_dir), - "--modality", "mod1", - "--obs_feature", "Obs", - "--output_files", str(output_files), + "--input", + input_h5mu_path, + "--output", + str(output_dir), + "--modality", + "mod1", + "--obs_feature", + "Obs", + "--output_files", + str(output_files), ] run_component(args) @@ -74,8 +87,11 @@ def test_sample_split(run_component, random_path, input_h5mu, input_h5mu_path): assert output_dir.is_dir() # check output dir and file names - dir_content = [h5mu_file for h5mu_file in output_dir.iterdir() - if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path] + dir_content = [ + h5mu_file + for h5mu_file in output_dir.iterdir() + if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path + ] s1_file = output_dir / f"{input_h5mu_path.stem}_A.h5mu" s2_file = output_dir / f"{input_h5mu_path.stem}_B.h5mu" assert set(dir_content) == set([s1_file, s2_file]) @@ -86,30 +102,62 @@ def test_sample_split(run_component, random_path, input_h5mu, input_h5mu_path): assert s1.n_mod == 2 assert s2.n_mod == 2 - assert s1.n_obs == input_h5mu.n_obs, "number of observations of split file does not match input file" - assert s2.n_obs == input_h5mu.n_obs, "number of observations of split file does not match input file" - - assert s1.mod["mod1"].n_obs == 1, "number of observations of split file s1 modality mod1 should equal 1" - assert s1.mod["mod2"].n_obs == input_h5mu.n_obs, "number of observations of split file s1 modality mod2 should equal input file" - - assert len(s1.mod["mod1"].obs.keys()) == 2, "number of observation keys split file s1 modality mod1 should equal 2" - assert len(s1.mod["mod2"].obs.keys()) == 1, "number of observation keys split file s1 modality mod2 should equal 1" - - assert s2.mod["mod1"].n_obs == 1, "number of observations of split file s2 modality mod1 should equal 1" - assert s2.mod["mod2"].n_obs == input_h5mu.n_obs, "number of observations of split file s2 modality mod2 should equal input file" - - assert s1.n_vars == input_h5mu.n_vars, "number of variables of split file s1 should equal input file" - assert s2.n_vars == input_h5mu.n_vars, "number of variables of split file s1 should equal input file" - - assert s1.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars, "number of variables of split file s1 modalitty mod1 should equal input file" - assert s1.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars, "number of variables of split file s1 modalitty mod2 should equal input file" - - assert s2.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars, "number of variables of split file s2 modalitty mod1 should equal input file" - assert s2.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars, "number of variables of split file s2 modalitty mod2 should equal input file" + assert ( + s1.n_obs == input_h5mu.n_obs + ), "number of observations of split file does not match input file" + 
assert (
+        s2.n_obs == input_h5mu.n_obs
+    ), "number of observations of split file does not match input file"
+
+    assert (
+        s1.mod["mod1"].n_obs == 1
+    ), "number of observations of split file s1 modality mod1 should equal 1"
+    assert (
+        s1.mod["mod2"].n_obs == input_h5mu.n_obs
+    ), "number of observations of split file s1 modality mod2 should equal input file"
+
+    assert (
+        len(s1.mod["mod1"].obs.keys()) == 2
+    ), "number of observation keys split file s1 modality mod1 should equal 2"
+    assert (
+        len(s1.mod["mod2"].obs.keys()) == 1
+    ), "number of observation keys split file s1 modality mod2 should equal 1"
+
+    assert (
+        s2.mod["mod1"].n_obs == 1
+    ), "number of observations of split file s2 modality mod1 should equal 1"
+    assert (
+        s2.mod["mod2"].n_obs == input_h5mu.n_obs
+    ), "number of observations of split file s2 modality mod2 should equal input file"
+
+    assert (
+        s1.n_vars == input_h5mu.n_vars
+    ), "number of variables of split file s1 should equal input file"
+    assert (
+        s2.n_vars == input_h5mu.n_vars
+    ), "number of variables of split file s2 should equal input file"
+
+    assert (
+        s1.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars
+    ), "number of variables of split file s1 modality mod1 should equal input file"
+    assert (
+        s1.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars
+    ), "number of variables of split file s1 modality mod2 should equal input file"
+
+    assert (
+        s2.mod["mod1"].n_vars == input_h5mu.mod["mod1"].n_vars
+    ), "number of variables of split file s2 modality mod1 should equal input file"
+    assert (
+        s2.mod["mod2"].n_vars == input_h5mu.mod["mod1"].n_vars
+    ), "number of variables of split file s2 modality mod2 should equal input file"

    # check correct sample splitting
-    assert np.all(s1.mod["mod1"].obs["Obs"] == "A"), "observation of .obs Obs in s1 should equal A"
-    assert np.all(s2.mod["mod1"].obs["Obs"] == "B"), "observation of .obs Obs in s2 should equal B"
+    assert np.all(
+        s1.mod["mod1"].obs["Obs"] == "A"
+    ), "observation of .obs Obs in s1 should equal A"
+    assert np.all(
+        s2.mod["mod1"].obs["Obs"] == "B"
+    ), "observation of .obs Obs in s2 should equal B"

    # Check contents of csv file
    expected_csv_output = dedent(
@@ -119,7 +167,7 @@ def test_sample_split(run_component, random_path, input_h5mu, input_h5mu_path):
        B,{s2_file.name}
        """
    )
-    with open(output_files, 'r') as open_csv_file:
+    with open(output_files, "r") as open_csv_file:
        result = open_csv_file.read()
    assert result == expected_csv_output

@@ -128,12 +176,18 @@ def test_sample_split_dropna(run_component, random_path, input_h5mu, input_h5mu_
    output_dir = random_path()
    output_files = random_path(extension="csv")
    args = [
-        "--input", input_h5mu_path,
-        "--output", str(output_dir),
-        "--modality", "mod1",
-        "--obs_feature", "Obs",
-        "--drop_obs_nan", "true",
-        "--output_files", str(output_files),
+        "--input",
+        input_h5mu_path,
+        "--output",
+        str(output_dir),
+        "--modality",
+        "mod1",
+        "--obs_feature",
+        "Obs",
+        "--drop_obs_nan",
+        "true",
+        "--output_files",
+        str(output_files),
    ]

    run_component(args)
@@ -146,55 +200,88 @@ def test_sample_split_dropna(run_component, random_path, input_h5mu, input_h5mu_
    s1 = mu.read_h5mu(s1_file)
    s2 = mu.read_h5mu(s2_file)

-    assert s1.n_obs == input_h5mu.n_obs, "number of observations of split file does not match input file"
-    assert s2.n_obs == input_h5mu.n_obs, "number of observations of split file does not match input file"
+    assert (
+        s1.n_obs == input_h5mu.n_obs
+    ), "number of observations of split file does not match input file"
+    assert (
+ s2.n_obs == input_h5mu.n_obs
+    ), "number of observations of split file does not match input file"
+
+    assert (
+        s1.mod["mod1"].n_obs == 1
+    ), "number of observations of split file s1 modality mod1 should equal 1"
+    assert (
+        s1.mod["mod2"].n_obs == input_h5mu.n_obs
+    ), "number of observations of split file s1 modality mod2 should equal input file"
-    assert s1.mod["mod1"].n_obs == 1, "number of observations of split file s1 modality mod1 should equal 1"
-    assert s1.mod["mod2"].n_obs == input_h5mu.n_obs, "number of observations of split file s1 modality mod2 should equal input file"
+    assert (
+        len(s1.mod["mod1"].obs.keys()) == 1
+    ), "number of observation keys split file s1 modality mod1 should equal 1"
+    assert (
+        len(s1.mod["mod2"].obs.keys()) == 1
+    ), "number of observation keys split file s1 modality mod2 should equal 1"
-    assert len(s1.mod["mod1"].obs.keys()) == 1, "number of observation keys split file s1 modality mod1 should equal 1"
-    assert len(s1.mod["mod2"].obs.keys()) == 1, "number of observation keys split file s1 modality mod2 should equal 1"
 
 
 def test_sanitizing(run_component, random_path, input_h5mu_path_non_unique_filenames):
     output_dir = random_path()
     output_files = random_path(extension="csv")
     args = [
-        "--input", input_h5mu_path_non_unique_filenames,
-        "--output", str(output_dir),
-        "--modality", "mod3",
-        "--obs_feature", "Obs",
-        "--drop_obs_nan", "true",
-        "--output_files", str(output_files)
+        "--input",
+        input_h5mu_path_non_unique_filenames,
+        "--output",
+        str(output_dir),
+        "--modality",
+        "mod3",
+        "--obs_feature",
+        "Obs",
+        "--drop_obs_nan",
+        "true",
+        "--output_files",
+        str(output_files),
     ]
     with pytest.raises(subprocess.CalledProcessError) as err:
         run_component(args)
     assert re.search(
         r"ValueError: File names are not unique after sanitizing the --obs_feature Obs values",
-        err.value.stdout.decode('utf-8'))
+        err.value.stdout.decode("utf-8"),
+    )
 
     args_san = [
-        "--input", input_h5mu_path_non_unique_filenames,
-        "--output", str(output_dir),
-        "--modality", "mod3",
-        "--obs_feature", "Obs",
-        "--drop_obs_nan", "true",
-        "--output_files", str(output_files),
-        "--ensure_unique_filenames", "true"
-    ]
+        "--input",
+        input_h5mu_path_non_unique_filenames,
+        "--output",
+        str(output_dir),
+        "--modality",
+        "mod3",
+        "--obs_feature",
+        "Obs",
+        "--drop_obs_nan",
+        "true",
+        "--output_files",
+        str(output_files),
+        "--ensure_unique_filenames",
+        "true",
+    ]
     run_component(args_san)
 
     # check output dir and file names
-    dir_content = [h5mu_file for h5mu_file in output_dir.iterdir()
-                   if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path_non_unique_filenames]
+    dir_content = [
+        h5mu_file
+        for h5mu_file in output_dir.iterdir()
+        if h5mu_file.suffix == ".h5mu"
+        and h5mu_file != input_h5mu_path_non_unique_filenames
+    ]
     s1_file = output_dir / f"{input_h5mu_path_non_unique_filenames.stem}_C_C.h5mu"
     s2_file = output_dir / f"{input_h5mu_path_non_unique_filenames.stem}_C_C_1.h5mu"
     assert s1_file.is_file(), f"{s1_file} does not exist"
     assert s2_file.is_file(), f"{s2_file} does not exist"
-    assert set(dir_content) == set([s1_file, s2_file]), "Output files do not match file names in csv"
+    assert set(dir_content) == set(
+        [s1_file, s2_file]
+    ), "Output files do not match file names in csv"
 
 
 if __name__ == "__main__":
diff --git a/src/dataflow/split_h5mu_train_test/script.py b/src/dataflow/split_h5mu_train_test/script.py
index a0d9ce17dad..551311824b7 100644
--- a/src/dataflow/split_h5mu_train_test/script.py
+++ b/src/dataflow/split_h5mu_train_test/script.py
@@ -1,7 +1,6 @@
 import mudata as mu
 from sklearn.model_selection import train_test_split
 import sys
-import logging
 
 ### VIASH START
 par = {
@@ -14,32 +13,45 @@
     "output_val": "val.h5mu",
     "output_test": "test.h5mu",
     "compression": "gzip",
-    "shuffle": True
+    "shuffle": True,
 }
 ### VIASH END
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 
 def main():
     input_mudata = mu.read_h5mu(par["input"])
     input_modality = input_mudata.mod[par["modality"]]
-    
+
     n_obs = input_modality.n_obs
-    train_idx, test_idx = train_test_split(range(n_obs), test_size=par["test_size"], random_state=par["random_state"], shuffle=par["shuffle"])
-    
+    train_idx, test_idx = train_test_split(
+        range(n_obs),
+        test_size=par["test_size"],
+        random_state=par["random_state"],
+        shuffle=par["shuffle"],
+    )
+
     if bool(par["val_size"]) != bool(par["output_val"]):
-        raise ValueError("Both --val_size and --output_val must be set to use validation set.")
-    
+        raise ValueError(
+            "Both --val_size and --output_val must be set to use validation set."
+        )
+
     elif par["val_size"] and par["output_val"]:
         if par["val_size"] + par["test_size"] > 1:
             raise ValueError("Sum of test_size and val_size must not exceed 1.")
-    
+
         val_size_relative = par["val_size"] / (1 - par["test_size"])
-        train_idx, val_idx = train_test_split(train_idx, test_size=val_size_relative, random_state=par["random_state"], shuffle=par["shuffle"])
-    
+        train_idx, val_idx = train_test_split(
+            train_idx,
+            test_size=val_size_relative,
+            random_state=par["random_state"],
+            shuffle=par["shuffle"],
+        )
+
         train_modality = input_modality[train_idx].copy()
         val_modality = input_modality[val_idx].copy()
         test_modality = input_modality[test_idx].copy()
@@ -47,16 +59,16 @@ def main():
         train_mudata = mu.MuData({par["modality"]: train_modality})
         val_mudata = mu.MuData({par["modality"]: val_modality})
         test_mudata = mu.MuData({par["modality"]: test_modality})
-    
+
         val_mudata.write_h5mu(par["output_val"], compression=par["compression"])
-    
+
     else:
         train_modality = input_modality[train_idx].copy()
         test_modality = input_modality[test_idx].copy()
 
         train_mudata = mu.MuData({par["modality"]: train_modality})
         test_mudata = mu.MuData({par["modality"]: test_modality})
-    
+
     train_mudata.write_h5mu(par["output_train"], compression=par["compression"])
     test_mudata.write_h5mu(par["output_test"], compression=par["compression"])
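The two-stage split above first carves out the test set, then re-expresses the requested validation fraction relative to what remains after that first split. A minimal sketch of the arithmetic, as an illustration only (not part of the patch), using scikit-learn's train_test_split and a hypothetical 1000-observation input:

from sklearn.model_selection import train_test_split

# Illustration: request 20% test and 10% validation overall.
test_size, val_size = 0.2, 0.1
indices = range(1000)

train_idx, test_idx = train_test_split(indices, test_size=test_size, random_state=0)
# After the first split, 1 - test_size (here 80%) of the data remains, so the
# validation fraction must be rescaled against that remainder:
val_size_relative = val_size / (1 - test_size)  # 0.1 / 0.8 = 0.125
train_idx, val_idx = train_test_split(
    train_idx, test_size=val_size_relative, random_state=0
)

assert len(test_idx) == 200  # 20% of 1000
assert len(val_idx) == 100  # 12.5% of the remaining 800, i.e. 10% overall
assert len(train_idx) == 700  # the rest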
diff --git a/src/dataflow/split_h5mu_train_test/test.py b/src/dataflow/split_h5mu_train_test/test.py
index 8d0592407e3..3d2731e7194 100644
--- a/src/dataflow/split_h5mu_train_test/test.py
+++ b/src/dataflow/split_h5mu_train_test/test.py
@@ -7,9 +7,7 @@
 
 ## VIASH START
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ## VIASH END
 
 input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu"
 
@@ -18,48 +16,70 @@
 def test_train_test(run_component, random_h5mu_path):
     output_train = random_h5mu_path()
     output_test = random_h5mu_path()
-    
-    run_component([
-        "--input", input_file,
-        "--modality", "rna",
-        "--test_size", "0.2",
-        "--output_train", output_train,
-        "--output_test", output_test,
-    ])
-    
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--modality",
+            "rna",
+            "--test_size",
+            "0.2",
+            "--output_train",
+            output_train,
+            "--output_test",
+            output_test,
+        ]
+    )
+
     assert os.path.exists(output_train), "train file does not exist"
     assert os.path.exists(output_test), "test file does not exist"
 
     input_mudata = mu.read_h5mu(input_file)
     train_mudata = mu.read_h5mu(output_train)
     test_mudata = mu.read_h5mu(output_test)
-    
+
     assert list(train_mudata.mod.keys()) == list(test_mudata.mod.keys()) == ["rna"]
-    
-    assert train_mudata.mod["rna"].n_obs + test_mudata.mod["rna"].n_obs == input_mudata.mod["rna"].n_obs, \
-        "train and test data do not sum up to input data"
-    
-    assert abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.8)) <= 1, \
-        "train data has wrong size"
-    assert abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2)) <= 1, \
-        "test data has wrong size"
+
+    assert (
+        train_mudata.mod["rna"].n_obs + test_mudata.mod["rna"].n_obs
+        == input_mudata.mod["rna"].n_obs
+    ), "train and test data do not sum up to input data"
+
+    assert (
+        abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.8))
+        <= 1
+    ), "train data has wrong size"
+    assert (
+        abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2))
+        <= 1
+    ), "test data has wrong size"
 
 
 def test_train_val_test(run_component, random_h5mu_path):
     output_train = random_h5mu_path()
     output_val = random_h5mu_path()
     output_test = random_h5mu_path()
-    
-    run_component([
-        "--input", input_file,
-        "--modality", "rna",
-        "--test_size", "0.2",
-        "--val_size", "0.1",
-        "--output_train", output_train,
-        "--output_val", output_val,
-        "--output_test", output_test,
-    ])
-    
+
+    run_component(
+        [
+            "--input",
+            input_file,
+            "--modality",
+            "rna",
+            "--test_size",
+            "0.2",
+            "--val_size",
+            "0.1",
+            "--output_train",
+            output_train,
+            "--output_val",
+            output_val,
+            "--output_test",
+            output_test,
+        ]
+    )
+
     assert os.path.exists(output_train), "train file does not exist"
     assert os.path.exists(output_val), "val file does not exist"
     assert os.path.exists(output_test), "test file does not exist"
@@ -68,49 +88,86 @@ def test_train_val_test(run_component, random_h5mu_path):
     train_mudata = mu.read_h5mu(output_train)
     val_mudata = mu.read_h5mu(output_val)
     test_mudata = mu.read_h5mu(output_test)
-    
-    assert list(train_mudata.mod.keys()) == list(val_mudata.mod.keys()) == list(test_mudata.mod.keys()) == ["rna"]
-    
-    assert train_mudata.mod["rna"].n_obs + val_mudata.mod["rna"].n_obs + test_mudata.mod["rna"].n_obs == input_mudata.mod["rna"].n_obs, \
-        "train, val and test data do not sum up to input data"
-    
-    assert abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.7)) <= 1, \
-        "train data has wrong size"
-    assert abs(val_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.1)) <= 1, \
-        "val data has wrong size"
-    assert abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2)) <= 1, \
-        "test data has wrong size"
+
+    assert (
+        list(train_mudata.mod.keys())
+        == list(val_mudata.mod.keys())
+        == list(test_mudata.mod.keys())
+        == ["rna"]
+    )
+
+    assert (
+        train_mudata.mod["rna"].n_obs
+        + val_mudata.mod["rna"].n_obs
+        + test_mudata.mod["rna"].n_obs
+        == input_mudata.mod["rna"].n_obs
+    ), "train, val and test data do not sum up to input data"
+
+    assert (
+        abs(train_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.7))
+        <= 1
+    ), "train data has wrong size"
+    assert (
+        abs(val_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.1))
+        <= 1
+    ), "val data has wrong size"
+    assert (
+        abs(test_mudata.mod["rna"].n_obs - round(input_mudata.mod["rna"].n_obs * 0.2))
+        <= 1
+    ), "test data has wrong size"
+
 
 def test_raise_test_val_size(run_component):
     with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input", input_file,
-            "--modality", "rna",
-            "--test_size", "0.9",
-            "--val_size", "0.5",
-            "--output_train", "train.h5mu",
-            "--output_val", "val.h5mu",
-            "--output_test", "test.h5mu",
-        ])
-    
-    assert re.search(r"Sum of test_size and val_size must not exceed 1.",
-                     err.value.stdout.decode('utf-8'))
-    
-
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--modality",
+                "rna",
+                "--test_size",
+                "0.9",
+                "--val_size",
+                "0.5",
+                "--output_train",
+                "train.h5mu",
+                "--output_val",
+                "val.h5mu",
+                "--output_test",
+                "test.h5mu",
+            ]
+        )
+
+    assert re.search(
+        r"Sum of test_size and val_size must not exceed 1.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
 def test_raise_invalid_val_out(run_component, random_h5mu_path):
-    with pytest.raises(subprocess.CalledProcessError) as err:
-        run_component([
-            "--input", input_file,
-            "--modality", "rna",
-            "--test_size", "0.2",
-            "--val_size", "0.1",
-            "--output_train", "train.h5mu",
-            "--output_test", "test.h5mu",
-        ])
-    
-    assert re.search(r"Both --val_size and --output_val must be set to use validation set.",
-                     err.value.stdout.decode('utf-8'))
-
-if __name__ == '__main__':
-    sys.exit(pytest.main([__file__]))
\ No newline at end of file
+    with pytest.raises(subprocess.CalledProcessError) as err:
+        run_component(
+            [
+                "--input",
+                input_file,
+                "--modality",
+                "rna",
+                "--test_size",
+                "0.2",
+                "--val_size",
+                "0.1",
+                "--output_train",
+                "train.h5mu",
+                "--output_test",
+                "test.h5mu",
+            ]
+        )
+
+    assert re.search(
+        r"Both --val_size and --output_val must be set to use validation set.",
+        err.value.stdout.decode("utf-8"),
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
diff --git a/src/dataflow/split_modalities/script.py b/src/dataflow/split_modalities/script.py
index 9ca3d93ecd9..198baf68141 100644
--- a/src/dataflow/split_modalities/script.py
+++ b/src/dataflow/split_modalities/script.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import sys
 import mudata as md
-from sys import stdout
 from pathlib import Path
 import pandas as pd
 
@@ -12,15 +11,15 @@
     "output_types": "foo_types.csv",
     "output_compression": "gzip",
 }
-meta = {
-    "resources_dir": "."
-}
+meta = {"resources_dir": "."}
 ### VIASH END
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
+
 def main() -> None:
     output_dir = Path(par["output"])
     logger.info("Creating output directory '%s' if it does not exist", output_dir)
@@ -28,30 +27,39 @@ def main() -> None:
         logger.info("Creating %s", output_dir)
         output_dir.mkdir(parents=True)
 
-    logger.info("Reading input file '%s'", par['input'])
+    logger.info("Reading input file '%s'", par["input"])
     input_file = Path(par["input"].strip())
     sample = md.read_h5mu(input_file)
-    
-    logger.info('Creating output types CSV.')
+
+    logger.info("Creating output types CSV.")
     modalities = list(sample.mod.keys())
     logger.info("Found the following modalities:\n%s", "\n".join(modalities))
-    names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu"
-             for mod_name in modalities}
+    names = {mod_name: f"{input_file.stem}_{mod_name}.h5mu" for mod_name in modalities}
    output_files = list(names.values())
-    logger.info("Will be creating the following output .h5mu files:\n%s", "\n".join(output_files))
+    logger.info(
+        "Will be creating the following output .h5mu files:\n%s",
+        "\n".join(output_files),
+    )
     df = pd.DataFrame({"name": modalities, "filename": output_files})
     logger.info("Writing output_types CSV file to '%s'.", par["output_types"])
     df.to_csv(par["output_types"], index=False)
 
-    logger.info('Splitting input file into unimodal output files.')
+    logger.info("Splitting input file into unimodal output files.")
     for mod_name, mod in sample.mod.items():
         logger.info("Processing modality '%s'", mod_name)
         new_sample = md.MuData({mod_name: mod})
-        logger.info("Writing to '%s', with compression '%s'", names[mod_name], par["output_compression"])
-        new_sample.write_h5mu(output_dir / names[mod_name], compression=par["output_compression"])
+        logger.info(
+            "Writing to '%s', with compression '%s'",
+            names[mod_name],
+            par["output_compression"],
+        )
+        new_sample.write_h5mu(
+            output_dir / names[mod_name], compression=par["output_compression"]
+        )
         logger.info("Done writing output file.")
 
     logger.info("Finished")
 
+
 if __name__ == "__main__":
     main()
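For reference, the script above writes one unimodal .h5mu file per modality and a CSV pairing each modality name with its output file name. For a hypothetical input sample.h5mu containing modalities rna and prot (illustration only), the output_types CSV would read:

name,filename
rna,sample_rna.h5mu
prot,sample_prot.h5mu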
index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = ad.AnnData(df, obs=obs2, var=var2) @@ -37,7 +41,7 @@ def input_modality_2(): @pytest.fixture def input_h5mu(input_modality_1, input_modality_2): - tmp_mudata = md.MuData({'mod1': input_modality_1, 'mod2': input_modality_2}) + tmp_mudata = md.MuData({"mod1": input_modality_1, "mod2": input_modality_2}) return tmp_mudata @@ -45,15 +49,26 @@ def input_h5mu(input_modality_1, input_modality_2): def input_h5mu_path(write_mudata_to_file, input_h5mu): return write_mudata_to_file(input_h5mu) + @pytest.mark.parametrize("compression", ["gzip", None]) -def test_split(run_component, random_path, input_h5mu, input_h5mu_path, - input_modality_1, input_modality_2, compression): +def test_split( + run_component, + random_path, + input_h5mu, + input_h5mu_path, + input_modality_1, + input_modality_2, + compression, +): output_dir = random_path() output_types = random_path(extension="csv") args = [ - "--input", input_h5mu_path, - "--output", str(output_dir), - "--output_types", str(output_types), + "--input", + input_h5mu_path, + "--output", + str(output_dir), + "--output_types", + str(output_types), ] if compression: args += ["--output_compression", compression] @@ -62,8 +77,11 @@ def test_split(run_component, random_path, input_h5mu, input_h5mu_path, assert output_dir.is_dir() # check output dir - dir_content = [h5mu_file for h5mu_file in output_dir.iterdir() - if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path] + dir_content = [ + h5mu_file + for h5mu_file in output_dir.iterdir() + if h5mu_file.suffix == ".h5mu" and h5mu_file != input_h5mu_path + ] mod1_file = output_dir / f"{input_h5mu_path.stem}_mod1.h5mu" mod2_file = output_dir / f"{input_h5mu_path.stem}_mod2.h5mu" assert set(dir_content) == set([mod1_file, mod2_file]) @@ -72,8 +90,8 @@ def test_split(run_component, random_path, input_h5mu, input_h5mu_path, assert mod1.n_mod == 1 assert mod2.n_mod == 1 - assert_annotation_objects_equal(mod1.mod['mod1'], input_modality_1) - assert_annotation_objects_equal(mod2.mod['mod2'], input_modality_2) + assert_annotation_objects_equal(mod1.mod["mod1"], input_modality_1) + assert_annotation_objects_equal(mod2.mod["mod2"], input_modality_2) assert mod1.n_obs == input_h5mu.n_obs assert mod2.n_obs == input_h5mu.n_obs @@ -81,11 +99,13 @@ def test_split(run_component, random_path, input_h5mu, input_h5mu_path, # When a var_key is only present for one modality, it is prefixed by the name of the # modality followed by a colon and the name of the key (in the global .var). 
replace_regex = r"(^mod1:|^mod2:)" - expected_var_keys = {re.sub(replace_regex, "", col_name) for col_name in input_h5mu.var_keys()} + expected_var_keys = { + re.sub(replace_regex, "", col_name) for col_name in input_h5mu.var_keys() + } assert set(mod1.var_keys()) | set(mod2.var_keys()) == expected_var_keys - assert set(mod1.var_keys()) == set(input_h5mu.mod['mod1'].var.columns) - assert set(mod2.var_keys()) == set(input_h5mu.mod['mod2'].var.columns) + assert set(mod1.var_keys()) == set(input_h5mu.mod["mod1"].var.columns) + assert set(mod2.var_keys()) == set(input_h5mu.mod["mod2"].var.columns) expected_csv_output = dedent( f"""\ @@ -94,9 +114,10 @@ def test_split(run_component, random_path, input_h5mu, input_h5mu_path, mod2,{mod2_file.name} """ ) - with open(output_types, 'r') as open_csv_file: + with open(output_types, "r") as open_csv_file: result = open_csv_file.read() assert result == expected_csv_output + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/demux/cellranger_atac_mkfastq/test.py b/src/demux/cellranger_atac_mkfastq/test.py index 3a64237b19e..a854be12e5c 100644 --- a/src/demux/cellranger_atac_mkfastq/test.py +++ b/src/demux/cellranger_atac_mkfastq/test.py @@ -3,15 +3,13 @@ import pytest ## VIASH START -meta = { - "name": "cellranger_mkfastq", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_mkfastq", "resources_dir": "resources_test"} ## VIASH END input_dir = Path(meta["resources_dir"]) / "cellranger_atac_tiny_bcl/bcl" sample_sheet = Path(meta["resources_dir"]) / "cellranger_atac_tiny_bcl/bcl/layout.csv" + def test_run(run_component, tmp_path): output = tmp_path / "output" @@ -20,9 +18,12 @@ def test_run(run_component, tmp_path): print("Sample sheet exists: ", sample_sheet.is_file()) cmd_pars = [ - "--input", str(input_dir), - "--csv", str(sample_sheet), - "--output", str(output) + "--input", + str(input_dir), + "--csv", + str(sample_sheet), + "--output", + str(output), ] run_component(cmd_pars) @@ -32,4 +33,4 @@ def test_run(run_component, tmp_path): if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/demux/cellranger_mkfastq/test.py b/src/demux/cellranger_mkfastq/test.py index 64621f0f550..44870f6a590 100644 --- a/src/demux/cellranger_mkfastq/test.py +++ b/src/demux/cellranger_mkfastq/test.py @@ -3,22 +3,23 @@ import pytest ## VIASH START -meta = { - "name": "cellranger_mkfastq", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_mkfastq", "resources_dir": "resources_test"} ## VIASH END input = meta["resources_dir"] + "/cellranger_tiny_bcl/bcl" sample_sheet = meta["resources_dir"] + "/cellranger_tiny_bcl/bcl/sample_sheet.csv" + def test_run(run_component, tmp_path): output = tmp_path / "output" cmd_pars = [ - "--input", input, - "--sample_sheet", sample_sheet, - "--output", str(output) + "--input", + input, + "--sample_sheet", + sample_sheet, + "--output", + str(output), ] run_component(cmd_pars) @@ -28,4 +29,4 @@ def test_run(run_component, tmp_path): if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/densmap/script.py b/src/dimred/densmap/script.py index 64543a8ad46..520c15ee012 100644 --- a/src/dimred/densmap/script.py +++ b/src/dimred/densmap/script.py @@ -1,84 +1,90 @@ from umap import UMAP import mudata as mu import sys -import anndata as ad ## VIASH START par = { - 'input': 
'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', - 'modality': 'rna', - 'output': 'output.h5mu', - 'obsm_output': 'X_densmap', - 'lambda': 2.0, - 'fraction': 0.3, - 'var_shift': 0.1 + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "output": "output.h5mu", + "obsm_output": "X_densmap", + "lambda": 2.0, + "fraction": 0.3, + "var_shift": 0.1, } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s", par["input"]) mdata = mu.read_h5mu(par["input"]) -if par['modality'] not in mdata.mod: +if par["modality"] not in mdata.mod: raise ValueError(f"Modality '{par['modality']}' not found in the input data.") -logger.info("Computing densMAP for modality '%s'", par['modality']) -data = mdata.mod[par['modality']] +logger.info("Computing densMAP for modality '%s'", par["modality"]) +data = mdata.mod[par["modality"]] neigh_key = par["uns_neighbors"] if neigh_key not in data.uns: - raise ValueError(f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first.") + raise ValueError( + f"'{neigh_key}' was not found in .mod['{par['modality']}'].uns. Set the correct key or run 'find_neighbors' first." + ) -temp_uns = { neigh_key: data.uns[neigh_key] } +temp_uns = {neigh_key: data.uns[neigh_key]} -if 'use_rep' not in temp_uns[neigh_key]['params']: - raise ValueError(f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first.") +if "use_rep" not in temp_uns[neigh_key]["params"]: + raise ValueError( + f"'use_rep' was not found in .mod['{par['modality']}'].uns['{neigh_key}'].params. Set the correct key or run PCA first." 
+ ) X_densmap = UMAP( - min_dist=par["min_dist"], - spread=par["spread"], - n_components=par["num_components"], - n_epochs=par["max_iter"], - learning_rate=par["alpha"], - repulsion_strength=par["gamma"], - negative_sample_rate=par["negative_sample_rate"], - init=par["init_pos"], - metric=data.uns["neighbors"].get("metric", "euclidean"), - metric_kwds=data.uns["neighbors"].get("metric_kwds", {}), - densmap=True, - dens_lambda=par["lambda"], - dens_frac=par["fraction"], - dens_var_shift=par["var_shift"], + min_dist=par["min_dist"], + spread=par["spread"], + n_components=par["num_components"], + n_epochs=par["max_iter"], + learning_rate=par["alpha"], + repulsion_strength=par["gamma"], + negative_sample_rate=par["negative_sample_rate"], + init=par["init_pos"], + metric=data.uns["neighbors"].get("metric", "euclidean"), + metric_kwds=data.uns["neighbors"].get("metric_kwds", {}), + densmap=True, + dens_lambda=par["lambda"], + dens_frac=par["fraction"], + dens_var_shift=par["var_shift"], ).fit_transform(data.obsm[par["obsm_pca"]]) -logger.info(f"Writing densMAP embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]") -data.obsm[par['obsm_output']] = X_densmap +logger.info( + f"Writing densMAP embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = X_densmap logger.info(f"Writing densMAP metadata to .mod[{par['modality']}].uns['densmap']") -data.uns['densmap'] = { - 'params': { - 'min_dist': par["min_dist"], - 'spread': par["spread"], - 'n_components': par["num_components"], - 'n_epochs': par["max_iter"], - 'learning_rate': par["alpha"], - 'repulsion_strength': par["gamma"], - 'negative_sample_rate': par["negative_sample_rate"], - 'init': par["init_pos"], - 'metric': data.uns["neighbors"].get("metric", "euclidean"), - 'metric_kwds': data.uns["neighbors"].get("metric_kwds", {}), - 'dens_lambda': par["lambda"], - 'dens_frac': par["fraction"], - 'dens_var_shift': par["var_shift"], - } +data.uns["densmap"] = { + "params": { + "min_dist": par["min_dist"], + "spread": par["spread"], + "n_components": par["num_components"], + "n_epochs": par["max_iter"], + "learning_rate": par["alpha"], + "repulsion_strength": par["gamma"], + "negative_sample_rate": par["negative_sample_rate"], + "init": par["init_pos"], + "metric": data.uns["neighbors"].get("metric", "euclidean"), + "metric_kwds": data.uns["neighbors"].get("metric_kwds", {}), + "dens_lambda": par["lambda"], + "dens_frac": par["fraction"], + "dens_var_shift": par["var_shift"], + } } logger.info("Writing to %s.", par["output"]) mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/dimred/densmap/test.py b/src/dimred/densmap/test.py index 5c4b7089e87..6008076f8e2 100644 --- a/src/dimred/densmap/test.py +++ b/src/dimred/densmap/test.py @@ -7,74 +7,103 @@ ## VIASH START meta = { - 'executable': './target/docker/dimred/densmap/densmap', - 'resources_dir': './resources_test/', - 'config': './src/dimred/densmap/config.vsh.yaml' + "executable": "./target/docker/dimred/densmap/densmap", + "resources_dir": "./resources_test/", + "config": "./src/dimred/densmap/config.vsh.yaml", } ## VIASH END input_path = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_densmap(run_component, random_h5mu_path): output_path = random_h5mu_path() args = [ - "--input", input_path, - "--output", output_path, - "--modality", "rna", - "--obsm_pca", "X_pca", - 
"--output_compression", "gzip" + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--obsm_pca", + "X_pca", + "--output_compression", + "gzip", ] run_component(args) - + assert output_path.is_file(), "No output was created." output_mudata = read_h5mu(output_path) input_mudata = read_h5mu(input_path) - + # check whether densmap was found and remove for comparison - assert "X_densmap" in output_mudata.mod["rna"].obsm, "Check whether output was found in .obsm" - assert "densmap" in output_mudata.mod["rna"].uns, "Check whether output was found in .uns" + assert ( + "X_densmap" in output_mudata.mod["rna"].obsm + ), "Check whether output was found in .obsm" + assert ( + "densmap" in output_mudata.mod["rna"].uns + ), "Check whether output was found in .uns" output_mudata.mod["rna"].obsm.pop("X_densmap") output_mudata.mod["rna"].uns.pop("densmap") assert_annotation_objects_equal(output_mudata, input_mudata) - + + def test_densmap_custom_obsm_output(run_component, random_h5mu_path): output_path = random_h5mu_path() args = [ - "--input", input_path, - "--output", output_path, - "--modality", "rna", - "--obsm_pca", "X_pca", - "--output_compression", "gzip", - "--obsm_output", "X_custom_densmap" + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--obsm_pca", + "X_pca", + "--output_compression", + "gzip", + "--obsm_output", + "X_custom_densmap", ] run_component(args) - + assert output_path.is_file(), "No output was created." output_mudata = read_h5mu(output_path) input_mudata = read_h5mu(input_path) - + # check whether tsne was found and remove for comparison - assert "X_custom_densmap" in output_mudata.mod["rna"].obsm, "Check whether output was found in .obsm" - assert "densmap" in output_mudata.mod["rna"].uns, "Check whether output was found in .uns" + assert ( + "X_custom_densmap" in output_mudata.mod["rna"].obsm + ), "Check whether output was found in .obsm" + assert ( + "densmap" in output_mudata.mod["rna"].uns + ), "Check whether output was found in .uns" output_mudata.mod["rna"].obsm.pop("X_custom_densmap") output_mudata.mod["rna"].uns.pop("densmap") assert_annotation_objects_equal(output_mudata, input_mudata) - + + def test_densmap_no_neighbors_raise(run_component, random_h5mu_path): output_path = random_h5mu_path() args = [ - "--input", input_path, - "--output", output_path, - "--obsm_pca", "X_pca", - "--modality", "prot", - "--output_compression", "gzip" + "--input", + input_path, + "--output", + output_path, + "--obsm_pca", + "X_pca", + "--modality", + "prot", + "--output_compression", + "gzip", ] - + with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) - assert re.search(r"ValueError: 'neighbors' was not found in .mod\['prot'\].uns.", - err.value.stdout.decode('utf-8')) - - + assert re.search( + r"ValueError: 'neighbors' was not found in .mod\['prot'\].uns.", + err.value.stdout.decode("utf-8"), + ) + + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/lsi/script.py b/src/dimred/lsi/script.py index 600278100b4..8c76b86d595 100644 --- a/src/dimred/lsi/script.py +++ b/src/dimred/lsi/script.py @@ -7,18 +7,17 @@ ## VIASH START par = { - "num_components": 50, # number of components to calculate with SVD - "scale_embeddings": True, # scale embeddings to zero mean and unit variance - "modality": "atac", # on which modality the LSI should be run - "layer": None, # on which layer to run the LSI, if None, will run 
it on anndata.X - "var_input": None, # column in anndata.var of the highly variable features - - "overwrite": True, + "num_components": 50, # number of components to calculate with SVD + "scale_embeddings": True, # scale embeddings to zero mean and unit variance + "modality": "atac", # on which modality the LSI should be run + "layer": None, # on which layer to run the LSI, if None, will run it on anndata.X + "var_input": None, # column in anndata.var of the highly variable features + "overwrite": True, "obsm_output": "X_lsi", "varm_output": "LSI", "uns_output": "lsi", "output": "output.h5mu", - "output_compression": "gzip" + "output_compression": "gzip", } ## VIASH END @@ -28,66 +27,87 @@ from setup_logger import setup_logger + logger = setup_logger() -#1.read in mudata +# 1.read in mudata logger.info("Reading %s.", par["input"]) mdata = md.read_h5mu(par["input"]) -#2. subset on modality +# 2. subset on modality if par["modality"] not in mdata.mod: - raise ValueError(f"Modality '{par['modality']}' was not found in mudata {par['input']}.") -adata = mdata.mod[par['modality']] - - -#3. Specify layer -if par['layer'] and par["layer"] not in adata.layers: - raise ValueError(f"Layer '{par['layer']}' was not found in modality '{par['modality']}'.") -layer = adata.X if not par['layer'] else adata.layers[par['layer']] + raise ValueError( + f"Modality '{par['modality']}' was not found in mudata {par['input']}." + ) +adata = mdata.mod[par["modality"]] + + +# 3. Specify layer +if par["layer"] and par["layer"] not in adata.layers: + raise ValueError( + f"Layer '{par['layer']}' was not found in modality '{par['modality']}'." + ) +layer = adata.X if not par["layer"] else adata.layers[par["layer"]] adata_input_layer = AnnData(layer, var=adata.var) if not par["layer"]: - logger.info("Using modality '%s' and adata.X for LSI computation", par['modality']) + logger.info("Using modality '%s' and adata.X for LSI computation", par["modality"]) else: - logger.info("Using modality '%s' and layer '%s' for LSI computation", par['modality'], par["layer"]) + logger.info( + "Using modality '%s' and layer '%s' for LSI computation", + par["modality"], + par["layer"], + ) -#4. Subset on highly variable features if applicable +# 4. Subset on highly variable features if applicable if par["var_input"]: adata_input_layer = subset_vars(adata_input_layer, par["var_input"]) - -#5. Run LSI -logger.info("Computing %s LSI components on %s features", par["num_components"], adata_input_layer.X.shape[1]) -mu.atac.tl.lsi(adata_input_layer, scale_embeddings = par["scale_embeddings"], n_comps = par["num_components"]) - +# 5. Run LSI +logger.info( + "Computing %s LSI components on %s features", + par["num_components"], + adata_input_layer.X.shape[1], +) +mu.atac.tl.lsi( + adata_input_layer, + scale_embeddings=par["scale_embeddings"], + n_comps=par["num_components"], +) -#6. Store output in object +# 6. Store output in object check_exist_dict = { "obsm_output": ("obsm"), "varm_output": ("varm"), - "uns_output": ("uns") + "uns_output": ("uns"), } for parameter_name, field in check_exist_dict.items(): if par[parameter_name] in getattr(adata, field): if not par["overwrite"]: - raise ValueError(f"Requested to create field {par[parameter_name]} in .{field} " - f"for modality {par['modality']}, but field already exists.") + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." 
+ ) del getattr(adata, field)[par[parameter_name]] -adata.obsm[par["obsm_output"]] = adata_input_layer.obsm['X_lsi'] -adata.uns[par["uns_output"]] = adata_input_layer.uns['lsi'] +adata.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_lsi"] +adata.uns[par["uns_output"]] = adata_input_layer.uns["lsi"] if par["var_input"]: - adata.varm[par["varm_output"]] = np.zeros(shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1])) - adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = adata_input_layer.varm['LSI'] + adata.varm[par["varm_output"]] = np.zeros( + shape=(adata.n_vars, adata_input_layer.varm["LSI"].shape[1]) + ) + adata.varm[par["varm_output"]][adata.var[par["var_input"]]] = ( + adata_input_layer.varm["LSI"] + ) else: - adata.varm[par["varm_output"]] = adata_input_layer.varm['LSI'] + adata.varm[par["varm_output"]] = adata_input_layer.varm["LSI"] logger.info("Writing to %s.", par["output"]) -mdata.write(filename = par["output"], compression=par["output_compression"]) +mdata.write(filename=par["output"], compression=par["output_compression"]) logger.info("Finished") diff --git a/src/dimred/lsi/test.py b/src/dimred/lsi/test.py index f6d293c6535..510d49a7682 100644 --- a/src/dimred/lsi/test.py +++ b/src/dimred/lsi/test.py @@ -6,196 +6,241 @@ ## VIASH START meta = { - 'resources_dir': 'resources_test', - 'executable': './target/docker/dimred/lsi/lsi', - 'config': './src/dimred/lsi/config.vsh.yaml' + "resources_dir": "resources_test", + "executable": "./target/docker/dimred/lsi/lsi", + "config": "./src/dimred/lsi/config.vsh.yaml", } ## VIASH END input_path = f"{meta['resources_dir']}/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset_unique_obs.h5mu" -''' +""" Tests: 1. general test 2. test HVF 3. test modality 4. test layer 5. 
test overwrite -''' +""" + @pytest.fixture def atac_mudata(tmp_path): - mdata = mu.read_h5mu(input_path) mdata.mod["atac"].layers["counts"] = mdata.mod["atac"].X - mdata.mod["atac"].var["highly_variable"] = np.random.choice([True, False], size=mdata.mod["atac"].n_vars) + mdata.mod["atac"].var["highly_variable"] = np.random.choice( + [True, False], size=mdata.mod["atac"].n_vars + ) print(mdata) mdata.write(tmp_path / "atac_mudata.h5mu") return tmp_path / "atac_mudata.h5mu" + # 1.general test def test_lsi(run_component, tmp_path): output_path = tmp_path / "output_lsi.h5mu" - + cmd_args = [ - "--input", input_path, - "--output", str(output_path), - "--obsm_output", "X_test", - "--num_components", "30" + "--input", + input_path, + "--output", + str(output_path), + "--obsm_output", + "X_test", + "--num_components", + "30", ] - run_component(cmd_args) + run_component(cmd_args) assert output_path.is_file() data = mu.read_h5mu(output_path) - assert "X_test" in data.mod['atac'].obsm + assert "X_test" in data.mod["atac"].obsm assert data.mod["atac"].obsm["X_test"].shape == (data.mod["atac"].n_obs, 30) - assert "lsi" in data.mod['atac'].uns - assert "lsi" in data.mod['atac'].varm + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm - -# 2.test HVF +# 2.test HVF def test_select_highly_variable_column(run_component, random_h5mu_path, atac_mudata): output_path = random_h5mu_path() # run component cmd_args = [ - "--input", str(atac_mudata), - "--output", str(output_path), - "--var_input", "highly_variable" + "--input", + str(atac_mudata), + "--output", + str(output_path), + "--var_input", + "highly_variable", ] run_component(cmd_args) - + assert output_path.is_file() data = mu.read_h5mu(output_path) - assert "X_lsi" in data.mod['atac'].obsm + assert "X_lsi" in data.mod["atac"].obsm assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 50) assert "highly_variable" in data.mod["atac"].var.columns - assert "lsi" in data.mod['atac'].uns - assert "lsi" in data.mod['atac'].varm + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm assert data.mod["atac"].varm["lsi"].shape == (data.mod["atac"].n_vars, 50) def test_highly_variable_column_does_not_exist_raises(run_component): with pytest.raises(subprocess.CalledProcessError) as err: cmd_args = [ - "--input", input_path, - "--output", "output_lsi.h5mu", - "--var_input", "does_not_exist" + "--input", + input_path, + "--output", + "output_lsi.h5mu", + "--var_input", + "does_not_exist", ] run_component(cmd_args) - assert "ValueError: Requested to use .var column 'does_not_exist' as a selection of genes, but the column is not available." in \ - err.value.stdout.decode('utf-8') - + assert ( + "ValueError: Requested to use .var column 'does_not_exist' as a selection of genes, but the column is not available." + in err.value.stdout.decode("utf-8") + ) + # 3.test modality def test_modality_does_not_exist_raises(run_component): with pytest.raises(subprocess.CalledProcessError) as err: cmd_args = [ - "--input", input_path, - "--output", "output_lsi.h5mu", - "--modality", "does_not_exist" + "--input", + input_path, + "--output", + "output_lsi.h5mu", + "--modality", + "does_not_exist", ] run_component(cmd_args) - - assert "ValueError: Modality 'does_not_exist' was not found in mudata " + input_path +"." in \ - err.value.stdout.decode('utf-8') + assert ( + "ValueError: Modality 'does_not_exist' was not found in mudata " + + input_path + + "." 
+ in err.value.stdout.decode("utf-8") + ) -# 4.test layer +# 4.test layer def test_selecting_input_layer(run_component, atac_mudata, tmp_path): output_path = tmp_path / "output_lsi.h5mu" # run component cmd_args = [ - "--input", str(atac_mudata), - "--output", str(output_path), - "--num_components", "20", - "--layer", "counts" - ] + "--input", + str(atac_mudata), + "--output", + str(output_path), + "--num_components", + "20", + "--layer", + "counts", + ] run_component(cmd_args) - assert output_path.is_file() data = mu.read_h5mu(output_path) assert "counts" in data.mod["atac"].layers - assert "X_lsi" in data.mod['atac'].obsm + assert "X_lsi" in data.mod["atac"].obsm assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 20) - assert "lsi" in data.mod['atac'].uns - assert "lsi" in data.mod['atac'].varm - + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm def test_raise_if_input_layer_is_missing(run_component): with pytest.raises(subprocess.CalledProcessError) as err: cmd_args = [ - "--input", input_path, - "--output", "output.h5mu", - "--layer", "does_not_exist" + "--input", + input_path, + "--output", + "output.h5mu", + "--layer", + "does_not_exist", ] run_component(cmd_args) - - assert "ValueError: Layer 'does_not_exist' was not found in modality 'atac'." in \ - err.value.stdout.decode('utf-8') + + assert ( + "ValueError: Layer 'does_not_exist' was not found in modality 'atac'." + in err.value.stdout.decode("utf-8") + ) +# 5.test overwrite -# 5.test overwrite def test_output_field_already_present_raises(run_component, tmp_path): output_path = tmp_path / "output_lsi.h5mu" - #create slots + # create slots input_data = mu.read_h5mu(input_path) - input_data.mod["atac"].varm["lsi"] = np.zeros(shape=(input_data.mod["atac"].n_vars, 50)) - input_data.mod["atac"].obsm["X_lsi"] = np.zeros(shape=(input_data.mod["atac"].n_obs, 50)) - input_data.mod["atac"].uns['lsi'] = "test" + input_data.mod["atac"].varm["lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_vars, 50) + ) + input_data.mod["atac"].obsm["X_lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_obs, 50) + ) + input_data.mod["atac"].uns["lsi"] = "test" tmp_file = tmp_path / "input_data_adjusted.h5mu" input_data.write_h5mu(tmp_file) with pytest.raises(subprocess.CalledProcessError) as err: cmd_args = [ - "--input", str(tmp_file), - "--output", str(output_path), - "--output_compression", "gzip" + "--input", + str(tmp_file), + "--output", + str(output_path), + "--output_compression", + "gzip", ] run_component(cmd_args) - - assert "ValueError: Requested to create field X_lsi in .obsm for " \ - "modality atac, but field already exists." in \ - err.value.stdout.decode('utf-8') + + assert ( + "ValueError: Requested to create field X_lsi in .obsm for " + "modality atac, but field already exists." 
in err.value.stdout.decode("utf-8") + ) + def test_output_field_already_present_overwrite(run_component, tmp_path): output_path = tmp_path / "output_lsi.h5mu" - #create slots + # create slots input_data = mu.read_h5mu(input_path) - input_data.mod["atac"].varm["lsi"] = np.zeros(shape=(input_data.mod["atac"].n_vars, 50)) - input_data.mod["atac"].obsm["X_lsi"] = np.zeros(shape=(input_data.mod["atac"].n_obs, 50)) - input_data.mod["atac"].uns['lsi'] = "test" + input_data.mod["atac"].varm["lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_vars, 50) + ) + input_data.mod["atac"].obsm["X_lsi"] = np.zeros( + shape=(input_data.mod["atac"].n_obs, 50) + ) + input_data.mod["atac"].uns["lsi"] = "test" tmp_file = tmp_path / "input_data_adjusted.h5mu" input_data.write_h5mu(tmp_file) cmd_args = [ - "--input", str(tmp_file), - "--output", str(output_path), - "--output_compression", "gzip", + "--input", + str(tmp_file), + "--output", + str(output_path), + "--output_compression", + "gzip", "--overwrite", - "--num_components", "30" + "--num_components", + "30", ] run_component(cmd_args) assert output_path.is_file() data = mu.read_h5mu(output_path) - assert "X_lsi" in data.mod['atac'].obsm + assert "X_lsi" in data.mod["atac"].obsm assert data.mod["atac"].obsm["X_lsi"].shape == (data.mod["atac"].n_obs, 30) - assert "lsi" in data.mod['atac'].uns - assert "lsi" in data.mod['atac'].varm + assert "lsi" in data.mod["atac"].uns + assert "lsi" in data.mod["atac"].varm + -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/pca/script.py b/src/dimred/pca/script.py index 6d994a37f04..919de031dee 100644 --- a/src/dimred/pca/script.py +++ b/src/dimred/pca/script.py @@ -21,34 +21,37 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s.", par["input"]) mdata = mu.read_h5mu(par["input"]) -logger.info("Computing PCA components for modality '%s'", par['modality']) -data = mdata.mod[par['modality']] -if par['layer'] and par['layer'] not in data.layers: +logger.info("Computing PCA components for modality '%s'", par["modality"]) +data = mdata.mod[par["modality"]] +if par["layer"] and par["layer"] not in data.layers: raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.") -layer = data.X if not par['layer'] else data.layers[par['layer']] +layer = data.X if not par["layer"] else data.layers[par["layer"]] adata_input_layer = AnnData(layer) adata_input_layer.var.index = data.var.index use_highly_variable = False if par["var_input"]: - if not par["var_input"] in data.var.columns: - raise ValueError(f"Requested to use .var column {par['var_input']} " - "as a selection of genes to run the PCA on, " - f"but the column is not available for modality {par['modality']}") + if par["var_input"] not in data.var.columns: + raise ValueError( + f"Requested to use .var column {par['var_input']} " + "as a selection of genes to run the PCA on, " + f"but the column is not available for modality {par['modality']}" + ) use_highly_variable = True - adata_input_layer.var['highly_variable'] = data.var[par["var_input"]] + adata_input_layer.var["highly_variable"] = data.var[par["var_input"]] # run pca output_adata = sc.tl.pca( adata_input_layer, n_comps=par["num_components"], copy=True, - use_highly_variable=use_highly_variable + use_highly_variable=use_highly_variable, ) # store output in specific objects @@ -56,22 +59,26 @@ 
check_exist_dict = { "obsm_output": ("obs"), "varm_output": ("varm"), - "uns_output": ("uns") + "uns_output": ("uns"), } for parameter_name, field in check_exist_dict.items(): if par[parameter_name] in getattr(data, field): if not par["overwrite"]: - raise ValueError(f"Requested to create field {par[parameter_name]} in .{field} " - f"for modality {par['modality']}, but field already exists.") + raise ValueError( + f"Requested to create field {par[parameter_name]} in .{field} " + f"for modality {par['modality']}, but field already exists." + ) del getattr(data, field)[par[parameter_name]] -data.obsm[par["obsm_output"]] = output_adata.obsm['X_pca'] -data.varm[par["varm_output"]] = output_adata.varm['PCs'] -data.uns[par["uns_output"]] = { "variance": output_adata.uns['pca']['variance'], - "variance_ratio": output_adata.uns['pca']['variance_ratio'] } +data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"] +data.varm[par["varm_output"]] = output_adata.varm["PCs"] +data.uns[par["uns_output"]] = { + "variance": output_adata.uns["pca"]["variance"], + "variance_ratio": output_adata.uns["pca"]["variance_ratio"], +} logger.info("Writing to %s.", par["output"]) mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/dimred/pca/test.py b/src/dimred/pca/test.py index eff9f06f67e..a6989c992b8 100644 --- a/src/dimred/pca/test.py +++ b/src/dimred/pca/test.py @@ -6,26 +6,33 @@ ## VIASH START meta = { - 'name': 'foo', - 'resources_dir': 'resources_test/', - 'executable': './target/executable/dimred/pca/pca', - 'config': './src/dimred/pca/config.vsh.yaml' + "name": "foo", + "resources_dir": "resources_test/", + "executable": "./target/executable/dimred/pca/pca", + "config": "./src/dimred/pca/config.vsh.yaml", } ## VIASH END input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_pca(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", input_path, - "--output", str(output_path), - "--obsm_output", "X_foo", - "--num_components", "26", - "--overwrite" - ]) + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--overwrite", + ] + ) assert output_path.is_file() data = mu.read_h5mu(output_path) @@ -33,32 +40,42 @@ def test_pca(run_component, tmp_path): assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) assert "highly_variable" not in data.mod["rna"].var.columns assert "filter_with_hvg" in data.mod["rna"].var.columns - assert "pca_variance" in data.mod['rna'].uns - assert "pca_loadings" in data.mod['rna'].varm - assert "X_foo" in data.mod['rna'].obsm + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_foo" in data.mod["rna"].obsm # GH298 - assert not np.array_equal(data.mod['rna'].uns['pca_variance']['variance'], - data.mod['rna'].uns['pca_variance']['variance_ratio']) + assert not np.array_equal( + data.mod["rna"].uns["pca_variance"]["variance"], + data.mod["rna"].uns["pca_variance"]["variance_ratio"], + ) + def test_no_overwrite_but_field_also_not_present(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # create input data input_data = mu.read_h5mu(input_path) - input_data.mod['rna'].uns.pop('pca_variance') - input_data.mod['rna'].varm.pop('pca_loadings') - input_data.mod['rna'].obsm.pop('X_pca') + 
input_data.mod["rna"].uns.pop("pca_variance") + input_data.mod["rna"].varm.pop("pca_loadings") + input_data.mod["rna"].obsm.pop("X_pca") tmp_file = tmp_path / "input_data_adjusted.h5mu" input_data.write(tmp_file) # run component - run_component([ - "--input", str(tmp_file), - "--output", str(output_path), - "--obsm_output", "X_foo", - "--num_components", "26", - "--output_compression", "gzip" - ]) + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() data = mu.read_h5mu(output_path) @@ -66,8 +83,8 @@ def test_no_overwrite_but_field_also_not_present(run_component, tmp_path): assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) assert "highly_variable" not in data.mod["rna"].var.columns assert "filter_with_hvg" in data.mod["rna"].var.columns - assert "pca_variance" in data.mod['rna'].uns - assert "pca_loadings" in data.mod['rna'].varm + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm assert "X_foo" in data.mod["rna"].obsm @@ -76,19 +93,26 @@ def test_selecting_input_layer(run_component, tmp_path): # generate input data input_data = mu.read_h5mu(input_path) - input_data.mod['rna'].layers['test'] = input_data.mod['rna'].X + input_data.mod["rna"].layers["test"] = input_data.mod["rna"].X tmp_file = tmp_path / "input_data_adjusted.h5mu" input_data.write_h5mu(tmp_file) # run component - run_component([ - "--input", str(tmp_file), - "--output", str(output_path), - "--obsm_output", "test_foo", - "--num_components", "26", - "--layer", "test", - "--overwrite" - ]) + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "test_foo", + "--num_components", + "26", + "--layer", + "test", + "--overwrite", + ] + ) assert output_path.is_file() # check whether pca was found @@ -97,42 +121,59 @@ def test_selecting_input_layer(run_component, tmp_path): assert data.mod["rna"].obsm["test_foo"].shape == (data.n_obs, 26) assert "highly_variable" not in data.mod["rna"].var.columns assert "filter_with_hvg" in data.mod["rna"].var.columns - assert "pca_variance" in data.mod['rna'].uns - assert "pca_loadings" in data.mod['rna'].varm - assert "X_pca" in data.mod['rna'].obsm + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_pca" in data.mod["rna"].obsm + def test_highly_variable_column_does_not_exist_raises(run_component): with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_path, - "--output", "output.h5mu", - "--obsm_output", "X_foo", - "--num_components", "26", - "--var_input", "does_not_exist" - ]) - assert "ValueError: Requested to use .var column does_not_exist as " \ - "a selection of genes to run the PCA on, but the column is " \ - "not available for modality rna" in \ - err.value.stdout.decode('utf-8') + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--var_input", + "does_not_exist", + ] + ) + assert ( + "ValueError: Requested to use .var column does_not_exist as " + "a selection of genes to run the PCA on, but the column is " + "not available for modality rna" in err.value.stdout.decode("utf-8") + ) + def test_select_highly_variable_column(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # create input data input_data = mu.read_h5mu(input_path) 
- input_data.mod['rna'].var["filter_with_hvg"] = True + input_data.mod["rna"].var["filter_with_hvg"] = True tmp_file = tmp_path / "input_data_adjusted.h5mu" input_data.write_h5mu(tmp_file) # run component - run_component([ - "--input", str(tmp_file), - "--output", str(output_path), - "--obsm_output", "X_foo", - "--num_components", "26", - "--var_input", "filter_with_hvg", - "--overwrite" - ]) + run_component( + [ + "--input", + str(tmp_file), + "--output", + str(output_path), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--var_input", + "filter_with_hvg", + "--overwrite", + ] + ) assert output_path.is_file() # check whether pca was found @@ -140,34 +181,54 @@ def test_select_highly_variable_column(run_component, tmp_path): assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26) assert "highly_variable" not in data.mod["rna"].var.columns assert "filter_with_hvg" in data.mod["rna"].var.columns - assert "pca_variance" in data.mod['rna'].uns - assert "pca_loadings" in data.mod['rna'].varm - assert "X_pca" in data.mod['rna'].obsm + assert "pca_variance" in data.mod["rna"].uns + assert "pca_loadings" in data.mod["rna"].varm + assert "X_pca" in data.mod["rna"].obsm + def test_raise_if_input_layer_is_missing(run_component): with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_path, - "--output", "output.h5mu", - "--obsm_output", "X_foo", - "--num_components", "26", - "--layer", "does_not_exist", - "--var_input", "filter_with_hvg" - ]) - assert "ValueError: does_not_exist was not found in modality rna." in \ - err.value.stdout.decode('utf-8') + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--layer", + "does_not_exist", + "--var_input", + "filter_with_hvg", + ] + ) + assert ( + "ValueError: does_not_exist was not found in modality rna." + in err.value.stdout.decode("utf-8") + ) + def test_output_field_already_present_raises(run_component): with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_path, - "--output", "output.h5mu", - "--obsm_output", "X_foo", - "--num_components", "26" - ]) - assert "ValueError: Requested to create field pca_loadings in .varm for " \ - "modality rna, but field already exists." in \ - err.value.stdout.decode('utf-8') - -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file + run_component( + [ + "--input", + input_path, + "--output", + "output.h5mu", + "--obsm_output", + "X_foo", + "--num_components", + "26", + ] + ) + assert ( + "ValueError: Requested to create field pca_loadings in .varm for " + "modality rna, but field already exists." 
in err.value.stdout.decode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/tsne/script.py b/src/dimred/tsne/script.py index 637e59e23ba..28c3f87cda7 100644 --- a/src/dimred/tsne/script.py +++ b/src/dimred/tsne/script.py @@ -5,40 +5,40 @@ ## VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', - 'modality': 'rna', - 'use_rep': 'X_pca', - 'output': 'output.h5mu', - 'output_compression': 'gzip', - 'obsm_output': 'X_tsne', - 'n_pcs': 50, - 'perplexity': 30, - 'min_dist': 0.5, - 'metric': 'euclidean', - 'early_exaggeration': 12, - 'learning_rate': 1000, - 'random_state': 0, + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "use_rep": "X_pca", + "output": "output.h5mu", + "output_compression": "gzip", + "obsm_output": "X_tsne", + "n_pcs": 50, + "perplexity": 30, + "min_dist": 0.5, + "metric": "euclidean", + "early_exaggeration": 12, + "learning_rate": 1000, + "random_state": 0, } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s", par["input"]) mdata = mu.read_h5mu(par["input"]) -logger.info("Computing tSNE for modality '%s'", par['modality']) -data = mdata.mod[par['modality']] +logger.info("Computing tSNE for modality '%s'", par["modality"]) +data = mdata.mod[par["modality"]] -if par['use_rep'] not in data.obsm.keys(): - raise ValueError(f"'{par['use_rep']}' was not found in .mod['{par['modality']}'].obsm. No precomputed PCA provided. Please run PCA first.") +if par["use_rep"] not in data.obsm.keys(): + raise ValueError( + f"'{par['use_rep']}' was not found in .mod['{par['modality']}'].obsm. No precomputed PCA provided. Please run PCA first." 
+ ) temp_obsm = {par["use_rep"]: data.obsm[par["use_rep"]]} -temp_adata = ad.AnnData( - obsm=temp_obsm, - shape=data.shape -) +temp_adata = ad.AnnData(obsm=temp_obsm, shape=data.shape) sc.tl.tsne( adata=temp_adata, @@ -49,16 +49,18 @@ early_exaggeration=par["early_exaggeration"], learning_rate=par["learning_rate"], random_state=par["random_state"], - n_jobs=meta["cpus"] + n_jobs=meta["cpus"], ) -logger.info(f"Writing tSNE embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]") -data.obsm[par['obsm_output']] = temp_adata.obsm['X_tsne'] +logger.info( + f"Writing tSNE embeddings to .mod[{par['modality']}].obsm[{par['obsm_output']}]" +) +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_tsne"] logger.info(f"Writing tSNE metadata to .mod[{par['modality']}].uns['tsne']") -data.uns['tsne'] = temp_adata.uns['tsne'] +data.uns["tsne"] = temp_adata.uns["tsne"] logger.info("Writing to %s.", par["output"]) mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/dimred/tsne/test.py b/src/dimred/tsne/test.py index 20e6d250790..ac4b1c11240 100644 --- a/src/dimred/tsne/test.py +++ b/src/dimred/tsne/test.py @@ -7,101 +7,142 @@ ## VIASH START meta = { - 'executable': './target/executable/dimred/tsne/tsne', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/', - 'config': './src/dimred/tsne/config.vsh.yaml' + "executable": "./target/executable/dimred/tsne/tsne", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "./src/dimred/tsne/config.vsh.yaml", } ## VIASH END input_path = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture def mudata_no_obsm_pca(write_mudata_to_file): input_mudata = read_h5mu(input_path) input_mudata.mod["rna"].obsm.pop("X_pca") return write_mudata_to_file(input_mudata) + def test_tsne(run_component, random_h5mu_path): output_path = random_h5mu_path() args = [ - "--input", input_path, - "--output", output_path, - "--modality", "rna", - "--use_rep", "X_pca", - "--output_compression", "gzip" + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", ] run_component(args) - + assert output_path.is_file(), "No output was created." 
output_mudata = read_h5mu(output_path) input_mudata = read_h5mu(input_path) - + # check whether tsne was found and remove for comparison - assert "X_tsne" in output_mudata.mod["rna"].obsm, "Check whether output was found in .obsm" - assert "tsne" in output_mudata.mod["rna"].uns, "Check whether output was found in .uns" + assert ( + "X_tsne" in output_mudata.mod["rna"].obsm + ), "Check whether output was found in .obsm" + assert ( + "tsne" in output_mudata.mod["rna"].uns + ), "Check whether output was found in .uns" output_mudata.mod["rna"].obsm.pop("X_tsne") output_mudata.mod["rna"].uns.pop("tsne") assert_annotation_objects_equal(output_mudata, input_mudata) - + + def test_tsne_custom_rep_obsm_output(run_component, random_h5mu_path): input_mudata_custom = read_h5mu(input_path) - input_mudata_custom.mod["rna"].obsm["X_custom_pca"] = input_mudata_custom.mod["rna"].obsm["X_pca"] + input_mudata_custom.mod["rna"].obsm["X_custom_pca"] = input_mudata_custom.mod[ + "rna" + ].obsm["X_pca"] input_mudata_custom_path = random_h5mu_path() input_mudata_custom.write_h5mu(input_mudata_custom_path) - + output_path = random_h5mu_path() args = [ - "--input", input_mudata_custom_path, - "--output", output_path, - "--modality", "rna", - "--use_rep", "X_custom_pca", - "--obsm_output", "X_custom_tsne", - "--output_compression", "gzip" + "--input", + input_mudata_custom_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_custom_pca", + "--obsm_output", + "X_custom_tsne", + "--output_compression", + "gzip", ] run_component(args) - + assert output_path.is_file(), "No output was created." - output_mudata = read_h5mu(output_path) + output_mudata = read_h5mu(output_path) # check whether tsne was found and remove for comparison - assert "X_custom_tsne" in output_mudata.mod["rna"].obsm, "Check whether output was found in .obsm" - assert "tsne" in output_mudata.mod["rna"].uns, "Check whether output was found in .uns" + assert ( + "X_custom_tsne" in output_mudata.mod["rna"].obsm + ), "Check whether output was found in .obsm" + assert ( + "tsne" in output_mudata.mod["rna"].uns + ), "Check whether output was found in .uns" output_mudata.mod["rna"].obsm.pop("X_custom_tsne") output_mudata.mod["rna"].uns.pop("tsne") assert_annotation_objects_equal(output_mudata, input_mudata_custom) - -def test_tsne_no_pca_in_input_raise(run_component, random_h5mu_path, mudata_no_obsm_pca): + +def test_tsne_no_pca_in_input_raise( + run_component, random_h5mu_path, mudata_no_obsm_pca +): output_path = random_h5mu_path() args = [ - "--input", mudata_no_obsm_pca, - "--output", output_path, - "--modality", "rna", - "--use_rep", "X_pca", - "--output_compression", "gzip" + "--input", + mudata_no_obsm_pca, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) - assert re.search(r"ValueError: 'X_pca' was not found in \.mod\['rna'\]\.obsm\. No precomputed PCA provided\. Please run PCA first\.", - err.value.stdout.decode('utf-8')) - - + assert re.search( + r"ValueError: 'X_pca' was not found in \.mod\['rna'\]\.obsm\. No precomputed PCA provided\. 
Please run PCA first\.", + err.value.stdout.decode("utf-8"), + ) + + def test_tsne_too_many_pcs_raise(run_component, random_h5mu_path): output_path = random_h5mu_path() args = [ - "--input", input_path, - "--output", output_path, - "--modality", "rna", - "--use_rep", "X_pca", - "--output_compression", "gzip", - "--n_pcs", "100" + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--use_rep", + "X_pca", + "--output_compression", + "gzip", + "--n_pcs", + "100", ] - + with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) - assert re.search(r"ValueError: X_pca does not have enough Dimensions\. Provide a Representation with equal or more dimensions than`n_pcs` or lower `n_pcs`", - err.value.stdout.decode('utf-8')) - + assert re.search( + r"ValueError: X_pca does not have enough Dimensions\. Provide a Representation with equal or more dimensions than`n_pcs` or lower `n_pcs`", + err.value.stdout.decode("utf-8"), + ) + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/dimred/umap/script.py b/src/dimred/umap/script.py index 897cf30963d..4fdc5f69b4d 100644 --- a/src/dimred/umap/script.py +++ b/src/dimred/umap/script.py @@ -5,58 +5,54 @@ ## VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', - 'modality': 'rna', - 'output': 'output.h5mu', - 'obsm_output': 'X_umap', - 'min_dist': 0.5, - 'spread': 1.0, - 'num_components': 2, - 'max_iter': None, - 'alpha': 1.0, - 'gamma': 1.0, - 'negative_sample_rate': 5, - 'init_pos': 'spectral', - 'uns_neighbors': 'neighbors' + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "output": "output.h5mu", + "obsm_output": "X_umap", + "min_dist": 0.5, + "spread": 1.0, + "num_components": 2, + "max_iter": None, + "alpha": 1.0, + "gamma": 1.0, + "negative_sample_rate": 5, + "init_pos": "spectral", + "uns_neighbors": "neighbors", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading %s", par["input"]) mdata = mu.read_h5mu(par["input"]) -logger.info("Computing UMAP for modality '%s'", par['modality']) -data = mdata.mod[par['modality']] +logger.info("Computing UMAP for modality '%s'", par["modality"]) +data = mdata.mod[par["modality"]] -if par['uns_neighbors'] not in data.uns: - raise ValueError(f"'{par['uns_neighbors']}' was not found in .mod['{par['modality']}'].uns.") +if par["uns_neighbors"] not in data.uns: + raise ValueError( + f"'{par['uns_neighbors']}' was not found in .mod['{par['modality']}'].uns." + ) # create temporary AnnData # ... because sc.tl.umap doesn't allow to choose # the obsm output slot # ... 
also we can see scanpy is a data format dependency hell neigh_key = par["uns_neighbors"] -temp_uns = { neigh_key: data.uns[neigh_key] } -conn_key = temp_uns[neigh_key]['connectivities_key'] -dist_key = temp_uns[neigh_key]['distances_key'] +temp_uns = {neigh_key: data.uns[neigh_key]} +conn_key = temp_uns[neigh_key]["connectivities_key"] +dist_key = temp_uns[neigh_key]["distances_key"] temp_obsp = { - conn_key: data.obsp[conn_key], - dist_key: data.obsp[dist_key], -} -pca_key = temp_uns[neigh_key]['params']['use_rep'] -temp_obsm = { - pca_key: data.obsm[pca_key] + conn_key: data.obsp[conn_key], + dist_key: data.obsp[dist_key], } +pca_key = temp_uns[neigh_key]["params"]["use_rep"] +temp_obsm = {pca_key: data.obsm[pca_key]} -temp_adata = ad.AnnData( - obsm=temp_obsm, - obsp=temp_obsp, - uns=temp_uns, - shape=data.shape -) +temp_adata = ad.AnnData(obsm=temp_obsm, obsp=temp_obsp, uns=temp_uns, shape=data.shape) sc.tl.umap( temp_adata, @@ -68,12 +64,12 @@ gamma=par["gamma"], negative_sample_rate=par["negative_sample_rate"], init_pos=par["init_pos"], - neighbors_key=neigh_key + neighbors_key=neigh_key, ) -data.obsm[par['obsm_output']] = temp_adata.obsm['X_umap'] +data.obsm[par["obsm_output"]] = temp_adata.obsm["X_umap"] logger.info("Writing to %s.", par["output"]) mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/dimred/umap/test.py b/src/dimred/umap/test.py index 3cdccfda488..ed83c305209 100644 --- a/src/dimred/umap/test.py +++ b/src/dimred/umap/test.py @@ -4,44 +4,63 @@ import mudata as mu ## VIASH START -meta = { - 'name': 'foo', - 'resources_dir': '/resources_test/pbmc_1k_protein_v3/' -} +meta = {"name": "foo", "resources_dir": "/resources_test/pbmc_1k_protein_v3/"} ## VIASH END input = meta["resources_dir"] + "/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_umap(run_component, tmp_path): output = tmp_path / "output.h5mu" - run_component([ - "--input", input, - "--output", str(output), - "--obsm_output", "X_foo", - "--num_components", "26", - "--output_compression", "gzip" - ]) - + run_component( + [ + "--input", + input, + "--output", + str(output), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--output_compression", + "gzip", + ] + ) + assert output.is_file(), "No output was created." data = mu.read_h5mu(output) # check whether umap was found assert "X_foo" in data.mod["rna"].obsm, "Check whether output was found in .obsm" - assert data.mod["rna"].obsm["X_foo"].shape == (data.n_obs, 26), "Check whether output has correct shape" + assert data.mod["rna"].obsm["X_foo"].shape == ( + data.n_obs, + 26, + ), "Check whether output has correct shape" + def test_raise_if_uns_neighbor_is_missing(run_component, tmp_path): output = tmp_path / "output.h5mu" with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input, - "--output", str(output), - "--obsm_output", "X_foo", - "--num_components", "26", - "--uns_neighbors", "does_not_exist" - ]) + run_component( + [ + "--input", + input, + "--output", + str(output), + "--obsm_output", + "X_foo", + "--num_components", + "26", + "--uns_neighbors", + "does_not_exist", + ] + ) assert not output.is_file(), "No output should be created." - assert "ValueError: 'does_not_exist' was not found in .mod['rna'].uns." in \ - err.value.stdout.decode('utf-8') + assert ( + "ValueError: 'does_not_exist' was not found in .mod['rna'].uns." 
+ in err.value.stdout.decode("utf-8") + ) + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/highly_variable_features_scanpy/script.py b/src/feature_annotation/highly_variable_features_scanpy/script.py index ac6b97503d6..01dec6d9c7a 100644 --- a/src/feature_annotation/highly_variable_features_scanpy/script.py +++ b/src/feature_annotation/highly_variable_features_scanpy/script.py @@ -7,110 +7,115 @@ ## VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu', - 'modality': 'rna', - 'output': 'output.h5mu', - 'var_name_filter': 'filter_with_hvg', - 'do_subset': False, - 'flavor': 'seurat', - 'n_top_features': None, - 'min_mean': 0.0125, - 'max_mean': 3.0, - 'min_disp': 0.5, - 'span': 0.3, - 'n_bins': 20, - 'varm_name': 'hvg', - 'obs_batch_key': "batch", - 'layer': 'log_transformed' + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "var_name_filter": "filter_with_hvg", + "do_subset": False, + "flavor": "seurat", + "n_top_features": None, + "min_mean": 0.0125, + "max_mean": 3.0, + "min_disp": 0.5, + "span": 0.3, + "n_bins": 20, + "varm_name": "hvg", + "obs_batch_key": "batch", + "layer": "log_transformed", } -meta = { - "resources_dir": "." -} +meta = {"resources_dir": "."} -mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu') +mu_in = mu.read_h5mu( + "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) rna_in = mu_in.mod["rna"] assert "filter_with_hvg" not in rna_in.var.columns log_transformed = sc.pp.log1p(rna_in, copy=True) -rna_in.layers['log_transformed'] = log_transformed.X -rna_in.uns['log1p'] = log_transformed.uns['log1p'] +rna_in.layers["log_transformed"] = log_transformed.X +rna_in.uns["log1p"] = log_transformed.uns["log1p"] temp_h5mu = "lognormed.h5mu" -rna_in.obs['batch'] = 'A' -column_index = rna_in.obs.columns.get_indexer(['batch']) -rna_in.obs.iloc[slice(rna_in.n_obs//2, None), column_index] = 'B' +rna_in.obs["batch"] = "A" +column_index = rna_in.obs.columns.get_indexer(["batch"]) +rna_in.obs.iloc[slice(rna_in.n_obs // 2, None), column_index] = "B" mu_in.write_h5mu(temp_h5mu) -par['input'] = temp_h5mu +par["input"] = temp_h5mu ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() mdata = mu.read_h5mu(par["input"]) mdata.var_names_make_unique() -mod = par['modality'] -logger.info(f"Processing modality '%s'", mod) +mod = par["modality"] +logger.info("Processing modality '%s'", mod) data = mdata.mod[mod] -if par["layer"] and not par['layer'] in data.layers: - raise ValueError(f"Layer '{par['layer']}' not found in layers for modality '{mod}'. " - f"Found layers are: {','.join(data.layers)}") +if par["layer"] and par["layer"] not in data.layers: + raise ValueError( + f"Layer '{par['layer']}' not found in layers for modality '{mod}'. " + f"Found layers are: {','.join(data.layers)}" + ) -# input layer argument does not work when batch_key is specified because +# input layer argument does not work when batch_key is specified because # it still uses .X to filter out genes with 0 counts, even if .X might not exist. 
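# (this interaction is tracked upstream in https://github.com/scverse/scanpy/issues/2396)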
# So create a custom anndata as input that always uses .X input_layer = data.X if not par["layer"] else data.layers[par["layer"]] obs = pd.DataFrame(index=data.obs_names.copy()) var = pd.DataFrame(index=data.var_names.copy()) if par["obs_batch_key"]: - obs = data.obs.loc[:,par["obs_batch_key"]].to_frame() + obs = data.obs.loc[:, par["obs_batch_key"]].to_frame() input_anndata = ad.AnnData(X=input_layer.copy(), obs=obs, var=var) -if 'log1p' in data.uns: - input_anndata.uns['log1p'] = data.uns['log1p'] +if "log1p" in data.uns: + input_anndata.uns["log1p"] = data.uns["log1p"] # Workaround for issue # https://github.com/scverse/scanpy/issues/2239 # https://github.com/scverse/scanpy/issues/2181 -if par['flavor'] != "seurat_v3": +if par["flavor"] != "seurat_v3": # This component requires log normalized data when flavor is not seurat_v3 # We assume that the data is correctly normalized but scanpy will look at # .uns to check the transformations performed on the data. # To prevent scanpy from automatically transforming the counts when they are # already transformed, we set the appropriate values to .uns. - if 'log1p' not in input_anndata.uns: - logger.warning("When flavor is not set to 'seurat_v3', " - "the input data for this component must be log-transformed. " - "However, the 'log1p' dictionary in .uns has not been set. " - "This is fine if you did not log transform your data with scanpy. " - "Otherwise, please check if you are providing log transformed " - "data using --layer.") - input_anndata.uns['log1p'] = {'base': None} - elif 'log1p' in input_anndata.uns and 'base' not in input_anndata.uns['log1p']: - input_anndata.uns['log1p']['base'] = None + if "log1p" not in input_anndata.uns: + logger.warning( + "When flavor is not set to 'seurat_v3', " + "the input data for this component must be log-transformed. " + "However, the 'log1p' dictionary in .uns has not been set. " + "This is fine if you did not log transform your data with scanpy. " + "Otherwise, please check if you are providing log transformed " + "data using --layer."
+ ) + input_anndata.uns["log1p"] = {"base": None} + elif "log1p" in input_anndata.uns and "base" not in input_anndata.uns["log1p"]: + input_anndata.uns["log1p"]["base"] = None logger.info("\tUnfiltered data: %s", data) logger.info("\tComputing hvg") # construct arguments hvg_args = { - 'adata': input_anndata, - 'n_top_genes': par["n_top_features"], - 'min_mean': par["min_mean"], - 'max_mean': par["max_mean"], - 'min_disp': par["min_disp"], - 'span': par["span"], - 'n_bins': par["n_bins"], - 'flavor': par["flavor"], - 'subset': False, - 'inplace': False, - 'layer': None, # Always uses .X because the input layer was already handled + "adata": input_anndata, + "n_top_genes": par["n_top_features"], + "min_mean": par["min_mean"], + "max_mean": par["max_mean"], + "min_disp": par["min_disp"], + "span": par["span"], + "n_bins": par["n_bins"], + "flavor": par["flavor"], + "subset": False, + "inplace": False, + "layer": None, # Always uses .X because the input layer was already handled } optional_parameters = { "max_disp": "max_disp", "obs_batch_key": "batch_key", - "n_top_genes": "n_top_features" + "n_top_genes": "n_top_features", } # only add parameter if it's passed for par_name, dest_name in optional_parameters.items(): @@ -118,23 +123,31 @@ hvg_args[dest_name] = par[par_name] # scanpy does not do this check, although it is stated in the documentation -if par['flavor'] == "seurat_v3" and not par['n_top_features']: - raise ValueError("When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.") +if par["flavor"] == "seurat_v3" and not par["n_top_features"]: + raise ValueError( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'." + ) # call function try: out = sc.pp.highly_variable_genes(**hvg_args) - if par['obs_batch_key'] is not None: + if par["obs_batch_key"] is not None: out = out.reindex(index=data.var.index, method=None) - assert (out.index == data.var.index).all(), "Expected output index values to be equivalent to the input index" + assert ( + out.index == data.var.index + ).all(), "Expected output index values to be equivalent to the input index" except ValueError as err: if str(err) == "cannot specify integer `bins` when input data contains infinity": - err.args = ("Cannot specify integer `bins` when input data contains infinity. " - "Perhaps input data has not been log normalized?",) + err.args = ( + "Cannot specify integer `bins` when input data contains infinity. " + "Perhaps input data has not been log normalized?", + ) if re.search("Bin edges must be unique:", str(err)): - raise RuntimeError("Scanpy failed to calculate hvg. The error " - "returned by scanpy (see above) could be the " - "result from trying to use this component on unfiltered data.") from err + raise RuntimeError( + "Scanpy failed to calculate hvg. The error " + "returned by scanpy (see above) could be the " + "result from trying to use this component on unfiltered data." 
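+            # (e.g. the "Bin edges must be unique" error matched above, which the
+            # cell_ranger flavor can run into on unfiltered input)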
+ ) from err raise err out.index = data.var.index @@ -142,7 +155,7 @@ if par.get("var_name_filter", None) is not None: data.var[par["var_name_filter"]] = out["highly_variable"] -if par.get("varm_name", None) is not None and 'mean_bin' in out: +if par.get("varm_name", None) is not None and "mean_bin" in out: # drop mean_bin as mudata/anndata doesn't support tuples data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) diff --git a/src/feature_annotation/highly_variable_features_scanpy/test.py b/src/feature_annotation/highly_variable_features_scanpy/test.py index 4f1f111e78c..0a8b5710475 100644 --- a/src/feature_annotation/highly_variable_features_scanpy/test.py +++ b/src/feature_annotation/highly_variable_features_scanpy/test.py @@ -10,50 +10,57 @@ ## VIASH START meta = { - 'resources_dir': 'resources_test/', - 'config': './src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml', - 'executable': './target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy' + "resources_dir": "resources_test/", + "config": "./src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml", + "executable": "./target/executable/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + @pytest.fixture def input_path(): return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + @pytest.fixture def input_data(input_path): mu_in = mu.read_h5mu(input_path) return mu_in + @pytest.fixture def lognormed_test_data(input_data): rna_in = input_data.mod["rna"] assert "filter_with_hvg" not in rna_in.var.columns log_transformed = sc.pp.log1p(rna_in, copy=True) - rna_in.layers['log_transformed'] = log_transformed.X - rna_in.uns['log1p'] = log_transformed.uns['log1p'] + rna_in.layers["log_transformed"] = log_transformed.X + rna_in.uns["log1p"] = log_transformed.uns["log1p"] return input_data + @pytest.fixture def lognormed_test_data_path(tmp_path, lognormed_test_data): temp_h5mu = tmp_path / "lognormed.h5mu" lognormed_test_data.write_h5mu(temp_h5mu) return temp_h5mu + @pytest.fixture def lognormed_batch_test_data_path(tmp_path, lognormed_test_data): temp_h5mu = tmp_path / "lognormed_batch.h5mu" - rna_mod = lognormed_test_data.mod['rna'] - rna_mod.obs['batch'] = 'A' - column_index = rna_mod.obs.columns.get_indexer(['batch']) - rna_mod.obs.iloc[slice(rna_mod.n_obs//2, None), column_index] = 'B' + rna_mod = lognormed_test_data.mod["rna"] + rna_mod.obs["batch"] = "A" + column_index = rna_mod.obs.columns.get_indexer(["batch"]) + rna_mod.obs.iloc[slice(rna_mod.n_obs // 2, None), column_index] = "B" lognormed_test_data.write_h5mu(temp_h5mu) return temp_h5mu + @pytest.fixture() def filter_data_path(tmp_path, input_data): temp_h5mu = tmp_path / "filtered.h5mu" @@ -64,82 +71,140 @@ def filter_data_path(tmp_path, input_data): def test_filter_with_hvg(run_component, lognormed_test_data_path): - out = run_component([ - "--flavor", "seurat", - "--input", lognormed_test_data_path, - "--output", "output.h5mu", - "--layer", "log_transformed", - "--output_compression", "gzip"]) + run_component( + [ + "--flavor", + "seurat", + "--input", + lognormed_test_data_path, + "--output", + "output.h5mu", + "--layer", + "log_transformed", + "--output_compression", + "gzip", + ] + ) assert os.path.exists("output.h5mu") data = mu.read_h5mu("output.h5mu") assert "filter_with_hvg" in 
data.mod["rna"].var.columns -def test_filter_with_hvg_batch_with_batch(run_component, lognormed_batch_test_data_path): + +def test_filter_with_hvg_batch_with_batch( + run_component, lognormed_batch_test_data_path +): """ Make sure that selecting a layer works together with obs_batch_key. https://github.com/scverse/scanpy/issues/2396 """ - run_component([ - "--flavor", "seurat", - "--input", lognormed_batch_test_data_path, - "--output", "output.h5mu", - "--obs_batch_key", "batch", - "--layer", "log_transformed"]) + run_component( + [ + "--flavor", + "seurat", + "--input", + lognormed_batch_test_data_path, + "--output", + "output.h5mu", + "--obs_batch_key", + "batch", + "--layer", + "log_transformed", + ] + ) assert os.path.exists("output.h5mu") output_data = mu.read_h5mu("output.h5mu") assert "filter_with_hvg" in output_data.mod["rna"].var.columns # Check the contents of the output to check if the correct layer was selected input_mudata = mu.read_h5mu(lognormed_batch_test_data_path) - input_data = input_mudata.mod['rna'].copy() - input_data.X = input_data.layers['log_transformed'].copy() - del input_data.layers['log_transformed'] - input_data.uns['log1p']['base'] = None - expected_output = sc.pp.highly_variable_genes(input_data, batch_key="batch", inplace=False, subset=False) - expected_output = expected_output.reindex(index=input_mudata.mod['rna'].var.index) - pd.testing.assert_series_equal(expected_output['highly_variable'], - output_data.mod['rna'].var['filter_with_hvg'], - check_names=False) + input_data = input_mudata.mod["rna"].copy() + input_data.X = input_data.layers["log_transformed"].copy() + del input_data.layers["log_transformed"] + input_data.uns["log1p"]["base"] = None + expected_output = sc.pp.highly_variable_genes( + input_data, batch_key="batch", inplace=False, subset=False + ) + expected_output = expected_output.reindex(index=input_mudata.mod["rna"].var.index) + pd.testing.assert_series_equal( + expected_output["highly_variable"], + output_data.mod["rna"].var["filter_with_hvg"], + check_names=False, + ) + def test_filter_with_hvg_seurat_v3_requires_n_top_features(run_component, input_path): with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_path, - "--flavor", "seurat_v3", # Uses raw data. - "--output", "output.h5mu"]) - assert re.search(f"When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.", - err.value.stdout.decode('utf-8')) + run_component( + [ + "--input", + input_path, + "--flavor", + "seurat_v3", # Uses raw data. + "--output", + "output.h5mu", + ] + ) + assert re.search( + "When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.", + err.value.stdout.decode("utf-8"), + ) + def test_filter_with_hvg_seurat_v3(run_component, input_path): - run_component([ - "--input", input_path, - "--flavor", "seurat_v3", # Uses raw data. - "--output", "output.h5mu", - "--n_top_features", "50"]) + run_component( + [ + "--input", + input_path, + "--flavor", + "seurat_v3", # Uses raw data. + "--output", + "output.h5mu", + "--n_top_features", + "50", + ] + ) assert os.path.exists("output.h5mu") data = mu.read_h5mu("output.h5mu") assert "filter_with_hvg" in data.mod["rna"].var.columns + def test_filter_with_hvg_cell_ranger(run_component, filter_data_path): - run_component([ - "--input", filter_data_path, - "--flavor", "cell_ranger", # Must use filtered data. - "--output", "output.h5mu"]) + run_component( + [ + "--input", + filter_data_path, + "--flavor", + "cell_ranger", # Must use filtered data. 
+ "--output", + "output.h5mu", + ] + ) assert os.path.exists("output.h5mu") data = mu.read_h5mu("output.h5mu") assert "filter_with_hvg" in data.mod["rna"].var.columns -def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_component, input_path): - with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_path, - "--flavor", "cell_ranger", # Must use filtered data, but in this test we use unfiltered data - "--output", "output.h5mu"]) - assert re.search(r"Scanpy failed to calculate hvg. The error " - r"returned by scanpy \(see above\) could be the " - r"result from trying to use this component on unfiltered data.", - err.value.stdout.decode('utf-8')) - -if __name__ == '__main__': +def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message( + run_component, input_path +): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component( + [ + "--input", + input_path, + "--flavor", + "cell_ranger", # Must use filtered data, but in this test we use unfiltered data + "--output", + "output.h5mu", + ] + ) + assert re.search( + r"Scanpy failed to calculate hvg. The error " + r"returned by scanpy \(see above\) could be the " + r"result from trying to use this component on unfiltered data.", + err.value.stdout.decode("utf-8"), + ) + + +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py b/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py index b1b87664e71..91595d8354b 100644 --- a/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py +++ b/src/feature_annotation/score_genes_cell_cycle_scanpy/script.py @@ -23,11 +23,9 @@ "n_bins": 25, "random_state": 0, "output_compression": "gzip", - "allow_missing_genes": False -} -meta = { - "resources_dir": "src/feature_annotation/score_genes_scanpy" + "allow_missing_genes": False, } +meta = {"resources_dir": "src/feature_annotation/score_genes_scanpy"} ## VIASH END # import helper functions @@ -38,7 +36,11 @@ mdata = mu.read(par["input"]) input_adata = mdata.mod[par["modality"]] -gene_names_index = input_adata.var[par["var_gene_names"]] if par["var_gene_names"] else input_adata.var_names +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) gene_names = pd.Series(input_adata.var_names, index=gene_names_index) # check if var index is unique @@ -49,7 +51,9 @@ # read gene lists s_genes = read_gene_list(par, gene_names.index, "s_genes", "s_genes_file") g2m_genes = read_gene_list(par, gene_names.index, "g2m_genes", "g2m_genes_file") -gene_pool = read_gene_list(par, gene_names.index, "gene_pool", "gene_pool_file", required=False) +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) # find matching index names for given genes g2m_index = gene_names.loc[g2m_genes].tolist() @@ -64,7 +68,7 @@ adata_scanpy = ad.AnnData( X=X_data, obs=pd.DataFrame(index=input_adata.obs.index), - var=pd.DataFrame(index=input_adata.var.index) + var=pd.DataFrame(index=input_adata.var.index), ) # run score_genes_cell_cycle @@ -74,16 +78,18 @@ g2m_genes=g2m_index, gene_pool=gene_pool_index, n_bins=par["n_bins"], - random_state=par["random_state"] + random_state=par["random_state"], ) # copy results to mudata output_slot_mapping = { par["obs_s_score"]: "S_score", par["obs_g2m_score"]: "G2M_score", - par["obs_phase"]: "phase" + par["obs_phase"]: "phase", } -assert all(adata_scanpy.obs.index 
== input_adata.obs.index), "index mismatch between input adata and scanpy output adata" +assert all( + adata_scanpy.obs.index == input_adata.obs.index +), "index mismatch between input adata and scanpy output adata" for dest, orig in output_slot_mapping.items(): input_adata.obs[dest] = adata_scanpy.obs[orig] diff --git a/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py b/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py index b3b06bcaca8..62edd3ebd9b 100644 --- a/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py +++ b/src/feature_annotation/score_genes_cell_cycle_scanpy/test.py @@ -6,25 +6,42 @@ input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_cell_scoring_cell_cycle(run_component, tmp_path): output_file = tmp_path / "output.h5mu" - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--s_genes", "MCM5", - "--s_genes", "PCNA", - "--s_genes", "TYMS", - "--g2m_genes", "UBE2C", - "--g2m_genes", "BIRC5", - "--g2m_genes", "TPX2", - "--output", output_file, - "--obs_phase", "my_phase", - "--obs_s_score", "my_s_score", - "--obs_g2m_score", "my_g2m_score", - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes", + "MCM5", + "--s_genes", + "PCNA", + "--s_genes", + "TYMS", + "--g2m_genes", + "UBE2C", + "--g2m_genes", + "BIRC5", + "--g2m_genes", + "TPX2", + "--output", + output_file, + "--obs_phase", + "my_phase", + "--obs_s_score", + "my_s_score", + "--obs_g2m_score", + "my_g2m_score", + ] + ) output = mu.read(output_file) @@ -35,8 +52,9 @@ def test_cell_scoring_cell_cycle(run_component, tmp_path): "my_g2m_score", ] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns mdata.mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns mdata.mod['rna'].obs['{col}']" def test_cell_scoring_cell_cycle_with_alternative_args(run_component, tmp_path): @@ -49,15 +67,24 @@ def test_cell_scoring_cell_cycle_with_alternative_args(run_component, tmp_path): with open(s_gene_file, "w") as f: f.write("MCM5\nPCNA\nTYMS\n") - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--s_genes_file", s_gene_file, - "--g2m_genes_file", g2m_gene_file, - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes_file", + s_gene_file, + "--g2m_genes_file", + g2m_gene_file, + "--output", + output_file, + ] + ) output = mu.read(output_file) @@ -68,8 +95,9 @@ def test_cell_scoring_cell_cycle_with_alternative_args(run_component, tmp_path): "G2M_score", ] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns mdata.mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns mdata.mod['rna'].obs['{col}']" def test_cell_scoring_cell_cycle_with_mixed_args(run_component, tmp_path): @@ -82,17 +110,28 @@ def test_cell_scoring_cell_cycle_with_mixed_args(run_component, tmp_path): with open(s_gene_file, "w") as f: f.write("MCM5\nPCNA\nTYMS\n") - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - 
"--var_gene_names", "gene_symbol", - "--s_genes_file", s_gene_file, - "--s_genes", "FEN1", - "--g2m_genes_file", g2m_gene_file, - "--g2m_genes", "TOP2A", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes_file", + s_gene_file, + "--s_genes", + "FEN1", + "--g2m_genes_file", + g2m_gene_file, + "--g2m_genes", + "TOP2A", + "--output", + output_file, + ] + ) output = mu.read(output_file) @@ -103,30 +142,42 @@ def test_cell_scoring_cell_cycle_with_mixed_args(run_component, tmp_path): "G2M_score", ] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns mdata.mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns mdata.mod['rna'].obs['{col}']" + def test_fail(run_component, tmp_path): output_file = tmp_path / "output_newest.h5mu" with pytest.raises(subprocess.CalledProcessError) as e_info: - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--s_genes", "a_gene_name_that_does_not_exist", - "--g2m_genes", "MCM5", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--s_genes", + "a_gene_name_that_does_not_exist", + "--g2m_genes", + "MCM5", + "--output", + output_file, + ] + ) assert e_info.value.returncode != 0 expected_error = r"The follow genes are missing from the input dataset: {\'a_gene_name_that_does_not_exist\'}" - assert re.search(expected_error, e_info.value.stdout.decode('utf-8')) is not None, \ - f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" + assert ( + re.search(expected_error, e_info.value.stdout.decode("utf-8")) is not None + ), f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" assert not output_file.exists(), f"output file should not exist: {output_file}" -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/feature_annotation/score_genes_scanpy/helper.py b/src/feature_annotation/score_genes_scanpy/helper.py index b1625ac8c31..fe559658066 100644 --- a/src/feature_annotation/score_genes_scanpy/helper.py +++ b/src/feature_annotation/score_genes_scanpy/helper.py @@ -1,11 +1,13 @@ from typing import List, Dict, Any, Optional + def read_gene_list( - par: Dict[str, Any], - gene_names: List[str], - list_key: str, - file_key: str, - required: bool = True) -> Optional[List[str]]: + par: Dict[str, Any], + gene_names: List[str], + list_key: str, + file_key: str, + required: bool = True, +) -> Optional[List[str]]: """ Reads a gene list from the parameters and returns it as a list of strings. 
""" @@ -27,7 +29,9 @@ def read_gene_list( if not par["allow_missing_genes"] and list_of_genes: missing = set(list_of_genes).difference(gene_names) if missing: - raise ValueError(f"The follow genes are missing from the input dataset: {missing}") + raise ValueError( + f"The follow genes are missing from the input dataset: {missing}" + ) # return gene list if list_of_genes: diff --git a/src/feature_annotation/score_genes_scanpy/script.py b/src/feature_annotation/score_genes_scanpy/script.py index 8f87c457ccd..dc8eed87f4b 100644 --- a/src/feature_annotation/score_genes_scanpy/script.py +++ b/src/feature_annotation/score_genes_scanpy/script.py @@ -22,9 +22,7 @@ "output_compression": "gzip", "allow_missing_genes": False, } -meta = { - "resources_dir": "src/feature_annotation/score_genes_scanpy" -} +meta = {"resources_dir": "src/feature_annotation/score_genes_scanpy"} ## VIASH END sys.path.append(meta["resources_dir"]) @@ -34,7 +32,11 @@ mdata = mu.read(par["input"]) input_adata = mdata.mod[par["modality"]] -gene_names_index = input_adata.var[par["var_gene_names"]] if par["var_gene_names"] else input_adata.var_names +gene_names_index = ( + input_adata.var[par["var_gene_names"]] + if par["var_gene_names"] + else input_adata.var_names +) gene_names = pd.Series(input_adata.var_names, index=gene_names_index) # check if var index is unique @@ -44,7 +46,9 @@ # read gene list gene_list = read_gene_list(par, gene_names.index, "gene_list", "gene_list_file") -gene_pool = read_gene_list(par, gene_names.index, "gene_pool", "gene_pool_file", required=False) +gene_pool = read_gene_list( + par, gene_names.index, "gene_pool", "gene_pool_file", required=False +) # find matching index names for given genes gene_list_index = gene_names.loc[gene_list].tolist() @@ -58,7 +62,7 @@ adata_scanpy = ad.AnnData( X=layer_data, obs=pd.DataFrame(index=input_adata.obs.index), - var=pd.DataFrame(index=input_adata.var.index) + var=pd.DataFrame(index=input_adata.var.index), ) # run score_genes @@ -68,11 +72,13 @@ gene_pool=gene_pool_index, ctrl_size=par["ctrl_size"], n_bins=par["n_bins"], - random_state=par["random_state"] + random_state=par["random_state"], ) # copy results to mudata -assert all(adata_scanpy.obs.index == input_adata.obs.index), "index mismatch between input adata and scanpy output adata" +assert all( + adata_scanpy.obs.index == input_adata.obs.index +), "index mismatch between input adata and scanpy output adata" input_adata.obs[par["obs_score"]] = adata_scanpy.obs["score"] # write output to mudata diff --git a/src/feature_annotation/score_genes_scanpy/test.py b/src/feature_annotation/score_genes_scanpy/test.py index 30f95c86734..2eef6964bc6 100644 --- a/src/feature_annotation/score_genes_scanpy/test.py +++ b/src/feature_annotation/score_genes_scanpy/test.py @@ -11,100 +11,142 @@ def gene_list_file(tmp_path): result = tmp_path / "s_genes.txt" gene_list = ["UBE2C", "BIRC5", "TPX2"] - with result.open('w') as open_gene_list_file: + with result.open("w") as open_gene_list_file: open_gene_list_file.write("\n".join(gene_list)) return result def test_cell_scoring(run_component, tmp_path): - output_file = tmp_path / "output.h5mu" - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--gene_list", "UBE2C", - "--gene_list", "BIRC5", - "--gene_list", "TPX2", - "--output", output_file, - "--obs_score", 'cell_cycle_score' - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", 
+            "--var_gene_names", + "gene_symbol", + "--gene_list", + "UBE2C", + "--gene_list", + "BIRC5", + "--gene_list", + "TPX2", + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) output = mu.read(output_file) # check output expected_rna_obs_cols = ["cell_cycle_score"] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns .mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns .mod['rna'].obs['{col}']" def test_cell_scoring_with_alternative_args(run_component, tmp_path, gene_list_file): output_file = tmp_path / "output_new.h5mu" - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--gene_list_file", gene_list_file, - "--output", output_file, - "--obs_score", 'cell_cycle_score' - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list_file", + gene_list_file, + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) output = mu.read(output_file) # check output expected_rna_obs_cols = ["cell_cycle_score"] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns mdata.mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns mdata.mod['rna'].obs['{col}']" def test_cell_scoring_with_mixed_args(run_component, tmp_path, gene_list_file): output_file = tmp_path / "output_new.h5mu" - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--gene_list_file", gene_list_file, - "--gene_list", "TOP2A", - "--output", output_file, - "--obs_score", 'cell_cycle_score' - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list_file", + gene_list_file, + "--gene_list", + "TOP2A", + "--output", + output_file, + "--obs_score", + "cell_cycle_score", + ] + ) output = mu.read(output_file) # check output expected_rna_obs_cols = ["cell_cycle_score"] for col in expected_rna_obs_cols: - assert col in output.mod["rna"].obs.columns, \ - f"could not find columns mdata.mod['rna'].obs['{col}']" + assert ( + col in output.mod["rna"].obs.columns + ), f"could not find columns mdata.mod['rna'].obs['{col}']" + def test_fail(run_component, tmp_path): output_file = tmp_path / "output_newest.h5mu" with pytest.raises(subprocess.CalledProcessError) as e_info: - run_component([ - "--input", input_file, - "--modality", "rna", - "--input_layer", "log_normalized", - "--var_gene_names", "gene_symbol", - "--gene_list", "a_gene_name_that_does_not_exist", - "--output", output_file - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--input_layer", + "log_normalized", + "--var_gene_names", + "gene_symbol", + "--gene_list", + "a_gene_name_that_does_not_exist", + "--output", + output_file, + ] + ) assert e_info.value.returncode != 0 expected_error = r"The following genes are missing from the input dataset: {\'a_gene_name_that_does_not_exist\'}" - assert re.search(expected_error, e_info.value.stdout.decode('utf-8')) is not None, \ - f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" + assert ( + re.search(expected_error, e_info.value.stdout.decode("utf-8")) is
not None + ), f"expected error message not found in {e_info.value.stdout.decode('utf-8')}" assert not output_file.exists(), f"output file should not exist: {output_file}" -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/filter/delimit_fraction/script.py b/src/filter/delimit_fraction/script.py index 514f7d36509..8e2ad51e238 100644 --- a/src/filter/delimit_fraction/script.py +++ b/src/filter/delimit_fraction/script.py @@ -1,4 +1,3 @@ - import mudata as mu import numpy as np import sys @@ -8,18 +7,19 @@ ### VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu', - 'modality': 'rna', - 'output': 'output.h5mu', - 'var_name_filter': 'filter_with_counts', - 'min_fraction': 0, - 'max_fraction': 1, - 'output_compression': 'gzip' + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "output": "output.h5mu", + "var_name_filter": "filter_with_counts", + "min_fraction": 0, + "max_fraction": 1, + "output_compression": "gzip", } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading input data") @@ -27,7 +27,7 @@ mdata.var_names_make_unique() -mod = par['modality'] +mod = par["modality"] logger.info("Processing modality %s.", mod) data = mdata.mod[mod] @@ -35,18 +35,22 @@ logger.info("\tComputing aggregations.") + def apply_filter_to_mask(mask, base, filter, comparator): new_filt = np.ravel(comparator(base, filter)) num_removed = np.sum(np.invert(new_filt) & mask) mask &= new_filt return num_removed, mask + try: - fraction = data.obs[par['obs_fraction_column']] + fraction = data.obs[par["obs_fraction_column"]] except KeyError: raise ValueError(f"Could not find column '{par['obs_fraction_column']}'") if not is_float_dtype(fraction): - raise ValueError(f"Column '{par['obs_fraction_column']}' does not contain float datatype.") + raise ValueError( + f"Column '{par['obs_fraction_column']}' does not contain float datatype." 
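+        # (e.g. a string/object column, as exercised in test.py)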
+ ) if fraction.max() > 1: raise ValueError(f"Column '{par['obs_fraction_column']}' contains values > 1.") if fraction.min() < 0: @@ -54,9 +58,20 @@ def apply_filter_to_mask(mask, base, filter, comparator): # Filter cells -filters = (("min_fraction", fraction, ge, "\tRemoving %s cells with <%s percentage mitochondrial reads."), - ("max_fraction", fraction, le, "\tRemoving %s cells with >%s percentage mitochondrial reads."), - ) +filters = ( + ( + "min_fraction", + fraction, + ge, + "\tRemoving %s cells with <%s percentage mitochondrial reads.", + ), + ( + "max_fraction", + fraction, + le, + "\tRemoving %s cells with >%s percentage mitochondrial reads.", + ), +) keep_cells = np.repeat(True, data.n_obs) for filter_name_or_value, base, comparator, message in filters: @@ -65,7 +80,9 @@ def apply_filter_to_mask(mask, base, filter, comparator): except KeyError: filter = filter_name_or_value if filter is not None: - num_removed, keep_cells = apply_filter_to_mask(keep_cells, base, filter, comparator) + num_removed, keep_cells = apply_filter_to_mask( + keep_cells, base, filter, comparator + ) logger.info(message, num_removed, filter) data.obs[par["obs_name_filter"]] = keep_cells @@ -74,4 +91,4 @@ def apply_filter_to_mask(mask, base, filter, comparator): logger.info("Writing output data to %s", par["output"]) mdata.write_h5mu(par["output"], compression=par["output_compression"]) -logger.info("Finished") \ No newline at end of file +logger.info("Finished") diff --git a/src/filter/delimit_fraction/test.py b/src/filter/delimit_fraction/test.py index a2cb6ce1a22..8a96ee59b63 100644 --- a/src/filter/delimit_fraction/test.py +++ b/src/filter/delimit_fraction/test.py @@ -8,15 +8,16 @@ ## VIASH START meta = { - 'executable': './target/executable/filter/delimit_fraction/delimit_fraction', - 'resources_dir': 'resources_test/', - 'config': "./src/filter/delimit_fraction/config.vsh.yaml" + "executable": "./target/executable/filter/delimit_fraction/delimit_fraction", + "resources_dir": "resources_test/", + "config": "./src/filter/delimit_fraction/config.vsh.yaml", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() @@ -24,22 +25,26 @@ def original_input_path(): return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + @pytest.fixture def input_h5mu(original_input_path): input_data = mu.read_h5mu(original_input_path) - input_data.mod['rna'].obs['test_fraction'] = \ - np.random.rand(input_data.mod['rna'].n_obs) + input_data.mod["rna"].obs["test_fraction"] = np.random.rand( + input_data.mod["rna"].n_obs + ) return input_data @pytest.fixture def input_h5mu_string_data(original_input_path): input_data = mu.read_h5mu(original_input_path) - string_data = ['these', 'are', 'random', 'values'] - input_data.mod['rna'].obs['test_fraction'] = \ - np.random.choice(string_data, input_data.mod['rna'].n_obs) + string_data = ["these", "are", "random", "values"] + input_data.mod["rna"].obs["test_fraction"] = np.random.choice( + string_data, input_data.mod["rna"].n_obs + ) return input_data + @pytest.fixture def input_path(input_h5mu, random_h5mu_path): output_path = random_h5mu_path() @@ -53,65 +58,97 @@ def input_path_string_data(input_h5mu_string_data, random_h5mu_path): input_h5mu_string_data.write(output_path) return output_path + def test_filter_nothing(run_component, input_path, random_h5mu_path): output_path = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_path, - 
"--min_fraction", "0", - "--max_fraction", "1", - "--output_compression", "gzip", - "--obs_name_filter", "test_output", - "--obs_fraction_column", "test_fraction" - ]) + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--min_fraction", + "0", + "--max_fraction", + "1", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) assert Path(output_path).is_file() mu_out = mu.read_h5mu(output_path) assert "test_output" in mu_out.mod["rna"].obs - assert mu_out.mod['rna'].obs['test_output'].all() + assert mu_out.mod["rna"].obs["test_output"].all() - mu_out.mod['rna'].obs = mu_out.mod['rna'].obs.drop(["test_output"], axis=1) + mu_out.mod["rna"].obs = mu_out.mod["rna"].obs.drop(["test_output"], axis=1) mu_out.obs = mu_out.obs.drop(["rna:test_output"], axis=1) mu_out.update() - assert_annotation_objects_equal(input_path, mu_out) + assert_annotation_objects_equal(input_path, mu_out) + def test_filtering_a_little(run_component, input_path, random_h5mu_path): output_path = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_path, - "--min_fraction", "0.5", - "--max_fraction", "0.7", - "--output_compression", "gzip", - "--obs_name_filter", "test_output", - "--obs_fraction_column", "test_fraction" - ]) + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--min_fraction", + "0.5", + "--max_fraction", + "0.7", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) assert Path(output_path).is_file() mu_out = mu.read_h5mu(output_path) - assert not mu_out.mod['rna'].obs['test_output'].all() - assert mu_out.mod['rna'].obs['test_output'].any() + assert not mu_out.mod["rna"].obs["test_output"].all() + assert mu_out.mod["rna"].obs["test_output"].any() - mu_out.mod['rna'].obs = mu_out.mod['rna'].obs.drop(["test_output"], axis=1) + mu_out.mod["rna"].obs = mu_out.mod["rna"].obs.drop(["test_output"], axis=1) mu_out.obs = mu_out.obs.drop(["rna:test_output"], axis=1) mu_out.update() - assert_annotation_objects_equal(input_path, mu_out) + assert_annotation_objects_equal(input_path, mu_out) -def test_filtering_wrong_data_raises(run_component, input_path_string_data, - random_h5mu_path): +def test_filtering_wrong_data_raises( + run_component, input_path_string_data, random_h5mu_path +): output_path = random_h5mu_path() with pytest.raises(CalledProcessError) as err: - run_component([ - "--input", input_path_string_data, - "--output", output_path, - "--min_fraction", "0.5", - "--max_fraction", "0.7", - "--output_compression", "gzip", - "--obs_name_filter", "test_output", - "--obs_fraction_column", "test_fraction" - ]) - assert "Column 'test_fraction' does not contain float datatype." in \ - err.value.stdout.decode('utf-8') + run_component( + [ + "--input", + input_path_string_data, + "--output", + output_path, + "--min_fraction", + "0.5", + "--max_fraction", + "0.7", + "--output_compression", + "gzip", + "--obs_name_filter", + "test_output", + "--obs_fraction_column", + "test_fraction", + ] + ) + assert ( + "Column 'test_fraction' does not contain float datatype." 
+ in err.value.stdout.decode("utf-8") + ) if __name__ == "__main__": diff --git a/src/filter/do_filter/script.py b/src/filter/do_filter/script.py index 746b18fe22a..d8b9a3d2135 100644 --- a/src/filter/do_filter/script.py +++ b/src/filter/do_filter/script.py @@ -4,25 +4,30 @@ ### VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu', - 'modality': 'rna', - 'obs_filter': ['filter_none', 'filter_with_random'], - 'var_filter': ['filter_with_random'], - 'output': 'output.h5mu' + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "rna", + "obs_filter": ["filter_none", "filter_with_random"], + "var_filter": ["filter_with_random"], + "output": "output.h5mu", } mdata = mu.read_h5mu(par["input"]) -mdata.mod['rna'].obs["filter_none"] = np.repeat(True, mdata.mod['rna'].n_obs) -mdata.mod['rna'].obs["filter_with_random"] = np.random.choice(a=[False, True], size=mdata.mod['rna'].n_obs) -mdata.mod['rna'].var["filter_with_random"] = np.random.choice(a=[False, True], size=mdata.mod['rna'].n_vars) -mod = 'rna' +mdata.mod["rna"].obs["filter_none"] = np.repeat(True, mdata.mod["rna"].n_obs) +mdata.mod["rna"].obs["filter_with_random"] = np.random.choice( + a=[False, True], size=mdata.mod["rna"].n_obs +) +mdata.mod["rna"].var["filter_with_random"] = np.random.choice( + a=[False, True], size=mdata.mod["rna"].n_vars +) +mod = "rna" ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() -logger.info("Reading %s", par['input']) +logger.info("Reading %s", par["input"]) mdata = mu.read_h5mu(par["input"]) mod = par["modality"] @@ -36,14 +41,14 @@ for obs_name in par["obs_filter"]: logger.info("Filtering modality '%s' observations by .obs['%s']", mod, obs_name) - if not obs_name in mdata.mod[mod].obs: + if obs_name not in mdata.mod[mod].obs: raise ValueError(f".mod[{mod}].obs[{obs_name}] does not exist.") if obs_name in mdata.mod[mod].obs: obs_filt &= mdata.mod[mod].obs[obs_name] for var_name in par["var_filter"]: logger.info("Filtering modality '%s' variables by .var['%s']", mod, var_name) - if not var_name in mdata.mod[mod].var: + if var_name not in mdata.mod[mod].var: raise ValueError(f".mod[{mod}].var[{var_name}] does not exist.") if var_name in mdata.mod[mod].var: var_filt &= mdata.mod[mod].var[var_name] diff --git a/src/filter/do_filter/test.py b/src/filter/do_filter/test.py index 8c08dcd86f7..6b8403391b2 100644 --- a/src/filter/do_filter/test.py +++ b/src/filter/do_filter/test.py @@ -1,4 +1,3 @@ - import sys import pytest import uuid @@ -9,21 +8,23 @@ ## VIASH START meta = { - 'name': './target/native/filter/do_filter/do_filter', - 'resources_dir': 'resources_test/', - 'executable': './target/executable/filter/do_filter/do_filter', - 'config': './src/filter/do_filter/config.vsh.yaml' + "name": "./target/native/filter/do_filter/do_filter", + "resources_dir": "resources_test/", + "executable": "./target/executable/filter/do_filter/do_filter", + "config": "./src/filter/do_filter/config.vsh.yaml", } ## VIASH END input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + @pytest.fixture def random_h5mu_path(tmp_path): unique_filename = f"{str(uuid.uuid4())}.h5mu" temp_file = tmp_path / unique_filename return temp_file + @pytest.fixture def write_to_temp_file(tmp_path): def write_h5mu_wrapper(mudata_object): @@ -31,90 +32,143 @@ def write_h5mu_wrapper(mudata_object): temp_file = 
tmp_path / unique_filename mudata_object.write(temp_file) return temp_file + return write_h5mu_wrapper + @pytest.fixture() def input_data(): mu_in = mu.read_h5mu(input_path) return mu_in + @pytest.fixture() def original_n_obs(input_data): - return input_data.mod['rna'].n_obs + return input_data.mod["rna"].n_obs + @pytest.fixture() def original_n_vars(input_data): - return input_data.mod['rna'].n_vars + return input_data.mod["rna"].n_vars + @pytest.fixture() def test_data_filter_nothing(input_data, write_to_temp_file): - rna_mod = input_data.mod['rna'] + rna_mod = input_data.mod["rna"] rna_mod.obs["filter_none"] = np.repeat(True, rna_mod.n_obs) return write_to_temp_file(input_data) + @pytest.fixture() def test_data_filter_with_random(input_data, write_to_temp_file): - rna_mod = input_data.mod['rna'] - rna_mod.obs["filter_with_random"] = np.random.choice([False, True], size=rna_mod.n_obs) - rna_mod.var["filter_with_random"] = np.random.choice([False, True], size=rna_mod.n_vars) + rna_mod = input_data.mod["rna"] + rna_mod.obs["filter_with_random"] = np.random.choice( + [False, True], size=rna_mod.n_obs + ) + rna_mod.var["filter_with_random"] = np.random.choice( + [False, True], size=rna_mod.n_vars + ) return write_to_temp_file(input_data) -def test_filtering_a_little_bit(run_component, - test_data_filter_with_random, - random_h5mu_path, - original_n_obs, - original_n_vars): - component_output = run_component([ - "--input", test_data_filter_with_random, - "--output", random_h5mu_path, - "--obs_filter", "filter_with_random", - "--var_filter", "filter_with_random", - "--output_compression", "gzip" - ]) + +def test_filtering_a_little_bit( + run_component, + test_data_filter_with_random, + random_h5mu_path, + original_n_obs, + original_n_vars, +): + component_output = run_component( + [ + "--input", + test_data_filter_with_random, + "--output", + random_h5mu_path, + "--obs_filter", + "filter_with_random", + "--var_filter", + "filter_with_random", + "--output_compression", + "gzip", + ] + ) assert random_h5mu_path.is_file(), "Output file not found" mu_out = mu.read_h5mu(random_h5mu_path) - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars assert new_obs < original_n_obs, "Some RNA obs should have been filtered" - assert new_vars < original_n_vars,"Some RNA vars should have been filtered" - assert b"Filtering modality 'rna' observations by .obs['filter_with_random']" in component_output - assert b"Filtering modality 'rna' variables by .var['filter_with_random']" in component_output - -def test_filter_nothing(run_component, - test_data_filter_nothing, - random_h5mu_path, - original_n_obs, - original_n_vars): - run_component([ - "--input", test_data_filter_nothing, - "--output", random_h5mu_path, - "--obs_filter", "filter_none"]) + assert new_vars < original_n_vars, "Some RNA vars should have been filtered" + assert ( + b"Filtering modality 'rna' observations by .obs['filter_with_random']" + in component_output + ) + assert ( + b"Filtering modality 'rna' variables by .var['filter_with_random']" + in component_output + ) + + +def test_filter_nothing( + run_component, + test_data_filter_nothing, + random_h5mu_path, + original_n_obs, + original_n_vars, +): + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--obs_filter", + "filter_none", + ] + ) assert random_h5mu_path.is_file(), "Output file not found" mu_out = mu.read_h5mu(random_h5mu_path) - new_obs = 
mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars assert new_obs == original_n_obs, "No RNA obs should have been filtered" assert new_vars == original_n_vars, "No RNA vars should have been filtered" -def test_nonexisting_column_raises(run_component, - test_data_filter_nothing, - random_h5mu_path): + +def test_nonexisting_column_raises( + run_component, test_data_filter_nothing, random_h5mu_path +): with pytest.raises(CalledProcessError) as err: - run_component([ - "--input", test_data_filter_nothing, - "--output", random_h5mu_path, - "--obs_filter", "doesnotexist"]) - assert re.search(r"\.mod\[rna\]\.obs\[doesnotexist\] does not exist\.", - err.value.stdout.decode('utf-8')) - + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--obs_filter", + "doesnotexist", + ] + ) + assert re.search( + r"\.mod\[rna\]\.obs\[doesnotexist\] does not exist\.", + err.value.stdout.decode("utf-8"), + ) + with pytest.raises(CalledProcessError) as err: - run_component([ - "--input", test_data_filter_nothing, - "--output", random_h5mu_path, - "--var_filter", "doesnotexist"]) + run_component( + [ + "--input", + test_data_filter_nothing, + "--output", + random_h5mu_path, + "--var_filter", + "doesnotexist", + ] + ) + + assert re.search( + r"\.mod\[rna\]\.var\[doesnotexist\] does not exist\.", + err.value.stdout.decode("utf-8"), + ) - assert re.search(r"\.mod\[rna\]\.var\[doesnotexist\] does not exist\.", - err.value.stdout.decode('utf-8')) if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/filter/filter_with_counts/script.py b/src/filter/filter_with_counts/script.py index bd7c71e5eb7..739774e2cd9 100644 --- a/src/filter/filter_with_counts/script.py +++ b/src/filter/filter_with_counts/script.py @@ -1,4 +1,3 @@ - import mudata as mu import numpy as np import sys @@ -6,27 +5,25 @@ ### VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu', - 'modality': 'rna', - 'output': 'output.h5mu', - 'obs_name_filter': 'filter_with_counts', - 'var_name_filter': 'filter_with_counts', - 'do_subset': True, - 'min_counts': 200, - 'max_counts': 5000000, - 'min_genes_per_cell': 200, - 'max_genes_per_cell': 1500000, - 'min_cells_per_gene': 3, - 'layer': None -} -meta = { - 'name': 'filter_on_counts', - 'resources_dir': '.' 
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "modality": "rna",
+    "output": "output.h5mu",
+    "obs_name_filter": "filter_with_counts",
+    "var_name_filter": "filter_with_counts",
+    "do_subset": True,
+    "min_counts": 200,
+    "max_counts": 5000000,
+    "min_genes_per_cell": 200,
+    "max_genes_per_cell": 1500000,
+    "min_cells_per_gene": 3,
+    "layer": None,
 }
+meta = {"name": "filter_on_counts", "resources_dir": "."}
 ### VIASH END
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 logger.info("Reading input data")
@@ -34,41 +31,64 @@
 
 mdata.var_names_make_unique()
 
-mod = par['modality']
+mod = par["modality"]
 logger.info("Processing modality %s.", mod)
 modality_data = mdata.mod[mod]
 logger.info("\tUnfiltered data: %s", modality_data)
 
 logger.info("Selecting input layer %s", "X" if not par["layer"] else par["layer"])
-input_layer = modality_data.X if not par["layer"] else modality_data.layers[par["layer"]]
+input_layer = (
+    modality_data.X if not par["layer"] else modality_data.layers[par["layer"]]
+)
 
 logger.info("\tComputing aggregations.")
 n_counts_per_cell = np.ravel(np.sum(input_layer, axis=1))
 n_cells_per_gene = np.sum(input_layer > 0, axis=0)
 n_genes_per_cell = np.sum(input_layer > 0, axis=1)
 
+
 def apply_filter_to_mask(mask, base, filter, comparator):
     new_filt = np.ravel(comparator(base, filter))
     num_removed = np.sum(np.invert(new_filt) & mask)
     mask &= new_filt
     return num_removed, mask
 
+
 # Filter genes
 keep_genes = np.repeat(True, modality_data.n_vars)
 if par["min_cells_per_gene"] is not None:
-    num_removed, keep_genes = apply_filter_to_mask(keep_genes,
-                                                   n_cells_per_gene,
-                                                   par['min_cells_per_gene'],
-                                                   ge)
-    logger.info("\tRemoving %s genes with non-zero values in <%s cells.",
-                num_removed, par['min_cells_per_gene'])
+    num_removed, keep_genes = apply_filter_to_mask(
+        keep_genes, n_cells_per_gene, par["min_cells_per_gene"], ge
+    )
+    logger.info(
+        "\tRemoving %s genes with non-zero values in <%s cells.",
+        num_removed,
+        par["min_cells_per_gene"],
+    )
 
 # Filter cells
-filters = (("min_genes_per_cell", n_genes_per_cell, ge, "\tRemoving %s cells with non-zero values in <%s genes."),
-           ("max_genes_per_cell", n_genes_per_cell, le, "\tRemoving %s cells with non-zero values in >%s genes."),
-           ("min_counts", n_counts_per_cell, ge, "\tRemoving %s cells with <%s total counts."),
-           ("max_counts", n_counts_per_cell, le, "\tRemoving %s cells with >%s total counts."),
-           (0, np.sum(input_layer[:,keep_genes], axis=1), gt, "\tRemoving %s cells with %s counts"))
+filters = (
+    (
+        "min_genes_per_cell",
+        n_genes_per_cell,
+        ge,
+        "\tRemoving %s cells with non-zero values in <%s genes.",
+    ),
+    (
+        "max_genes_per_cell",
+        n_genes_per_cell,
+        le,
+        "\tRemoving %s cells with non-zero values in >%s genes.",
+    ),
+    ("min_counts", n_counts_per_cell, ge, "\tRemoving %s cells with <%s total counts."),
+    ("max_counts", n_counts_per_cell, le, "\tRemoving %s cells with >%s total counts."),
+    (
+        0,
+        np.sum(input_layer[:, keep_genes], axis=1),
+        gt,
+        "\tRemoving %s cells with %s counts",
+    ),
+)
 
 keep_cells = np.repeat(True, modality_data.n_obs)
 for filter_name_or_value, base, comparator, message in filters:
@@ -77,7 +97,9 @@ def apply_filter_to_mask(mask, base, filter, comparator):
     except KeyError:
         filter = filter_name_or_value
     if filter is not None:
-        num_removed, keep_cells = apply_filter_to_mask(keep_cells, base, filter, comparator)
+        num_removed, keep_cells = apply_filter_to_mask(
+            keep_cells, base, filter, comparator
+        )
         logger.info(message, num_removed, filter)
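The loop above folds every active threshold into a single boolean keep-mask: each (parameter, aggregate, comparator) entry ANDs its verdict into the running mask, and apply_filter_to_mask counts how many entries flip from kept to removed so each rule can be logged with the number of cells it alone eliminated. A minimal standalone sketch of that accumulate-and-count pattern, using toy totals and hypothetical bounds rather than the component's actual parameters:

import numpy as np
from operator import ge, le

n_counts = np.array([10, 250, 4000, 9_000_000])  # toy per-cell count totals
keep = np.repeat(True, n_counts.size)

for threshold, comparator in ((200, ge), (5_000_000, le)):  # hypothetical bounds
    verdict = np.ravel(comparator(n_counts, threshold))
    newly_removed = np.sum(np.invert(verdict) & keep)  # flips from kept to removed
    keep &= verdict
    print(f"rule removed {newly_removed} cell(s)")

print(keep)  # [False  True  True False]

Because the same mask is threaded through every rule, the logged counts are incremental: a cell already dropped by an earlier rule is not counted again by a later one.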
 if par["obs_name_filter"] is not None:
@@ -92,4 +114,4 @@ def apply_filter_to_mask(mask, base, filter, comparator):
 logger.info("Writing output data to %s", par["output"])
 mdata.write_h5mu(par["output"], compression=par["output_compression"])
 
-logger.info("Finished")
\ No newline at end of file
+logger.info("Finished")
diff --git a/src/filter/filter_with_counts/test.py b/src/filter/filter_with_counts/test.py
index 77b32414181..0d22c5775ad 100644
--- a/src/filter/filter_with_counts/test.py
+++ b/src/filter/filter_with_counts/test.py
@@ -2,18 +2,20 @@
 import sys
 from pathlib import Path
 import pytest
+import numpy as np
 
 ## VIASH START
 meta = {
-    'executable': './target/executable/filter/filter_with_counts/filter_with_counts',
-    'resources_dir': 'resources_test/',
-    'config': "/home/di/code/openpipeline/src/filter/filter_with_counts/config.vsh.yaml"
+    "executable": "./target/executable/filter/filter_with_counts/filter_with_counts",
+    "resources_dir": "resources_test/",
+    "config": "/home/di/code/openpipeline/src/filter/filter_with_counts/config.vsh.yaml",
 }
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 
@@ -21,118 +23,190 @@ def input_path():
     return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 
+
 @pytest.fixture
 def input_h5mu(input_path):
     return mu.read_h5mu(input_path)
 
+
 @pytest.fixture
 def input_n_rna_obs(input_h5mu):
-    return input_h5mu.mod['rna'].n_obs
+    return input_h5mu.mod["rna"].n_obs
 
+
 @pytest.fixture
 def input_n_prot_obs(input_h5mu):
-    return input_h5mu.mod['prot'].n_obs
+    return input_h5mu.mod["prot"].n_obs
 
+
 @pytest.fixture
 def input_n_rna_vars(input_h5mu):
-    return input_h5mu.mod['rna'].n_vars
+    return input_h5mu.mod["rna"].n_vars
 
+
 @pytest.fixture
 def input_n_prot_vars(input_h5mu):
-    return input_h5mu.mod['prot'].n_vars
-
-def test_filter_nothing(run_component, input_path,
-                        input_n_rna_obs, input_n_prot_obs,
-                        input_n_rna_vars, input_n_prot_vars):
-    run_component([
-        "--input", input_path,
-        "--output", "output-1.h5mu",
-        "--min_cells_per_gene", "3",
-        "--output_compression", "gzip"
-    ])
+    return input_h5mu.mod["prot"].n_vars
+
+
+def test_filter_nothing(
+    run_component,
+    input_path,
+    input_n_rna_obs,
+    input_n_prot_obs,
+    input_n_rna_vars,
+    input_n_prot_vars,
+):
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--output",
+            "output-1.h5mu",
+            "--min_cells_per_gene",
+            "3",
+            "--output_compression",
+            "gzip",
+        ]
+    )
     assert Path("output-1.h5mu").is_file()
     mu_out = mu.read_h5mu("output-1.h5mu")
     assert "filter_with_counts" in mu_out.mod["rna"].obs
     assert "filter_with_counts" in mu_out.mod["rna"].var
-    new_obs = mu_out.mod['rna'].n_obs
-    new_vars = mu_out.mod['rna'].n_vars
+    new_obs = mu_out.mod["rna"].n_obs
+    new_vars = mu_out.mod["rna"].n_vars
     assert new_obs == input_n_rna_obs
     assert new_vars == input_n_rna_vars
-    assert mu_out.mod['prot'].n_obs == input_n_prot_obs
-    assert mu_out.mod['prot'].n_vars == input_n_prot_vars
-    assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"]
-    assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"]
-
-def test_filtering_a_little(run_component, input_path,
-                            input_n_rna_obs, input_n_prot_obs,
-                            input_n_rna_vars, input_n_prot_vars):
-    run_component([
-        "--input", input_path,
-        "--output", "output-2.h5mu",
-        "--modality", "rna",
-        "--min_counts", "200",
-        "--max_counts", "5000000",
-
"--min_genes_per_cell", "200", - "--max_genes_per_cell", "1500000", - "--min_cells_per_gene", "10", - "--do_subset"]) + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + + +def test_filtering_a_little( + run_component, + input_path, + input_n_rna_obs, + input_n_prot_obs, + input_n_rna_vars, + input_n_prot_vars, +): + run_component( + [ + "--input", + input_path, + "--output", + "output-2.h5mu", + "--modality", + "rna", + "--min_counts", + "200", + "--max_counts", + "5000000", + "--min_genes_per_cell", + "200", + "--max_genes_per_cell", + "1500000", + "--min_cells_per_gene", + "10", + "--do_subset", + ] + ) assert Path("output-2.h5mu").is_file() mu_out = mu.read_h5mu("output-2.h5mu") - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars assert new_obs < input_n_rna_obs assert new_vars < input_n_rna_vars - assert mu_out.mod['prot'].n_obs == input_n_prot_obs - assert mu_out.mod['prot'].n_vars == input_n_prot_vars - assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"] - assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"] + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + def test_filter_cells_without_counts(run_component, input_h5mu, tmp_path): # create_an_empty_cell - obs_to_remove = input_h5mu.mod['rna'].obs.index[0] - input_h5mu.mod['rna'].X[0] = 0 + obs_to_remove = input_h5mu.mod["rna"].obs.index[0] + input_h5mu.mod["rna"].X[0] = 0 temp_h5mu_path = tmp_path / "temp.h5mu" input_h5mu.write(temp_h5mu_path) - run_component([ - "--input", temp_h5mu_path, - "--output", "output-3.h5mu", - "--min_cells_per_gene", "0", - ]) + run_component( + [ + "--input", + temp_h5mu_path, + "--output", + "output-3.h5mu", + "--min_cells_per_gene", + "0", + ] + ) assert Path("output-3.h5mu").is_file() mu_out = mu.read_h5mu("output-3.h5mu") - assert mu_out.mod['rna'].obs.at[obs_to_remove, 'filter_with_counts'] == False - assert "mitochondrial" not in mu_out.mod['rna'].var - -def test_filter_using_different_layer(run_component, input_h5mu, tmp_path, - input_n_rna_obs, input_n_prot_obs, - input_n_rna_vars, input_n_prot_vars): + assert mu_out.mod["rna"].obs.at[obs_to_remove, "filter_with_counts"] is np.False_ + assert "mitochondrial" not in mu_out.mod["rna"].var + + +def test_filter_using_different_layer( + run_component, + input_h5mu, + tmp_path, + input_n_rna_obs, + input_n_prot_obs, + input_n_rna_vars, + input_n_prot_vars, +): # move X to different input layer - input_h5mu.mod['rna'].layers['test_layer'] = input_h5mu.mod['rna'].X.copy() - input_h5mu.mod['rna'].X = None + input_h5mu.mod["rna"].layers["test_layer"] = input_h5mu.mod["rna"].X.copy() + input_h5mu.mod["rna"].X = None temp_h5mu_path = tmp_path / "temp.h5mu" input_h5mu.write(temp_h5mu_path) - run_component([ - "--input", temp_h5mu_path, - "--output", "output-4.h5mu", - "--modality", "rna", - "--min_counts", "200", - "--max_counts", "5000000", - "--min_genes_per_cell", "200", - 
"--max_genes_per_cell", "1500000", - "--min_cells_per_gene", "10", - "--layer", "test_layer", - "--do_subset"]) + run_component( + [ + "--input", + temp_h5mu_path, + "--output", + "output-4.h5mu", + "--modality", + "rna", + "--min_counts", + "200", + "--max_counts", + "5000000", + "--min_genes_per_cell", + "200", + "--max_genes_per_cell", + "1500000", + "--min_cells_per_gene", + "10", + "--layer", + "test_layer", + "--do_subset", + ] + ) assert Path("output-4.h5mu").is_file() mu_out = mu.read_h5mu("output-2.h5mu") - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars assert new_obs < input_n_rna_obs assert new_vars < input_n_rna_vars - assert mu_out.mod['prot'].n_obs == input_n_prot_obs - assert mu_out.mod['prot'].n_vars == input_n_prot_vars - assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"] - assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"] + assert mu_out.mod["prot"].n_obs == input_n_prot_obs + assert mu_out.mod["prot"].n_vars == input_n_prot_vars + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ] + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ] + if __name__ == "__main__": exit(pytest.main([__file__])) diff --git a/src/filter/filter_with_scrublet/script.py b/src/filter/filter_with_scrublet/script.py index f0c92906c07..6900135dbe8 100644 --- a/src/filter/filter_with_scrublet/script.py +++ b/src/filter/filter_with_scrublet/script.py @@ -24,16 +24,17 @@ "layer": None, } meta = { - 'name': 'scrublet', - 'resources_dir': '.', + "name": "scrublet", + "resources_dir": ".", } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() -logger.info("Reading %s.", par['input']) +logger.info("Reading %s.", par["input"]) mdata = mu.read_h5mu(par["input"]) mod = par["modality"] @@ -52,34 +53,42 @@ min_gene_variability_pctl=par["min_gene_variablity_percent"], n_prin_comps=par["num_pca_components"], distance_metric=par["distance_metric"], - use_approx_neighbors=False + use_approx_neighbors=False, ) try: keep_cells = np.invert(predicted_doublets) except TypeError: - if par['allow_automatic_threshold_detection_fail']: + if par["allow_automatic_threshold_detection_fail"]: # Scrublet might not throw an error and return None if it fails to detect doublets... - logger.info("\tScrublet could not automatically detect the doublet score threshold. Setting output columns to NA.") + logger.info( + "\tScrublet could not automatically detect the doublet score threshold. Setting output columns to NA." + ) keep_cells = np.nan doublet_scores = np.nan else: - raise RuntimeError("Scrublet could not automatically detect the doublet score threshold. " - "--allow_automatic_threshold_detection_fail can be used to ignore this failure " - "and set the corresponding output columns to NA.") + raise RuntimeError( + "Scrublet could not automatically detect the doublet score threshold. " + "--allow_automatic_threshold_detection_fail can be used to ignore this failure " + "and set the corresponding output columns to NA." 
 
 logger.info("\tStoring output into .obs")
 if par["obs_name_doublet_score"] is not None:
     data.obs[par["obs_name_doublet_score"]] = doublet_scores
-    data.obs[par["obs_name_doublet_score"]] = data.obs[par["obs_name_doublet_score"]].astype("float64")
+    data.obs[par["obs_name_doublet_score"]] = data.obs[
+        par["obs_name_doublet_score"]
+    ].astype("float64")
 if par["obs_name_filter"] is not None:
     data.obs[par["obs_name_filter"]] = keep_cells
-    data.obs[par["obs_name_filter"]] = data.obs[par["obs_name_filter"]].astype(pd.BooleanDtype())
+    data.obs[par["obs_name_filter"]] = data.obs[par["obs_name_filter"]].astype(
+        pd.BooleanDtype()
+    )
 
 if par["do_subset"]:
     if pd.api.types.is_scalar(keep_cells) and pd.isna(keep_cells):
         logger.warning("Not subsetting because doublets were not predicted")
-    else: 
+    else:
         mdata.mod[mod] = data[keep_cells, :]
 
 logger.info("Writing h5mu to %s", par["output"])
diff --git a/src/filter/filter_with_scrublet/test.py b/src/filter/filter_with_scrublet/test.py
index 8870be3c31c..8d3899e3abe 100644
--- a/src/filter/filter_with_scrublet/test.py
+++ b/src/filter/filter_with_scrublet/test.py
@@ -11,9 +11,9 @@
 
 ## VIASH START
 meta = {
-    'name': 'foo',
-    'resources_dir': 'resources_test/',
-    'executable': 'target/executable/filter/filter_with_scrublet/filter_with_scrublet'
+    "name": "foo",
+    "resources_dir": "resources_test/",
+    "executable": "target/executable/filter/filter_with_scrublet/filter_with_scrublet",
 }
 # def run_component(args_as_list):
 #     try:
@@ -28,61 +28,86 @@
 
 # read input file
 input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 input_mu = mu.read_h5mu(input_path)
-orig_obs = input_mu.mod['rna'].n_obs
-orig_vars = input_mu.mod['rna'].n_vars
-orig_prot_obs = input_mu.mod['prot'].n_obs
-orig_prot_vars = input_mu.mod['prot'].n_vars
+orig_obs = input_mu.mod["rna"].n_obs
+orig_vars = input_mu.mod["rna"].n_vars
+orig_prot_obs = input_mu.mod["prot"].n_obs
+orig_prot_vars = input_mu.mod["prot"].n_vars
 
 
 def test_filter_a_little_bit(run_component):
     output_mu = "output-1.h5mu"
-    run_component([
-        "--input", input_path,
-        "--output", output_mu,
-        "--min_counts", "3",
-        "--output_compression", "gzip"
-    ])
+    run_component(
+        [
+            "--input",
+            input_path,
+            "--output",
+            output_mu,
+            "--min_counts",
+            "3",
+            "--output_compression",
+            "gzip",
+        ]
+    )
     assert Path(output_mu).is_file(), "Output file not found"
     mu_out = mu.read_h5mu(output_mu)
 
     assert "filter_with_scrublet" in mu_out.mod["rna"].obs
 
-    new_obs = mu_out.mod['rna'].n_obs
-    new_vars = mu_out.mod['rna'].n_vars
+    new_obs = mu_out.mod["rna"].n_obs
+    new_vars = mu_out.mod["rna"].n_vars
     assert new_obs == orig_obs, "No RNA obs should have been filtered"
     assert new_vars == orig_vars, "No RNA vars should have been filtered"
-    assert mu_out.mod['prot'].n_obs == orig_prot_obs, "No prot obs should have been filtered"
-    assert mu_out.mod['prot'].n_vars == orig_prot_vars, "No prot vars should have been filtered"
-    assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"],\
-        "Feature types of RNA modality should be Gene Expression"
-    assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"],\
-        "Feature types of prot modality should be Antibody Capture"
+    assert (
+        mu_out.mod["prot"].n_obs == orig_prot_obs
+    ), "No prot obs should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_vars == orig_prot_vars
+    ), "No prot vars should have been filtered"
+    assert
list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + def test_filtering_a_lot(run_component): output_mu = "output-2.h5mu" - run_component([ - "--input", input_path, - "--output", output_mu, - "--modality", "rna", - "--min_counts", "10", - "--num_pca_components", "10", - "--do_subset" - ]) + run_component( + [ + "--input", + input_path, + "--output", + output_mu, + "--modality", + "rna", + "--min_counts", + "10", + "--num_pca_components", + "10", + "--do_subset", + ] + ) assert Path(output_mu).is_file(), "Output file not found" mu_out = mu.read_h5mu(output_mu) - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars assert new_obs < orig_obs, "Some cells should have been filtered" assert new_vars == orig_vars, "No genes should have been filtered" - assert mu_out.mod['prot'].n_obs == orig_obs, "No prot obs should have been filtered" - assert mu_out.mod['prot'].n_vars == orig_prot_vars, "No prot vars should have been filtered" - assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"],\ - "Feature types of RNA modality should be Gene Expression" - assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"],\ - "Feature types of prot modality should be Antibody Capture" + assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered" + assert ( + mu_out.mod["prot"].n_vars == orig_prot_vars + ), "No prot vars should have been filtered" + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + @pytest.fixture(scope="module") def input_with_failed_run(): @@ -96,105 +121,146 @@ def input_with_failed_run(): # Simulate a failed scrublet run by passing very little cells mudata = mudata_in[152].copy() nobs = 100 - x_data = np.repeat(mudata.mod['rna'].X.todense(), nobs, axis=0) - + x_data = np.repeat(mudata.mod["rna"].X.todense(), nobs, axis=0) + # Random perturbations because otherwise the detection fails in other ways (PCA cannot be run) replace_rate = 0.000001 - mask = np.random.choice([0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate)).astype("bool") + mask = np.random.choice( + [0, 1], size=x_data.shape, p=((1 - replace_rate), replace_rate) + ).astype("bool") r = np.random.rand(*x_data.shape) * np.max(x_data) x_data[mask] = r[mask] # create obs - obs_name = mudata.mod['rna'].obs.index.to_list()[0] + obs_name = mudata.mod["rna"].obs.index.to_list()[0] obs_data = pd.DataFrame([], index=[f"{obs_name}_{i}" for i in range(nobs)]) # create resulting mudata - mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod['rna'].var) + mod = ad.AnnData(X=csr_matrix(x_data), obs=obs_data, var=mudata.mod["rna"].var) new_mudata = mu.MuData({"rna": mod}) new_mudata.update() new_mudata.write(new_mudata_path) return new_mudata_path + @pytest.mark.xfail(strict=False) -def test_doublet_automatic_threshold_detection_fails(run_component, input_with_failed_run): +def 
test_doublet_automatic_threshold_detection_fails( + run_component, input_with_failed_run +): """ Test if the component fails if doublet score threshold could not automatically be set """ output_mu = "output-4.h5mu" - with pytest.raises(subprocess.CalledProcessError) as e_info: - run_component([ - "--input", input_with_failed_run, - "--output", output_mu, - "--output_compression", "gzip", - "--num_pca_components", "1", - "--min_gene_variablity_percent", "0", - "--min_cells", "1", - "--min_counts", "1", - ]) - assert re.search(r"RuntimeError: Scrublet could not automatically detect the doublet score threshold\. " + with pytest.raises(subprocess.CalledProcessError) as e_info: + run_component( + [ + "--input", + input_with_failed_run, + "--output", + output_mu, + "--output_compression", + "gzip", + "--num_pca_components", + "1", + "--min_gene_variablity_percent", + "0", + "--min_cells", + "1", + "--min_counts", + "1", + ] + ) + assert re.search( + r"RuntimeError: Scrublet could not automatically detect the doublet score threshold\. " r"--allow_automatic_threshold_detection_fail can be used to ignore this failure and " r"set the corresponding output columns to NA\.", - e_info.value.stdout.decode('utf-8')) + e_info.value.stdout.decode("utf-8"), + ) assert not Path(output_mu).is_file(), "Output file not found" + @pytest.mark.xfail(strict=False) -def test_doublet_automatic_threshold_detection_fails_recovery(run_component, input_with_failed_run): +def test_doublet_automatic_threshold_detection_fails_recovery( + run_component, input_with_failed_run +): """ Test if the component can recover from scrublet not automatically able to set the doublet score threshold and it is not set. """ output_mu = "output-5.h5mu" - run_component([ - "--input", input_with_failed_run, - "--output", output_mu, - "--output_compression", "gzip", - "--num_pca_components", "1", - "--min_gene_variablity_percent", "0", - "--min_cells", "1", - "--min_counts", "1", - "--allow_automatic_threshold_detection_fail" - ]) + run_component( + [ + "--input", + input_with_failed_run, + "--output", + output_mu, + "--output_compression", + "gzip", + "--num_pca_components", + "1", + "--min_gene_variablity_percent", + "0", + "--min_cells", + "1", + "--min_counts", + "1", + "--allow_automatic_threshold_detection_fail", + ] + ) assert Path(output_mu).is_file(), "Output file not found" mu_out = mu.read_h5mu(output_mu) - assert mu_out.mod['rna'].obs['filter_with_scrublet'].isna().all() + assert mu_out.mod["rna"].obs["filter_with_scrublet"].isna().all() + def test_selecting_input_layer(run_component, tmp_path): output_mu = "output-2.h5mu" input_data = mu.read_h5mu(input_path) - input_data.mod['rna'].layers['test_layer'] = input_data.mod['rna'].X - input_data.mod['rna'].X = None + input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X + input_data.mod["rna"].X = None temp_input = tmp_path / "temp.h5mu" input_data.write(temp_input) - run_component([ - "--input", temp_input, - "--output", output_mu, - "--modality", "rna", - "--min_counts", "10", - "--num_pca_components", "10", - "--layer", "test_layer", - "--do_subset" - ]) + run_component( + [ + "--input", + temp_input, + "--output", + output_mu, + "--modality", + "rna", + "--min_counts", + "10", + "--num_pca_components", + "10", + "--layer", + "test_layer", + "--do_subset", + ] + ) assert Path(output_mu).is_file(), "Output file not found" mu_out = mu.read_h5mu(output_mu) - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars + new_obs = mu_out.mod["rna"].n_obs + 
new_vars = mu_out.mod["rna"].n_vars
     assert new_obs < orig_obs, "Some cells should have been filtered"
     assert new_vars == orig_vars, "No genes should have been filtered"
-    assert mu_out.mod['prot'].n_obs == orig_obs, "No prot obs should have been filtered"
-    assert mu_out.mod['prot'].n_vars == orig_prot_vars, "No prot vars should have been filtered"
-    assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"],\
-        "Feature types of RNA modality should be Gene Expression"
-    assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"],\
-        "Feature types of prot modality should be Antibody Capture"
+    assert mu_out.mod["prot"].n_obs == orig_obs, "No prot obs should have been filtered"
+    assert (
+        mu_out.mod["prot"].n_vars == orig_prot_vars
+    ), "No prot vars should have been filtered"
+    assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [
+        "Gene Expression"
+    ], "Feature types of RNA modality should be Gene Expression"
+    assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [
+        "Antibody Capture"
+    ], "Feature types of prot modality should be Antibody Capture"
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     exit(pytest.main([__file__]))
diff --git a/src/filter/intersect_obs/script.py b/src/filter/intersect_obs/script.py
index d99ae738079..d8a709fd7d4 100644
--- a/src/filter/intersect_obs/script.py
+++ b/src/filter/intersect_obs/script.py
@@ -8,11 +8,9 @@
 par = {
     "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
     "modalities": ["rna", "prot"],
-    "output": "output.h5mu"
-}
-meta = {
-
+    "output": "output.h5mu",
 }
+meta = {}
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
@@ -21,47 +19,54 @@
 
 logger = setup_logger()
 
+
 def main():
-    modality_names = par['modalities']
+    modality_names = par["modalities"]
     if len(modality_names) < 2:
         raise ValueError("Please provide two or more modalities.")
-    
+
     obs_names = {}
-    for mod_name in par['modalities']:
+    for mod_name in par["modalities"]:
         try:
-            modality = mu.read_h5ad(filename=par['input'], mod=mod_name)
+            modality = mu.read_h5ad(filename=par["input"], mod=mod_name)
         except KeyError:
-            raise ValueError(f"Modality {mod_name} does not exist for file {par['input']}.")
+            raise ValueError(
+                f"Modality {mod_name} does not exist for file {par['input']}."
+ ) obs_names[mod_name] = modality.obs_names.copy() del modality - + intersected_index = None for mod_name, mod_index in obs_names.items(): if intersected_index is None: intersected_index = mod_index continue intersected_index = intersected_index.intersection(mod_index) - - output_file = Path(par['output']) - output_file_uncompressed = output_file.with_name(output_file.stem + "_uncompressed.h5mu") + output_file = Path(par["output"]) + output_file_uncompressed = output_file.with_name( + output_file.stem + "_uncompressed.h5mu" + ) output_file_uncompressed.touch() mdata = mu.MuData({modality: ad.AnnData() for modality in modality_names}) - mdata.write(output_file_uncompressed, compression=par['output_compression']) - + mdata.write(output_file_uncompressed, compression=par["output_compression"]) + for mod_name in modality_names: - modality = mu.read_h5ad(filename=par['input'], mod=mod_name) + modality = mu.read_h5ad(filename=par["input"], mod=mod_name) intersected_modality = modality[intersected_index] mu.write_h5ad(output_file_uncompressed, data=intersected_modality, mod=mod_name) - if par['output_compression']: - compress_h5mu(output_file_uncompressed, output_file, compression=par['output_compression']) + if par["output_compression"]: + compress_h5mu( + output_file_uncompressed, output_file, compression=par["output_compression"] + ) output_file_uncompressed.unlink() else: shutil.move(output_file_uncompressed, output_file) - + + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/filter/intersect_obs/test.py b/src/filter/intersect_obs/test.py index 275eefe841d..5d5fb3719a5 100644 --- a/src/filter/intersect_obs/test.py +++ b/src/filter/intersect_obs/test.py @@ -7,48 +7,60 @@ ## VIASH START meta = { - 'executable': './target/executable/filter/intersect_obs/intersect_obs', - 'resources_dir': './resources_test/', - 'cpus': 2, - 'config': './src/filter/intersect_modalities/config.vsh.yaml' + "executable": "./target/executable/filter/intersect_obs/intersect_obs", + "resources_dir": "./resources_test/", + "cpus": 2, + "config": "./src/filter/intersect_modalities/config.vsh.yaml", } ## VIASH END -input_sample_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" +input_sample_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" +) @pytest.fixture def generate_h5mu(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) - var = pd.DataFrame([["a"], ["b"], ["c"]], - index=df.columns, columns=["Feat"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = AnnData(df, obs=obs, var=var) - df2 = pd.DataFrame([[7, 8, 9], [10, 11, 12]], index=["obs2", "obs3"], columns=df.columns) + df2 = pd.DataFrame( + [[7, 8, 9], [10, 11, 12]], index=["obs2", "obs3"], columns=df.columns + ) var2 = pd.DataFrame(["d", "e", "g"], index=df2.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df2.index, columns=["Obs"]) ad2 = AnnData(df2, obs=obs2, var=var2) - tmp_mudata = MuData({'mod1': ad1, 'mod2': ad2}) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) return tmp_mudata + @pytest.fixture def sample_mudata(generate_h5mu, tmp_path): filename = f"{uuid.uuid4()}.h5mu" output_file = tmp_path / filename generate_h5mu.write(output_file) return output_file - + def 
test_intersect_obs(run_component, sample_mudata, tmp_path): output_path = tmp_path / f"{uuid.uuid4()}.h5mu" # run component - run_component([ - "--input", sample_mudata, - "--modalities", "mod1;mod2", - "--output", str(output_path), - "--output_compression", "gzip" - ]) + run_component( + [ + "--input", + sample_mudata, + "--modalities", + "mod1;mod2", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() output = read_h5mu(output_path) assert list(output.mod.keys()) == ["mod1", "mod2"] @@ -67,17 +79,24 @@ def test_intersect_obs_with_real(run_component, tmp_path): input.write(input_path) # run component - run_component([ - "--input", input_path, - "--modalities", "rna;prot", - "--output", str(output_path), - "--output_compression", "gzip" - ]) + run_component( + [ + "--input", + input_path, + "--modalities", + "rna;prot", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() output = read_h5mu(output_path) assert list(output.mod.keys()) == ["rna", "prot"] assert output.n_obs == 50 assert output.obs_names.tolist() == input.obs_names[range(50, 100)].tolist() + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/filter/remove_modality/script.py b/src/filter/remove_modality/script.py index f4c15be6da4..301266f38c9 100644 --- a/src/filter/remove_modality/script.py +++ b/src/filter/remove_modality/script.py @@ -3,17 +3,19 @@ ### VIASH START par = { - 'input': "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", - 'modality': ["rna"], - 'output': "foo.h5mu" + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": ["rna"], + "output": "foo.h5mu", } ### VIASH END -input_mudata = read_h5mu(par['input']) -new_mods = {mod_name: mod for mod_name, mod - in input_mudata.mod.items() - if mod_name not in par['modality']} +input_mudata = read_h5mu(par["input"]) +new_mods = { + mod_name: mod + for mod_name, mod in input_mudata.mod.items() + if mod_name not in par["modality"] +} new_mudata = MuData(new_mods) -new_mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) \ No newline at end of file +new_mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/filter/remove_modality/test.py b/src/filter/remove_modality/test.py index 6a11f94bd48..5c07e5999da 100644 --- a/src/filter/remove_modality/test.py +++ b/src/filter/remove_modality/test.py @@ -4,28 +4,37 @@ ## VIASH START meta = { - 'executable': './target/executable/filter/remove_modality/remove_modality', - 'resources_dir': './resources_test/', - 'cpus': 2 + "executable": "./target/executable/filter/remove_modality/remove_modality", + "resources_dir": "./resources_test/", + "cpus": 2, } ## VIASH END -input_sample_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +input_sample_file = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) def test_remove_component(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", input_sample_file, - "--modality", "rna", - "--output", str(output_path), - "--output_compression", "gzip" - ]) + run_component( + [ + "--input", + input_sample_file, + "--modality", + "rna", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) assert 
output_path.is_file() output = read_h5mu(output_path) assert list(output.mod.keys()) == ["prot"] + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/filter/subset_h5mu/script.py b/src/filter/subset_h5mu/script.py index 7b02262b1de..ccde6fdc49e 100644 --- a/src/filter/subset_h5mu/script.py +++ b/src/filter/subset_h5mu/script.py @@ -4,7 +4,7 @@ par = { "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", "output": "bar.h5mu", - "number_of_observations": 100 + "number_of_observations": 100, } ### VIASH END @@ -14,9 +14,11 @@ # subset data if par["modality"]: - data.mod[par["modality"]] = data.mod[par["modality"]][:par["number_of_observations"]] + data.mod[par["modality"]] = data.mod[par["modality"]][ + : par["number_of_observations"] + ] else: - data = data[:par["number_of_observations"]] + data = data[: par["number_of_observations"]] # write data - data.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file + data.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/filter/subset_h5mu/test.py b/src/filter/subset_h5mu/test.py index 845e7137ed6..1791b3a0a40 100644 --- a/src/filter/subset_h5mu/test.py +++ b/src/filter/subset_h5mu/test.py @@ -4,45 +4,61 @@ ## VIASH START meta = { - 'executable': './target/executable/filter/subset_h5mu/subset_h5mu', - 'resources_dir': 'resources_test/pbmc_1k_protein_v3/' + "executable": "./target/executable/filter/subset_h5mu/subset_h5mu", + "resources_dir": "resources_test/pbmc_1k_protein_v3/", } ## VIASH END -input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +input_path = ( + f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" +) + def test_filter_nothing(run_component, tmp_path): output_path = tmp_path / "output.h5mu" - + # run component - run_component([ - "--input", input_path, - "--output", str(output_path), - "--number_of_observations", "100", - "--output_compression", "gzip" - ]) - + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--number_of_observations", + "100", + "--output_compression", + "gzip", + ] + ) + assert output_path.is_file(), "Output file not found" # check output file mu_in = mu.read_h5mu(input_path) mu_out = mu.read_h5mu(output_path) - orig_vars = mu_in.mod['rna'].n_vars - orig_prot_obs = mu_in.mod['prot'].n_obs - orig_prot_vars = mu_in.mod['prot'].n_vars + orig_vars = mu_in.mod["rna"].n_vars + orig_prot_obs = mu_in.mod["prot"].n_obs + orig_prot_vars = mu_in.mod["prot"].n_vars + + new_obs = mu_out.mod["rna"].n_obs + new_vars = mu_out.mod["rna"].n_vars - new_obs = mu_out.mod['rna'].n_obs - new_vars = mu_out.mod['rna'].n_vars - assert new_obs == 100, "Output should only contain 100 observations" assert new_vars == orig_vars, "No RNA vars should have been filtered" - assert mu_out.mod['prot'].n_obs == orig_prot_obs, "No prot obs should have been filtered" - assert mu_out.mod['prot'].n_vars == orig_prot_vars, "No prot vars should have been filtered" - assert list(mu_out.mod['rna'].var['feature_types'].cat.categories) == ["Gene Expression"], \ - "Feature types of RNA modality should be Gene Expression" - assert list(mu_out.mod['prot'].var['feature_types'].cat.categories) == ["Antibody Capture"], \ - "Feature types of prot modality should be Antibody Capture" + assert ( + mu_out.mod["prot"].n_obs == orig_prot_obs + ), "No prot obs should have been filtered" + assert ( + mu_out.mod["prot"].n_vars == 
orig_prot_vars + ), "No prot vars should have been filtered" + assert list(mu_out.mod["rna"].var["feature_types"].cat.categories) == [ + "Gene Expression" + ], "Feature types of RNA modality should be Gene Expression" + assert list(mu_out.mod["prot"].var["feature_types"].cat.categories) == [ + "Antibody Capture" + ], "Feature types of prot modality should be Antibody Capture" + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/filter/subset_obsp/script.py b/src/filter/subset_obsp/script.py index ccd21978779..24ce002f586 100644 --- a/src/filter/subset_obsp/script.py +++ b/src/filter/subset_obsp/script.py @@ -3,28 +3,32 @@ ### VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', - 'modality': 'rna', - 'input_obsp_key': 'distances', - 'input_obs_key': 'leiden', - 'input_obs_value': '1', - 'output_obsm_key': "leiden_1", - 'output': 'subset_obsp_output.h5mu', - 'output_compression': None, + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "input_obsp_key": "distances", + "input_obs_key": "leiden", + "input_obs_value": "1", + "output_obsm_key": "leiden_1", + "output": "subset_obsp_output.h5mu", + "output_compression": None, } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def main(): logger.info(f"Reading {par['input']}") mdata = mu.read_h5mu(par["input"]) adata = mdata.mod[par["modality"]] - logger.info(f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}") + logger.info( + f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}" + ) # .obsp, .obs and .obsm index and .obsp columns all have a dimension length of `n_obs` - # the index dimensions remain unaltered, but .obsp columns will be subset + # the index dimensions remain unaltered, but .obsp columns will be subset obsp = adata.obsp[par["input_obsp_key"]] idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"] obsm_subset = obsp[:, idx] @@ -36,5 +40,5 @@ def main(): mdata.write_h5mu(par["output"], compression=par["output_compression"]) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/filter/subset_obsp/test.py b/src/filter/subset_obsp/test.py index ce72563275d..da35fd6523f 100644 --- a/src/filter/subset_obsp/test.py +++ b/src/filter/subset_obsp/test.py @@ -3,9 +3,7 @@ import mudata as mu ## VIASH START -meta = { - 'resources_dir': 'resources_test/pbmc_1k_protein_v3/' -} +meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3/"} ## VIASH END @@ -26,14 +24,22 @@ def test_subset_obsp(input_path, run_component, tmp_path): output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", input_path, - "--output", str(output_path), - "--input_obsp_key", "distances", - "--input_obs_key", "filter_column", - "--input_obs_value", "group_1", - "--output_obsm_key", "group_1" - ]) + run_component( + [ + "--input", + input_path, + "--output", + str(output_path), + "--input_obsp_key", + "distances", + "--input_obs_key", + "filter_column", + "--input_obs_value", + "group_1", + "--output_obsm_key", + "group_1", + ] + ) assert output_path.is_file(), "Output file not found" @@ -41,7 +47,9 @@ def test_subset_obsp(input_path, run_component, tmp_path): mu_out = mu.read_h5mu(output_path) assert "group_1" in mu_out.mod["rna"].obsm, "Output should contain group_1 in 
.obsm" - assert mu_out.mod["rna"].obsm["group_1"].shape[1] == 50, "Obsm should only contain a subset of the original obsp matrix" + assert ( + mu_out.mod["rna"].obsm["group_1"].shape[1] == 50 + ), "Obsm should only contain a subset of the original obsp matrix" if __name__ == "__main__": diff --git a/src/genetic_demux/demuxlet/config.vsh.yaml b/src/genetic_demux/demuxlet/config.vsh.yaml index 623590d36c1..6068446b12c 100644 --- a/src/genetic_demux/demuxlet/config.vsh.yaml +++ b/src/genetic_demux/demuxlet/config.vsh.yaml @@ -124,6 +124,7 @@ argument_groups: resources: - type: r_script path: script.R + - path: demuxlet.patch test_resources: - type: bash_script path: test.sh @@ -131,14 +132,16 @@ test_resources: engines: - type: docker - image: ubuntu:20.04 + image: ubuntu:22.04 setup: + - type: docker + copy: ["demuxlet.patch /opt/demuxlet.patch"] - type: apt packages: [ autoconf, wget, git, build-essential, libcurl4-openssl-dev, cmake, libbz2-dev, libssl-dev, liblzma-dev, zlib1g-dev, r-base] - type: docker run: git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install - type: docker - run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin + run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/demuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin - type: r cran: [ readr, processx, dplyr ] diff --git a/src/genetic_demux/demuxlet/demuxlet.patch b/src/genetic_demux/demuxlet/demuxlet.patch new file mode 100644 index 00000000000..29cc67c9796 --- /dev/null +++ b/src/genetic_demux/demuxlet/demuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/src/genetic_demux/freemuxlet/config.vsh.yaml b/src/genetic_demux/freemuxlet/config.vsh.yaml index b029265b252..ea25336081a 100644 --- a/src/genetic_demux/freemuxlet/config.vsh.yaml +++ b/src/genetic_demux/freemuxlet/config.vsh.yaml @@ -95,6 +95,7 @@ argument_groups: resources: - type: r_script path: script.R + - path: freemuxlet.patch test_resources: - type: bash_script path: test.sh @@ -102,14 +103,16 @@ test_resources: engines: - type: docker - image: ubuntu:20.04 + image: ubuntu:22.04 setup: + - type: docker + copy: ["freemuxlet.patch /opt/freemuxlet.patch"] - type: apt packages: [ autoconf, wget, git, build-essential, libcurl4-openssl-dev, cmake, libbz2-dev, libssl-dev, liblzma-dev, zlib1g-dev, r-base] - type: docker run: git clone https://github.com/samtools/htslib.git /tmp/htslib && cd /tmp/htslib && git submodule update --init --recursive && autoreconf -i && ./configure --prefix=/usr/local/ && make && make install - type: docker - run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. 
&& make && cp /tmp/popscle/bin/popscle /usr/local/bin + run: git clone --depth 1 https://github.com/statgen/popscle.git /tmp/popscle && cd /tmp/popscle && git apply /opt/freemuxlet.patch && mkdir -p /tmp/popscle/build && cd /tmp/popscle/build && cmake .. && make && cp /tmp/popscle/bin/popscle /usr/local/bin - type: r cran: [ readr, processx, dplyr ] diff --git a/src/genetic_demux/freemuxlet/freemuxlet.patch b/src/genetic_demux/freemuxlet/freemuxlet.patch new file mode 100644 index 00000000000..29cc67c9796 --- /dev/null +++ b/src/genetic_demux/freemuxlet/freemuxlet.patch @@ -0,0 +1,12 @@ +diff --git a/gtf_interval_tree.h b/gtf_interval_tree.h +index 99221a8..56ce1f6 100644 +--- a/gtf_interval_tree.h ++++ b/gtf_interval_tree.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + template + class gtfInterval { diff --git a/src/integrate/harmonypy/script.py b/src/integrate/harmonypy/script.py index 791264fa685..77d7e4f36d4 100644 --- a/src/integrate/harmonypy/script.py +++ b/src/integrate/harmonypy/script.py @@ -18,13 +18,14 @@ def main(): mdata = mudata.read(par["input"].strip()) - mod_name = par['modality'] + mod_name = par["modality"] mod = mdata.mod[mod_name] - pca_embedding = mod.obsm[par['obsm_input']] + pca_embedding = mod.obsm[par["obsm_input"]] metadata = mod.obs - ho = run_harmony(pca_embedding, metadata, par['obs_covariates'], theta=par['theta']) + ho = run_harmony(pca_embedding, metadata, par["obs_covariates"], theta=par["theta"]) mod.obsm[par["obsm_output"]] = ho.Z_corr.T - mdata.write_h5mu(par['output'].strip(), compression=par["output_compression"]) + mdata.write_h5mu(par["output"].strip(), compression=par["output_compression"]) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/integrate/harmonypy/test.py b/src/integrate/harmonypy/test.py index c70ee4808de..deac4d9ebc5 100644 --- a/src/integrate/harmonypy/test.py +++ b/src/integrate/harmonypy/test.py @@ -5,34 +5,53 @@ ## VIASH START meta = { - 'executable': './target/executable/integrate/harmonypy/harmonypy', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/' + "executable": "./target/executable/integrate/harmonypy/harmonypy", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + def test_harmonypy(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", input_file, - "--modality", "rna", - "--obsm_input", "X_pca", - "--obsm_output", "X_pca_int", - "--obs_covariates", "harmony_integration_leiden_1.0", - "--output", str(output_path), - "--output_compression", "gzip"]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--obsm_input", + "X_pca", + "--obsm_output", + "X_pca_int", + "--obs_covariates", + "harmony_integration_leiden_1.0", + "--output", + str(output_path), + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() # check output input_data = mudata.read_h5mu(input_file) output_data = mudata.read_h5mu(output_path) - np.testing.assert_array_equal(output_data.mod['rna'].X.data, input_data.mod['rna'].X.data) - np.testing.assert_array_equal(input_data.mod['rna'].obsm['X_pca'], output_data.mod['rna'].obsm['X_pca']) - assert 'X_pca_int' in output_data.mod['rna'].obsm - assert output_data.mod['rna'].obsm['X_pca_int'].shape == input_data.mod['rna'].obsm['X_pca'].shape + np.testing.assert_array_equal( + output_data.mod["rna"].X.data, input_data.mod["rna"].X.data + ) + 
np.testing.assert_array_equal( + input_data.mod["rna"].obsm["X_pca"], output_data.mod["rna"].obsm["X_pca"] + ) + assert "X_pca_int" in output_data.mod["rna"].obsm + assert ( + output_data.mod["rna"].obsm["X_pca_int"].shape + == input_data.mod["rna"].obsm["X_pca"].shape + ) + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scanorama/script.py b/src/integrate/scanorama/script.py index c7a91221990..64f004aa573 100644 --- a/src/integrate/scanorama/script.py +++ b/src/integrate/scanorama/script.py @@ -1,6 +1,5 @@ ### VIASH START -par = { -} +par = {} ### VIASH END from scanpy.external.pp import scanorama_integrate @@ -12,14 +11,16 @@ mod = mdata.mod[mod_name] # Integration. -scanorama_integrate(mod, - key=par["obs_batch"], - basis=par["obsm_input"], - adjusted_basis=par["obsm_output"], - knn=par["knn"], - alpha=par["alpha"], - sigma=par["sigma"], - approx=par["approx"], - batch_size=par["batch_size"] ) +scanorama_integrate( + mod, + key=par["obs_batch"], + basis=par["obsm_input"], + adjusted_basis=par["obsm_output"], + knn=par["knn"], + alpha=par["alpha"], + sigma=par["sigma"], + approx=par["approx"], + batch_size=par["batch_size"], +) -mdata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/integrate/scanorama/test.py b/src/integrate/scanorama/test.py index 2a89b596ad3..c639d99355a 100644 --- a/src/integrate/scanorama/test.py +++ b/src/integrate/scanorama/test.py @@ -4,60 +4,80 @@ ## VIASH START meta = { - 'executable': './target/docker/integrate/scanorama/scanorama', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/' + "executable": "./target/docker/integrate/scanorama/scanorama", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture def input_with_batch(tmp_path): tmp_input_path = tmp_path / "input.h5mu" input_data = read_h5mu(input_file) - mod = input_data.mod['rna'] + mod = input_data.mod["rna"] number_of_obs = mod.n_obs - mod.obs['batch'] = 'A' - column_index = mod.obs.columns.get_indexer(['batch']) - mod.obs.iloc[slice(number_of_obs//2, None), column_index] = 'B' + mod.obs["batch"] = "A" + column_index = mod.obs.columns.get_indexer(["batch"]) + mod.obs.iloc[slice(number_of_obs // 2, None), column_index] = "B" input_data.write(tmp_input_path) return tmp_input_path, input_data + def test_simple_integration(run_component, input_with_batch, tmp_path): tmp_input_path, _ = input_with_batch output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", str(tmp_input_path), - "--output", str(output_path), - "--obs_batch", "batch", - "--obsm_input", "X_pca", - "--output_compression", "gzip"]) + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obs_batch", + "batch", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() # check output data = read_h5mu(output_path) - assert "X_scanorama" in data.mod['rna'].obsm + assert "X_scanorama" in data.mod["rna"].obsm + def test_obsm_output(run_component, input_with_batch, tmp_path): tmp_input_path, _ = input_with_batch output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", str(tmp_input_path), - "--output", str(output_path), - "--obsm_output", "X_test", - "--obs_batch", 
"batch", - "--obsm_input", "X_pca"]) + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obsm_output", + "X_test", + "--obs_batch", + "batch", + "--obsm_input", + "X_pca", + ] + ) assert output_path.is_file() # check output data = read_h5mu(output_path) - assert "X_test" in data.mod['rna'].obsm + assert "X_test" in data.mod["rna"].obsm + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scarches/script.py b/src/integrate/scarches/script.py index e6544250429..adca4c2b5be 100644 --- a/src/integrate/scarches/script.py +++ b/src/integrate/scarches/script.py @@ -2,6 +2,7 @@ import mudata import scvi from torch.cuda import is_available as cuda_is_available + try: from torch.backends.mps import is_available as mps_is_available except ModuleNotFoundError: @@ -10,6 +11,7 @@ def mps_is_available(): return False + ### VIASH START par = { "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", @@ -24,13 +26,16 @@ def mps_is_available(): "early_stopping_monitor": "elbo_validation", "early_stopping_patience": 45, "early_stopping_min_delta": 0, - "max_epochs": 500} + "max_epochs": 500, +} ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def _read_model_name_from_registry(model_path) -> str: """Read registry with information about the model, return the model name""" registry = scvi.model.base.BaseModelClass.load_registry(model_path) @@ -53,13 +58,13 @@ def _detect_base_model(model_path): "AmortizedLDA": scvi.model.AmortizedLDA, "JaxSCVI": scvi.model.JaxSCVI, } - + return names_to_models_map[_read_model_name_from_registry(model_path)] def extract_file_name(file_path): """Return the name of the file from path to this file - + Examples -------- >>> extract_file_name("resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu") @@ -68,7 +73,7 @@ def extract_file_name(file_path): slash_position = file_path.rfind("/") dot_position = file_path.rfind(".") - return file_path[slash_position + 1: dot_position] + return file_path[slash_position + 1 : dot_position] def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1): @@ -78,7 +83,7 @@ def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1 Input: * adata_query: An AnnData object with the query * model_path: The reference model directory - + Output: * vae_query: the trained scvi_tools model * adata_query: The AnnData object with the query preprocessed for the mapping to the reference @@ -88,26 +93,24 @@ def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1 try: model.prepare_query_anndata(adata_query, model_path) except ValueError: - logger.warning("ValueError thrown when preparing adata for mapping. Clearing .varm field to prevent it") + logger.warning( + "ValueError thrown when preparing adata for mapping. 
Clearing .varm field to prevent it" + ) adata_query.varm.clear() model.prepare_query_anndata(adata_query, model_path) # Load query data into the model - vae_query = model.load_query_data( - adata_query, - model_path, - freeze_dropout=True - ) + vae_query = model.load_query_data(adata_query, model_path, freeze_dropout=True) # Train scArches model for query mapping vae_query.train( - max_epochs=par["max_epochs"], - early_stopping=par['early_stopping'], - early_stopping_monitor=par['early_stopping_monitor'], - early_stopping_patience=par['early_stopping_patience'], - early_stopping_min_delta=par['early_stopping_min_delta'], - check_val_every_n_epoch=check_val_every_n_epoch, - use_gpu=(cuda_is_available() or mps_is_available()) + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], + check_val_every_n_epoch=check_val_every_n_epoch, + use_gpu=(cuda_is_available() or mps_is_available()), ) return vae_query, adata_query @@ -115,12 +118,13 @@ def map_to_existing_reference(adata_query, model_path, check_val_every_n_epoch=1 def _convert_object_dtypes_to_strings(adata): """Convert object dtypes in .var and .obs to string to prevent error when saving file""" + def convert_cols(df): object_cols = df.columns[df.dtypes == "object"] for col in object_cols: - df[col] = df[col].astype(str) - return df - + df[col] = df[col].astype(str) + return df + adata.var = convert_cols(adata.var) adata.obs = convert_cols(adata.obs) @@ -129,12 +133,12 @@ def convert_cols(df): def _get_model_path(model_path: str): """Obtain path to the directory with reference model. If the proposed `model_path` is a .zip archive, unzip it. If necessary, convert model to the new format - + Parameters ---------- model_path : str Path to a directory, where to search for the model or to a zip file containing the model - + Returns ------- Path to a directory with reference model in format of scvi-tools>=0.15 @@ -147,7 +151,7 @@ def _get_model_path(model_path: str): if os.path.isdir(model_path) and "model.pt" in os.listdir(model_path): # Probably, the `model_path` already contains model in the output format of scvi-tools>=0.15 return model_path - + # The model either has old format or is a zip file downloaded from Zenodo new_directory = Path(tempfile.TemporaryDirectory().name) @@ -170,13 +174,14 @@ def _get_model_path(model_path: str): elif "model.pt" in os.listdir(model_dir): # Archive contained model in the new format, so just return the directory return model_dir - + else: - raise ValueError("Cannot find model in the provided reference path. Please, provide a path or a link to the directory with reference model. For HLCA use https://zenodo.org/record/6337966/files/HLCA_reference_model.zip") + raise ValueError( + "Cannot find model in the provided reference path. Please provide a path or a link to the directory with reference model.
For HLCA use https://zenodo.org/record/6337966/files/HLCA_reference_model.zip" + ) def main(): - mdata_query = mudata.read(par["input"].strip()) adata_query = mdata_query.mod[par["modality"]].copy() @@ -190,7 +195,9 @@ def main(): adata_query.obs["dataset"] = par["dataset_name"] model_path = _get_model_path(par["reference"]) - vae_query, adata_query = map_to_existing_reference(adata_query, model_path=model_path) + vae_query, adata_query = map_to_existing_reference( + adata_query, model_path=model_path + ) model_name = _read_model_name_from_registry(model_path) # Save info about the used model @@ -198,10 +205,14 @@ def main(): logger.info("Trying to write latent representation") output_key = par["obsm_output"].format(model_name=model_name) - mdata_query.mod[par["modality"]].obsm[output_key] = vae_query.get_latent_representation() + mdata_query.mod[par["modality"]].obsm[output_key] = ( + vae_query.get_latent_representation() + ) logger.info("Converting dtypes") - mdata_query.mod[par["modality"]] = _convert_object_dtypes_to_strings(mdata_query.mod[par["modality"]]) + mdata_query.mod[par["modality"]] = _convert_object_dtypes_to_strings( + mdata_query.mod[par["modality"]] + ) logger.info("Updating mudata") try: @@ -209,7 +220,9 @@ def main(): except KeyError: # Sometimes this error is thrown, but then everything is magically fixed, and the file gets saved normally # This is discussed here a bit: https://github.com/scverse/mudata/issues/27 - logger.warning("KeyError was thrown during updating mudata. Probably, the file is fixed after that, but be careful") + logger.warning( + "KeyError was thrown during updating mudata. Probably, the file is fixed after that, but be careful" + ) logger.info("Saving h5mu file") mdata_query.write_h5mu(par["output"].strip(), compression=par["output_compression"]) @@ -217,5 +230,6 @@ def main(): logger.info("Saving model") vae_query.save(par["model_output"], overwrite=True) + if __name__ == "__main__": main() diff --git a/src/integrate/scarches/test.py b/src/integrate/scarches/test.py index 74a5a5958e4..a3a582cca93 100644 --- a/src/integrate/scarches/test.py +++ b/src/integrate/scarches/test.py @@ -4,24 +4,25 @@ ## VIASH START meta = { - 'executable': './target/executable/integrate/scarches/scarches', - 'resources_dir': './resources_test/' + "executable": "./target/executable/integrate/scarches/scarches", + "resources_dir": "./resources_test/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" reference = f"{meta['resources_dir']}/HLCA_reference_model.zip" + @pytest.fixture def input_with_batch(tmp_path): tmp_input_path = tmp_path / "input.h5mu" input_data = mudata.read_h5mu(input_file) - mod = input_data.mod['rna'] + mod = input_data.mod["rna"] number_of_obs = mod.n_obs - mod.obs['batch'] = 'A' - column_index = mod.obs.columns.get_indexer(['batch']) - mod.obs.iloc[slice(number_of_obs//2, None), column_index] = 'B' + mod.obs["batch"] = "A" + column_index = mod.obs.columns.get_indexer(["batch"]) + mod.obs.iloc[slice(number_of_obs // 2, None), column_index] = "B" input_data.write(tmp_input_path) return tmp_input_path, input_data @@ -33,21 +34,32 @@ def test_hlca_reference_model(run_component, input_with_batch, tmp_path): output_model_path = tmp_path / "model_output" # run component - run_component([ - "--input", str(tmp_input_path), - "--reference", reference, - "--modality", "rna", - "--output", str(output_path), - "--model_output", str(output_model_path), - "--max_epochs", "1", - "--output_compression", "gzip"]) + run_component( + [ 
+ "--input", + str(tmp_input_path), + "--reference", + reference, + "--modality", + "rna", + "--output", + str(output_path), + "--model_output", + str(output_model_path), + "--max_epochs", + "1", + "--output_compression", + "gzip", + ] + ) assert output_path.is_file() # check output output_data = mudata.read_h5mu(output_path) - assert 'X_integrated_scanvi' in output_data.mod['rna'].obsm + assert "X_integrated_scanvi" in output_data.mod["rna"].obsm assert output_data["rna"].uns["integration_method"] == "SCANVI" assert (output_model_path / "model.pt").is_file() + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/scvi/script.py b/src/integrate/scvi/script.py index 6b10b3518f5..d4ae8e56c5c 100644 --- a/src/integrate/scvi/script.py +++ b/src/integrate/scvi/script.py @@ -23,62 +23,65 @@ "n_var_min_count": 10, "output_model": "test/", "output_compression": "gzip", - } - -meta = { - "resources_dir": 'src/integrate/scvi' } + +meta = {"resources_dir": "src/integrate/scvi"} ### VIASH END import sys -sys.path.append(meta['resources_dir']) + +sys.path.append(meta["resources_dir"]) from subset_vars import subset_vars -#TODO: optionally, move to qa + +# TODO: optionally, move to qa # https://github.com/openpipelines-bio/openpipeline/issues/435 -def check_validity_anndata(adata, layer, obs_batch, - n_obs_min_count, n_var_min_count): +def check_validity_anndata(adata, layer, obs_batch, n_obs_min_count, n_var_min_count): assert check_nonnegative_integers( adata.layers[layer] if layer else adata.X - ), f"Make sure input adata contains raw_counts" + ), "Make sure input adata contains raw_counts" assert len(set(adata.var_names)) == len( adata.var_names - ), f"Dataset contains multiple genes with same gene name." + ), "Dataset contains multiple genes with same gene name." # Ensure every obs_batch category has sufficient observations - assert min(adata.obs[[obs_batch]].value_counts()) > n_obs_min_count, \ - f"Anndata has fewer than {n_obs_min_count} cells." - - assert adata.n_vars > n_var_min_count, \ - f"Anndata has fewer than {n_var_min_count} genes." + assert ( + min(adata.obs[[obs_batch]].value_counts()) > n_obs_min_count + ), f"Anndata has fewer than {n_obs_min_count} cells." + assert ( + adata.n_vars > n_var_min_count + ), f"Anndata has fewer than {n_var_min_count} genes." 
def main(): mdata = mudata.read(par["input"].strip()) - adata = mdata.mod[par['modality']] + adata = mdata.mod[par["modality"]] - if par['var_input']: + if par["var_input"]: # Subset to HVG adata_subset = subset_vars(adata, subset_col=par["var_input"]).copy() else: adata_subset = adata.copy() check_validity_anndata( - adata_subset, par['input_layer'], par['obs_batch'], - par["n_obs_min_count"], par["n_var_min_count"] + adata_subset, + par["input_layer"], + par["obs_batch"], + par["n_obs_min_count"], + par["n_var_min_count"], ) # Set up the data scvi.model.SCVI.setup_anndata( adata_subset, - batch_key=par['obs_batch'], - layer=par['input_layer'], - labels_key=par['obs_labels'], - size_factor_key=par['obs_size_factor'], - categorical_covariate_keys=par['obs_categorical_covariate'], - continuous_covariate_keys=par['obs_continuous_covariate'], + batch_key=par["obs_batch"], + layer=par["input_layer"], + labels_key=par["obs_labels"], + size_factor_key=par["obs_size_factor"], + categorical_covariate_keys=par["obs_categorical_covariate"], + continuous_covariate_keys=par["obs_continuous_covariate"], ) # Set up the model @@ -92,25 +95,30 @@ def main(): gene_likelihood=par["gene_likelihood"], use_layer_norm=par["use_layer_normalization"], use_batch_norm=par["use_batch_normalization"], - encode_covariates=par["encode_covariates"], # Default (True) is for better scArches performance -> maybe don't use this always? - deeply_inject_covariates=par["deeply_inject_covariates"], # Default (False) for better scArches performance -> maybe don't use this always? - use_observed_lib_size=par["use_observed_lib_size"], # When size_factors are not passed + encode_covariates=par[ + "encode_covariates" + ], # Default (True) is for better scArches performance -> maybe don't use this always? + deeply_inject_covariates=par[ + "deeply_inject_covariates" + ], # Default (False) for better scArches performance -> maybe don't use this always? 
+ use_observed_lib_size=par[ + "use_observed_lib_size" + ], # When size_factors are not passed ) plan_kwargs = { - "reduce_lr_on_plateau": par['reduce_lr_on_plateau'], - "lr_patience": par['lr_patience'], - "lr_factor": par['lr_factor'], + "reduce_lr_on_plateau": par["reduce_lr_on_plateau"], + "lr_patience": par["lr_patience"], + "lr_factor": par["lr_factor"], } - # Train the model vae_uns.train( - max_epochs=par['max_epochs'], - early_stopping=par['early_stopping'], - early_stopping_monitor=par['early_stopping_monitor'], - early_stopping_patience=par['early_stopping_patience'], - early_stopping_min_delta=par['early_stopping_min_delta'], + max_epochs=par["max_epochs"], + early_stopping=par["early_stopping"], + early_stopping_monitor=par["early_stopping_monitor"], + early_stopping_patience=par["early_stopping_patience"], + early_stopping_min_delta=par["early_stopping_min_delta"], plan_kwargs=plan_kwargs, check_val_every_n_epoch=1, accelerator="auto", @@ -118,12 +126,13 @@ def main(): # Note: train_size=1.0 should give better results, but then can't do early_stopping on validation set # Get the latent output - adata.obsm[par['obsm_output']] = vae_uns.get_latent_representation() + adata.obsm[par["obsm_output"]] = vae_uns.get_latent_representation() - mdata.mod[par['modality']] = adata - mdata.write_h5mu(par['output'].strip(), compression=par["output_compression"]) + mdata.mod[par["modality"]] = adata + mdata.write_h5mu(par["output"].strip(), compression=par["output_compression"]) if par["output_model"]: vae_uns.save(par["output_model"], overwrite=True) + if __name__ == "__main__": main() diff --git a/src/integrate/scvi/test.py b/src/integrate/scvi/test.py index 7fa7c1251e3..cdf1bce759f 100644 --- a/src/integrate/scvi/test.py +++ b/src/integrate/scvi/test.py @@ -1,53 +1,70 @@ import pytest from pathlib import Path -from tempfile import NamedTemporaryFile import mudata from anndata.tests.helpers import assert_equal ## VIASH START meta = { - 'executable': './target/executable/integrate/scvi/scvi', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/' + "executable": "./target/executable/integrate/scvi/scvi", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END import sys -sys.path.append(meta['resources_dir']) + +sys.path.append(meta["resources_dir"]) from subset_vars import subset_vars input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture def mudata_with_mod_rna_obs_batch(tmp_path, request): obs_batch, var_input, obsm_output = request.param new_input_file = tmp_path / "input.h5mu" - + input_data = mudata.read_h5mu(input_file) - input_rna = input_data.mod['rna'] - input_rna.obs[obs_batch] = 'A' + input_rna = input_data.mod["rna"] + input_rna.obs[obs_batch] = "A" column_index = input_rna.obs.columns.get_indexer([obs_batch]) - input_rna.obs.iloc[slice(input_rna.n_obs//2, None), column_index] = 'B' + input_rna.obs.iloc[slice(input_rna.n_obs // 2, None), column_index] = "B" input_data.write(new_input_file.name) return new_input_file.name, input_rna, obs_batch, var_input, obsm_output -@pytest.mark.parametrize("mudata_with_mod_rna_obs_batch", [("batch", None, None), ("batch2", "filter_with_hvg", "X_int")], indirect=True) + +@pytest.mark.parametrize( + "mudata_with_mod_rna_obs_batch", + [("batch", None, None), ("batch2", "filter_with_hvg", "X_int")], + indirect=True, +) def test_scvi(run_component, mudata_with_mod_rna_obs_batch): - new_input_file, input_rna, obs_batch, var_input, obsm_output = mudata_with_mod_rna_obs_batch + new_input_file, 
input_rna, obs_batch, var_input, obsm_output = ( + mudata_with_mod_rna_obs_batch + ) args = [ - "--input", new_input_file, - "--modality", "rna", - "--obs_batch", obs_batch, - "--output", "output.h5mu", - "--output_model", "test/", - "--max_epochs", "1", - "--n_obs_min_count", "10", - "--n_var_min_count", "10", - "--output_compression", "gzip" + "--input", + new_input_file, + "--modality", + "rna", + "--obs_batch", + obs_batch, + "--output", + "output.h5mu", + "--output_model", + "test/", + "--max_epochs", + "1", + "--n_obs_min_count", + "10", + "--n_var_min_count", + "10", + "--output_compression", + "gzip", ] if var_input is not None: @@ -56,7 +73,7 @@ def test_scvi(run_component, mudata_with_mod_rna_obs_batch): args.extend(["--obsm_output", obsm_output]) run_component(args) - + # check files assert Path("output.h5mu").is_file(), "Output file does not exist" assert Path("test").is_dir() @@ -64,17 +81,24 @@ def test_scvi(run_component, mudata_with_mod_rna_obs_batch): # check output h5mu output_data = mudata.read_h5mu("output.h5mu") - output_rna = output_data.mod['rna'] - assert output_rna.n_obs == input_rna.n_obs, f"Number of observations changed\noutput_data: {output_data}" - assert output_rna.n_vars == input_rna.n_vars, f"Number of variables changed\noutput_data: {output_data}" + output_rna = output_data.mod["rna"] + assert ( + output_rna.n_obs == input_rna.n_obs + ), f"Number of observations changed\noutput_data: {output_data}" + assert ( + output_rna.n_vars == input_rna.n_vars + ), f"Number of variables changed\noutput_data: {output_data}" expected_obsm_output = "X_scvi_integrated" if obsm_output is None else obsm_output - assert expected_obsm_output in output_rna.obsm, f".obsm['{expected_obsm_output}'] not added\noutput_data: {output_data}" + assert ( + expected_obsm_output in output_rna.obsm + ), f".obsm['{expected_obsm_output}'] not added\noutput_data: {output_data}" # assert that nothing else has changed del output_rna.obsm[expected_obsm_output] assert_equal(input_rna, output_rna) + def test_hvg_subsetting_helper(): input_data = mudata.read_h5mu(input_file) adata = input_data.mod["rna"] @@ -82,7 +106,9 @@ def test_hvg_subsetting_helper(): old_n_genes = adata.n_vars adata.var["highly_variable_features"] = False - adata.var.iloc[:old_n_genes // 2, adata.var.columns.get_indexer(["highly_variable_features"])] = True + adata.var.iloc[ + : old_n_genes // 2, adata.var.columns.get_indexer(["highly_variable_features"]) + ] = True adata = subset_vars(adata, subset_col="highly_variable_features") @@ -90,6 +116,7 @@ def test_hvg_subsetting_helper(): assert adata.n_vars == old_n_genes // 2 # Only HVG are subsetted assert adata.var["highly_variable_features"].all() - -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/integrate/totalvi/script.py b/src/integrate/totalvi/script.py index 5ba6507b54b..94c3d3dc782 100644 --- a/src/integrate/totalvi/script.py +++ b/src/integrate/totalvi/script.py @@ -8,7 +8,7 @@ import numpy as np import scvi from scipy.sparse import issparse - + ### VIASH START par = { @@ -30,21 +30,31 @@ "query_model_path": "totalvi_model_query/", "max_epochs": 1, "max_query_epochs": 1, - "weight_decay": 0.0 + "weight_decay": 0.0, } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() -def align_proteins_names(adata_reference: AnnData, mdata_query: MuData, adata_query: AnnData, 
reference_proteins_key: str, query_proteins_key: str) -> AnnData: + +def align_proteins_names( + adata_reference: AnnData, + mdata_query: MuData, + adata_query: AnnData, + reference_proteins_key: str, + query_proteins_key: str, +) -> AnnData: """Make sure that proteins are located in the same .obsm slot in reference and query. Pad query proteins with zeros if they are absent""" proteins_reference = adata_reference.obsm[reference_proteins_key] - # If query has no protein data, put matrix of zeros + # If query has no protein data, put matrix of zeros if not query_proteins_key or query_proteins_key not in mdata_query.mod: - adata_query.obsm[reference_proteins_key] = np.zeros((adata_query.n_obs, proteins_reference.shape[1])) + adata_query.obsm[reference_proteins_key] = np.zeros( + (adata_query.n_obs, proteins_reference.shape[1]) + ) else: # Make sure that proteins expression has the same key in query and reference adata_query.obsm[reference_proteins_key] = adata_query.obsm[query_proteins_key] @@ -52,7 +62,9 @@ def align_proteins_names(adata_reference: AnnData, mdata_query: MuData, adata_qu return adata_query -def extract_proteins_to_anndata(mdata: MuData, rna_modality_key, protein_modality_key, input_layer, hvg_var_key=None) -> AnnData: +def extract_proteins_to_anndata( + mdata: MuData, rna_modality_key, protein_modality_key, input_layer, hvg_var_key=None +) -> AnnData: """TOTALVI requires data to be stored in AnnData format with protein counts in .obsm slot. This function performs the conversion""" adata: AnnData = mdata.mod[rna_modality_key].copy() @@ -77,90 +89,123 @@ def extract_proteins_to_anndata(mdata: MuData, rna_modality_key, protein_modalit return adata -def build_reference_model(adata_reference: AnnData, max_train_epochs: int = 400) -> scvi.model.TOTALVI: - - vae_reference = scvi.model.TOTALVI(adata_reference, use_layer_norm="both", use_batch_norm="none") +def build_reference_model( + adata_reference: AnnData, max_train_epochs: int = 400 +) -> scvi.model.TOTALVI: + vae_reference = scvi.model.TOTALVI( + adata_reference, use_layer_norm="both", use_batch_norm="none" + ) vae_reference.train(max_train_epochs) vae_reference.save(par["reference_model_path"]) return vae_reference + def is_retraining_model() -> bool: """Decide, whether reference model should be trained. 
It happens when no model exists or force_retrain flag is on""" - - trained_model_exists = os.path.isdir(par["reference_model_path"]) and ("model.pt" in os.listdir(par["reference_model_path"])) + + trained_model_exists = os.path.isdir(par["reference_model_path"]) and ( + "model.pt" in os.listdir(par["reference_model_path"]) + ) return not trained_model_exists or par["force_retrain"] -def map_query_to_reference(mdata_reference: MuData, mdata_query: MuData, adata_query: AnnData) -> Tuple[scvi.model.TOTALVI, AnnData]: +def map_query_to_reference( + mdata_reference: MuData, mdata_query: MuData, adata_query: AnnData +) -> Tuple[scvi.model.TOTALVI, AnnData]: """Build model on the provided reference if necessary, and map query to the reference""" - adata_reference: AnnData = extract_proteins_to_anndata(mdata_reference, rna_modality_key=par["reference_modality"], protein_modality_key=par["reference_proteins_modality"], - input_layer=par["input_layer"], hvg_var_key=par["var_input"]) + adata_reference: AnnData = extract_proteins_to_anndata( + mdata_reference, + rna_modality_key=par["reference_modality"], + protein_modality_key=par["reference_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) scvi.model.TOTALVI.setup_anndata( adata_reference, batch_key=par["obs_batch"], - protein_expression_obsm_key=par["reference_proteins_modality"] + protein_expression_obsm_key=par["reference_proteins_modality"], ) if is_retraining_model(): - vae_reference = build_reference_model(adata_reference, max_train_epochs=par["max_epochs"]) + vae_reference = build_reference_model( + adata_reference, max_train_epochs=par["max_epochs"] + ) else: - vae_reference = scvi.model.TOTALVI.load(dir_path=par["reference_model_path"], adata=adata_reference) + vae_reference = scvi.model.TOTALVI.load( + dir_path=par["reference_model_path"], adata=adata_reference + ) - adata_query: AnnData = align_proteins_names(adata_reference, mdata_query, adata_query, reference_proteins_key=par["reference_proteins_modality"], - query_proteins_key=par["query_proteins_modality"]) + adata_query: AnnData = align_proteins_names( + adata_reference, + mdata_query, + adata_query, + reference_proteins_key=par["reference_proteins_modality"], + query_proteins_key=par["query_proteins_modality"], + ) # Reorder genes and pad missing genes with 0s scvi.model.TOTALVI.prepare_query_anndata(adata_query, vae_reference) # Train the model for query - vae_query = scvi.model.TOTALVI.load_query_data( - adata_query, - vae_reference + vae_query = scvi.model.TOTALVI.load_query_data(adata_query, vae_reference) + vae_query.train( + par["max_query_epochs"], plan_kwargs=dict(weight_decay=par["weight_decay"]) ) - vae_query.train(par["max_query_epochs"], plan_kwargs=dict(weight_decay=par["weight_decay"])) return vae_query, adata_query + def main(): mdata_query = mudata.read(par["input"].strip()) - adata_query = extract_proteins_to_anndata(mdata_query, - rna_modality_key=par["query_modality"], - protein_modality_key=par["query_proteins_modality"], - input_layer=par["input_layer"], - hvg_var_key=par["var_input"]) + adata_query = extract_proteins_to_anndata( + mdata_query, + rna_modality_key=par["query_modality"], + protein_modality_key=par["query_proteins_modality"], + input_layer=par["input_layer"], + hvg_var_key=par["var_input"], + ) if par["reference"].endswith(".h5mu"): logger.info("Reading reference") mdata_reference = mudata.read(par["reference"].strip()) logger.info("Mapping query to the reference") - vae_query, adata_query = 
map_query_to_reference(mdata_reference, mdata_query, adata_query) + vae_query, adata_query = map_query_to_reference( + mdata_reference, mdata_query, adata_query + ) else: raise ValueError("Incorrect format of reference, please provide a .h5mu file") adata_query.uns["integration_method"] = "totalvi" logger.info("Getting the latent representation of query") - mdata_query.mod[par["query_modality"]].obsm[par["obsm_output"]] = vae_query.get_latent_representation() - + mdata_query.mod[par["query_modality"]].obsm[par["obsm_output"]] = ( + vae_query.get_latent_representation() + ) + norm_rna, norm_protein = vae_query.get_normalized_expression() - mdata_query.mod[par["query_modality"]].obsm[par["obsm_normalized_rna_output"]] = norm_rna.to_numpy() + mdata_query.mod[par["query_modality"]].obsm[par["obsm_normalized_rna_output"]] = ( + norm_rna.to_numpy() + ) if par["query_proteins_modality"] in mdata_query.mod: - mdata_query.mod[par["query_proteins_modality"]].obsm[par["obsm_normalized_protein_output"]] = norm_protein.to_numpy() - + mdata_query.mod[par["query_proteins_modality"]].obsm[ + par["obsm_normalized_protein_output"] + ] = norm_protein.to_numpy() + logger.info("Updating mdata") mdata_query.update() logger.info("Saving updated query data") mdata_query.write_h5mu(par["output"].strip()) - + logger.info("Saving query model") vae_query.save(par["query_model_path"], overwrite=True) + if __name__ == "__main__": main() diff --git a/src/integrate/totalvi/test.py b/src/integrate/totalvi/test.py index 58eaacc25e3..0c6380275bc 100644 --- a/src/integrate/totalvi/test.py +++ b/src/integrate/totalvi/test.py @@ -5,30 +5,43 @@ ## VIASH START meta = { "executable": "./target/docker/integrate/totalvi/totalvi", - "resources_dir": "./resources_test/pbmc_1k_protein_v3/" + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + def test_totalvi(run_component, tmp_path): """Map data containing proteins on itself""" output_path = tmp_path / "output.h5mu" ref_model_path = tmp_path / "totalvi_reference_model" query_model_path = tmp_path / "totalvi_query_model" - run_component([ - "--input", input_file, - "--reference", input_file, - "--query_proteins_modality", "prot", - "--reference_proteins_modality", "prot", - "--var_input", "filter_with_hvg", - "--reference_model_path", str(ref_model_path), - "--query_model_path", str(query_model_path), - "--max_epochs", "1", - "--max_query_epochs", "1", - "--output", str(output_path) - ]) + run_component( + [ + "--input", + input_file, + "--reference", + input_file, + "--query_proteins_modality", + "prot", + "--reference_proteins_modality", + "prot", + "--var_input", + "filter_with_hvg", + "--reference_model_path", + str(ref_model_path), + "--query_model_path", + str(query_model_path), + "--max_epochs", + "1", + "--max_query_epochs", + "1", + "--output", + str(output_path), + ] + ) assert output_path.is_file() output_data = mudata.read_h5mu(output_path) @@ -36,5 +49,6 @@ def test_totalvi(run_component, tmp_path): assert "X_totalvi_normalized_rna" in output_data.mod["rna"].obsm assert "X_totalvi_normalized_protein" in output_data.mod["prot"].obsm + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/interpret/lianapy/script.py b/src/interpret/lianapy/script.py index cee30e73e5f..ba6027bf146 100644 --- a/src/interpret/lianapy/script.py +++ b/src/interpret/lianapy/script.py @@ -1,8 +1,8 @@ import liana import 
mudata + # TODO: Remove when grouping labels exist # For sign/PCA/ -import numpy as np import pandas as pd ### VIASH START @@ -25,38 +25,39 @@ def main(): - # Get input data - mdata = mudata.read(par['input'].strip()) - mod = mdata.mod[par['modality']] + mdata = mudata.read(par["input"].strip()) + mod = mdata.mod[par["modality"]] # Add dummy grouping labels when they do not exist - if par['groupby'] not in mod.obs: - raise ValueError(f"Column {par['groupy']} does not exist in " - f".obs for modality {par['modality']}.") - mod_col = mod.obs[par['groupby']] + if par["groupby"] not in mod.obs: + raise ValueError( + f"Column {par['groupby']} does not exist in " + f".obs for modality {par['modality']}." + ) + mod_col = mod.obs[par["groupby"]] original_groupby_col = mod_col.copy() if not isinstance(mod_col, pd.CategoricalDtype): - mod.obs[par['groupby']] = mod_col.astype(str).astype('category') + mod.obs[par["groupby"]] = mod_col.astype(str).astype("category") # Solve gene labels orig_gene_label = mod.var.index - mod.var_names = mod.var[par['gene_symbol']].astype(str) + mod.var_names = mod.var[par["gene_symbol"]].astype(str) mod.var_names_make_unique() liana.mt.rank_aggregate( - adata = mod, - groupby = par['groupby'], - resource_name = par["resource_name"], - expr_prop = par["expr_prop"], - min_cells = par["min_cells"], - aggregate_method = par["aggregate_method"], - return_all_lrs = par["return_all_lrs"], - layer = par["layer"], - n_perms = par["n_perms"], - verbose = True, - inplace = True, - use_raw = False + adata=mod, + groupby=par["groupby"], + resource_name=par["resource_name"], + expr_prop=par["expr_prop"], + min_cells=par["min_cells"], + aggregate_method=par["aggregate_method"], + return_all_lrs=par["return_all_lrs"], + layer=par["layer"], + n_perms=par["n_perms"], + verbose=True, + inplace=True, + use_raw=False, ) # Return original gene labels @@ -66,7 +67,8 @@ def main(): mod.obs[par["groupby"]] = original_groupby_col # TODO: make sure compression is needed - mdata.write_h5mu(par['output'].strip(), compression=par['output_compression']) + mdata.write_h5mu(par["output"].strip(), compression=par["output_compression"]) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/interpret/lianapy/test.py b/src/interpret/lianapy/test.py index cad96d05381..e0b3c6e8171 100644 --- a/src/interpret/lianapy/test.py +++ b/src/interpret/lianapy/test.py @@ -5,8 +5,8 @@ ## VIASH START meta = { - 'executable': './target/executable/interpret/lianapy/', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/' + "executable": "./target/executable/interpret/lianapy/", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END @@ -17,30 +17,57 @@ def test_lianapy(run_component, tmp_path): output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", input_file, - "--output_compression", "gzip", - "--modality", "rna", - "--layer", "log_normalized", - "--groupby", "harmony_integration_leiden_1.0", - "--resource_name", "consensus", - "--gene_symbol", "gene_symbol", - "--expr_prop", "0.1", - "--min_cells", "5", - "--aggregate_method", "rra", - "--return_all_lrs", "False", - "--n_perms", "11", - "--output", str(output_path)]) + run_component( + [ + "--input", + input_file, + "--output_compression", + "gzip", + "--modality", + "rna", + "--layer", + "log_normalized", + "--groupby", + "harmony_integration_leiden_1.0", + "--resource_name", + "consensus", + "--gene_symbol", + "gene_symbol", + "--expr_prop", + "0.1", + "--min_cells", + "5", +
"--aggregate_method", + "rra", + "--return_all_lrs", + "False", + "--n_perms", + "11", + "--output", + str(output_path), + ] + ) assert output_path.is_file() # check output input_data = mudata.read_h5mu(input_file) output_data = mudata.read_h5mu(output_path) - np.testing.assert_array_equal(output_data.mod['rna'].X.data, input_data.mod['rna'].X.data) - np.testing.assert_array_equal(input_data.mod['rna'].var.index, output_data.mod['rna'].var.index) + np.testing.assert_array_equal( + output_data.mod["rna"].X.data, input_data.mod["rna"].X.data + ) + np.testing.assert_array_equal( + input_data.mod["rna"].var.index, output_data.mod["rna"].var.index + ) assert "liana_res" in output_data.mod["rna"].uns - assert all(elem in output_data.mod['rna'].obs['harmony_integration_leiden_1.0'].values for elem in output_data.mod['rna'].uns['liana_res']['source'].unique()) - assert all(elem in output_data.mod['rna'].obs['harmony_integration_leiden_1.0'].values for elem in output_data.mod['rna'].uns['liana_res']['target'].unique()) + assert all( + elem in output_data.mod["rna"].obs["harmony_integration_leiden_1.0"].values + for elem in output_data.mod["rna"].uns["liana_res"]["source"].unique() + ) + assert all( + elem in output_data.mod["rna"].obs["harmony_integration_leiden_1.0"].values + for elem in output_data.mod["rna"].uns["liana_res"]["target"].unique() + ) + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/labels_transfer/knn/script.py b/src/labels_transfer/knn/script.py index b6195e1925a..bac686f37c9 100644 --- a/src/labels_transfer/knn/script.py +++ b/src/labels_transfer/knn/script.py @@ -18,11 +18,9 @@ "output_uns_parameters": "labels_transfer", "output_compression": None, "weights": "distance", - "n_neighbors": 15 -} -meta = { - "resources_dir": "src/labels_transfer/utils" + "n_neighbors": 15, } +meta = {"resources_dir": "src/labels_transfer/utils"} ## VIASH END sys.path.append(meta["resources_dir"]) @@ -41,6 +39,8 @@ def setup_logger(): logger.addHandler(console_handler) return logger + + # END TEMPORARY WORKAROUND setup_logger @@ -58,7 +58,7 @@ def distances_to_affinities(distances): distances_tilda_normalized = np.where( np.sum(distances_tilda, axis=1, keepdims=True) == 0, 1, - distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True) + distances_tilda / np.sum(distances_tilda, axis=1, keepdims=True), ) return distances_tilda_normalized @@ -66,7 +66,9 @@ def distances_to_affinities(distances): logger = setup_logger() # Reading in data -logger.info(f"Reading in query dataset {par['input']} and reference datasets {par['reference']}") +logger.info( + f"Reading in query dataset {par['input']} and reference datasets {par['reference']}" +) q_mdata = mu.read_h5mu(par["input"]) q_adata = q_mdata.mod[par["modality"]] @@ -78,32 +80,45 @@ def distances_to_affinities(distances): par = check_arguments(par) if par["input_obsm_distances"] and par["reference_obsm_distances"]: - logger.info("Using pre-calculated distances for KNN classification as provided in `--input_obsm_distances` and `--reference_obsm_distances`.") + logger.info( + "Using pre-calculated distances for KNN classification as provided in `--input_obsm_distances` and `--reference_obsm_distances`." + ) - assert par["input_obsm_distances"] in q_adata.obsm, f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}." 
- assert par["reference_obsm_distances"] in r_adata.obsm, f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}." + assert ( + par["input_obsm_distances"] in q_adata.obsm + ), f"Make sure --input_obsm_distances {par['input_obsm_distances']} is a valid .obsm key. Found: {q_adata.obsm.keys()}." + assert ( + par["reference_obsm_distances"] in r_adata.obsm + ), f"Make sure --reference_obsm_distances {par['reference_obsm_distances']} is a valid .obsm key. Found: {r_adata.obsm.keys()}." query_neighbors = q_adata.obsm[par["input_obsm_distances"]] reference_neighbors = r_adata.obsm[par["reference_obsm_distances"]] if query_neighbors.shape[1] != reference_neighbors.shape[1]: - raise ValueError("The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.") + raise ValueError( + "The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset." + ) # Make sure the number of neighbors present in the distance matrix matches the requested number of neighbors in --n_neighbors # Otherwise reduce n_neighbors for KNN smallest_neighbor_count = min( - np.diff(query_neighbors.indptr).min(), - np.diff(reference_neighbors.indptr).min() + np.diff(query_neighbors.indptr).min(), np.diff(reference_neighbors.indptr).min() ) if smallest_neighbor_count < par["n_neighbors"]: - logger.warning(f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification") + logger.warning( + f"The number of neighbors in the distance matrices is smaller than the requested number of neighbors in --n_neighbors. Reducing n_neighbors to {smallest_neighbor_count} for KNN Classification" + ) par["n_neighbors"] = smallest_neighbor_count elif par["input_obsm_distances"] or par["reference_obsm_distances"]: - raise ValueError("Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification.") + raise ValueError( + "Make sure to provide both --input_obsm_distances and --reference_obsm_distances if you want to use a pre-calculated distance matrix for KNN classification." + ) elif not par["input_obsm_distances"] and not par["reference_obsm_distances"]: - logger.info("No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm.") + logger.info( + "No pre-calculated distances were provided. Calculating distances using the PyNNDescent algorithm." 
+ ) # Generating training and inference data train_X = get_reference_features(r_adata, par, logger) inference_X = get_query_features(q_adata, par, logger) @@ -119,13 +134,17 @@ def distances_to_affinities(distances): reference_neighbors = neighbors_transformer.transform(train_X) # For each target, train a classifier and predict labels -for obs_tar, obs_pred, obs_proba in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_probability"]): +for obs_tar, obs_pred, obs_proba in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], +): logger.info(f"Predicting labels for {obs_tar}") weights_dict = { "uniform": "uniform", "distance": "distance", - "gaussian": distances_to_affinities + "gaussian": distances_to_affinities, } logger.info(f"Using KNN classifier with {par['weights']} weights") @@ -133,17 +152,19 @@ def distances_to_affinities(distances): classifier = KNeighborsClassifier( n_neighbors=par["n_neighbors"], metric="precomputed", - weights=weights_dict[par["weights"]] - ) + weights=weights_dict[par["weights"]], + ) classifier.fit(X=reference_neighbors, y=train_y) predicted_labels = classifier.predict(query_neighbors) probabilities = classifier.predict_proba(query_neighbors).max(axis=1) # save_results - logger.info(f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs") + logger.info( + f"Saving predictions to {obs_pred} and probabilities to {obs_proba} in obs" + ) q_adata.obs[obs_pred] = predicted_labels q_adata.obs[obs_proba] = probabilities logger.info(f"Saving output data to {par['output']}") -q_mdata.mod[par['modality']] = q_adata -q_mdata.write_h5mu(par['output'], compression=par['output_compression']) +q_mdata.mod[par["modality"]] = q_adata +q_mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/labels_transfer/knn/test.py b/src/labels_transfer/knn/test.py index 271092a3872..c12aa844c7b 100644 --- a/src/labels_transfer/knn/test.py +++ b/src/labels_transfer/knn/test.py @@ -8,12 +8,12 @@ from scipy.sparse import csr_matrix ## VIASH START -meta = { - 'resources_dir': './resources_test/' -} +meta = {"resources_dir": "./resources_test/"} ## VIASH END -reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +reference_h5ad_file = ( + f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +) # convert reference to h5mu reference_adata = ad.read_h5ad(reference_h5ad_file) reference_mdata = mu.MuData({"rna": reference_adata}) @@ -23,16 +23,21 @@ def test_label_transfer(run_component, random_h5mu_path): - output = random_h5mu_path() args = [ - "--input", input_file, - "--modality", "rna", - "--reference", reference_file, - "--reference_obs_targets", "cell_type", - "--output", output, - "--n_neighbors", "5" + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--n_neighbors", + "5", ] run_component(args) @@ -41,25 +46,37 @@ def test_label_transfer(run_component, random_h5mu_path): output_data = mu.read_h5mu(output) - assert "cell_type_pred" in output_data.mod["rna"].obs, f"Predictions cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}" - assert "cell_type_probability" in output_data.mod["rna"].obs, f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + "cell_type_pred" in output_data.mod["rna"].obs + ), f"Predictions 
cell_type_pred is missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + "cell_type_probability" in output_data.mod["rna"].obs + ), f"Uncertainties cell_type_probability is missing from output\noutput: {output_data.mod['rna'].obs}" @pytest.mark.parametrize("weights", ["uniform", "distance", "gaussian"]) def test_label_transfer_prediction_columns(run_component, weights, random_h5mu_path): - output = random_h5mu_path() args = [ - "--input", input_file, - "--modality", "rna", - "--reference", reference_file, - "--reference_obs_targets", "cell_type", - "--weights", weights, - "--output", output, - "--output_obs_probability", "test_probability", - "--output_obs_predictions", "test_prediction", - "--n_neighbors", "5" + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--weights", + weights, + "--output", + output, + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", ] run_component(args) @@ -68,12 +85,17 @@ def test_label_transfer_prediction_columns(run_component, weights, random_h5mu_p output_data = mu.read_h5mu(output) - assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" - assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" - + assert ( + "test_prediction" in output_data.mod["rna"].obs + ), f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + "test_probability" in output_data.mod["rna"].obs + ), f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" -def test_label_transfer_prediction_precomputed_neighbor_graph(run_component, random_h5mu_path): +def test_label_transfer_prediction_precomputed_neighbor_graph( + run_component, random_h5mu_path +): output = random_h5mu_path() # Add mock distance matrix to obsm slot @@ -92,16 +114,26 @@ def test_label_transfer_prediction_precomputed_neighbor_graph(run_component, ran query_mdata.write_h5mu(input_file) args = [ - "--input", input_file, - "--modality", "rna", - "--reference", reference_file, - "--reference_obs_targets", "cell_type", - "--output", output, - "--input_obsm_distances", "distances", - "--reference_obsm_distances", "distances", - "--output_obs_probability", "test_probability", - "--output_obs_predictions", "test_prediction", - "--n_neighbors", "5" + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--input_obsm_distances", + "distances", + "--reference_obsm_distances", + "distances", + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", ] run_component(args) @@ -110,12 +142,15 @@ def test_label_transfer_prediction_precomputed_neighbor_graph(run_component, ran output_data = mu.read_h5mu(output) - assert "test_prediction" in output_data.mod["rna"].obs, f"Predictions test_prediction is missing from output\noutput: {output_data.mod['rna'].obs}" - assert "test_probability" in output_data.mod["rna"].obs, f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + "test_prediction" in output_data.mod["rna"].obs + ), f"Predictions test_prediction is missing from 
output\noutput: {output_data.mod['rna'].obs}" + assert ( + "test_probability" in output_data.mod["rna"].obs + ), f"Uncertainties test_probability is missing from output\noutput: {output_data.mod['rna'].obs}" def test_raises_distance_matrix_dimensions(run_component, random_h5mu_path): - output = random_h5mu_path() reference_mdata = mu.read_h5mu(reference_file) @@ -133,23 +168,35 @@ def test_raises_distance_matrix_dimensions(run_component, random_h5mu_path): query_mdata.write_h5mu(input_file) with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", input_file, - "--modality", "rna", - "--reference", reference_file, - "--reference_obs_targets", "cell_type", - "--output", output, - "--input_obsm_distances", "distances", - "--reference_obsm_distances", "distances", - "--output_obs_probability", "test_probability", - "--output_obs_predictions", "test_prediction", - "--n_neighbors", "5" - ]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--reference", + reference_file, + "--reference_obs_targets", + "cell_type", + "--output", + output, + "--input_obsm_distances", + "distances", + "--reference_obsm_distances", + "distances", + "--output_obs_probability", + "test_probability", + "--output_obs_predictions", + "test_prediction", + "--n_neighbors", + "5", + ] + ) assert re.search( r"ValueError: The number of neighbors in the query and reference distance matrices do not match. Make sure both distance matrices contain distances to the reference dataset.", - err.value.stdout.decode('utf-8') - ) + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': +if __name__ == "__main__": exit(pytest.main([__file__])) diff --git a/src/labels_transfer/utils/helper.py b/src/labels_transfer/utils/helper.py index be879425afe..ee24b047569 100644 --- a/src/labels_transfer/utils/helper.py +++ b/src/labels_transfer/utils/helper.py @@ -1,18 +1,25 @@ -from scipy.sparse import issparse - def check_arguments(par): # check output .obs predictions if not par["output_obs_predictions"]: - par["output_obs_predictions"] = [ t + "_pred" for t in par["reference_obs_targets"]] - assert len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]), f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" + par["output_obs_predictions"] = [ + t + "_pred" for t in par["reference_obs_targets"] + ] + assert ( + len(par["output_obs_predictions"]) == len(par["reference_obs_targets"]) + ), f"Number of output_obs_predictions must match number of reference_obs_targets\npar: {par}" # check output .obs uncertainty if not par["output_obs_probability"]: - par["output_obs_probability"] = [ t + "_probability" for t in par["reference_obs_targets"]] - assert len(par["output_obs_probability"]) == len(par["reference_obs_targets"]), f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" + par["output_obs_probability"] = [ + t + "_probability" for t in par["reference_obs_targets"] + ] + assert ( + len(par["output_obs_probability"]) == len(par["reference_obs_targets"]) + ), f"Number of output_obs_probability must match number of reference_obs_targets\npar: {par}" return par + def get_reference_features(adata_reference, par, logger): if par["reference_obsm_features"] is None: logger.info("Using .X of reference data") @@ -23,6 +30,7 @@ def get_reference_features(adata_reference, par, logger): return train_data + def get_query_features(adata, par, logger): if par["input_obsm_features"] is None: 
logger.info("Using .X of query data") diff --git a/src/labels_transfer/xgboost/script.py b/src/labels_transfer/xgboost/script.py index 6296530555a..69c12d0fb8d 100644 --- a/src/labels_transfer/xgboost/script.py +++ b/src/labels_transfer/xgboost/script.py @@ -7,7 +7,6 @@ import mudata import numpy as np -import scanpy as sc import pandas as pd import xgboost as xgb from sklearn.model_selection import train_test_split @@ -22,7 +21,14 @@ "input_obsm_features": "X_integrated_scanvi", "reference": "https://zenodo.org/record/6337966/files/HLCA_emb_and_metadata.h5ad", "reference_obsm_features": "X_integrated_scanvi", - "reference_obs_targets": ["ann_level_1", "ann_level_2", "ann_level_3", "ann_level_4", "ann_level_5", "ann_finest_level"], + "reference_obs_targets": [ + "ann_level_1", + "ann_level_2", + "ann_level_3", + "ann_level_4", + "ann_level_5", + "ann_finest_level", + ], "output": "foo.h5mu", "output_obs_predictions": None, "output_obs_probability": None, @@ -46,36 +52,42 @@ } meta = { "resources_dir": "src/labels_transfer/utils", - "config": "src/labels_transfer/xgboost/config.vsh.yaml" + "config": "src/labels_transfer/xgboost/config.vsh.yaml", } ### VIASH END sys.path.append(meta["resources_dir"]) from helper import check_arguments, get_reference_features, get_query_features from setup_logger import setup_logger + logger = setup_logger() # read config arguments config = yaml.safe_load(Path(meta["config"]).read_text()) # look for training params for method -argument_groups = { grp["name"]: grp["arguments"] for grp in config["argument_groups"] } -training_arg_names = [ arg["name"].replace("--", "") for arg in argument_groups["Learning parameters"] ] -training_params = { arg_name: par[arg_name] for arg_name in training_arg_names } +argument_groups = {grp["name"]: grp["arguments"] for grp in config["argument_groups"]} +training_arg_names = [ + arg["name"].replace("--", "") for arg in argument_groups["Learning parameters"] +] +training_params = {arg_name: par[arg_name] for arg_name in training_arg_names} + def encode_labels(y): labels_encoder = LabelEncoder() labels_encoder.fit(y) - + return labels_encoder.transform(y), labels_encoder def get_model_eval(xgb_model, X_test, y_test, labels_encoder): preds = xgb_model.predict(X_test) - - cr = classification_report(labels_encoder.inverse_transform(y_test), - labels_encoder.inverse_transform(preds), - output_dict=True) + + cr = classification_report( + labels_encoder.inverse_transform(y_test), + labels_encoder.inverse_transform(preds), + output_dict=True, + ) cr_df = pd.DataFrame(cr).transpose() return cr_df @@ -85,34 +97,41 @@ def train_test_split_adata(adata, labels): train_data = pd.DataFrame(data=adata.X, index=adata.obs_names) X_train, X_test, y_train, y_test = train_test_split( - train_data, labels, test_size=0.2, random_state=42, stratify=labels) - + train_data, labels, test_size=0.2, random_state=42, stratify=labels + ) + return X_train, X_test, y_train, y_test def train_xgb_model(X_train, y_train, gpu=True) -> xgb.XGBClassifier: n_classes = len(np.unique(y_train)) objective = "binary:logistic" if n_classes == 2 else "multi:softprob" - - tree_method = "gpu_hist" if gpu else "hist" - xgbc = xgb.XGBClassifier(tree_method=tree_method, objective=objective, **training_params) + + tree_method = "gpu_hist" if gpu else "hist" + xgbc = xgb.XGBClassifier( + tree_method=tree_method, objective=objective, **training_params + ) xgbc.fit(X_train, y_train) - + return xgbc -def build_classifier(X, y, labels_encoder, label_key, eval_verbosity: Optional[int] = 
1, gpu=True) -> xgb.XGBClassifier: +def build_classifier( + X, y, labels_encoder, label_key, eval_verbosity: Optional[int] = 1, gpu=True +) -> xgb.XGBClassifier: # Adata prep - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) - #Note: Do we need a new train-test split for each classifier? - + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, stratify=y + ) + # Note: Do we need a new train-test split for each classifier? + # Model training xgb_model = train_xgb_model(X_train, y_train, gpu=gpu) - + # Model eval if eval_verbosity != 0: cr_df = get_model_eval(xgb_model, X_test, y_test, labels_encoder) - + if eval_verbosity == 2: print(cr_df) @@ -132,44 +151,49 @@ def build_classifier(X, y, labels_encoder, label_key, eval_verbosity: Optional[i print(f"Min. Recall: {low_rec_key}: {low_rec_val}") print(f"Min. F1-score: {low_f1_key}: {low_f1_val}") print("") - + return xgb_model -def build_ref_classifiers(adata_reference, targets, model_path, - eval_verbosity: Optional[int] = 1, gpu: Optional[bool] = True) -> None: +def build_ref_classifiers( + adata_reference, + targets, + model_path, + eval_verbosity: Optional[int] = 1, + gpu: Optional[bool] = True, +) -> None: """ - This function builds xgboost classifiers on a reference embedding for a designated number of + This function builds xgboost classifiers on a reference embedding for a designated number of adata_reference.obs columns. Classifier .xgb files and a model_info.json file is written to the `model_path` directory. Model evaluation is printed to stdout. - + Inputs: * `adata_reference`: The AnnData object that was used to train the reference model * `model_path`: The reference model directory where the classifiers will also be stored * `eval_verbosity`: The verbosity level for evaluation of the classifier from the range [0;2]. 
* `gpu`: Boolean indicating whether a gpu is available for classifier training - - + + Example: ``` >>> adata AnnData object with n_obs x n_vars = 700 x 765 obs: "ann_finest_level", "ann_level_1" - + >>> os.listdir("/path/to/model") model_params.pt* - + >>> build_ref_classifiers(adata, "path/to/model", eval_verbosity=1, gpu=True) >>> os.listdir("/path/to/model") classifier_ann_finest_level.xgb* model_info.json* - classifier_ann_level_1.xgb* model_params.pt* + classifier_ann_level_1.xgb* model_params.pt* ``` """ # Check inputs if not isinstance(eval_verbosity, int): raise TypeError("`eval_verbosity` should be an integer between 0 and 2.") - + if eval_verbosity < 0 or eval_verbosity > 2: raise ValueError("`eval_verbosity` should be an integer between 0 and 2.") @@ -177,19 +201,19 @@ def build_ref_classifiers(adata_reference, targets, model_path, if not os.path.exists(model_path): os.makedirs(model_path, exist_ok=True) - + # Map from name of classifier to file names classifiers = dict() - + for label, obs_pred in zip(targets, par["output_obs_predictions"]): if label not in adata_reference.obs: raise ValueError(f"{label} is not in the `adata` object passed!") filename = "classifier_" + label + ".xgb" - + labels, labels_encoder = encode_labels(adata_reference.obs[label]) logger.info(f"Classes: {labels_encoder.classes_}") - + logger.info(f"Building classifier for {label}...") xgb_model = build_classifier( X=train_data, @@ -197,13 +221,13 @@ def build_ref_classifiers(adata_reference, targets, model_path, labels_encoder=labels_encoder, label_key=label, eval_verbosity=eval_verbosity, - gpu=gpu + gpu=gpu, ) - + # Save classifier logger.info("Saving model") xgb_model.save_model(os.path.join(model_path, filename)) - + # Store classifier info classifiers[label] = { "filename": filename, @@ -211,24 +235,22 @@ def build_ref_classifiers(adata_reference, targets, model_path, "obs_column": obs_pred, "model_params": training_params, } - + # Store model_info.json file - model_info = { - "classifier_info": classifiers - } - + model_info = {"classifier_info": classifiers} + logger.info("Writing model_info to the file") # Read previous file if it exists if os.path.exists(model_path + "/model_info.json"): logger.info("Old model_info file found, updating") with open(model_path + "/model_info.json", "r") as f: old_model_info = json.loads(f.read()) - + for key in old_model_info: if key in model_info: old_model_info[key].update(model_info[key]) json_string = json.dumps(old_model_info, indent=4) - + else: logger.info("Creating a new file") json_string = json.dumps(model_info, indent=4) @@ -240,9 +262,9 @@ def build_ref_classifiers(adata_reference, targets, model_path, def project_labels( query_dataset, cell_type_classifier_model: xgb.XGBClassifier, - annotation_column_name='label_pred', - probability_column_name='label_probability', - probability_thresh=None # Note: currently not passed to predict function + annotation_column_name="label_pred", + probability_column_name="label_probability", + probability_thresh=None, # Note: currently not passed to predict function ): """ A function that projects predicted labels onto the query dataset, along with probability estimations. 
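# --- Editor's note: illustrative sketch, not part of the patch. ------------
# The encode -> train -> save -> reload -> decode roundtrip that
# build_ref_classifiers (above) and predict (below) implement, on toy data.
# The file name and label values are hypothetical; the component writes one
# classifier_<target>.xgb per .obs target, while recent xgboost releases
# prefer the .json suffix used here.
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 10))
y_raw = rng.choice(["B cell", "T cell", "NK cell"], size=60)

le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)  # xgboost expects integer class labels

model = xgb.XGBClassifier(
    tree_method="hist", objective="multi:softprob", n_estimators=10
)
model.fit(X, y)
model.save_model("classifier_cell_type.json")

# Reload into a fresh classifier and decode predictions back to label strings.
restored = xgb.XGBClassifier()
restored.load_model("classifier_cell_type.json")
decoded = le.inverse_transform(restored.predict(X))
probability = restored.predict_proba(X).max(axis=1)  # as in project_labels
# ---------------------------------------------------------------------------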
@@ -260,19 +282,27 @@ def project_labels( """ - if (probability_thresh is not None) and (probability_thresh < 0 or probability_thresh > 1): - raise ValueError(f'`probability_thresh` must be `None` or between 0 and 1.') + if (probability_thresh is not None) and ( + probability_thresh < 0 or probability_thresh > 1 + ): + raise ValueError("`probability_thresh` must be `None` or between 0 and 1.") query_data = get_query_features(query_dataset, par, logger) # Predict labels and probabilities - query_dataset.obs[annotation_column_name] = cell_type_classifier_model.predict(query_data) + query_dataset.obs[annotation_column_name] = cell_type_classifier_model.predict( + query_data + ) logger.info("Predicting probabilities") probs = cell_type_classifier_model.predict_proba(query_data) # Format probabilities - df_probs = pd.DataFrame(probs, columns=cell_type_classifier_model.classes_, index=query_dataset.obs_names) + df_probs = pd.DataFrame( + probs, + columns=cell_type_classifier_model.classes_, + index=query_dataset.obs_names, + ) query_dataset.obs[probability_column_name] = df_probs.max(1) # Note: this is here in case we want to propose a set of values for the user to accept to seed the @@ -280,8 +310,11 @@ def project_labels( if probability_thresh is not None: logger.info("Marking uncertain predictions") query_dataset.obs[annotation_column_name + "_filtered"] = [ - val if query_dataset.obs[probability_column_name][i] >= probability_thresh - else "Unknown" for i, val in enumerate(query_dataset.obs[annotation_column_name])] + val + if query_dataset.obs[probability_column_name][i] >= probability_thresh + else "Unknown" + for i, val in enumerate(query_dataset.obs[annotation_column_name]) + ] return query_dataset @@ -293,7 +326,7 @@ def predict( prediction_column_name: str, probability_column_name: str, models_info, - use_gpu: bool = False + use_gpu: bool = False, ) -> pd.DataFrame: """ Returns `obs` DataFrame with prediction columns appended @@ -304,21 +337,27 @@ def predict( labels = models_info["classifier_info"][annotation_column_name]["labels"] objective = "binary:logistic" if len(labels) == 2 else "multi:softprob" - cell_type_classifier_model = xgb.XGBClassifier(tree_method=tree_method, objective=objective) + cell_type_classifier_model = xgb.XGBClassifier( + tree_method=tree_method, objective=objective + ) logger.info("Loading model") cell_type_classifier_model.load_model(fname=cell_type_classifier_model_path) logger.info("Predicting labels") - project_labels(query_dataset, - cell_type_classifier_model, - annotation_column_name=prediction_column_name, - probability_column_name=probability_column_name) + project_labels( + query_dataset, + cell_type_classifier_model, + annotation_column_name=prediction_column_name, + probability_column_name=probability_column_name, + ) logger.info("Converting labels from numbers to classes") labels_encoder = LabelEncoder() labels_encoder.classes_ = np.array(labels) - query_dataset.obs[prediction_column_name] = labels_encoder.inverse_transform(query_dataset.obs[prediction_column_name]) + query_dataset.obs[prediction_column_name] = labels_encoder.inverse_transform( + query_dataset.obs[prediction_column_name] + ) return query_dataset @@ -340,46 +379,64 @@ def main(par): targets_to_train = [] for obs_target in par["reference_obs_targets"]: - if not os.path.exists(par["model_output"]) or f"classifier_{obs_target}.xgb" not in os.listdir(par["model_output"]) or par["force_retrain"]: + if ( + not os.path.exists(par["model_output"]) + or f"classifier_{obs_target}.xgb" not 
in os.listdir(par["model_output"]) + or par["force_retrain"] + ): logger.info(f"Classifier for {obs_target} added to a training schedule") targets_to_train.append(obs_target) else: logger.info(f"Found classifier for {obs_target}, no retraining required") - build_ref_classifiers(adata_reference, targets_to_train, model_path=par["model_output"], - gpu=par["use_gpu"], eval_verbosity=par["verbosity"]) + build_ref_classifiers( + adata_reference, + targets_to_train, + model_path=par["model_output"], + gpu=par["use_gpu"], + eval_verbosity=par["verbosity"], + ) output_uns_parameters = adata_query.uns.get(par["output_uns_parameters"], {}) with open(par["model_output"] + "/model_info.json", "r") as f: models_info = json.loads(f.read()) - for obs_target, obs_pred, obs_unc in zip(par["reference_obs_targets"], par["output_obs_predictions"], par["output_obs_probability"]): + for obs_target, obs_pred, obs_unc in zip( + par["reference_obs_targets"], + par["output_obs_predictions"], + par["output_obs_probability"], + ): logger.info(f"Predicting {obs_target}") - adata_query = predict(query_dataset=adata_query, - cell_type_classifier_model_path=os.path.join(par["model_output"], "classifier_" + obs_target + ".xgb"), - annotation_column_name=obs_target, - prediction_column_name=obs_pred, - probability_column_name=obs_unc, - models_info=models_info, - use_gpu=par["use_gpu"]) - + adata_query = predict( + query_dataset=adata_query, + cell_type_classifier_model_path=os.path.join( + par["model_output"], "classifier_" + obs_target + ".xgb" + ), + annotation_column_name=obs_target, + prediction_column_name=obs_pred, + probability_column_name=obs_unc, + models_info=models_info, + use_gpu=par["use_gpu"], + ) + if obs_target in targets_to_train: # Save information about the transfer to .uns output_uns_parameters[obs_target] = { "method": "XGBClassifier", - **training_params + **training_params, } adata_query.uns[par["output_uns_parameters"]] = output_uns_parameters logger.info("Updating mdata") - mdata_query.mod[par['modality']] = adata_query + mdata_query.mod[par["modality"]] = adata_query mdata_query.update() logger.info("Writing output") - mdata_query.write_h5mu(par['output'].strip()) + mdata_query.write_h5mu(par["output"].strip()) + if __name__ == "__main__": main(par) diff --git a/src/labels_transfer/xgboost/test.py b/src/labels_transfer/xgboost/test.py index 3dcfdfef778..75532bec350 100644 --- a/src/labels_transfer/xgboost/test.py +++ b/src/labels_transfer/xgboost/test.py @@ -7,12 +7,14 @@ ## VIASH START meta = { - 'executable': './target/executable/labels_transfer/xgboost/xgboost', - 'resources_dir': './resources_test/' + "executable": "./target/executable/labels_transfer/xgboost/xgboost", + "resources_dir": "./resources_test/", } ## VIASH END -reference_h5ad_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +reference_h5ad_file = ( + f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5ad" +) input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" @@ -28,40 +30,81 @@ def test_args(tmp_path, request): # generate reference obs targets for i, target in enumerate(obs_targets): - class_names = [str(idx) for idx in range(i + 2)] # e.g. ["0", "1", "2"], the higher the level, the more the classes - reference_adata.obs[target] = np.random.choice(class_names, size=reference_adata.n_obs) + class_names = [ + str(idx) for idx in range(i + 2) + ] # e.g. 
["0", "1", "2"], the higher the level, the more the classes + reference_adata.obs[target] = np.random.choice( + class_names, size=reference_adata.n_obs + ) # read input query input_mudata = mudata.read_h5mu(input_file) input_rna_adata = input_mudata.mod["rna"] - + # generate features - reference_adata.obsm[obsm_features] = np.random.normal(size=(reference_adata.n_obs, 30)) - input_rna_adata.obsm[obsm_features] = np.random.normal(size=(input_rna_adata.n_obs, 30)) + reference_adata.obsm[obsm_features] = np.random.normal( + size=(reference_adata.n_obs, 30) + ) + input_rna_adata.obsm[obsm_features] = np.random.normal( + size=(input_rna_adata.n_obs, 30) + ) reference_mdata = mudata.MuData({"rna": reference_adata}) # write files reference_mdata.write_h5mu(str(tempfile_reference_file)) input_mudata.write_h5mu(str(tempfile_input_file)) - return tempfile_reference_file, reference_adata, tempfile_input_file, input_rna_adata, obsm_features, obs_targets, output_uns_parameters - - -@pytest.mark.parametrize("test_args", [("X_integrated_scvi", ["celltype"], None), ("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran")], indirect=True) + return ( + tempfile_reference_file, + reference_adata, + tempfile_input_file, + input_rna_adata, + obsm_features, + obs_targets, + output_uns_parameters, + ) + + +@pytest.mark.parametrize( + "test_args", + [ + ("X_integrated_scvi", ["celltype"], None), + ("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran"), + ], + indirect=True, +) def test_label_transfer(run_component, test_args): - tempfile_reference_file, _, tempfile_input_file, _, obsm_features, obs_targets, output_uns_parameters = test_args + ( + tempfile_reference_file, + _, + tempfile_input_file, + _, + obsm_features, + obs_targets, + output_uns_parameters, + ) = test_args args = [ - "--input", str(tempfile_input_file), - "--modality", "rna", - "--input_obsm_features", obsm_features, - "--reference", str(tempfile_reference_file), - "--reference_obsm_features", obsm_features, - "--reference_obs_targets", ";".join(obs_targets), - "--output", "output.h5mu", - "--model_output", "model_one_class", - "--use_gpu", "false", - "--max_depth", "6" + "--input", + str(tempfile_input_file), + "--modality", + "rna", + "--input_obsm_features", + obsm_features, + "--reference", + str(tempfile_reference_file), + "--reference_obsm_features", + obsm_features, + "--reference_obs_targets", + ";".join(obs_targets), + "--output", + "output.h5mu", + "--model_output", + "model_one_class", + "--use_gpu", + "false", + "--max_depth", + "6", ] if output_uns_parameters is not None: @@ -73,40 +116,77 @@ def test_label_transfer(run_component, test_args): output_data = mudata.read_h5mu("output.h5mu") - exp_uns = "xgboost_parameters" if output_uns_parameters is None else output_uns_parameters + exp_uns = ( + "xgboost_parameters" if output_uns_parameters is None else output_uns_parameters + ) for target in obs_targets: - assert f"{target}_pred" in output_data.mod["rna"].obs, f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" - assert f"{target}_probability" in output_data.mod["rna"].obs, f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" - assert exp_uns in output_data.mod["rna"].uns, f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert target in output_data.mod["rna"].uns[exp_uns], f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[exp_uns][target].get("method") == 
"XGBClassifier", f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[exp_uns][target].get("max_depth") == 6, f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" - - -@pytest.mark.parametrize("test_args", [("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran")], indirect=True) + assert ( + f"{target}_pred" in output_data.mod["rna"].obs + ), f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + f"{target}_probability" in output_data.mod["rna"].obs + ), f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + exp_uns in output_data.mod["rna"].uns + ), f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + assert ( + target in output_data.mod["rna"].uns[exp_uns] + ), f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"].uns[exp_uns][target].get("method") == "XGBClassifier" + ), f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"].uns[exp_uns][target].get("max_depth") == 6 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + + +@pytest.mark.parametrize( + "test_args", + [("X_int", ["ann_level_1", "ann_level_2", "ann_level_3"], "lab_tran")], + indirect=True, +) def test_retraining(run_component, test_args, tmp_path): output_model = tmp_path / "model_retraining" output_path = tmp_path / "output.h5mu" output2_path = tmp_path / "output2.h5mu" - tempfile_reference_file, _, tempfile_input_file, _, obsm_features, obs_targets, output_uns_parameters = test_args + ( + tempfile_reference_file, + _, + tempfile_input_file, + _, + obsm_features, + obs_targets, + output_uns_parameters, + ) = test_args # Train first 2 targets args = [ - "--modality", "rna", - "--input_obsm_features", obsm_features, - "--reference", str(tempfile_reference_file), - "--reference_obsm_features", obsm_features, - "--model_output", str(output_model)] - + "--modality", + "rna", + "--input_obsm_features", + obsm_features, + "--reference", + str(tempfile_reference_file), + "--reference_obsm_features", + obsm_features, + "--model_output", + str(output_model), + ] + if output_uns_parameters is not None: args.extend(["--output_uns_parameters", output_uns_parameters]) - + args1 = args + [ - "--input", str(tempfile_input_file), - "--output", str(output_path), - "--reference_obs_targets", ";".join(obs_targets[:2]), - "--max_depth", "6"] + "--input", + str(tempfile_input_file), + "--output", + str(output_path), + "--reference_obs_targets", + ";".join(obs_targets[:2]), + "--max_depth", + "6", + ] run_component(args1) assert output_path.is_file() @@ -115,10 +195,15 @@ def test_retraining(run_component, test_args, tmp_path): # Now the code should use 2 previously trained models, # and train only the remaining targets args2 = args + [ - "--input", str(output_path), - "--output", str(output2_path), - "--reference_obs_targets", ";".join(obs_targets), - "--max_depth", "4"] + "--input", + str(output_path), + "--output", + str(output2_path), + "--reference_obs_targets", + ";".join(obs_targets), + "--max_depth", + "4", + ] run_component(args2) assert output2_path.is_file() @@ -126,16 +211,42 @@ def test_retraining(run_component, test_args, tmp_path): output_data = mudata.read_h5mu(output2_path) for target in obs_targets: - assert f"{target}_pred" in output_data.mod["rna"].obs, f"Predictions are missing from output\noutput: 
{output_data.mod['rna'].obs}" - assert f"{target}_probability" in output_data.mod["rna"].obs, f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" - assert output_uns_parameters in output_data.mod["rna"].uns, f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert target in output_data.mod["rna"].uns[output_uns_parameters], f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[output_uns_parameters][target].get("method") == "XGBClassifier", f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" - - assert output_data.mod["rna"].uns[output_uns_parameters][obs_targets[0]].get("max_depth") == 6, f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[output_uns_parameters][obs_targets[1]].get("max_depth") == 6, f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" - assert output_data.mod["rna"].uns[output_uns_parameters][obs_targets[2]].get("max_depth") == 4, f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" - - -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file + assert ( + f"{target}_pred" in output_data.mod["rna"].obs + ), f"Predictions are missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + f"{target}_probability" in output_data.mod["rna"].obs + ), f"Probabilities are missing from output\noutput: {output_data.mod['rna'].obs}" + assert ( + output_uns_parameters in output_data.mod["rna"].uns + ), f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + assert ( + target in output_data.mod["rna"].uns[output_uns_parameters] + ), f"Parameters are missing from output\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"].uns[output_uns_parameters][target].get("method") + == "XGBClassifier" + ), f"Wrong method in parameters\noutput: {output_data.mod['rna'].uns}" + + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[0]] + .get("max_depth") + == 6 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[1]] + .get("max_depth") + == 6 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + assert ( + output_data.mod["rna"] + .uns[output_uns_parameters][obs_targets[2]] + .get("max_depth") + == 4 + ), f"Wrong number of neighbors in parameters\noutput: {output_data.mod['rna'].uns}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/mapping/bd_rhapsody/rhapsody_cell_label.py b/src/mapping/bd_rhapsody/rhapsody_cell_label.py index 601ce7becab..2bc1f81f281 100644 --- a/src/mapping/bd_rhapsody/rhapsody_cell_label.py +++ b/src/mapping/bd_rhapsody/rhapsody_cell_label.py @@ -53,7 +53,7 @@ ---------------- The Rhapsody Sequence Analysis Pipeline will convert each cell label into a single integer representing a unique cell label sequence - which is used in the output files as the 'Cell_index'. 
-This cell index integer is deterministic and derived from the 3 part cell label as follows: +This cell index integer is deterministic and derived from the 3 part cell label as follows: - Get the 1-based index for each cell label section from the python sets of sequences below - Apply this equation: @@ -79,11 +79,11 @@ """ -v1_linker1 = 'ACTGGCCTGCGA' -v1_linker2 = 'GGTAGCGGTGACA' +v1_linker1 = "ACTGGCCTGCGA" +v1_linker2 = "GGTAGCGGTGACA" -Enh_linker1 = 'GTGA' -Enh_linker2 = 'GACA' +Enh_linker1 = "GTGA" +Enh_linker2 = "GACA" Enh_5p_primer = "ACAGGAAACTCATGGTGCGT" @@ -95,182 +95,1472 @@ Tso_capture_seq_Enh_EnhV2 = "TATGCGTAGTAGGTATG" Tso_capture_seq_EnhV3 = "GTGGAGTCGTGATTATA" -A96_cell_key1 = ("GTCGCTATA","CTTGTACTA","CTTCACATA","ACACGCCGG","CGGTCCAGG","AATCGAATG","CCTAGTATA","ATTGGCTAA","AAGACATGC","AAGGCGATC", - "GTGTCCTTA","GGATTAGGA","ATGGATCCA","ACATAAGCG","AACTGTATT","ACCTTGCGG","CAGGTGTAG","AGGAGATTA","GCGATTACA","ACCGGATAG", - "CCACTTGGA","AGAGAAGTT","TAAGTTCGA","ACGGATATT","TGGCTCAGA","GAATCTGTA","ACCAAGGAC","AGTATCTGT","CACACACTA","ATTAAGTGC", - "AAGTAACCC","AAATCCTGT","CACATTGCA","GCACTGTCA","ATACTTAGG","GCAATCCGA","ACGCAATCA","GAGTATTAG","GACGGATTA","CAGCTGACA", - "CAACATATT","AACTTCTCC","CTATGAAAT","ATTATTACC","TACCGAGCA","TCTCTTCAA","TAAGCGTTA","GCCTTACAA","AGCACACAG","ACAGTTCCG", - "AGTAAAGCC","CAGTTTCAC","CGTTACTAA","TTGTTCCAA","AGAAGCACT","CAGCAAGAT","CAAACCGCC","CTAACTCGC","AATATTGGG","AGAACTTCC", - "CAAAGGCAC","AAGCTCAAC","TCCAGTCGA","AGCCATCAC","AACGAGAAG","CTACAGAAC","AGAGCTATG","GAGGATGGA","TGTACCTTA","ACACACAAA", - "TCAGGAGGA","GAGGTGCTA","ACCCTGACC","ACAAGGATC","ATCCCGGAG","TATGTGGCA","GCTGCCAAT","ATCAGAGCT","TCGAAGTGA","ATAGACGAG", - "AGCCCAATC","CAGAATCGT","ATCTCCACA","ACGAAAGGT","TAGCTTGTA","ACACGAGAT","AACCGCCTC","ATTTAGATG","CAAGCAAGC","CAAAGTGTG", - "GGCAAGCAA","GAGCCAATA","ATGTAATGG","CCTGAGCAA","GAGTACATT","TGCGATCTA" - ) - -A96_cell_key2 = ("TACAGGATA","CACCAGGTA","TGTGAAGAA","GATTCATCA","CACCCAAAG","CACAAAGGC","GTGTGTCGA","CTAGGTCCT","ACAGTGGTA","TCGTTAGCA", - "AGCGACACC","AAGCTACTT","TGTTCTCCA","ACGCGAAGC","CAGAAATCG","ACCAAAATG","AGTGTTGTC","TAGGGATAC","AGGGCTGGT","TCATCCTAA", - "AATCCTGAA","ATCCTAGGA","ACGACCACC","TTCCATTGA","TAGTCTTGA","ACTGTTAGA","ATTCATCGT","ACTTCGAGC","TTGCGTACA","CAGTGCCCG", - "GACACTTAA","AGGAGGCGC","GCCTGTTCA","GTACATCTA","AATCAGTTT","ACGATGAAT","TGACAGACA","ATTAGGCAT","GGAGTCTAA","TAGAACACA", - "AAATAAATA","CCGACAAGA","CACCTACCC","AAGAGTAGA","TCATTGAGA","GACCTTAGA","CAAGACCTA","GGAATGATA","AAACGTACC","ACTATCCTC", - "CCGTATCTA","ACACATGTC","TTGGTATGA","GTGCAGTAA","AGGATTCAA","AGAATGGAG","CTCTCTCAA","GCTAACTCA","ATCAACCGA","ATGAGTTAC", - "ACTTGATGA","ACTTTAACT","TTGGAGGTA","GCCAATGTA","ATCCAACCG","GATGAACTG","CCATGCACA","TAGTGACTA","AAACTGCGC","ATTACCAAG", - "CACTCGAGA","AACTCATTG","CTTGCTTCA","ACCTGAGTC","AGGTTCGCT","AAGGACTAT","CGTTCGGTA","AGATAGTTC","CAATTGATC","GCATGGCTA", - "ACCAGGTGT","AGCTGCCGT","TATAGCCCT","AGAGGACCA","ACAATATGG","CAGCACTTC","CACTTATGT","AGTGAAAGG","AACCCTCGG","AGGCAGCTA", - "AACCAAAGT","GAGTGCGAA","CGCTAAGCA","AATTATAAC","TACTAGTCA","CAACAACGG" - ) - -A96_cell_key3 = ("AAGCCTTCT","ATCATTCTG","CACAAGTAT","ACACCTTAG","GAACGACAA","AGTCTGTAC","AAATTACAG","GGCTACAGA","AATGTATCG","CAAGTAGAA", - "GATCTCTTA","AACAACGCG","GGTGAGTTA","CAGGGAGGG","TCCGTCTTA","TGCATAGTA","ACTTACGAT","TGTATGCGA","GCTCCTTGA","GGCACAACA", - "CTCAAGACA","ACGCTGTTG","ATATTGTAA","AAGTTTACG","CAGCCTGGC","CTATTAGCC","CAAACGTGG","AAAGTCATT","GTCTTGGCA","GATCAGCGA", - 
"ACATTCGGC","AGTAATTAG","TGAAGCCAA","TCTACGACA","CATAACGTT","ATGGGACTC","GATAGAGGA","CTACATGCG","CAACGATCT","GTTAGCCTA", - "AGTTGCATC","AAGGGAACT","ACTACATAT","CTAAGCTTC","ACGAACCAG","TACTTCGGA","AACATCCAT","AGCCTGGTT","CAAGTTTCC","CAGGCATTT", - "ACGTGGGAG","TCTCACGGA","GCAACATTA","ATGGTCCGT","CTATCATGA","CAATACAAG","AAAGAGGCC","GTAGAAGCA","GCTATGGAA","ACTCCAGGG", - "ACAAGTGCA","GATGGTCCA","TCCTCAATA","AATAAACAA","CTGTACGGA","CTAGATAGA","AGCTATGTG","AAATGGAGG","AGCCGCAAG","ACAGTAAAC", - "AACGTGTGA","ACTGAATTC","AAGGGTCAG","TGTCTATCA","TCAGATTCA","CACGATCCG","AACAGAAAC","CATGAATGA","CGTACTACG","TTCAGCTCA", - "AAGGCCGCA","GGTTGGACA","CGTCTAGGT","AATTCGGCG","CAACCTCCA","CAATAGGGT","ACAGGCTCC","ACAACTAGT","AGTTGTTCT","AATTACCGG", - "ACAAACTTT","TCTCGGTTA","ACTAGACCG","ACTCATACG","ATCGAGTCT","CATAGGTCA" - ) - -B384_cell_key1 = ("TGTGTTCGC","TGTGGCGCC","TGTCTAGCG","TGGTTGTCC","TGGTTCCTC","TGGTGTGCT","TGGCGACCG","TGCTGTGGC","TGCTGGCAC","TGCTCTTCC", - "TGCCTCACC","TGCCATTAT","TGATGTCTC","TGATGGCCT","TGATGCTTG","TGAAGGACC","TCTGTCTCC","TCTGATTAT","TCTGAGGTT","TCTCGTTCT", - "TCTCATCCG","TCCTGGATT","TCAGCATTC","TCACGCCTT","TATGTGCAC","TATGCGGCC","TATGACGAG","TATCTCGTG","TATATGACC","TAGGCTGTG", - "TACTGCGTT","TACGTGTCC","TAATCACAT","GTTGTGTTG","GTTGTGGCT","GTTGTCTGT","GTTGTCGAG","GTTGTCCTC","GTTGTATCC","GTTGGTTCT", - "GTTGGCGTT","GTTGGAGCG","GTTGCTGCC","GTTGCGCAT","GTTGCAGGT","GTTGCACTG","GTTGATGAT","GTTGATACG","GTTGAAGTC","GTTCTGTGC", - "GTTCTCTCG","GTTCTATAT","GTTCGTATG","GTTCGGCCT","GTTCGCGGC","GTTCGATTC","GTTCCGGTT","GTTCCGACG","GTTCACGCT","GTTATCACC", - "GTTAGTCCG","GTTAGGTGT","GTTAGAGAC","GTTAGACTT","GTTACCTCT","GTTAATTCC","GTTAAGCGC","GTGTTGCTT","GTGTTCGGT","GTGTTCCAG", - "GTGTTCATC","GTGTCACAC","GTGTCAAGT","GTGTACTGC","GTGGTTAGT","GTGGTACCG","GTGGCGATC","GTGCTTCTG","GTGCGTTCC","GTGCGGTAT", - "GTGCGCCTT","GTGCGAACT","GTGCAGCCG","GTGCAATTG","GTGCAAGGC","GTCTTGCGC","GTCTGGCCG","GTCTGAGGC","GTCTCAGAT","GTCTCAACC", - "GTCTATCGT","GTCGGTGTG","GTCGGAATC","GTCGCTCCG","GTCCTCGCC","GTCCTACCT","GTCCGCTTG","GTCCATTCT","GTCCAATAC","GTCATGTAT", - - "GTCAGTGGT","GTCAGATAG","GTATTAACT","GTATCAGTC","GTATAGCCT","GTATACTTG","GTATAAGGT","GTAGCATCG","GTACCGTCC","GTACACCTC", - "GTAAGTGCC","GTAACAGAG","GGTTGTGTC","GGTTGGCTG","GGTTGACGC","GGTTCGTCG","GGTTCAGTT","GGTTATATT","GGTTAATAC","GGTGTACGT", - "GGTGCCGCT","GGTGCATGC","GGTCGTTGC","GGTCGAGGT","GGTAGGCAC","GGTAGCTTG","GGTACATAG","GGTAATCTG","GGCTTGGCC","GGCTTCACG", - "GGCTTATGT","GGCTTACTC","GGCTGTCTT","GGCTCTGTG","GGCTCCGGT","GGCTCACCT","GGCGTTGAG","GGCGTGTAC","GGCGTGCTG","GGCGTATCG", - "GGCGCTCGT","GGCGCTACC","GGCGAGCCT","GGCGAGATC","GGCGACTTG","GGCCTCTTC","GGCCTACAG","GGCCAGCGC","GGCCAACTT","GGCATTCCT", - "GGCATCCGC","GGCATAACC","GGCAACGAT","GGATGTCCG","GGATGAGAG","GGATCTGGC","GGATCCATG","GGATAGGTT","GGAGTCGTG","GGAGAAGGC", - "GGACTCCTT","GGACTAGTC","GGACCGTTG","GGAATTAGT","GGAATCTCT","GGAATCGAC","GGAAGCCTC","GCTTGTAGC","GCTTGACCG","GCTTCGGAC", - "GCTTCACAT","GCTTAGTCT","GCTGGATAT","GCTGGAACC","GCTGCGATG","GCTGATCAG","GCTGAGCGT","GCTCTTGTC","GCTCTCCTG","GCTCGGTCC", - "GCTCCAATT","GCTATTCGC","GCTATGAGT","GCTAGTGTT","GCTAGGATC","GCTAGCACT","GCTACGTAT","GCTAACCTT","GCGTTCCGC","GCGTGTGCC", - "GCGTGCATT","GCGTCGGTT","GCGTATGTG","GCGTATACT","GCGGTTCAC","GCGGTCTTG","GCGGCGTCG","GCGGCACCT","GCGCTGGAC","GCGCTCTCC", - - "GCGCGGCAG","GCGCGATAC","GCGCCGACC","GCGAGCGAG","GCGAGAGGT","GCGAATTAC","GCCTTGCAT","GCCTGCGCT","GCCTAACTG","GCCGTCCGT", - "GCCGCTGTC","GCCATGCCG","GCCAGCTAT","GCCAACCAG","GCATGGTTG","GCATCGACG","GCAGGCTAG","GCAGGACGC","GCAGCCATC","GCAGATACC", - 
"GCAGACGTT","GCACTATGT","GCACACGAG","GATTGTCAT","GATTGGTAG","GATTGCACC","GATTCTACT","GATTCGCTT","GATTAGGCC","GATTACGGT", - "GATGTTGGC","GATGTTATG","GATGGCCAG","GATCGTTCG","GATCGGAGC","GATCGCCTC","GATCCTCTG","GATCCAGCG","GATACACGC","GAGTTACCT", - "GAGTCGTAT","GAGTCGCCG","GAGGTGTAG","GAGGCATTG","GAGCGGACG","GAGCCTGAG","GAGATCTGT","GAGATAATT","GAGACGGCT","GACTTCGTG", - "GACTGTTCT","GACTCTTAG","GACCGCATT","GAATTGAGC","GAATATTGC","GAAGGCTCT","GAAGAGACT","GAACTGCCG","GAACGCGTG","CTTGTGTAT", - "CTTGTGCGC","CTTGTCATG","CTTGGTCTT","CTTGGTACC","CTTGGATGT","CTTGCTCAC","CTTGCAATC","CTTGAGGCC","CTTGACGGT","CTTCTGATC", - "CTTCTCGTT","CTTCTAGGC","CTTCGTTAG","CTTATGTCC","CTTATGCTT","CTTATATAG","CTTAGGTTG","CTTAGGAGC","CTTACTTAT","CTGTTCTCG", - "CTGTGCCTC","CTGTCGCAT","CTGTCGAGC","CTGTAGCTG","CTGTACGTT","CTGCTTGCC","CTGCGTAGT","CTGCACACC","CTGATGGAT","CTGAGTCAT", - "CTGACGCCG","CTGAACGAG","CTCTTGTAG","CTCTTAGTT","CTCTTACCG","CTCTGCACC","CTCTCGTCC","CTCGTATTG","CTCGACTAT","CTCCTGACG", - - "CTCACTAGC","CTATACGGC","CGTTCGCTC","CGTTCACCG","CGTATAGTT","CGGTGTTCC","CGGTGTCAG","CGGTCCTGC","CGGCGACTC","CGGCACGGT", - "CGGATAGCC","CGGAGAGAT","CGCTAATAG","CGCGTTGGC","CGCGCAGAG","CGCACTGCC","CCTTGTCTC","CCTTGGCGT","CCTTCTGAG","CCTTCTCCT", - "CCTTCGACC","CCTTACTTG","CCTGTTCGT","CCTGTATGC","CCTCGGCCG","CCGTTAATT","CCATGTGCG","CCAGTGGTT","CCAGGCATT","CCAGGATCC", - "CCAGCGTTG","CATTCCGAT","CATTATACC","CATGTTGAG","ATTGCGTGT","ATTGCGGAC","ATTGCGCCG","ATTGACTTG","ATTCGGCTG","ATTCGCGAG", - "ATTCCAAGT","ATTATCTTC","ATTACTGTT","ATTACACTC","ATGTTCTAT","ATGTTACGC","ATGTGTATC","ATGTGGCAG","ATGTCTGTG","ATGGTGCAT", - "ATGCTTACT","ATGCTGTCC","ATGCTCGGC","ATGAGGTTC","ATGAGAGTG","ATCTTGGCT","ATCTGTGCG","ATCGGTTCC","ATCATGCTC","ATCATCACT", - "ATATCTTAT","ATAGGCGCC","AGTTGGTAT","AGTTGAGCC","AGTGCGACC","AGGTGCTAC","AGGCTTGCG","AGGCCTTCC","AGGCACCTT","AGGAATATG", - "AGCGGCCAG","AGCCTGGTC","AGCCTGACT","AGCAATCCG","AGAGATGTT","AGAGAATTC","ACTCGCTTG","ACTCGACCT","ACGTACACC","ACGGATGGT", - "ACCAGTCTG","ACATTCGGC","ACATGAGGT","ACACTAATT" - ) - -B384_cell_key2 = ("TTGTGTTGT","TTGTGGTAG","TTGTGCGGA","TTGTCTGTT","TTGTCTAAG","TTGTCATAT","TTGTCACGA","TTGTATGAA","TTGTACAGT","TTGGTTAAT", - "TTGGTGCAA","TTGGTCGAG","TTGGTATTA","TTGGCACAG","TTGGATACA","TTGGAAGTG","TTGCGGTTA","TTGCCATTG","TTGCACGCG","TTGCAAGGT", - "TTGATGTAT","TTGATAATT","TTGAGACGT","TTGACTACT","TTGACCGAA","TTCTGGTCT","TTCTGCACA","TTCTCCTTA","TTCTCCGCT","TTCTAGGTA", - "TTCTAATCG","TTCGTCGTA","TTCGTAGAT","TTCGGCTTG","TTCGGAATA","TTCGCCAGA","TTCGATTGT","TTCGATCAG","TTCCTCGGT","TTCCGGCAG", - "TTCCGCATT","TTCCAATTA","TTCATTGAA","TTCATGCTG","TTCAGGAGT","TTCACTATA","TTCAACTCT","TTCAACGTG","TTATGCGTT","TTATGATTG", - "TTATCCTGT","TTATCCGAG","TTATATTAT","TTAGGCGCG","TTACTGGAA","TTACTAGTT","TTACGTGGT","TTACGATAT","TTACCTAGA","TTACATGAG", - "TTACAGCGT","TTACACGGA","TTACACACT","TTAATCAGT","TTAATAGGA","TTAAGTGTG","TTAACCTTG","TTAACACAA","TGTTCACTT","TGTTCAAGA", - "TGTTAAGTG","TGTGTTATG","TGTGTCCAA","TGTGGAGCG","TGTCAGTTA","TGTCAGAAG","TGGTTAGTT","TGGTTACAA","TGGCGTTAT","TGGCGCCAA", - "TGGAGTCTT","TGCGTATTG","TGATAGAGA","TGAGGTATT","TGAGAATCT","TCTTGGTAA","TCTTCATAG","TCTGTCCTT","TCTGGAATT","TCTACCGCG", - "TCGTTCGAA","TCGTCAGTG","TCGACGAGA","TCATGGCTT","TCACACTTA","TATTCCGAA","TATTATGGT","TATGCTATT","TATCAAGGA","TAGTTCAAT", - - "TAGCTGCTT","TAGAGGAAG","TACCTGTTA","TACACCTGT","GTTGTGCGT","GTTGGCTAT","GTTGCCAAG","GTTGACCTT","GTTCTGCTA","GTTCTGAAT", - "GTTCTATCA","GTTCGCGTG","GTTCCTTAT","GTTAGCAGT","GTTACTGTG","GTTACTCAA","GTTAAGAGA","GTTAACTTA","GTGTCGGCA","GTGTCCATT", - 
"GTGCTTGAG","GTGCTCGTT","GTGCTCACA","GTGCCTGGA","GTCTTGTCG","GTCTTGATT","GTCTTCCGT","GTCTTAAGA","GTCTCATCT","GTCTACGAG", - "GTCGTTGCT","GTCGTGTTA","GTCGGTAAT","GTCGGATGT","GTCGAGCTG","GTCCGGACT","GTCCAACAT","GTCAGACGA","GTCAGAATT","GTCACTCTT", - "GTCAAGGAA","GTATGTCTT","GTATGTACA","GTATCGGTT","GTATATGTA","GTATACAAT","GTAGTTAAG","GTAGTCGAT","GTAGCCTTA","GTAGATACT", - "GTACGATTA","GTACAGTCT","GTAATTCGT","GCTTGGCAG","GCTTGCTTG","GCTTGAGGA","GCTTCATTA","GCTTATGCG","GCTGTGTAG","GCTGTCATG", - "GCTGGTTGT","GCTGGACTG","GCTGCCTAA","GCTGATATT","GCTCTTAGT","GCTCTATTG","GCTCGCCGT","GCTCCGCTG","GCTATTCTG","GCTATACGA", - "GCTACTAAG","GCTACATGT","GCTAACTCT","GCGTTGTAA","GCGTTCTCT","GCGTGCGTA","GCGTCTTGA","GCGTCCGAT","GCGTAAGAG","GCGCTTACG", - "GCGCGGATT","GCGCCATAT","GCGCATGAA","GCGATCAAT","GCGAGCCTT","GCGAGATTG","GCGAGAACA","GCCTTGGTA","GCCTTCTAG","GCCTTCACA", - "GCCTGAGTG","GCCTCACGT","GCCGGCGAA","GCCGCACAA","GCCATGCTT","GCCATATAT","GCCAATTCG","GCATTCGTT","GCATGATGT","GCAGTTGGA", - - "GCAGTGTCT","GCACTTGTG","GCAATCTGT","GCAACACTT","GATTGTATT","GATTGCGAG","GATTCCAGT","GATTCATAT","GATTATCAG","GATTAGGTT", - "GATGTTGCG","GATGGATCT","GATGCTGAT","GATGCCTTG","GATCTCCTT","GATCGCTTA","GATATTGAA","GATATTACT","GAGTGTTAT","GAGCTCAGT", - "GAGCGTGCT","GAGCGTCGA","GAGCGGTTG","GAGCGACTT","GAGCCGAAT","GAGATAGAT","GAGACCTAT","GACGGTCGT","GACGCAGGT","GACGATATG", - "GACCTATCT","GAATTAGGA","GAATCAGCT","GAAGTTCAT","GAAGTGGTT","GAAGTATTG","GAAGGCATT","GAACGCTGT","CTTGTCCAG","CTTGGATTG", - "CTTGCTGAA","CTTGCCGTG","CTTGATTCT","CTTCTGTCG","CTTCGGCGT","CTTATGAGT","CTTACCGAT","CTGTTAGGT","CTGTCGTCT","CTGTATAAT", - "CTGGCTCAT","CTGGATGCG","CTGCGTGTG","CTGCGCGGT","CTGCCGATT","CTGCATTGT","CTGATTAAG","CTGAGATAT","CTGACCTGT","CTCGTATCT", - "CTCGGCAAG","CTCGCAATT","CTCCTGCTT","CTCCTAAGT","CTCCGGATG","CTCCGAGCG","CTCACAGGT","CTATTCTAT","CTATTAGTG","CTATGAATT", - "CTACATATT","CGTGGCATT","CGTCTTAAT","CGTCTGGTT","CGTCACTGT","CGTAGGTCT","CGGTTCGAG","CGGTTCATT","CGGTGCTCT","CGGTAATTG", - "CGGCCTGAT","CGGATATAG","CGGAATATT","CGCTCCAAT","CGCGTTCGT","CGCAGGTTG","CGAGGATGT","CGAGCTGTT","CGACGGCTT","CCTTGTGTG", - "CCTGTCTCA","CCTGACTAT","CCTACCTTG","CCGTAGATT","CCGGCTGGT","CATCGGACG","CATCGATAA","CATCCTTCT","CAGTTCTGT","CAGTGCCAG", - - "CAGGCACTG","CAGCCTCTT","CACTTATAT","CACTGGTCG","CACTGCATG","CACGCGTTG","CACGATGTT","CACCATCTG","CACAGGCGT","ATTGTACAA", - "ATTGGTATG","ATTGCTAAT","ATTGCATAG","ATTGCAGTT","ATTCTGCAG","ATTCTACGT","ATTCGGATT","ATTCCGTTG","ATTCATCAA","ATTCAAGAG", - "ATTAGCCTT","ATTAATATT","ATGTTAGAG","ATGTTAACT","ATGTAGTCG","ATGGTGTAG","ATGGATTAT","ATCTTGAAG","ATCTGATAT","ATCTCAGAA", - "ATCGCTCAA","ATCGCGTCG","ATCCATGGT","ATCATGAGA","ATCATAGTT","ATCAGCGAG","ATCACCATT","ATAGTAATT","ATAGCTGTG","ATACTCTCG", - "ATACCTCAT","AGTTGCGCG","AGTTGAATT","AGTTATGAT","AGTGTCCGT","AGTGGCTTG","AGTGCTTCT","AGTATCATT","AGTACACAA","AGGTATGCG", - "AGGTATAGT","AGGCTACTT","AGGCCAGGT","AGGAGCGAT","AGCTTATAG","AGCTCTAGA","AGCGTGTAT","AGCGTCACA","AGCCTTCAT","AGCCTGTCG", - "AGCCTCGAG","AGCACTGAA","AGATGTACG","AGAGTTAAT","AGACCTCTG","ACTTCTATA","ACTGTCGAG","ACTGTATGT","ACTCTGTAA","ACTCGCGAA", - "ACTAGATCT","ACTAACGTT","ACGTTACTG","ACGTGGAAT","ACGGACTCT","ACGCCTAAT","ACGCCGTTA","ACGACGTGT","ACCTCGCAT","ACCATCATA", - "ACATATATT","ACAGGCACA","ACACCTGAG","ACACATTCT" - ) - -B384_cell_key3 = ("TTGTGGCTG","TTGTGGAGT","TTGTGCGAC","TTGTCTTCA","TTGTAAGAT","TTGGTTCTG","TTGGTGCGT","TTGGTCTAC","TTGGTAACT","TTGGCGTGC", - "TTGGATTAG","TTGGAGACG","TTGGAATCA","TTGCGGCGA","TTGCGCTCG","TTGCCTTAC","TTGCCGGAT","TTGCATGCT","TTGCACGTC","TTGCACCAT", - 
"TTGAACCTG","TTCTCGCGT","TTCTCAACT","TTCTACTCA","TTCGTCCAT","TTCGGATAC","TTCGGACGT","TTCGCAATC","TTCCGGTGC","TTCCGACTG", - "TTCATTATG","TTCATGGAT","TTCAGCGCA","TTCACCTCG","TTCAAGCAG","TTCAACTAC","TTATGCCAG","TTATGCATC","TTATCGTAC","TTATACCTA", - "TTATAATAG","TTATAAGTC","TTAGTTAGC","TTAGCTCAT","TTAGCACTA","TTAGATATG","TTACTACGA","TTACCGTCA","TTACAGAGC","TTAATTGCA", - "TTAACAGAT","TGTTGGCTA","TGTTGATGA","TGTTAAGCT","TGTGGCCGA","TGTGCTAGC","TGTGCGTCA","TGTCGCAGT","TGTCGAGCA","TGTACAACG", - "TGGTTCCGA","TGGTTCACT","TGGTCAAGT","TGGCTTGTA","TGGCTGTCG","TGGCGTATG","TGGCGCGCT","TGGATGTAC","TGGACTTGC","TGGAATACT", - "TGCTAGCGA","TGCGTTGCT","TGCGGTCTG","TGCGCTTAG","TGCGCGACG","TGCCTGCAT","TGCCTAGAC","TGCACGAGT","TGAGTGTGC","TGAGGCTCG", - "TCTTCCGTC","TCTTATAGT","TCTTACCAT","TCTGTTGTC","TCTGTTACT","TCTGGCTAG","TCTCAGATC","TCTAGTTGA","TCTAGTACG","TCGTACTAC", - "TCGGTGTAG","TCGGCTGCT","TCGCTACTG","TCGATCACG","TCGAGGCAT","TCCGGCGTC","TCCGGAGCT","TCCGCTCGT","TCCGAGTAC","TCCATTCAT", - - "TCCATGGTC","TCCAAGTCG","TCATTACGT","TCATGCACT","TCAGGTTGC","TCAGACCGT","TCACTCAGT","TCAAGCTCA","TATTGCGCA","TATTCGGCT", - "TATTCCAGC","TATTCATCA","TATGTTCAG","TATGGTATG","TATGCAAGT","TATCTGGTC","TATCTGACT","TATCCAGAT","TATCAGTCG","TATCACGCT", - "TAGGCGCGA","TAGGCACAT","TAGGATCGT","TAGCATTGC","TAGAGTTAC","TAGACTGAT","TACTTGTCG","TACGTCCGA","TACCGTACT","TACCGCGAT", - "TACCAGGAC","TACAGAAGT","TAAGTGCAT","TAAGCTACT","GTTGACCGA","GTTCTCGAC","GTTCCTGCT","GTTATGATG","GTGCTTGCA","GTGCCGCGT", - "GTATTGCTG","GTATTCCGA","GTATTAAGC","GTATGACGT","GTAGTTGTC","GTAGTACAT","GTAGCTCGA","GGTTGCTCA","GGTTGAGTA","GGTTAACGT", - "GGTGTGGCA","GGTCTTCAG","GGTCGTCTA","GGTCGGCGT","GGTCCGACT","GGTCATGTC","GGTCACATG","GGTAGTGCT","GGTAGCGTC","GGTACCAGT", - "GGTAAGGAT","GGCTTGTGC","GGCTTGACT","GGCTTACGA","GGCTGTAGT","GGCTGGCAG","GGCTCCATC","GGCGTGGAT","GGCGTAATC","GGCGCAAGT", - "GGCGAGTAG","GGCGACCGT","GGCCTGTCA","GGCCATTGC","GGCACTCTG","GGATGTCAT","GGAGTAACT","GGAGAACGA","GGACTGGCT","GGACGTTCA", - "GGAACGTGC","GCTGTCCAT","GCTGGTTCA","GCTGCAACT","GCTCGTTAC","GCTATAGAT","GCTAGTCGT","GCTACCATG","GCGTTCTGA","GCGTGTTAG", - "GCGGTATCG","GCGGAGCAT","GCGCGGTGC","GCGCCTAGT","GCGCCGGCT","GCCTTCATG","GCCATACTG","GCATGTTGA","GCATGCTAC","GCAGTATAC", - - "GCAGGTACT","GCAGCGCGT","GCACCTCAT","GCAATTCGA","GATTGCCGT","GATGAACAT","GATCTTCGA","GATCTGCAT","GAGTGGCAT","GAGTCGGAC", - "GAGTATGAT","GAGGCGAGT","GAGGCAACG","GAGCGCACT","GAATAGGCT","ATTGTCACT","ATTGTATCA","ATTGGTCAG","ATTGGCGAT","ATTGATCGT", - "ATTCGTAGT","ATTCATACG","ATTCAGGAC","ATTACTTCA","ATTAATTAG","ATTAAGCAT","ATGTCTCTA","ATGTAGCGT","ATGGCATAC","ATGGAGATC", - "ATGGACTCG","ATGGAACGA","ATGCTTCAT","ATGCTCGCT","ATGCGACGT","ATGCCGTAG","ATGAGTTCG","ATGACTATC","ATGACCGAC","ATCTTATGC", - "ATCTTACTA","ATCTATCAG","ATCGTGTAC","ATCGTCTGA","ATCGGCATG","ATCGCGAGC","ATCGCAACG","ATCGATGCT","ATCGAATAG","ATCCTTCTG", - "ATCCTGCGT","ATCCGCACT","ATCCATTAC","ATCCAAGCA","ATCAGATCA","ATCACACAT","ATCAACGTC","ATCAACCGA","ATATTGAGT","ATATTCGTC", - "ATATTACAG","ATATCTTGA","ATATCGCAT","ATATCAATC","ATAGTCCTG","ATAGGTCTA","ATAGCTGAC","ATAGCGGTA","AGTTCGCTG","AGTTACAGC", - "AGTTAACTA","AGTGCAATC","AGTCTGGTA","AGTCTGAGC","AGTCTACAT","AGTCGAACT","AGTCCATCG","AGTCATTCA","AGTATCCAG","AGTAGACTG", - "AGTAATCGA","AGTAAGTGC","AGGTTGGCT","AGGTTCTAG","AGGTGTTCA","AGGTGCCAT","AGGTCTGAT","AGGTCGTAC","AGGTCAGCA","AGGCTTATC", - "AGGCTATGA","AGGCCGACG","AGGCCAAGC","AGGCAGGTC","AGGCAAGAT","AGGAGCAGT","AGGACCGCT","AGGAATTAC","AGCTTGGAC","AGCTTAAGT", - - 
"AGCTACACG","AGCGTTACG","AGCGGTGCA","AGCGGAGTC","AGCGGACGA","AGCGCGCTA","AGCGATAGC","AGCGACTCA","AGCCTCTAC","AGCCGTCGT", - "AGCATGATC","AGCACTTCG","AGCACGGCA","AGATTCTGA","AGATTAGAT","AGATGATAG","AGATATGTA","AGATACCGT","AGAGTGCGT","AGAGCCGAT", - "AGACTCACT","ACTTGCCTA","ACTTGAGCA","ACTTCTAGC","ACTTCGACT","ACTTAGTAC","ACTGTTGAT","ACTGTAACG","ACTGGTATC","ACTGACGTC", - "ACTGAAGCT","ACTCTGATG","ACTCCTGAC","ACTCCGCTA","ACTCAACTG","ACTATTGCA","ACTAGGCAG","ACTACGCGT","ACTAATACT","ACGTTCGTA", - "ACGTGTGCT","ACGTGTATG","ACGTGGAGC","ACGTCTTCG","ACGTCAGTC","ACGGTCTCA","ACGGTCCGT","ACGGTACAG","ACGGCGCTG","ACGCTGCGA", - "ACGCGTGTA","ACGCGCCAG","ACGATGTCG","ACGATGGAT","ACGATCTAC","ACGAGCTGA","ACGAGCATC","ACGAATCGT","ACGAACGCA","ACCTTGTAG", - "ACCTGTTGC","ACCTGTCAT","ACCTCGATC","ACCTAGGTA","ACCTACTGA","ACCTAATCG","ACCGTAGCA","ACCGGTAGT","ACCGGCTAC","ACCGCTTCA", - "ACATTGTGC","ACATTCTCG","ACATGGCTG","ACATGACGA","ACATATGAT","ACATATACG","ACAGCGTAC","ACACTTGCT","ACACTATCA","ACACGCATG", - "ACACCAGTA","ACACCAACT","ACACATAGT","ACACACCTA" - ) +A96_cell_key1 = ( + "GTCGCTATA", + "CTTGTACTA", + "CTTCACATA", + "ACACGCCGG", + "CGGTCCAGG", + "AATCGAATG", + "CCTAGTATA", + "ATTGGCTAA", + "AAGACATGC", + "AAGGCGATC", + "GTGTCCTTA", + "GGATTAGGA", + "ATGGATCCA", + "ACATAAGCG", + "AACTGTATT", + "ACCTTGCGG", + "CAGGTGTAG", + "AGGAGATTA", + "GCGATTACA", + "ACCGGATAG", + "CCACTTGGA", + "AGAGAAGTT", + "TAAGTTCGA", + "ACGGATATT", + "TGGCTCAGA", + "GAATCTGTA", + "ACCAAGGAC", + "AGTATCTGT", + "CACACACTA", + "ATTAAGTGC", + "AAGTAACCC", + "AAATCCTGT", + "CACATTGCA", + "GCACTGTCA", + "ATACTTAGG", + "GCAATCCGA", + "ACGCAATCA", + "GAGTATTAG", + "GACGGATTA", + "CAGCTGACA", + "CAACATATT", + "AACTTCTCC", + "CTATGAAAT", + "ATTATTACC", + "TACCGAGCA", + "TCTCTTCAA", + "TAAGCGTTA", + "GCCTTACAA", + "AGCACACAG", + "ACAGTTCCG", + "AGTAAAGCC", + "CAGTTTCAC", + "CGTTACTAA", + "TTGTTCCAA", + "AGAAGCACT", + "CAGCAAGAT", + "CAAACCGCC", + "CTAACTCGC", + "AATATTGGG", + "AGAACTTCC", + "CAAAGGCAC", + "AAGCTCAAC", + "TCCAGTCGA", + "AGCCATCAC", + "AACGAGAAG", + "CTACAGAAC", + "AGAGCTATG", + "GAGGATGGA", + "TGTACCTTA", + "ACACACAAA", + "TCAGGAGGA", + "GAGGTGCTA", + "ACCCTGACC", + "ACAAGGATC", + "ATCCCGGAG", + "TATGTGGCA", + "GCTGCCAAT", + "ATCAGAGCT", + "TCGAAGTGA", + "ATAGACGAG", + "AGCCCAATC", + "CAGAATCGT", + "ATCTCCACA", + "ACGAAAGGT", + "TAGCTTGTA", + "ACACGAGAT", + "AACCGCCTC", + "ATTTAGATG", + "CAAGCAAGC", + "CAAAGTGTG", + "GGCAAGCAA", + "GAGCCAATA", + "ATGTAATGG", + "CCTGAGCAA", + "GAGTACATT", + "TGCGATCTA", +) + +A96_cell_key2 = ( + "TACAGGATA", + "CACCAGGTA", + "TGTGAAGAA", + "GATTCATCA", + "CACCCAAAG", + "CACAAAGGC", + "GTGTGTCGA", + "CTAGGTCCT", + "ACAGTGGTA", + "TCGTTAGCA", + "AGCGACACC", + "AAGCTACTT", + "TGTTCTCCA", + "ACGCGAAGC", + "CAGAAATCG", + "ACCAAAATG", + "AGTGTTGTC", + "TAGGGATAC", + "AGGGCTGGT", + "TCATCCTAA", + "AATCCTGAA", + "ATCCTAGGA", + "ACGACCACC", + "TTCCATTGA", + "TAGTCTTGA", + "ACTGTTAGA", + "ATTCATCGT", + "ACTTCGAGC", + "TTGCGTACA", + "CAGTGCCCG", + "GACACTTAA", + "AGGAGGCGC", + "GCCTGTTCA", + "GTACATCTA", + "AATCAGTTT", + "ACGATGAAT", + "TGACAGACA", + "ATTAGGCAT", + "GGAGTCTAA", + "TAGAACACA", + "AAATAAATA", + "CCGACAAGA", + "CACCTACCC", + "AAGAGTAGA", + "TCATTGAGA", + "GACCTTAGA", + "CAAGACCTA", + "GGAATGATA", + "AAACGTACC", + "ACTATCCTC", + "CCGTATCTA", + "ACACATGTC", + "TTGGTATGA", + "GTGCAGTAA", + "AGGATTCAA", + "AGAATGGAG", + "CTCTCTCAA", + "GCTAACTCA", + "ATCAACCGA", + "ATGAGTTAC", + "ACTTGATGA", + "ACTTTAACT", + "TTGGAGGTA", + "GCCAATGTA", + "ATCCAACCG", + "GATGAACTG", + "CCATGCACA", + "TAGTGACTA", + "AAACTGCGC", 
+ "ATTACCAAG", + "CACTCGAGA", + "AACTCATTG", + "CTTGCTTCA", + "ACCTGAGTC", + "AGGTTCGCT", + "AAGGACTAT", + "CGTTCGGTA", + "AGATAGTTC", + "CAATTGATC", + "GCATGGCTA", + "ACCAGGTGT", + "AGCTGCCGT", + "TATAGCCCT", + "AGAGGACCA", + "ACAATATGG", + "CAGCACTTC", + "CACTTATGT", + "AGTGAAAGG", + "AACCCTCGG", + "AGGCAGCTA", + "AACCAAAGT", + "GAGTGCGAA", + "CGCTAAGCA", + "AATTATAAC", + "TACTAGTCA", + "CAACAACGG", +) + +A96_cell_key3 = ( + "AAGCCTTCT", + "ATCATTCTG", + "CACAAGTAT", + "ACACCTTAG", + "GAACGACAA", + "AGTCTGTAC", + "AAATTACAG", + "GGCTACAGA", + "AATGTATCG", + "CAAGTAGAA", + "GATCTCTTA", + "AACAACGCG", + "GGTGAGTTA", + "CAGGGAGGG", + "TCCGTCTTA", + "TGCATAGTA", + "ACTTACGAT", + "TGTATGCGA", + "GCTCCTTGA", + "GGCACAACA", + "CTCAAGACA", + "ACGCTGTTG", + "ATATTGTAA", + "AAGTTTACG", + "CAGCCTGGC", + "CTATTAGCC", + "CAAACGTGG", + "AAAGTCATT", + "GTCTTGGCA", + "GATCAGCGA", + "ACATTCGGC", + "AGTAATTAG", + "TGAAGCCAA", + "TCTACGACA", + "CATAACGTT", + "ATGGGACTC", + "GATAGAGGA", + "CTACATGCG", + "CAACGATCT", + "GTTAGCCTA", + "AGTTGCATC", + "AAGGGAACT", + "ACTACATAT", + "CTAAGCTTC", + "ACGAACCAG", + "TACTTCGGA", + "AACATCCAT", + "AGCCTGGTT", + "CAAGTTTCC", + "CAGGCATTT", + "ACGTGGGAG", + "TCTCACGGA", + "GCAACATTA", + "ATGGTCCGT", + "CTATCATGA", + "CAATACAAG", + "AAAGAGGCC", + "GTAGAAGCA", + "GCTATGGAA", + "ACTCCAGGG", + "ACAAGTGCA", + "GATGGTCCA", + "TCCTCAATA", + "AATAAACAA", + "CTGTACGGA", + "CTAGATAGA", + "AGCTATGTG", + "AAATGGAGG", + "AGCCGCAAG", + "ACAGTAAAC", + "AACGTGTGA", + "ACTGAATTC", + "AAGGGTCAG", + "TGTCTATCA", + "TCAGATTCA", + "CACGATCCG", + "AACAGAAAC", + "CATGAATGA", + "CGTACTACG", + "TTCAGCTCA", + "AAGGCCGCA", + "GGTTGGACA", + "CGTCTAGGT", + "AATTCGGCG", + "CAACCTCCA", + "CAATAGGGT", + "ACAGGCTCC", + "ACAACTAGT", + "AGTTGTTCT", + "AATTACCGG", + "ACAAACTTT", + "TCTCGGTTA", + "ACTAGACCG", + "ACTCATACG", + "ATCGAGTCT", + "CATAGGTCA", +) + +B384_cell_key1 = ( + "TGTGTTCGC", + "TGTGGCGCC", + "TGTCTAGCG", + "TGGTTGTCC", + "TGGTTCCTC", + "TGGTGTGCT", + "TGGCGACCG", + "TGCTGTGGC", + "TGCTGGCAC", + "TGCTCTTCC", + "TGCCTCACC", + "TGCCATTAT", + "TGATGTCTC", + "TGATGGCCT", + "TGATGCTTG", + "TGAAGGACC", + "TCTGTCTCC", + "TCTGATTAT", + "TCTGAGGTT", + "TCTCGTTCT", + "TCTCATCCG", + "TCCTGGATT", + "TCAGCATTC", + "TCACGCCTT", + "TATGTGCAC", + "TATGCGGCC", + "TATGACGAG", + "TATCTCGTG", + "TATATGACC", + "TAGGCTGTG", + "TACTGCGTT", + "TACGTGTCC", + "TAATCACAT", + "GTTGTGTTG", + "GTTGTGGCT", + "GTTGTCTGT", + "GTTGTCGAG", + "GTTGTCCTC", + "GTTGTATCC", + "GTTGGTTCT", + "GTTGGCGTT", + "GTTGGAGCG", + "GTTGCTGCC", + "GTTGCGCAT", + "GTTGCAGGT", + "GTTGCACTG", + "GTTGATGAT", + "GTTGATACG", + "GTTGAAGTC", + "GTTCTGTGC", + "GTTCTCTCG", + "GTTCTATAT", + "GTTCGTATG", + "GTTCGGCCT", + "GTTCGCGGC", + "GTTCGATTC", + "GTTCCGGTT", + "GTTCCGACG", + "GTTCACGCT", + "GTTATCACC", + "GTTAGTCCG", + "GTTAGGTGT", + "GTTAGAGAC", + "GTTAGACTT", + "GTTACCTCT", + "GTTAATTCC", + "GTTAAGCGC", + "GTGTTGCTT", + "GTGTTCGGT", + "GTGTTCCAG", + "GTGTTCATC", + "GTGTCACAC", + "GTGTCAAGT", + "GTGTACTGC", + "GTGGTTAGT", + "GTGGTACCG", + "GTGGCGATC", + "GTGCTTCTG", + "GTGCGTTCC", + "GTGCGGTAT", + "GTGCGCCTT", + "GTGCGAACT", + "GTGCAGCCG", + "GTGCAATTG", + "GTGCAAGGC", + "GTCTTGCGC", + "GTCTGGCCG", + "GTCTGAGGC", + "GTCTCAGAT", + "GTCTCAACC", + "GTCTATCGT", + "GTCGGTGTG", + "GTCGGAATC", + "GTCGCTCCG", + "GTCCTCGCC", + "GTCCTACCT", + "GTCCGCTTG", + "GTCCATTCT", + "GTCCAATAC", + "GTCATGTAT", + "GTCAGTGGT", + "GTCAGATAG", + "GTATTAACT", + "GTATCAGTC", + "GTATAGCCT", + "GTATACTTG", + "GTATAAGGT", + "GTAGCATCG", + "GTACCGTCC", + "GTACACCTC", + 
"GTAAGTGCC", + "GTAACAGAG", + "GGTTGTGTC", + "GGTTGGCTG", + "GGTTGACGC", + "GGTTCGTCG", + "GGTTCAGTT", + "GGTTATATT", + "GGTTAATAC", + "GGTGTACGT", + "GGTGCCGCT", + "GGTGCATGC", + "GGTCGTTGC", + "GGTCGAGGT", + "GGTAGGCAC", + "GGTAGCTTG", + "GGTACATAG", + "GGTAATCTG", + "GGCTTGGCC", + "GGCTTCACG", + "GGCTTATGT", + "GGCTTACTC", + "GGCTGTCTT", + "GGCTCTGTG", + "GGCTCCGGT", + "GGCTCACCT", + "GGCGTTGAG", + "GGCGTGTAC", + "GGCGTGCTG", + "GGCGTATCG", + "GGCGCTCGT", + "GGCGCTACC", + "GGCGAGCCT", + "GGCGAGATC", + "GGCGACTTG", + "GGCCTCTTC", + "GGCCTACAG", + "GGCCAGCGC", + "GGCCAACTT", + "GGCATTCCT", + "GGCATCCGC", + "GGCATAACC", + "GGCAACGAT", + "GGATGTCCG", + "GGATGAGAG", + "GGATCTGGC", + "GGATCCATG", + "GGATAGGTT", + "GGAGTCGTG", + "GGAGAAGGC", + "GGACTCCTT", + "GGACTAGTC", + "GGACCGTTG", + "GGAATTAGT", + "GGAATCTCT", + "GGAATCGAC", + "GGAAGCCTC", + "GCTTGTAGC", + "GCTTGACCG", + "GCTTCGGAC", + "GCTTCACAT", + "GCTTAGTCT", + "GCTGGATAT", + "GCTGGAACC", + "GCTGCGATG", + "GCTGATCAG", + "GCTGAGCGT", + "GCTCTTGTC", + "GCTCTCCTG", + "GCTCGGTCC", + "GCTCCAATT", + "GCTATTCGC", + "GCTATGAGT", + "GCTAGTGTT", + "GCTAGGATC", + "GCTAGCACT", + "GCTACGTAT", + "GCTAACCTT", + "GCGTTCCGC", + "GCGTGTGCC", + "GCGTGCATT", + "GCGTCGGTT", + "GCGTATGTG", + "GCGTATACT", + "GCGGTTCAC", + "GCGGTCTTG", + "GCGGCGTCG", + "GCGGCACCT", + "GCGCTGGAC", + "GCGCTCTCC", + "GCGCGGCAG", + "GCGCGATAC", + "GCGCCGACC", + "GCGAGCGAG", + "GCGAGAGGT", + "GCGAATTAC", + "GCCTTGCAT", + "GCCTGCGCT", + "GCCTAACTG", + "GCCGTCCGT", + "GCCGCTGTC", + "GCCATGCCG", + "GCCAGCTAT", + "GCCAACCAG", + "GCATGGTTG", + "GCATCGACG", + "GCAGGCTAG", + "GCAGGACGC", + "GCAGCCATC", + "GCAGATACC", + "GCAGACGTT", + "GCACTATGT", + "GCACACGAG", + "GATTGTCAT", + "GATTGGTAG", + "GATTGCACC", + "GATTCTACT", + "GATTCGCTT", + "GATTAGGCC", + "GATTACGGT", + "GATGTTGGC", + "GATGTTATG", + "GATGGCCAG", + "GATCGTTCG", + "GATCGGAGC", + "GATCGCCTC", + "GATCCTCTG", + "GATCCAGCG", + "GATACACGC", + "GAGTTACCT", + "GAGTCGTAT", + "GAGTCGCCG", + "GAGGTGTAG", + "GAGGCATTG", + "GAGCGGACG", + "GAGCCTGAG", + "GAGATCTGT", + "GAGATAATT", + "GAGACGGCT", + "GACTTCGTG", + "GACTGTTCT", + "GACTCTTAG", + "GACCGCATT", + "GAATTGAGC", + "GAATATTGC", + "GAAGGCTCT", + "GAAGAGACT", + "GAACTGCCG", + "GAACGCGTG", + "CTTGTGTAT", + "CTTGTGCGC", + "CTTGTCATG", + "CTTGGTCTT", + "CTTGGTACC", + "CTTGGATGT", + "CTTGCTCAC", + "CTTGCAATC", + "CTTGAGGCC", + "CTTGACGGT", + "CTTCTGATC", + "CTTCTCGTT", + "CTTCTAGGC", + "CTTCGTTAG", + "CTTATGTCC", + "CTTATGCTT", + "CTTATATAG", + "CTTAGGTTG", + "CTTAGGAGC", + "CTTACTTAT", + "CTGTTCTCG", + "CTGTGCCTC", + "CTGTCGCAT", + "CTGTCGAGC", + "CTGTAGCTG", + "CTGTACGTT", + "CTGCTTGCC", + "CTGCGTAGT", + "CTGCACACC", + "CTGATGGAT", + "CTGAGTCAT", + "CTGACGCCG", + "CTGAACGAG", + "CTCTTGTAG", + "CTCTTAGTT", + "CTCTTACCG", + "CTCTGCACC", + "CTCTCGTCC", + "CTCGTATTG", + "CTCGACTAT", + "CTCCTGACG", + "CTCACTAGC", + "CTATACGGC", + "CGTTCGCTC", + "CGTTCACCG", + "CGTATAGTT", + "CGGTGTTCC", + "CGGTGTCAG", + "CGGTCCTGC", + "CGGCGACTC", + "CGGCACGGT", + "CGGATAGCC", + "CGGAGAGAT", + "CGCTAATAG", + "CGCGTTGGC", + "CGCGCAGAG", + "CGCACTGCC", + "CCTTGTCTC", + "CCTTGGCGT", + "CCTTCTGAG", + "CCTTCTCCT", + "CCTTCGACC", + "CCTTACTTG", + "CCTGTTCGT", + "CCTGTATGC", + "CCTCGGCCG", + "CCGTTAATT", + "CCATGTGCG", + "CCAGTGGTT", + "CCAGGCATT", + "CCAGGATCC", + "CCAGCGTTG", + "CATTCCGAT", + "CATTATACC", + "CATGTTGAG", + "ATTGCGTGT", + "ATTGCGGAC", + "ATTGCGCCG", + "ATTGACTTG", + "ATTCGGCTG", + "ATTCGCGAG", + "ATTCCAAGT", + "ATTATCTTC", + "ATTACTGTT", + "ATTACACTC", + "ATGTTCTAT", + "ATGTTACGC", + "ATGTGTATC", + 
"ATGTGGCAG", + "ATGTCTGTG", + "ATGGTGCAT", + "ATGCTTACT", + "ATGCTGTCC", + "ATGCTCGGC", + "ATGAGGTTC", + "ATGAGAGTG", + "ATCTTGGCT", + "ATCTGTGCG", + "ATCGGTTCC", + "ATCATGCTC", + "ATCATCACT", + "ATATCTTAT", + "ATAGGCGCC", + "AGTTGGTAT", + "AGTTGAGCC", + "AGTGCGACC", + "AGGTGCTAC", + "AGGCTTGCG", + "AGGCCTTCC", + "AGGCACCTT", + "AGGAATATG", + "AGCGGCCAG", + "AGCCTGGTC", + "AGCCTGACT", + "AGCAATCCG", + "AGAGATGTT", + "AGAGAATTC", + "ACTCGCTTG", + "ACTCGACCT", + "ACGTACACC", + "ACGGATGGT", + "ACCAGTCTG", + "ACATTCGGC", + "ACATGAGGT", + "ACACTAATT", +) + +B384_cell_key2 = ( + "TTGTGTTGT", + "TTGTGGTAG", + "TTGTGCGGA", + "TTGTCTGTT", + "TTGTCTAAG", + "TTGTCATAT", + "TTGTCACGA", + "TTGTATGAA", + "TTGTACAGT", + "TTGGTTAAT", + "TTGGTGCAA", + "TTGGTCGAG", + "TTGGTATTA", + "TTGGCACAG", + "TTGGATACA", + "TTGGAAGTG", + "TTGCGGTTA", + "TTGCCATTG", + "TTGCACGCG", + "TTGCAAGGT", + "TTGATGTAT", + "TTGATAATT", + "TTGAGACGT", + "TTGACTACT", + "TTGACCGAA", + "TTCTGGTCT", + "TTCTGCACA", + "TTCTCCTTA", + "TTCTCCGCT", + "TTCTAGGTA", + "TTCTAATCG", + "TTCGTCGTA", + "TTCGTAGAT", + "TTCGGCTTG", + "TTCGGAATA", + "TTCGCCAGA", + "TTCGATTGT", + "TTCGATCAG", + "TTCCTCGGT", + "TTCCGGCAG", + "TTCCGCATT", + "TTCCAATTA", + "TTCATTGAA", + "TTCATGCTG", + "TTCAGGAGT", + "TTCACTATA", + "TTCAACTCT", + "TTCAACGTG", + "TTATGCGTT", + "TTATGATTG", + "TTATCCTGT", + "TTATCCGAG", + "TTATATTAT", + "TTAGGCGCG", + "TTACTGGAA", + "TTACTAGTT", + "TTACGTGGT", + "TTACGATAT", + "TTACCTAGA", + "TTACATGAG", + "TTACAGCGT", + "TTACACGGA", + "TTACACACT", + "TTAATCAGT", + "TTAATAGGA", + "TTAAGTGTG", + "TTAACCTTG", + "TTAACACAA", + "TGTTCACTT", + "TGTTCAAGA", + "TGTTAAGTG", + "TGTGTTATG", + "TGTGTCCAA", + "TGTGGAGCG", + "TGTCAGTTA", + "TGTCAGAAG", + "TGGTTAGTT", + "TGGTTACAA", + "TGGCGTTAT", + "TGGCGCCAA", + "TGGAGTCTT", + "TGCGTATTG", + "TGATAGAGA", + "TGAGGTATT", + "TGAGAATCT", + "TCTTGGTAA", + "TCTTCATAG", + "TCTGTCCTT", + "TCTGGAATT", + "TCTACCGCG", + "TCGTTCGAA", + "TCGTCAGTG", + "TCGACGAGA", + "TCATGGCTT", + "TCACACTTA", + "TATTCCGAA", + "TATTATGGT", + "TATGCTATT", + "TATCAAGGA", + "TAGTTCAAT", + "TAGCTGCTT", + "TAGAGGAAG", + "TACCTGTTA", + "TACACCTGT", + "GTTGTGCGT", + "GTTGGCTAT", + "GTTGCCAAG", + "GTTGACCTT", + "GTTCTGCTA", + "GTTCTGAAT", + "GTTCTATCA", + "GTTCGCGTG", + "GTTCCTTAT", + "GTTAGCAGT", + "GTTACTGTG", + "GTTACTCAA", + "GTTAAGAGA", + "GTTAACTTA", + "GTGTCGGCA", + "GTGTCCATT", + "GTGCTTGAG", + "GTGCTCGTT", + "GTGCTCACA", + "GTGCCTGGA", + "GTCTTGTCG", + "GTCTTGATT", + "GTCTTCCGT", + "GTCTTAAGA", + "GTCTCATCT", + "GTCTACGAG", + "GTCGTTGCT", + "GTCGTGTTA", + "GTCGGTAAT", + "GTCGGATGT", + "GTCGAGCTG", + "GTCCGGACT", + "GTCCAACAT", + "GTCAGACGA", + "GTCAGAATT", + "GTCACTCTT", + "GTCAAGGAA", + "GTATGTCTT", + "GTATGTACA", + "GTATCGGTT", + "GTATATGTA", + "GTATACAAT", + "GTAGTTAAG", + "GTAGTCGAT", + "GTAGCCTTA", + "GTAGATACT", + "GTACGATTA", + "GTACAGTCT", + "GTAATTCGT", + "GCTTGGCAG", + "GCTTGCTTG", + "GCTTGAGGA", + "GCTTCATTA", + "GCTTATGCG", + "GCTGTGTAG", + "GCTGTCATG", + "GCTGGTTGT", + "GCTGGACTG", + "GCTGCCTAA", + "GCTGATATT", + "GCTCTTAGT", + "GCTCTATTG", + "GCTCGCCGT", + "GCTCCGCTG", + "GCTATTCTG", + "GCTATACGA", + "GCTACTAAG", + "GCTACATGT", + "GCTAACTCT", + "GCGTTGTAA", + "GCGTTCTCT", + "GCGTGCGTA", + "GCGTCTTGA", + "GCGTCCGAT", + "GCGTAAGAG", + "GCGCTTACG", + "GCGCGGATT", + "GCGCCATAT", + "GCGCATGAA", + "GCGATCAAT", + "GCGAGCCTT", + "GCGAGATTG", + "GCGAGAACA", + "GCCTTGGTA", + "GCCTTCTAG", + "GCCTTCACA", + "GCCTGAGTG", + "GCCTCACGT", + "GCCGGCGAA", + "GCCGCACAA", + "GCCATGCTT", + "GCCATATAT", + "GCCAATTCG", + "GCATTCGTT", + 
"GCATGATGT", + "GCAGTTGGA", + "GCAGTGTCT", + "GCACTTGTG", + "GCAATCTGT", + "GCAACACTT", + "GATTGTATT", + "GATTGCGAG", + "GATTCCAGT", + "GATTCATAT", + "GATTATCAG", + "GATTAGGTT", + "GATGTTGCG", + "GATGGATCT", + "GATGCTGAT", + "GATGCCTTG", + "GATCTCCTT", + "GATCGCTTA", + "GATATTGAA", + "GATATTACT", + "GAGTGTTAT", + "GAGCTCAGT", + "GAGCGTGCT", + "GAGCGTCGA", + "GAGCGGTTG", + "GAGCGACTT", + "GAGCCGAAT", + "GAGATAGAT", + "GAGACCTAT", + "GACGGTCGT", + "GACGCAGGT", + "GACGATATG", + "GACCTATCT", + "GAATTAGGA", + "GAATCAGCT", + "GAAGTTCAT", + "GAAGTGGTT", + "GAAGTATTG", + "GAAGGCATT", + "GAACGCTGT", + "CTTGTCCAG", + "CTTGGATTG", + "CTTGCTGAA", + "CTTGCCGTG", + "CTTGATTCT", + "CTTCTGTCG", + "CTTCGGCGT", + "CTTATGAGT", + "CTTACCGAT", + "CTGTTAGGT", + "CTGTCGTCT", + "CTGTATAAT", + "CTGGCTCAT", + "CTGGATGCG", + "CTGCGTGTG", + "CTGCGCGGT", + "CTGCCGATT", + "CTGCATTGT", + "CTGATTAAG", + "CTGAGATAT", + "CTGACCTGT", + "CTCGTATCT", + "CTCGGCAAG", + "CTCGCAATT", + "CTCCTGCTT", + "CTCCTAAGT", + "CTCCGGATG", + "CTCCGAGCG", + "CTCACAGGT", + "CTATTCTAT", + "CTATTAGTG", + "CTATGAATT", + "CTACATATT", + "CGTGGCATT", + "CGTCTTAAT", + "CGTCTGGTT", + "CGTCACTGT", + "CGTAGGTCT", + "CGGTTCGAG", + "CGGTTCATT", + "CGGTGCTCT", + "CGGTAATTG", + "CGGCCTGAT", + "CGGATATAG", + "CGGAATATT", + "CGCTCCAAT", + "CGCGTTCGT", + "CGCAGGTTG", + "CGAGGATGT", + "CGAGCTGTT", + "CGACGGCTT", + "CCTTGTGTG", + "CCTGTCTCA", + "CCTGACTAT", + "CCTACCTTG", + "CCGTAGATT", + "CCGGCTGGT", + "CATCGGACG", + "CATCGATAA", + "CATCCTTCT", + "CAGTTCTGT", + "CAGTGCCAG", + "CAGGCACTG", + "CAGCCTCTT", + "CACTTATAT", + "CACTGGTCG", + "CACTGCATG", + "CACGCGTTG", + "CACGATGTT", + "CACCATCTG", + "CACAGGCGT", + "ATTGTACAA", + "ATTGGTATG", + "ATTGCTAAT", + "ATTGCATAG", + "ATTGCAGTT", + "ATTCTGCAG", + "ATTCTACGT", + "ATTCGGATT", + "ATTCCGTTG", + "ATTCATCAA", + "ATTCAAGAG", + "ATTAGCCTT", + "ATTAATATT", + "ATGTTAGAG", + "ATGTTAACT", + "ATGTAGTCG", + "ATGGTGTAG", + "ATGGATTAT", + "ATCTTGAAG", + "ATCTGATAT", + "ATCTCAGAA", + "ATCGCTCAA", + "ATCGCGTCG", + "ATCCATGGT", + "ATCATGAGA", + "ATCATAGTT", + "ATCAGCGAG", + "ATCACCATT", + "ATAGTAATT", + "ATAGCTGTG", + "ATACTCTCG", + "ATACCTCAT", + "AGTTGCGCG", + "AGTTGAATT", + "AGTTATGAT", + "AGTGTCCGT", + "AGTGGCTTG", + "AGTGCTTCT", + "AGTATCATT", + "AGTACACAA", + "AGGTATGCG", + "AGGTATAGT", + "AGGCTACTT", + "AGGCCAGGT", + "AGGAGCGAT", + "AGCTTATAG", + "AGCTCTAGA", + "AGCGTGTAT", + "AGCGTCACA", + "AGCCTTCAT", + "AGCCTGTCG", + "AGCCTCGAG", + "AGCACTGAA", + "AGATGTACG", + "AGAGTTAAT", + "AGACCTCTG", + "ACTTCTATA", + "ACTGTCGAG", + "ACTGTATGT", + "ACTCTGTAA", + "ACTCGCGAA", + "ACTAGATCT", + "ACTAACGTT", + "ACGTTACTG", + "ACGTGGAAT", + "ACGGACTCT", + "ACGCCTAAT", + "ACGCCGTTA", + "ACGACGTGT", + "ACCTCGCAT", + "ACCATCATA", + "ACATATATT", + "ACAGGCACA", + "ACACCTGAG", + "ACACATTCT", +) + +B384_cell_key3 = ( + "TTGTGGCTG", + "TTGTGGAGT", + "TTGTGCGAC", + "TTGTCTTCA", + "TTGTAAGAT", + "TTGGTTCTG", + "TTGGTGCGT", + "TTGGTCTAC", + "TTGGTAACT", + "TTGGCGTGC", + "TTGGATTAG", + "TTGGAGACG", + "TTGGAATCA", + "TTGCGGCGA", + "TTGCGCTCG", + "TTGCCTTAC", + "TTGCCGGAT", + "TTGCATGCT", + "TTGCACGTC", + "TTGCACCAT", + "TTGAACCTG", + "TTCTCGCGT", + "TTCTCAACT", + "TTCTACTCA", + "TTCGTCCAT", + "TTCGGATAC", + "TTCGGACGT", + "TTCGCAATC", + "TTCCGGTGC", + "TTCCGACTG", + "TTCATTATG", + "TTCATGGAT", + "TTCAGCGCA", + "TTCACCTCG", + "TTCAAGCAG", + "TTCAACTAC", + "TTATGCCAG", + "TTATGCATC", + "TTATCGTAC", + "TTATACCTA", + "TTATAATAG", + "TTATAAGTC", + "TTAGTTAGC", + "TTAGCTCAT", + "TTAGCACTA", + "TTAGATATG", + "TTACTACGA", + "TTACCGTCA", + "TTACAGAGC", + 
"TTAATTGCA", + "TTAACAGAT", + "TGTTGGCTA", + "TGTTGATGA", + "TGTTAAGCT", + "TGTGGCCGA", + "TGTGCTAGC", + "TGTGCGTCA", + "TGTCGCAGT", + "TGTCGAGCA", + "TGTACAACG", + "TGGTTCCGA", + "TGGTTCACT", + "TGGTCAAGT", + "TGGCTTGTA", + "TGGCTGTCG", + "TGGCGTATG", + "TGGCGCGCT", + "TGGATGTAC", + "TGGACTTGC", + "TGGAATACT", + "TGCTAGCGA", + "TGCGTTGCT", + "TGCGGTCTG", + "TGCGCTTAG", + "TGCGCGACG", + "TGCCTGCAT", + "TGCCTAGAC", + "TGCACGAGT", + "TGAGTGTGC", + "TGAGGCTCG", + "TCTTCCGTC", + "TCTTATAGT", + "TCTTACCAT", + "TCTGTTGTC", + "TCTGTTACT", + "TCTGGCTAG", + "TCTCAGATC", + "TCTAGTTGA", + "TCTAGTACG", + "TCGTACTAC", + "TCGGTGTAG", + "TCGGCTGCT", + "TCGCTACTG", + "TCGATCACG", + "TCGAGGCAT", + "TCCGGCGTC", + "TCCGGAGCT", + "TCCGCTCGT", + "TCCGAGTAC", + "TCCATTCAT", + "TCCATGGTC", + "TCCAAGTCG", + "TCATTACGT", + "TCATGCACT", + "TCAGGTTGC", + "TCAGACCGT", + "TCACTCAGT", + "TCAAGCTCA", + "TATTGCGCA", + "TATTCGGCT", + "TATTCCAGC", + "TATTCATCA", + "TATGTTCAG", + "TATGGTATG", + "TATGCAAGT", + "TATCTGGTC", + "TATCTGACT", + "TATCCAGAT", + "TATCAGTCG", + "TATCACGCT", + "TAGGCGCGA", + "TAGGCACAT", + "TAGGATCGT", + "TAGCATTGC", + "TAGAGTTAC", + "TAGACTGAT", + "TACTTGTCG", + "TACGTCCGA", + "TACCGTACT", + "TACCGCGAT", + "TACCAGGAC", + "TACAGAAGT", + "TAAGTGCAT", + "TAAGCTACT", + "GTTGACCGA", + "GTTCTCGAC", + "GTTCCTGCT", + "GTTATGATG", + "GTGCTTGCA", + "GTGCCGCGT", + "GTATTGCTG", + "GTATTCCGA", + "GTATTAAGC", + "GTATGACGT", + "GTAGTTGTC", + "GTAGTACAT", + "GTAGCTCGA", + "GGTTGCTCA", + "GGTTGAGTA", + "GGTTAACGT", + "GGTGTGGCA", + "GGTCTTCAG", + "GGTCGTCTA", + "GGTCGGCGT", + "GGTCCGACT", + "GGTCATGTC", + "GGTCACATG", + "GGTAGTGCT", + "GGTAGCGTC", + "GGTACCAGT", + "GGTAAGGAT", + "GGCTTGTGC", + "GGCTTGACT", + "GGCTTACGA", + "GGCTGTAGT", + "GGCTGGCAG", + "GGCTCCATC", + "GGCGTGGAT", + "GGCGTAATC", + "GGCGCAAGT", + "GGCGAGTAG", + "GGCGACCGT", + "GGCCTGTCA", + "GGCCATTGC", + "GGCACTCTG", + "GGATGTCAT", + "GGAGTAACT", + "GGAGAACGA", + "GGACTGGCT", + "GGACGTTCA", + "GGAACGTGC", + "GCTGTCCAT", + "GCTGGTTCA", + "GCTGCAACT", + "GCTCGTTAC", + "GCTATAGAT", + "GCTAGTCGT", + "GCTACCATG", + "GCGTTCTGA", + "GCGTGTTAG", + "GCGGTATCG", + "GCGGAGCAT", + "GCGCGGTGC", + "GCGCCTAGT", + "GCGCCGGCT", + "GCCTTCATG", + "GCCATACTG", + "GCATGTTGA", + "GCATGCTAC", + "GCAGTATAC", + "GCAGGTACT", + "GCAGCGCGT", + "GCACCTCAT", + "GCAATTCGA", + "GATTGCCGT", + "GATGAACAT", + "GATCTTCGA", + "GATCTGCAT", + "GAGTGGCAT", + "GAGTCGGAC", + "GAGTATGAT", + "GAGGCGAGT", + "GAGGCAACG", + "GAGCGCACT", + "GAATAGGCT", + "ATTGTCACT", + "ATTGTATCA", + "ATTGGTCAG", + "ATTGGCGAT", + "ATTGATCGT", + "ATTCGTAGT", + "ATTCATACG", + "ATTCAGGAC", + "ATTACTTCA", + "ATTAATTAG", + "ATTAAGCAT", + "ATGTCTCTA", + "ATGTAGCGT", + "ATGGCATAC", + "ATGGAGATC", + "ATGGACTCG", + "ATGGAACGA", + "ATGCTTCAT", + "ATGCTCGCT", + "ATGCGACGT", + "ATGCCGTAG", + "ATGAGTTCG", + "ATGACTATC", + "ATGACCGAC", + "ATCTTATGC", + "ATCTTACTA", + "ATCTATCAG", + "ATCGTGTAC", + "ATCGTCTGA", + "ATCGGCATG", + "ATCGCGAGC", + "ATCGCAACG", + "ATCGATGCT", + "ATCGAATAG", + "ATCCTTCTG", + "ATCCTGCGT", + "ATCCGCACT", + "ATCCATTAC", + "ATCCAAGCA", + "ATCAGATCA", + "ATCACACAT", + "ATCAACGTC", + "ATCAACCGA", + "ATATTGAGT", + "ATATTCGTC", + "ATATTACAG", + "ATATCTTGA", + "ATATCGCAT", + "ATATCAATC", + "ATAGTCCTG", + "ATAGGTCTA", + "ATAGCTGAC", + "ATAGCGGTA", + "AGTTCGCTG", + "AGTTACAGC", + "AGTTAACTA", + "AGTGCAATC", + "AGTCTGGTA", + "AGTCTGAGC", + "AGTCTACAT", + "AGTCGAACT", + "AGTCCATCG", + "AGTCATTCA", + "AGTATCCAG", + "AGTAGACTG", + "AGTAATCGA", + "AGTAAGTGC", + "AGGTTGGCT", + "AGGTTCTAG", + "AGGTGTTCA", + "AGGTGCCAT", + 
"AGGTCTGAT", + "AGGTCGTAC", + "AGGTCAGCA", + "AGGCTTATC", + "AGGCTATGA", + "AGGCCGACG", + "AGGCCAAGC", + "AGGCAGGTC", + "AGGCAAGAT", + "AGGAGCAGT", + "AGGACCGCT", + "AGGAATTAC", + "AGCTTGGAC", + "AGCTTAAGT", + "AGCTACACG", + "AGCGTTACG", + "AGCGGTGCA", + "AGCGGAGTC", + "AGCGGACGA", + "AGCGCGCTA", + "AGCGATAGC", + "AGCGACTCA", + "AGCCTCTAC", + "AGCCGTCGT", + "AGCATGATC", + "AGCACTTCG", + "AGCACGGCA", + "AGATTCTGA", + "AGATTAGAT", + "AGATGATAG", + "AGATATGTA", + "AGATACCGT", + "AGAGTGCGT", + "AGAGCCGAT", + "AGACTCACT", + "ACTTGCCTA", + "ACTTGAGCA", + "ACTTCTAGC", + "ACTTCGACT", + "ACTTAGTAC", + "ACTGTTGAT", + "ACTGTAACG", + "ACTGGTATC", + "ACTGACGTC", + "ACTGAAGCT", + "ACTCTGATG", + "ACTCCTGAC", + "ACTCCGCTA", + "ACTCAACTG", + "ACTATTGCA", + "ACTAGGCAG", + "ACTACGCGT", + "ACTAATACT", + "ACGTTCGTA", + "ACGTGTGCT", + "ACGTGTATG", + "ACGTGGAGC", + "ACGTCTTCG", + "ACGTCAGTC", + "ACGGTCTCA", + "ACGGTCCGT", + "ACGGTACAG", + "ACGGCGCTG", + "ACGCTGCGA", + "ACGCGTGTA", + "ACGCGCCAG", + "ACGATGTCG", + "ACGATGGAT", + "ACGATCTAC", + "ACGAGCTGA", + "ACGAGCATC", + "ACGAATCGT", + "ACGAACGCA", + "ACCTTGTAG", + "ACCTGTTGC", + "ACCTGTCAT", + "ACCTCGATC", + "ACCTAGGTA", + "ACCTACTGA", + "ACCTAATCG", + "ACCGTAGCA", + "ACCGGTAGT", + "ACCGGCTAC", + "ACCGCTTCA", + "ACATTGTGC", + "ACATTCTCG", + "ACATGGCTG", + "ACATGACGA", + "ACATATGAT", + "ACATATACG", + "ACAGCGTAC", + "ACACTTGCT", + "ACACTATCA", + "ACACGCATG", + "ACACCAGTA", + "ACACCAACT", + "ACACATAGT", + "ACACACCTA", +) def label_sections_to_index(label): - """ + """ Return the cell_index integer based on input 3 part cell label string - + """ - cl1, cl2, cl3 = [int(n) for n in label.split('-')] + cl1, cl2, cl3 = [int(n) for n in label.split("-")] return (cl1 - 1) * 384 * 384 + (cl2 - 1) * 384 + (cl3 - 1) + 1 @@ -282,18 +1572,17 @@ def label_sections_to_index(label): # print(label_sections_to_index('384-384-384')) # print('-') -#---------------------------------- +# ---------------------------------- def index_to_label_sections(index): - zerobased = int(index) - 1 cl1 = (int((zerobased) / 384 / 384) % 384) + 1 cl2 = (int((zerobased) / 384) % 384) + 1 cl3 = (zerobased % 384) + 1 - return f'{cl1}-{cl2}-{cl3}' + return f"{cl1}-{cl2}-{cl3}" # print(index_to_label_sections(1)) @@ -303,103 +1592,104 @@ def index_to_label_sections(index): # print(index_to_label_sections(19775576)) # print(index_to_label_sections(56623104)) # print('-') -#---------------------------------- +# ---------------------------------- def index_to_sequence(index, bead_version): - zerobased = int(index) - 1 cl1 = (int((zerobased) / 384 / 384) % 384) + 1 cl2 = (int((zerobased) / 384) % 384) + 1 cl3 = (zerobased % 384) + 1 - if bead_version == 'v1': - cls1_sequence = A96_cell_key1[cl1-1] - cls2_sequence = A96_cell_key2[cl2-1] - cls3_sequence = A96_cell_key3[cl3-1] - - return f'{cls1_sequence}{v1_linker1}{cls2_sequence}{v1_linker2}{cls3_sequence}' + if bead_version == "v1": + cls1_sequence = A96_cell_key1[cl1 - 1] + cls2_sequence = A96_cell_key2[cl2 - 1] + cls3_sequence = A96_cell_key3[cl3 - 1] - elif bead_version == 'Enh': + return f"{cls1_sequence}{v1_linker1}{cls2_sequence}{v1_linker2}{cls3_sequence}" - diversityInsert = '' + elif bead_version == "Enh": + diversityInsert = "" if 1 <= cl1 <= 24: - diversityInsert = '' + diversityInsert = "" elif 25 <= cl1 <= 48: - diversityInsert = 'A' + diversityInsert = "A" elif 49 <= cl1 <= 72: - diversityInsert = 'GT' - else: # 73 <= cl1 <= 96: - diversityInsert = 'TCA' - - cls1_sequence = A96_cell_key1[cl1-1] - cls2_sequence = A96_cell_key2[cl2-1] - 
cls3_sequence = A96_cell_key3[cl3-1] + diversityInsert = "GT" + else: # 73 <= cl1 <= 96: + diversityInsert = "TCA" - return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + cls1_sequence = A96_cell_key1[cl1 - 1] + cls2_sequence = A96_cell_key2[cl2 - 1] + cls3_sequence = A96_cell_key3[cl3 - 1] - elif bead_version == 'EnhV2': + return f"{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}" - diversityInsert = '' - subIndex = ((cl1-1) % 96) + 1 + elif bead_version == "EnhV2": + diversityInsert = "" + subIndex = ((cl1 - 1) % 96) + 1 if 1 <= subIndex <= 24: - diversityInsert = '' + diversityInsert = "" elif 25 <= subIndex <= 48: - diversityInsert = 'A' + diversityInsert = "A" elif 49 <= subIndex <= 72: - diversityInsert = 'GT' - else: # 73 <= subIndex <= 96: - diversityInsert = 'TCA' + diversityInsert = "GT" + else: # 73 <= subIndex <= 96: + diversityInsert = "TCA" - cls1_sequence = B384_cell_key1[cl1-1] - cls2_sequence = B384_cell_key2[cl2-1] - cls3_sequence = B384_cell_key3[cl3-1] + cls1_sequence = B384_cell_key1[cl1 - 1] + cls2_sequence = B384_cell_key2[cl2 - 1] + cls3_sequence = B384_cell_key3[cl3 - 1] - return f'{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}' + return f"{diversityInsert}{cls1_sequence}{Enh_linker1}{cls2_sequence}{Enh_linker2}{cls3_sequence}" # print(index_to_sequence(4748181, 'Enh')) # print(index_to_sequence(52923177, 'EnhV2')) -#---------------------------------- +# ---------------------------------- def create_cell_index_fasta_V1(): - with open('Rhapsody_cellBarcodeV1_IndexToSequence.fasta', 'w') as f: - for cl1 in range(1, 96+1): - for cl2 in range(1, 96+1): - for cl3 in range(1, 96+1): - index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') - sequence = index_to_sequence(index, 'v1') - f.write(f'>{index}\n') - f.write(f'{sequence}\n') + with open("Rhapsody_cellBarcodeV1_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 96 + 1): + for cl2 in range(1, 96 + 1): + for cl3 in range(1, 96 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "v1") + f.write(f">{index}\n") + f.write(f"{sequence}\n") -#create_cell_index_fasta_V1() + +# create_cell_index_fasta_V1() def create_cell_index_fasta_Enh(): - with open('Rhapsody_cellBarcodeEnh_IndexToSequence.fasta', 'w') as f: - for cl1 in range(1, 96+1): - for cl2 in range(1, 96+1): - for cl3 in range(1, 96+1): - index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') - sequence = index_to_sequence(index, 'Enh') - f.write(f'>{index}\n') - f.write(f'{sequence}\n') + with open("Rhapsody_cellBarcodeEnh_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 96 + 1): + for cl2 in range(1, 96 + 1): + for cl3 in range(1, 96 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "Enh") + f.write(f">{index}\n") + f.write(f"{sequence}\n") + + +# create_cell_index_fasta_Enh() -#create_cell_index_fasta_Enh() def create_cell_index_fasta_EnhV2(): - with open('Rhapsody_cellBarcodeEnhV2_IndexToSequence.fasta', 'w') as f: - for cl1 in range(1, 384+1): - for cl2 in range(1, 384+1): - for cl3 in range(1, 384+1): - index = label_sections_to_index(f'{cl1}-{cl2}-{cl3}') - sequence = index_to_sequence(index, 'EnhV2') - f.write(f'>{index}\n') - f.write(f'{sequence}\n') - -#create_cell_index_fasta_EnhV2() + with open("Rhapsody_cellBarcodeEnhV2_IndexToSequence.fasta", "w") as f: + for cl1 in range(1, 384 + 1): + for cl2 in range(1, 384 + 1): 
+ for cl3 in range(1, 384 + 1): + index = label_sections_to_index(f"{cl1}-{cl2}-{cl3}") + sequence = index_to_sequence(index, "EnhV2") + f.write(f">{index}\n") + f.write(f"{sequence}\n") + + +# create_cell_index_fasta_EnhV2() diff --git a/src/mapping/bd_rhapsody/script.py b/src/mapping/bd_rhapsody/script.py index a51c8103919..56aa8c80400 100644 --- a/src/mapping/bd_rhapsody/script.py +++ b/src/mapping/bd_rhapsody/script.py @@ -9,111 +9,117 @@ ## VIASH START par = { - 'reads': [ - 'resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R1_001_subset.fastq.gz', - 'resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R2_001_subset.fastq.gz', - 'resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R1_001_subset.fastq.gz', - 'resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R2_001_subset.fastq.gz' + "reads": [ + "resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R1_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12WTA_S1_L432_R2_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R1_001_subset.fastq.gz", + "resources_test/bdrhap_5kjrt/raw/12ABC_S1_L432_R2_001_subset.fastq.gz", ], - 'reads_atac': None, - 'reference_archive': "reference_gencodev41_chr1.tar.gz", - 'targeted_reference': [], - 'abseq_reference': ["resources_test/bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta"], - 'supplemental_reference': [], - 'cell_calling_data': 'mRNA', - 'cell_calling_bioproduct_algorithm': None, - 'cell_calling_atac_algorithm': None, - 'exact_cell_count': 4900, - 'expected_cell_count': None, - 'exclude_intronic_reads': None, - 'sample_tags_version': None, - 'tag_names': [], - 'vdj_version': None, - 'predefined_atac_peaks': None, - 'run_name': "sample", - 'generate_bam': False, - 'alignment_star_params': None, - 'alignment_bwa_mem2_params': None, - 'parallel': True, - 'timestamps': False, - 'dryrun': False, - 'output_dir': 'output_large_op', - 'output_seurat': 'seurat.rds', - 'output_mudata': 'mudata.h5mu', - 'metrics_summary': 'metrics_summary.csv', - 'pipeline_report': 'pipeline_report.html', - 'rsec_mols_per_cell': None, - 'dbec_mols_per_cell': None, - 'rsec_mols_per_cell_unfiltered': None, - 'bam': None, - 'bam_index': None, - 'bioproduct_stats': None, - 'dimred_tsne': None, - 'dimred_umap': None, - 'immune_cell_classification': None, - 'sample_tag_metrics': None, - 'sample_tag_calls': None, - 'sample_tag_counts': None, - 'sample_tag_counts_unassigned': None, - 'vdj_metrics': None, - 'vdj_per_cell': None, - 'vdj_per_cell_uncorrected': None, - 'vdj_dominant_contigs': None, - 'vdj_unfiltered_contigs': None, - 'atac_metrics': None, - 'atac_metrics_json': None, - 'atac_fragments': None, - 'atac_fragments_index': None, - 'atac_transposase_sites': None, - 'atac_transposase_sites_index': None, - 'atac_peaks': None, - 'atac_peaks_index': None, - 'atac_peak_annotation': None, - 'atac_cell_by_peak': None, - 'atac_cell_by_peak_unfiltered': None, - 'atac_bam': None, - 'atac_bam_index': None, - 'protein_aggregates_experimental': None, - 'long_reads': None, - 'custom_star_params': None, - 'custom_bwa_mem2_params': None, - 'abseq_umi': None, - 'target_analysis': None, - 'vdj_jgene_evalue': None, - 'vdj_vgene_evalue': None, - 'write_filtered_reads': None + "reads_atac": None, + "reference_archive": "reference_gencodev41_chr1.tar.gz", + "targeted_reference": [], + "abseq_reference": [ + "resources_test/bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta" + ], + "supplemental_reference": [], + "cell_calling_data": "mRNA", + "cell_calling_bioproduct_algorithm": None, + "cell_calling_atac_algorithm": None, + 
"exact_cell_count": 4900, + "expected_cell_count": None, + "exclude_intronic_reads": None, + "sample_tags_version": None, + "tag_names": [], + "vdj_version": None, + "predefined_atac_peaks": None, + "run_name": "sample", + "generate_bam": False, + "alignment_star_params": None, + "alignment_bwa_mem2_params": None, + "parallel": True, + "timestamps": False, + "dryrun": False, + "output_dir": "output_large_op", + "output_seurat": "seurat.rds", + "output_mudata": "mudata.h5mu", + "metrics_summary": "metrics_summary.csv", + "pipeline_report": "pipeline_report.html", + "rsec_mols_per_cell": None, + "dbec_mols_per_cell": None, + "rsec_mols_per_cell_unfiltered": None, + "bam": None, + "bam_index": None, + "bioproduct_stats": None, + "dimred_tsne": None, + "dimred_umap": None, + "immune_cell_classification": None, + "sample_tag_metrics": None, + "sample_tag_calls": None, + "sample_tag_counts": None, + "sample_tag_counts_unassigned": None, + "vdj_metrics": None, + "vdj_per_cell": None, + "vdj_per_cell_uncorrected": None, + "vdj_dominant_contigs": None, + "vdj_unfiltered_contigs": None, + "atac_metrics": None, + "atac_metrics_json": None, + "atac_fragments": None, + "atac_fragments_index": None, + "atac_transposase_sites": None, + "atac_transposase_sites_index": None, + "atac_peaks": None, + "atac_peaks_index": None, + "atac_peak_annotation": None, + "atac_cell_by_peak": None, + "atac_cell_by_peak_unfiltered": None, + "atac_bam": None, + "atac_bam_index": None, + "protein_aggregates_experimental": None, + "long_reads": None, + "custom_star_params": None, + "custom_bwa_mem2_params": None, + "abseq_umi": None, + "target_analysis": None, + "vdj_jgene_evalue": None, + "vdj_vgene_evalue": None, + "write_filtered_reads": None, } meta = { - 'config': "target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml", - 'resources_dir': os.path.abspath('src/mapping/bd_rhapsody'), - 'temp_dir': os.getenv("VIASH_TEMP"), - 'memory_mb': None, - 'cpus': None + "config": "target/nextflow/mapping/bd_rhapsody/.config.vsh.yaml", + "resources_dir": os.path.abspath("src/mapping/bd_rhapsody"), + "temp_dir": os.getenv("VIASH_TEMP"), + "memory_mb": None, + "cpus": None, } ## VIASH END + def clean_arg(argument): argument["clean_name"] = re.sub("^-*", "", argument["name"]) return argument + def read_config(path: str) -> dict[str, Any]: - with open(path, 'r') as f: + with open(path, "r") as f: config = yaml.safe_load(f) - + config["arguments"] = [ - clean_arg(arg) - for grp in config["argument_groups"] - for arg in grp["arguments"] + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] ] - + return config + def strip_margin(text: str) -> str: - return re.sub('(\n?)[ \t]*\|', '\\1', text) + return re.sub("(\n?)[ \t]*\|", "\\1", text) + def process_params(par: dict[str, Any], config, temp_dir: str) -> str: # check input parameters - assert par["reads"] or par["reads_atac"], "Pass at least one set of inputs to --reads or --reads_atac." + assert ( + par["reads"] or par["reads_atac"] + ), "Pass at least one set of inputs to --reads or --reads_atac." # output to temp dir if output_dir was not passed if not par["output_dir"]: @@ -121,31 +127,40 @@ def process_params(par: dict[str, Any], config, temp_dir: str) -> str: # checking sample prefix if par["run_name"] and re.match("[^A-Za-z0-9]", par["run_name"]): - print("--run_name should only consist of letters, numbers or hyphens. Replacing all '[^A-Za-z0-9]' with '-'.", flush=True) + print( + "--run_name should only consist of letters, numbers or hyphens. 
Replacing all '[^A-Za-z0-9]' with '-'.", + flush=True, + ) par["run_name"] = re.sub("[^A-Za-z0-9\\-]", "-", par["run_name"]) # make paths absolute for argument in config["arguments"]: if par[argument["clean_name"]] and argument["type"] == "file": if isinstance(par[argument["clean_name"]], list): - par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ] + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] else: - par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]]) - + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + return par + def generate_config(par: dict[str, Any], config) -> str: - content_list = [strip_margin(f"""\ + content_list = [ + strip_margin("""\ |#!/usr/bin/env cwl-runner | |cwl:tool: rhapsody - |""")] + |""") + ] for argument in config["arguments"]: arg_info = argument.get("info") or {} config_key = arg_info.get("config_key") if par[argument["clean_name"]] and config_key: - if argument["type"] == "file": str = strip_margin(f"""\ |{config_key}: @@ -163,47 +178,62 @@ def generate_config(par: dict[str, Any], config) -> str: |""") content_list.append(str) else: - content_list.append(strip_margin(f"""\ + content_list.append( + strip_margin(f"""\ |{config_key}: {par[argument["clean_name"]]} - |""")) + |""") + ) ## Write config to file - return ''.join(content_list) + return "".join(content_list) + -def generate_config_file(par: dict[str, Any], config: dict[str, Any], temp_dir: str) -> str: +def generate_config_file( + par: dict[str, Any], config: dict[str, Any], temp_dir: str +) -> str: config_file = os.path.join(temp_dir, "config.yml") config_content = generate_config(par, config) with open(config_file, "w") as f: f.write(config_content) return config_file + def generate_cwl_file(meta: dict[str, Any], dir: str) -> str: # create cwl file (if need be) - orig_cwl_file=os.path.join(meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl") + orig_cwl_file = os.path.join( + meta["resources_dir"], "rhapsody_pipeline_2.2.1_nodocker.cwl" + ) # Inject computational requirements into pipeline if meta["memory_mb"] or meta["cpus"]: cwl_file = os.path.join(dir, "pipeline.cwl") # Read in the file - with open(orig_cwl_file, 'r') as file : + with open(orig_cwl_file, "r") as file: cwl_data = file.read() # Inject computational requirements into pipeline if meta["memory_mb"]: - memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS - cwl_data = re.sub('"ramMin": [^\n]*[^,](,?)\n', f'"ramMin": {memory}\\1\n', cwl_data) + memory = int(meta["memory_mb"]) - 2000 # keep 2gb for OS + cwl_data = re.sub( + '"ramMin": [^\n]*[^,](,?)\n', f'"ramMin": {memory}\\1\n', cwl_data + ) if meta["cpus"]: - cwl_data = re.sub('"coresMin": [^\n]*[^,](,?)\n', f'"coresMin": {meta["cpus"]}\\1\n', cwl_data) + cwl_data = re.sub( + '"coresMin": [^\n]*[^,](,?)\n', + f'"coresMin": {meta["cpus"]}\\1\n', + cwl_data, + ) # Write the file out again - with open(cwl_file, 'w') as file: + with open(cwl_file, "w") as file: file.write(cwl_data) else: cwl_file = orig_cwl_file return cwl_file + def copy_outputs(par: dict[str, Any], config: dict[str, Any]): for arg in config["arguments"]: par_value = par[arg["clean_name"]] @@ -211,16 +241,21 @@ def copy_outputs(par: dict[str, Any], config: dict[str, Any]): # example template: '[sample_name]_(assay)_cell_type_experimental.csv' template = (arg.get("info") or {}).get("template") if template: - template_glob = template\ - .replace("sample", par["run_name"])\ 
- .replace("assay", "*")\ + template_glob = ( + template.replace("sample", par["run_name"]) + .replace("assay", "*") .replace("number", "*") + ) files = glob.glob(os.path.join(par["output_dir"], template_glob)) if len(files) == 0 and arg["required"]: - raise ValueError(f"Expected output file '{template_glob}' not found.") + raise ValueError( + f"Expected output file '{template_glob}' not found." + ) elif len(files) > 1 and not arg["multiple"]: - raise ValueError(f"Expected single output file '{template_glob}', but found multiple.") - + raise ValueError( + f"Expected single output file '{template_glob}', but found multiple." + ) + if not arg["multiple"]: try: shutil.copy(files[0], par_value) @@ -235,7 +270,7 @@ def copy_outputs(par: dict[str, Any], config: dict[str, Any]): def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): config = read_config(meta["config"]) - + # Preprocess params par = process_params(par, config, temp_dir) @@ -244,7 +279,8 @@ def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): "cwl-runner", "--no-container", "--preserve-entire-environment", - "--outdir", par["output_dir"], + "--outdir", + par["output_dir"], ] if par["parallel"]: @@ -260,7 +296,7 @@ def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): # Create params file config_file = generate_config_file(par, config, temp_dir) cmd.append(config_file) - + # keep environment variables but set TMPDIR to temp_dir env = dict(os.environ) env["TMPDIR"] = temp_dir @@ -270,17 +306,15 @@ def main(par: dict[str, Any], meta: dict[str, Any], temp_dir: str): os.makedirs(par["output_dir"]) # Run command - print("> " + ' '.join(cmd), flush=True) - _ = subprocess.check_call( - cmd, - cwd=os.path.dirname(config_file), - env=env - ) + print("> " + " ".join(cmd), flush=True) + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) # Copy outputs copy_outputs(par, config) if __name__ == "__main__": - with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"]) as temp_dir: + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody-", dir=meta["temp_dir"] + ) as temp_dir: main(par, meta, temp_dir) diff --git a/src/mapping/bd_rhapsody/test.py b/src/mapping/bd_rhapsody/test.py index b392a9dba68..385000f6865 100644 --- a/src/mapping/bd_rhapsody/test.py +++ b/src/mapping/bd_rhapsody/test.py @@ -4,11 +4,11 @@ ## VIASH START meta = { - "name": "bd_rhapsody", - "executable": "target/docker/mapping/bd_rhapsody/bd_rhapsody", - "resources_dir": "src/mapping/bd_rhapsody", - "cpus": 8, - "memory_mb": 4096, + "name": "bd_rhapsody", + "executable": "target/docker/mapping/bd_rhapsody/bd_rhapsody", + "resources_dir": "src/mapping/bd_rhapsody", + "cpus": 8, + "memory_mb": 4096, } # bdabseq_panel_fa = "resources_test/bdrhap_5kjrt/raw/BDAbSeq_ImmuneDiscoveryPanel.fasta" @@ -25,21 +25,23 @@ # Run executable print(f">> Run {meta['name']}", flush=True) output_dir = Path("output") -subprocess.run([ - meta['executable'], - f"--reads={wta_reads}", - f"--reads={abc_reads}", - f"--reference_archive={reference_file}", - f"--abseq_reference={bdabseq_panel_fa}", - "--output_dir=output", - "--exact_cell_count=4900", - "---cpus=2", - "---memory=10gb", - "--output_seurat=seurat.rds", - "--output_mudata=mudata.h5mu", - "--metrics_summary=metrics_summary.csv", - "--pipeline_report=pipeline_report.html", -]) +subprocess.run( + [ + meta["executable"], + f"--reads={wta_reads}", + f"--reads={abc_reads}", + f"--reference_archive={reference_file}", + f"--abseq_reference={bdabseq_panel_fa}", + 
"--output_dir=output", + "--exact_cell_count=4900", + "---cpus=2", + "---memory=10gb", + "--output_seurat=seurat.rds", + "--output_mudata=mudata.h5mu", + "--metrics_summary=metrics_summary.csv", + "--pipeline_report=pipeline_report.html", + ] +) # Check if output exists diff --git a/src/mapping/cellranger_atac_count/test.py b/src/mapping/cellranger_atac_count/test.py index 80743232dac..71c9eb23ff4 100644 --- a/src/mapping/cellranger_atac_count/test.py +++ b/src/mapping/cellranger_atac_count/test.py @@ -1,28 +1,33 @@ import subprocess from os import path import sys -from itertools import zip_longest, chain +from itertools import chain ## VIASH START -meta = { - "name": "cellranger_atac_count", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_atac_count", "resources_dir": "resources_test"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("> Running command with folder") -input = meta["resources_dir"] + "/cellranger_atac_tiny_bcl/fastqs/HJN3KBCX2/test_sample/" -reference = meta["resources_dir"] + "/reference_gencodev41_chr1/reference_cellranger.tar.gz" +input = ( + meta["resources_dir"] + "/cellranger_atac_tiny_bcl/fastqs/HJN3KBCX2/test_sample/" +) +reference = ( + meta["resources_dir"] + "/reference_gencodev41_chr1/reference_cellranger.tar.gz" +) output = "test_output" cmd_pars = [ meta["executable"], - "--input", input, - "--reference", reference, - "--output", output + "--input", + input, + "--reference", + reference, + "--output", + output, ] if meta.get("cpus"): cmd_pars.extend(["---cpus", str(meta["cpus"])]) @@ -44,7 +49,7 @@ input + "test_sample_S1_L001_I1_001.fastq.gz", input + "test_sample_S1_L001_R1_001.fastq.gz", input + "test_sample_S1_L001_R2_001.fastq.gz", - input + "test_sample_S1_L001_R3_001.fastq.gz", + input + "test_sample_S1_L001_R3_001.fastq.gz", ] output = "test_output2" @@ -52,8 +57,10 @@ cmd_pars = [ meta["executable"], *chain.from_iterable([("--input", input_file) for input_file in input_files]), - "--reference", reference, - "--output", output + "--reference", + reference, + "--output", + output, ] if meta.get("cpus"): cmd_pars.extend(["---cpus", str(meta["cpus"])]) @@ -65,4 +72,4 @@ assert path.exists(output + "/filtered_peak_bc_matrix.h5"), "No output was created." assert path.exists(output + "/fragments.tsv.gz"), "No fragments file was created." 
-logger.info("> Completed Successfully!") \ No newline at end of file +logger.info("> Completed Successfully!") diff --git a/src/mapping/cellranger_count/test.py b/src/mapping/cellranger_count/test.py index 06a9ad03a7a..27aaa2624ba 100644 --- a/src/mapping/cellranger_count/test.py +++ b/src/mapping/cellranger_count/test.py @@ -3,106 +3,159 @@ from pathlib import Path ## VIASH START -meta = { - "name": "cellranger_count", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} ## VIASH END input = Path(meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_fastq/") reference = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/" + def test_cellranger_count_with_folder(run_component, random_path): - output = random_path() - run_component([ - "--input", input, - "--reference", reference, - "--output", output, - "--lanes", "1", - ]) + output = random_path() + run_component( + [ + "--input", + input, + "--reference", + reference, + "--output", + output, + "--lanes", + "1", + ] + ) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." def test_cellranger_count_with_fastq_files(run_component, random_path): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." @pytest.mark.parametrize("chemistry", ["auto", "SC3Pv2"]) def test_cellranger_chemistry(run_component, random_path, chemistry): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--chemistry", chemistry, - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--chemistry", + chemistry, + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." + def test_cellranger_no_bam(run_component, random_path): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--generate_bam", "false", - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--generate_bam", + "false", + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." 
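">
Note on the hunk below: besides reformatting, it deletes a duplicated definition of `test_cellranger_no_secondary_analysis` and keeps only the second copy. Python silently rebinds a module-level name on redefinition, so the dropped copy never ran and pytest collected only one test. A minimal sketch of that shadowing behaviour, with hypothetical names:

```python
def check():
    return "first"


def check():  # noqa: F811 -- redefinition of an unused name; ruff reports this as F811
    return "second"


# Only the last definition stays bound, so the first body is unreachable.
assert check() == "second"
```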
-def test_cellranger_no_secondary_analysis(run_component, random_path): - output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--secondary_analysis", "false", - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." def test_cellranger_no_secondary_analysis(run_component, random_path): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--secondary_analysis", "false", - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--secondary_analysis", + "false", + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." + def test_cellranger_exclude_introns(run_component, random_path): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--include_introns", "false", - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--include_introns", + "false", + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." + def test_cellranger_trim_reads(run_component, random_path): output = random_path() - run_component([ - "--input", input / "tinygex_S1_L001_R1_001.fastq.gz", - "--input", input / "tinygex_S1_L001_R2_001.fastq.gz", - "--reference", reference, - "--output", output, - "--r1_length", "100", - "--r2_length", "100", - ]) - assert (output / "filtered_feature_bc_matrix.h5").is_file(), "No output was created." - -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) + run_component( + [ + "--input", + input / "tinygex_S1_L001_R1_001.fastq.gz", + "--input", + input / "tinygex_S1_L001_R2_001.fastq.gz", + "--reference", + reference, + "--output", + output, + "--r1_length", + "100", + "--r2_length", + "100", + ] + ) + assert ( + output / "filtered_feature_bc_matrix.h5" + ).is_file(), "No output was created." + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/mapping/cellranger_multi/script.py b/src/mapping/cellranger_multi/script.py index 953a10d5ae2..15b6fac7892 100644 --- a/src/mapping/cellranger_multi/script.py +++ b/src/mapping/cellranger_multi/script.py @@ -15,54 +15,59 @@ ## VIASH START # The following code has been auto-generated by Viash. 
par = { - 'output': './cellranger_test_output', - 'input': ['resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz', - 'resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz', - 'resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz', - 'resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz', - 'resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz', - 'resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz'], - 'library_id': ['5k_human_antiCMV_T_TBNK_connect_GEX_1_subset', - '5k_human_antiCMV_T_TBNK_connect_AB_subset', - '5k_human_antiCMV_T_TBNK_connect_VDJ_subset'], - 'library_type': ['Gene Expression', 'Antibody Capture', 'VDJ'], - 'gex_input': None, - 'abc_input': None, - 'cgc_input': None, - 'mux_input': None, - 'vdj_input': None, - 'vdj_t_input': None, - 'vdj_t_gd_input': None, - 'vdj_b_input': None, - 'agc_input': None, - 'library_lanes': None, - 'library_subsample': None, - 'gex_expect_cells': None, - 'gex_chemistry': 'auto', - 'gex_secondary_analysis': False, - 'gex_generate_bam': False, - 'gex_include_introns': False, - 'cell_multiplex_sample_id': None, - 'cell_multiplex_oligo_ids': None, - 'cell_multiplex_description': None, - 'dryrun': False + "output": "./cellranger_test_output", + "input": [ + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz", + "resources_test/10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz", + ], + "library_id": [ + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "5k_human_antiCMV_T_TBNK_connect_AB_subset", + "5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + ], + "library_type": ["Gene Expression", "Antibody Capture", "VDJ"], + "gex_input": None, + "abc_input": None, + "cgc_input": None, + "mux_input": None, + "vdj_input": None, + "vdj_t_input": None, + "vdj_t_gd_input": None, + "vdj_b_input": None, + "agc_input": None, + "library_lanes": None, + "library_subsample": None, + "gex_expect_cells": None, + "gex_chemistry": "auto", + "gex_secondary_analysis": False, + "gex_generate_bam": False, + "gex_include_introns": False, + "cell_multiplex_sample_id": None, + "cell_multiplex_oligo_ids": None, + "cell_multiplex_description": None, + "dryrun": False, } meta = { - 'cpus': 10, - 'memory_b': None, - 'memory_kb': None, - 'memory_mb': None, - 'memory_gb': 15, - 'memory_tb': None, - 'memory_pb': None, - 'temp_dir': '/tmp', - 'config': './target/docker/mapping/cellranger_multi/.config.vsh.yaml', - 'resources_dir': './resources_test' + "cpus": 10, + "memory_b": None, + "memory_kb": None, + "memory_mb": None, + "memory_gb": 15, + "memory_tb": None, + "memory_pb": None, + "temp_dir": "/tmp", + "config": "./target/docker/mapping/cellranger_multi/.config.vsh.yaml", + "resources_dir": "./resources_test", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import 
setup_logger + logger = setup_logger() # Tested with cellranger 7.0: @@ -71,7 +76,7 @@ # - replacing `.fastq.` for `.fq.` is NOT allowed # - omitting `.gz` is allowed -fastq_regex = r'^([A-Za-z0-9\-_\.]+)_S(\d+)_(L(\d+)_)?[RI](\d+)_(\d+)\.fastq(\.gz)?$' +fastq_regex = r"^([A-Za-z0-9\-_\.]+)_S(\d+)_(L(\d+)_)?[RI](\d+)_(\d+)\.fastq(\.gz)?$" # assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_L001_R1_001.fastq.gz") is not None # assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq") is not None # assert re.match(fastq_regex, "5k_human_GEX_1_subset_S1_R1_001.fastq.gz.txt") is None @@ -109,11 +114,12 @@ "min_crispr_umi": "min-crispr-umi", } -VDJ_CONFIG_KEYS = {"vdj_reference": "reference", - "vdj_inner_enrichment_primers": "inner-enrichment-primers", - "vdj_r1_length": "r1-length", - "vdj_r2_length": "r2-length", - } +VDJ_CONFIG_KEYS = { + "vdj_reference": "reference", + "vdj_inner_enrichment_primers": "inner-enrichment-primers", + "vdj_r1_length": "r1-length", + "vdj_r2_length": "r2-length", +} ANTIGEN_SPECIFICITY_CONFIG_KEYS = { @@ -129,60 +135,65 @@ "antigen-specificity": (ANTIGEN_SPECIFICITY_CONFIG_KEYS, "columns"), } -LIBRARY_CONFIG_KEYS = {'library_id': 'fastq_id', - 'library_type': 'feature_types', - 'library_subsample': 'subsample_rate', - 'library_lanes': 'lanes', - 'library_chemistry': 'chemistry', - } +LIBRARY_CONFIG_KEYS = { + "library_id": "fastq_id", + "library_type": "feature_types", + "library_subsample": "subsample_rate", + "library_lanes": "lanes", + "library_chemistry": "chemistry", +} -SAMPLE_PARAMS_CONFIG_KEYS = {'sample_ids': 'sample_id', - 'cell_multiplex_oligo_ids': 'cmo_ids', - 'sample_description': 'description', - 'probe_barcode_ids': 'probe_barcode_ids', - 'sample_expect_cells': 'expect_cells', - 'sample_force_cells': 'force_cells'} +SAMPLE_PARAMS_CONFIG_KEYS = { + "sample_ids": "sample_id", + "cell_multiplex_oligo_ids": "cmo_ids", + "sample_description": "description", + "probe_barcode_ids": "probe_barcode_ids", + "sample_expect_cells": "expect_cells", + "sample_force_cells": "force_cells", +} # These are derived from the dictionaries above -REFERENCES = tuple(reference_param for reference_param, cellranger_param - in chain(GEX_CONFIG_KEYS.items(), FEATURE_CONFIG_KEYS.items(), VDJ_CONFIG_KEYS.items()) - if cellranger_param == "reference") +REFERENCES = tuple( + reference_param + for reference_param, cellranger_param in chain( + GEX_CONFIG_KEYS.items(), FEATURE_CONFIG_KEYS.items(), VDJ_CONFIG_KEYS.items() + ) + if cellranger_param == "reference" +) LIBRARY_PARAMS = tuple(LIBRARY_CONFIG_KEYS.keys()) SAMPLE_PARAMS = tuple(SAMPLE_PARAMS_CONFIG_KEYS.keys()) HELPER_INPUT = { - 'gex_input': 'Gene Expression', - 'abc_input': 'Antibody Capture', - 'cgc_input': 'CRISPR Guide Capture', - 'mux_input': 'Multiplexing Capture', - 'vdj_input': 'VDJ', - 'vdj_t_input': 'VDJ-T', - 'vdj_t_gd_input': 'VDJ-T-GD', - 'vdj_b_input': 'VDJ-B', - 'agc_input': 'Antigen Capture' + "gex_input": "Gene Expression", + "abc_input": "Antibody Capture", + "cgc_input": "CRISPR Guide Capture", + "mux_input": "Multiplexing Capture", + "vdj_input": "VDJ", + "vdj_t_input": "VDJ-T", + "vdj_t_gd_input": "VDJ-T-GD", + "vdj_b_input": "VDJ-B", + "agc_input": "Antigen Capture", } def infer_library_id_from_path(input_path: str) -> str: match = re.match(fastq_regex, input_path) - assert match is not None, \ - f"File name of '{input_path}' should match regex {fastq_regex}." + assert ( + match is not None + ), f"File name of '{input_path}' should match regex {fastq_regex}." 
return match.group(1) + def transform_helper_inputs(par: dict[str, Any]) -> dict[str, Any]: - helper_input = { - "input": [], - "library_id": [], - "library_type": [] - } + helper_input = {"input": [], "library_id": [], "library_type": []} for input_type, library_type in HELPER_INPUT.items(): if par[input_type]: par[input_type] = resolve_input_directories_to_fastq_paths(par[input_type]) library_ids = [ infer_library_id_from_path(path.name) for path in par[input_type] - ] + ] library_id_dict = {} for fastq, library_id in zip(par[input_type], library_ids): @@ -193,49 +204,71 @@ def transform_helper_inputs(par: dict[str, Any]) -> dict[str, Any]: helper_input["library_id"].append(library_id) helper_input["library_type"].append(library_type) - assert len(helper_input["library_id"]) == len(set(helper_input["library_id"])), "File names passed to feature type-specific inputs must be unique" + assert len(helper_input["library_id"]) == len( + set(helper_input["library_id"]) + ), "File names passed to feature type-specific inputs must be unique" return helper_input + def lengths_gt1(dic: dict[str, Optional[list[Any]]]) -> dict[str, int]: - return {key: len(li) for key, li in dic.items() - if li is not None and isinstance(li, (list, tuple, set))} + return { + key: len(li) + for key, li in dic.items() + if li is not None and isinstance(li, (list, tuple, set)) + } + def strip_margin(text: str) -> str: - return re.sub('(\n?)[ \t]*\|', '\\1', text) + return re.sub("(\n?)[ \t]*\|", "\\1", text) -def subset_dict(dictionary: dict[str, str], - keys: Union[dict[str, str], list[str]]) -> dict[str, str]: + +def subset_dict( + dictionary: dict[str, str], keys: Union[dict[str, str], list[str]] +) -> dict[str, str]: if isinstance(keys, (list, tuple)): keys = {key: key for key in keys} - return {dest_key: dictionary[orig_key] - for orig_key, dest_key in keys.items() - if dictionary[orig_key] is not None} + return { + dest_key: dictionary[orig_key] + for orig_key, dest_key in keys.items() + if dictionary[orig_key] is not None + } + -def check_subset_dict_equal_length(group_name: str, - dictionary: dict[str, list[str]]) -> None: +def check_subset_dict_equal_length( + group_name: str, dictionary: dict[str, list[str]] +) -> None: lens = lengths_gt1(dictionary) - assert len(set(lens.values())) <= 1, f"The number of values passed to {group_name} "\ - f"arguments must be 0, 1 or all the same. Offenders: {lens}" + assert len(set(lens.values())) <= 1, ( + f"The number of values passed to {group_name} " + f"arguments must be 0, 1 or all the same. Offenders: {lens}" + ) -def resolve_input_directories_to_fastq_paths(input_paths: list[str]) -> list[Path]: +def resolve_input_directories_to_fastq_paths(input_paths: list[str]) -> list[Path]: input_paths = [Path(fastq) for fastq in input_paths] if len(input_paths) == 1 and input_paths[0].is_dir(): - logger.info("Detected a directory in input paths, " - "traversing to see if we can detect any FASTQ files.") - input_paths = [input_path for input_path in input_paths[0].rglob('*') - if re.match(fastq_regex, input_path.name) ] + logger.info( + "Detected a directory in input paths, " + "traversing to see if we can detect any FASTQ files." + ) + input_paths = [ + input_path + for input_path in input_paths[0].rglob("*") + if re.match(fastq_regex, input_path.name) + ] # check input fastq files for input_path in input_paths: - assert re.match(fastq_regex, input_path.name) is not None, \ - f"File name of --input '{input_path}' should match regex {fastq_regex}." 
+ assert ( + re.match(fastq_regex, input_path.name) is not None + ), f"File name of --input '{input_path}' should match regex {fastq_regex}." return input_paths + def make_paths_absolute(par: dict[str, Any], config: Path | str): - with open(config, 'r', encoding="utf-8") as open_viash_config: + with open(config, "r", encoding="utf-8") as open_viash_config: config = yaml.safe_load(open_viash_config) arguments = { @@ -248,19 +281,25 @@ def make_paths_absolute(par: dict[str, Any], config: Path | str): continue par_value, is_multiple = par[arg_name], arg["multiple"] assert is_multiple in (True, False) + def make_path_absolute(file: str | Path) -> Path: - logger.info('Making path %s absolute', file) + logger.info("Making path %s absolute", file) return Path(file).resolve() - - new_arg = [make_path_absolute(file) for file in par_value] if is_multiple else make_path_absolute(par_value) + + new_arg = ( + [make_path_absolute(file) for file in par_value] + if is_multiple + else make_path_absolute(par_value) + ) par[arg_name] = new_arg return par + def handle_integers_not_set(par: dict[str, Any], viash_config: Path | str) -> str: """ Allow to use `-1` to define a 'not set' value for arguments of `type: integer` with `multiple: true`. """ - with open(viash_config, 'r', encoding="utf-8") as open_viash_config: + with open(viash_config, "r", encoding="utf-8") as open_viash_config: config = yaml.safe_load(open_viash_config) arguments = { @@ -279,18 +318,24 @@ def handle_integers_not_set(par: dict[str, Any], viash_config: Path | str) -> st def replace_notset_values(integer_value: int) -> int | None: return None if integer_value == -1 else integer_value - + # Use an extension array to handle "None" values, otherwise int + NA # values would be converted to a "float" dtype - new_arg = pd.array([replace_notset_values(value) for value in par_value], dtype="Int64") + new_arg = pd.array( + [replace_notset_values(value) for value in par_value], dtype="Int64" + ) par[arg_name] = new_arg return par -def process_params(par: dict[str, Any], viash_config: Path | str) -> str: +def process_params(par: dict[str, Any], viash_config: Path | str) -> str: if par["input"]: - assert len(par["library_type"]) > 0, "--library_type must be defined when passing input to --input" - assert len(par["library_id"]) > 0, "--library_id must be defined when passing input to --input" + assert ( + len(par["library_type"]) > 0 + ), "--library_type must be defined when passing input to --input" + assert ( + len(par["library_id"]) > 0 + ), "--library_id must be defined when passing input to --input" # if par["input"] is a directory, look for fastq files par["input"] = resolve_input_directories_to_fastq_paths(par["input"]) @@ -298,9 +343,11 @@ def process_params(par: dict[str, Any], viash_config: Path | str) -> str: # add helper input helper_input = transform_helper_inputs(par) for key in ["input", "library_id", "library_type"]: - par[key] = (par[key] if par[key] else []) + helper_input[key] + par[key] = (par[key] if par[key] else []) + helper_input[key] - assert len(par[key]) > 0, f"Either --{key} or feature type-specific input (e.g. --gex_input, --abc_input, ...) must be defined" + assert ( + len(par[key]) > 0 + ), f"Either --{key} or feature type-specific input (e.g. --gex_input, --abc_input, ...) 
must be defined" # check lengths of libraries metadata library_dict = subset_dict(par, LIBRARY_PARAMS) @@ -321,9 +368,9 @@ def generate_csv_category(name: str, args: dict[str, str], orient: str) -> list[ assert orient in ("index", "columns") if not args: return [] - title = [ f'[{name}]' ] + title = [f"[{name}]"] # Which index to include in csv section is based on orientation - to_csv_args = {"index": (orient=="index"), "header": (orient=="columns")} + to_csv_args = {"index": (orient == "index"), "header": (orient == "columns")} values = [pd.DataFrame.from_dict(args, orient=orient).to_csv(**to_csv_args).strip()] return title + values + [""] @@ -332,49 +379,66 @@ def generate_config(par: dict[str, Any], fastq_dir: str) -> str: content_list = [] par["fastqs"] = fastq_dir libraries = dict(LIBRARY_CONFIG_KEYS, **{"fastqs": "fastqs"}) - #TODO: use the union (|) operator when python is updated to 3.9 - all_sections = REFERENCE_SECTIONS | {"libraries": (libraries, "columns"), - "samples": (SAMPLE_PARAMS_CONFIG_KEYS, "columns")} + # TODO: use the union (|) operator when python is updated to 3.9 + all_sections = REFERENCE_SECTIONS | { + "libraries": (libraries, "columns"), + "samples": (SAMPLE_PARAMS_CONFIG_KEYS, "columns"), + } for section_name, (section_params, orientation) in all_sections.items(): reference_pars = subset_dict(par, section_params) - content_list += generate_csv_category(section_name, reference_pars, orient=orientation) + content_list += generate_csv_category( + section_name, reference_pars, orient=orientation + ) + + return "\n".join(content_list) - return '\n'.join(content_list) def main(par: dict[str, Any], meta: dict[str, Any]): logger.info(" Processing params") - par = process_params(par, meta['config']) + par = process_params(par, meta["config"]) logger.info(par) # TODO: throw error or else Cell Ranger will - with tempfile.TemporaryDirectory(prefix="cellranger_multi-", - dir=meta["temp_dir"]) as temp_dir: + with tempfile.TemporaryDirectory( + prefix="cellranger_multi-", dir=meta["temp_dir"] + ) as temp_dir: temp_dir_path = Path(temp_dir) for reference_par_name in REFERENCES: reference = par[reference_par_name] - logger.info('Looking at %s to check if it needs decompressing', reference) - if reference and Path(reference).is_file() and tarfile.is_tarfile(reference): - extaction_dir_name = Path(reference.stem).stem # Remove two extensions (if they exist) + logger.info("Looking at %s to check if it needs decompressing", reference) + if ( + reference + and Path(reference).is_file() + and tarfile.is_tarfile(reference) + ): + extaction_dir_name = Path( + reference.stem + ).stem # Remove two extensions (if they exist) unpacked_directory = temp_dir_path / extaction_dir_name - logger.info('Extracting %s to %s', reference, unpacked_directory) + logger.info("Extracting %s to %s", reference, unpacked_directory) - with tarfile.open(reference, 'r') as open_tar: + with tarfile.open(reference, "r") as open_tar: members = open_tar.getmembers() - root_dirs = [member for member in members if member.isdir() - and member.name != '.' and '/' not in member.name] + root_dirs = [ + member + for member in members + if member.isdir() + and member.name != "." 
+ and "/" not in member.name + ] # if there is only one root_dir (and there are files in that directory) # strip that directory name from the destination folder if len(root_dirs) == 1: for mem in members: mem.path = Path(*Path(mem.path).parts[1:]) - members_to_move = [mem for mem in members if mem.path != Path('.')] + members_to_move = [mem for mem in members if mem.path != Path(".")] open_tar.extractall(unpacked_directory, members=members_to_move) par[reference_par_name] = unpacked_directory # Creating symlinks of fastq files to tempdir input_symlinks_dir = temp_dir_path / "input_symlinks" input_symlinks_dir.mkdir() - for fastq in par['input']: + for fastq in par["input"]: destination = input_symlinks_dir / fastq.name destination.symlink_to(fastq) @@ -382,12 +446,12 @@ def main(par: dict[str, Any], meta: dict[str, Any]): config_content = generate_config(par, input_symlinks_dir) logger.info(" Creating Cell Ranger argument") - temp_id="run" - proc_pars=["--disable-ui", "--id", temp_id] + temp_id = "run" + proc_pars = ["--disable-ui", "--id", temp_id] command_line_parameters = { - "--localcores": meta['cpus'], - "--localmem": int(meta['memory_gb']) - 2 if meta['memory_gb'] else None, + "--localcores": meta["cpus"], + "--localmem": int(meta["memory_gb"]) - 2 if meta["memory_gb"] else None, } for param, param_value in command_line_parameters.items(): if param_value: @@ -395,17 +459,17 @@ def main(par: dict[str, Any], meta: dict[str, Any]): ## Run pipeline if par["dryrun"]: - par['output'].mkdir(parents=True, exist_ok=True) + par["output"].mkdir(parents=True, exist_ok=True) # write config file - config_file = par['output'] / "config.csv" + config_file = par["output"] / "config.csv" with open(config_file, "w") as f: f.write(config_content) proc_pars.append(f"--csv={config_file}") # display command that would've been used cmd = ["cellranger multi"] + proc_pars + ["--csv=config.csv"] - logger.info("> " + ' '.join(cmd)) + logger.info("> " + " ".join(cmd)) else: # write config file to execution directory config_file = temp_dir_path / "config.csv" @@ -414,27 +478,24 @@ def main(par: dict[str, Any], meta: dict[str, Any]): proc_pars.append(f"--csv={config_file}") # Already copy config file to output directory - par['output'].mkdir(parents=True, exist_ok=True) - with (par['output'] / "config.csv").open('w') as open_config: + par["output"].mkdir(parents=True, exist_ok=True) + with (par["output"] / "config.csv").open("w") as open_config: open_config.write(config_content) # run process cmd = ["cellranger", "multi"] + proc_pars - logger.info("> " + ' '.join(cmd)) + logger.info("> " + " ".join(cmd)) process_output = subprocess.run( - cmd, - cwd=temp_dir, - check=False, - capture_output=True + cmd, cwd=temp_dir, check=False, capture_output=True ) - with (par["output"] / "cellranger_multi.log").open('w') as open_log: - open_log.write(process_output.stdout.decode('utf-8')) + with (par["output"] / "cellranger_multi.log").open("w") as open_log: + open_log.write(process_output.stdout.decode("utf-8")) try: process_output.check_returncode() except subprocess.CalledProcessError as e: - logger.error(e.output.decode('utf-8')) - print(e.output.decode('utf-8'), flush=True) + logger.error(e.output.decode("utf-8")) + print(e.output.decode("utf-8"), flush=True) raise e # look for output dir file @@ -449,9 +510,10 @@ def main(par: dict[str, Any], meta: dict[str, Any]): if not type_func(output_path): raise ValueError(f"Could not find expected '{output_path}'") - for output_path in tmp_output_dir.rglob('*'): - if 
output_path.name != "config.csv": # Already created - shutil.move(str(output_path), par['output']) + for output_path in tmp_output_dir.rglob("*"): + if output_path.name != "config.csv": # Already created + shutil.move(str(output_path), par["output"]) + if __name__ == "__main__": - main(par, meta) \ No newline at end of file + main(par, meta) diff --git a/src/mapping/cellranger_multi/test.py b/src/mapping/cellranger_multi/test.py index 8abc0a23c29..ec5545d877a 100644 --- a/src/mapping/cellranger_multi/test.py +++ b/src/mapping/cellranger_multi/test.py @@ -8,11 +8,11 @@ ## VIASH START meta = { - 'executable': './target/docker/mapping/cellranger_multi/cellranger_multi', - 'resources_dir': 'resources_test/', - 'cpus': 15, - 'memory_gb': 20, - 'config': 'src/mapping/cellranger_multi/config.vsh.yaml' + "executable": "./target/docker/mapping/cellranger_multi/cellranger_multi", + "resources_dir": "resources_test/", + "cpus": 15, + "memory_gb": 20, + "config": "src/mapping/cellranger_multi/config.vsh.yaml", } ## VIASH END @@ -23,7 +23,7 @@ def make_path_relative(some_path): try: return absolute_input_path.relative_to(absolute_cwd) except ValueError as e: - # TODO: python 3.12: remove lines below and add walk_up=True to `relative_to` call + # TODO: python 3.12: remove lines below and add walk_up=True to `relative_to` call if "is not in the subpath of" in str(e): _, *parts_input = absolute_input_path.parts _, *parts_cwd = absolute_cwd.parts @@ -33,53 +33,111 @@ def make_path_relative(some_path): parts_input.pop() parts_cwd.pop() for part in parts_cwd: - if not part or part == '.': + if not part or part == ".": pass else: - parts_input.append('..') - relative_path = type(absolute_input_path)('', *reversed(parts_input)) - assert relative_path.resolve() == absolute_input_path + parts_input.append("..") + relative_path = type(absolute_input_path)("", *reversed(parts_input)) + assert relative_path.resolve() == absolute_input_path return relative_path raise e - + resources_dir = make_path_relative(meta["resources_dir"]) -input1_R1 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz" -input1_R2 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz" -input2_R1 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz" -input2_R2 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz" -input3_R1 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz" -input3_R2 = resources_dir / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz" +input1_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R1_001.fastq.gz" +) +input1_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_L001_R2_001.fastq.gz" +) +input2_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R1_001.fastq.gz" +) +input2_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_L004_R2_001.fastq.gz" +) +input3_R1 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R1_001.fastq.gz" +) +input3_R2 = ( + resources_dir + / "10x_5k_anticmv/raw/5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_L001_R2_001.fastq.gz" +) gex_reference = resources_dir / 
"reference_gencodev41_chr1/reference_cellranger.tar.gz" feature_reference = resources_dir / "10x_5k_anticmv/raw/feature_reference.csv" -vdj_reference = resources_dir / "10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +vdj_reference = ( + resources_dir + / "10x_5k_anticmv/raw/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0.tar.gz" +) # Beam Input -input1_R1_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R1_001.fastq.gz" -input1_R2_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R2_001.fastq.gz" -input2_R1_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R1_001.fastq.gz" -input2_R2_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R2_001.fastq.gz" -input3_R1_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R1_001.fastq.gz" -input3_R2_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R2_001.fastq.gz" -vdj_reference_beam = resources_dir / "10x_5k_beam/raw/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" -feature_reference_beam = resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_feature_reference.csv" +input1_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R1_001.fastq.gz" +) +input1_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_gex_subset_S3_L001_R2_001.fastq.gz" +) +input2_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R1_001.fastq.gz" +) +input2_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_ag_subset_S1_L001_R2_001.fastq.gz" +) +input3_R1_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R1_001.fastq.gz" +) +input3_R2_beam = ( + resources_dir + / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_vdj_subset_S2_L001_R2_001.fastq.gz" +) +vdj_reference_beam = ( + resources_dir + / "10x_5k_beam/raw/5k_BEAM-T_Human_A0201_B0702_PBMC_5pv2_Multiplex_vdj_reference.tar.gz" +) +feature_reference_beam = ( + resources_dir / "10x_5k_beam/raw/beamt_human_A0201_B0702_pbmc_feature_reference.csv" +) + def test_cellranger_multi(run_component, random_path): outputpath = random_path() args = [ - "--output", outputpath, - "--input", input1_R1, - "--input", input1_R2, - "--input", input2_R1, - "--input", input2_R2, - "--input", input3_R1, - "--input", input3_R2, - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", - "--library_type", "Gene Expression;Antibody Capture;VDJ"] + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + ] run_component(args) # check for raw data @@ -89,95 +147,157 @@ def test_cellranger_multi(run_component, random_path): assert 
(outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() # check for filtered gex+ab data - assert (outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for vdj data - assert (outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv").is_file() + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + def test_cellranger_multi_decompressed_reference(run_component, random_path): extracted_tar = random_path() extracted_tar.mkdir() with tarfile.open(gex_reference) as open_tarfile: open_tarfile.extractall(extracted_tar) - run_component([ - "--output", random_path(), - "--input", input1_R1, - "--input", input1_R2, - "--input", input2_R1, - "--input", input2_R2, - "--input", input3_R1, - "--input", input3_R2, - "--gex_reference", extracted_tar, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", - "--library_type", "Gene Expression;Antibody Capture;VDJ", - "--dryrun"]) + run_component( + [ + "--output", + random_path(), + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + extracted_tar, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--dryrun", + ] + ) + def test_cellranger_multi_directory_input(run_component, random_path): - args=[ - "--output", random_path(), - "--input", meta["resources_dir"] + "/10x_5k_anticmv/raw/", - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", - "--library_type", "Gene Expression;Antibody Capture;VDJ", - "--gex_secondary_analysis", "true", - "--gex_generate_bam", "false", - "--gex_include_introns", "false", - "--dryrun"] + args = [ + "--output", + random_path(), + "--input", + meta["resources_dir"] + "/10x_5k_anticmv/raw/", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--dryrun", + ] run_component(args) + def test_vdj_inner_enrichment_primers(run_component, random_path): outputpath = random_path() enrichment_primers_file = random_path("txt") - with enrichment_primers_file.open('w') as primers_file_open: + with enrichment_primers_file.open("w") as primers_file_open: primers_file_open.write("AGTCTCTCAGCTGGTACACG\nTCTGATGGCTCAAACACAGC") - args=[ - "--output", outputpath, - "--input", meta["resources_dir"] + "/10x_5k_anticmv/raw/", - "--gex_reference", gex_reference, - "--vdj_reference", 
vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", - "--library_type", "Gene Expression;Antibody Capture;VDJ", - "--gex_secondary_analysis", "true", - "--gex_generate_bam", "false", - "--gex_include_introns", "false", - "--vdj_inner_enrichment_primers", str(make_path_relative(enrichment_primers_file)), - "--dryrun"] + args = [ + "--output", + outputpath, + "--input", + meta["resources_dir"] + "/10x_5k_anticmv/raw/", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--vdj_inner_enrichment_primers", + str(make_path_relative(enrichment_primers_file)), + "--dryrun", + ] run_component(args) config_path = outputpath / "config.csv" assert config_path.is_file() - with config_path.open('r') as config_file: + with config_path.open("r") as config_file: config_contents = config_file.read() - expected_csv_content = fr"\[vdj\]\nreference,.*?\ninner-enrichment-primers,{enrichment_primers_file.resolve()}\n" + expected_csv_content = rf"\[vdj\]\nreference,.*?\ninner-enrichment-primers,{enrichment_primers_file.resolve()}\n" assert re.search(expected_csv_content, config_contents) + def test_cellranger_multi_applies_gex_options(run_component, random_path): outputpath = random_path() - args=[ - "--output", outputpath, - "--input", input1_R1, - "--input", input1_R2, - "--input", input2_R1, - "--input", input2_R2, - "--input", input3_R1, - "--input", input3_R2, - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", - "--library_type", "Gene Expression;Antibody Capture;VDJ", - "--gex_secondary_analysis", "true", - "--gex_generate_bam", "false", - "--gex_include_introns", "false", - "--dryrun"] + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset;5k_human_antiCMV_T_TBNK_connect_VDJ_subset", + "--library_type", + "Gene Expression;Antibody Capture;VDJ", + "--gex_secondary_analysis", + "true", + "--gex_generate_bam", + "false", + "--gex_include_introns", + "false", + "--dryrun", + ] run_component(args) config_path = outputpath / "config.csv" assert config_path.is_file() - with config_path.open('r') as config_file: + with config_path.open("r") as config_file: config_contents = config_file.read() expected_csv_content = dedent( """\ @@ -185,66 +305,110 @@ def test_cellranger_multi_applies_gex_options(run_component, random_path): no-secondary,False create-bam,False include-introns,False - """) - print (expected_csv_content, flush=True) + """ + ) + print(expected_csv_content, flush=True) assert 
expected_csv_content in config_contents + def test_cellranger_multi_no_vdj_reference(run_component, random_path): # GH291 outputpath = random_path() - args=[ - "--output", outputpath, - "--input", input1_R1, - "--input", input1_R2, - "--input", input2_R1, - "--input", input2_R2, - "--input", input3_R1, - "--input", input3_R2, - "--gex_reference", gex_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset", - "--library_type", "Gene Expression;Antibody Capture", - "--dryrun"] + args = [ + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--input", + input2_R1, + "--input", + input2_R2, + "--input", + input3_R1, + "--input", + input3_R2, + "--gex_reference", + gex_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset;5k_human_antiCMV_T_TBNK_connect_AB_subset", + "--library_type", + "Gene Expression;Antibody Capture", + "--dryrun", + ] run_component(args) assert (outputpath / "config.csv").is_file() + def test_cellranger_multi_crispr_data(run_component, random_path): outputpath = random_path() args = [ - "--input", meta["resources_dir"] + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R1_001.fastq.gz", - "--input", meta["resources_dir"] + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R2_001.fastq.gz", - "--input", meta["resources_dir"] + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R1_001.fastq.gz", - "--input", meta["resources_dir"] + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R2_001.fastq.gz", - "--library_id", "SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset;SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset", - "--library_type", "Gene Expression;CRISPR Guide Capture", - "--min_crispr_umi", "3", - "--gex_reference", gex_reference, - "--feature_reference", meta["resources_dir"] + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv", - "--output", outputpath + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R1_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset_S5_L001_R2_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R1_001.fastq.gz", + "--input", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset_S4_L001_R2_001.fastq.gz", + "--library_id", + "SC3_v3_NextGem_DI_CRISPR_A549_5K_gex_subset;SC3_v3_NextGem_DI_CRISPR_A549_5K_crispr_subset", + "--library_type", + "Gene Expression;CRISPR Guide Capture", + "--min_crispr_umi", + "3", + "--gex_reference", + gex_reference, + "--feature_reference", + meta["resources_dir"] + + "/10x_5k_lung_crispr/raw/SC3_v3_NextGem_DI_CRISPR_A549_5K_Multiplex_count_feature_reference_corrected.csv", + "--output", + outputpath, ] run_component(args) # check for raw data - assert ( outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() # check for metrics summary assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() # check for filtered gex+ab data - assert (outputpath / 
"per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for crispr data assert (outputpath / "per_sample_outs/run/count/crispr_analysis/").is_dir() + def test_cellranger_multi_helper_input(run_component, random_path): outputpath = random_path() args = [ - "--output", outputpath, - "--gex_input", input1_R1, - "--gex_input", input1_R2, - "--abc_input", input2_R1, - "--abc_input", input2_R2, - "--vdj_input", input3_R1, - "--vdj_input", input3_R2, - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference] + "--output", + outputpath, + "--gex_input", + input1_R1, + "--gex_input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + ] run_component(args) # check for raw data @@ -254,26 +418,44 @@ def test_cellranger_multi_helper_input(run_component, random_path): assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() # check for filtered gex+ab data - assert (outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for vdj data - assert (outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv").is_file() + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + def test_cellranger_multi_combined_helper_and_global_input(run_component, random_path): outputpath = random_path() args = [ - "--output", outputpath, - "--input", input1_R1, - "--input", input1_R2, - "--abc_input", input2_R1, - "--abc_input", input2_R2, - "--vdj_input", input3_R1, - "--vdj_input", input3_R2, - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", - "--library_type", "Gene Expression"] + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] run_component(args) # check for raw data @@ -283,58 +465,92 @@ def test_cellranger_multi_combined_helper_and_global_input(run_component, random assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() # check for filtered gex+ab data - assert (outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for vdj data - assert (outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv").is_file() - - + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + + def test_cellranger_multi_create_output_on_fail(run_component, random_path): outputpath = random_path() # missing vdj_reference args = [ - "--output", outputpath, - "--input", input1_R1, - "--input", input1_R2, - 
"--abc_input", input2_R1, - "--abc_input", input2_R2, - "--vdj_input", input3_R1, - "--vdj_input", input3_R2, - "--gex_reference", gex_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", - "--library_type", "Gene Expression"] - - with pytest.raises(subprocess.CalledProcessError) as e: + "--output", + outputpath, + "--input", + input1_R1, + "--input", + input1_R2, + "--abc_input", + input2_R1, + "--abc_input", + input2_R2, + "--vdj_input", + input3_R1, + "--vdj_input", + input3_R2, + "--gex_reference", + gex_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] + + with pytest.raises(subprocess.CalledProcessError): run_component(args) - - assert (outputpath / "cellranger_multi.log").is_file(), "Should have created log file." + + assert ( + outputpath / "cellranger_multi.log" + ).is_file(), "Should have created log file." def test_cellranger_multi_beam_data(run_component, random_path): outputpath = random_path() args = [ - "--input", input1_R1_beam, - "--input", input1_R2_beam, - "--input", input2_R1_beam, - "--input", input2_R2_beam, - "--input", input3_R1_beam, - "--input", input3_R2_beam, - "--library_id", "beamt_human_A0201_B0702_pbmc_gex_subset;beamt_human_A0201_B0702_pbmc_ag_subset;beamt_human_A0201_B0702_pbmc_vdj_subset", - "--library_type", "Gene Expression;VDJ-T;Antigen Capture", - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference_beam, - "--feature_reference", feature_reference_beam, - "--output", outputpath, - "--control_id", "negative_control_A0201;negative_control_B0702", - "--mhc_allele", "HLA-A*02:01;HLA-B*07:02" + "--input", + input1_R1_beam, + "--input", + input1_R2_beam, + "--input", + input2_R1_beam, + "--input", + input2_R2_beam, + "--input", + input3_R1_beam, + "--input", + input3_R2_beam, + "--library_id", + "beamt_human_A0201_B0702_pbmc_gex_subset;beamt_human_A0201_B0702_pbmc_ag_subset;beamt_human_A0201_B0702_pbmc_vdj_subset", + "--library_type", + "Gene Expression;VDJ-T;Antigen Capture", + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference_beam, + "--feature_reference", + feature_reference_beam, + "--output", + outputpath, + "--control_id", + "negative_control_A0201;negative_control_B0702", + "--mhc_allele", + "HLA-A*02:01;HLA-B*07:02", ] run_component(args) # check for raw data - assert ( outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() + assert (outputpath / "multi/count/raw_feature_bc_matrix.h5").is_file() # check for metrics summary assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() - assert (outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for antigen data assert (outputpath / "per_sample_outs/run/antigen_analysis/").is_dir() # check for vdj data @@ -344,18 +560,30 @@ def test_cellranger_multi_beam_data(run_component, random_path): def test_cellranger_multi_fixed_rna(run_component, random_path): outputpath = random_path() args = [ - "--input", f"{meta['resources_dir']}/10x_5k_fixed/raw/", - "--library_id", "4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_subset", - "--library_type", "Gene Expression", - "--feature_reference", f"{meta['resources_dir']}/10x_5k_fixed/raw/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv", - 
"--gex_reference", gex_reference, - "--output", outputpath, - "--probe_barcode_ids", "BC001;BC002;BC003;BC004", - "--sample_ids", "Liver_BC1;Ovarian_BC2;Colorectal_BC3;Pancreas_BC4", - "--gex_generate_bam", "false", - "--library_lanes", "any", - "--probe_set", f"{meta['resources_dir']}/10x_5k_fixed/raw/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv", - "--sample_force_cells", "5000;-1;-1;-1" + "--input", + f"{meta['resources_dir']}/10x_5k_fixed/raw/", + "--library_id", + "4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_subset", + "--library_type", + "Gene Expression", + "--feature_reference", + f"{meta['resources_dir']}/10x_5k_fixed/raw/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv", + "--gex_reference", + gex_reference, + "--output", + outputpath, + "--probe_barcode_ids", + "BC001;BC002;BC003;BC004", + "--sample_ids", + "Liver_BC1;Ovarian_BC2;Colorectal_BC3;Pancreas_BC4", + "--gex_generate_bam", + "false", + "--library_lanes", + "any", + "--probe_set", + f"{meta['resources_dir']}/10x_5k_fixed/raw/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv", + "--sample_force_cells", + "5000;-1;-1;-1", ] run_component(args) # check for raw data @@ -363,7 +591,10 @@ def test_cellranger_multi_fixed_rna(run_component, random_path): # check for metrics summary for sample in ["Liver_BC1", "Ovarian_BC2", "Colorectal_BC3", "Pancreas_BC4"]: assert (outputpath / f"per_sample_outs/{sample}/metrics_summary.csv").is_file() - assert (outputpath / f"per_sample_outs/{sample}/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath + / f"per_sample_outs/{sample}/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() assert (outputpath / "multi/multiplexing_analysis").is_dir() @@ -377,12 +608,24 @@ def test_cellranger_multi_with_alternative_names(run_component, random_path): # Note: if one input file does not use any lanes, none of the input files should use lanes # remove lanes - input1_R1_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R1_001.fastq.gz" - input1_R2_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R2_001.fastq.gz" - input2_R1_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R1_001.fastq.gz" - input2_R2_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R2_001.fastq.gz" - input3_R1_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R1_001.fastq" - input3_R2_link = input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R2_001.fastq" + input1_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R1_001.fastq.gz" + ) + input1_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset_S1_R2_001.fastq.gz" + ) + input2_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R1_001.fastq.gz" + ) + input2_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_AB_subset_S2_R2_001.fastq.gz" + ) + input3_R1_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R1_001.fastq" + ) + input3_R2_link = ( + input_dir / "5k_human_antiCMV_T_TBNK_connect_VDJ_subset_S1_R2_001.fastq" + ) # copy files shutil.copy(input1_R1, input1_R1_link) @@ -390,27 +633,40 @@ def test_cellranger_multi_with_alternative_names(run_component, random_path): shutil.copy(input2_R1, input2_R1_link) shutil.copy(input2_R2, input2_R2_link) - with gzip.open(input3_R1, 'rb') as f_in: - with open(input3_R1_link, 'wb') as f_out: + with gzip.open(input3_R1, "rb") as f_in: + with 
open(input3_R1_link, "wb") as f_out: shutil.copyfileobj(f_in, f_out) - with gzip.open(input3_R2, 'rb') as f_in: - with open(input3_R2_link, 'wb') as f_out: + with gzip.open(input3_R2, "rb") as f_in: + with open(input3_R2_link, "wb") as f_out: shutil.copyfileobj(f_in, f_out) outputpath = random_path() args = [ - "--output", outputpath, - "--input", input1_R1_link, - "--input", input1_R2_link, - "--abc_input", input2_R1_link, - "--abc_input", input2_R2_link, - "--vdj_input", input3_R1_link, - "--vdj_input", input3_R2_link, - "--gex_reference", gex_reference, - "--vdj_reference", vdj_reference, - "--feature_reference", feature_reference, - "--library_id", "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", - "--library_type", "Gene Expression"] + "--output", + outputpath, + "--input", + input1_R1_link, + "--input", + input1_R2_link, + "--abc_input", + input2_R1_link, + "--abc_input", + input2_R2_link, + "--vdj_input", + input3_R1_link, + "--vdj_input", + input3_R2_link, + "--gex_reference", + gex_reference, + "--vdj_reference", + vdj_reference, + "--feature_reference", + feature_reference, + "--library_id", + "5k_human_antiCMV_T_TBNK_connect_GEX_1_subset", + "--library_type", + "Gene Expression", + ] run_component(args) # check for raw data @@ -420,10 +676,15 @@ def test_cellranger_multi_with_alternative_names(run_component, random_path): assert (outputpath / "per_sample_outs/run/metrics_summary.csv").is_file() # check for filtered gex+ab data - assert (outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5").is_file() + assert ( + outputpath / "per_sample_outs/run/count/sample_filtered_feature_bc_matrix.h5" + ).is_file() # check for vdj data - assert (outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv").is_file() + assert ( + outputpath / "per_sample_outs/run/vdj_t/filtered_contig_annotations.csv" + ).is_file() + -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/mapping/htseq_count/script.py b/src/mapping/htseq_count/script.py index 9aee7cc14b8..b1598d65268 100644 --- a/src/mapping/htseq_count/script.py +++ b/src/mapping/htseq_count/script.py @@ -8,25 +8,23 @@ ## VIASH START par = { - 'input': ['resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam'], - 'reference': 'resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz', - 'output': 'test_output' -} -meta = { - 'cpus': 2, - 'temp_dir': '/tmp', - 'config': 'src/mapping/htseq/config.vsh.yaml' + "input": ["resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"], + "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "test_output", } +meta = {"cpus": 2, "temp_dir": "/tmp", "config": "src/mapping/htseq/config.vsh.yaml"} ## VIASH END ######################## ### Helper functions ### ######################## + # helper function for cheching whether something is a gzip def is_gz_file(path: Path) -> bool: - with open(path, 'rb') as file: - return file.read(2) == b'\x1f\x8b' + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + # if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: @@ -34,19 +32,23 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove two extensions (if they exist) extaction_dir_name = Path(par_value.stem).stem unpacked_path = temp_dir_path / extaction_dir_name - print(f' Tar detected; extracting 
{par_value} to {unpacked_path}', flush=True) + print( + f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True + ) - with tarfile.open(par_value, 'r') as open_tar: + with tarfile.open(par_value, "r") as open_tar: members = open_tar.getmembers() - root_dirs = [member + root_dirs = [ + member for member in members - if member.isdir() and member.name != '.' and '/' not in member.name] + if member.isdir() and member.name != "." and "/" not in member.name + ] # if there is only one root_dir (and there are files in that directory) # strip that directory name from the destination folder if len(root_dirs) == 1: for mem in members: mem.path = Path(*Path(mem.path).parts[1:]) - members_to_move = [mem for mem in members if mem.path != Path('.')] + members_to_move = [mem for mem in members if mem.path != Path(".")] open_tar.extractall(unpacked_path, members=members_to_move) return unpacked_path @@ -54,22 +56,23 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove extension (if it exists) extaction_file_name = Path(par_value.stem) unpacked_path = temp_dir_path / extaction_file_name - print(f' Gzip detected; extracting {par_value} to {unpacked_path}', flush=True) + print( + f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True + ) - with gzip.open(par_value, 'rb') as f_in: - with open(unpacked_path, 'wb') as f_out: + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) return unpacked_path else: return par_value + def generate_args(par, config): # fetch arguments from config arguments = [ - arg - for group in config["argument_groups"] - for arg in group["arguments"] + arg for group in config["argument_groups"] for arg in group["arguments"] ] cmd_args = [] @@ -93,6 +96,7 @@ def generate_args(par, config): return cmd_args + ######################## ### Main code ### ######################## @@ -102,25 +106,21 @@ def generate_args(par, config): with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: - # checking for compressed files, ungzip files if need be temp_dir_path = Path(temp_dir) reference = Path(par["reference"]) - print(f'>> Check compression of --reference with value: {reference}', flush=True) + print(f">> Check compression of --reference with value: {reference}", flush=True) par["reference"] = extract_if_need_be(reference, temp_dir_path) print(">> Constructing command", flush=True) - cmd_args = [ "htseq-count" ] + generate_args(par, config) + cmd_args = ["htseq-count"] + generate_args(par, config) # manually process cpus parameter - if 'cpus' in meta and meta['cpus']: + if "cpus" in meta and meta["cpus"]: cmd_args.extend(["--nprocesses", str(meta["cpus"])]) print(">> Running htseq-count with command:", flush=True) - print("+ " + ' '.join([str(x) for x in cmd_args]), flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) - subprocess.run( - cmd_args, - check=True - ) \ No newline at end of file + subprocess.run(cmd_args, check=True) diff --git a/src/mapping/htseq_count/test.py b/src/mapping/htseq_count/test.py index 0be109272a6..73b2c801402 100644 --- a/src/mapping/htseq_count/test.py +++ b/src/mapping/htseq_count/test.py @@ -4,22 +4,27 @@ ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END print("> Running command with folder", flush=True) input = meta["resources_dir"] + "/cellranger_tiny_fastq/bam/possorted_genome_bam.bam" -reference = 
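[Editorial note] The is_gz_file helper reformatted above (and repeated in several scripts below) relies on the gzip magic number: every gzip stream begins with the two bytes 0x1f 0x8b, so reading the first two bytes of a file is enough to detect compression. A minimal, self-contained sketch of the same check; the temporary file and its payload are illustrative only:

    import gzip
    import tempfile

    def is_gz_file(path: str) -> bool:
        # A gzip stream always starts with the magic bytes 0x1f 0x8b.
        with open(path, "rb") as file:
            return file.read(2) == b"\x1f\x8b"

    # Write a tiny gzip file and confirm the check recognizes it.
    with tempfile.NamedTemporaryFile(suffix=".gz", delete=False) as tmp:
        tmp.write(gzip.compress(b"example payload"))
    assert is_gz_file(tmp.name)
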
meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +reference = ( + meta["resources_dir"] + + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +) output = "test_output.tsv" cmd_pars = [ meta["executable"], - "--input", input, - "--reference", reference, - "--output", output, - "---cpus", "2" + "--input", + input, + "--reference", + reference, + "--output", + output, + "---cpus", + "2", ] subprocess.run(cmd_pars, check=True) @@ -28,8 +33,8 @@ assert output_path.is_file() print("> Check contents", flush=True) -counts = pd.read_table(output_path, sep = "\t") +counts = pd.read_table(output_path, sep="\t") assert counts.shape[0] > 100 assert counts.shape[1] == 2 -print("> Completed Successfully!", flush=True) \ No newline at end of file +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/htseq_count_to_h5mu/script.py b/src/mapping/htseq_count_to_h5mu/script.py index ede8371bb2f..5f8d7cef733 100644 --- a/src/mapping/htseq_count_to_h5mu/script.py +++ b/src/mapping/htseq_count_to_h5mu/script.py @@ -15,21 +15,21 @@ "input_counts": ["resources_test/cellranger_tiny_fastq/htseq_counts.tsv"], "input_id": ["", "bar"], "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", - "output": "test_output.h5mu" -} -meta = { - "temp_dir": "/tmp" + "output": "test_output.h5mu", } +meta = {"temp_dir": "/tmp"} ## VIASH END ######################## ### Helper functions ### ######################## + # helper function for cheching whether something is a gzip def is_gz_file(path: Path) -> bool: - with open(path, 'rb') as file: - return file.read(2) == b'\x1f\x8b' + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + # if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: @@ -37,19 +37,21 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove two extensions (if they exist) extaction_dir_name = Path(par_value.stem).stem unpacked_path = temp_dir_path / extaction_dir_name - print(f' Tar detected; extracting {par_value} to {unpacked_path}', flush=True) + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) - with tarfile.open(par_value, 'r') as open_tar: + with tarfile.open(par_value, "r") as open_tar: members = open_tar.getmembers() - root_dirs = [member + root_dirs = [ + member for member in members - if member.isdir() and member.name != '.' and '/' not in member.name] + if member.isdir() and member.name != "." 
and "/" not in member.name + ] # if there is only one root_dir (and there are files in that directory) # strip that directory name from the destination folder if len(root_dirs) == 1: for mem in members: mem.path = Path(*Path(mem.path).parts[1:]) - members_to_move = [mem for mem in members if mem.path != Path('.')] + members_to_move = [mem for mem in members if mem.path != Path(".")] open_tar.extractall(unpacked_path, members=members_to_move) return unpacked_path @@ -57,10 +59,10 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove extension (if it exists) extaction_file_name = Path(par_value.stem) unpacked_path = temp_dir_path / extaction_file_name - print(f' Gzip detected; extracting {par_value} to {unpacked_path}', flush=True) + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) - with gzip.open(par_value, 'rb') as f_in: - with open(unpacked_path, 'wb') as f_out: + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) return unpacked_path @@ -72,7 +74,12 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: counts_data = [] for input_id, input_counts in zip(par["input_id"], par["input_counts"]): - data = pd.read_table(input_counts, index_col=0, names=["gene_ids", input_id], dtype={'gene_ids': 'U', input_id: 'i'}).transpose() + data = pd.read_table( + input_counts, + index_col=0, + names=["gene_ids", input_id], + dtype={"gene_ids": "U", input_id: "i"}, + ).transpose() counts_data.append(data) # combine all counts @@ -80,9 +87,9 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: print("> split qc", flush=True) idx = counts_and_qc.columns.str.startswith("_") -qc = counts_and_qc.loc[:,idx] +qc = counts_and_qc.loc[:, idx] qc.columns = qc.columns.str.replace("^__", "", regex=True) -counts = counts_and_qc.loc[:,~idx] +counts = counts_and_qc.loc[:, ~idx] print("> construct var", flush=True) with tempfile.TemporaryDirectory(prefix="htseq-", dir=meta["temp_dir"]) as temp_dir: @@ -90,7 +97,7 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: temp_dir_path = Path(temp_dir) reference = Path(par["reference"]) - print(f'>> Check compression of --reference with value: {reference}', flush=True) + print(f">> Check compression of --reference with value: {reference}", flush=True) par["reference"] = extract_if_need_be(reference, temp_dir_path) # read_gtf only works on str object, not pathlib.Path @@ -98,28 +105,23 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # This is a polars dataframe, not pandas -reference_genes = reference.filter((pl.col("feature") == "gene") & - (pl.col("gene_id").is_in(list(counts.columns))))\ - .sort("gene_id") +reference_genes = reference.filter( + (pl.col("feature") == "gene") & (pl.col("gene_id").is_in(list(counts.columns))) +).sort("gene_id") var = pd.DataFrame( - data={ - "gene_ids": pd.Index(reference_genes.get_column("gene_id")), - "feature_types": "Gene Expression", - "gene_symbol": reference_genes.get_column("gene_name").to_pandas(), - } + data={ + "gene_ids": pd.Index(reference_genes.get_column("gene_id")), + "feature_types": "Gene Expression", + "gene_symbol": reference_genes.get_column("gene_name").to_pandas(), + } ).set_index("gene_ids") print("> construct anndata", flush=True) -adata = ad.AnnData( - X=counts, - obsm={"qc_htseq": qc}, - var=var, - dtype=np.int32 -) +adata = ad.AnnData(X=counts, obsm={"qc_htseq": qc}, var=var, dtype=np.int32) print("> convert to 
mudata", flush=True) mdata = md.MuData(adata) print("> write to file", flush=True) -mdata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/mapping/htseq_count_to_h5mu/test.py b/src/mapping/htseq_count_to_h5mu/test.py index 5af61f4d911..65b9a62e004 100644 --- a/src/mapping/htseq_count_to_h5mu/test.py +++ b/src/mapping/htseq_count_to_h5mu/test.py @@ -3,24 +3,31 @@ import mudata as md ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END print("> Running command with folder", flush=True) input = meta["resources_dir"] + "/cellranger_tiny_fastq/htseq_counts.tsv" -reference = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +reference = ( + meta["resources_dir"] + + "/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz" +) output = "test_output.h5mu" cmd_pars = [ meta["executable"], - "--input_id", "foo;bar", - "--input_counts", f"{input};{input}", - "--reference", reference, - "--output", output, - "---cpus", "2", - "--output_compression", "gzip" + "--input_id", + "foo;bar", + "--input_counts", + f"{input};{input}", + "--reference", + reference, + "--output", + output, + "---cpus", + "2", + "--output_compression", + "gzip", ] subprocess.run(cmd_pars, check=True) @@ -37,4 +44,4 @@ assert mdata.n_obs == 2 assert mdata.mod["rna"].n_vars > 100 -print("> Completed Successfully!", flush=True) \ No newline at end of file +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/multi_star/script.py b/src/mapping/multi_star/script.py index c38228f095c..d6184fb618c 100644 --- a/src/mapping/multi_star/script.py +++ b/src/mapping/multi_star/script.py @@ -49,11 +49,12 @@ def fetch_arguments_info(config: Dict[str, Any]) -> Dict[str, Any]: } return arguments + def process_par( par: Dict[str, Any], arguments_info: Dict[str, Any], gz_args: List[str], - temp_dir: Path + temp_dir: Path, ) -> Dict[str, Any]: """ Process the Viash par dictionary @@ -95,6 +96,7 @@ def process_par( new_par[key] = value return new_par + def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False): """ Generate command-line arguments by fetching the relevant args @@ -112,7 +114,7 @@ def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False) for key, arg in arguments_info.items(): arg_val = par.get(key) - # The info key is always present (changed in viash 0.7.4) + # The info key is always present (changed in viash 0.7.4) # in the parsed config (None if not specified in source config) info = arg["info"] or {} orig_arg = info.get("orig_arg") @@ -136,11 +138,13 @@ def generate_cmd_arguments(par, arguments_info, step_filter=None, flatten=False) return cmd_args + def is_gz_file(path: Path) -> bool: """Check whether something is a gzip""" with open(path, "rb") as file: return file.read(2) == b"\x1f\x8b" + def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: """if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path""" if par_value.is_file() and tarfile.is_tarfile(par_value): @@ -179,28 +183,35 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: else: return par_value + def load_star_reference(reference_index: str) -> None: """Load star reference index into memory.""" subprocess.run( [ "STAR", - "--genomeLoad", "LoadAndExit", - "--genomeDir", str(reference_index), + 
"--genomeLoad", + "LoadAndExit", + "--genomeDir", + str(reference_index), ], - check=True + check=True, ) + def unload_star_reference(reference_index: str) -> None: """Remove star reference index from memory.""" subprocess.run( [ "STAR", - "--genomeLoad", "Remove", - "--genomeDir", str(reference_index), + "--genomeLoad", + "Remove", + "--genomeDir", + str(reference_index), ], - check=True + check=True, ) + def star_and_htseq( group_id: str, r1_files: List[Path], @@ -208,8 +219,8 @@ def star_and_htseq( temp_dir: Path, par: Dict[str, Any], arguments_info: Dict[str, Any], - num_threads: int -) -> Tuple[int, str] : + num_threads: int, +) -> Tuple[int, str]: star_output = par["output"] / "per" / group_id temp_dir_group = temp_dir / f"star_tmp_{group_id}" unsorted_bam = star_output / "Aligned.out.bam" @@ -227,18 +238,22 @@ def star_and_htseq( temp_dir=temp_dir / f"star_tmp_{group_id}", par=par, arguments_info=arguments_info, - num_threads=num_threads + num_threads=num_threads, ) if not unsorted_bam.exists(): return (1, f"Could not find unsorted bam at '{unsorted_bam}'") if par["run_htseq_count"]: - print(f">> Running samtools sort for group '{group_id}' with command:", flush=True) + print( + f">> Running samtools sort for group '{group_id}' with command:", flush=True + ) run_samtools_sort(unsorted_bam, sorted_bam) if not sorted_bam.exists(): return (1, f"Could not find sorted bam at '{unsorted_bam}'") - print(f">> Running htseq-count for group '{group_id}' with command:", flush=True) + print( + f">> Running htseq-count for group '{group_id}' with command:", flush=True + ) run_htseq_count(sorted_bam, counts_file, par, arguments_info) if not counts_file.exists(): return (1, f"Could not find counts at '{counts_file}'") @@ -247,9 +262,10 @@ def star_and_htseq( run_multiqc(star_output) if not multiqc_path.exists(): return (1, f"Could not find MultiQC output at '{multiqc_path}'") - + return (0, "") + def run_star( r1_files: List[Path], r2_files: List[Path], @@ -257,7 +273,7 @@ def run_star( temp_dir: Path, par: Dict[str, Any], arguments_info: Dict[str, Any], - num_threads: int + num_threads: int, ) -> None: """Run star""" # process manual arguments @@ -274,12 +290,9 @@ def run_star( # make sure there is a trailing / "--outFileNamePrefix": [f"{output_dir}/"], # fix the outSAMtype to return unsorted BAM files - "--outSAMtype": ["BAM", "Unsorted"] + "--outSAMtype": ["BAM", "Unsorted"], } - manual_cmd = [str(x) - for key, values in manual_par.items() - for x in [key] + values - ] + manual_cmd = [str(x) for key, values in manual_par.items() for x in [key] + values] # process all passthrough star arguments par_cmd = generate_cmd_arguments(par, arguments_info, "star", flatten=True) @@ -290,10 +303,8 @@ def run_star( # run star subprocess.run(cmd_args, check=True) -def run_samtools_sort( - unsorted_bam: Path, - sorted_bam: Path -) -> None: + +def run_samtools_sort(unsorted_bam: Path, sorted_bam: Path) -> None: "Run samtools sort" cmd_args = [ "samtools", @@ -304,18 +315,16 @@ def run_samtools_sort( ] subprocess.run(cmd_args, check=True) + def run_htseq_count( sorted_bam: Path, counts_file: Path, par: Dict[str, Any], - arguments_info: Dict[str, Any] + arguments_info: Dict[str, Any], ) -> None: """Run HTSeq count""" # process manual arguments - manual_cmd = [ - sorted_bam, - par["reference_gtf"] - ] + manual_cmd = [sorted_bam, par["reference_gtf"]] # process all passthrough htseq arguments par_cmd = generate_cmd_arguments(par, arguments_info, "htseq") @@ -327,6 +336,7 @@ def run_htseq_count( with 
open(counts_file, "w", encoding="utf-8") as file: subprocess.run(cmd_args, check=True, stdout=file) + def get_feature_info(reference_gtf) -> pd.DataFrame: ref = gtfparse.read_gtf(reference_gtf) ref_genes = ref.filter((pl.col("feature") == "gene") | (pl.col("source") == "ERCC")) @@ -334,12 +344,20 @@ def get_feature_info(reference_gtf) -> pd.DataFrame: { "feature_id": pd.Index(ref_genes.get_column("gene_id")), "feature_type": "Gene Expression", - "feature_name": ref_genes.get_column("gene_name").to_pandas() + "feature_name": ref_genes.get_column("gene_name").to_pandas(), } ) + def run_multiqc(input_dir: Path) -> None: - cmd_args = ["multiqc", str(input_dir), "--outdir", str(input_dir), "--no-report", "--force"] + cmd_args = [ + "multiqc", + str(input_dir), + "--outdir", + str(input_dir), + "--no-report", + "--force", + ] # run multiqc subprocess.run(cmd_args, check=True) @@ -349,13 +367,18 @@ def run_multiqc(input_dir: Path) -> None: ### Main code ### ######################## + def main(par, meta): """Main function""" # check input arguments - assert len(par["input_id"]) == len(par["input_r1"]), "--input_r1 should have same length as --input_id" + assert len(par["input_id"]) == len( + par["input_r1"] + ), "--input_r1 should have same length as --input_id" if par["input_r2"]: - assert len(par["input_id"]) == len(par["input_r2"]), "--input_r2 should have same length as --input_id" + assert len(par["input_id"]) == len( + par["input_r2"] + ), "--input_r2 should have same length as --input_id" # read config arguments with open(meta["config"], "r", encoding="utf-8") as file: @@ -366,9 +389,7 @@ def main(par, meta): # temp_dir = "tmp/" with tempfile.TemporaryDirectory( - prefix=f"{meta['name']}-", - dir=meta["temp_dir"], - ignore_cleanup_errors=True + prefix=f"{meta['name']}-", dir=meta["temp_dir"], ignore_cleanup_errors=True ) as temp_dir: temp_dir = Path(temp_dir) temp_dir.mkdir(parents=True, exist_ok=True) @@ -384,7 +405,9 @@ def main(par, meta): # group input_files by input_id print(">> Group by --input_id", flush=True) grouped_inputs = {} - for group_id, file_r1, file_r2 in zip(par["input_id"], par["input_r1"], par["input_r2"]): + for group_id, file_r1, file_r2 in zip( + par["input_id"], par["input_r1"], par["input_r2"] + ): if group_id not in grouped_inputs: grouped_inputs[group_id] = ([], []) grouped_inputs[group_id][0].append(file_r1) @@ -417,9 +440,9 @@ def main(par, meta): temp_dir=temp_dir, par=par, arguments_info=arguments_info, - num_threads=num_threads_per_task + num_threads=num_threads_per_task, ), - grouped_inputs.items() + grouped_inputs.items(), ) num_errored = 0 @@ -432,7 +455,10 @@ def main(par, meta): print("------------------") print(f"Success rate: {math.ceil(pct_succeeded * 100)}%") - assert pct_succeeded >= par["min_success_rate"], f"Success rate should be at least {math.ceil(par['min_success_rate'] * 100)}%" + assert ( + pct_succeeded >= par["min_success_rate"] + ), f"Success rate should be at least {math.ceil(par['min_success_rate'] * 100)}%" + if __name__ == "__main__": main(par, meta) diff --git a/src/mapping/multi_star/test.py b/src/mapping/multi_star/test.py index 814770a1ac9..80c99b22f75 100644 --- a/src/mapping/multi_star/test.py +++ b/src/mapping/multi_star/test.py @@ -2,40 +2,54 @@ from pathlib import Path ## VIASH START -meta = { - "name": "cellranger_count", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} ## VIASH END # find common input files resources_dir = Path(meta["resources_dir"]) input_dir = 
resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_fastq" -reference_index = resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_ref_v2_7_10_a/" -reference_gtf = resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_ref" / "genes" / "genes.gtf.gz" +reference_index = ( + resources_dir / "cellranger_tiny_fastq" / "cellranger_tiny_ref_v2_7_10_a/" +) +reference_gtf = ( + resources_dir + / "cellranger_tiny_fastq" + / "cellranger_tiny_ref" + / "genes" + / "genes.gtf.gz" +) + def test_two_samples(): input_id = ["mysample1", "mysample2"] input_r1 = [ input_dir / "tinygex_S1_L001_R1_001.fastq.gz", - input_dir / "tinygex_S1_L002_R1_001.fastq.gz" + input_dir / "tinygex_S1_L002_R1_001.fastq.gz", ] input_r2 = [ input_dir / "tinygex_S1_L001_R2_001.fastq.gz", - input_dir / "tinygex_S1_L002_R2_001.fastq.gz" + input_dir / "tinygex_S1_L002_R2_001.fastq.gz", ] output = Path("test_output") cmd_pars = [ meta["executable"], - "--input_id", ';'.join(input_id), - "--input_r1", ';'.join([str(r1) for r1 in input_r1]), - "--input_r2", ';'.join([str(r2) for r2 in input_r2]), - "--reference_index", reference_index, - "--reference_gtf", reference_gtf, - "--output", output, - "---cpus", "8", - "--outSAMattributes", "NH;HI;NM;MD" + "--input_id", + ";".join(input_id), + "--input_r1", + ";".join([str(r1) for r1 in input_r1]), + "--input_r2", + ";".join([str(r2) for r2 in input_r2]), + "--reference_index", + reference_index, + "--reference_gtf", + reference_gtf, + "--output", + output, + "---cpus", + "8", + "--outSAMattributes", + "NH;HI;NM;MD", ] subprocess.run([str(x) for x in cmd_pars], check=True) @@ -43,34 +57,42 @@ def test_two_samples(): "Log.final.out", "Aligned.out.bam", "Aligned.sorted.out.bam", - "htseq-count.txt" + "htseq-count.txt", ] for iid in input_id: for expected_file in expected_files: path = output / "per" / iid / expected_file assert path.exists(), f"Required file '{path}' is missing" + def test_one_sample(): input_id = ["mysample", "mysample"] input_r1 = [ input_dir / "tinygex_S1_L001_R1_001.fastq.gz", - input_dir / "tinygex_S1_L002_R1_001.fastq.gz" + input_dir / "tinygex_S1_L002_R1_001.fastq.gz", ] input_r2 = [ input_dir / "tinygex_S1_L001_R2_001.fastq.gz", - input_dir / "tinygex_S1_L002_R2_001.fastq.gz" + input_dir / "tinygex_S1_L002_R2_001.fastq.gz", ] output = Path("test_output") cmd_pars = [ meta["executable"], - "--input_id", ';'.join(input_id), - "--input_r1", ';'.join([str(r1) for r1 in input_r1]), - "--input_r2", ';'.join([str(r2) for r2 in input_r2]), - "--reference_index", reference_index, - "--reference_gtf", reference_gtf, - "--output", output, - "---cpus", "8" + "--input_id", + ";".join(input_id), + "--input_r1", + ";".join([str(r1) for r1 in input_r1]), + "--input_r2", + ";".join([str(r2) for r2 in input_r2]), + "--reference_index", + reference_index, + "--reference_gtf", + reference_gtf, + "--output", + output, + "---cpus", + "8", ] subprocess.run([str(x) for x in cmd_pars], check=True) @@ -79,13 +101,14 @@ def test_one_sample(): "Log.final.out", "Aligned.out.bam", "Aligned.sorted.out.bam", - "htseq-count.txt" + "htseq-count.txt", ] for iid in input_id: for expected_file in expected_files: path = output / "per" / iid / expected_file assert path.exists(), f"Required file '{path}' is missing" -if __name__ == '__main__': + +if __name__ == "__main__": test_two_samples() test_one_sample() diff --git a/src/mapping/multi_star_to_h5mu/script.py b/src/mapping/multi_star_to_h5mu/script.py index 1b6947934cc..976eb735c2d 100644 --- a/src/mapping/multi_star_to_h5mu/script.py +++ 
b/src/mapping/multi_star_to_h5mu/script.py @@ -6,13 +6,8 @@ import json ## VIASH START -par = { - "input": "output/A2_raw", - "output": "test_output.h5mu" -} -meta = { - "temp_dir": "/tmp" -} +par = {"input": "output/A2_raw", "output": "test_output.h5mu"} +meta = {"temp_dir": "/tmp"} ## VIASH END # convert to path @@ -31,21 +26,25 @@ input_counts, index_col=0, names=["cell_id", input_id], - dtype={"cell_id": "U", input_id: "i"} + dtype={"cell_id": "U", input_id: "i"}, ) data2 = data[~data.index.str.startswith("__")] with open(input_multiqc, "r") as file: qc = json.load(file) - + qc_star = qc.get("report_saved_raw_data", {}).get("multiqc_star", {}).get(input_id) - qc_htseq = qc.get("report_saved_raw_data", {}).get("multiqc_htseq", {}).get("htseq-count") + qc_htseq = ( + qc.get("report_saved_raw_data", {}).get("multiqc_htseq", {}).get("htseq-count") + ) - per_obs_data.append({ - "counts": data2.transpose(), - "qc_star": pd.DataFrame(qc_star, index=[input_id]), - "qc_htseq": pd.DataFrame(qc_htseq, index=[input_id]) - }) + per_obs_data.append( + { + "counts": data2.transpose(), + "qc_star": pd.DataFrame(qc_star, index=[input_id]), + "qc_htseq": pd.DataFrame(qc_htseq, index=[input_id]), + } + ) # combine all counts @@ -67,14 +66,11 @@ print("> construct anndata", flush=True) adata = ad.AnnData( - X=counts, - obsm={"qc_star": qc_star, "qc_htseq": qc_htseq}, - var=var, - dtype=np.int32 + X=counts, obsm={"qc_star": qc_star, "qc_htseq": qc_htseq}, var=var, dtype=np.int32 ) print("> convert to mudata", flush=True) mdata = md.MuData(adata) print("> write to file", flush=True) -mdata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/mapping/multi_star_to_h5mu/test.py b/src/mapping/multi_star_to_h5mu/test.py index d646e3dd411..26d4711cfaf 100644 --- a/src/mapping/multi_star_to_h5mu/test.py +++ b/src/mapping/multi_star_to_h5mu/test.py @@ -3,9 +3,7 @@ import mudata as md ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END print("> Running command with folder", flush=True) @@ -14,10 +12,14 @@ cmd_pars = [ meta["executable"], - "--input", str(input), - "--output", output, - "---cpus", "2", - "--output_compression", "gzip" + "--input", + str(input), + "--output", + output, + "---cpus", + "2", + "--output_compression", + "gzip", ] subprocess.run(cmd_pars, check=True) @@ -39,4 +41,4 @@ assert "qc_star" in adata_rna.obsm, "RNA modality should contain STAR QC" assert "qc_htseq" in adata_rna.obsm, "RNA modality should contain htseq QC" -print("> Completed Successfully!", flush=True) \ No newline at end of file +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/samtools_sort/script.py b/src/mapping/samtools_sort/script.py index 294991f3d03..89582605773 100644 --- a/src/mapping/samtools_sort/script.py +++ b/src/mapping/samtools_sort/script.py @@ -5,30 +5,25 @@ ## VIASH START par = { - 'input': ['resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam'], - 'output_bam': 'test_output.bam', - 'output_bai': 'test_output.bam.bai' -} -meta = { - 'cpus': 2, - 'temp_dir': '/tmp', - 'config': 'src/mapping/htseq/config.vsh.yaml' + "input": ["resources_test/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"], + "output_bam": "test_output.bam", + "output_bai": "test_output.bam.bai", } +meta = {"cpus": 2, "temp_dir": "/tmp", "config": "src/mapping/htseq/config.vsh.yaml"} ## VIASH END + def 
generate_args(par, config): # fetch arguments from config arguments = [ - arg - for group in config["argument_groups"] - for arg in group["arguments"] + arg for group in config["argument_groups"] for arg in group["arguments"] ] cmd_args = [] for arg in arguments: arg_val = par.get(arg["name"].removeprefix("--")) - # The info key is always present (changed in viash 0.7.4) + # The info key is always present (changed in viash 0.7.4) # in the parsed config (None if not specified in source config) info = arg["info"] or {} orig_arg = info.get("orig_arg") @@ -48,19 +43,21 @@ def generate_args(par, config): return cmd_args + # read config arguments config = yaml.safe_load(Path(meta["config"]).read_text()) print(">> Constructing command", flush=True) -cmd_args = [ "samtools", "sort" ] + generate_args(par, config) +cmd_args = ["samtools", "sort"] + generate_args(par, config) # manually process cpus parameter -if 'cpus' in meta and meta['cpus']: +if "cpus" in meta and meta["cpus"]: cmd_args.extend(["--threads", str(meta["cpus"])]) # add memory -if 'memory_mb' in meta and meta['memory_mb']: +if "memory_mb" in meta and meta["memory_mb"]: import math - mem_per_thread = math.ceil(meta['memory_mb'] * .8 / meta['cpus']) + + mem_per_thread = math.ceil(meta["memory_mb"] * 0.8 / meta["cpus"]) cmd_args.extend(["-m", f"{mem_per_thread}M"]) with tempfile.TemporaryDirectory(prefix="samtools-", dir=meta["temp_dir"]) as temp_dir: @@ -69,11 +66,11 @@ def generate_args(par, config): # run command print(">> Running samtools sort with command:", flush=True) - print("+ " + ' '.join([str(x) for x in cmd_args]), flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) subprocess.run(cmd_args, check=True) if par.get("output_bai"): print(">> Running samtools index with command:", flush=True) cmd_index_args = ["samtools", "index", "-b", par["output_bam"], par["output_bai"]] - print("+ " + ' '.join([str(x) for x in cmd_index_args]), flush=True) - subprocess.run(cmd_index_args, check=True) \ No newline at end of file + print("+ " + " ".join([str(x) for x in cmd_index_args]), flush=True) + subprocess.run(cmd_index_args, check=True) diff --git a/src/mapping/samtools_sort/test.py b/src/mapping/samtools_sort/test.py index 2dd10886cae..0483f54fd25 100644 --- a/src/mapping/samtools_sort/test.py +++ b/src/mapping/samtools_sort/test.py @@ -2,9 +2,7 @@ from pathlib import Path ## VIASH START -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ## VIASH END print("> sort and index", flush=True) @@ -13,11 +11,16 @@ cmd_pars = [ meta["executable"], - "--input", input, - "--output_bam", output, - "--output_bai", output + ".bai", - "---memory", "3gb", - "---cpus", "2" + "--input", + input, + "--output_bam", + output, + "--output_bai", + output + ".bai", + "---memory", + "3gb", + "---cpus", + "2", ] subprocess.run(cmd_pars, check=True) @@ -32,11 +35,15 @@ cmd_pars = [ meta["executable"], - "--input", input, - "--output_bam", output2, + "--input", + input, + "--output_bam", + output2, "--sort_by_read_names", - "---memory", "3gb", - "---cpus", "2" + "---memory", + "3gb", + "---cpus", + "2", ] subprocess.run(cmd_pars, check=True) @@ -44,4 +51,4 @@ bam_path2 = Path(output2) assert bam_path2.is_file() -print("> Completed Successfully!", flush=True) \ No newline at end of file +print("> Completed Successfully!", flush=True) diff --git a/src/mapping/star_align/script.py b/src/mapping/star_align/script.py index d04347c74e4..25328ecf36f 100644 --- a/src/mapping/star_align/script.py +++ 
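[Editorial note] The memory handling in the samtools_sort script above deserves a word: samtools sort's -m flag is a per-thread memory cap, so the script reserves roughly 80% of the total budget and divides it across threads. A small worked example under assumed values (3 GB taken to surface as meta["memory_mb"] = 3072 and 2 CPUs; both numbers are hypothetical):

    import math

    # Hypothetical viash metadata: 3 GB of memory (assumed ~3072 MB) and 2 CPUs.
    meta = {"memory_mb": 3072, "cpus": 2}

    # Reserve ~80% of the budget and split it across the sort threads.
    mem_per_thread = math.ceil(meta["memory_mb"] * 0.8 / meta["cpus"])
    assert mem_per_thread == 1229  # would be passed to samtools sort as "-m 1229M"
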
diff --git a/src/mapping/samtools_sort/test.py b/src/mapping/samtools_sort/test.py
index 2dd10886cae..0483f54fd25 100644
--- a/src/mapping/samtools_sort/test.py
+++ b/src/mapping/samtools_sort/test.py
@@ -2,9 +2,7 @@
 from pathlib import Path

 ## VIASH START
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ## VIASH END

 print("> sort and index", flush=True)
@@ -13,11 +11,16 @@

 cmd_pars = [
     meta["executable"],
-    "--input", input,
-    "--output_bam", output,
-    "--output_bai", output + ".bai",
-    "---memory", "3gb",
-    "---cpus", "2"
+    "--input",
+    input,
+    "--output_bam",
+    output,
+    "--output_bai",
+    output + ".bai",
+    "---memory",
+    "3gb",
+    "---cpus",
+    "2",
 ]
 subprocess.run(cmd_pars, check=True)

@@ -32,11 +35,15 @@

 cmd_pars = [
     meta["executable"],
-    "--input", input,
-    "--output_bam", output2,
+    "--input",
+    input,
+    "--output_bam",
+    output2,
     "--sort_by_read_names",
-    "---memory", "3gb",
-    "---cpus", "2"
+    "---memory",
+    "3gb",
+    "---cpus",
+    "2",
 ]
 subprocess.run(cmd_pars, check=True)

@@ -44,4 +51,4 @@
 bam_path2 = Path(output2)
 assert bam_path2.is_file()

-print("> Completed Successfully!", flush=True)
\ No newline at end of file
+print("> Completed Successfully!", flush=True)
diff --git a/src/mapping/star_align/script.py b/src/mapping/star_align/script.py
index d04347c74e4..25328ecf36f 100644
--- a/src/mapping/star_align/script.py
+++ b/src/mapping/star_align/script.py
@@ -8,20 +8,17 @@

 ## VIASH START
 par = {
-    'input': [
-        'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz',
-        'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz',
+    "input": [
+        "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R1_001.fastq.gz",
+        "resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L001_R2_001.fastq.gz",
         # 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R1_001.fastq.gz',
         # 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/tinygex_S1_L002_R2_001.fastq.gz'
     ],
     # 'input': [ 'resources_test/cellranger_tiny_fastq/cellranger_tiny_fastq/' ],
-    'reference': 'resources_test/cellranger_tiny_fastq/cellranger_tiny_ref_v2_7_10_a/',
-    'output': 'test_output'
-}
-meta = {
-    'cpus': 8,
-    'temp_dir': '/tmp'
+    "reference": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref_v2_7_10_a/",
+    "output": "test_output",
 }
+meta = {"cpus": 8, "temp_dir": "/tmp"}
 ## VIASH END

 ########################
@@ -32,42 +29,51 @@
 # examples:
 #   - TSP10_Fat_MAT_SS2_B134171_B115063_Immune_A1_L003_R1.fastq.gz
 #   - tinygex_S1_L001_I1_001.fastq.gz
-fastqgz_regex = r'(.+)_(R\d+)(_\d+)?\.fastq(\.gz)?'
+fastqgz_regex = r"(.+)_(R\d+)(_\d+)?\.fastq(\.gz)?"
+

 # helper function for cheching whether something is a gzip
 def is_gz_file(path: Path) -> bool:
-    with open(path, 'rb') as file:
-        return file.read(2) == b'\x1f\x8b'
+    with open(path, "rb") as file:
+        return file.read(2) == b"\x1f\x8b"
+

 # look for fastq files in a directory
 def search_fastqs(path: Path) -> list[Path]:
     if path.is_dir():
-        print(f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.", flush=True)
-        value_paths = [file for file in path.iterdir() if re.match(fastqgz_regex, file.name) ]
+        print(
+            f"Input '{path}' is a directory, traversing to see if we can detect any FASTQ files.",
+            flush=True,
+        )
+        value_paths = [
+            file for file in path.iterdir() if re.match(fastqgz_regex, file.name)
+        ]
         return value_paths
     else:
         return [path]

+
 # if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path
 def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path:
-
     if par_value.is_file() and tarfile.is_tarfile(par_value):
         # Remove two extensions (if they exist)
         extaction_dir_name = Path(par_value.stem).stem
         unpacked_path = temp_dir_path / extaction_dir_name
-        print(f'  Tar detected; extracting {par_value} to {unpacked_path}', flush=True)
+        print(f"  Tar detected; extracting {par_value} to {unpacked_path}", flush=True)

-        with tarfile.open(par_value, 'r') as open_tar:
+        with tarfile.open(par_value, "r") as open_tar:
             members = open_tar.getmembers()
-            root_dirs = [member
-                for member in members
-                if member.isdir() and member.name != '.' and '/' not in member.name]
+            root_dirs = [
+                member
+                for member in members
+                if member.isdir() and member.name != "." and "/" not in member.name
+            ]
             # if there is only one root_dir (and there are files in that directory)
             # strip that directory name from the destination folder
             if len(root_dirs) == 1:
                 for mem in members:
                     mem.path = Path(*Path(mem.path).parts[1:])
-            members_to_move = [mem for mem in members if mem.path != Path('.')]
+            members_to_move = [mem for mem in members if mem.path != Path(".")]
             open_tar.extractall(unpacked_path, members=members_to_move)
         return unpacked_path
@@ -75,16 +81,17 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path:
         # Remove extension (if it exists)
         extaction_file_name = Path(par_value.stem)
         unpacked_path = temp_dir_path / extaction_file_name
-        print(f'  Gzip detected; extracting {par_value} to {unpacked_path}', flush=True)
+        print(f"  Gzip detected; extracting {par_value} to {unpacked_path}", flush=True)

-        with gzip.open(par_value, 'rb') as f_in:
-            with open(unpacked_path, 'wb') as f_out:
+        with gzip.open(par_value, "rb") as f_in:
+            with open(unpacked_path, "wb") as f_out:
                 shutil.copyfileobj(f_in, f_out)
         return unpacked_path
     else:
         return par_value

+
 ########################
 ### Main code ###
 ########################
@@ -92,7 +99,12 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path:
 # rename keys and convert path strings to Path
 # note: only list file arguments here. if non-file arguments also need to be renamed,
 # the `processPar()` generator needs to be adapted
-to_rename = {'input': 'readFilesIn', 'reference': 'genomeDir', 'output': 'outFileNamePrefix'}
+to_rename = {
+    "input": "readFilesIn",
+    "reference": "genomeDir",
+    "output": "outFileNamePrefix",
+}
+

 def process_par(orig_par, to_rename):
     for key, value in orig_par.items():
@@ -109,12 +121,16 @@ def process_par(orig_par, to_rename):
             new_key = key
             new_value = value
         yield new_key, new_value
+
+
 par = dict(process_par(par, to_rename))

 # create output dir if need be
 par["outFileNamePrefix"].mkdir(parents=True, exist_ok=True)

-with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True) as temp_dir:
+with tempfile.TemporaryDirectory(
+    prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True
+) as temp_dir:
     print(">> Check whether input files are directories", flush=True)
     new_read_files_in = []
     for path in par["readFilesIn"]:
@@ -130,12 +146,15 @@ def process_par(orig_par, to_rename):
         # turn value into list
         is_multiple = isinstance(par_values, list)
         if not is_multiple:
-            par_values = [ par_values ]
+            par_values = [par_values]

         # output list
         new_values = []
         for par_value in par_values:
-            print(f'>> Check compression of --{par_name} with value: {par_value}', flush=True)
+            print(
+                f">> Check compression of --{par_name} with value: {par_value}",
+                flush=True,
+            )
             new_value = extract_if_need_be(par_value, temp_dir_path)
             new_values.append(new_value)

@@ -150,23 +169,23 @@ def process_par(orig_par, to_rename):

     print("Grouping R1/R2 input files into pairs", flush=True)
     input_grouped = {}
-    for path in par['readFilesIn']:
+    for path in par["readFilesIn"]:
         key = re.search(fastqgz_regex, path.name).group(2)
         if key not in input_grouped:
             input_grouped[key] = []
         input_grouped[key].append(str(path))
-    par['readFilesIn'] = [ ','.join(val) for val in input_grouped.values() ]
+    par["readFilesIn"] = [",".join(val) for val in input_grouped.values()]

     print("", flush=True)

     print(">> Constructing command", flush=True)
     par["runMode"] = "alignReads"
     par["outTmpDir"] = temp_dir_path / "run"
-    if 'cpus' in meta and meta['cpus']:
+    if "cpus" in meta and meta["cpus"]:
         par["runThreadN"] = meta["cpus"]

     # make sure there is a trailing /
     par["outFileNamePrefix"] = f"{par['outFileNamePrefix']}/"

-    cmd_args = [ "STAR" ]
+    cmd_args = ["STAR"]
     for name, value in par.items():
         if value is not None:
             if isinstance(value, list):
@@ -176,10 +195,7 @@ def process_par(orig_par, to_rename):

     print("", flush=True)
     print(">> Running STAR with command:", flush=True)
-    print("+ " + ' '.join([str(x) for x in cmd_args]), flush=True)
+    print("+ " + " ".join([str(x) for x in cmd_args]), flush=True)
     print("", flush=True)

-    subprocess.run(
-        cmd_args,
-        check=True
-    )
\ No newline at end of file
+    subprocess.run(cmd_args, check=True)
-logger.info("> Completed Successfully!") \ No newline at end of file +logger.info("> Completed Successfully!") diff --git a/src/mapping/star_align_v273a/test.py b/src/mapping/star_align_v273a/test.py index 5a140c7c305..cdd128391d1 100644 --- a/src/mapping/star_align_v273a/test.py +++ b/src/mapping/star_align_v273a/test.py @@ -6,22 +6,20 @@ import shutil ## VIASH START -meta = { - "name": "cellranger_count", - "resources_dir": "resources_test" -} +meta = {"name": "cellranger_count", "resources_dir": "resources_test"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() ## Test 1: use input dir logger.info("> Running command with folder") input = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_fastq/" input_files = [ - input + "tinygex_S1_L002_R1_001.fastq.gz", - input + "tinygex_S1_L002_R2_001.fastq.gz" + input + "tinygex_S1_L002_R1_001.fastq.gz", + input + "tinygex_S1_L002_R2_001.fastq.gz", ] reference = meta["resources_dir"] + "/cellranger_tiny_fastq/cellranger_tiny_ref/star" output = "test_output" @@ -31,18 +29,22 @@ shutil.copyfile(file, Path(tempdir) / Path(file).name) cmd_pars = [ meta["executable"], - "--input", tempdir, - "--reference", reference, - "--output", output, - "---cpus", "2" + "--input", + tempdir, + "--reference", + reference, + "--output", + output, + "---cpus", + "2", ] subprocess.run(cmd_pars, check=True) logger.info("> Check if file exists") output_path = Path(output) -assert (output_path / "Log.final.out" ).is_file(), "No output log was created." -assert (output_path / "SJ.out.tab" ).is_file(), "No output file was created." +assert (output_path / "Log.final.out").is_file(), "No output log was created." +assert (output_path / "SJ.out.tab").is_file(), "No output file was created." ## Test 2: use input files @@ -51,11 +53,16 @@ cmd_pars = [ meta["executable"], - "--input", input_files[0], - "--input", input_files[1], - "--reference", reference, - "--output", output, - "---cpus", "8", + "--input", + input_files[0], + "--input", + input_files[1], + "--reference", + reference, + "--output", + output, + "---cpus", + "8", ] out = subprocess.check_output(cmd_pars).decode("utf-8") @@ -64,4 +71,4 @@ assert path.exists(output + "/SJ.out.tab"), "No output was created." -logger.info("> Completed Successfully!") \ No newline at end of file +logger.info("> Completed Successfully!") diff --git a/src/metadata/add_id/script.py b/src/metadata/add_id/script.py index f80b86482ae..0a130c46f89 100644 --- a/src/metadata/add_id/script.py +++ b/src/metadata/add_id/script.py @@ -7,14 +7,16 @@ "input": "resources_test/concat_test_data/e18_mouse_brain_fresh_5k_filtered_feature_bc_matrix_subset.h5mu", "output": "foo.h5mu", "input_id": "mouse", - "make_observation_keys_unique": True + "make_observation_keys_unique": True, } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def make_observation_keys_unique(sample_id: str, sample: MuData) -> None: """ Make the observation keys unique across all samples. At input, @@ -22,9 +24,11 @@ def make_observation_keys_unique(sample_id: str, sample: MuData) -> None: (unique for a sample) to each observation key, the observation key is made unique across all samples as well. 
""" - logger.info("Making observation keys unique across all " - "samples by appending prefix '%s' to the observation names.", - sample_id) + logger.info( + "Making observation keys unique across all " + "samples by appending prefix '%s' to the observation names.", + sample_id, + ) sample.obs.index = f"{sample_id}_" + sample.obs.index make_observation_keys_unique_per_mod(sample_id, sample) logger.info("Done making observation keys unique.") @@ -39,24 +43,37 @@ def make_observation_keys_unique_per_mod(sample_id: str, sample: MuData) -> None logger.info("Processing modality '%s'", mod_name) mod.obs_names = f"{sample_id}_" + mod.obs_names + def main(): logger.info("Reading input file '%s'.", par["input"]) input_data = read_h5mu(par["input"]) - logger.info("Adding column '%s' to global .obs dataframe, populated with ID '%s'", - par["obs_output"], par["input_id"]) + logger.info( + "Adding column '%s' to global .obs dataframe, populated with ID '%s'", + par["obs_output"], + par["input_id"], + ) input_data.obs[par["obs_output"]] = par["input_id"] logger.info("Done adding column to global .obs") for mod_name, mod_data in input_data.mod.items(): - logger.info("Adding column '%s' to .obs dataframe for modality '%s', " - "populated with ID '%s'", par["obs_output"], mod_name, par["input_id"]) + logger.info( + "Adding column '%s' to .obs dataframe for modality '%s', " + "populated with ID '%s'", + par["obs_output"], + mod_name, + par["input_id"], + ) mod_data.obs[par["obs_output"]] = par["input_id"] logger.info("Done adding per-modality columns.") if par["make_observation_keys_unique"]: make_observation_keys_unique(par["input_id"], input_data) - logger.info("Writing out data to '%s' with compression '%s'.", - par["output"], par["output_compression"]) + logger.info( + "Writing out data to '%s' with compression '%s'.", + par["output"], + par["output_compression"], + ) input_data.write_h5mu(par["output"], compression=par["output_compression"]) logger.info("Finished") -if __name__ == '__main__': - main() \ No newline at end of file + +if __name__ == "__main__": + main() diff --git a/src/metadata/add_id/test.py b/src/metadata/add_id/test.py index c2044e8cb32..1f02b52683f 100644 --- a/src/metadata/add_id/test.py +++ b/src/metadata/add_id/test.py @@ -7,36 +7,43 @@ ## VIASH START meta = { - 'executable': './target/executable/metadata/add_id/add_id', - 'resources_dir': './resources_test/concat_test_data/', - 'cpus': 2, - 'config': './src/metadata/add_id/config.vsh.yaml' + "executable": "./target/executable/metadata/add_id/add_id", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/metadata/add_id/config.vsh.yaml", } ## VIASH END + @pytest.fixture def generate_h5mu(): # generate data - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) - var = pd.DataFrame([["a"], ["b"], ["c"]], - index=df.columns, columns=["Feat"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = AnnData(df, obs=obs, var=var) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = AnnData(df, obs=obs2, var=var2) - tmp_mudata = MuData({'mod1': ad1, 'mod2': ad2}) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) return tmp_mudata 
@pytest.mark.parametrize("output_compression", ["gzip", "lzf", None]) -def test_add_id(run_component, small_mudata, small_mudata_path, random_h5mu_path, output_compression): +def test_add_id( + run_component, small_mudata, small_mudata_path, random_h5mu_path, output_compression +): output_path = random_h5mu_path() args = [ - "--input", str(small_mudata_path), - "--output", str(output_path), - "--input_id", "test_id", + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", ] small_mudata.obs["sample_id"] = ["test_id", "test_id"] @@ -57,17 +64,25 @@ def test_add_id(run_component, small_mudata, small_mudata_path, random_h5mu_path assert_annotation_objects_equal(output_data, small_mudata) -def test_add_id_obs_output(run_component, small_mudata, small_mudata_path, random_h5mu_path): +def test_add_id_obs_output( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): output_path = random_h5mu_path() # run component - run_component([ - "--input", str(small_mudata_path), - "--output", str(output_path), - "--input_id", "test_id", - "--obs_output", "test_key" - ]) - + run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--obs_output", + "test_key", + ] + ) + small_mudata.obs["test_key"] = ["test_id", "test_id"] small_mudata.mod["mod1"].obs["test_key"] = ["test_id", "test_id"] small_mudata.mod["mod2"].obs["test_key"] = ["test_id", "test_id"] @@ -76,22 +91,28 @@ def test_add_id_obs_output(run_component, small_mudata, small_mudata_path, rando assert output_path.is_file() output_data = read_h5mu(output_path) - - assert_annotation_objects_equal(output_data, small_mudata) + assert_annotation_objects_equal(output_data, small_mudata) -def test_add_id_observations_unique(run_component, small_mudata, small_mudata_path, random_h5mu_path): +def test_add_id_observations_unique( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): output_path = random_h5mu_path() # run component - run_component([ - "--input", str(small_mudata_path), - "--output", str(output_path), - "--input_id", "test_id", - "--make_observation_keys_unique" - ]) - + run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--make_observation_keys_unique", + ] + ) + small_mudata.obs["sample_id"] = ["test_id", "test_id"] small_mudata.mod["mod1"].obs["sample_id"] = ["test_id", "test_id"] small_mudata.mod["mod2"].obs["sample_id"] = ["test_id", "test_id"] @@ -100,41 +121,46 @@ def test_add_id_observations_unique(run_component, small_mudata, small_mudata_pa small_mudata.mod["mod1"].obs.index = pd.Index(["test_id_obs1", "test_id_obs2"]) small_mudata.mod["mod2"].obs.index = pd.Index(["test_id_obs1", "test_id_obs2"]) small_mudata.update() - + assert output_path.is_file() output_data = read_h5mu(output_path) - + assert_annotation_objects_equal(output_data, small_mudata) - -def test_add_id_overwrites_output_column(run_component, small_mudata, small_mudata_path, random_h5mu_path): - + +def test_add_id_overwrites_output_column( + run_component, small_mudata, small_mudata_path, random_h5mu_path +): small_mudata.obs["already_exists"] = "alread_exists" for _, modality in small_mudata.mod.items(): modality.obs["already_exists"] = "alread_exists" output_path = random_h5mu_path() # run component - run_component([ - "--input", str(small_mudata_path), - "--output", str(output_path), - "--input_id", "test_id", - "--obs_output", "already_exists" - ]) - + 
run_component( + [ + "--input", + str(small_mudata_path), + "--output", + str(output_path), + "--input_id", + "test_id", + "--obs_output", + "already_exists", + ] + ) + small_mudata.obs["already_exists"] = ["test_id", "test_id"] small_mudata.mod["mod1"].obs["already_exists"] = ["test_id", "test_id"] small_mudata.mod["mod2"].obs["already_exists"] = ["test_id", "test_id"] small_mudata.update() small_mudata.strings_to_categoricals() - + assert output_path.is_file() output_data = read_h5mu(output_path) - + assert_annotation_objects_equal(output_data, small_mudata) if __name__ == "__main__": sys.exit(pytest.main([__file__])) - - diff --git a/src/metadata/duplicate_obs/script.py b/src/metadata/duplicate_obs/script.py index c79ac63b4df..57633f860e3 100644 --- a/src/metadata/duplicate_obs/script.py +++ b/src/metadata/duplicate_obs/script.py @@ -9,20 +9,19 @@ "input_obs_key": None, "output_obs_key": "index_copy", "output": "output.h5mu", - "output_compression": "gzip" -} -meta = { - "resources_dir": "src/metadata/copy_obs" + "output_compression": "gzip", } +meta = {"resources_dir": "src/metadata/copy_obs"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Read mudata from file") -mdata = read_h5mu(par['input']) -adata = mdata.mod[par['modality']] +mdata = read_h5mu(par["input"]) +adata = mdata.mod[par["modality"]] def duplicate_obs(adata, input_key, output_key): @@ -33,17 +32,21 @@ def duplicate_obs(adata, input_key, output_key): logger.info(f"Copying .obs index to {output_key}") adata.obs[output_key] = adata.obs.index.copy() - -if not par["output_obs_key"] in adata.obs: + +if par["output_obs_key"] not in adata.obs: duplicate_obs(adata, par["input_obs_key"], par["output_obs_key"]) else: if not par["overwrite_existing_key"]: - raise ValueError(f"--output_obs_key already exists: `{par['output_obs_key']}`. Data can not be duplicated.") + raise ValueError( + f"--output_obs_key already exists: `{par['output_obs_key']}`. Data can not be duplicated." + ) - logger.warning(f"--output_obs_key already exists: `{par['output_obs_key']}`. Data in par['output_obs_key'] will be overwritten.") + logger.warning( + f"--output_obs_key already exists: `{par['output_obs_key']}`. Data in par['output_obs_key'] will be overwritten." 
+ ) duplicate_obs(adata, par["input_obs_key"], par["output_obs_key"]) logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/metadata/duplicate_obs/test.py b/src/metadata/duplicate_obs/test.py index 7528d1fc731..3f6f6de3090 100644 --- a/src/metadata/duplicate_obs/test.py +++ b/src/metadata/duplicate_obs/test.py @@ -11,14 +11,16 @@ @pytest.fixture def input_h5mu(): # generate data - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = ad.AnnData(df, obs=obs, var=var) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = ad.AnnData(df, obs=obs2, var=var2) - tmp_mudata = mu.MuData({'mod1': ad1, 'mod2': ad2}) + tmp_mudata = mu.MuData({"mod1": ad1, "mod2": ad2}) return tmp_mudata @@ -31,11 +33,16 @@ def test_copy_obs(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--input_obs_key", "Obs", - "--output_obs_key", "Obs_copy" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--input_obs_key", + "Obs", + "--output_obs_key", + "Obs_copy", ] run_component(args) @@ -45,19 +52,29 @@ def test_copy_obs(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu = mu.read_h5mu(output_h5mu_path) assert "Obs_copy" in output_h5mu.mod["mod1"].obs, "obs key was not copied in mod1" - assert "Obs_copy" not in output_h5mu.mod["mod2"].obs, "obs key should not have been copied in mod2" - assert "Obs copy" not in input_h5mu.mod["mod1"].obs, "obs key should not have been copied in input file" - assert np.all(output_h5mu.mod["mod1"].obs["Obs"] == output_h5mu.mod["mod1"].obs["Obs_copy"]), "copied obs column should be identical to original obs column" + assert ( + "Obs_copy" not in output_h5mu.mod["mod2"].obs + ), "obs key should not have been copied in mod2" + assert ( + "Obs copy" not in input_h5mu.mod["mod1"].obs + ), "obs key should not have been copied in input file" + assert np.all( + output_h5mu.mod["mod1"].obs["Obs"] == output_h5mu.mod["mod1"].obs["Obs_copy"] + ), "copied obs column should be identical to original obs column" def test_copy_index(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--output_obs_key", "Obs_copy" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--output_obs_key", + "Obs_copy", ] run_component(args) @@ -67,36 +84,52 @@ def test_copy_index(run_component, random_h5mu_path, input_h5mu, input_h5mu_path output_h5mu = mu.read_h5mu(output_h5mu_path) assert "Obs_copy" in output_h5mu.mod["mod1"].obs, "obs key was not copied in mod1" - assert "Obs_copy" not in output_h5mu.mod["mod2"].obs, "obs key should not have been copied in mod2" - assert "Obs copy" not in input_h5mu.mod["mod1"].obs, "obs key should not have been copied in input file" - assert 
np.all(output_h5mu.mod["mod1"].obs.index == output_h5mu.mod["mod1"].obs["Obs_copy"]), "copied obs column should be identical to original obs index" + assert ( + "Obs_copy" not in output_h5mu.mod["mod2"].obs + ), "obs key should not have been copied in mod2" + assert ( + "Obs copy" not in input_h5mu.mod["mod1"].obs + ), "obs key should not have been copied in input file" + assert np.all( + output_h5mu.mod["mod1"].obs.index == output_h5mu.mod["mod1"].obs["Obs_copy"] + ), "copied obs column should be identical to original obs index" def test_overwrite_keys(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--input_obs_key", "Obs", - "--output_obs_key", "Obs" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--input_obs_key", + "Obs", + "--output_obs_key", + "Obs", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( - r'ValueError: --output_obs_key already exists: \`Obs\`. Data can not be duplicated.', - err.value.stdout.decode('utf-8')) + r"ValueError: --output_obs_key already exists: \`Obs\`. Data can not be duplicated.", + err.value.stdout.decode("utf-8"), + ) disable_raise_args = [ - - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", "--overwrite_existing_key", - "--input_obs_key", "Obs", - "--output_obs_key", "Obs" + "--input_obs_key", + "Obs", + "--output_obs_key", + "Obs", ] run_component(disable_raise_args) diff --git a/src/metadata/duplicate_var/script.py b/src/metadata/duplicate_var/script.py index 1be01e2881a..d1389a80160 100644 --- a/src/metadata/duplicate_var/script.py +++ b/src/metadata/duplicate_var/script.py @@ -9,20 +9,19 @@ "output_var_key": "index_copy", "output": "output.h5mu", "output_compression": "gzip", - "overwrite_existing_key": False -} -meta = { - "resources_dir": "src/metadata/copy_var" + "overwrite_existing_key": False, } +meta = {"resources_dir": "src/metadata/copy_var"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Read mudata from file") -mdata = read_h5mu(par['input']) -adata = mdata.mod[par['modality']] +mdata = read_h5mu(par["input"]) +adata = mdata.mod[par["modality"]] def duplicate_var(adata, input_key, output_key): @@ -34,16 +33,20 @@ def duplicate_var(adata, input_key, output_key): adata.var[output_key] = adata.var.index.copy() -if not par["output_var_key"] in adata.var: +if par["output_var_key"] not in adata.var: duplicate_var(adata, par["input_var_key"], par["output_var_key"]) else: if not par["overwrite_existing_key"]: - raise ValueError(f"--output_var_key already exists: `{par['output_var_key']}`. Data can not be duplicated.") + raise ValueError( + f"--output_var_key already exists: `{par['output_var_key']}`. Data can not be duplicated." + ) - logger.warning(f"--output_var_key already exists: `{par['output_var_key']}`. Data in `{par['output_var_key']}` .var column will be overwritten.") + logger.warning( + f"--output_var_key already exists: `{par['output_var_key']}`. Data in `{par['output_var_key']}` .var column will be overwritten." 
+ ) duplicate_var(adata, par["input_var_key"], par["output_var_key"]) logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/metadata/duplicate_var/test.py b/src/metadata/duplicate_var/test.py index cc149f787da..bb452e5ccbb 100644 --- a/src/metadata/duplicate_var/test.py +++ b/src/metadata/duplicate_var/test.py @@ -11,14 +11,16 @@ @pytest.fixture def input_h5mu(): # generate data - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = ad.AnnData(df, obs=obs, var=var) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = ad.AnnData(df, obs=obs2, var=var2) - tmp_mudata = mu.MuData({'mod1': ad1, 'mod2': ad2}) + tmp_mudata = mu.MuData({"mod1": ad1, "mod2": ad2}) return tmp_mudata @@ -31,11 +33,16 @@ def test_copy_var(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--input_var_key", "Feat", - "--output_var_key", "Feat_copy" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--input_var_key", + "Feat", + "--output_var_key", + "Feat_copy", ] run_component(args) @@ -45,18 +52,29 @@ def test_copy_var(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu = mu.read_h5mu(output_h5mu_path) assert "Feat_copy" in output_h5mu.mod["mod1"].var, "var key was not copied in mod1" - assert "Feat_copy" not in output_h5mu.mod["mod2"].var, "var key should not have been copied in mod2" - assert "Feat_copy" not in input_h5mu.mod["mod1"].var, "var key should not have been copied in input file" - assert np.all(output_h5mu.mod["mod1"].var["Feat"] == output_h5mu.mod["mod1"].var["Feat_copy"]), "copied var column should be identical to original var column" + assert ( + "Feat_copy" not in output_h5mu.mod["mod2"].var + ), "var key should not have been copied in mod2" + assert ( + "Feat_copy" not in input_h5mu.mod["mod1"].var + ), "var key should not have been copied in input file" + assert np.all( + output_h5mu.mod["mod1"].var["Feat"] == output_h5mu.mod["mod1"].var["Feat_copy"] + ), "copied var column should be identical to original var column" + def test_copy_index(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--output_var_key", "Index_copy" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--output_var_key", + "Index_copy", ] run_component(args) @@ -65,36 +83,55 @@ def test_copy_index(run_component, random_h5mu_path, input_h5mu, input_h5mu_path output_h5mu = mu.read_h5mu(output_h5mu_path) - assert "Index_copy" in output_h5mu.mod["mod1"].var, "var index was not copied in mod1" - assert "Index_copy" not in output_h5mu.mod["mod2"].var, "var index should not have been copied in mod2" - assert "Index_copy" not in input_h5mu.mod["mod1"].var, "var index should not have been copied in 
input file" - assert np.all(output_h5mu.mod["mod1"].var.index == output_h5mu.mod["mod1"].var["Index_copy"]), "copied var index should be identical to original var index" + assert ( + "Index_copy" in output_h5mu.mod["mod1"].var + ), "var index was not copied in mod1" + assert ( + "Index_copy" not in output_h5mu.mod["mod2"].var + ), "var index should not have been copied in mod2" + assert ( + "Index_copy" not in input_h5mu.mod["mod1"].var + ), "var index should not have been copied in input file" + assert np.all( + output_h5mu.mod["mod1"].var.index == output_h5mu.mod["mod1"].var["Index_copy"] + ), "copied var index should be identical to original var index" + def test_overwrite_keys(run_component, random_h5mu_path, input_h5mu, input_h5mu_path): output_h5mu_path = random_h5mu_path() args = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", - "--input_var_key", "Feat", - "--output_var_key", "Feat" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", + "--input_var_key", + "Feat", + "--output_var_key", + "Feat", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( - r'ValueError: --output_var_key already exists: \`Feat\`. Data can not be duplicated.', - err.value.stdout.decode('utf-8')) + r"ValueError: --output_var_key already exists: \`Feat\`. Data can not be duplicated.", + err.value.stdout.decode("utf-8"), + ) disable_raise_args = [ - - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--modality", "mod1", + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--modality", + "mod1", "--overwrite_existing_key", - "--input_var_key", "Feat", - "--output_var_key", "Feat" + "--input_var_key", + "Feat", + "--output_var_key", + "Feat", ] run_component(disable_raise_args) diff --git a/src/metadata/grep_annotation_column/script.py b/src/metadata/grep_annotation_column/script.py index 20573577447..f5c29db0357 100644 --- a/src/metadata/grep_annotation_column/script.py +++ b/src/metadata/grep_annotation_column/script.py @@ -19,93 +19,129 @@ "input_id": "mouse", "output_match_column": "test", "output_fraction_column": "fraction_test", - "output_compression": "gzip" + "output_compression": "gzip", } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def describe_array(arr, msg): # Note: sc.stats returns a DescribeResult NamedTuple. For NamedTuples, # the _asdict method is public facing even though it starts with an underscore. 
description = sc.stats.describe(arr)._asdict() - logger.info("%s:\nshape: %s\nmean: %s\nnobs: %s\n" - "variance: %s\nmin: %s\nmax: %s\ncontains na: %s\ndtype: %s\ncontains 0: %s", - msg, arr.shape, description["mean"], description["nobs"], - description["variance"], description["minmax"][0], - description["minmax"][1], np.isnan(arr).any(), arr.dtype, - (arr == 0).any()) + logger.info( + "%s:\nshape: %s\nmean: %s\nnobs: %s\n" + "variance: %s\nmin: %s\nmax: %s\ncontains na: %s\ndtype: %s\ncontains 0: %s", + msg, + arr.shape, + description["mean"], + description["nobs"], + description["variance"], + description["minmax"][0], + description["minmax"][1], + np.isnan(arr).any(), + arr.dtype, + (arr == 0).any(), + ) def main(par): - input_file, output_file, mod_name = Path(par["input"]), Path(par["output"]), par['modality'] + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) logger.info(f"Compiling regular expression '{par['regex_pattern']}'.") try: compiled_regex = re.compile(par["regex_pattern"]) except (TypeError, re.error) as e: - raise ValueError(f"{par['regex_pattern']} is not a valid regular expression pattern.") from e + raise ValueError( + f"{par['regex_pattern']} is not a valid regular expression pattern." + ) from e else: if compiled_regex.groups: - raise NotImplementedError("Using match groups is not supported by this component.") - logger.info('Reading input file %s, modality %s.', input_file, mod_name) + raise NotImplementedError( + "Using match groups is not supported by this component." + ) + logger.info("Reading input file %s, modality %s.", input_file, mod_name) mudata = mu.read_h5mu(input_file) modality_data = mudata[mod_name] logger.info("Reading input file done.") logger.info("Using annotation dataframe '%s'.", par["matrix"]) - annotation_matrix = getattr(modality_data, par['matrix']) - default_column = { - "var": attrgetter("var_names"), - "obs": attrgetter("obs_names") - } + annotation_matrix = getattr(modality_data, par["matrix"]) + default_column = {"var": attrgetter("var_names"), "obs": attrgetter("obs_names")} if par["input_column"]: logger.info("Input column '%s' was specified.", par["input_column"]) try: annotation_column = annotation_matrix[par["input_column"]] except KeyError as e: - raise ValueError(f"Column {par['input_column']} could not be found for modality " - f"{par['modality']}. Available columns:" - f" {','.join(annotation_matrix.columns.to_list())}") from e + raise ValueError( + f"Column {par['input_column']} could not be found for modality " + f"{par['modality']}. 
Available columns:" + f" {','.join(annotation_matrix.columns.to_list())}" + ) from e else: logger.info(f"No input column specified, using '.{par['matrix']}_names'") - annotation_column = default_column[par['matrix']](modality_data).to_series() + annotation_column = default_column[par["matrix"]](modality_data).to_series() logger.info("Applying regex search.") grep_result = annotation_column.str.contains(par["regex_pattern"], regex=True) logger.info("Search results: %s", grep_result.value_counts()) - other_axis_attribute = { - "var": "obs", - "obs": "var" - } - if par['output_fraction_column']: - logger.info("Enabled writing the fraction of values that matches to the pattern.") - input_layer = modality_data.X if not par["input_layer"] else modality_data.layers[par["input_layer"]] + other_axis_attribute = {"var": "obs", "obs": "var"} + if par["output_fraction_column"]: + logger.info( + "Enabled writing the fraction of values that matches to the pattern." + ) + input_layer = ( + modality_data.X + if not par["input_layer"] + else modality_data.layers[par["input_layer"]] + ) totals = np.ravel(input_layer.sum(axis=1)) describe_array(totals, "Summary of total counts for layer") counts_for_matches = np.ravel(input_layer[:, grep_result].sum(axis=1)) describe_array(counts_for_matches, "Summary of counts matching grep") - with np.errstate(all='raise'): - pct_matching = np.divide(counts_for_matches, totals, - out=np.zeros_like(totals, dtype=np.float64), - where=(~np.isclose(totals, np.zeros_like(totals)))) + with np.errstate(all="raise"): + pct_matching = np.divide( + counts_for_matches, + totals, + out=np.zeros_like(totals, dtype=np.float64), + where=(~np.isclose(totals, np.zeros_like(totals))), + ) logger.info("Testing wether or not fractions data contains NA.") assert ~np.isnan(pct_matching).any(), "Fractions should not contain NA." 
logger.info("Fraction statistics: \n%s", Series(pct_matching).describe()) pct_matching = np.where(np.isclose(pct_matching, 0, atol=1e-6), 0, pct_matching) pct_matching = np.where(np.isclose(pct_matching, 1, atol=1e-6), 1, pct_matching) - assert (np.logical_and(pct_matching >= 0, pct_matching <= 1)).all(), \ - "Fractions are not within bounds, please report this as a bug" - output_matrix = other_axis_attribute[par['matrix']] - logger.info("Writing fractions to matrix '%s', column '%s'", - output_matrix, par['output_fraction_column']) - getattr(modality_data, output_matrix)[par['output_fraction_column']] = pct_matching - logger.info("Adding values that matched the pattern to '%s', column '%s'", - par["matrix"], par["output_match_column"]) - getattr(modality_data, par['matrix'])[par["output_match_column"]] = grep_result - logger.info("Writing out data to '%s' with compression '%s'.", - output_file, par["output_compression"]) + assert ( + np.logical_and(pct_matching >= 0, pct_matching <= 1) + ).all(), "Fractions are not within bounds, please report this as a bug" + output_matrix = other_axis_attribute[par["matrix"]] + logger.info( + "Writing fractions to matrix '%s', column '%s'", + output_matrix, + par["output_fraction_column"], + ) + getattr(modality_data, output_matrix)[par["output_fraction_column"]] = ( + pct_matching + ) + logger.info( + "Adding values that matched the pattern to '%s', column '%s'", + par["matrix"], + par["output_match_column"], + ) + getattr(modality_data, par["matrix"])[par["output_match_column"]] = grep_result + logger.info( + "Writing out data to '%s' with compression '%s'.", + output_file, + par["output_compression"], + ) mudata.write(output_file, compression=par["output_compression"]) + if __name__ == "__main__": - main(par) \ No newline at end of file + main(par) diff --git a/src/metadata/grep_annotation_column/test.py b/src/metadata/grep_annotation_column/test.py index e0645270625..b04b5842985 100644 --- a/src/metadata/grep_annotation_column/test.py +++ b/src/metadata/grep_annotation_column/test.py @@ -1,6 +1,3 @@ - - - import sys import pytest import pandas as pd @@ -17,10 +14,10 @@ ## VIASH START meta = { - 'executable': './target/executable/metadata/grep_annotation_column/grep_annotation_column', - 'resources_dir': './resources_test/concat_test_data/', - 'cpus': 2, - 'config': './src/metadata/grep_annotation_column/config.vsh.yaml' + "executable": "./target/executable/metadata/grep_annotation_column/grep_annotation_column", + "resources_dir": "./resources_test/concat_test_data/", + "cpus": 2, + "config": "./src/metadata/grep_annotation_column/config.vsh.yaml", } ## VIASH END @@ -28,345 +25,469 @@ @pytest.fixture def generate_h5mu(): # generate data - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) obs = pd.DataFrame([["A"], ["B"]], index=df.index, columns=["Obs"]) - var = pd.DataFrame([["a"], ["b"], ["c"]], - index=df.columns, columns=["Feat"]) + var = pd.DataFrame([["a"], ["b"], ["c"]], index=df.columns, columns=["Feat"]) ad1 = AnnData(df, obs=obs, var=var) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = AnnData(df, obs=obs2, var=var2) - tmp_mudata = MuData({'mod1': ad1, 'mod2': ad2}) + tmp_mudata = MuData({"mod1": ad1, "mod2": ad2}) return tmp_mudata -@pytest.fixture(params=[ - np.float32, np.float64, 
np.uint8, np.uint16, np.uint32, np.int8, np.int16, np.int64 -]) + +@pytest.fixture( + params=[ + np.float32, + np.float64, + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int64, + ] +) def very_sparse_mudata(request): # NOTE: np.float16 is not a supported type in scipy! # See https://github.com/scipy/scipy/issues/20200#issuecomment-1982170609 # and https://github.com/scipy/scipy/issues/20200 rng = np.random.default_rng() shape = (10000, 200) - random_counts = scipy.sparse.random(*shape, - density=0.00001, - format='csr', - dtype=request.param, - random_state=rng, - data_rvs=lambda length: np.array([1] * length)) - permutation_length_obs = int(math.log(shape[0])/math.log(26)) + 1 - obs_index_perms = permutations('abcdefghijklmnopqrstuvwxyz', r=permutation_length_obs) + random_counts = scipy.sparse.random( + *shape, + density=0.00001, + format="csr", + dtype=request.param, + random_state=rng, + data_rvs=lambda length: np.array([1] * length), + ) + permutation_length_obs = int(math.log(shape[0]) / math.log(26)) + 1 + obs_index_perms = permutations( + "abcdefghijklmnopqrstuvwxyz", r=permutation_length_obs + ) obs_index = pd.Index(["".join(x) for x in islice(obs_index_perms, shape[0])]) obs = pd.DataFrame(index=obs_index) - permutation_length_var = int(math.log(shape[1])/math.log(26)) + 1 - var_index_perms = permutations("ABCDEFGHIJKLMNOPQRSTUVWXYZ", r=permutation_length_var) + permutation_length_var = int(math.log(shape[1]) / math.log(26)) + 1 + var_index_perms = permutations( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ", r=permutation_length_var + ) var_index = pd.Index(["".join(x) for x in islice(var_index_perms, shape[1])]) var = pd.DataFrame(index=var_index) mod = AnnData(X=random_counts, var=var, obs=obs) - return MuData({'mod1': mod}) - -@pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) -def test_grep_column(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file, - compression_format): - output_path = random_h5mu_path() - input_path = write_mudata_to_file(generate_h5mu) + return MuData({"mod1": mod}) - # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "^var1", - "--output_match_column", "test", - "--output_compression", compression_format - ]) - assert output_path.is_file() - - # check output - output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False] - - assert_annotation_objects_equal(input_path, - remove_annotation_column(output_data, "test", "var", "mod1"), - check_data=True) @pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) -def test_grep_column_default(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file, - compression_format): +def test_grep_column_default( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): output_path = random_h5mu_path() input_path = write_mudata_to_file(generate_h5mu) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--regex_pattern", "^var1", - "--output_match_column", "test", - "--output_compression", compression_format - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--regex_pattern", + "^var1", + 
"--output_match_column", + "test", + "--output_compression", + compression_format, + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False] + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + + assert_annotation_objects_equal( + input_path, + remove_annotation_column(output_data, "test", "var", "mod1"), + check_data=True, + ) - assert_annotation_objects_equal(input_path, - remove_annotation_column(output_data, "test", "var", "mod1"), - check_data=True) @pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) -def test_grep_column(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file, - compression_format): +def test_grep_column( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): output_path = random_h5mu_path() input_path = write_mudata_to_file(generate_h5mu) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "^a", - "--output_match_column", "test", - "--output_compression", compression_format - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_compression", + compression_format, + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False] + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + + assert_annotation_objects_equal( + input_path, + remove_annotation_column(output_data, "test", "var", "mod1"), + check_data=True, + ) - assert_annotation_objects_equal(input_path, - remove_annotation_column(output_data, "test", "var", "mod1"), - check_data=True) @pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) -def test_grep_column_fraction_column(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file, - compression_format): +def test_grep_column_fraction_column( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): output_path = random_h5mu_path() input_path = write_mudata_to_file(generate_h5mu) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "^a", - "--output_match_column", "test", - "--output_fraction_column", "test_output_fraction", - "--output_compression", compression_format - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + "--output_compression", + compression_format, + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in 
output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False] - assert output_data.mod['mod1'].obs['test_output_fraction'].to_list() == [1/6, 4/15] - output_object_without_obs_column = remove_annotation_column(output_data, - ["test_output_fraction"], - "obs", "mod1") - output_object_without_wo_output = remove_annotation_column(output_object_without_obs_column, - ["test"], - "var", "mod1") - assert_annotation_objects_equal(input_path, - output_object_without_wo_output, - check_data=True) + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [ + 1 / 6, + 4 / 15, + ] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + @pytest.mark.parametrize("compression_format", ["gzip", "lzf"]) -def test_fraction_column_nothing_matches(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file, - compression_format): +def test_fraction_column_nothing_matches( + run_component, + generate_h5mu, + random_h5mu_path, + write_mudata_to_file, + compression_format, +): output_path = random_h5mu_path() input_path = write_mudata_to_file(generate_h5mu) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "^doesnotmatch$", - "--output_match_column", "test", - "--output_fraction_column", "test_output_fraction", - "--output_compression", compression_format - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^doesnotmatch$", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + "--output_compression", + compression_format, + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [False, False, False] - assert output_data.mod['mod1'].obs['test_output_fraction'].to_list() == [0.0, 0.0] - output_object_without_obs_column = remove_annotation_column(output_data, - ["test_output_fraction"], - "obs", "mod1") - output_object_without_wo_output = remove_annotation_column(output_object_without_obs_column, - ["test"], - "var", "mod1") - assert_annotation_objects_equal(input_path, - output_object_without_wo_output, - check_data=True) - -def test_fraction_column_with_no_counts(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file): + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [False, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [0.0, 0.0] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + 
assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +def test_fraction_column_with_no_counts( + run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file +): output_path = random_h5mu_path() - generate_h5mu.mod['mod1'].X[0] = 0 + generate_h5mu.mod["mod1"].X[0] = 0 input_path = write_mudata_to_file(generate_h5mu) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "^a", - "--output_match_column", "test", - "--output_fraction_column", "test_output_fraction" - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False] - assert output_data.mod['mod1'].obs['test_output_fraction'].to_list() == [0.0, 4/15] - output_object_without_obs_column = remove_annotation_column(output_data, - ["test_output_fraction"], - "obs", "mod1") - output_object_without_wo_output = remove_annotation_column(output_object_without_obs_column, - ["test"], - "var", "mod1") - assert_annotation_objects_equal(input_path, - output_object_without_wo_output, - check_data=True) - - -def test_fraction_column_very_sparse(run_component, very_sparse_mudata, - random_h5mu_path, write_mudata_to_file): + assert "test" in output_data.mod["mod1"].var.columns.to_list() + assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False] + assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [ + 0.0, + 4 / 15, + ] + output_object_without_obs_column = remove_annotation_column( + output_data, ["test_output_fraction"], "obs", "mod1" + ) + output_object_without_wo_output = remove_annotation_column( + output_object_without_obs_column, ["test"], "var", "mod1" + ) + assert_annotation_objects_equal( + input_path, output_object_without_wo_output, check_data=True + ) + + +def test_fraction_column_very_sparse( + run_component, very_sparse_mudata, random_h5mu_path, write_mudata_to_file +): output_path = random_h5mu_path() input_path = write_mudata_to_file(very_sparse_mudata) # run component - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--regex_pattern", "^a", - "--output_match_column", "test", - "--output_fraction_column", "test_output_fraction" - ]) + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_fraction_column", + "test_output_fraction", + ] + ) assert output_path.is_file() # check output output_data = read_h5mu(output_path) - assert "test" in output_data.mod['mod1'].var.columns.to_list() - output_object_without_obs_column = remove_annotation_column(output_data, - ["test_output_fraction"], - "obs", "mod1") - output_object_without_wo_output = remove_annotation_column(output_object_without_obs_column, - ["test"], - "var", "mod1") - assert_annotation_objects_equal(input_path, - output_object_without_wo_output, - check_data=True) 
+    assert "test" in output_data.mod["mod1"].var.columns.to_list()
+    output_object_without_obs_column = remove_annotation_column(
+        output_data, ["test_output_fraction"], "obs", "mod1"
+    )
+    output_object_without_wo_output = remove_annotation_column(
+        output_object_without_obs_column, ["test"], "var", "mod1"
+    )
+    assert_annotation_objects_equal(
+        input_path, output_object_without_wo_output, check_data=True
+    )
+
 
 @pytest.mark.parametrize("compression_format", ["gzip", "lzf"])
-def test_fraction_column_input_layer(run_component, generate_h5mu,
-                                     random_h5mu_path, write_mudata_to_file,
-                                     compression_format):
+def test_fraction_column_input_layer(
+    run_component,
+    generate_h5mu,
+    random_h5mu_path,
+    write_mudata_to_file,
+    compression_format,
+):
     output_path = random_h5mu_path()
     generate_h5mu.mod["mod1"].layers["test_data"] = generate_h5mu.mod["mod1"].X.copy()
     generate_h5mu.mod["mod1"].X = None
     input_path = write_mudata_to_file(generate_h5mu)
 
     # run component
-    run_component([
-        "--input", str(input_path),
-        "--output", str(output_path),
-        "--modality", "mod1",
-        "--matrix", "var",
-        "--input_column", "Feat",
-        "--regex_pattern", "^a",
-        "--output_match_column", "test",
-        "--output_fraction_column", "test_output_fraction",
-        "--input_layer", "test_data",
-        "--output_compression", compression_format
-    ])
+    run_component(
+        [
+            "--input",
+            str(input_path),
+            "--output",
+            str(output_path),
+            "--modality",
+            "mod1",
+            "--matrix",
+            "var",
+            "--input_column",
+            "Feat",
+            "--regex_pattern",
+            "^a",
+            "--output_match_column",
+            "test",
+            "--output_fraction_column",
+            "test_output_fraction",
+            "--input_layer",
+            "test_data",
+            "--output_compression",
+            compression_format,
+        ]
+    )
    assert output_path.is_file()
 
     # check output
     output_data = read_h5mu(output_path)
-    assert "test" in output_data.mod['mod1'].var.columns.to_list()
-    assert output_data.mod['mod1'].var['test'].to_list() == [True, False, False]
-    output_data.mod['mod1'].obs['test_output_fraction'].to_list() == [1/6, 4/15]
-    output_object_without_obs_column = remove_annotation_column(output_data,
-                                                                ["test_output_fraction"],
-                                                                "obs", "mod1")
-    output_object_without_wo_output = remove_annotation_column(output_object_without_obs_column,
-                                                               ["test"],
-                                                               "var", "mod1")
-    assert_annotation_objects_equal(input_path,
-                                    output_object_without_wo_output,
-                                    check_data=True)
-
-
-def test_missing_column(run_component, generate_h5mu,
-                        random_h5mu_path, write_mudata_to_file):
+    assert "test" in output_data.mod["mod1"].var.columns.to_list()
+    assert output_data.mod["mod1"].var["test"].to_list() == [True, False, False]
+    assert output_data.mod["mod1"].obs["test_output_fraction"].to_list() == [
+        1 / 6,
+        4 / 15,
+    ]
+    output_object_without_obs_column = remove_annotation_column(
+        output_data, ["test_output_fraction"], "obs", "mod1"
+    )
+    output_object_without_wo_output = remove_annotation_column(
+        output_object_without_obs_column, ["test"], "var", "mod1"
+    )
+    assert_annotation_objects_equal(
+        input_path, output_object_without_wo_output, check_data=True
+    )
+
+
+def test_missing_column(
+    run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file
+):
     output_path = random_h5mu_path()
     input_path = write_mudata_to_file(generate_h5mu)
 
     with pytest.raises(CalledProcessError) as err:
-        run_component([
-            "--input", str(input_path),
-            "--output", str(output_path),
-            "--modality", "mod1",
-            "--matrix", "var",
-            "--input_column", "filliberke",
-            "--regex_pattern", "^a",
-            "--output_match_column", "test",
-            "--output_compression", "gzip",
-        ])
-    assert "ValueError: Column 
filliberke could not be found for modality mod1" in \ - err.value.stdout.decode('utf-8') - -def test_invalid_regex_pattern(run_component, generate_h5mu, - random_h5mu_path, write_mudata_to_file): + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "filliberke", + "--regex_pattern", + "^a", + "--output_match_column", + "test", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: Column filliberke could not be found for modality mod1" + in err.value.stdout.decode("utf-8") + ) + + +def test_invalid_regex_pattern( + run_component, generate_h5mu, random_h5mu_path, write_mudata_to_file +): output_path = random_h5mu_path() input_path = write_mudata_to_file(generate_h5mu) with pytest.raises(CalledProcessError) as err: - run_component([ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "mod1", - "--matrix", "var", - "--input_column", "Feat", - "--regex_pattern", "(a", - "--output_match_column", "test", - "--output_compression", "gzip", - ]) - assert "ValueError: (a is not a valid regular expression pattern." in \ - err.value.stdout.decode('utf-8') + run_component( + [ + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "mod1", + "--matrix", + "var", + "--input_column", + "Feat", + "--regex_pattern", + "(a", + "--output_match_column", + "test", + "--output_compression", + "gzip", + ] + ) + assert ( + "ValueError: (a is not a valid regular expression pattern." + in err.value.stdout.decode("utf-8") + ) if __name__ == "__main__": sys.exit(pytest.main([__file__, "-s"])) - diff --git a/src/metadata/join_csv/script.py b/src/metadata/join_csv/script.py index 58250a4916e..f3c8285453b 100644 --- a/src/metadata/join_csv/script.py +++ b/src/metadata/join_csv/script.py @@ -10,12 +10,13 @@ "modality": "rna", "csv_key": "id", "obs_key": "sample_id", - "var_key": None + "var_key": None, } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() if par["obs_key"] and par["var_key"]: @@ -24,15 +25,15 @@ raise ValueError("Must define either --obs_key or --var_key") logger.info("Read metadata csv from file") -metadata = pd.read_csv(par['input_csv'], sep=",", header=0, index_col=par["csv_key"]) -metadata.fillna('', inplace=True) +metadata = pd.read_csv(par["input_csv"], sep=",", header=0, index_col=par["csv_key"]) +metadata.fillna("", inplace=True) logger.info("Read mudata from file") -mdata = read_h5mu(par['input']) -mod_data = mdata.mod[par['modality']] +mdata = read_h5mu(par["input"]) +mod_data = mdata.mod[par["modality"]] logger.info("Joining csv to mudata") -matrix = 'var' if par["var_key"] else 'obs' +matrix = "var" if par["var_key"] else "obs" matrix_sample_column_name = par["var_key"] if par["var_key"] else par["obs_key"] original_matrix = getattr(mod_data, matrix) sample_ids = original_matrix[matrix_sample_column_name] @@ -40,16 +41,15 @@ try: new_columns = metadata.loc[sample_ids.tolist()] except KeyError as e: - raise KeyError(f"Not all sample IDs selected from {matrix} " - "(using the column selected with --var_key or --obs_key) were found in " - "the csv file.") from e -new_matrix = pd.concat([original_matrix.reset_index(drop=True), - new_columns.reset_index(drop=True)], axis=1)\ - .set_axis(original_matrix.index) + raise KeyError( + f"Not all sample IDs selected from {matrix} " + "(using the column selected with --var_key or --obs_key) were found in " + "the csv 
file." + ) from e +new_matrix = pd.concat( + [original_matrix.reset_index(drop=True), new_columns.reset_index(drop=True)], axis=1 +).set_axis(original_matrix.index) setattr(mod_data, matrix, new_matrix) logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) - - - +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/metadata/join_csv/test.py b/src/metadata/join_csv/test.py index 754d73284eb..b50e19c461c 100644 --- a/src/metadata/join_csv/test.py +++ b/src/metadata/join_csv/test.py @@ -8,91 +8,129 @@ ## VIASH START meta = { - 'executable': './target/executable/integrate/add_metadata/add_metadata', + "executable": "./target/executable/integrate/add_metadata/add_metadata", } ## VIASH END + @pytest.fixture def modality_1(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) - obs = pd.DataFrame([["A", "sample1"], ["B", "sample2"]], index=df.index, columns=["Obs", "sample_id"]) - var = pd.DataFrame([["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], - index=df.columns, columns=["Feat", "sample_id_var"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) return AnnData(df, obs=obs, var=var) + @pytest.fixture def modality_2(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) - return AnnData(df, obs=obs2, var=var2) + return AnnData(df, obs=obs2, var=var2) + @pytest.fixture def sample_h5mu(modality_1, modality_2): - mudata = MuData({'mod1': modality_1, 'mod2': modality_2}) + mudata = MuData({"mod1": modality_1, "mod2": modality_2}) return mudata + @pytest.fixture def sample_h5mu_path(sample_h5mu, random_path): output = random_path() sample_h5mu.write(output) return output + def test_add_metadata_var(run_component, random_path, sample_h5mu, sample_h5mu_path): input_csv = random_path("csv") output_h5mu = random_path("h5mu") # create csv - csv = pd.DataFrame({"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]}) + csv = pd.DataFrame( + {"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) csv.to_csv(str(input_csv), index=False) - run_component([ - "--input", str(sample_h5mu_path), - "--input_csv", str(input_csv), - "--output", str(output_h5mu), - "--modality", "mod1", - "--var_key", "sample_id_var", - "--csv_key", "id", - "--compression_output", "gzip" - ]) + run_component( + [ + "--input", + str(sample_h5mu_path), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--var_key", + "sample_id_var", + "--csv_key", + "id", + "--compression_output", + "gzip", + ] + ) output_mudata = read_h5mu(output_h5mu) expected_var = pd.DataFrame( { "Feat": ["a", "b", "c"], "sample_id_var": ["sample1", "sample2", "sample1"], "foo": ["v", "w", "v"], - "bar": ["x", "y", "x"] + "bar": ["x", "y", "x"], }, - index=pd.Index(['var1', 'var2', 'var3']), + index=pd.Index(["var1", "var2", "var3"]), 
).astype( { "Feat": "object", "sample_id_var": "category", "foo": "category", - "bar": "category" + "bar": "category", } ) sample_h5mu.mod["mod1"].var = expected_var assert_annotation_objects_equal(sample_h5mu, output_mudata) + def test_add_metadata_matrix_sample_column(run_component, tmp_path, sample_h5mu): input_h5mu, input_mudata = sample_h5mu input_csv = tmp_path / "input.csv" output_h5mu = tmp_path / "output.h5mu" # create csv - csv = pd.DataFrame({"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]}) + csv = pd.DataFrame( + {"id": ["sample1", "sample2"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) csv.to_csv(str(input_csv), index=False) - run_component([ - "--input", str(input_h5mu), - "--input_csv", str(input_csv), - "--output", str(output_h5mu), - "--modality", "mod1", - "--obs_key", "sample_id", - "--csv_key", "id", - ]) + run_component( + [ + "--input", + str(input_h5mu), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--obs_key", + "sample_id", + "--csv_key", + "id", + ] + ) result = read_h5mu(output_h5mu) expected_obs = pd.DataFrame( @@ -100,40 +138,49 @@ def test_add_metadata_matrix_sample_column(run_component, tmp_path, sample_h5mu) "Obs": ["A", "B"], "sample_id": ["sample1", "sample2"], "foo": ["v", "w"], - "bar": ["x", "y"] + "bar": ["x", "y"], }, - index=pd.Index(['obs1', 'obs2']), - ).astype( - { - "Obs": "object", - "foo": "object", - "bar": "object" - } - ) - pd.testing.assert_frame_equal(result.mod['mod1'].obs, expected_obs) - pd.testing.assert_frame_equal(result.mod['mod1'].var, input_mudata.mod['mod1'].var) - pd.testing.assert_frame_equal(result.mod['mod2'].obs, input_mudata.mod['mod2'].obs) - pd.testing.assert_frame_equal(result.mod['mod2'].var, input_mudata.mod['mod2'].var) + index=pd.Index(["obs1", "obs2"]), + ).astype({"Obs": "object", "foo": "object", "bar": "object"}) + pd.testing.assert_frame_equal(result.mod["mod1"].obs, expected_obs) + pd.testing.assert_frame_equal(result.mod["mod1"].var, input_mudata.mod["mod1"].var) + pd.testing.assert_frame_equal(result.mod["mod2"].obs, input_mudata.mod["mod2"].obs) + pd.testing.assert_frame_equal(result.mod["mod2"].var, input_mudata.mod["mod2"].var) + def test_add_not_all_samples_in_csv_raises(run_component, tmp_path, sample_h5mu): input_h5mu, input_mudata = sample_h5mu input_csv = tmp_path / "input.csv" output_h5mu = tmp_path / "output.h5mu" - csv = pd.DataFrame({"id": ["sample1", "lorem"], "foo": ["v", "w"], "bar": ["x", "y"]}) + csv = pd.DataFrame( + {"id": ["sample1", "lorem"], "foo": ["v", "w"], "bar": ["x", "y"]} + ) csv.to_csv(str(input_csv), index=False) with pytest.raises(subprocess.CalledProcessError) as err: - run_component([ - "--input", str(input_h5mu), - "--input_csv", str(input_csv), - "--output", str(output_h5mu), - "--modality", "mod1", - "--obs_key", "sample_id", - "--csv_key", "id", - ]) - assert "Not all sample IDs selected from obs (using the column selected " \ - "with --var_key or --obs_key) were found in the csv file." in err.value.stdout.decode('utf-8') + run_component( + [ + "--input", + str(input_h5mu), + "--input_csv", + str(input_csv), + "--output", + str(output_h5mu), + "--modality", + "mod1", + "--obs_key", + "sample_id", + "--csv_key", + "id", + ] + ) + assert ( + "Not all sample IDs selected from obs (using the column selected " + "with --var_key or --obs_key) were found in the csv file." 
+ in err.value.stdout.decode("utf-8") + ) + if __name__ == "__main__": - sys.exit(pytest.main([__file__, "-k", "test_add_metadata_var"])) \ No newline at end of file + sys.exit(pytest.main([__file__, "-k", "test_add_metadata_var"])) diff --git a/src/metadata/join_uns_to_obs/script.py b/src/metadata/join_uns_to_obs/script.py index 49451e8c758..dbe5c5de483 100644 --- a/src/metadata/join_uns_to_obs/script.py +++ b/src/metadata/join_uns_to_obs/script.py @@ -7,21 +7,22 @@ "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", "uns_key": "metrics_cellranger", "output": "foo.h5mu", - "modality": "rna" + "modality": "rna", } ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Read mudata from file") -mdata = read_h5mu(par['input']) -mod_data = mdata.mod[par['modality']] +mdata = read_h5mu(par["input"]) +mod_data = mdata.mod[par["modality"]] logger.info("Joining uns to obs") # get data frame -uns_df = mod_data.uns[par['uns_key']] +uns_df = mod_data.uns[par["uns_key"]] # check for overlapping colnames intersect_keys = uns_df.keys().intersection(mod_data.obs.keys()) @@ -35,7 +36,4 @@ mod_data.obs = pd.concat([obs_drop, uns_df_rep], axis=1) logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) - - - +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/metadata/join_uns_to_obs/test.py b/src/metadata/join_uns_to_obs/test.py index c2170ba2bff..92922ef3f0d 100644 --- a/src/metadata/join_uns_to_obs/test.py +++ b/src/metadata/join_uns_to_obs/test.py @@ -7,48 +7,73 @@ ## VIASH START meta = { - 'executable': './target/executable/metadata/join_uns_to_obs/join_uns_to_obs', - 'config': './src/metadata/join_uns_to_obs/config.vsh.yml' + "executable": "./target/executable/metadata/join_uns_to_obs/join_uns_to_obs", + "config": "./src/metadata/join_uns_to_obs/config.vsh.yml", } ## VIASH END + @pytest.fixture def ad_w_uns(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) - obs = pd.DataFrame([["A", "sample1"], ["B", "sample2"]], index=df.index, columns=["Obs", "sample_id"]) - var = pd.DataFrame([["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], - index=df.columns, columns=["Feat", "sample_id_var"]) - obsm = pd.DataFrame([["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + obsm = pd.DataFrame( + [["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"] + ) return AnnData(df, obs=obs, var=var, uns={"obsm1": obsm}) + @pytest.fixture def ad_wo_uns(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) var = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) - return AnnData(df, obs=obs, var=var) + return AnnData(df, obs=obs, var=var) + @pytest.fixture def sample_h5mu(ad_w_uns, ad_wo_uns): - mudata = MuData({'mod1': ad_w_uns, 'mod2': 
ad_wo_uns}) + mudata = MuData({"mod1": ad_w_uns, "mod2": ad_wo_uns}) return mudata -def test_join_uns_to_obs(run_component, random_h5mu_path, write_mudata_to_file, sample_h5mu): + +def test_join_uns_to_obs( + run_component, random_h5mu_path, write_mudata_to_file, sample_h5mu +): input_file = write_mudata_to_file(sample_h5mu) output_file = random_h5mu_path() - run_component([ - "--input", str(input_file), - "--modality", "mod1", - "--uns_key", "obsm1", - "--output", str(output_file) - ]) + run_component( + [ + "--input", + str(input_file), + "--modality", + "mod1", + "--uns_key", + "obsm1", + "--output", + str(output_file), + ] + ) expected_obs = pd.DataFrame( { "Obs": ["A", "B"], "sample_id": ["sample1", "sample2"], "uns_col1": ["X", "X"], - "uns_col2": ["W", "W"] + "uns_col2": ["W", "W"], }, index=pd.Index(["obs1", "obs2"]), ).astype( @@ -62,10 +87,11 @@ def test_join_uns_to_obs(run_component, random_h5mu_path, write_mudata_to_file, assert output_file.is_file() output_mudata = read_h5mu(output_file) - assert 'obsm1' in output_mudata.mod['mod1'].uns + assert "obsm1" in output_mudata.mod["mod1"].uns sample_h5mu.mod["mod1"].obs = expected_obs assert_annotation_objects_equal(sample_h5mu, output_mudata) -if __name__ == '__main__': + +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/metadata/move_obsm_to_obs/script.py b/src/metadata/move_obsm_to_obs/script.py index 7946afc3766..fd9b0e2fe04 100644 --- a/src/metadata/move_obsm_to_obs/script.py +++ b/src/metadata/move_obsm_to_obs/script.py @@ -9,18 +9,19 @@ "modality": "mod1", "obsm_key": "obsm_key", "output": "output.h5mu", - "output_compression": "gzip" + "output_compression": "gzip", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Read mudata from file") -mdata = read_h5mu(par['input']) +mdata = read_h5mu(par["input"]) try: - mod_data = mdata.mod[par['modality']] + mod_data = mdata.mod[par["modality"]] except KeyError: raise ValueError(f"Modality {par['modality']} does not exist.") @@ -28,30 +29,39 @@ try: obsm_matrix = mod_data.obsm[par["obsm_key"]].copy() except KeyError: - raise ValueError(f".obsm key {par['obsm_key']} was not found in " - f".obsm slot for modality {par['modality']}.") + raise ValueError( + f".obsm key {par['obsm_key']} was not found in " + f".obsm slot for modality {par['modality']}." + ) -obsm_matrix.rename(partial("{key}_{}".format, key=par["obsm_key"]), - axis="columns", copy=False, inplace=True) +obsm_matrix.rename( + partial("{key}_{}".format, key=par["obsm_key"]), + axis="columns", + copy=False, + inplace=True, +) original_n_obs = len(mod_data.obs) try: logger.info(f".obs names: {mod_data.obs_names}") logger.info(f".obsm index: {obsm_matrix.index}") new_obs = mod_data.obs.drop(obsm_matrix.columns, axis=1, errors="ignore") - new_obs = new_obs.merge(obsm_matrix, how="left", - validate="one_to_one", - left_index=True, right_index=True) + new_obs = new_obs.merge( + obsm_matrix, + how="left", + validate="one_to_one", + left_index=True, + right_index=True, + ) mod_data.obs = new_obs -except MergeError as e: - raise ValueError(f"Could not join .obsm matrix at {par['obsm_key']} to .obs because there " - "are some observation that are not overlapping between the two matrices " - "(indexes should overlap). 
This is either a bug or your mudata file is corrupt.") +except MergeError: + raise ValueError( + f"Could not join .obsm matrix at {par['obsm_key']} to .obs because there " + "are some observations that are not overlapping between the two matrices " + "(indexes should overlap). This is either a bug or your mudata file is corrupt." + ) del mod_data.obsm[par["obsm_key"]] logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) - - - +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/metadata/move_obsm_to_obs/test.py b/src/metadata/move_obsm_to_obs/test.py index 81e3df580f2..923145b9410 100644 --- a/src/metadata/move_obsm_to_obs/test.py +++ b/src/metadata/move_obsm_to_obs/test.py @@ -8,110 +8,178 @@ ## VIASH START meta = { - 'name': 'move_obsm_to_obs', - 'resources_dir': 'resources_test/', - 'executable': 'target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs', - 'config': 'src/metadata/move_obsm_to_obs/config.vsh.yaml' + "name": "move_obsm_to_obs", + "resources_dir": "resources_test/", + "executable": "target/executable/metadata/move_obsm_to_obs/move_obsm_to_obs", + "config": "src/metadata/move_obsm_to_obs/config.vsh.yaml", } ## VIASH END + @pytest.fixture def h5mu(): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) - obs = pd.DataFrame([["A", "sample1"], ["B", "sample2"]], index=df.index, columns=["Obs", "sample_id"]) - var = pd.DataFrame([["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], - index=df.columns, columns=["Feat", "sample_id_var"]) - obsm = {"obsm_key": pd.DataFrame([["foo", "bar"], ["lorem", "ipsum"]], - index=obs.index, columns=["obsm_col1", "obsm_col2"])} - ad1 = AnnData(df, obs=obs, var=var, obsm=obsm) - var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) - obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) - ad2 = AnnData(df, obs=obs2, var=var2) - return MuData({'mod1': ad1, 'mod2': ad2}) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", "sample_id_var"], + ) + obsm = { + "obsm_key": pd.DataFrame( + [["foo", "bar"], ["lorem", "ipsum"]], + index=obs.index, + columns=["obsm_col1", "obsm_col2"], + ) + } + ad1 = AnnData(df, obs=obs, var=var, obsm=obsm) + var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) + obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) + ad2 = AnnData(df, obs=obs2, var=var2) + return MuData({"mod1": ad1, "mod2": ad2}) + @pytest.fixture def write_temp_h5mu(tmp_path): - def wrapper(test_h5mu): + def wrapper(test_h5mu): test_h5mu_path = tmp_path / f"{str(uuid.uuid4())}.h5mu" test_h5mu.write_h5mu(test_h5mu_path) return test_h5mu_path + return wrapper + @pytest.fixture def h5mu_with_non_overlapping_observations(h5mu): - h5mu.mod['mod1'].obsm['obsm_key'].index = pd.Index(["obs1", "doesnotexist"]) + h5mu.mod["mod1"].obsm["obsm_key"].index = pd.Index(["obs1", "doesnotexist"]) return h5mu def test_move_obsm_to_obs(run_component, h5mu, write_temp_h5mu, tmp_path): - output = tmp_path/ "output.h5mu" - run_component(["--input", write_temp_h5mu(h5mu), - "--modality", "mod1", - "--obsm_key", "obsm_key", - "--output", output - ]) + output = tmp_path / 
"output.h5mu" + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) assert output.is_file(), "Some output file must have been created." output_data = read_h5mu(output) - pd.testing.assert_index_equal(output_data.mod['mod1'].obs.index, pd.Index(['obs1', 'obs2'])) - pd.testing.assert_index_equal(output_data.mod['mod1'].obs.columns, - pd.Index(['Obs', 'sample_id', 'obsm_key_obsm_col1', 'obsm_key_obsm_col2'])) - assert 'obsm_key' not in output_data.mod['mod1'].obsm - -def test_move_obsm_to_obs_non_overlapping_obs_fails(run_component, write_temp_h5mu, - h5mu_with_non_overlapping_observations, tmp_path): - output = tmp_path/ "output.h5mu" + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.index, pd.Index(["obs1", "obs2"]) + ) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.columns, + pd.Index(["Obs", "sample_id", "obsm_key_obsm_col1", "obsm_key_obsm_col2"]), + ) + assert "obsm_key" not in output_data.mod["mod1"].obsm + + +def test_move_obsm_to_obs_non_overlapping_obs_fails( + run_component, write_temp_h5mu, h5mu_with_non_overlapping_observations, tmp_path +): + output = tmp_path / "output.h5mu" # Mudata seems to handle this error, but keep this test in just in case mudata drops the ball. with pytest.raises((CalledProcessError, ValueError)) as err: - run_component(["--input", write_temp_h5mu(h5mu_with_non_overlapping_observations), - "--modality", "mod1", - "--obsm_key", "obsm_key", - "--output", output - ]) + run_component( + [ + "--input", + write_temp_h5mu(h5mu_with_non_overlapping_observations), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) expected_message = r"value.index does not match parent’s obs names" if isinstance(err, CalledProcessError): - assert re.search(expected_message, err.value.stdout.decode('utf-8')) + assert re.search(expected_message, err.value.stdout.decode("utf-8")) else: assert re.search(expected_message, str(err)) - def test_error_non_existing_modality(run_component, h5mu, write_temp_h5mu, tmp_path): - output = tmp_path/ "output.h5mu" + output = tmp_path / "output.h5mu" with pytest.raises(CalledProcessError) as err: - run_component(["--input", write_temp_h5mu(h5mu), - "--modality", "foo", - "--obsm_key", "obsm_key", - "--output", output - ]) - assert re.search(r"ValueError: Modality foo does not exist\.", - err.value.stdout.decode('utf-8')) - + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "foo", + "--obsm_key", + "obsm_key", + "--output", + output, + ] + ) + assert re.search( + r"ValueError: Modality foo does not exist\.", err.value.stdout.decode("utf-8") + ) + + def test_execute_twice_overwrites(run_component, h5mu, write_temp_h5mu, tmp_path): - output_run_1 = tmp_path/ "output1.h5mu" - run_component(["--input", write_temp_h5mu(h5mu), - "--modality", "mod1", - "--obsm_key", "obsm_key", - "--output", output_run_1 - ]) + output_run_1 = tmp_path / "output1.h5mu" + run_component( + [ + "--input", + write_temp_h5mu(h5mu), + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output_run_1, + ] + ) output_data_run_1 = read_h5mu(output_run_1) - output_data_run_1.mod["mod1"].obsm = \ - {"obsm_key": pd.DataFrame([["dolor", "amet"], ["jommeke", "filiberke"]], - index=output_data_run_1.mod["mod1"].obs_names, - columns=["obsm_col1", "obsm_col2"])} - + output_data_run_1.mod["mod1"].obsm = { + "obsm_key": pd.DataFrame( + [["dolor", "amet"], ["jommeke", "filiberke"]], + 
index=output_data_run_1.mod["mod1"].obs_names, + columns=["obsm_col1", "obsm_col2"], + ) + } + output_run_2 = tmp_path / "output2.h5mu" input_run_2 = write_temp_h5mu(output_data_run_1) - run_component(["--input", input_run_2, - "--modality", "mod1", - "--obsm_key", "obsm_key", - "--output", output_run_2 - ]) + run_component( + [ + "--input", + input_run_2, + "--modality", + "mod1", + "--obsm_key", + "obsm_key", + "--output", + output_run_2, + ] + ) assert output_run_2.is_file(), "Some output file must have been created." output_data = read_h5mu(output_run_2) - pd.testing.assert_index_equal(output_data.mod['mod1'].obs.index, pd.Index(['obs1', 'obs2'])) - pd.testing.assert_index_equal(output_data.mod['mod1'].obs.columns, - pd.Index(['Obs', 'sample_id', 'obsm_key_obsm_col1', 'obsm_key_obsm_col2'])) - assert 'obsm_key' not in output_data.mod['mod1'].obsm + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.index, pd.Index(["obs1", "obs2"]) + ) + pd.testing.assert_index_equal( + output_data.mod["mod1"].obs.columns, + pd.Index(["Obs", "sample_id", "obsm_key_obsm_col1", "obsm_key_obsm_col2"]), + ) + assert "obsm_key" not in output_data.mod["mod1"].obsm + -if __name__ == '__main__': - exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + exit(pytest.main([__file__])) diff --git a/src/neighbors/bbknn/script.py b/src/neighbors/bbknn/script.py index fd308d2ea64..2f09cb26b32 100644 --- a/src/neighbors/bbknn/script.py +++ b/src/neighbors/bbknn/script.py @@ -3,18 +3,18 @@ ### VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', - 'modality': 'rna', - 'obs_batch': 'sample_id', - 'obsm_input': 'X_pca', - 'n_neighbors_within_batch': 3, - 'n_trim': None, - 'n_pcs': 50, - 'output': 'output.h5mu', - 'output_compression': 'gzip', - 'obsp_connectivities': 'my_connectivities', - 'obsp_distances': 'my_distances', - 'uns_output': 'my_neighbors' + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", + "modality": "rna", + "obs_batch": "sample_id", + "obsm_input": "X_pca", + "n_neighbors_within_batch": 3, + "n_trim": None, + "n_pcs": 50, + "output": "output.h5mu", + "output_compression": "gzip", + "obsp_connectivities": "my_connectivities", + "obsp_distances": "my_distances", + "uns_output": "my_neighbors", } ### VIASH END @@ -26,10 +26,10 @@ bbknn.bbknn( tmp_adata, use_rep=par["obsm_input"], - batch_key = par["obs_batch"], + batch_key=par["obs_batch"], neighbors_within_batch=par["n_neighbors_within_batch"], n_pcs=par["n_pcs"], - trim=par["n_trim"] + trim=par["n_trim"], ) # store output @@ -40,4 +40,4 @@ adata.uns[par["uns_output"]]["connectivities_key"] = par["obsp_connectivities"] # write to file -mudata.write_h5mu(par["output"], compression=par["output_compression"]) \ No newline at end of file +mudata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/neighbors/bbknn/test.py b/src/neighbors/bbknn/test.py index 9b0943817dd..1788368fab1 100644 --- a/src/neighbors/bbknn/test.py +++ b/src/neighbors/bbknn/test.py @@ -4,28 +4,29 @@ ## VIASH START meta = { - 'executable': './target/executable/neighbors/bbknn/bbknn', - 'resources_dir': './resources_test/pbmc_1k_protein_v3/' + "executable": "./target/executable/neighbors/bbknn/bbknn", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture def sample_mudata(tmp_path): tmp_input_path = tmp_path / "input.h5mu" # create input data 
mudata = read_h5mu(input_file) - rna_adata = mudata.mod['rna'] + rna_adata = mudata.mod["rna"] # remove previous output (if any) - if 'connectivities' in rna_adata.obsp: - del rna_adata.obsp['connectivities'] - if 'distances' in rna_adata.obsp: - del rna_adata.obsp['distances'] - if 'neighbors' in rna_adata.uns: - del rna_adata.uns['neighbors'] + if "connectivities" in rna_adata.obsp: + del rna_adata.obsp["connectivities"] + if "distances" in rna_adata.obsp: + del rna_adata.obsp["distances"] + if "neighbors" in rna_adata.uns: + del rna_adata.uns["neighbors"] # write to file mudata.write(tmp_input_path) @@ -38,39 +39,58 @@ def test_simple_integration(run_component, tmp_path, sample_mudata): output_path = tmp_path / "output.h5mu" print(mudata, flush=True) # run component - run_component([ - "--input", str(tmp_input_path), - "--output", str(output_path), - "--obs_batch", "harmony_integration_leiden_1.0", - "--obsm_input", "X_pca", - "--output_compression", "gzip" - ]) + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obs_batch", + "harmony_integration_leiden_1.0", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + ] + ) assert output_path.exists() - data = read_h5mu(output_path).mod['rna'] + data = read_h5mu(output_path).mod["rna"] assert "connectivities" in data.obsp assert "distances" in data.obsp assert "neighbors" in data.uns + def test_alternative_names(run_component, tmp_path, sample_mudata): tmp_input_path, mudata = sample_mudata output_path = tmp_path / "output.h5mu" # run component - run_component([ - "--input", str(tmp_input_path), - "--output", str(output_path), - "--obs_batch", "harmony_integration_leiden_1.0", - "--obsm_input", "X_pca", - "--output_compression", "gzip", - "--uns_output", "my_neighbors", - "--obsp_connectivities", "my_connectivities", - "--obsp_distances", "my_distances" - ]) + run_component( + [ + "--input", + str(tmp_input_path), + "--output", + str(output_path), + "--obs_batch", + "harmony_integration_leiden_1.0", + "--obsm_input", + "X_pca", + "--output_compression", + "gzip", + "--uns_output", + "my_neighbors", + "--obsp_connectivities", + "my_connectivities", + "--obsp_distances", + "my_distances", + ] + ) assert output_path.exists() - data = read_h5mu(output_path).mod['rna'] + data = read_h5mu(output_path).mod["rna"] assert "my_connectivities" in data.obsp assert "my_distances" in data.obsp assert "my_neighbors" in data.uns + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/neighbors/find_neighbors/script.py b/src/neighbors/find_neighbors/script.py index 059256de88b..c01a82cc3c1 100644 --- a/src/neighbors/find_neighbors/script.py +++ b/src/neighbors/find_neighbors/script.py @@ -1,8 +1,9 @@ import sys import numpy as np -numpy_module = sys.modules['numpy'] + +numpy_module = sys.modules["numpy"] numpy_module.float_ = np.float64 -sys.modules['numpy'] = numpy_module +sys.modules["numpy"] = numpy_module import mudata as mu import scanpy as sc @@ -12,22 +13,21 @@ par = { "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", "output": "output.h5mu", - "metric": 'cosine', + "metric": "cosine", "num_neighbors": 15, "modality": "rna", "obsm_input": "X_pca", "uns_output": "neighbors", "obsp_distances": "distances", "obsp_connectivities": "connectivities", - "seed": None -} -meta = { - 'resources_dir': "." 
+ "seed": None, } +meta = {"resources_dir": "."} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading input mudata") @@ -46,15 +46,15 @@ ) adata.uns[par["uns_output"]] = { - 'connectivities_key': par["obsp_connectivities"], - 'distances_key': par["obsp_distances"], - 'params': { - 'n_neighbors': neighbors.n_neighbors, - 'method': "umap", - 'random_state': par["seed"], - 'metric': par["metric"], - 'use_rep': par["obsm_input"] - } + "connectivities_key": par["obsp_connectivities"], + "distances_key": par["obsp_distances"], + "params": { + "n_neighbors": neighbors.n_neighbors, + "method": "umap", + "random_state": par["seed"], + "metric": par["metric"], + "use_rep": par["obsm_input"], + }, } adata.obsp[par["obsp_distances"]] = neighbors.distances diff --git a/src/neighbors/find_neighbors/test.py b/src/neighbors/find_neighbors/test.py index 98101cf4825..2b43fb7be12 100644 --- a/src/neighbors/find_neighbors/test.py +++ b/src/neighbors/find_neighbors/test.py @@ -4,27 +4,35 @@ ## VIASH START meta = { - 'executable': './target/docker/graph/neighbors/find_neighbors', - 'name': 'find_neighbors', - 'resources_dir': 'resources_test/' + "executable": "./target/docker/graph/neighbors/find_neighbors", + "name": "find_neighbors", + "resources_dir": "resources_test/", } ## VIASH END input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" output = "output.h5mu" -def test_run(run_component, tmp_path): + +def test_run(run_component, tmp_path): output = tmp_path / "output.h5mu" cmd_pars = [ meta["executable"], - "--input", input, - "--output", str(output), - "--obsm_input", "X_pca", - "--uns_output", "foo_neigh", - "--obsp_distances", "bar_dist", - "--obsp_connectivities", "baz_conn", - "--output_compression", "gzip" + "--input", + input, + "--output", + str(output), + "--obsm_input", + "X_pca", + "--uns_output", + "foo_neigh", + "--obsp_distances", + "bar_dist", + "--obsp_connectivities", + "baz_conn", + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -51,5 +59,6 @@ def test_run(run_component, tmp_path): assert "baz_conn" not in rna_in.obsp, "Input should not have .obsp['baz_conn']" assert "bar_dist" not in rna_in.obsp, "Input should not have .obsp['bar_dist']" -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/qc/calculate_qc_metrics/script.py b/src/qc/calculate_qc_metrics/script.py index aedfc8d352b..023f89554f2 100644 --- a/src/qc/calculate_qc_metrics/script.py +++ b/src/qc/calculate_qc_metrics/script.py @@ -18,15 +18,15 @@ "output_obs_total_counts_vars": "total_counts", "output_obs_num_nonzero_vars": "num_nonzero_vars", } -meta = { - "resources_dir": "." 
-} +meta = {"resources_dir": "."} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def count_nonzero(layer, axis): """ This method is the functional equivalent of the old .getnnz function from scirpy, @@ -35,15 +35,18 @@ def count_nonzero(layer, axis): """ axis ^= 1 nonzero_counts = dict(zip(*np.unique(layer.nonzero()[axis], return_counts=True))) - nonzero_per_axis_item = {row_index: nonzero_counts.get(row_index, 0) - for row_index in range(layer.shape[axis])} + nonzero_per_axis_item = { + row_index: nonzero_counts.get(row_index, 0) + for row_index in range(layer.shape[axis]) + } return np.array(list(nonzero_per_axis_item.values()), dtype="int64") + def main(): input_data = read_h5mu(par["input"]) modality_data = input_data.mod[par["modality"]] var = modality_data.var - layer = modality_data.X if not par['layer'] else modality_data.layers[par['layer']] + layer = modality_data.X if not par["layer"] else modality_data.layers[par["layer"]] if not issparse(layer): raise NotImplementedError("Expected layer to be in sparse format.") layer = csr_array(layer) @@ -52,26 +55,30 @@ def main(): var_columns_to_add = {} # var statistics - if par['output_var_obs_mean']: + if par["output_var_obs_mean"]: obs_mean = layer.mean(axis=0) - var_columns_to_add[par['output_var_obs_mean']] = obs_mean - if par['output_var_total_counts_obs']: - # from the np.sum documentation: + var_columns_to_add[par["output_var_obs_mean"]] = obs_mean + if par["output_var_total_counts_obs"]: + # from the np.sum documentation: # Especially when summing a large number of lower precision floating point numbers, # such as float32, numerical errors can become significant. In such cases it can # be advisable to use dtype="float64" to use a higher precision for the output. 
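+ # For instance, float32 has a 24-bit significand, so
+ # np.float32(2**24) + np.float32(1) == np.float32(2**24): summing many
+ # float32 counts can silently drop increments, which float64 avoids.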
layer_with_type = layer - if np.issubdtype(layer.dtype, np.floating) and np.can_cast(layer.dtype, np.float64, casting="safe"): + if np.issubdtype(layer.dtype, np.floating) and np.can_cast( + layer.dtype, np.float64, casting="safe" + ): # 'safe' casting makes sure not to cast np.float128 or anything else to a lower precision dtype layer_with_type = layer.astype(np.float64) total_counts_obs = np.ravel(layer_with_type.sum(axis=0)) - var_columns_to_add[par['output_var_total_counts_obs']] = total_counts_obs + var_columns_to_add[par["output_var_total_counts_obs"]] = total_counts_obs num_nonzero_obs = count_nonzero(layer, axis=0) - if par['output_var_num_nonzero_obs']: - var_columns_to_add[par['output_var_num_nonzero_obs']] = num_nonzero_obs - if par['output_var_pct_dropout']: - var_columns_to_add[par['output_var_pct_dropout']] = (1 - num_nonzero_obs / layer.shape[0]) * 100 + if par["output_var_num_nonzero_obs"]: + var_columns_to_add[par["output_var_num_nonzero_obs"]] = num_nonzero_obs + if par["output_var_pct_dropout"]: + var_columns_to_add[par["output_var_pct_dropout"]] = ( + 1 - num_nonzero_obs / layer.shape[0] + ) * 100 modality_data.var = modality_data.var.assign(**var_columns_to_add) @@ -79,77 +86,95 @@ def main(): obs_columns_to_add = {} total_counts_var = np.ravel(layer.sum(axis=1)) - if par['output_obs_num_nonzero_vars']: - num_nonzero_vars = count_nonzero(layer, axis=1) - obs_columns_to_add[par['output_obs_num_nonzero_vars']] = num_nonzero_vars + if par["output_obs_num_nonzero_vars"]: + num_nonzero_vars = count_nonzero(layer, axis=1) + obs_columns_to_add[par["output_obs_num_nonzero_vars"]] = num_nonzero_vars - if par['output_obs_total_counts_vars']: - obs_columns_to_add[par['output_obs_total_counts_vars']] = total_counts_var + if par["output_obs_total_counts_vars"]: + obs_columns_to_add[par["output_obs_total_counts_vars"]] = total_counts_var top_metrics = {} if par["top_n_vars"]: par["top_n_vars"] = sorted(par["top_n_vars"]) distributions = get_top_from_csr_matrix(layer, par["top_n_vars"]) - top_metrics = {distribution_size: distribution * 100 - for distribution_size, distribution - in zip(par["top_n_vars"], distributions.T)} - obs_columns_to_add |= {f"pct_of_counts_in_top_{n_top}_vars": col for - n_top, col in top_metrics.items()} - + top_metrics = { + distribution_size: distribution * 100 + for distribution_size, distribution in zip( + par["top_n_vars"], distributions.T + ) + } + obs_columns_to_add |= { + f"pct_of_counts_in_top_{n_top}_vars": col + for n_top, col in top_metrics.items() + } + if par["var_qc_metrics"]: print(f"qc_metrics: {par['var_qc_metrics']}") for qc_metric in par["var_qc_metrics"]: - if not qc_metric in var: - raise ValueError(f"Value for --var_qc_metrics, '{qc_metric}' " - f"not found in .var for modality {par['modality']}") + if qc_metric not in var: + raise ValueError( + f"Value for --var_qc_metrics, '{qc_metric}' " + f"not found in .var for modality {par['modality']}" + ) qc_column = var[qc_metric] if qc_column.isna().any(): if par["var_qc_metrics_fill_na_value"] is None: - raise ValueError(f"The .var column '{qc_metric}', selected by '--var_qc_metrics', contains NA values. " - "It is ambiguous whether or not to include these values in the static calulation. " - "You can explicitly map the NA values to 'False' or 'True using '--var_qc_metrics_fill_na_value'") + raise ValueError( + f"The .var column '{qc_metric}', selected by '--var_qc_metrics', contains NA values. " + "It is ambiguous whether or not to include these values in the statistic calculation. 
" + "You can explicitly map the NA values to 'False' or 'True' using '--var_qc_metrics_fill_na_value'" + ) else: - qc_column = qc_column.fillna(par['var_qc_metrics_fill_na_value'], inplace=False) + qc_column = qc_column.fillna( + par["var_qc_metrics_fill_na_value"], inplace=False + ) qc_column = qc_column.to_list() if set(np.unique(qc_column)) - {True, False}: - raise ValueError(f"Column {qc_metric} in .var for modality {par['modality']} " - f"must only contain boolean values") + raise ValueError( + f"Column {qc_metric} in .var for modality {par['modality']} " + f"must only contain boolean values" + ) total_counts_qc_metric = np.ravel(layer[:, qc_column].sum(axis=1)) obs_columns_to_add |= { f"total_counts_{qc_metric}": total_counts_qc_metric, - f"pct_{qc_metric}": total_counts_qc_metric / total_counts_var * 100 + f"pct_{qc_metric}": total_counts_qc_metric / total_counts_var * 100, } modality_data.obs = modality_data.obs.assign(**obs_columns_to_add) input_data.write(par["output"], compression=par["output_compression"]) - + + def get_top_from_csr_matrix(array, top_n_genes): # csr matrices store a 2D matrix in a format such that data for individual cells # are stored in 1 array. Another array (indptr) here stores the ranges of indices # to select from the data-array (e.g. data[indptr[0]:indptr[1]] for row 0) for each row. - # Another array 'indices' maps each element of data to a column + # (data and indices arrays have the same length) top_n_genes = np.array(top_n_genes).astype(np.int64) assert np.all(top_n_genes[:-1] <= top_n_genes[1:]), "top_n_genes must be sorted" row_indices, data = array.indptr, array.data - number_of_rows, max_genes_to_parse = row_indices.size-1, top_n_genes[-1] - top_data = np.zeros((number_of_rows, max_genes_to_parse), - dtype=data.dtype) + number_of_rows, max_genes_to_parse = row_indices.size - 1, top_n_genes[-1] + top_data = np.zeros((number_of_rows, max_genes_to_parse), dtype=data.dtype) # Loop over each row to create a dense matrix without the 0 counts, # but not for the whole matrix, only store the genes up until # the largest number of top n genes. 
for row_number in range(number_of_rows): - row_start_index, row_end_index = row_indices[row_number], row_indices[row_number+1] - row_data = data[row_start_index:row_end_index] # all non-zero counts for an row + row_start_index, row_end_index = ( + row_indices[row_number], + row_indices[row_number + 1], + ) + row_data = data[row_start_index:row_end_index] # all non-zero counts for a row try: # There are fewer genes with counts in the row than the # maximum number of genes we would like to select # all these genes are in the top genes, just store them - top_data[row_number, :row_end_index-row_start_index] = row_data + top_data[row_number, : row_end_index - row_start_index] = row_data except ValueError: # Store the counts for the top genes - top_data[row_number, :] = np.partition(row_data, -max_genes_to_parse)[-max_genes_to_parse:] + top_data[row_number, :] = np.partition(row_data, -max_genes_to_parse)[ + -max_genes_to_parse: + ] # Partition works from smallest to largest, but we want largest # so do smallest to largest first (but with reversed indices) @@ -157,8 +182,9 @@ def get_top_from_csr_matrix(array, top_n_genes): # And then switch the order around top_data = np.flip(top_data, axis=1) - cumulative = top_data.cumsum(axis=1, dtype=np.float64)[:,top_n_genes-1] + cumulative = top_data.cumsum(axis=1, dtype=np.float64)[:, top_n_genes - 1] return cumulative / np.expand_dims(array.sum(axis=1), 1) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/qc/calculate_qc_metrics/test.py b/src/qc/calculate_qc_metrics/test.py index 1b9f638e719..5faedfab34c 100644 --- a/src/qc/calculate_qc_metrics/test.py +++ b/src/qc/calculate_qc_metrics/test.py @@ -14,10 +14,10 @@ ## VIASH START meta = { - 'executable': './target/executable/qc/calculate_qc_metrics/calculate_qc_metrics', - 'resources_dir': "./resources_test/pbmc_1k_protein_v3/", - 'config': './src/qc/calculate_qc_metrics/config.vsh.yaml', - 'cpus': 2 + "executable": "./target/executable/qc/calculate_qc_metrics/calculate_qc_metrics", + "resources_dir": "./resources_test/pbmc_1k_protein_v3/", + "config": "./src/qc/calculate_qc_metrics/config.vsh.yaml", + "cpus": 2, } ## VIASH END @@ -26,19 +26,20 @@ def input_path(): return f"{meta['resources_dir']}/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + @pytest.fixture def input_mudata_random(): rng = np.random.default_rng(seed=1) - random_counts = scipy.sparse.random(50000, 100, - density=0.8, - format='csr', - dtype=np.uint32, - random_state=rng) - good_dtype=random_counts.astype(np.float32) - del random_counts - mod1 = ad.AnnData(X=good_dtype, - obs=pd.DataFrame(index=pd.RangeIndex(50000)), - var=pd.DataFrame(index=pd.RangeIndex(100))) + random_counts = scipy.sparse.random( + 50000, 100, density=0.8, format="csr", dtype=np.uint32, random_state=rng + ) + good_dtype = random_counts.astype(np.float32) + del random_counts + mod1 = ad.AnnData( + X=good_dtype, + obs=pd.DataFrame(index=pd.RangeIndex(50000)), + var=pd.DataFrame(index=pd.RangeIndex(100)), + ) return md.MuData({"mod1": mod1}) @@ -47,44 +48,56 @@ def input_mudata(input_path): mudata = md.read_h5mu(input_path) # create a less sparse matrix to increase the variability in qc statistics rng = np.random.default_rng() - random_counts = scipy.sparse.random(*mudata['rna'].X.shape, - density=0.8, - format='csr', - random_state=rng) - mudata['rna'].X = random_counts + random_counts = scipy.sparse.random( + *mudata["rna"].X.shape, density=0.8, format="csr", random_state=rng + ) + mudata["rna"].X = random_counts return 
mudata + @pytest.fixture def input_mudata_path(tmp_path, input_mudata): output_path = tmp_path / f"{str(uuid.uuid4())}.h5mu" input_mudata.write(output_path) return output_path -@pytest.fixture(params=product([True, False, np.nan, "random"], - ["bool", pd.BooleanDtype()])) + +@pytest.fixture( + params=product([True, False, np.nan, "random"], ["bool", pd.BooleanDtype()]) +) def mudata_with_boolean_column(tmp_path, input_mudata, request): requested_value, requested_type = request.param - input_var = input_mudata.mod['rna'].var - input_var["custom"] = requested_value + input_var = input_mudata.mod["rna"].var + input_var["custom"] = requested_value if requested_value == "random": - input_var["custom"] = np.random.choice([True, False], len(input_var), p=[0.20, 0.80]) - input_var["custom"] = input_var["custom"].astype(requested_type) + input_var["custom"] = np.random.choice( + [True, False], len(input_var), p=[0.20, 0.80] + ) + input_var["custom"] = input_var["custom"].astype(requested_type) new_input_path = tmp_path / "input_with_custom_col.h5mu" input_mudata.write(new_input_path) return new_input_path + def test_add_qc(run_component, input_path): - run_component([ - "--input", input_path, - "--output", "foo.h5mu", - "--modality", "rna", - "--top_n_vars", "10;20;90", - "--output_compression", "gzip" - ]) - + run_component( + [ + "--input", + input_path, + "--output", + "foo.h5mu", + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--output_compression", + "gzip", + ] + ) + assert Path("foo.h5mu").is_file() data_with_qc = md.read("foo.h5mu") - var, obs = data_with_qc.mod['rna'].var, data_with_qc.mod['rna'].obs + var, obs = data_with_qc.mod["rna"].var, data_with_qc.mod["rna"].obs for top_n_vars in ("10", "20", "90"): assert f"pct_of_counts_in_top_{top_n_vars}_vars" in obs assert "total_counts" in obs @@ -95,52 +108,74 @@ def test_add_qc(run_component, input_path): assert "total_counts" in var -@pytest.mark.parametrize("optional_parameter,annotation_matrix,arg_value,expected_name", - [("--output_obs_num_nonzero_vars", "obs", "lorem", "lorem"), - ("--output_obs_total_counts_vars", "obs", "ipsum", "ipsum"), - ("--output_var_num_nonzero_obs", "var", "dolor", "dolor"), - ("--output_var_total_counts_obs", "var", "amet", "amet"), - ("--output_var_obs_mean", "var", "sit", "sit"), - ("--output_var_pct_dropout", "var", "elit", "elit")]) -def test_qc_metrics_set_output_column(run_component, - input_mudata_path, - optional_parameter, - annotation_matrix, - arg_value, - expected_name): +@pytest.mark.parametrize( + "optional_parameter,annotation_matrix,arg_value,expected_name", + [ + ("--output_obs_num_nonzero_vars", "obs", "lorem", "lorem"), + ("--output_obs_total_counts_vars", "obs", "ipsum", "ipsum"), + ("--output_var_num_nonzero_obs", "var", "dolor", "dolor"), + ("--output_var_total_counts_obs", "var", "amet", "amet"), + ("--output_var_obs_mean", "var", "sit", "sit"), + ("--output_var_pct_dropout", "var", "elit", "elit"), + ], +) +def test_qc_metrics_set_output_column( + run_component, + input_mudata_path, + optional_parameter, + annotation_matrix, + arg_value, + expected_name, +): args = [ - "--input", input_mudata_path, - "--output", "foo.h5mu", - "--modality", "rna", - "--output_compression", "gzip", - optional_parameter, arg_value + "--input", + input_mudata_path, + "--output", + "foo.h5mu", + "--modality", + "rna", + "--output_compression", + "gzip", + optional_parameter, + arg_value, ] run_component(args) assert Path("foo.h5mu").is_file() data_with_qc = md.read("foo.h5mu") - matrix = 
getattr(data_with_qc.mod['rna'], annotation_matrix) + matrix = getattr(data_with_qc.mod["rna"], annotation_matrix) assert not matrix.filter(regex=expected_name, axis=1).empty -@pytest.mark.parametrize("optional_parameter,annotation_matrix,expected_missing,", - [("--var_qc_metrics", "obs", "total_counts_.*|pct_*"), - ("--top_n_vars", "obs", "pct_of_counts_in_top_.*"), - ("--output_obs_num_nonzero_vars", "obs", "num_nonzero_vars"), - ("--output_obs_total_counts_vars", "obs", "total_counts"), - ("--output_var_num_nonzero_obs", "var", "num_nonzero_obs"), - ("--output_var_total_counts_obs", "var", "total_counts"), - ("--output_var_obs_mean", "var", "obs_mean"), - ("--output_var_pct_dropout", "var", "pct_dropout")]) -def test_qc_metrics_optional(run_component, - input_mudata_path, - optional_parameter, - annotation_matrix, - expected_missing): + +@pytest.mark.parametrize( + "optional_parameter,annotation_matrix,expected_missing,", + [ + ("--var_qc_metrics", "obs", "total_counts_.*|pct_*"), + ("--top_n_vars", "obs", "pct_of_counts_in_top_.*"), + ("--output_obs_num_nonzero_vars", "obs", "num_nonzero_vars"), + ("--output_obs_total_counts_vars", "obs", "total_counts"), + ("--output_var_num_nonzero_obs", "var", "num_nonzero_obs"), + ("--output_var_total_counts_obs", "var", "total_counts"), + ("--output_var_obs_mean", "var", "obs_mean"), + ("--output_var_pct_dropout", "var", "pct_dropout"), + ], +) +def test_qc_metrics_optional( + run_component, + input_mudata_path, + optional_parameter, + annotation_matrix, + expected_missing, +): args = [ - "--input", input_mudata_path, - "--output", "foo.h5mu", - "--modality", "rna", - "--output_compression", "gzip" + "--input", + input_mudata_path, + "--output", + "foo.h5mu", + "--modality", + "rna", + "--output_compression", + "gzip", ] if optional_parameter not in ["--var_qc_metrics", "--top_n_vars"]: args.extend([optional_parameter, ""]) @@ -148,127 +183,160 @@ def test_qc_metrics_optional(run_component, run_component(args) assert Path("foo.h5mu").is_file() data_with_qc = md.read("foo.h5mu") - matrix = getattr(data_with_qc.mod['rna'], annotation_matrix) + matrix = getattr(data_with_qc.mod["rna"], annotation_matrix) assert matrix.filter(regex=expected_missing, axis=1).empty -def test_calculcate_qc_var_qc_metrics(run_component, mudata_with_boolean_column, tmp_path): + +def test_calculcate_qc_var_qc_metrics( + run_component, mudata_with_boolean_column, tmp_path +): output_path = tmp_path / "foo.h5mu" input_data = md.read_h5mu(mudata_with_boolean_column) args = [ - "--input", str(mudata_with_boolean_column), - "--output", str(output_path), - "--modality", "rna", - "--top_n_vars", "10;20;90", - "--var_qc_metrics", "custom", + "--input", + str(mudata_with_boolean_column), + "--output", + str(output_path), + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--var_qc_metrics", + "custom", ] - if input_data.mod['rna'].var["custom"].isna().any(): + if input_data.mod["rna"].var["custom"].isna().any(): args.extend(["--var_qc_metrics_fill_na_value", "True"]) run_component(args) assert output_path.is_file() - data_with_qc = md.read(output_path)['rna'] - for qc_metric in ('pct_custom', 'total_counts_custom'): + data_with_qc = md.read(output_path)["rna"] + for qc_metric in ("pct_custom", "total_counts_custom"): assert qc_metric in data_with_qc.obs # Do a percentage calculation based on indexes # and compare it to the calculations from the component - custom_column_true = data_with_qc.var['custom'].fillna(True) + custom_column_true = 
data_with_qc.var["custom"].fillna(True) gene_counts = sc.get.obs_df(data_with_qc, keys=data_with_qc.var_names.to_list()) - gene_counts_custom = gene_counts.loc[:,custom_column_true] + gene_counts_custom = gene_counts.loc[:, custom_column_true] sum_custom_column = gene_counts_custom.sum(axis=1) sum_all = gene_counts.sum(axis=1) percentage = (sum_custom_column / sum_all) * 100 - pd.testing.assert_series_equal(percentage, data_with_qc.obs['pct_custom'], - check_exact=False, # Comparing floats - check_names=False) - assert (data_with_qc.obs['pct_custom'] <= 100).all() - -def test_compare_scanpy(run_component, - mudata_with_boolean_column, - input_mudata, - tmp_path): - + pd.testing.assert_series_equal( + percentage, + data_with_qc.obs["pct_custom"], + check_exact=False, # Comparing floats + check_names=False, + ) + assert (data_with_qc.obs["pct_custom"] <= 100).all() + + +def test_compare_scanpy( + run_component, mudata_with_boolean_column, input_mudata, tmp_path +): output_path = tmp_path / "foo.h5mu" - run_component([ - "--input", str(mudata_with_boolean_column), - "--output", str(output_path), - "--modality", "rna", - "--top_n_vars", "10;20;90", - "--var_qc_metrics", "custom", - "--var_qc_metrics_fill_na_value", "False" - ]) + run_component( + [ + "--input", + str(mudata_with_boolean_column), + "--output", + str(output_path), + "--modality", + "rna", + "--top_n_vars", + "10;20;90", + "--var_qc_metrics", + "custom", + "--var_qc_metrics_fill_na_value", + "False", + ] + ) assert output_path.is_file() component_data = md.read(output_path) - rna_mod = component_data.mod['rna'] + rna_mod = component_data.mod["rna"] # Replicate --var_qc_metrics_fill_na_value False # Scanpy also does not work with pd.BooleanDtype() # So cast from 'boolean' to 'bool' - input_mudata.mod['rna'].var['custom'] = input_mudata.mod['rna'].var['custom'].fillna(False).astype("bool") + input_mudata.mod["rna"].var["custom"] = ( + input_mudata.mod["rna"].var["custom"].fillna(False).astype("bool") + ) sc.pp.calculate_qc_metrics( - input_mudata.mod['rna'], + input_mudata.mod["rna"], expr_type="counts", var_type="genes", qc_vars=["custom"], - percent_top=[10,20,90], + percent_top=[10, 20, 90], use_raw=False, inplace=True, - log1p=False + log1p=False, ) - scanpy_var = input_mudata.mod['rna'].var + scanpy_var = input_mudata.mod["rna"].var component_var = rna_mod.var vars_to_compare = { - 'pct_dropout': 'pct_dropout_by_counts', - 'num_nonzero_obs': 'n_cells_by_counts', - 'obs_mean': 'mean_counts', - 'total_counts': 'total_counts' + "pct_dropout": "pct_dropout_by_counts", + "num_nonzero_obs": "n_cells_by_counts", + "obs_mean": "mean_counts", + "total_counts": "total_counts", } for from_var, to_var in vars_to_compare.items(): - assert_series_equal(component_var[from_var], - scanpy_var[to_var], - check_names=False, - check_dtype=False) - + assert_series_equal( + component_var[from_var], + scanpy_var[to_var], + check_names=False, + check_dtype=False, + ) - scanpy_obs = input_mudata.mod['rna'].obs + scanpy_obs = input_mudata.mod["rna"].obs component_obs = rna_mod.obs obs_to_compare = { - 'num_nonzero_vars': 'n_genes_by_counts', - 'pct_custom': 'pct_counts_custom', - 'total_counts_custom': 'total_counts_custom', - 'total_counts': 'total_counts' + "num_nonzero_vars": "n_genes_by_counts", + "pct_custom": "pct_counts_custom", + "total_counts_custom": "total_counts_custom", + "total_counts": "total_counts", + } + obs_to_compare |= { + f"pct_of_counts_in_top_{i}_vars": f"pct_counts_in_top_{i}_genes" + for i in (10, 20, 90) } - obs_to_compare |= 
{f'pct_of_counts_in_top_{i}_vars': f'pct_counts_in_top_{i}_genes' - for i in (10, 20, 90)} for from_obs, to_obs in obs_to_compare.items(): - assert_series_equal(component_obs[from_obs], - scanpy_obs[to_obs], - check_names=False, - check_dtype=False) + assert_series_equal( + component_obs[from_obs], + scanpy_obs[to_obs], + check_names=False, + check_dtype=False, + ) -def test_total_counts_less_precision_dtype(run_component, input_mudata_random, random_h5mu_path): +def test_total_counts_less_precision_dtype( + run_component, input_mudata_random, random_h5mu_path +): input_path = random_h5mu_path() - input_mudata_random.write(input_path) + input_mudata_random.write(input_path) output_path = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_path, - "--modality", "mod1", - ]) + run_component( + [ + "--input", + input_path, + "--output", + output_path, + "--modality", + "mod1", + ] + ) output_data = md.read_h5mu(output_path) - matrix_good_type = input_mudata_random.mod['mod1'].X + matrix_good_type = input_mudata_random.mod["mod1"].X var_names = input_mudata_random.var_names obs_names = input_mudata_random.obs_names del input_mudata_random - input_df = pd.DataFrame(matrix_good_type.todense(), - columns=var_names, - index=obs_names) + input_df = pd.DataFrame( + matrix_good_type.todense(), columns=var_names, index=obs_names + ) total_sums_manual = input_df.to_numpy().sum(axis=0, dtype=np.float128) - total_counts = output_data.mod['mod1'].var['total_counts'] + total_counts = output_data.mod["mod1"].var["total_counts"] np.testing.assert_allclose(total_sums_manual, total_counts.to_numpy()) - + + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/qc/multiqc/script.py b/src/qc/multiqc/script.py index 5cd4cd34b3b..ec33db168d6 100644 --- a/src/qc/multiqc/script.py +++ b/src/qc/multiqc/script.py @@ -2,10 +2,13 @@ ## VIASH START par = { - "input": ["resources_test/10x_5k_anticmv/fastqc/", "resources_test/10x_5k_anticmv/fastqc/"], - "output": "output" + "input": [ + "resources_test/10x_5k_anticmv/fastqc/", + "resources_test/10x_5k_anticmv/fastqc/", + ], + "output": "output", } ## VIASH END # Run MultiQC -subprocess.run(["multiqc", "-o", par["output"]] + par["input"]) \ No newline at end of file +subprocess.run(["multiqc", "-o", par["output"]] + par["input"]) diff --git a/src/qc/multiqc/test.py b/src/qc/multiqc/test.py index 560d8d6a72e..b0579666333 100644 --- a/src/qc/multiqc/test.py +++ b/src/qc/multiqc/test.py @@ -7,17 +7,16 @@ input_fastqc = meta["resources_dir"] + "/fastqc/" + def test_multiqc(run_component, tmp_path): output_path = tmp_path / "output" - run_component([ - "--input", input_fastqc, - "--output", str(output_path) - ]) + run_component(["--input", input_fastqc, "--output", str(output_path)]) assert output_path.exists() assert (output_path / "multiqc_report.html").is_file() + def test_multiple_inputs(run_component, tmp_path): output_path = tmp_path / "output" dir1 = tmp_path / "dir1" @@ -27,14 +26,13 @@ def test_multiple_inputs(run_component, tmp_path): shutil.copytree(input_fastqc, dir1) shutil.copytree(input_fastqc, dir2) - run_component([ - "--input", str(dir1), - "--input", str(dir2), - "--output", str(output_path) - ]) + run_component( + ["--input", str(dir1), "--input", str(dir2), "--output", str(output_path)] + ) assert output_path.exists() assert (output_path / "multiqc_report.html").is_file() + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff 
--git a/src/query/cellxgene_census/script.py b/src/query/cellxgene_census/script.py index 7cea7db325a..fc9dbf63218 100644 --- a/src/query/cellxgene_census/script.py +++ b/src/query/cellxgene_census/script.py @@ -22,6 +22,7 @@ from setup_logger import setup_logger + logger = setup_logger() @@ -30,46 +31,69 @@ def connect_census(uri, census_version): Connect to CellxGene Census or user-provided TileDBSoma object """ ver = census_version or "stable" - logger.info("Connecting to CellxGene Census at %s", f"'{uri}'" if uri else f"version '{ver}'") + logger.info( + "Connecting to CellxGene Census at %s", + f"'{uri}'" if uri else f"version '{ver}'", + ) return cellxgene_census.open_soma(uri=uri, census_version=ver) + def get_anndata(census_connection, par): - logger.info("Getting gene expression data based on `%s` query.", par["obs_value_filter"]) + logger.info( + "Getting gene expression data based on `%s` query.", par["obs_value_filter"] + ) return cellxgene_census.get_anndata( census=census_connection, obs_value_filter=par["obs_value_filter"], - organism=par["species"] + organism=par["species"], ) + def add_cellcensus_metadata_obs(census_connection, adata): logger.info("Adding additional metadata to gene expression data.") - census_datasets = census_connection["census_info"]["datasets"].read().concat().to_pandas() + census_datasets = ( + census_connection["census_info"]["datasets"].read().concat().to_pandas() + ) adata.obs.dataset_id = adata.obs.dataset_id.astype("category") - dataset_info = census_datasets[census_datasets.dataset_id.isin(adata.obs.dataset_id.cat.categories)]\ - [['collection_id', 'collection_name', 'collection_doi', 'dataset_id', 'dataset_title']]\ - .reset_index(drop=True)\ - .astype('category') - - adata.obs = adata.obs.merge( - dataset_info, on='dataset_id', how='left' + dataset_info = ( + census_datasets[ + census_datasets.dataset_id.isin(adata.obs.dataset_id.cat.categories) + ][ + [ + "collection_id", + "collection_name", + "collection_doi", + "dataset_id", + "dataset_title", + ] + ] + .reset_index(drop=True) + .astype("category") ) + adata.obs = adata.obs.merge(dataset_info, on="dataset_id", how="left") + + def filter_min_cells_per_group(adata, par): n_cells_before, _ = adata.shape - cell_count = adata.obs \ - .groupby(par["cell_filter_grouping"])["soma_joinid"] \ - .transform("count") \ - + cell_count = adata.obs.groupby(par["cell_filter_grouping"])[ + "soma_joinid" + ].transform("count") adata = adata[cell_count >= par["cell_filter_minimum_count"]] n_cells_after, _ = adata.shape logger.info( "Removed %s cells based on %s cell_filter_minimum_count of %s cell_filter_grouping." 
- % ((n_cells_before - n_cells_after), par["cell_filter_minimum_count"], par["cell_filter_grouping"]) + % ( + (n_cells_before - n_cells_after), + par["cell_filter_minimum_count"], + par["cell_filter_grouping"], + ) ) return adata + def filter_by_counts(adata, par): logger.info("Remove cells with few counts and genes with few counts.") n_cells_before, n_genes_before = adata.shape @@ -84,13 +108,19 @@ def filter_by_counts(adata, par): if threshold: func(adata, **{arg: threshold}) n_cells_after, n_genes_after = adata.shape - logger.info("Removed %s cells and %s genes.", (n_cells_before - n_cells_after), (n_genes_before - n_genes_after)) + logger.info( + "Removed %s cells and %s genes.", + (n_cells_before - n_cells_after), + (n_genes_before - n_genes_after), + ) + def move_x_to_layers(adata): logger.info("Move .X to .layers['counts']") adata.layers["counts"] = adata.X adata.X = None + def print_unique(adata, column): unique_values = adata.obs[column].unique().astype(str) formatted = "', '".join(unique_values[:50]) @@ -98,6 +128,7 @@ def print_unique(adata, column): formatted += ", ..." logger.info(f"Unique {column}: ['{formatted}']") + def print_summary(adata): logger.info(f"Resulting dataset: {adata}") @@ -105,6 +136,7 @@ def print_summary(adata): for field in adata.obs.columns: print_unique(adata, field) + def write_anndata(adata, par): logger.info("Writing MuData object to '%s'", par["output"]) @@ -112,19 +144,24 @@ def write_anndata(adata, par): mdata.write_h5mu(par["output"], compression=par["output_compression"]) + def main(par, meta): # check arguments - if (par["cell_filter_grouping"] is None) != (par["cell_filter_minimum_count"] is None): + if (par["cell_filter_grouping"] is None) != ( + par["cell_filter_minimum_count"] is None + ): raise NotImplementedError( "You need to specify either both or none of the following parameters: cell_filter_grouping, cell_filter_minimum_count" ) - - with connect_census(uri=par["input_uri"], census_version=par["census_version"]) as conn: + + with connect_census( + uri=par["input_uri"], census_version=par["census_version"] + ) as conn: adata = get_anndata(conn, par) - + if par["add_dataset_metadata"]: add_cellcensus_metadata_obs(conn, adata) - + print(f"AnnData: {adata}", flush=True) if par["cell_filter_grouping"] is not None: diff --git a/src/query/cellxgene_census/test.py b/src/query/cellxgene_census/test.py index b18dcb9c894..e81c42471e6 100644 --- a/src/query/cellxgene_census/test.py +++ b/src/query/cellxgene_census/test.py @@ -6,70 +6,89 @@ ## VIASH START meta = { - 'resources_dir': './resources_test/', - 'executable': './target/executable/query/cellxgene_census', - 'config': '/home/di/code/openpipeline/src/query/cellxgene_census/config.vsh.yaml' + "resources_dir": "./resources_test/", + "executable": "./target/executable/query/cellxgene_census", + "config": "/home/di/code/openpipeline/src/query/cellxgene_census/config.vsh.yaml", } ## VIASH END + def test_cellxgene_extract_metadata_expression(run_component, tmp_path): output_file = tmp_path / "output.h5mu" - run_component([ - "--obs_value_filter", - "is_primary_data == True " - "and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] " - "and suspension_type == 'cell'", - "--species", "homo_sapiens", - "--add_dataset_metadata", - "--output", output_file, - ]) + run_component( + [ + "--obs_value_filter", + "is_primary_data == True " + "and cell_type_ontology_term_id in ['CL:0000136', 'CL:1000311', 'CL:0002616'] " + "and suspension_type == 'cell'", + "--species", + "homo_sapiens", 
+ "--add_dataset_metadata", + "--output", + output_file, + ] + ) # check whether file exists assert os.path.exists(output_file), "Output file does not exist" mdata = md.read(output_file) - assert 'rna' in mdata.mod, "Output should contain 'rna' modality." - assert mdata.mod['rna'].n_obs > 0, "Expected at least one cell." - assert mdata.mod['rna'].n_vars > 0, "Expected at least one gene." + assert "rna" in mdata.mod, "Output should contain 'rna' modality." + assert mdata.mod["rna"].n_obs > 0, "Expected at least one cell." + assert mdata.mod["rna"].n_vars > 0, "Expected at least one gene." ## check obs - obs = mdata.mod['rna'].obs + obs = mdata.mod["rna"].obs expected_obs = [ - "dataset_id", "assay", "assay_ontology_term_id", "cell_type", - "cell_type_ontology_term_id", "development_stage", - "development_stage_ontology_term_id", "disease", - "disease_ontology_term_id", "donor_id", "is_primary_data", + "dataset_id", + "assay", + "assay_ontology_term_id", + "cell_type", + "cell_type_ontology_term_id", + "development_stage", + "development_stage_ontology_term_id", + "disease", + "disease_ontology_term_id", + "donor_id", + "is_primary_data", # "organism", "organism_ontology_term_id", # ← missing?? "self_reported_ethnicity", - "self_reported_ethnicity_ontology_term_id", "sex", - "sex_ontology_term_id", "suspension_type", "tissue", - "tissue_ontology_term_id", "tissue_general", - "tissue_general_ontology_term_id", "soma_joinid", "collection_id", - "collection_name", "collection_doi", "dataset_title" + "self_reported_ethnicity_ontology_term_id", + "sex", + "sex_ontology_term_id", + "suspension_type", + "tissue", + "tissue_ontology_term_id", + "tissue_general", + "tissue_general_ontology_term_id", + "soma_joinid", + "collection_id", + "collection_name", + "collection_doi", + "dataset_title", ] for exp_obs in expected_obs: assert exp_obs in obs.columns, f"Expected column '{exp_obs}' not found in .obs" - assert np.all(obs["is_primary_data"] == True) + assert np.all(np.isin(obs["is_primary_data"], [True])) ## check var - var = mdata.mod['rna'].var - expected_var = [ - "feature_id", "feature_name", "soma_joinid" - ] + var = mdata.mod["rna"].var + expected_var = ["feature_id", "feature_name", "soma_joinid"] for exp_var in expected_var: assert exp_var in var.columns, f"Expected column '{exp_var}' not found in .var" ## check layers - layers = mdata.mod['rna'].layers - expected_layers = [ - "counts" - ] + layers = mdata.mod["rna"].layers + expected_layers = ["counts"] for exp_layer in expected_layers: - assert exp_layer in layers.keys(), f"Expected layer '{exp_layer}' not found in .layers" + assert ( + exp_layer in layers.keys() + ), f"Expected layer '{exp_layer}' not found in .layers" + -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/reference/build_bdrhap_reference/script.py b/src/reference/build_bdrhap_reference/script.py index 8359ea55371..8b711412219 100644 --- a/src/reference/build_bdrhap_reference/script.py +++ b/src/reference/build_bdrhap_reference/script.py @@ -22,51 +22,62 @@ "resources_dir": os.path.abspath("src/reference/build_bdrhap_2_reference"), "temp_dir": os.getenv("VIASH_TEMP"), "memory_mb": None, - "cpus": None + "cpus": None, } ## VIASH END + def clean_arg(argument): argument["clean_name"] = re.sub("^-*", "", argument["name"]) return argument + def read_config(path: str) -> dict[str, Any]: with open(path, "r") as f: config = yaml.safe_load(f) - + config["arguments"] = [ - clean_arg(arg) - for grp in config["argument_groups"] - 
for arg in grp["arguments"] + clean_arg(arg) for grp in config["argument_groups"] for arg in grp["arguments"] ] - + return config + def strip_margin(text: str) -> str: return re.sub("(\n?)[ \t]*\|", "\\1", text) + def process_params(par: dict[str, Any], config) -> str: # check input parameters assert par["genome_fasta"], "Pass at least one set of inputs to --genome_fasta." assert par["gtf"], "Pass at least one set of inputs to --gtf." - assert par["reference_archive"].endswith(".gz"), "Output reference_archive must end with .tar.gz." + assert par["reference_archive"].endswith( + ".gz" + ), "Output reference_archive must end with .tar.gz." # make paths absolute for argument in config["arguments"]: if par[argument["clean_name"]] and argument["type"] == "file": if isinstance(par[argument["clean_name"]], list): - par[argument["clean_name"]] = [ os.path.abspath(f) for f in par[argument["clean_name"]] ] + par[argument["clean_name"]] = [ + os.path.abspath(f) for f in par[argument["clean_name"]] + ] else: - par[argument["clean_name"]] = os.path.abspath(par[argument["clean_name"]]) - + par[argument["clean_name"]] = os.path.abspath( + par[argument["clean_name"]] + ) + return par + def generate_config(par: dict[str, Any], meta, config) -> str: - content_list = [strip_margin(f"""\ + content_list = [ + strip_margin("""\ |#!/usr/bin/env cwl-runner | - |""")] - + |""") + ] + config_key_value_pairs = [] for argument in config["arguments"]: config_key = (argument.get("info") or {}).get("config_key") @@ -98,23 +109,28 @@ def generate_config(par: dict[str, Any], meta, config) -> str: |""") content_list.append(str) else: - content_list.append(strip_margin(f"""\ + content_list.append( + strip_margin(f"""\ |{config_key}: {par_value} - |""")) - + |""") + ) + ## Write config to file return "".join(content_list) + def get_cwl_file(meta: dict[str, Any]) -> str: # create cwl file (if need be) - cwl_file=os.path.join(meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl") + cwl_file = os.path.join( + meta["resources_dir"], "make_rhap_reference_2.2.1_nodocker.cwl" + ) return os.path.abspath(cwl_file) + def main(par: dict[str, Any], meta: dict[str, Any]): - config = read_config(meta["config"]) - + # Preprocess params par = process_params(par, config) @@ -127,7 +143,9 @@ def main(par: dict[str, Any], meta: dict[str, Any]): os.makedirs(outdir) ## Run pipeline - with tempfile.TemporaryDirectory(prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"]) as temp_dir: + with tempfile.TemporaryDirectory( + prefix="cwl-bd_rhapsody_wta-", dir=meta["temp_dir"] + ) as temp_dir: # Create params file config_file = os.path.join(temp_dir, "config.yml") config_content = generate_config(par, meta, config) @@ -141,20 +159,19 @@ def main(par: dict[str, Any], meta: dict[str, Any]): "--outdir", temp_dir, cwl_file, - config_file + config_file, ] env = dict(os.environ) env["TMPDIR"] = temp_dir print("> " + " ".join(cmd), flush=True) - _ = subprocess.check_call( - cmd, - cwd=os.path.dirname(config_file), - env=env + _ = subprocess.check_call(cmd, cwd=os.path.dirname(config_file), env=env) + + shutil.move( + os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"] ) - shutil.move(os.path.join(temp_dir, "Rhap_reference.tar.gz"), par["reference_archive"]) if __name__ == "__main__": main(par, meta) diff --git a/src/reference/build_star_reference/script.py b/src/reference/build_star_reference/script.py index fc270d3dc37..f26dde57e50 100644 --- a/src/reference/build_star_reference/script.py +++ 
b/src/reference/build_star_reference/script.py @@ -1,4 +1,3 @@ -import re import tempfile import subprocess from pathlib import Path @@ -8,25 +7,24 @@ ## VIASH START par = { - 'genome_fasta': 'resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/fasta/genome.fa', - 'transcriptome_gtf': 'resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz', - 'output': 'star_reference_test', - 'genomeSAindexNbases': 7 -} -meta = { - 'cpus': 8, - 'temp_dir': '/tmp' + "genome_fasta": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/fasta/genome.fa", + "transcriptome_gtf": "resources_test/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz", + "output": "star_reference_test", + "genomeSAindexNbases": 7, } +meta = {"cpus": 8, "temp_dir": "/tmp"} ## VIASH END ######################## ### Helper functions ### ######################## + # helper function for checking whether something is a gzip def is_gz_file(path: Path) -> bool: - with open(path, 'rb') as file: - return file.read(2) == b'\x1f\x8b' + with open(path, "rb") as file: + return file.read(2) == b"\x1f\x8b" + # if {par_value} is a Path, extract it to a temp_dir_path and return the resulting path def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: @@ -34,19 +32,21 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove two extensions (if they exist) extraction_dir_name = Path(par_value.stem).stem unpacked_path = temp_dir_path / extraction_dir_name - print(f' Tar detected; extracting {par_value} to {unpacked_path}', flush=True) + print(f" Tar detected; extracting {par_value} to {unpacked_path}", flush=True) - with tarfile.open(par_value, 'r') as open_tar: + with tarfile.open(par_value, "r") as open_tar: members = open_tar.getmembers() - root_dirs = [member + root_dirs = [ + member for member in members - if member.isdir() and member.name != '.' and '/' not in member.name] + if member.isdir() and member.name != "." and "/" not in member.name + ] # if there is only one root_dir (and there are files in that directory) # strip that directory name from the destination folder if len(root_dirs) == 1: for mem in members: mem.path = Path(*Path(mem.path).parts[1:]) - members_to_move = [mem for mem in members if mem.path != Path('.')] + members_to_move = [mem for mem in members if mem.path != Path(".")] open_tar.extractall(unpacked_path, members=members_to_move) return unpacked_path @@ -54,16 +54,17 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # Remove extension (if it exists) extraction_file_name = Path(par_value.stem) unpacked_path = temp_dir_path / extraction_file_name - print(f' Gzip detected; extracting {par_value} to {unpacked_path}', flush=True) + print(f" Gzip detected; extracting {par_value} to {unpacked_path}", flush=True) - with gzip.open(par_value, 'rb') as f_in: - with open(unpacked_path, 'wb') as f_out: + with gzip.open(par_value, "rb") as f_in: + with open(unpacked_path, "wb") as f_out: shutil.copyfileobj(f_in, f_out) return unpacked_path else: return par_value + ######################## ### Main code ### ######################## @@ -71,7 +72,12 @@ def extract_if_need_be(par_value: Path, temp_dir_path: Path) -> Path: # rename keys and convert path strings to Path # note: only list file arguments here. if non-file arguments also need to be renamed, # the `process_par()` generator needs to be adapted -to_rename = {'genome_fasta': 'genomeFastaFiles', 'output': 'genomeDir', 'transcriptome_gtf': 'sjdbGTFfile'} +to_rename = { + "genome_fasta": "genomeFastaFiles", + "output": "genomeDir", + "transcriptome_gtf": "sjdbGTFfile", +} + def process_par(orig_par, to_rename): for key, value in orig_par.items(): @@ -88,13 +94,14 @@ def process_par(orig_par, to_rename): new_key = key new_value = value yield new_key, new_value + + par = dict(process_par(par, to_rename)) # create output dir if need be par["genomeDir"].mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"]) as temp_dir: - # checking for compressed files, ungzip files if need be temp_dir_path = Path(temp_dir) for par_name in ["genomeFastaFiles", "sjdbGTFfile"]: @@ -103,12 +110,15 @@ def process_par(orig_par, to_rename): # turn value into list is_multiple = isinstance(par_values, list) if not is_multiple: - par_values = [ par_values ] + par_values = [par_values] # output list new_values = [] for par_value in par_values: - print(f'>> Check compression of --{par_name} with value: {par_value}', flush=True) + print( + f">> Check compression of --{par_name} with value: {par_value}", + flush=True, + ) new_value = extract_if_need_be(par_value, temp_dir_path) new_values.append(new_value) @@ -124,11 +134,10 @@ def process_par(orig_par, to_rename): print(">> Constructing command", flush=True) par["runMode"] = "genomeGenerate" par["outTmpDir"] = temp_dir_path / "run" - if 'cpus' in meta and meta['cpus']: + if "cpus" in meta and meta["cpus"]: par["runThreadN"] = meta["cpus"] - - cmd_args = [ "STAR" ] + cmd_args = ["STAR"] for name, value in par.items(): if value is not None: if isinstance(value, list): @@ -138,10 +147,7 @@ def process_par(orig_par, to_rename): print("", flush=True) print(">> Running STAR with command:", flush=True) - print("+ " + ' '.join([str(x) for x in cmd_args]), flush=True) + print("+ " + " ".join([str(x) for x in cmd_args]), flush=True) print("", flush=True) - subprocess.run( - cmd_args, - check=True - ) \ No newline at end of file + subprocess.run(cmd_args, check=True)
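For reference, gzip streams always begin with the two magic bytes 0x1f 0x8b, which is all the is_gz_file helper above inspects. A minimal sketch of that idiom, using a throwaway file name rather than anything from this component:

import gzip

# write a tiny gzip file, then verify its two-byte magic number
with gzip.open("example.txt.gz", "wb") as f:
    f.write(b"hello")
with open("example.txt.gz", "rb") as f:
    assert f.read(2) == b"\x1f\x8b"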
diff --git a/src/reference/cellranger_mkgtf/test.py index 8e8313744a5..8c7082adb37 100644 --- a/src/reference/cellranger_mkgtf/test.py +++ b/src/reference/cellranger_mkgtf/test.py @@ -6,64 +6,83 @@ ## VIASH START meta = { - 'name': 'cellranger_mkgtf', - 'resources_dir': 'resources_test/', - 'executable': 'target/docker/reference/cellranger_mkgtf/cellranger_mkgtf', - 'config': 'src/reference/cellranger_mkgtf/config.vsh.yaml' + "name": "cellranger_mkgtf", + "resources_dir": "resources_test/", + "executable": "target/docker/reference/cellranger_mkgtf/cellranger_mkgtf", + "config": "src/reference/cellranger_mkgtf/config.vsh.yaml", } ## VIASH END + @pytest.fixture def subset_input_gtf(random_path): subset_input_path = random_path(extension="gtf.gz") - with gzip.open(f"{meta['resources_dir']}/reference_gencodev41_chr1/reference.gtf.gz", "rt") as f_in: + with gzip.open( + f"{meta['resources_dir']}/reference_gencodev41_chr1/reference.gtf.gz", "rt" + ) as f_in: with gzip.open(subset_input_path, "wt") as f_out: for line in f_in: - fields = line.split('\t') + fields = line.split("\t") if len(fields) >= 4 and int(fields[3]) < 50001: f_out.write(line) return subset_input_path -@pytest.mark.parametrize("attributes", [["miRNA"],["transcribed_unprocessed_pseudogene", "miRNA"]])
+@pytest.mark.parametrize( + "attributes", [["miRNA"], ["transcribed_unprocessed_pseudogene", "miRNA"]] +) def test_gene_type_column(run_component, subset_input_gtf, random_path, attributes): output_gtf = random_path(extension="gtf.gz") - args = [ - "--input_gtf", subset_input_gtf, - "--output_gtf", output_gtf, - "--attribute" - ] - args.append(';'.join([f"gene_type:{attribute}" for attribute in attributes])) - + args = ["--input_gtf", subset_input_gtf, "--output_gtf", output_gtf, "--attribute"] + args.append(";".join([f"gene_type:{attribute}" for attribute in attributes])) + print(args, flush=True) run_component(args) - + assert os.path.isfile(output_gtf), "Output GTF could not be found." - + with gzip.open(output_gtf, "rt") as f: - unique_gene_types = {match for line in f for match in re.findall(r'gene_type "([^"]*)"', line.split('\t')[8])} - assert set(attributes) == unique_gene_types, "Output GTF does not contain exactly the expected gene types." - - + unique_gene_types = { + match + for line in f + for match in re.findall(r'gene_type "([^"]*)"', line.split("\t")[8]) + } + assert ( + set(attributes) == unique_gene_types + ), "Output GTF does not contain exactly the expected gene types." + + def test_different_columns(run_component, subset_input_gtf, random_path): output_gtf = random_path(extension="gtf.gz") args = [ - "--input_gtf", subset_input_gtf, - "--output_gtf", output_gtf, - "--attribute", "gene_type:transcribed_unprocessed_pseudogene;transcript_id:ENST00000456328.2" + "--input_gtf", + subset_input_gtf, + "--output_gtf", + output_gtf, + "--attribute", + "gene_type:transcribed_unprocessed_pseudogene;transcript_id:ENST00000456328.2", ] - + run_component(args) assert os.path.isfile(output_gtf), "Output GTF could not be found." - + with gzip.open(output_gtf, "rt") as f: wrong_attributes_count = sum( - 1 for line in f - if dict(re.findall(r'(\S+) "([^"]*)"', line.split('\t')[8])).get("gene_type") != "transcribed_unprocessed_pseudogene" and - dict(re.findall(r'(\S+) "([^"]*)"', line.split('\t')[8])).get("transcript_id") != "ENST00000456328.2" + 1 + for line in f + if dict(re.findall(r'(\S+) "([^"]*)"', line.split("\t")[8])).get( + "gene_type" + ) + != "transcribed_unprocessed_pseudogene" + and dict(re.findall(r'(\S+) "([^"]*)"', line.split("\t")[8])).get( + "transcript_id" + ) + != "ENST00000456328.2" ) - assert wrong_attributes_count == 0, "Output GTF contains unexpected attribute values." + assert ( + wrong_attributes_count == 0 + ), "Output GTF contains unexpected attribute values." 
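Both tests above rely on the same parsing idiom: column 9 of a GTF record holds key "value"; pairs, so dict(re.findall(r'(\S+) "([^"]*)"', ...)) turns the attribute column into a lookup table. A minimal sketch on an invented GTF line (not taken from the test resources):

import re

line = 'chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\tgene_id "ENSG00000223972.5"; gene_type "miRNA";'
attributes = dict(re.findall(r'(\S+) "([^"]*)"', line.split("\t")[8]))
assert attributes["gene_type"] == "miRNA"
assert attributes["gene_id"] == "ENSG00000223972.5"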
-if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/binning/script.py index 9b8bf5158db..fc805f6ecba 100644 --- a/src/scgpt/binning/script.py +++ b/src/scgpt/binning/script.py @@ -14,11 +14,9 @@ "n_input_bins": 51, "output_compression": None, "var_input": "id_in_vocab", - "seed": 0 -} -meta = { - "resources_dir": "src/utils" + "seed": 0, } +meta = {"resources_dir": "src/utils"} ## VIASH END if par["seed"]: @@ -27,6 +25,7 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger from subset_vars import subset_vars + logger = setup_logger() logger.info("Reading in data") @@ -39,10 +38,10 @@ adata = subset_vars(adata, par["var_input"]) logger.info("Converting the input layer into a CSR matrix") -if not par['input_layer'] or par["input_layer"] == "X": +if not par["input_layer"] or par["input_layer"] == "X": layer_data = adata.X else: - layer_data = adata.layers[par['input_layer']] + layer_data = adata.layers[par["input_layer"]] layer_data = csr_matrix(layer_data) if layer_data.min() < 0: @@ -64,7 +63,9 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: digits = rands * (right_digits - left_digits) + left_digits digits = np.ceil(digits) - smallest_dtype = np.min_scalar_type(digits.max().astype(np.uint)) # Already checked for non-negative values + smallest_dtype = np.min_scalar_type( + digits.max().astype(np.uint) + ) # Already checked for non-negative values digits = digits.astype(smallest_dtype) return digits @@ -73,14 +74,19 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: with warnings.catch_warnings(): # Make sure warnings are displayed once. warnings.simplefilter("once") - # layer_data.indptr.size is the number of rows in the sparse matrix + # layer_data.indptr.size is the number of rows in the sparse matrix binned_rows = [] bin_edges = [] - logger.info("Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix") - for row_number in range(layer_data.indptr.size-1): - row_start_index, row_end_index = layer_data.indptr[row_number], layer_data.indptr[row_number+1] + logger.info( + "Establishing bin edges and digitizing of non-zero values into bins for each row of the count matrix" + ) + for row_number in range(layer_data.indptr.size - 1): + row_start_index, row_end_index = ( + layer_data.indptr[row_number], + layer_data.indptr[row_number + 1], + ) # These are all non-zero counts in the row - non_zero_row = layer_data.data[row_start_index:row_end_index] + non_zero_row = layer_data.data[row_start_index:row_end_index] if non_zero_row.max() == 0: logger.warning( "The input data contains all zero rows. Please make sure " @@ -105,13 +111,19 @@ def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray: # Create new CSR matrix logger.info("Creating a new CSR matrix of the binned count values") -binned_counts = csr_matrix((np.concatenate(binned_rows, casting="same_kind"), - layer_data.indices, layer_data.indptr), shape=layer_data.shape) +binned_counts = csr_matrix( + ( + np.concatenate(binned_rows, casting="same_kind"), + layer_data.indices, + layer_data.indptr, + ), + shape=layer_data.shape, +) # Set binned values and bin edges layers to adata object input_adata.obsm[par["output_obsm_binned_counts"]] = binned_counts input_adata.obsm["bin_edges"] = np.stack(bin_edges) -# Write mudata output +# Write mudata output logger.info("Writing output data") -mdata. write_h5mu(par["output"], compression=par["output_compression"]) +mdata.write_h5mu(par["output"], compression=par["output_compression"])
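The row loop above relies on the CSR layout: for row i, indptr[i]:indptr[i + 1] delimits that row's non-zero entries inside .data, so indptr.size - 1 is the number of rows. A small sketch on a synthetic matrix (not the component's data):

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[0, 2, 0, 3], [0, 0, 0, 0], [1, 0, 4, 0]]))
for row in range(m.indptr.size - 1):
    start, end = m.indptr[row], m.indptr[row + 1]
    print(row, m.data[start:end])  # non-zeros per row: [2 3], [], [1 4]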
diff --git a/src/scgpt/binning/test.py index 0c5f5faaf9c..9cc6eb2dfc3 100644 --- a/src/scgpt/binning/test.py +++ b/src/scgpt/binning/test.py @@ -8,31 +8,40 @@ "resources_dir": "resources_test", "executable": "./target/docker/scgpt/binning/binning", "temp_dir": "tmp", - "config": "./target/docker/scgpt/binning/.config.vsh.yaml" + "config": "./target/docker/scgpt/binning/.config.vsh.yaml", } ## VIASH END def test_binning(run_component, tmp_path): - input_file_path = f"{meta['resources_dir']}/Kim2020_Lung_subset_preprocessed.h5mu" output_file_path = tmp_path / "Kim2020_Lung_subset_binned.h5mu" - run_component([ - "--input", input_file_path, - "--modality", "rna", - "--output_obsm_binned_counts", "binned_counts", - "--n_input_bins", "51", - "--var_input", "filter_with_hvg", - "--output", output_file_path - ]) + run_component( + [ + "--input", + input_file_path, + "--modality", + "rna", + "--output_obsm_binned_counts", + "binned_counts", + "--n_input_bins", + "51", + "--var_input", + "filter_with_hvg", + "--output", + output_file_path, + ] + ) # Read output file output_mdata = mu.read(output_file_path) output_adata = output_mdata.mod["rna"] # Check presence of binning layers - assert {"bin_edges", "binned_counts"}.issubset(output_adata.obsm.keys()), "Binning obsm fields were not added." + assert {"bin_edges", "binned_counts"}.issubset( + output_adata.obsm.keys() + ), "Binning obsm fields were not added." # Check bin edges bin_edges = output_adata.obsm["bin_edges"] @@ -46,5 +55,5 @@ def test_binning(run_component, tmp_path): assert (binned_values.data <= 51).all(axis=None) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__]))
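The annotation script below feeds its tokenized tensors to a DataLoader through a small Dataset wrapper (SeqDataset). A hedged sketch of that pattern with illustrative names (a hypothetical DictDataset, not the component's class):

import torch
from torch.utils.data import DataLoader, Dataset

class DictDataset(Dataset):
    """Index a dict of equally sized tensors row by row."""
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data["genes"].shape[0]
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.data.items()}

ds = DictDataset({"genes": torch.arange(12).reshape(4, 3), "values": torch.zeros(4, 3)})
batch = next(iter(DataLoader(ds, batch_size=2)))  # dict of [2, 3] tensors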
diff --git a/src/scgpt/cell_type_annotation/script.py index 9a2d79436af..c3487712512 100644 --- a/src/scgpt/cell_type_annotation/script.py +++ b/src/scgpt/cell_type_annotation/script.py @@ -16,34 +16,36 @@ ## VIASH START par = { - 'input': r'resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu', - 'modality': r'rna', - 'model': r'resources_test/scgpt/finetuned_model/best_model.pt', - 'model_config': r'resources_test/scgpt/source/args.json', - 'model_vocab': r'resources_test/scgpt/source/vocab.json', - 'obs_batch_label': r'sample', - 'obsm_gene_tokens': r'gene_id_tokens', - 'obsm_tokenized_values': r'values_tokenized', - 'output': r'output.h5mu', - 'output_compression': None, - 'output_obs_predictions': r'predictions', - 'output_obs_probability': r'probabilities', - 'dsbn': True, - 'seed': 0, - 'pad_token': "", - 'pad_value': -2, - 'n_input_bins': 51, - 'batch_size': 64, - 'finetuned_checkpoints_key': 'model_state_dict', - 'label_mapper_key': 'id_to_class' + "input": r"resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", + "modality": r"rna", + "model": r"resources_test/scgpt/finetuned_model/best_model.pt", + "model_config": r"resources_test/scgpt/source/args.json", + "model_vocab": r"resources_test/scgpt/source/vocab.json", + "obs_batch_label": r"sample", + "obsm_gene_tokens": r"gene_id_tokens", + "obsm_tokenized_values": r"values_tokenized", + "output": r"output.h5mu", + "output_compression": None, + "output_obs_predictions": r"predictions", + "output_obs_probability": r"probabilities", + "dsbn": True, + "seed": 0, + "pad_token": "", + "pad_value": -2, + "n_input_bins": 51, + "batch_size": 64, + "finetuned_checkpoints_key": "model_state_dict", + "label_mapper_key": "id_to_class", } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + class SeqDataset(Dataset): def __init__(self, data: Dict[str, torch.Tensor]): self.data = data @@ -72,7 +74,9 @@ def main(): # Fetch batch ids for domain-specific batch normalization if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError("When dsbn is set to True, you are required to provide batch labels (obs_batch_labels).") + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (obs_batch_labels)." + ) elif par["dsbn"] and par["obs_batch_label"]: logger.info("Fetching batch id's for domain-specific batch normalization") batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") @@ -109,11 +113,13 @@ def main(): model_file = par["model"] model_dict = torch.load(model_file, map_location=device) for k, v in { - "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], - "--label_mapper_key": par["label_mapper_key"], - }.items(): + "--finetuned_checkpoints_key": par["finetuned_checkpoints_key"], + "--label_mapper_key": par["label_mapper_key"], + }.items(): if v not in model_dict.keys(): - raise KeyError(f"The key '{v}' provided for '{k}' could not be found in the provided --model file.
The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) pretrained_dict = model_dict[par["finetuned_checkpoints_key"]] # Label mapper configuration @@ -144,12 +150,11 @@ def main(): input_emb_style="continuous", n_input_bins=par["n_input_bins"], cell_emb_style="cls", # required for cell-type annotation - use_fast_transformer=False, #TODO: parametrize when GPU is available - fast_transformer_backend="flash", #TODO: parametrize when GPU is available - pre_norm=False, #TODO: parametrize when GPU is available + use_fast_transformer=False, # TODO: parametrize when GPU is available + fast_transformer_backend="flash", # TODO: parametrize when GPU is available + pre_norm=False, # TODO: parametrize when GPU is available ) - # Load model params logger.info(f"Loading model params from {model_file}") try: @@ -172,11 +177,13 @@ def main(): # Load tokenized gene data logger.info("Loading data for inference") for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - }.items(): + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + }.items(): if v not in adata.obsm.keys(): - raise KeyError(f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm") + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) input_gene_ids = adata.obsm[par["obsm_gene_tokens"]] input_values = adata.obsm[par["obsm_tokenized_values"]] @@ -231,7 +238,9 @@ def main(): # Assign cell type labels to predicted classes logger.info("Assigning cell type predictions and probabilities") adata.obs["scgpt_class_pred"] = predictions - adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map(lambda x: cell_type_mapper[x]) + adata.obs[par["output_obs_predictions"]] = adata.obs["scgpt_class_pred"].map( + lambda x: cell_type_mapper[x] + ) adata.obs[par["output_obs_probability"]] = probabilities # Write output @@ -240,7 +249,7 @@ def main(): mdata.write(par["output"], compression=par["output_compression"]) -if __name__ == '__main__': +if __name__ == "__main__": freeze_support() warnings.filterwarnings("ignore") main() diff --git a/src/scgpt/cell_type_annotation/test.py b/src/scgpt/cell_type_annotation/test.py index 8b4dd4d911a..f0cb97846f7 100644 --- a/src/scgpt/cell_type_annotation/test.py +++ b/src/scgpt/cell_type_annotation/test.py @@ -2,7 +2,6 @@ from mudata import read_h5mu import sys import torch -import numpy as np import subprocess import re @@ -28,45 +27,74 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): def test_cell_type_inference(run_component, tmp_path): - output_annotation_file = tmp_path / "Kim2020_Lung_subset_annotated.h5mu" args = [ - "--input", input_path, - "--output", output_annotation_file, - "--modality", "rna", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--model", ft_model, - "--finetuned_checkpoints_key", "model_state_dict", - "--label_mapper_key", "id_to_class", - "--model_vocab", model_vocab, - "--model_config", model_config, - "--obs_batch_label", "sample", - "--dsbn", "True" + "--input", + input_path, + "--output", + output_annotation_file, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + 
"--model_vocab", + model_vocab, + "--model_config", + model_config, + "--obs_batch_label", + "sample", + "--dsbn", + "True", ] run_component(args) output_mudata = read_h5mu(output_annotation_file) output_adata = output_mudata.mod["rna"] - assert "scgpt_pred" in output_adata.obs.keys(), "scgpt_pred is not present in anndata obs keys" - assert "scgpt_probability" in output_adata.obs.keys(), "scgpt_probability is not present in anndata obs keys" + assert ( + "scgpt_pred" in output_adata.obs.keys() + ), "scgpt_pred is not present in anndata obs keys" + assert ( + "scgpt_probability" in output_adata.obs.keys() + ), "scgpt_probability is not present in anndata obs keys" # run withou dsbn - output_annotation_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_annotated_no_dsbn.h5mu" + output_annotation_file_without_dsbn = ( + tmp_path / "Kim2020_Lung_subset_annotated_no_dsbn.h5mu" + ) args = [ - "--input", input_path, - "--output", output_annotation_file_without_dsbn, - "--modality", "rna", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--model", ft_model, - "--model_vocab", model_vocab, - "--model_config", model_config, - "--finetuned_checkpoints_key", "model_state_dict", - "--label_mapper_key", "id_to_class", - "--obs_batch_label", "sample", - "--dsbn", "False" + "--input", + input_path, + "--output", + output_annotation_file_without_dsbn, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "False", ] run_component(args) # Read output file @@ -74,86 +102,129 @@ def test_cell_type_inference(run_component, tmp_path): output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] # Assert that embeddings without dsbn are different - assert not (output_adata.obs["scgpt_pred"].astype(str) == output_adata_no_dsbn.obs["scgpt_pred"].astype(str)).all(), "Cell type predictions with and without dsbn are the same" + assert not ( + output_adata.obs["scgpt_pred"].astype(str) + == output_adata_no_dsbn.obs["scgpt_pred"].astype(str) + ).all(), "Cell type predictions with and without dsbn are the same" def test_annotation_dsbn_without_batch_labels(run_component, tmp_path): - - output_annotation_labels_without_dsbn = tmp_path / "Kim2020_Lung_subset_annotated_labels_without_dsbn.h5mu" + output_annotation_labels_without_dsbn = ( + tmp_path / "Kim2020_Lung_subset_annotated_labels_without_dsbn.h5mu" + ) args = [ - "--input", input_path, - "--output", output_annotation_labels_without_dsbn, - "--modality", "rna", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--model", ft_model, - "--model_vocab", model_vocab, - "--model_config", model_config, - "--finetuned_checkpoints_key", "model_state_dict", - "--label_mapper_key", "id_to_class", - "--dsbn", "True", + "--input", + input_path, + "--output", + output_annotation_labels_without_dsbn, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--dsbn", + "True", ] with pytest.raises(subprocess.CalledProcessError) as err: 
run_component(args) assert re.search( r"ValueError: When dsbn is set to True, you are required to provide batch labels \(obs_batch_labels\)\.", - err.value.stdout.decode('utf-8')) + err.value.stdout.decode("utf-8"), + ) def test_annotation_non_existing_keys(run_component, tmp_path): - - output_annotation_dummy_values = tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" + output_annotation_dummy_values = ( + tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" + ) # Test for non-existing tokenized values key args = [ - "--input", input_path, - "--output", output_annotation_dummy_values, - "--modality", "rna", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "dummy_values_tokenized", - "--model", ft_model, - "--model_vocab", model_vocab, - "--model_config", model_config, - "--finetuned_checkpoints_key", "model_state_dict", - "--label_mapper_key", "id_to_class", - "--obs_batch_label", "sample", - "--dsbn", "True", + "--input", + input_path, + "--output", + output_annotation_dummy_values, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "dummy_values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "model_state_dict", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "True", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode('utf-8')) + err.value.stdout.decode("utf-8"), + ) -def test_checkpoint_architecture(run_component, tmp_path): +def test_checkpoint_architecture(run_component, tmp_path): output_dummy_model_key = tmp_path / "Kim2020_Lung_subset_annotated_dummy_key.h5mu" # Test for non-existing model file keys args = [ - "--input", input_path, - "--output", output_dummy_model_key, - "--modality", "rna", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--model", ft_model, - "--model_vocab", model_vocab, - "--model_config", model_config, - "--finetuned_checkpoints_key", "dummy_checkpoints_key", - "--label_mapper_key", "id_to_class", - "--obs_batch_label", "sample", - "--dsbn", "True", + "--input", + input_path, + "--output", + output_dummy_model_key, + "--modality", + "rna", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--model", + ft_model, + "--model_vocab", + model_vocab, + "--model_config", + model_config, + "--finetuned_checkpoints_key", + "dummy_checkpoints_key", + "--label_mapper_key", + "id_to_class", + "--obs_batch_label", + "sample", + "--dsbn", + "True", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( r'KeyError: "The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. 
The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper."', - err.value.stdout.decode('utf-8')) + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/cross_check_genes/script.py b/src/scgpt/cross_check_genes/script.py index 360e984547b..c181d81534e 100644 --- a/src/scgpt/cross_check_genes/script.py +++ b/src/scgpt/cross_check_genes/script.py @@ -12,16 +12,15 @@ "pad_token": "", "var_input": "filter_with_hvg", "vocab_file": "resources_test/scgpt/source/vocab.json", - "output_compression": None + "output_compression": None, } -meta = { - "resources_dir": "src/utils" -} +meta = {"resources_dir": "src/utils"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() # Read in data @@ -36,7 +35,9 @@ if not par["input_var_gene_names"]: genes = adata.var.index.astype(str).tolist() elif par["input_var_gene_names"] not in adata.var.columns: - raise ValueError(f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs.") + raise ValueError( + f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].obs." + ) else: genes = adata.var[par["input_var_gene_names"]].astype(str).tolist() @@ -49,12 +50,18 @@ if par["var_input"]: logger.info("Filtering genes based on model vocab and HVG") filter_with_hvg = adata.var[par["var_input"]].tolist() - gene_filter_mask = [1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg)] - logger.info(f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}") + gene_filter_mask = [ + 1 if gene in vocab and hvg else 0 for gene, hvg in zip(genes, filter_with_hvg) + ] + logger.info( + f"Total number of genes after HVG present in model vocab: {str(sum(gene_filter_mask))}" + ) else: logger.info("Filtering genes based on model vocab") gene_filter_mask = [1 if gene in vocab else 0 for gene in genes] - logger.info(f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}") + logger.info( + f"Total number of genes present in model vocab: {str(sum(gene_filter_mask))}" + ) logger.info(f"Writing to {par['output']}") adata.var[par["output_var_filter"]] = gene_filter_mask diff --git a/src/scgpt/cross_check_genes/test.py b/src/scgpt/cross_check_genes/test.py index 61e1e07cd67..bb8c53a1349 100644 --- a/src/scgpt/cross_check_genes/test.py +++ b/src/scgpt/cross_check_genes/test.py @@ -6,9 +6,9 @@ ## VIASH START meta = { - 'executable': './target/docker/scgpt/cross_check/cross_check', - 'resources_dir': './resources_test/scgpt/', - 'config': './src/scgpt/cross_check/config.vsh.yaml' + "executable": "./target/docker/scgpt/cross_check/cross_check", + "resources_dir": "./resources_test/scgpt/", + "config": "./src/scgpt/cross_check/config.vsh.yaml", } ## VIASH END @@ -19,53 +19,78 @@ def test_cross_check(run_component, random_path): output_path = random_path(extension="h5mu") args = [ - "--input", input_path, - "--output", output_path, - "--modality", "rna", - "--vocab_file", vocab_path, - "--output_compression", "gzip" + "--input", + input_path, + "--output", + output_path, + "--modality", + "rna", + "--vocab_file", + vocab_path, + "--output_compression", + "gzip", ] run_component(args) output_mudata = read_h5mu(output_path) # Check added columns - assert {"gene_name", "id_in_vocab"}.issubset(set(output_mudata.mod["rna"].var.columns)), "Gene columns 
were not added." + assert {"gene_name", "id_in_vocab"}.issubset( + set(output_mudata.mod["rna"].var.columns) + ), "Gene columns were not added." # Check if genes were filtered - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len(output_mudata.mod["rna"].var["id_in_vocab"]), "Genes were not filtered." + assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( + output_mudata.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered." output_hvg_path = random_path(extension="h5mu") args_hvg = [ - "--input", input_path, - "--output", output_hvg_path, - "--modality", "rna", - "--var_input", "filter_with_hvg", - "--vocab_file", vocab_path, - "--output_compression", "gzip" + "--input", + input_path, + "--output", + output_hvg_path, + "--modality", + "rna", + "--var_input", + "filter_with_hvg", + "--vocab_file", + vocab_path, + "--output_compression", + "gzip", ] run_component(args_hvg) output_mudata_hvg = read_h5mu(output_hvg_path) # Check if genes were filtered based on HVG - assert sum(output_mudata_hvg.mod["rna"].var["id_in_vocab"]) != len(output_mudata_hvg.mod["rna"].var["id_in_vocab"]), "Genes were not filtered." - assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len(output_mudata_hvg.mod["rna"].var["id_in_vocab"]), "Genes were not filtered based on HVG." + assert sum(output_mudata_hvg.mod["rna"].var["id_in_vocab"]) != len( + output_mudata_hvg.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered." + assert sum(output_mudata.mod["rna"].var["id_in_vocab"]) != len( + output_mudata_hvg.mod["rna"].var["id_in_vocab"] + ), "Genes were not filtered based on HVG." def test_cross_check_invalid_gene_layer_raises(run_component, random_path): output_path = random_path(extension="h5mu") args = [ - "--input", input_path, - "--output", output_path, - "--vocab_file", vocab_path, - "--input_var_gene_names", "dummy_var", + "--input", + input_path, + "--output", + output_path, + "--vocab_file", + vocab_path, + "--input_var_gene_names", + "dummy_var", ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) - assert re.search(r"ValueError: Gene name column 'dummy_var' not found in .mod\['rna'\]\.obs\.", - err.value.stdout.decode('utf-8')) + assert re.search( + r"ValueError: Gene name column 'dummy_var' not found in .mod\['rna'\]\.obs\.", + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/embedding/script.py b/src/scgpt/embedding/script.py index 69abb52deda..dd5e3036f9f 100644 --- a/src/scgpt/embedding/script.py +++ b/src/scgpt/embedding/script.py @@ -10,9 +10,9 @@ ## VIASH START par = { "input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset_tokenized.h5mu", - "obsm_gene_tokens": 'gene_id_tokens', - "obsm_tokenized_values": 'values_tokenized', - "obsm_padding_mask": 'padding_mask', + "obsm_gene_tokens": "gene_id_tokens", + "obsm_tokenized_values": "values_tokenized", + "obsm_padding_mask": "padding_mask", "model": "resources_test/scgpt/source/best_model.pt", "model_config": "resources_test/scgpt/source/args.json", "model_vocab": "resources_test/scgpt/source/vocab.json", @@ -25,12 +25,13 @@ "batch_size": 64, "modality": "rna", "dsbn": True, - "n_input_bins": 51 - } + "n_input_bins": 51, +} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info(f"Setting device to {'cuda' if torch.cuda.is_available() else 
'cpu'}") @@ -44,12 +45,14 @@ adata = input_adata.copy() for k, v in { - "--obsm_gene_tokens": par["obsm_gene_tokens"], - "--obsm_tokenized_values": par["obsm_tokenized_values"], - "--obsm_padding_mask": par["obsm_padding_mask"] - }.items(): + "--obsm_gene_tokens": par["obsm_gene_tokens"], + "--obsm_tokenized_values": par["obsm_tokenized_values"], + "--obsm_padding_mask": par["obsm_padding_mask"], +}.items(): if v not in adata.obsm.keys(): - raise KeyError(f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm") + raise KeyError( + f"The parameter '{v}' provided for '{k}' could not be found in adata.obsm" + ) all_gene_ids = adata.obsm[par["obsm_gene_tokens"]] all_values = adata.obsm[par["obsm_tokenized_values"]] @@ -57,7 +60,9 @@ # Fetch batch ids for domain-specific batch normalization if par["dsbn"] and not par["obs_batch_label"]: - raise ValueError("When dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels).") + raise ValueError( + "When dsbn is set to True, you are required to provide batch labels (input_obs_batch_labels)." + ) elif par["dsbn"] and par["obs_batch_label"]: logger.info("Fetching batch id's for domain-specific batch normalization") batch_id_cats = adata.obs[par["obs_batch_label"]].astype("category") @@ -66,7 +71,9 @@ batch_ids = np.array(batch_ids) num_batch_types = len(set(batch_ids)) elif not par["dsbn"] and par["obs_batch_label"]: - logger.info("Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed.") + logger.info( + "Batch labels provided but dsbn is set to False. Batch labels will be ignored and no dsbn will be performed." + ) # Set padding specs logger.info("Setting padding specs") @@ -114,7 +121,7 @@ d_hid=d_hid, nlayers=nlayers, vocab=vocab, - dropout=0.5, # scGPT default, only relevant for fine-tuning applications + dropout=0.5, # scGPT default, only relevant for fine-tuning applications pad_token=pad_token, pad_value=pad_value, nlayers_cls=3, # only applicable for decoder-based operations @@ -122,15 +129,15 @@ do_mvc=False, # only applicable for decoder-based operations ecs_threshold=0.8, # only applicable for decoder-based operations do_dab=False, # only applicable for decoder-based operations - use_batch_labels=False, # only applicable for decoder-based operations + use_batch_labels=False, # only applicable for decoder-based operations num_batch_labels=num_batch_types if par["dsbn"] else None, domain_spec_batchnorm=par["dsbn"], input_emb_style="continuous", # scGPT default - explicit_zero_prob=False, #TODO: Parametrize when GPU-based machine types are supported - use_fast_transformer=False, #TODO: Parametrize when GPU-based machine types are supported + explicit_zero_prob=False, # TODO: Parametrize when GPU-based machine types are supported + use_fast_transformer=False, # TODO: Parametrize when GPU-based machine types are supported # fast_transformer_backend="flash", #TODO: Parametrize when GPU-based machine types are supported - pre_norm=False #TODO: Parametrize when GPU-based machine types are supported - ) + pre_norm=False, # TODO: Parametrize when GPU-based machine types are supported +) logger.info("Loading model") @@ -143,14 +150,12 @@ try: model_dict = model_dict[finetuned_checkpoints_key] except KeyError as e: - raise ValueError(f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. 
The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.") from e + raise ValueError( + f"The key '{finetuned_checkpoints_key}' provided for '--finetuned_checkpoints_key' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper." + ) from e # Load model -load_pretrained( - model, - model_dict, - verbose=False - ) +load_pretrained(model, model_dict, verbose=False) # Embed tokenized data logger.info("Converting tokenized input data to embeddings") @@ -165,7 +170,7 @@ batch_labels=torch.from_numpy(batch_ids).long() if par["dsbn"] else None, output_to_cpu=True, time_step=0, - return_np=True + return_np=True, ) cell_embeddings = cell_embeddings / np.linalg.norm( diff --git a/src/scgpt/embedding/test.py b/src/scgpt/embedding/test.py index 7063e83ae89..4f140aae944 100644 --- a/src/scgpt/embedding/test.py +++ b/src/scgpt/embedding/test.py @@ -15,7 +15,7 @@ "resources_dir": "resources_test", "executable": "./target/docker/scgpt/integration_embedding/integration_embedding", "temp_dir": "tmp", - "config": "./target/docker/scgpt/integration_embedding/.config.vsh.yaml" + "config": "./target/docker/scgpt/integration_embedding/.config.vsh.yaml", } ## VIASH END @@ -26,6 +26,7 @@ model_config_file = f"{meta['resources_dir']}/source/args.json" input_file = mu.read(input) + def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): f_model_dict = torch.load(scgpt_path, map_location="cpu") model_dict = {} @@ -33,13 +34,14 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): model_dict[mapper_key] = {k: str(k) for k in range(15)} torch.save(model_dict, ft_scgpt_path) + # Convert foundation model into fine-tuned model architecture: # To be able to do a cell type label mapping, the model architecture needs to contain a class to label mapper dictionary scgpt_to_ft_scgpt(model_file, ft_model, "model_state_dict", "id_to_class") ## START TEMPORARY WORKAROUND DATA PREPROCESSING -#TODO: Remove this workaround once full scGPT preprocessing workflow is implemented +# TODO: Remove this workaround once full scGPT preprocessing workflow is implemented # Read in data adata = input_file.mod["rna"] @@ -62,8 +64,8 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): # Cross-check genes with pre-trained model genes = adata.var["gene_name"].tolist() adata.var["id_in_vocab"] = [ - 1 if gene in vocab else -1 for gene in adata.var["gene_name"] - ] + 1 if gene in vocab else -1 for gene in adata.var["gene_name"] +] gene_ids_in_vocab = np.array(adata.var["id_in_vocab"]) adata = adata[:, adata.var["id_in_vocab"] >= 0] @@ -80,7 +82,7 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): hvg_flavor="seurat_v3", binning=51, result_binned_key="X_binned", - ) +) preprocessor(adata, batch_key="str_batch") @@ -103,7 +105,7 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): tokenized_data = tokenize_and_pad_batch( all_counts, gene_ids, - max_len=n_hvg+1, + max_len=n_hvg + 1, vocab=vocab, pad_token=pad_token, pad_value=-2, @@ -111,8 +113,8 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): include_zero_gene=False, return_pt=True, mod_type=None, - vocab_mod=None - ) + vocab_mod=None, +) all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"] padding_mask = all_gene_ids.eq(vocab[pad_token]) @@ -121,7 +123,7 @@ 
def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): adata.obsm["values_tokenized"] = all_values.numpy() adata.obsm["padding_mask"] = padding_mask.numpy() -tokenized_data = mu.MuData({'rna': adata}) +tokenized_data = mu.MuData({"rna": adata}) tokenized_data_path = f"{meta['resources_dir']}/Kim2020_Lung_tokenized.h5mu" tokenized_data.write_h5mu(tokenized_data_path) @@ -129,80 +131,126 @@ def scgpt_to_ft_scgpt(scgpt_path, ft_scgpt_path, state_dict_key, mapper_key): def test_integration_embedding(run_component, tmp_path): - output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - run_component([ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "sample", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file - ]) + run_component( + [ + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, + ] + ) # Read output file output_mdata = mu.read(output_embedding_file) output_adata = output_mdata.mod["rna"] # check that embedding obs is present - assert 'X_scGPT' in output_adata.obsm.keys(), "X_scGPT is not present in anndata obsm keys" + assert ( + "X_scGPT" in output_adata.obsm.keys() + ), "X_scGPT is not present in anndata obsm keys" # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, "Embedding size does not equal 512" + assert ( + output_adata.obsm["X_scGPT"].shape[1] == 512 + ), "Embedding size does not equal 512" # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), "Embedding values are nan" - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), "Range of embedding values is outside of [-1, 1]" + assert not all( + np.isnan(output_adata.obsm["X_scGPT"][0]) + ), "Embedding values are nan" + assert all( + [all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]] + ), "Range of embedding values is outside of [-1, 1]" # Run embeddings without dsbn output_embedding_file_without_dsbn = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - run_component([ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "False", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file_without_dsbn - ]) + run_component( + [ + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "False", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file_without_dsbn, + ] + ) # Read output file output_mdata_no_dsbn = mu.read(output_embedding_file_without_dsbn) output_adata_no_dsbn = output_mdata_no_dsbn.mod["rna"] # Assert that embeddings without dsbn are 
different - assert not (output_adata.obsm["X_scGPT"] == output_adata_no_dsbn.obsm["X_scGPT"]).all(), "Embeddings with and without dsbn are the same" + assert not ( + output_adata.obsm["X_scGPT"] == output_adata_no_dsbn.obsm["X_scGPT"] + ).all(), "Embeddings with and without dsbn are the same" + def test_integration_embedding_dsbn_without_batch_labels(run_component, tmp_path): output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" args = [ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( r"ValueError: When dsbn is set to True, you are required to provide batch labels \(input_obs_batch_labels\)\.", - err.value.stdout.decode('utf-8')) + err.value.stdout.decode("utf-8"), + ) def test_integration_embedding_non_existing_keys(run_component, tmp_path): @@ -210,126 +258,196 @@ def test_integration_embedding_non_existing_keys(run_component, tmp_path): # Test for non-existing gene names key args_1 = [ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "sample", - "--var_gene_names", "dummy_gene_name_key", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--var_gene_names", + "dummy_gene_name_key", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args_1) assert re.search( - r"KeyError: \'dummy_gene_name_key\'", - err.value.stdout.decode('utf-8')) + r"KeyError: \'dummy_gene_name_key\'", err.value.stdout.decode("utf-8") + ) # Test for non-existing batch label key args_2 = [ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "dummy_batch_label_key", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "dummy_batch_label_key", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + 
"--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args_2) assert re.search( - r"KeyError: \'dummy_batch_label_key\'", - err.value.stdout.decode('utf-8')) + r"KeyError: \'dummy_batch_label_key\'", err.value.stdout.decode("utf-8") + ) # Test for non-existing tokenized values key args_3 = [ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", model_file, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "sample", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "dummy_values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--output", output_embedding_file + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + model_file, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "dummy_values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--output", + output_embedding_file, ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args_3) assert re.search( r'KeyError: "The parameter \'dummy_values_tokenized\' provided for \'--obsm_tokenized_values\' could not be found in adata.obsm"', - err.value.stdout.decode('utf-8')) + err.value.stdout.decode("utf-8"), + ) def test_finetuned_model(run_component, tmp_path): output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" - - run_component([ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", ft_model, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "sample", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--finetuned_checkpoints_key", "model_state_dict", - "--output", output_embedding_file - ]) + + run_component( + [ + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + ft_model, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--finetuned_checkpoints_key", + "model_state_dict", + "--output", + output_embedding_file, + ] + ) # Read output file output_mdata = mu.read(output_embedding_file) output_adata = output_mdata.mod["rna"] # check that embedding obs is present - assert 'X_scGPT' in output_adata.obsm.keys(), "X_scGPT is not present in anndata obsm keys" + assert ( + "X_scGPT" in output_adata.obsm.keys() + ), "X_scGPT is not present in anndata obsm keys" # check embedding size - assert output_adata.obsm["X_scGPT"].shape[1] == 512, "Embedding size does not equal 512" + assert ( + output_adata.obsm["X_scGPT"].shape[1] == 512 + ), "Embedding size does not equal 512" # check embedding value range - assert not all(np.isnan(output_adata.obsm["X_scGPT"][0])), "Embedding values are nan" - assert all([all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]]), "Range of embedding values is outside of [-1, 1]" + assert not all( + np.isnan(output_adata.obsm["X_scGPT"][0]) + ), "Embedding values are nan" + assert all( + [all(i > -1) & all(i < 1) for i in output_adata.obsm["X_scGPT"]] + ), "Range of embedding values is outside of 
[-1, 1]" def test_finetuned_model_architecture(run_component, tmp_path): output_embedding_file = tmp_path / "Kim2020_Lung_subset_embedded.h5mu" args = [ - "--input", tokenized_data_path, - "--modality", "rna", - "--model", ft_model, - "--model_vocab", vocab_file, - "--model_config", model_config_file, - "--dsbn", "True", - "--obs_batch_label", "sample", - "--obsm_gene_tokens", "gene_id_tokens", - "--obsm_tokenized_values", "values_tokenized", - "--obsm_padding_mask", "padding_mask", - "--finetuned_checkpoints_key", "dummy_checkpoints_key", - "--output", output_embedding_file + "--input", + tokenized_data_path, + "--modality", + "rna", + "--model", + ft_model, + "--model_vocab", + vocab_file, + "--model_config", + model_config_file, + "--dsbn", + "True", + "--obs_batch_label", + "sample", + "--obsm_gene_tokens", + "gene_id_tokens", + "--obsm_tokenized_values", + "values_tokenized", + "--obsm_padding_mask", + "padding_mask", + "--finetuned_checkpoints_key", + "dummy_checkpoints_key", + "--output", + output_embedding_file, ] with pytest.raises(subprocess.CalledProcessError) as err: run_component(args) assert re.search( - r'ValueError: The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.', - err.value.stdout.decode('utf-8')) + r"ValueError: The key \'dummy_checkpoints_key\' provided for \'--finetuned_checkpoints_key\' could not be found in the provided --model file. The finetuned model file for cell type annotation requires valid keys for the checkpoints and the label mapper.", + err.value.stdout.decode("utf-8"), + ) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/scgpt/pad_tokenize/script.py b/src/scgpt/pad_tokenize/script.py index 48dc924a3be..641e28a189e 100644 --- a/src/scgpt/pad_tokenize/script.py +++ b/src/scgpt/pad_tokenize/script.py @@ -21,11 +21,9 @@ "obsm_tokenized_values": "values_tokenized", "obsm_padding_mask": "padding_mask", "output_compression": None, - "var_input": "id_in_vocab" - } -meta = { - "resources_dir": "src/utils/" + "var_input": "id_in_vocab", } +meta = {"resources_dir": "src/utils/"} # mdata = mu.read(par["input"]) # mdata.mod["rna"].obsm["binned_counts"] = mdata.mod["rna"].layers["binned"] @@ -35,6 +33,7 @@ sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger from subset_vars import subset_vars + logger = setup_logger() logger.info("Reading in data") @@ -84,7 +83,9 @@ max_seq_len = par["max_seq_len"] # Tokenize and pad data -logger.info(f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}.") +logger.info( + f"Padding and tokenizing data with max length of {max_seq_len}, padding token {pad_token} and pad value {pad_value}." 
+)
 tokenized_data = tokenize_and_pad_batch(
     all_counts,
     gene_ids,
@@ -96,8 +97,8 @@
     include_zero_gene=False,
     return_pt=True,
     mod_type=None,
-    vocab_mod=None
-    )
+    vocab_mod=None,
+)
 
 all_gene_ids, all_values = tokenized_data["genes"], tokenized_data["values"]
 padding_mask = all_gene_ids.eq(vocab[pad_token])
diff --git a/src/scgpt/pad_tokenize/test.py b/src/scgpt/pad_tokenize/test.py
index c1ab1a1a9e9..0e3d161317d 100644
--- a/src/scgpt/pad_tokenize/test.py
+++ b/src/scgpt/pad_tokenize/test.py
@@ -1,7 +1,6 @@
 import pytest
 import sys
 import mudata as mu
-import numpy as np
 from scgpt.tokenizer.gene_tokenizer import GeneVocab
 
 ## VIASH START
@@ -9,11 +8,13 @@
     "resources_dir": "resources_test/scgpt",
     "executable": "./target/docker/scgpt/integration_pad_tokenize/integration_pad_tokenize",
     "temp_dir": "tmp",
-    "config": "./target/docker/scgpt/integration_pad_tokenize/.config.vsh.yaml"
+    "config": "./target/docker/scgpt/integration_pad_tokenize/.config.vsh.yaml",
 }
 ## VIASH END
 
-input_file = f"{meta['resources_dir']}/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu"
+input_file = (
+    f"{meta['resources_dir']}/scgpt/test_resources/Kim2020_Lung_subset_binned.h5mu"
+)
 vocab_file = f"{meta['resources_dir']}/scgpt/source/vocab.json"
 
 vocab = GeneVocab.from_file(vocab_file)
@@ -31,18 +32,30 @@ def binned_h5mu(random_h5mu_path):
 def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu):
     output = tmp_path / "Kim2020_Lung_tokenized.h5mu"
 
-    run_component([
-        "--input", binned_h5mu,
-        "--output", output,
-        "--modality", "rna",
-        "--obsm_gene_tokens", "gene_id_tokens",
-        "--obsm_tokenized_values", "values_tokenized",
-        "--obsm_padding_mask", "padding_mask",
-        "--pad_token", "<pad>",
-        "--pad_value", "-2",
-        "--input_obsm_binned_counts", "binned_counts",
-        "--model_vocab", vocab_file
-    ])
+    run_component(
+        [
+            "--input",
+            binned_h5mu,
+            "--output",
+            output,
+            "--modality",
+            "rna",
+            "--obsm_gene_tokens",
+            "gene_id_tokens",
+            "--obsm_tokenized_values",
+            "values_tokenized",
+            "--obsm_padding_mask",
+            "padding_mask",
+            "--pad_token",
+            "<pad>",
+            "--pad_value",
+            "-2",
+            "--input_obsm_binned_counts",
+            "binned_counts",
+            "--model_vocab",
+            vocab_file,
+        ]
+    )
 
     output_file = mu.read(output)
     output_adata = output_file.mod["rna"]
@@ -53,13 +66,23 @@ def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu):
 
     # check output dimensions
     ## nr of genes that are tokenized
-    assert gene_ids.shape[1] <= output_adata.var.shape[0] + 1, "gene_ids shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
-    assert values.shape[1] <= output_adata.var.shape[0] + 1, "values shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
-    assert padding_mask.shape[1] <= output_adata.var.shape[0] + 1, "padding_mask shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
+    assert (
+        gene_ids.shape[1] <= output_adata.var.shape[0] + 1
+    ), "gene_ids shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
+    assert (
+        values.shape[1] <= output_adata.var.shape[0] + 1
+    ), "values shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
+    assert (
+        padding_mask.shape[1] <= output_adata.var.shape[0] + 1
+    ), "padding_mask shape[1] is higher than adata.var.shape[0] (n_hvg + 1)"
 
     ## equal size of output tensors
-    assert gene_ids.shape == values.shape, "gene_ids shape[1] does not match values shape[1]"
-    assert gene_ids.shape == padding_mask.shape, "gene_ids shape[1] does not match padding_mask shape[1]"
+    assert (
+        gene_ids.shape == values.shape
+    ), "gene_ids shape[1] does not match values shape[1]"
+    assert (
+        
gene_ids.shape == padding_mask.shape
+    ), "gene_ids shape[1] does not match padding_mask shape[1]"
 
     ## check values of output tensors
     assert gene_ids.dtype == "int64", "tokenized gene_ids are not integers"
@@ -71,14 +94,26 @@ def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu):
     assert padding_mask.dtype == bool, "padding mask is not boolean"
 
     ## check cls token
-    assert (gene_ids[:, 0] == vocab["<cls>"]).all(), "cls token was not correctly appended at the beginning of the gene_ids tensor"
-    assert (values[:, 0] == 0).all(), "cls token was not correctly appended at the beginning of the values tensors"
+    assert (
+        gene_ids[:, 0] == vocab["<cls>"]
+    ).all(), (
+        "cls token was not correctly appended at the beginning of the gene_ids tensor"
+    )
+    assert (
+        values[:, 0] == 0
+    ).all(), (
+        "cls token was not correctly appended at the beginning of the values tensors"
+    )
 
     # check padding values
     masked_gene_ids = gene_ids[padding_mask]
     unmasked_gene_ids = gene_ids[~padding_mask]
-    assert all(masked_gene_ids == vocab["<pad>"]), "masked gene_ids contain non-pad tokens"
-    assert all(unmasked_gene_ids != vocab["<pad>"]), "unmasked gene_ids contain pad tokens"
+    assert all(
+        masked_gene_ids == vocab["<pad>"]
+    ), "masked gene_ids contain non-pad tokens"
+    assert all(
+        unmasked_gene_ids != vocab["<pad>"]
+    ), "unmasked gene_ids contain pad tokens"
 
     masked_values = values[padding_mask]
     unmasked_values = values[~padding_mask]
@@ -86,5 +121,5 @@ def test_integration_pad_tokenize(run_component, tmp_path, binned_h5mu):
     assert all(unmasked_values != -2), "unmasked values contain pad values"
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(pytest.main([__file__]))
diff --git a/src/transform/bpcells_regress_out/test.py b/src/transform/bpcells_regress_out/test.py
index 272c18cc69a..434240f6806 100644
--- a/src/transform/bpcells_regress_out/test.py
+++ b/src/transform/bpcells_regress_out/test.py
@@ -34,13 +34,16 @@ def output_h5mu_path(tmp_path):
 
 def test_regress_out(run_component, input_h5mu_path, output_h5mu_path):
-
     # execute command
     cmd_pars = [
-        "--input", input_h5mu_path,
-        "--output", output_h5mu_path,
-        "--obs_keys", "var",
-        "--output_compression", "gzip"
+        "--input",
+        input_h5mu_path,
+        "--output",
+        output_h5mu_path,
+        "--obs_keys",
+        "var",
+        "--output_compression",
+        "gzip",
     ]
     run_component(cmd_pars)
 
@@ -61,16 +64,22 @@ def test_regress_out(run_component, input_h5mu_path, output_h5mu_path):
     assert prot_in.shape == prot_out.shape, "Should have same shape as before"
 
     assert np.mean(rna_in.X) != np.mean(rna_out.X), "RNA expression should have changed"
-    assert np.mean(prot_in.X) == np.mean(prot_out.X), "Protein expression should remain the same"
-
+    assert np.mean(prot_in.X) == np.mean(
+        prot_out.X
+    ), "Protein expression should remain the same"
 
-def test_no_regress_out_without_obs_keys(run_component, input_h5mu_path, output_h5mu_path):
+
+def test_no_regress_out_without_obs_keys(
+    run_component, input_h5mu_path, output_h5mu_path
+):
     # execute command
     cmd_pars = [
-        "--input", input_h5mu_path,
-        "--output", output_h5mu_path,
-        "--output_compression", "gzip"
+        "--input",
+        input_h5mu_path,
+        "--output",
+        output_h5mu_path,
+        "--output_compression",
+        "gzip",
     ]
     run_component(cmd_pars)
 
@@ -80,19 +89,26 @@ def test_no_regress_out_without_obs_keys(run_component, input_h5mu_path, output_
     rna_in = mu_input.mod["rna"]
     rna_out = mu_output.mod["rna"]
 
-    assert np.mean(rna_in.X) == np.mean(rna_out.X), "RNA expression should remain the same"
+    assert np.mean(rna_in.X) == np.mean(
+        rna_out.X
+    ), "RNA 
expression should remain the same" def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_path): - # execute command cmd_pars = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--obs_keys", "var", - "--input_layer", "input", - "--output_layer", "output", - "--output_compression", "gzip" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--input_layer", + "input", + "--output_layer", + "output", + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -102,8 +118,10 @@ def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_pat rna_in = mu_input.mod["rna"] rna_out = mu_output.mod["rna"] - assert np.mean(rna_in.layers["input"]) != np.mean(rna_out.layers["output"]), "RNA expression should have changed" + assert np.mean(rna_in.layers["input"]) != np.mean( + rna_out.layers["output"] + ), "RNA expression should have changed" -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/transform/clr/script.py b/src/transform/clr/script.py index 4de097a9440..200956929a7 100644 --- a/src/transform/clr/script.py +++ b/src/transform/clr/script.py @@ -6,17 +6,17 @@ ## VIASH START par = { - 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu', - 'modality': 'prot', - 'output': "foo.h5mu", - 'layer': None, + "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu", + "modality": "prot", + "output": "foo.h5mu", + "layer": None, } ## VIASH END def main(): - input_h5mu = read_h5mu(par['input']) - modality = input_h5mu[par['modality']] + input_h5mu = read_h5mu(par["input"]) + modality = input_h5mu[par["modality"]] input_data = modality if par["input_layer"]: input_data = AnnData(X=input_data.layers[par["input_layer"]]) @@ -26,11 +26,14 @@ def main(): if not normalized_counts: raise RuntimeError("CLR failed to return the requested output layer") - output_layer_setter = partial(setattr, modality, "X") \ - if not par["output_layer"] \ - else partial(setitem, modality.layers, par["output_layer"]) + output_layer_setter = ( + partial(setattr, modality, "X") + if not par["output_layer"] + else partial(setitem, modality.layers, par["output_layer"]) + ) output_layer_setter(normalized_counts.X) - input_h5mu.write_h5mu(par['output'], compression=par["output_compression"]) + input_h5mu.write_h5mu(par["output"], compression=par["output_compression"]) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/transform/clr/test.py b/src/transform/clr/test.py index a3ba9130f4a..96ce3dd5f9c 100644 --- a/src/transform/clr/test.py +++ b/src/transform/clr/test.py @@ -5,32 +5,41 @@ ## VIASH START meta = { - 'executable': 'target/executable/transform/clr/clr', - 'resources_dir': './resources_test/', - 'cpus': 2, - 'config': "./src/transform/clr/config.vsh.yaml" + "executable": "target/executable/transform/clr/clr", + "resources_dir": "./resources_test/", + "cpus": 2, + "config": "./src/transform/clr/config.vsh.yaml", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + def test_clr(run_component, tmp_path): output_file = tmp_path / "foo.h5mu" - run_component([ - "--input", input_file, - "--output", str(output_file), - "--output_compression", "gzip", - "--output_layer", "clr" - ]) + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + 
"gzip", + "--output_layer", + "clr", + ] + ) assert output_file.is_file() output_h5mu = read_h5mu(output_file) - assert 'clr' in output_h5mu.mod['prot'].layers.keys() - assert output_h5mu.mod['prot'].layers['clr'] is not None + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None input = read_h5mu(input_file) - input_col = input.mod['prot'].X[:,0].toarray() - result_col = output_h5mu.mod['prot'].layers['clr'][:,0].toarray() - expected_col = np.log1p(input_col / np.exp(np.log1p(input_col).sum(axis=0) / input_col.size )) + input_col = input.mod["prot"].X[:, 0].toarray() + result_col = output_h5mu.mod["prot"].layers["clr"][:, 0].toarray() + expected_col = np.log1p( + input_col / np.exp(np.log1p(input_col).sum(axis=0) / input_col.size) + ) np.testing.assert_allclose(result_col, expected_col) @@ -38,60 +47,88 @@ def test_clr_select_input_layer(run_component, tmp_path): output_file = tmp_path / "foo.h5mu" input_data = read_h5mu(input_file) - input_data.mod['prot'].layers['test_layer'] = input_data.mod["prot"].X.copy() + input_data.mod["prot"].layers["test_layer"] = input_data.mod["prot"].X.copy() input_data.mod["prot"].X = None - + temp_input_file = tmp_path / "temp.h5mu" input_data.write(temp_input_file) - run_component([ - "--input", temp_input_file, - "--output", str(output_file), - "--output_compression", "gzip", - "--output_layer", "clr", - "--input_layer", "test_layer", - ]) + run_component( + [ + "--input", + temp_input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + "--output_layer", + "clr", + "--input_layer", + "test_layer", + ] + ) assert output_file.is_file() output_h5mu = read_h5mu(output_file) - assert 'clr' in output_h5mu.mod['prot'].layers.keys() - assert output_h5mu.mod['prot'].layers['clr'] is not None + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None + def test_clr_output_to_x(run_component, tmp_path): output_file = tmp_path / "foo.h5mu" - original_x = read_h5mu(input_file).mod['prot'].X - run_component([ - "--input", input_file, - "--output", str(output_file), - "--output_compression", "gzip", - ]) + original_x = read_h5mu(input_file).mod["prot"].X + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + ] + ) assert output_file.is_file() output_h5mu = read_h5mu(output_file) - assert 'clr' not in output_h5mu.mod['prot'].layers - assert not np.all(np.isclose(original_x.toarray(), - output_h5mu.mod['prot'].X.toarray(), - rtol=1e-07, atol=1e-07)) - + assert "clr" not in output_h5mu.mod["prot"].layers + assert not np.all( + np.isclose( + original_x.toarray(), + output_h5mu.mod["prot"].X.toarray(), + rtol=1e-07, + atol=1e-07, + ) + ) + + def test_clr_set_axis(run_component, tmp_path): output_file = tmp_path / "foo.h5mu" - run_component([ - "--input", input_file, - "--output", str(output_file), - "--output_compression", "gzip", - "--output_layer", "clr", - "--axis", "1", - ]) + run_component( + [ + "--input", + input_file, + "--output", + str(output_file), + "--output_compression", + "gzip", + "--output_layer", + "clr", + "--axis", + "1", + ] + ) assert output_file.is_file() output_h5mu = read_h5mu(output_file) - assert 'clr' in output_h5mu.mod['prot'].layers.keys() - assert output_h5mu.mod['prot'].layers['clr'] is not None + assert "clr" in output_h5mu.mod["prot"].layers.keys() + assert output_h5mu.mod["prot"].layers["clr"] is not None input = read_h5mu(input_file) - 
input_row = input.mod['prot'].X[0].toarray() - result_row = output_h5mu.mod['prot'].layers['clr'][0].toarray() - expected_row = np.log1p(input_row / np.exp(np.log1p(input_row).sum(axis=1) / input_row.size )) + input_row = input.mod["prot"].X[0].toarray() + result_row = output_h5mu.mod["prot"].layers["clr"][0].toarray() + expected_row = np.log1p( + input_row / np.exp(np.log1p(input_row).sum(axis=1) / input_row.size) + ) np.testing.assert_allclose(result_row, expected_row) if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/transform/delete_layer/script.py b/src/transform/delete_layer/script.py index 2dc7dbb29e0..7b40270c7df 100644 --- a/src/transform/delete_layer/script.py +++ b/src/transform/delete_layer/script.py @@ -4,19 +4,15 @@ from pathlib import Path ## VIASH START -from mudata import read_h5mu par = { "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu", "output": "output.h5mu", "modality": "rna", - "layer": ['log_normalized'], + "layer": ["log_normalized"], "missing_ok": False, - "output_compression": "lzf" -} -meta = { - "name": "delete_layer", - "resources_dir": "resources_test" + "output_compression": "lzf", } +meta = {"name": "delete_layer", "resources_dir": "resources_test"} ## VIASH END sys.path.append(meta["resources_dir"]) @@ -25,29 +21,40 @@ logger = setup_logger() + def main(): - input_file, output_file, mod_name = Path(par["input"]), Path(par["output"]), par['modality'] + input_file, output_file, mod_name = ( + Path(par["input"]), + Path(par["output"]), + par["modality"], + ) - logger.info('Reading input file %s, modality %s.', input_file, mod_name) + logger.info("Reading input file %s, modality %s.", input_file, mod_name) mod = read_h5ad(input_file, mod=mod_name) - for layer in par['layer']: + for layer in par["layer"]: if layer not in mod.layers: - if par['missing_ok']: + if par["missing_ok"]: continue raise ValueError(f"Layer '{layer}' is not present in modality {mod_name}.") - logger.info('Deleting layer %s from modality %s.', layer, mod_name) + logger.info("Deleting layer %s from modality %s.", layer, mod_name) del mod.layers[layer] - logger.info('Writing output to %s.', par['output']) - output_file_uncompressed = output_file.with_name(output_file.stem + "_uncompressed.h5mu") \ - if par["output_compression"] else output_file - shutil.copyfile(par['input'], output_file_uncompressed) + logger.info("Writing output to %s.", par["output"]) + output_file_uncompressed = ( + output_file.with_name(output_file.stem + "_uncompressed.h5mu") + if par["output_compression"] + else output_file + ) + shutil.copyfile(par["input"], output_file_uncompressed) write_h5ad(filename=output_file_uncompressed, mod=mod_name, data=mod) if par["output_compression"]: - compress_h5mu(output_file_uncompressed, output_file, compression=par["output_compression"]) + compress_h5mu( + output_file_uncompressed, output_file, compression=par["output_compression"] + ) output_file_uncompressed.unlink() - logger.info('Finished.') + logger.info("Finished.") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/transform/delete_layer/test.py b/src/transform/delete_layer/test.py index d02ad028d66..665cb0308ed 100644 --- a/src/transform/delete_layer/test.py +++ b/src/transform/delete_layer/test.py @@ -1,67 +1,92 @@ import sys import pytest -import re from mudata import read_h5mu from subprocess import CalledProcessError ## VIASH START meta = { - 'name': 
'./target/executable/transform/delete_layer/delete_layer', - 'resources_dir': './resources_test/' + "name": "./target/executable/transform/delete_layer/delete_layer", + "resources_dir": "./resources_test/", } ## VIASH END input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + def test_delete_layer(run_component, tmp_path): temp_input = tmp_path / "input.h5mu" temp_output = tmp_path / "output.h5mu" # create input file input = read_h5mu(input_file) - new_layer = input.mod['rna'].X - input.mod['rna'].layers['test'] = new_layer - assert "test" in input.mod['rna'].layers.keys() + new_layer = input.mod["rna"].X + input.mod["rna"].layers["test"] = new_layer + assert "test" in input.mod["rna"].layers.keys() input.write_h5mu(temp_input) # run command - run_component([ - "--input", str(temp_input), - "--modality", "rna", - "--layer", "test", - "--output", str(temp_output)]) - + run_component( + [ + "--input", + str(temp_input), + "--modality", + "rna", + "--layer", + "test", + "--output", + str(temp_output), + ] + ) + # check if output is correct assert temp_output.is_file() output = read_h5mu(temp_output) - assert 'test' not in output.mod['rna'].layers.keys() - assert set(output.mod) == {'rna', 'prot'} + assert "test" not in output.mod["rna"].layers.keys() + assert set(output.mod) == {"rna", "prot"} + def test_missing_layer_raises(run_component, tmp_path): output = tmp_path / "temp.h5mu" with pytest.raises(CalledProcessError) as err: - run_component([ - "--input", input_file, - "--modality", "rna", - "--layer", "test", - "--output", str(output)]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--layer", + "test", + "--output", + str(output), + ] + ) assert not output.is_file() - assert "Layer 'test' is not present in modality rna." in \ - err.value.stdout.decode('utf-8') + assert "Layer 'test' is not present in modality rna." 
in err.value.stdout.decode( + "utf-8" + ) + def test_missing_layer_missing_ok(run_component, tmp_path): output = tmp_path / "temp.h5mu" - run_component([ - "--input", input_file, - "--modality", "rna", - "--layer", "test", - "--output", str(output), - "--missing_ok"]) + run_component( + [ + "--input", + input_file, + "--modality", + "rna", + "--layer", + "test", + "--output", + str(output), + "--missing_ok", + ] + ) assert output.is_file() output_data = read_h5mu(output) - assert 'test' not in output_data.mod['rna'].layers.keys() - assert set(output_data.mod) == {'rna', 'prot'} + assert "test" not in output_data.mod["rna"].layers.keys() + assert set(output_data.mod) == {"rna", "prot"} + @pytest.mark.parametrize("output_compression", ["gzip", "lzf"]) def test_delete_layer_with_compression(run_component, tmp_path, output_compression): @@ -70,24 +95,32 @@ def test_delete_layer_with_compression(run_component, tmp_path, output_compressi # create temp input with 'test' layer original_input_data = read_h5mu(input_file) - new_layer = original_input_data.mod['rna'].X - original_input_data.mod['rna'].layers['test'] = new_layer + new_layer = original_input_data.mod["rna"].X + original_input_data.mod["rna"].layers["test"] = new_layer original_input_data.write_h5mu(temp_input) # run component - run_component([ - "--input", str(temp_input), - "--modality", "rna", - "--layer", "test", - "--output", str(output), - "--output_compression", output_compression]) - + run_component( + [ + "--input", + str(temp_input), + "--modality", + "rna", + "--layer", + "test", + "--output", + str(output), + "--output_compression", + output_compression, + ] + ) + # check if output is correct assert output.is_file() output_data = read_h5mu(output) - assert 'test' not in output_data.mod['rna'].layers.keys() - assert set(output_data.mod) == {'rna', 'prot'} + assert "test" not in output_data.mod["rna"].layers.keys() + assert set(output_data.mod) == {"rna", "prot"} if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/transform/log1p/run_test.py b/src/transform/log1p/run_test.py index 686b4069272..c79a728e7a7 100644 --- a/src/transform/log1p/run_test.py +++ b/src/transform/log1p/run_test.py @@ -5,24 +5,27 @@ import pandas as pd import sys import pytest -import sys import uuid from operator import attrgetter ## VIASH START meta = { - 'name': 'lognorm', - 'resources_dir': 'resources_test/', - 'config': './src/transform/log1p/config.vsh.yaml', - 'executable': "../../executable/docker/transform/log1p/log1p" + "name": "lognorm", + "resources_dir": "resources_test/", + "config": "./src/transform/log1p/config.vsh.yaml", + "executable": "../../executable/docker/transform/log1p/log1p", } ## VIASH END + @pytest.fixture def input_data(): - return mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu").copy() + return mu.read_h5mu( + f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + ).copy() + @pytest.fixture def random_h5mu_path(tmp_path): @@ -30,8 +33,10 @@ def wrapper(): unique_filename = f"{str(uuid.uuid4())}.h5mu" temp_file = tmp_path / unique_filename return temp_file + return wrapper + @pytest.mark.parametrize("output_layer", [None, "log_normalized"]) @pytest.mark.parametrize("input_layer", [None, "normalized"]) def test_1logp(run_component, input_data, output_layer, input_layer, random_h5mu_path): @@ -45,16 +50,23 @@ def 
test_1logp(run_component, input_data, output_layer, input_layer, random_h5mu
     input_path = random_h5mu_path()
     input_data.write(input_path)
     run_args = [
-        "--input", input_path,
-        "--output", output,
-        "--output_compression", "gzip"
+        "--input",
+        input_path,
+        "--output",
+        output,
+        "--output_compression",
+        "gzip",
    ]
     if output_layer:
         run_args.extend(["--output_layer", output_layer])
     if input_layer:
         run_args.extend(["--input_layer", input_layer])
     run_component(run_args)
 
-    get_output_layer = attrgetter("X") if not output_layer else lambda x: getattr(x, 'layers')[output_layer]
+    get_output_layer = (
+        attrgetter("X")
+        if not output_layer
+        else lambda x: getattr(x, "layers")[output_layer]
+    )
 
     assert path.exists(output), "No output was created."
 
@@ -72,30 +84,36 @@ def test_1logp(run_component, input_data, output_layer, input_layer, random_h5mu
     assert rna_in.shape == rna_out.shape, "Should have same shape as before"
     assert prot_in.shape == prot_out.shape, "Should have same shape as before"
     input_layer_data = rna_in.X if not input_layer else rna_in.layers[input_layer]
-    assert np.mean(input_layer_data) != np.mean(get_output_layer(rna_out)), "Expression should have changed"
+    assert np.mean(input_layer_data) != np.mean(
+        get_output_layer(rna_out)
+    ), "Expression should have changed"
 
     nz_row, nz_col = input_layer_data.nonzero()
-    row_corr = np.corrcoef(input_layer_data[nz_row[0],:].toarray().flatten(),
-                           get_output_layer(rna_out)[nz_row[0],:].toarray().flatten())[0,1]
-    col_corr = np.corrcoef(input_layer_data[:,nz_col[0]].toarray().flatten(),
-                           get_output_layer(rna_out)[:,nz_col[0]].toarray().flatten())[0,1]
-    assert row_corr > .1
-    assert col_corr > .1
+    row_corr = np.corrcoef(
+        input_layer_data[nz_row[0], :].toarray().flatten(),
+        get_output_layer(rna_out)[nz_row[0], :].toarray().flatten(),
+    )[0, 1]
+    col_corr = np.corrcoef(
+        input_layer_data[:, nz_col[0]].toarray().flatten(),
+        get_output_layer(rna_out)[:, nz_col[0]].toarray().flatten(),
+    )[0, 1]
+    assert row_corr > 0.1
+    assert col_corr > 0.1
 
-    assert 'log1p' in rna_out.uns
+    assert "log1p" in rna_out.uns
 
     # Make sure that the original input layer has not been overwritten
     layers_to_test = [None] + list(rna_in.layers.keys())
     for layer in layers_to_test:
         if layer != output_layer:
-            in_data = sc.get.var_df(rna_in,
-                                    keys=rna_in.obs_names.to_list(),
-                                    layer=layer)
-            out_data = sc.get.var_df(rna_out,
-                                     keys=rna_in.obs_names.to_list(),
-                                     layer=layer)
+            in_data = sc.get.var_df(
+                rna_in, keys=rna_in.obs_names.to_list(), layer=layer
+            )
+            out_data = sc.get.var_df(
+                rna_out, keys=rna_in.obs_names.to_list(), layer=layer
+            )
             pd.testing.assert_frame_equal(in_data, out_data)
 
-
-if __name__ == '__main__':
-    sys.exit(pytest.main([__file__]))
\ No newline at end of file
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
diff --git a/src/transform/log1p/script.py b/src/transform/log1p/script.py
index bc29cff87d2..49b74dd109b 100644
--- a/src/transform/log1p/script.py
+++ b/src/transform/log1p/script.py
@@ -16,6 +16,7 @@
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 logger.info("Reading input mudata")
@@ -27,14 +28,15 @@
 data = mdata.mod[mod]
 
 # Make our own copy with not a lot of data
-# this avoids excessive memory usage and accidental overwrites
-input_layer = data.layers[par["input_layer"]] \
-    if par["input_layer"] else data.X
+# this avoids excessive memory usage and accidental overwrites
+input_layer = data.layers[par["input_layer"]] if par["input_layer"] else data.X
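+# Only the matrix itself is wrapped in the throwaway AnnData below; obs/var
+# metadata is not copied, which keeps the temporary object small.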
data_for_scanpy = ad.AnnData(X=input_layer.copy()) -sc.pp.log1p(data_for_scanpy, - base=par["base"], - layer=None, # use X - copy=False) # allow overwrites in the copy that was made +sc.pp.log1p( + data_for_scanpy, + base=par["base"], + layer=None, # use X + copy=False, +) # allow overwrites in the copy that was made # Scanpy will overwrite the input layer. # So fetch input layer from the copy and use it to populate the output slot @@ -42,7 +44,7 @@ data.layers[par["output_layer"]] = data_for_scanpy.X else: data.X = data_for_scanpy.X -data.uns['log1p'] = data_for_scanpy.uns['log1p'].copy() +data.uns["log1p"] = data_for_scanpy.uns["log1p"].copy() logger.info("Writing to file %s", par["output"]) mdata.write_h5mu(filename=par["output"], compression=par["output_compression"]) diff --git a/src/transform/move_layer/script.py b/src/transform/move_layer/script.py index a791b8c3e32..00cdc85a4fa 100644 --- a/src/transform/move_layer/script.py +++ b/src/transform/move_layer/script.py @@ -13,13 +13,12 @@ "output_compression": None, } -meta = { - "resources_dir": "." -} +meta = {"resources_dir": "."} ### VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Read mudata from file") @@ -28,7 +27,9 @@ mod_data = mdata.mod[modality] -logger.info("Using input layer '%s'", "X" if not par["input_layer"] else par["input_layer"]) +logger.info( + "Using input layer '%s'", "X" if not par["input_layer"] else par["input_layer"] +) if par["input_layer"]: data_to_write = mod_data.layers[par["input_layer"]].copy() del mod_data.layers[par["input_layer"]] @@ -36,13 +37,12 @@ data_to_write = mod_data.X mod_data.X = None -output_layer_setter = partial(setattr, mod_data, "X") \ - if not par["output_layer"] \ - else partial(setitem, mod_data.layers, par["output_layer"]) +output_layer_setter = ( + partial(setattr, mod_data, "X") + if not par["output_layer"] + else partial(setitem, mod_data.layers, par["output_layer"]) +) output_layer_setter(data_to_write) logger.info("Write output to mudata file") -mdata.write_h5mu(par['output'], compression=par["output_compression"]) - - - +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/transform/move_layer/test.py b/src/transform/move_layer/test.py index 92e8d6f970a..82bfdca871d 100644 --- a/src/transform/move_layer/test.py +++ b/src/transform/move_layer/test.py @@ -6,62 +6,96 @@ ## VIASH START meta = { - 'executable': './target/executable/transform/move_layer/move_layer', - 'config': './src/transform/move_layer/config.vsh.yaml' + "executable": "./target/executable/transform/move_layer/move_layer", + "config": "./src/transform/move_layer/config.vsh.yaml", } ## VIASH END + @pytest.fixture def test_mudata(tmp_path): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"]) - obs = pd.DataFrame([["A", "sample1"], ["B", "sample2"]], index=df.index, columns=["Obs", "sample_id"]) - var = pd.DataFrame([["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], - index=df.columns, columns=["Feat", "sample_id_var"]) - obsm = pd.DataFrame([["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["obs1", "obs2"], columns=["var1", "var2", "var3"] + ) + obs = pd.DataFrame( + [["A", "sample1"], ["B", "sample2"]], + index=df.index, + columns=["Obs", "sample_id"], + ) + var = pd.DataFrame( + [["a", "sample1"], ["b", "sample2"], ["c", "sample1"]], + index=df.columns, + columns=["Feat", 
"sample_id_var"], + ) + obsm = pd.DataFrame( + [["X", "W"]], index=pd.Index([0]), columns=["uns_col1", "uns_col2"] + ) ad1 = AnnData(df, obs=obs, var=var, uns={"obsm1": obsm}) var2 = pd.DataFrame(["d", "e", "g"], index=df.columns, columns=["Feat"]) obs2 = pd.DataFrame(["C", "D"], index=df.index, columns=["Obs"]) ad2 = AnnData(df, obs=obs2, var=var2) test_h5mu = tmp_path / "input.h5mu" - mudata = MuData({'mod1': ad1, 'mod2': ad2}) + mudata = MuData({"mod1": ad1, "mod2": ad2}) mudata.write_h5mu(test_h5mu) return test_h5mu + def test_move_layer(test_mudata, run_component, tmp_path): output_file = tmp_path / "output.h5mu" - run_component([ - "--input", str(test_mudata), - "--modality", "mod1", - "--output_layer", "test_layer", - "--output", str(output_file) - ]) + run_component( + [ + "--input", + str(test_mudata), + "--modality", + "mod1", + "--output_layer", + "test_layer", + "--output", + str(output_file), + ] + ) assert output_file.is_file() output_mudata = read_h5mu(output_file) assert "test_layer" in output_mudata.mod["mod1"].layers - assert output_mudata.mod['mod1'].X is None + assert output_mudata.mod["mod1"].X is None + def test_move_layer_select_input_layer(test_mudata, run_component, tmp_path): output_file = tmp_path / "output.h5mu" - run_component([ - "--input", str(test_mudata), - "--modality", "mod1", - "--output_layer", "test_layer", - "--output", str(output_file) - ]) - output_file_2 = tmp_path / "output2.h5mu" - run_component([ - "--input", str(output_file), - "--modality", "mod1", - "--input_layer", "test_layer", - "--output_layer", "test_layer2", - "--output", str(output_file_2) - ]) + run_component( + [ + "--input", + str(test_mudata), + "--modality", + "mod1", + "--output_layer", + "test_layer", + "--output", + str(output_file), + ] + ) + output_file_2 = tmp_path / "output2.h5mu" + run_component( + [ + "--input", + str(output_file), + "--modality", + "mod1", + "--input_layer", + "test_layer", + "--output_layer", + "test_layer2", + "--output", + str(output_file_2), + ] + ) assert output_file_2.is_file() output_mudata = read_h5mu(output_file_2) assert "test_layer2" in output_mudata.mod["mod1"].layers assert "test_layer" not in output_mudata.mod["mod1"].layers - assert output_mudata.mod['mod1'].X is None + assert output_mudata.mod["mod1"].X is None + -if __name__ == '__main__': - sys.exit(pytest.main([__file__])) \ No newline at end of file +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/src/transform/normalize_total/script.py b/src/transform/normalize_total/script.py index 4da8b8605fe..81a6a65142f 100644 --- a/src/transform/normalize_total/script.py +++ b/src/transform/normalize_total/script.py @@ -8,13 +8,14 @@ "output": "output.h5mu", "target_sum": 10000, "modality": "rna", - "exclude_highly_expressed": False + "exclude_highly_expressed": False, } meta = {"name": "lognorm"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading input mudata") @@ -26,15 +27,21 @@ mod = par["modality"] logger.info("Performing total normalization on modality %s", mod) dat = mdata.mod[mod] -if par['input_layer'] and not par['input_layer'] in dat.layers.keys(): +if par["input_layer"] and par["input_layer"] not in dat.layers.keys(): raise ValueError(f"Input layer {par['input_layer']} not found in {mod}") -output_data = sc.pp.normalize_total(dat, - layer=par["input_layer"], - target_sum=par["target_sum"], - copy=True if par["output_layer"] else False) +output_data = 
sc.pp.normalize_total(
+    dat,
+    layer=par["input_layer"],
+    target_sum=par["target_sum"],
+    copy=True if par["output_layer"] else False,
+)
 if output_data:
-    result = output_data.X if not par["input_layer"] else output_data.layers[par["input_layer"]]
+    result = (
+        output_data.X
+        if not par["input_layer"]
+        else output_data.layers[par["input_layer"]]
+    )
     dat.layers[par["output_layer"]] = result
 
 logger.info("Writing to file")
diff --git a/src/transform/normalize_total/test.py b/src/transform/normalize_total/test.py
index 23880851692..5fb3f924140 100644
--- a/src/transform/normalize_total/test.py
+++ b/src/transform/normalize_total/test.py
@@ -1,25 +1,24 @@
 import sys
-import subprocess
 import pytest
-from os import path
 import mudata as mu
 import numpy as np
 
 ## VIASH START
-meta = {
-    'name': 'lognorm',
-    'resources_dir': 'resources_test/'
-}
+meta = {"name": "lognorm", "resources_dir": "resources_test/"}
 ## VIASH END
 
 input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu"
 
+
 def test_run(run_component, tmp_path):
     output = tmp_path / "output.h5mu"
 
     cmd_pars = [
-        "--input", input,
-        "--output", str(output),
-        "--output_compression", "gzip"
+        "--input",
+        input,
+        "--output",
+        str(output),
+        "--output_compression",
+        "gzip",
     ]
     run_component(cmd_pars)
 
@@ -42,18 +41,29 @@ def test_run(run_component, tmp_path):
     assert np.mean(rna_in.X) != np.mean(rna_out.X), "Expression should have changed"
 
     nz_row, nz_col = rna_in.X.nonzero()
-    row_corr = np.corrcoef(rna_in.X[nz_row[0],:].toarray().flatten(), rna_out.X[nz_row[0],:].toarray().flatten())[0,1]
-    col_corr = np.corrcoef(rna_in.X[:,nz_col[0]].toarray().flatten(), rna_out.X[:,nz_col[0]].toarray().flatten())[0,1]
-    assert row_corr > .1
-    assert col_corr > .1
-
+    row_corr = np.corrcoef(
+        rna_in.X[nz_row[0], :].toarray().flatten(),
+        rna_out.X[nz_row[0], :].toarray().flatten(),
+    )[0, 1]
+    col_corr = np.corrcoef(
+        rna_in.X[:, nz_col[0]].toarray().flatten(),
+        rna_out.X[:, nz_col[0]].toarray().flatten(),
+    )[0, 1]
+    assert row_corr > 0.1
+    assert col_corr > 0.1
+
+
 def test_target_sum(run_component, tmp_path):
     output = tmp_path / "output.h5mu"
 
     cmd_pars = [
-        "--input", input,
-        "--output", str(output),
-        "--output_compression", "gzip",
-        "--target_sum", "10000"
+        "--input",
+        input,
+        "--output",
+        str(output),
+        "--output_compression",
+        "gzip",
+        "--target_sum",
+        "10000",
     ]
     run_component(cmd_pars)
 
@@ -62,7 +72,10 @@ def test_target_sum(run_component, tmp_path):
     mu_output = mu.read_h5mu(output)
 
     rna_out = mu_output.mod["rna"]
-    assert np.all(np.abs(rna_out.X.sum(axis=1) - 10000) < 1), "Counts per cell should sum to the target sum"
+    assert np.all(
+        np.abs(rna_out.X.sum(axis=1) - 10000) < 1
+    ), "Counts per cell should sum to the target sum"
+
 
-if __name__ == '__main__':
-    sys.exit(pytest.main([__file__]))
\ No newline at end of file
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))
diff --git a/src/transform/regress_out/script.py b/src/transform/regress_out/script.py
index f1bb9244ffe..957be276a71 100644
--- a/src/transform/regress_out/script.py
+++ b/src/transform/regress_out/script.py
@@ -16,16 +16,14 @@
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
 logger.info("Reading input mudata")
 mdata = mu.read_h5mu(par["input"])
 
 mdata.var_names_make_unique()
 
-if (
-    par["obs_keys"] is not None
-    and len(par["obs_keys"]) > 0
-):
+if par["obs_keys"] is not None and len(par["obs_keys"]) > 0:
     mod = par["modality"]
     data = mdata.mod[mod]
 
@@ -36,9 +34,7 @@ 
logger.info("Regress out variables on modality %s", mod) sc.pp.regress_out( - sc_data, - keys=par["obs_keys"], - n_jobs=multiprocessing.cpu_count() - 1 + sc_data, keys=par["obs_keys"], n_jobs=multiprocessing.cpu_count() - 1 ) # Copy regressed data back to original input data diff --git a/src/transform/regress_out/test.py b/src/transform/regress_out/test.py index d0dac95d681..7ed403c15f7 100644 --- a/src/transform/regress_out/test.py +++ b/src/transform/regress_out/test.py @@ -4,12 +4,10 @@ import numpy as np ## VIASH START -meta = { - 'name': 'lognorm', - 'resources_dir': 'resources_test/' -} +meta = {"name": "lognorm", "resources_dir": "resources_test/"} ## VIASH END + @pytest.fixture def input_path(): return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" @@ -40,13 +38,16 @@ def output_h5mu_path(tmp_path): def test_regress_out(run_component, input_h5mu_path, output_h5mu_path): - # execute command cmd_pars = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--obs_keys", "var", - "--output_compression", "gzip" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -70,15 +71,20 @@ def test_regress_out(run_component, input_h5mu_path, output_h5mu_path): def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_path): - # execute command cmd_pars = [ - "--input", input_h5mu_path, - "--output", output_h5mu_path, - "--obs_keys", "var", - "--input_layer", "input", - "--output_layer", "output", - "--output_compression", "gzip" + "--input", + input_h5mu_path, + "--output", + output_h5mu_path, + "--obs_keys", + "var", + "--input_layer", + "input", + "--output_layer", + "output", + "--output_compression", + "gzip", ] run_component(cmd_pars) @@ -88,8 +94,10 @@ def test_regress_out_with_layers(run_component, input_h5mu_path, output_h5mu_pat rna_in = mu_input.mod["rna"] rna_out = mu_output.mod["rna"] - assert np.mean(rna_in.layers["input"]) != np.mean(rna_out.layers["output"]), "RNA expression should have changed" + assert np.mean(rna_in.layers["input"]) != np.mean( + rna_out.layers["output"] + ), "RNA expression should have changed" -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/transform/scale/script.py b/src/transform/scale/script.py index 9a91036707d..0662e8ee5e8 100644 --- a/src/transform/scale/script.py +++ b/src/transform/scale/script.py @@ -10,33 +10,43 @@ "output": "output.h5mu", "modality": "rna", "max_value": None, - "zero_center": True + "zero_center": True, } ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() + def main(): logger.info(f'Reading .h5mu file: {par["input"]}') mudata = read_h5mu(par["input"]) mod = par["modality"] data = mudata.mod[mod] logger.info("Scaling modality: %s", mod) - scanpy_output = scanpy.pp.scale(data, - layer=par["input_layer"], - zero_center=par["zero_center"], - max_value=par["max_value"], - copy=True) - output_layer_setter = partial(setattr, data, "X") \ - if not par["output_layer"] \ - else partial(setitem, data.layers, par["output_layer"]) - output_layer_setter(scanpy_output.X if not par["input_layer"] - else scanpy_output.layers[par["input_layer"]]) + scanpy_output = scanpy.pp.scale( + data, + layer=par["input_layer"], + zero_center=par["zero_center"], + max_value=par["max_value"], + copy=True, + ) + output_layer_setter = ( + partial(setattr, data, 
"X") + if not par["output_layer"] + else partial(setitem, data.layers, par["output_layer"]) + ) + output_layer_setter( + scanpy_output.X + if not par["input_layer"] + else scanpy_output.layers[par["input_layer"]] + ) logger.info("Writing to %s", par["output"]) mudata.write_h5mu(filename=par["output"], compression=par["output_compression"]) logger.info("Finished") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/transform/scale/test.py b/src/transform/scale/test.py index 057c52b166b..fbfeee3f25a 100644 --- a/src/transform/scale/test.py +++ b/src/transform/scale/test.py @@ -1,6 +1,5 @@ import sys import pytest -from uuid import uuid4 import numpy as np from mudata import read_h5mu @@ -12,32 +11,44 @@ } ## VIASH END + @pytest.fixture def input_path(): return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu" + @pytest.fixture def input_data(input_path): return read_h5mu(input_path) -def test_scaling_input_layer(run_component, input_data, write_mudata_to_file, random_h5mu_path): + +def test_scaling_input_layer( + run_component, input_data, write_mudata_to_file, random_h5mu_path +): """ The component must select the correct input layer. """ - input_data.mod['rna'].layers['test_layer'] = input_data.mod['rna'].X.copy() - del input_data.mod['rna'].X + input_data.mod["rna"].layers["test_layer"] = input_data.mod["rna"].X.copy() + del input_data.mod["rna"].X input_path = write_mudata_to_file(input_data) - output_file = random_h5mu_path() + output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_file, - "--input_layer", "test_layer", - "--ouput_compression", "gzip"]) + run_component( + [ + "--input", + input_path, + "--output", + output_file, + "--input_layer", + "test_layer", + "--ouput_compression", + "gzip", + ] + ) assert output_file.is_file() output_data = read_h5mu(output_file) - output_x = output_data.mod['rna'].X + output_x = output_data.mod["rna"].X mean = np.mean(output_x, axis=0, dtype=np.float64) variance = np.multiply(output_x, output_x).mean(axis=0, dtype=np.float64) - mean**2 variance[variance == 0] = 1 @@ -51,18 +62,28 @@ def test_scaling_output_layer(run_component, random_h5mu_path, input_path): """ output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_file, - "--output_layer", "scaled", - "--ouput_compression", "gzip"]) - + run_component( + [ + "--input", + input_path, + "--output", + output_file, + "--output_layer", + "scaled", + "--ouput_compression", + "gzip", + ] + ) + assert output_file.is_file() output_data = read_h5mu(output_file) - assert 'scaled' in output_data.mod['rna'].layers - output_scaled = output_data.mod['rna'].layers['scaled'] + assert "scaled" in output_data.mod["rna"].layers + output_scaled = output_data.mod["rna"].layers["scaled"] mean = np.mean(output_scaled, axis=0, dtype=np.float64) - variance = np.multiply(output_scaled, output_scaled).mean(axis=0, dtype=np.float64) - mean**2 + variance = ( + np.multiply(output_scaled, output_scaled).mean(axis=0, dtype=np.float64) + - mean**2 + ) variance[variance == 0] = 1 assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) @@ -74,73 +95,77 @@ def test_scaling(run_component, random_h5mu_path, input_path): """ output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", output_file, - "--ouput_compression", "gzip"]) - + run_component( + ["--input", input_path, "--output", 
output_file, "--ouput_compression", "gzip"] + ) + assert output_file.is_file() output_data = read_h5mu(output_file) - output_x = output_data.mod['rna'].X + output_x = output_data.mod["rna"].X mean = np.mean(output_x, axis=0, dtype=np.float64) variance = np.multiply(output_x, output_x).mean(axis=0, dtype=np.float64) - mean**2 variance[variance == 0] = 1 assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + def test_scaling_noncenter(run_component, random_h5mu_path, input_path): """ Check if centering can be disabled. """ output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", str(output_file), - "--zero_center", "false"]) + run_component( + ["--input", input_path, "--output", str(output_file), "--zero_center", "false"] + ) assert output_file.is_file() output_data = read_h5mu(output_file) - output_x = output_data.mod['rna'].X + output_x = output_data.mod["rna"].X mean = np.mean(output_x, axis=0, dtype=np.float64) assert not np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) + def test_scaling_maxvalue(run_component, random_h5mu_path, input_path): """ Check if output data is clipped when using --max_value """ output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", str(output_file), - "--max_value", "0.5"]) + run_component( + ["--input", input_path, "--output", str(output_file), "--max_value", "0.5"] + ) assert output_file.is_file() output_data = read_h5mu(output_file) - output_x = output_data.mod['rna'].X + output_x = output_data.mod["rna"].X assert np.all(output_x <= 0.5) + def test_scaling_modality(run_component, random_h5mu_path, input_path): """ Check if 'rna' modality remain untouched when using '--modality prot' argument. 
""" output_file = random_h5mu_path() - run_component([ - "--input", input_path, - "--output", str(output_file), - "--modality", "prot"]) + run_component( + ["--input", input_path, "--output", str(output_file), "--modality", "prot"] + ) assert output_file.is_file() - input_data = read_h5mu(input_path) + input_data = read_h5mu(input_path) output_data = read_h5mu(output_file) - output_rna = output_data.mod['rna'].X - assert np.allclose(input_data.mod['rna'].X.todense(), output_rna.todense(), equal_nan=True) + output_rna = output_data.mod["rna"].X + assert np.allclose( + input_data.mod["rna"].X.todense(), output_rna.todense(), equal_nan=True + ) - output_prot = output_data.mod['prot'].X + output_prot = output_data.mod["prot"].X mean = np.mean(output_prot, axis=0, dtype=np.float64) - variance = np.multiply(output_prot, output_prot).mean(axis=0, dtype=np.float64) - mean**2 + variance = ( + np.multiply(output_prot, output_prot).mean(axis=0, dtype=np.float64) - mean**2 + ) variance[variance == 0] = 1 assert np.all(np.isclose(mean, 0, rtol=1e-07, atol=1e-07)) assert np.all(np.isclose(variance, 1, rtol=1e-03, atol=1e-03)) + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/transform/tfidf/script.py b/src/transform/tfidf/script.py index 2c4a89fe369..2ff8b6818aa 100644 --- a/src/transform/tfidf/script.py +++ b/src/transform/tfidf/script.py @@ -13,13 +13,14 @@ "output_compression": "gzip", "log_idf": True, "log_tf": True, - "log_tfidf": False + "log_tfidf": False, } meta = {"name": "tfidf"} ## VIASH END sys.path.append(meta["resources_dir"]) from setup_logger import setup_logger + logger = setup_logger() logger.info("Reading input mudata") diff --git a/src/transform/tfidf/test.py b/src/transform/tfidf/test.py index 843eb5e72f8..97772027416 100644 --- a/src/transform/tfidf/test.py +++ b/src/transform/tfidf/test.py @@ -9,27 +9,25 @@ ## VIASH START meta = { - 'executable': './target/docker/transform/tfidf/tfidf', - 'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/", - 'config': './src/transform/tfidf/config.vsh.yaml', - 'cpus': 2 + "executable": "./target/docker/transform/tfidf/tfidf", + "resources_dir": "./resources_test/cellranger_atac_tiny_bcl/counts/", + "config": "./src/transform/tfidf/config.vsh.yaml", + "cpus": 2, } ## VIASH END + @pytest.fixture def synthetic_example(): - atac = sc.AnnData(np.array([ - [0, 0, 0], - [1, 0, 1], - [10, 0, 0], - [100, 0, 1], - [1000, 0, 0] - ])) + atac = sc.AnnData( + np.array([[0, 0, 0], [1, 0, 1], [10, 0, 0], [100, 0, 1], [1000, 0, 0]]) + ) atac.obs_names = ["A", "B", "C", "D", "E"] atac.var_names = ["x", "y", "z"] return md.MuData({"atac": atac}) + @pytest.fixture def example_mudata(tmp_path, synthetic_example): mdata_path = tmp_path / "example.h5mu" @@ -37,24 +35,33 @@ def example_mudata(tmp_path, synthetic_example): return mdata_path + @pytest.fixture def example_mudata_with_layer(tmp_path, synthetic_example): - synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy() - synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape) + synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod[ + "atac" + ].X.copy() + synthetic_example.mod["atac"].X = np.random.normal( + size=synthetic_example.mod["atac"].X.shape + ) mdata_path = tmp_path / "example.h5mu" synthetic_example.write(mdata_path) return mdata_path + @pytest.fixture def neurips_mudata(tmp_path): """From the `NeurIPS Multimodal Single-Cell Integration Challenge ` - - Link is 
taken from the Moscot repository: + + Link is taken from the Moscot repository: https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67 """ - adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503") + adata = sc.read( + "../data/neurips_data.h5ad", + backup_url="https://figshare.com/ndownloader/files/37993503", + ) mdata = md.MuData({"atac": adata}) mdata_path = tmp_path / "neurips.h5mu" @@ -62,6 +69,7 @@ def neurips_mudata(tmp_path): return mdata_path + @pytest.fixture def tiny_atac_mudata(tmp_path): resources_dir = Path(meta["resources_dir"]) @@ -72,15 +80,21 @@ def tiny_atac_mudata(tmp_path): return mdata_path -@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"]) + +@pytest.mark.parametrize( + "mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"] +) def test_output_layer(run_component, request, mudata, tmp_path): input_path = request.getfixturevalue(mudata) output_path = tmp_path / "foo.h5mu" args = [ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "atac", + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", ] run_component(args) @@ -89,15 +103,19 @@ def test_output_layer(run_component, request, mudata, tmp_path): assert "tfidf" in output_mdata.mod["atac"].layers.keys() + @pytest.mark.parametrize("mudata", ["example_mudata"]) def test_calculations_correctness(request, run_component, mudata, tmp_path): input_path = request.getfixturevalue(mudata) output_path = tmp_path / "foo.h5mu" args = [ - "--input", str(input_path), - "--output", str(output_path), - "--modality", "atac", + "--input", + str(input_path), + "--output", + str(output_path), + "--modality", + "atac", ] run_component(args + ["--scale_factor", "10000", "--output_layer", "tfidf_10000"]) @@ -106,27 +124,35 @@ def test_calculations_correctness(request, run_component, mudata, tmp_path): assert np.allclose( output_mdata.mod["atac"].layers["tfidf_10000"].toarray(), - np.array([[ np.nan, np.nan, np.nan], - [0.0382461 , 0. , 10.67027475], - [0.04135813, 0. , 0. ], - [0.04131346, 0. , 5.7693107 ], - [0.04135813, 0. , 0. ]]), - equal_nan=True - ) + np.array( + [ + [np.nan, np.nan, np.nan], + [0.0382461, 0.0, 10.67027475], + [0.04135813, 0.0, 0.0], + [0.04131346, 0.0, 5.7693107], + [0.04135813, 0.0, 0.0], + ] + ), + equal_nan=True, + ) run_component(args + ["--scale_factor", "100", "--output_layer", "tfidf_100"]) output_mdata = md.read(output_path) assert np.allclose( output_mdata.mod["atac"].layers["tfidf_100"].toarray(), - np.array([[ np.nan, np.nan, np.nan], - [0.01765529, 0. , 4.92564555], - [0.02072352, 0. , 0. ], - [0.02067929, 0. , 0.86213192], - [0.02072352, 0. , 0. 
]]), - equal_nan=True + np.array( + [ + [np.nan, np.nan, np.nan], + [0.01765529, 0.0, 4.92564555], + [0.02072352, 0.0, 0.0], + [0.02067929, 0.0, 0.86213192], + [0.02072352, 0.0, 0.0], + ] + ), + equal_nan=True, ) - + if __name__ == "__main__": sys.exit(pytest.main([__file__])) diff --git a/src/utils/compress_h5mu.py b/src/utils/compress_h5mu.py index 9d92395a573..b9aaeddf2f8 100644 --- a/src/utils/compress_h5mu.py +++ b/src/utils/compress_h5mu.py @@ -5,34 +5,46 @@ from functools import partial -def compress_h5mu(input_path: Union[str, Path], - output_path: Union[str, Path], - compression: Union[Literal['gzip'], Literal['lzf']]): +def compress_h5mu( + input_path: Union[str, Path], + output_path: Union[str, Path], + compression: Union[Literal["gzip"], Literal["lzf"]], +): input_path, output_path = str(input_path), str(output_path) def copy_attributes(in_object, out_object): for key, value in in_object.attrs.items(): out_object.attrs[key] = value - def visit_path(output_h5: H5File, - compression: Union[Literal['gzip'], Literal['lzf']], - name: str, object: Union[Group, Dataset]): - if isinstance(object, Group): - new_group = output_h5.create_group(name) - copy_attributes(object, new_group) - elif isinstance(object, Dataset): - # Compression only works for non-scalar Dataset objects - # Scalar objects dont have a shape defined - if not object.compression and object.shape not in [None, ()]: - new_dataset = output_h5.create_dataset(name, data=object, compression=compression) - copy_attributes(object, new_dataset) - else: - output_h5.copy(object, name) + def visit_path( + output_h5: H5File, + compression: Union[Literal["gzip"], Literal["lzf"]], + name: str, + object: Union[Group, Dataset], + ): + if isinstance(object, Group): + new_group = output_h5.create_group(name) + copy_attributes(object, new_group) + elif isinstance(object, Dataset): + # Compression only works for non-scalar Dataset objects + # Scalar objects dont have a shape defined + if not object.compression and object.shape not in [None, ()]: + new_dataset = output_h5.create_dataset( + name, data=object, compression=compression + ) + copy_attributes(object, new_dataset) else: - raise NotImplementedError(f"Could not copy element {name}, " - f"type has not been implemented yet: {type(object)}") + output_h5.copy(object, name) + else: + raise NotImplementedError( + f"Could not copy element {name}, " + f"type has not been implemented yet: {type(object)}" + ) - with H5File(input_path, 'r') as input_h5, H5File(output_path, 'w', userblock_size=512) as output_h5: + with ( + H5File(input_path, "r") as input_h5, + H5File(output_path, "w", userblock_size=512) as output_h5, + ): copy_attributes(input_h5, output_h5) input_h5.visititems(partial(visit_path, output_h5, compression)) @@ -46,4 +58,4 @@ def visit_path(output_h5: H5File, starting_metadata = starting_metadata[:truncate_location] with open(output_path, "br+") as f: nbytes = f.write(starting_metadata) - f.write(b"\0" * (512 - nbytes)) + f.write(b"\0" * (512 - nbytes)) diff --git a/src/utils/cross_check_genes.py b/src/utils/cross_check_genes.py index adc4457bd46..7588ec9a944 100644 --- a/src/utils/cross_check_genes.py +++ b/src/utils/cross_check_genes.py @@ -1,7 +1,9 @@ from typing import List -def cross_check_genes(query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100) -> List[str]: +def cross_check_genes( + query_genes: List[str], reference_genes: List[str], min_gene_overlap: int = 100 +) -> List[str]: """Cross check the overlap between two lists of genes Parameters 
@@ -17,6 +19,8 @@ def cross_check_genes(query_genes: List[str], reference_genes: List[str], min_ge
     List of overlapping genes
     """
     common_ens_ids = list(set(reference_genes).intersection(set(query_genes)))
-    assert len(common_ens_ids) >= min_gene_overlap, f"The intersection of genes between the query and reference dataset is too small, expected at least {min_gene_overlap}."
+    assert (
+        len(common_ens_ids) >= min_gene_overlap
+    ), f"The intersection of genes between the query and reference datasets is too small, expected at least {min_gene_overlap}."
 
     return common_ens_ids
diff --git a/src/utils/setup_logger.py b/src/utils/setup_logger.py
index ae71eb96115..3ca1cdb613c 100644
--- a/src/utils/setup_logger.py
+++ b/src/utils/setup_logger.py
@@ -9,4 +9,4 @@ def setup_logger():
     console_handler.setFormatter(logFormatter)
     logger.addHandler(console_handler)
 
-    return logger
\ No newline at end of file
+    return logger
diff --git a/src/utils/subset_vars.py b/src/utils/subset_vars.py
index 64071e6d41a..d8cb05b2d3d 100644
--- a/src/utils/subset_vars.py
+++ b/src/utils/subset_vars.py
@@ -1,6 +1,6 @@
 def subset_vars(adata, subset_col):
     """Subset AnnData object on highly variable genes
-    
+
     Parameters
     ----------
     adata : AnnData
@@ -13,7 +13,9 @@ def subset_vars(adata, subset_col):
     AnnData
         Copy of `adata` with subsetted features
     """
-    if not subset_col in adata.var.columns:
-        raise ValueError(f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available.")
+    if subset_col not in adata.var.columns:
+        raise ValueError(
+            f"Requested to use .var column '{subset_col}' as a selection of genes, but the column is not available."
+        )
 
     return adata[:, adata.var[subset_col]].copy()
diff --git a/src/velocity/scvelo/script.py b/src/velocity/scvelo/script.py
index cb632ca184e..d3eda1f2992 100644
--- a/src/velocity/scvelo/script.py
+++ b/src/velocity/scvelo/script.py
@@ -6,57 +6,65 @@
 
 # Backwards compatibility for numpy 2.0
 import numpy
-numpy_module = sys.modules['numpy']
+
+numpy_module = sys.modules["numpy"]
 numpy_module.float_ = numpy.float64
-sys.modules['numpy'] = numpy_module
+sys.modules["numpy"] = numpy_module
 
 # Backwards compatibility for scipy
-import scipy
-scipy_module = sys.modules['scipy']
+import scipy  # noqa: F401
+
+scipy_module = sys.modules["scipy"]
 scipy_module.sparse._base._spbase.A = property(lambda self: self.toarray())
-sys.modules['scipy'] = scipy_module
+sys.modules["scipy"] = scipy_module
 
 import scvelo
 
 ## VIASH START
 from collections import defaultdict
 
+
 def none_factory():
     return None
 
-par = defaultdict(none_factory, {
-    'input': './resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom',
-    'output': './foo',
-    'log_transform': True,
-    'n_neighbors': 30
-})
+
+par = defaultdict(
+    none_factory,
+    {
+        "input": "./resources_test/rna_velocity/velocyto_processed/cellranger_tiny.loom",
+        "output": "./foo",
+        "log_transform": True,
+        "n_neighbors": 30,
+    },
+)
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
 from setup_logger import setup_logger
+
 logger = setup_logger()
 
-mpl.rcParams['savefig.dpi']=150
+mpl.rcParams["savefig.dpi"] = 150
+
 
 # Script must be wrapped into a main function because scvelo spawns subprocesses
 # and this fails when the functions are not wrapped.
 def main():
     # Create output directory
-    output_dir = Path(par['output'])
+    output_dir = Path(par["output"])
     output_dir.mkdir(parents=True, exist_ok=True)
     scvelo.settings.figdir = str(output_dir)
 
-
     # Calculate the sample name
     sample_name = par["output"].removesuffix(".loom")
     sample_name = Path(sample_name).name
 
     # Read the input data
-    adata = scvelo.read(par['input'])
+    adata = scvelo.read(par["input"])
 
     # Save spliced vs unspliced proportions to file
-    with (output_dir / "proportions.txt").open('w') as target:
+    with (output_dir / "proportions.txt").open("w") as target:
         with redirect_stdout(target):
             scvelo.utils.show_proportions(adata)
@@ -64,21 +72,22 @@ def main():
     scvelo.pl.proportions(adata, save=True, show=False)
 
     # Perform preprocessing
-    scvelo.pp.filter_and_normalize(adata,
-                                   min_counts=par["min_counts"],
-                                   min_counts_u=par["min_counts_u"],
-                                   min_cells=par["min_cells"],
-                                   min_cells_u=par["min_cells_u"],
-                                   min_shared_counts=par["min_shared_counts"],
-                                   min_shared_cells=par["min_shared_cells"],
-                                   n_top_genes=par["n_top_genes"],
-                                   log=par["log_transform"])
+    scvelo.pp.filter_and_normalize(
+        adata,
+        min_counts=par["min_counts"],
+        min_counts_u=par["min_counts_u"],
+        min_cells=par["min_cells"],
+        min_cells_u=par["min_cells_u"],
+        min_shared_counts=par["min_shared_counts"],
+        min_shared_cells=par["min_shared_cells"],
+        n_top_genes=par["n_top_genes"],
+        log=par["log_transform"],
+    )
 
     # Fitting
-    scvelo.pp.moments(adata,
-                      n_pcs=par["n_principal_components"],
-                      n_neighbors=par["n_neighbors"])
-
+    scvelo.pp.moments(
+        adata, n_pcs=par["n_principal_components"], n_neighbors=par["n_neighbors"]
+    )
 
     # Second step in velocyto calculations
     # Velocity calculation and visualization
@@ -88,15 +97,22 @@ def main():
     scvelo.tl.recover_dynamics(adata)
     scvelo.tl.velocity(adata, mode="dynamical")
     scvelo.tl.velocity_graph(adata)
-    scvelo.pl.velocity_graph(adata, save=str(output_dir / "scvelo_graph.pdf"), show=False)
+    scvelo.pl.velocity_graph(
+        adata, save=str(output_dir / "scvelo_graph.pdf"), show=False
+    )
 
     # Plotting
     # TODO: add more here.
-    scvelo.pl.velocity_embedding_stream(adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False)
+    scvelo.pl.velocity_embedding_stream(
+        adata, save=str(output_dir / "scvelo_embedding.pdf"), show=False
+    )
 
     # Create output
-    ouput_data = mudata.MuData({'rna_velocity': adata})
-    ouput_data.write_h5mu(output_dir / f"{sample_name}.h5mu", compression=par["output_compression"])
+    output_data = mudata.MuData({"rna_velocity": adata})
+    output_data.write_h5mu(
+        output_dir / f"{sample_name}.h5mu", compression=par["output_compression"]
+    )
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/src/velocity/scvelo/test.py b/src/velocity/scvelo/test.py
index 8645b6ab973..7c4d3c5ce55 100644
--- a/src/velocity/scvelo/test.py
+++ b/src/velocity/scvelo/test.py
@@ -4,31 +4,37 @@
 
 ## VIASH START
 meta = {
-    'name': './target/executable/projection/scvelo/scvelo',
-    'resources_dir': './resources_test/'
+    "name": "./target/executable/projection/scvelo/scvelo",
+    "resources_dir": "./resources_test/",
 }
 ## VIASH END
 
 input_loom = f"{meta['resources_dir']}/cellranger_tiny.loom"
 
-
 def test_scvelo(run_component, tmp_path):
     output_dir = tmp_path / "foo"
-    run_component([
-        "--input", input_loom,
-        "--output", str(output_dir),
-        "--output_compression", "gzip"])
-
+    run_component(
+        [
+            "--input",
+            input_loom,
+            "--output",
+            str(output_dir),
+            "--output_compression",
+            "gzip",
+        ]
+    )
+
     assert output_dir.is_dir()
     assert (output_dir / "scvelo_proportions.pdf").is_file()
     assert (output_dir / "scvelo_embedding.pdf").is_file()
     assert (output_dir / "scvelo_graph.pdf").is_file()
     assert (output_dir / "proportions.txt").is_file()
     assert (output_dir / "foo.h5mu").is_file()
-
+
     output_data = read_h5mu(output_dir / "foo.h5mu")
     assert "rna_velocity" in output_data.mod.keys()
 
+
 if __name__ == "__main__":
-    sys.exit(pytest.main([__file__]))
\ No newline at end of file
+    sys.exit(pytest.main([__file__]))
diff --git a/src/velocity/velocyto/test.py b/src/velocity/velocyto/test.py
index cdfa6eaddc4..b21c416d39f 100644
--- a/src/velocity/velocyto/test.py
+++ b/src/velocity/velocyto/test.py
@@ -7,8 +7,8 @@
 
 ## VIASH START
 meta = {
-    'name': './target/executable/projection/velocyto/velocyto',
-    'resources_dir': './resources_test/'
+    "name": "./target/executable/projection/velocyto/velocyto",
+    "resources_dir": "./resources_test/",
 }
 ## VIASH END
 
@@ -18,67 +18,100 @@
 input_barcodes_bd = f"{meta['resources_dir']}/rna_velocity/velocyto/barcodes.txt"
 
 # input data for 10x bam
-input_bam_cellranger = f"{meta['resources_dir']}/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"
+input_bam_cellranger = (
+    f"{meta['resources_dir']}/cellranger_tiny_fastq/bam/possorted_genome_bam.bam"
+)
 input_gtf_cellranger = f"{meta['resources_dir']}/cellranger_tiny_fastq/cellranger_tiny_ref/genes/genes.gtf.gz"
 
+
 def test_velocyto_cellranger(run_component, tmp_path):
     """Check whether component accepts compressed gtf files"""
-    
+
     output_file = tmp_path / "foo" / "velocyto.loom"
-    
-    run_component([
-        "--input", input_bam_cellranger,
-        "--transcriptome", input_gtf_cellranger,
-        "--output", str(output_file)])
+
+    run_component(
+        [
+            "--input",
+            input_bam_cellranger,
+            "--transcriptome",
+            input_gtf_cellranger,
+            "--output",
+            str(output_file),
+        ]
+    )
 
     assert output_file.is_file()
 
     input_barcodes = set()
-    with pysam.AlignmentFile(input_bam_cellranger, 'r') as input_bam:
+    with pysam.AlignmentFile(input_bam_cellranger, "r") as input_bam:
         for read in input_bam:
             tags = dict(read.tags)
-            cell_barcode = tags.get('CB')
+            cell_barcode = tags.get("CB")
tags.get("CB") if cell_barcode: input_barcodes.add(cell_barcode.removesuffix("-1")) with loompy.connect(output_file) as ds: - result_barcodes = {tag.removeprefix('velocyto:').removesuffix('x') for tag in ds.ca.CellID} + result_barcodes = { + tag.removeprefix("velocyto:").removesuffix("x") for tag in ds.ca.CellID + } assert result_barcodes.issubset(input_barcodes) - assert ds.ca.keys() == ['CellID'] - assert ds.ra.keys(), ['Accession', 'Chromosome', 'End', 'Gene', 'Start' == 'Strand'] + assert ds.ca.keys() == ["CellID"] + assert ds.ra.keys(), [ + "Accession", + "Chromosome", + "End", + "Gene", + "Start" == "Strand", + ] rows, cols = ds.shape assert rows > 0 assert cols > 0 + def test_velocyto_bd_rhapsody(run_component, tmp_path): """Check whether component also accepts uncompressed gtf files""" output_file = tmp_path / "foo" / "velocyto.loom" transcriptome = tmp_path / "genes.gtf" - + with open(transcriptome, "wb") as gtf_uncompressed: - with gzip.open(input_gtf_bd, 'rb') as gtf_compressed: + with gzip.open(input_gtf_bd, "rb") as gtf_compressed: shutil.copyfileobj(gtf_compressed, gtf_uncompressed) - run_component([ - "--input", input_bam_bd, - "--transcriptome", str(transcriptome), - "--output", str(output_file), - "--barcode", input_barcodes_bd - ]) + run_component( + [ + "--input", + input_bam_bd, + "--transcriptome", + str(transcriptome), + "--output", + str(output_file), + "--barcode", + input_barcodes_bd, + ] + ) assert output_file.is_file() input_barcodes = set() - with open(input_barcodes_bd, 'r') as barcodes_file: + with open(input_barcodes_bd, "r") as barcodes_file: for barcode in barcodes_file: input_barcodes.add(barcode.strip()) - + with loompy.connect(output_file) as ds: - result_barcodes = {tag.removeprefix('velocyto:').removesuffix('x') for tag in ds.ca.CellID} + result_barcodes = { + tag.removeprefix("velocyto:").removesuffix("x") for tag in ds.ca.CellID + } assert result_barcodes.issubset(input_barcodes) - assert ds.ca.keys() == ['CellID'] - assert ds.ra.keys(), ['Accession', 'Chromosome', 'End', 'Gene', 'Start' == 'Strand'] + assert ds.ca.keys() == ["CellID"] + assert ds.ra.keys(), [ + "Accession", + "Chromosome", + "End", + "Gene", + "Start" == "Strand", + ] rows, cols = ds.shape assert rows > 0 assert cols > 0 + if __name__ == "__main__": - sys.exit(pytest.main([__file__])) \ No newline at end of file + sys.exit(pytest.main([__file__])) diff --git a/src/velocity/velocyto_to_h5mu/script.py b/src/velocity/velocyto_to_h5mu/script.py index 2c1692d4325..0cfbb7610e1 100644 --- a/src/velocity/velocyto_to_h5mu/script.py +++ b/src/velocity/velocyto_to_h5mu/script.py @@ -2,10 +2,11 @@ import anndata as ad import mudata as mu import numpy as np -numpy_module = sys.modules['numpy'] + +numpy_module = sys.modules["numpy"] numpy_module.string_ = np.bytes_ numpy_module.unicode_ = np.str_ -sys.modules['numpy'] = numpy_module +sys.modules["numpy"] = numpy_module ## VIASH START par = { @@ -15,7 +16,7 @@ "output": "output.h5mu", "layer_spliced": "velo_spliced", "layer_unspliced": "velo_unspliced", - "layer_ambiguous": "velo_ambiguous" + "layer_ambiguous": "velo_ambiguous", } ## VIASH END @@ -32,8 +33,8 @@ layers={ par["layer_spliced"]: adata_in.layers["spliced"], par["layer_unspliced"]: adata_in.layers["unspliced"], - par["layer_ambiguous"]: adata_in.layers["ambiguous"] - } + par["layer_ambiguous"]: adata_in.layers["ambiguous"], + }, ) if par["input_h5mu"]: diff --git a/src/velocity/velocyto_to_h5mu/test.py b/src/velocity/velocyto_to_h5mu/test.py index 23092ace2ff..3eecf632e26 100644 --- 
+++ b/src/velocity/velocyto_to_h5mu/test.py
@@ -1,9 +1,10 @@
 import sys
 import numpy as np
-numpy_module = sys.modules['numpy']
+
+numpy_module = sys.modules["numpy"]
 numpy_module.string_ = np.bytes_
 numpy_module.unicode_ = np.str_
-sys.modules['numpy'] = numpy_module
+sys.modules["numpy"] = numpy_module
 
 import subprocess
 import pathlib
@@ -12,8 +13,8 @@
 
 ## VIASH START
 meta = {
-    'name': './target/native/convert/from_velocyto_to_h5mu/from_velocyto_to_h5mu',
-    'resources_dir': './resources_test/'
+    "name": "./target/native/convert/from_velocyto_to_h5mu/from_velocyto_to_h5mu",
+    "resources_dir": "./resources_test/",
 }
 ## VIASH END
 
@@ -32,9 +33,10 @@
         input_h5mu,
         "--output",
         output,
-        "--output_compression", "gzip"
+        "--output_compression",
+        "gzip",
     ],
-    check=True
+    check=True,
 )
 
 print("Checking whether output exists", flush=True)
@@ -47,7 +49,8 @@
 assert list(output_data.mod.keys()) == ["rna", "rna_velocity"]
 
 with loompy.connect(input_loom) as ds:
-    mshape = output_data.mod['rna_velocity'].shape[::-1]
+    mshape = output_data.mod["rna_velocity"].shape[::-1]
     lshape = ds.shape
-    assert mshape == lshape, \
-        f"Expected mudata shape {mshape} to be the same the loom shape {lshape}"
\ No newline at end of file
+    assert (
+        mshape == lshape
+    ), f"Expected mudata shape {mshape} to be the same as the loom shape {lshape}"
diff --git a/src/workflows/test_workflows/annotation/scgpt/script.py b/src/workflows/test_workflows/annotation/scgpt/script.py
index 658fc45f27c..6409374a291 100644
--- a/src/workflows/test_workflows/annotation/scgpt/script.py
+++ b/src/workflows/test_workflows/annotation/scgpt/script.py
@@ -1,5 +1,4 @@
 from mudata import read_h5mu
-import numpy as np
 import shutil
 import os
 import sys
@@ -7,33 +6,46 @@
 import pytest
 
 ##VIASH START
-par = {
-    "input": "input.h5mu"
-}
+par = {"input": "input.h5mu"}
 
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ##VIASH END
 
 
 def test_run():
     input_mudata = read_h5mu(par["input"])
-    expected_obsm = ["gene_id_tokens", "values_tokenized", "padding_mask", "bin_edges", "binned_counts"]
+    expected_obsm = [
+        "gene_id_tokens",
+        "values_tokenized",
+        "padding_mask",
+        "bin_edges",
+        "binned_counts",
+    ]
     expected_var = ["scgpt_filter_with_hvg", "scgpt_cross_checked_genes"]
     expected_obs = ["scgpt_pred", "scgpt_probability"]
 
     assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality."
-    assert all(key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm), f"Input mod['rna'] obs columns should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}."
-    assert all(key in list(input_mudata.mod["rna"].var) for key in expected_var), f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}."
-    assert all(key in list(input_mudata.mod["rna"].obs) for key in expected_obs), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}."
+    assert all(
+        key in list(input_mudata.mod["rna"].obsm) for key in expected_obsm
+    ), f"Input mod['rna'] obsm keys should be: {expected_obsm}, found: {input_mudata.mod['rna'].obsm.keys()}."
+    assert all(
+        key in list(input_mudata.mod["rna"].var) for key in expected_var
+    ), f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}."
+    assert all(
+        key in list(input_mudata.mod["rna"].obs) for key in expected_obs
+    ), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}."
 
     # hvg subsetting is not exact - add 10% to allowed data shape
-    assert input_mudata.mod["rna"].obsm["binned_counts"].shape[1] <= par["n_hvg"] + 0.1 * par["n_hvg"], f"Input shape should be lower or equal than --n_hvg {par['n_hvg']}, found: {input_mudata.shape[1]}."
+    assert (
+        input_mudata.mod["rna"].obsm["binned_counts"].shape[1]
+        <= par["n_hvg"] + 0.1 * par["n_hvg"]
+    ), f"Input shape should be less than or equal to --n_hvg {par['n_hvg']}, found: {input_mudata.shape[1]}."
 
 
 if __name__ == "__main__":
     HERE_DIR = Path(__file__).resolve().parent
-    shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"),
-                    os.path.join(HERE_DIR, "conftest.py"))
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
     sys.exit(pytest.main(["--import-mode=importlib"]))
diff --git a/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py b/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py
index ea6453cc564..2617602f97f 100644
--- a/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py
+++ b/src/workflows/test_workflows/ingestion/bd_rhapsody/script.py
@@ -7,32 +7,44 @@
 import pytest
 
 ##VIASH START
-par = {
-    "input": "input.h5mu"
-}
+par = {"input": "input.h5mu"}
 
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ##VIASH END
 
+
 def test_run():
     input_mudata = read_h5mu(par["input"])
-    expected_var = ['gene_name', 'feature_type', 'reference_file', "gene_ids"]
-    expected_obs = ['run_id', 'library_id', 'cell_id']
+    expected_var = ["gene_name", "feature_type", "reference_file", "gene_ids"]
+    expected_obs = ["run_id", "library_id", "cell_id"]
 
     assert "rna" in list(input_mudata.mod.keys()), "Input should contain rna modality."
     assert "prot" in list(input_mudata.mod.keys()), "Input should contain prot modality."
     # assert list(input_mudata.var.columns) == expected_var, f"Input var columns should be: {expected_var}."
-    assert all(key in list(input_mudata.mod["rna"].var.columns) for key in expected_var), f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod["rna"].var.keys()}."
-    assert all(key in list(input_mudata.mod["rna"].obs.columns) for key in expected_obs), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod["rna"].obs.keys()}."
-    assert all(key in list(input_mudata.mod["prot"].var.columns) for key in expected_var), f"Input mod['prot'] var columns should be: {expected_var}, found: {input_mudata.mod["prot"].var.keys()}."
-    assert all(key in list(input_mudata.mod["prot"].obs.columns) for key in expected_obs), f"Input mod ['prot'] obs columns should be: {expected_obs}, found: {input_mudata.mod["prot"].obs.keys()}."
-    assert np.array_equal(input_mudata.mod["rna"].var["feature_type"].unique(), ["Gene Expression"]), "Output X should only contain Gene Expression vars."
-    assert np.array_equal(input_mudata.mod["prot"].var["feature_type"].unique(), ["Antibody Capture"]), "Output X should only contain Gene Expression vars."
+    assert all(
+        key in list(input_mudata.mod["rna"].var.columns) for key in expected_var
+    ), f"Input mod['rna'] var columns should be: {expected_var}, found: {input_mudata.mod['rna'].var.keys()}."
+    assert all(
+        key in list(input_mudata.mod["rna"].obs.columns) for key in expected_obs
+    ), f"Input mod['rna'] obs columns should be: {expected_obs}, found: {input_mudata.mod['rna'].obs.keys()}."
+    assert all(
+        key in list(input_mudata.mod["prot"].var.columns) for key in expected_var
+    ), f"Input mod['prot'] var columns should be: {expected_var}, found: {input_mudata.mod['prot'].var.keys()}."
+    assert all(
+        key in list(input_mudata.mod["prot"].obs.columns) for key in expected_obs
+    ), f"Input mod['prot'] obs columns should be: {expected_obs}, found: {input_mudata.mod['prot'].obs.keys()}."
+    assert np.array_equal(
+        input_mudata.mod["rna"].var["feature_type"].unique(), ["Gene Expression"]
+    ), "Output X should only contain Gene Expression vars."
+    assert np.array_equal(
+        input_mudata.mod["prot"].var["feature_type"].unique(), ["Antibody Capture"]
+    ), "Output X should only contain Antibody Capture vars."
 
 
 if __name__ == "__main__":
     HERE_DIR = Path(__file__).resolve().parent
-    shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"),
-                    os.path.join(HERE_DIR, "conftest.py"))
-    sys.exit(pytest.main(["--import-mode=importlib"]))
\ No newline at end of file
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
+    sys.exit(pytest.main(["--import-mode=importlib"]))
diff --git a/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py b/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py
index e1cecb1a91b..f51caa85eb0 100644
--- a/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py
+++ b/src/workflows/test_workflows/ingestion/cellranger_mapping/script.py
@@ -6,25 +6,31 @@
 import pytest
 
 ##VIASH START
-par = {
-    "input": "input.h5mu"
-}
+par = {"input": "input.h5mu"}
 
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ##VIASH END
 
+
 def test_run():
     input_mudata = read_h5mu(par["input"])
-    expected_colnames = ['gene_symbol', 'feature_types', 'genome']
+    expected_colnames = ["gene_symbol", "feature_types", "genome"]
+
+    assert list(input_mudata.mod.keys()) == [
+        "rna"
+    ], "Input should contain rna modality."
+    assert (
+        list(input_mudata.var.columns) == expected_colnames
+    ), f"Input var columns should be: {expected_colnames}."
+    assert (
+        list(input_mudata.mod["rna"].var.columns) == expected_colnames
+    ), f"Input mod['rna'] var columns should be: {expected_colnames}."
 
-    assert list(input_mudata.mod.keys()) == ["rna"], "Input should contain rna modality."
-    assert list(input_mudata.var.columns) == expected_colnames, f"Input var columns should be: {expected_colnames}."
-    assert list(input_mudata.mod["rna"].var.columns) == expected_colnames, f"Input mod['rna'] var columns should be: {expected_colnames}."
if __name__ == "__main__": HERE_DIR = Path(__file__).resolve().parent - shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"), - os.path.join(HERE_DIR, "conftest.py")) - sys.exit(pytest.main(["--import-mode=importlib"])) \ No newline at end of file + shutil.copyfile( + os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"), + os.path.join(HERE_DIR, "conftest.py"), + ) + sys.exit(pytest.main(["--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/cellranger_multi/script.py b/src/workflows/test_workflows/ingestion/cellranger_multi/script.py index b4aca775929..62d225a8068 100644 --- a/src/workflows/test_workflows/ingestion/cellranger_multi/script.py +++ b/src/workflows/test_workflows/ingestion/cellranger_multi/script.py @@ -6,26 +6,35 @@ import pytest ##VIASH START -par = { - "input": "input.h5mu" -} +par = {"input": "input.h5mu"} -meta = { - "resources_dir": "resources_test" -} +meta = {"resources_dir": "resources_test"} ##VIASH END + def test_run(): for input_path in par["input"]: input_mudata = read_h5mu(input_path) - assert list(input_mudata.mod.keys()) == ['rna', 'prot', 'vdj_t'] - assert list(input_mudata.uns.keys()) == ['metrics_cellranger'] - expected_metrics = ['Category', 'Library Type', 'Grouped By', 'Group Name', 'Metric Name', 'Metric Value'] - assert input_mudata.uns['metrics_cellranger'].columns.to_list() == expected_metrics + assert list(input_mudata.mod.keys()) == ["rna", "prot", "vdj_t"] + assert list(input_mudata.uns.keys()) == ["metrics_cellranger"] + expected_metrics = [ + "Category", + "Library Type", + "Grouped By", + "Group Name", + "Metric Name", + "Metric Value", + ] + assert ( + input_mudata.uns["metrics_cellranger"].columns.to_list() == expected_metrics + ) + if __name__ == "__main__": HERE_DIR = Path(__file__).resolve().parent - shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"), - os.path.join(HERE_DIR, "conftest.py")) - sys.exit(pytest.main(["--import-mode=importlib"])) \ No newline at end of file + shutil.copyfile( + os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"), + os.path.join(HERE_DIR, "conftest.py"), + ) + sys.exit(pytest.main(["--import-mode=importlib"])) diff --git a/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py index 11189b26c25..9ee34768043 100644 --- a/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py +++ b/src/workflows/test_workflows/ingestion/cellranger_postprocessing/script.py @@ -7,37 +7,40 @@ import pytest ##VIASH START -par = { - "input": "input.h5mu", - "input": "input_og.h5mu", - "is_corrected": True -} - -meta = { - "resources_dir": "resources_test" -} +par = {"input": "input.h5mu", "input_og": "input_og.h5mu", "is_corrected": True} + +meta = {"resources_dir": "resources_test"} ##VIASH END + def test_run(): input_mudata = read_h5mu(par["input_og"]) output_mudata = read_h5mu(par["input"]) - assert input_mudata.mod.keys() == output_mudata.mod.keys(), "Input and output should have the same modalities." + assert ( + input_mudata.mod.keys() == output_mudata.mod.keys() + ), "Input and output should have the same modalities." 
-    for modality,input_adata,output_adata in zip(input_mudata.mod.keys(),
-                                                 input_mudata.mod.values(),
-                                                 output_mudata.mod.values()):
-        assert input_adata.n_obs >= output_adata.n_obs, "Output should have less or equal number of observations than input."
-        assert input_adata.n_vars == output_adata.n_vars, "Output should have the same number of variables as input."
+    for modality, input_adata, output_adata in zip(
+        input_mudata.mod.keys(), input_mudata.mod.values(), output_mudata.mod.values()
+    ):
+        assert (
+            input_adata.n_obs >= output_adata.n_obs
+        ), "Output should have at most as many observations as input."
+        assert (
+            input_adata.n_vars == output_adata.n_vars
+        ), "Output should have the same number of variables as input."
         if modality != "rna":
-            assert_annotation_objects_equal(input_adata,
-                                            output_adata)
+            assert_annotation_objects_equal(input_adata, output_adata)
 
     if par["is_corrected"]:
         assert "cellbender_corrected" in output_mudata.mod["rna"].layers
-
+
 
 if __name__ == "__main__":
     HERE_DIR = Path(__file__).resolve().parent
-    shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"),
-                    os.path.join(HERE_DIR, "conftest.py"))
-    sys.exit(pytest.main(["--import-mode=importlib"]))
\ No newline at end of file
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
+    sys.exit(pytest.main(["--import-mode=importlib"]))
diff --git a/src/workflows/test_workflows/ingestion/conversion/script.py b/src/workflows/test_workflows/ingestion/conversion/script.py
index 64a0e6b9da5..a1af0e7756c 100644
--- a/src/workflows/test_workflows/ingestion/conversion/script.py
+++ b/src/workflows/test_workflows/ingestion/conversion/script.py
@@ -6,15 +6,12 @@
 import pytest
 
 ##VIASH START
-par = {
-    "input": "input.h5mu"
-}
+par = {"input": "input.h5mu"}
 
-meta = {
-    "resources_dir": "resources_test"
-}
+meta = {"resources_dir": "resources_test"}
 ##VIASH END
 
+
 def test_run():
     input_mudata = read_h5mu(par["input"])
     assert "rna" in input_mudata.mod.keys()
@@ -22,10 +19,12 @@ def test_run():
     assert input_mudata.mod["rna"].var["feature_types"].unique() == [
         "Gene Expression"
     ], "Output X should only contain Gene Expression vars."
-
-
+
+
 if __name__ == "__main__":
     HERE_DIR = Path(__file__).resolve().parent
-    shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"),
-                    os.path.join(HERE_DIR, "conftest.py"))
-    sys.exit(pytest.main(["--import-mode=importlib"]))
\ No newline at end of file
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
+    sys.exit(pytest.main(["--import-mode=importlib"]))
diff --git a/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py b/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py
index dec25546af7..fd4746df59d 100644
--- a/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py
+++ b/src/workflows/test_workflows/multiomics/dimensionality_reduction/script.py
@@ -2,22 +2,22 @@
 
 ##VIASH START
 par = {
-    "input": "foo.final.h5mu", 
+    "input": "foo.final.h5mu",
 }
 
-meta = {
-    "resources_dir": "resources_test/pbmc_1k_protein_v3"
-}
+meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"}
 ##VIASH END
 
-print ("Loading data", flush=True)
+print("Loading data", flush=True)
 data = mu.read_h5mu(par["input"])
 
 assert "X_umap" in data.mod["rna"].obsm, "X_umap not found in .obsm"
-assert data.mod["rna"].obsm["X_umap"].shape[1] == 2, f"X_umap has wrong shape expected 2 n_comp but got {data.mod['rna'].obsm['X_umap'].shape[1]}"
-assert "pca_variance" in data.mod['rna'].uns
-assert "pca_loadings" in data.mod['rna'].varm
+assert (
+    data.mod["rna"].obsm["X_umap"].shape[1] == 2
+), f"X_umap has wrong shape: expected 2 components but got {data.mod['rna'].obsm['X_umap'].shape[1]}"
+assert "pca_variance" in data.mod["rna"].uns
+assert "pca_loadings" in data.mod["rna"].varm
 
-print("Test successful!", flush=True)
\ No newline at end of file
+print("Test successful!", flush=True)
diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py
index f46cdf5aa54..2e40ee2685e 100644
--- a/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py
+++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test/script.py
@@ -2,18 +2,13 @@
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
 
 ##VIASH START
-par = {
-    "input": "output.h5mu",
-    "orig_input": "input.5mu"
-}
+par = {"input": "output.h5mu", "orig_input": "input.h5mu"}
 
-meta = {
-    "resources_dir": "resources_test/pbmc_1k_protein_v3"
-}
+meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"}
 ##VIASH END
 
-print ("Loading data", flush=True)
+print("Loading data", flush=True)
 
 input = mu.read_h5mu(par["orig_input"])
 output = mu.read_h5mu(par["input"])
@@ -21,16 +16,17 @@
 assert input.mod.keys() == output.mod.keys(), "Modalities differ"
 
 # Check atac modality
-assert_annotation_objects_equal(input.mod["atac"], output.mod["atac"], promote_precision=True)
+assert_annotation_objects_equal(
+    input.mod["atac"], output.mod["atac"], promote_precision=True
+)
 
 # Check rna modality
 assert "X_umap" in output.mod["rna"].obsm, "X_umap not found in .obsm"
-assert output.mod["rna"].obsm["X_umap"].shape[1] == 2, f"X_umap has wrong shape expected 2 n_comp but got {output.mod['rna'].obsm['X_umap'].shape[1]}"
-assert "pca_variance" in output.mod['rna'].uns
-assert "pca_loadings" in output.mod['rna'].varm
+assert (
+    output.mod["rna"].obsm["X_umap"].shape[1] == 2
+), f"X_umap has wrong shape: expected 2 components but got {output.mod['rna'].obsm['X_umap'].shape[1]}"
+assert "pca_variance" in output.mod["rna"].uns
+assert "pca_loadings" in output.mod["rna"].varm
 
-
-
-
-print("Test successful!", flush=True)
\ No newline at end of file
+print("Test successful!", flush=True)
diff --git a/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py
index 7027a3712b2..e360e9433e5 100644
--- a/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py
+++ b/src/workflows/test_workflows/multiomics/process_batches/workflow_test2/script.py
@@ -1,19 +1,18 @@
 import mudata as mu
 from openpipelinetestutils.asserters import assert_annotation_objects_equal
 from openpipelinetestutils.utils import remove_annotation_column
+
 ##VIASH START
 par = {
-    "input": "resources_test/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_mms.h5mu",
-    "orig_input": "test.h5mu"
+    "input": "resources_test/10x_5k_anticmv/5k_human_antiCMV_T_TBNK_connect_mms.h5mu",
+    "orig_input": "test.h5mu",
 }
 
-meta = {
-    "resources_dir": "resources_test/pbmc_1k_protein_v3"
-}
+meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"}
 ##VIASH END
 
-print ("Loading data", flush=True)
+print("Loading data", flush=True)
 
 input = mu.read_h5mu(par["orig_input"])
 output = mu.read_h5mu(par["input"])
@@ -24,20 +23,20 @@
 # Allow X_umap to be overwritten
 input_vdj = input.mod["vdj_t"]
 # del input_vdj.obsm['X_umap']
-output_vdj = output.mod['vdj_t']
+output_vdj = output.mod["vdj_t"]
 # del output_vdj.obsm['X_umap']
 assert_annotation_objects_equal(input_vdj, output_vdj, promote_precision=True)
 
 # Check prot modality
 # Ignore the PCA layer and its derivatives, as it's allowed to be overwritten for this test.
 input_prot = input.mod["prot"]
-del input_prot.varm['pca_loadings']
-del input_prot.obsm['X_pca']
-del input_prot.obsm['X_umap']
+del input_prot.varm["pca_loadings"]
+del input_prot.obsm["X_pca"]
+del input_prot.obsm["X_umap"]
 output_prot = output.mod["prot"]
-del output_prot.varm['pca_loadings']
-del output_prot.obsm['X_pca']
-del output_prot.obsm['X_umap']
+del output_prot.varm["pca_loadings"]
+del output_prot.obsm["X_pca"]
+del output_prot.obsm["X_umap"]
 
 assert_annotation_objects_equal(input_prot, output_prot, promote_precision=True)
 
@@ -45,18 +44,17 @@
 # Allow the highly variable genes and PCA + derivatives to be overwritten
 input_rna = input.mod["rna"]
 input_rna = remove_annotation_column(input_rna, "filter_with_hvg", "var")
-del input_rna.varm['pca_loadings']
-del input_rna.obsm['X_pca']
-del input_rna.obsm['X_umap']
-del input_rna.layers['log_normalized']
+del input_rna.varm["pca_loadings"]
+del input_rna.obsm["X_pca"]
+del input_rna.obsm["X_umap"]
+del input_rna.layers["log_normalized"]
 output_rna = output.mod["rna"]
 output_rna = remove_annotation_column(output_rna, "filter_with_hvg", "var")
-del output_rna.obsm['X_pca']
-del output_rna.varm['pca_loadings']
-del output_rna.obsm['X_umap']
-del output_rna.layers['log_normalized']
+del output_rna.obsm["X_pca"]
+del output_rna.varm["pca_loadings"]
+del output_rna.obsm["X_umap"]
+del output_rna.layers["log_normalized"]
 
 assert_annotation_objects_equal(input_rna, output_rna, promote_precision=True)
 
-
-print("Test successful!", flush=True)
\ No newline at end of file
+print("Test successful!", flush=True)
diff --git a/src/workflows/test_workflows/multiomics/split_modalities/script.py b/src/workflows/test_workflows/multiomics/split_modalities/script.py
index 8957b414936..1bb3bbe55f7 100644
--- a/src/workflows/test_workflows/multiomics/split_modalities/script.py
+++ b/src/workflows/test_workflows/multiomics/split_modalities/script.py
@@ -6,14 +6,12 @@
 
 ##VIASH START
 par = {
-    "input": "output_test/split_modalities/foo_types.csv", 
-    "mod_dir": "output_test/split_modalities/h5mu", 
-    "orig_input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3.h5mu"
+    "input": "output_test/split_modalities/foo_types.csv",
+    "mod_dir": "output_test/split_modalities/h5mu",
+    "orig_input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3.h5mu",
 }
 
-meta = {
-    "resources_dir": "resources_test/pbmc_1k_protein_v3"
-}
+meta = {"resources_dir": "resources_test/pbmc_1k_protein_v3"}
 ##VIASH END
 
@@ -30,7 +28,9 @@
 
 # Check if the number of files is equal to the number of lines in the csv
 assert num_mod == num_files, f"Expected {num_mod} files, but found {num_files}."
-assert input_mu.n_mod == num_mod, f"Expected {num_mod} modalities in {par['orig_input']} got {input_mu.n_mod} modalities."
+assert (
+    input_mu.n_mod == num_mod
+), f"Expected {num_mod} modalities in {par['orig_input']}, got {input_mu.n_mod} modalities."
 
 rna_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[1][1]))
 prot_mod = mu.read_h5mu(os.path.join(par["mod_dir"], data[2][1]))
@@ -47,7 +47,9 @@
     mod_mu = mu.read_h5mu(mod_fp)
     assert mod_mu.n_mod == 1, f"Expected 1 modality in {row[1]}."
     assert row[0] in mod_mu.mod.keys(), f"Expected {row[0]} to be the mod in {row[1]}."
-    assert row[0] in input_mu.mod.keys(), f"Expected {row[0]} to be a mod in {par['orig_input']}."
+    assert (
+        row[0] in input_mu.mod.keys()
+    ), f"Expected {row[0]} to be a mod in {par['orig_input']}."
 
 # Check if extracted modalities are equal to the original modalities
 assert_annotation_objects_equal(rna_mod.mod["rna"], input_mu.mod["rna"])
diff --git a/src/workflows/test_workflows/qc/script.py b/src/workflows/test_workflows/qc/script.py
index 12c731852e0..7ace79effdd 100644
--- a/src/workflows/test_workflows/qc/script.py
+++ b/src/workflows/test_workflows/qc/script.py
@@ -9,43 +9,57 @@
 
 ##VIASH START
-par = {
-    "input": "input.h5mu",
-    "og_input": "og_input.h5mu"
-}
+par = {"input": "input.h5mu", "og_input": "og_input.h5mu"}
 
 meta = {
     "resources_dir": "resources_test/concat_test_data",
 }
 ##VIASH END
-
-
+
+
 def test_run():
     input_mudata = read_h5mu(par["og_input"])
     output_mudata = read_h5mu(par["input"])
 
-    assert input_mudata.n_mod == output_mudata.n_mod, "Number of modalities should be the same"
-    assert input_mudata.mod.keys() == output_mudata.mod.keys(), "Modalities should be the same"
-    assert list(output_mudata.mod.keys()) == ["rna", "atac"], "Modalities should be rna and atac"
+    assert (
+        input_mudata.n_mod == output_mudata.n_mod
+    ), "Number of modalities should be the same"
+    assert (
+        input_mudata.mod.keys() == output_mudata.mod.keys()
+    ), "Modalities should be the same"
+    assert list(output_mudata.mod.keys()) == [
+        "rna",
+        "atac",
+    ], "Modalities should be rna and atac"
 
     obs_cols_to_remove = []
     for top_n_vars in ("50", "100", "200", "500"):
         obs_cols_to_remove.append(f"pct_of_counts_in_top_{top_n_vars}_vars")
-    obs_cols_to_remove.extend(['total_counts', 'num_nonzero_vars'])
-    var_cols_to_remove = ['obs_mean', 'total_counts', 'num_nonzero_obs', 'pct_dropout']
+    obs_cols_to_remove.extend(["total_counts", "num_nonzero_vars"])
+    var_cols_to_remove = ["obs_mean", "total_counts", "num_nonzero_obs", "pct_dropout"]
 
-    assert set(obs_cols_to_remove).issubset(set(output_mudata.mod["rna"].obs.columns.to_list()))
-    assert set(var_cols_to_remove).issubset(set(output_mudata.mod["rna"].var.columns.to_list()))
+    assert set(obs_cols_to_remove).issubset(
+        set(output_mudata.mod["rna"].obs.columns.to_list())
+    )
+    assert set(var_cols_to_remove).issubset(
+        set(output_mudata.mod["rna"].var.columns.to_list())
+    )
 
-    initial_mudata = remove_annotation_column(output_mudata, obs_cols_to_remove, axis="obs", modality_name="rna")
-    initial_mudata = remove_annotation_column(initial_mudata, var_cols_to_remove, axis="var", modality_name="rna")
+    initial_mudata = remove_annotation_column(
+        output_mudata, obs_cols_to_remove, axis="obs", modality_name="rna"
+    )
+    initial_mudata = remove_annotation_column(
+        initial_mudata, var_cols_to_remove, axis="var", modality_name="rna"
+    )
 
     assert_annotation_objects_equal(input_mudata, initial_mudata)
-
+
 
 if __name__ == "__main__":
     HERE_DIR = Path(__file__).resolve().parent
-    shutil.copyfile(os.path.join(meta['resources_dir'], "openpipelinetestutils", "conftest.py"),
-                    os.path.join(HERE_DIR, "conftest.py"))
-    sys.exit(pytest.main(["--import-mode=importlib"]))
\ No newline at end of file
+    shutil.copyfile(
+        os.path.join(meta["resources_dir"], "openpipelinetestutils", "conftest.py"),
+        os.path.join(HERE_DIR, "conftest.py"),
+    )
+    sys.exit(pytest.main(["--import-mode=importlib"]))