diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9fb9c6655b..1103dbf82df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -121,6 +121,10 @@ Implementing this changes involved breaking some existing functionality:
 
 * `qc/calculate_qc_metrics`: fix calculating mitochondrial gene related QC metrics when only or no mitochondrial genes were found (PR #564).
 
+## NEW FUNCTIONALITY
+
+* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` components (PR #588).
+
 # openpipelines 0.10.1
 
 ## MINOR CHANGES
diff --git a/src/protein_processing/dsb_index/config.vsh.yaml b/src/protein_processing/dsb_index/config.vsh.yaml
new file mode 100644
index 00000000000..4a7c47f6fbb
--- /dev/null
+++ b/src/protein_processing/dsb_index/config.vsh.yaml
@@ -0,0 +1,48 @@
+functionality:
+  name: dsb_index
+  namespace: protein_processing
+  description: "Filter background and foreground signals for normalising protein expression with DSB (Denoised and Scaled by Background)."
+  authors:
+    - name: Xichen Wu
+  arguments:
+    - name: "--data_raw"
+      type: file
+      required: true
+      description: "A ``MuData`` object (``.h5mu``) or a 10x Genomics ``.h5`` file containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities."
+    - name: "--cell_index"
+      type: file
+      description: "A csv file containing filtered cell barcodes."
+    - name: "--empty_counts_range"
+      type: double
+      description: "Specify the minimum and maximum log10-counts for a droplet to be considered empty."
+      multiple: true
+    - name: "--cell_counts_range"
+      type: double
+      description: "Specify the minimum and maximum log10-counts for a droplet to be considered not empty."
+      multiple: true
+    - name: "--output"
+      type: file
+      direction: output
+      description: dsb_index output directory
+      example: "dsb_output"
+  resources:
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: ../../../resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.8
+    setup:
+      - type: python
+        packages:
+          - scanpy~=1.9.1
+          - muon
+          - numpy
+          - mudata~=0.2.0
+          - anndata~=0.8.0
+  - type: nextflow
+    directives:
+      label: lowcpu
diff --git a/src/protein_processing/dsb_index/script.py b/src/protein_processing/dsb_index/script.py
new file mode 100644
index 00000000000..b2fbe6d5365
--- /dev/null
+++ b/src/protein_processing/dsb_index/script.py
@@ -0,0 +1,89 @@
+from warnings import warn
+import scanpy as sc
+import muon as mu
+import numpy as np
+from mudata import MuData
+import pandas as pd
+import os
+from anndata import AnnData
+
+## VIASH START
+par = {
+    "data_raw": "mudata_raw.h5mu",
+    "output": "dsb_processed",
+    "cell_index": None,
+    "empty_counts_range": [1.5, 2.8],
+    "cell_counts_range": None
+}
+
+## VIASH END
+
+# Split the droplets of an unfiltered dataset into cell-containing (foreground)
+# and empty (background) barcodes and write both lists to csv files in `output`.
+
+if par['data_raw'].endswith('h5mu'):
+    raw_data = mu.read_h5mu(par['data_raw'])
+elif par['data_raw'].endswith('h5'):
+    raw_data = mu.read_10x_h5(par['data_raw'])
+else:
+    raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities")
+
+if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
+    raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
+if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
+    raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")
+
+# NOTE(review): row positions derived from the 'rna' counts below are used to index
+# the 'prot' barcodes; this assumes both modalities list droplets in the same order.
+droplet_barcode = raw_data.mod["prot"].obs_names
+if par["cell_index"] is not None:
+    # The first column of the (header-less) csv holds the cell barcodes.
+    cell_barcode = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
+    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
+else:
+    cell_barcode = None
+    empty_barcode = None
+
+# Validate both ranges before comparing them, so that a malformed range is
+# reported with the intended ValueError instead of failing inside min()/max().
+if par['empty_counts_range'] is not None and len(par['empty_counts_range']) != 2:
+    raise ValueError("Invalid count ranges provided for the empty droplets.")
+if par['cell_counts_range'] is not None and len(par['cell_counts_range']) != 2:
+    raise ValueError("Invalid count ranges provided for true cells.")
+if par['empty_counts_range'] is not None and par['cell_counts_range'] is not None:
+    if max(par['empty_counts_range']) > min(par['cell_counts_range']):
+        raise ValueError("Overlapping count ranges")
+
+# log10 of (1 + row sums of the 'rna' matrix), one value per droplet.
+log10umi = np.log10(np.asarray(raw_data.mod["rna"].X.sum(axis=1)).squeeze() + 1)
+
+if par['empty_counts_range'] is not None:
+    lower, upper = min(par['empty_counts_range']), max(par['empty_counts_range'])
+    # Half-open interval: lower <= log10umi < upper.
+    empty_idx = droplet_barcode[np.where((log10umi >= lower) & (log10umi < upper))[0]]
+    if empty_barcode is not None:
+        # Keep only droplets that are absent from cell_index and fall into the range.
+        empty_barcode = list(set(empty_barcode) & set(empty_idx))
+    else:
+        empty_barcode = empty_idx
+
+if par['cell_counts_range'] is not None:
+    lower, upper = min(par['cell_counts_range']), max(par['cell_counts_range'])
+    cell_idx = droplet_barcode[np.where((log10umi >= lower) & (log10umi < upper))[0]]
+    if cell_barcode is not None:
+        cell_barcode = list(set(cell_barcode) & set(cell_idx))
+    else:
+        cell_barcode = cell_idx
+
+# Whichever set is still undefined becomes the complement of the other one.
+if empty_barcode is None:
+    if cell_barcode is None:
+        raise ValueError("Neither cell_index nor counts ranges for empty droplets "
+                         "or cells provided for filtering empty droplets.")
+    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
+elif cell_barcode is None:
+    cell_barcode = list(set(droplet_barcode).difference(empty_barcode))
+
+os.makedirs(par["output"], exist_ok=True)
+pd.DataFrame(cell_barcode).to_csv(os.path.join(par["output"], "cell_idx.csv"), header=None, index=None)
+pd.DataFrame(empty_barcode).to_csv(os.path.join(par["output"], "empty_idx.csv"), header=None, index=None)
diff --git a/src/protein_processing/dsb_index/test.py b/src/protein_processing/dsb_index/test.py
new file mode 100644
index 00000000000..4c27bdf389d
--- /dev/null
+++ b/src/protein_processing/dsb_index/test.py
@@ -0,0 +1,50 @@
+import subprocess
+from os import path
+import muon as mu
+import logging
+from sys import stdout
+import pandas as pd
+
+## VIASH START
+meta = {
+    'functionality_name': 'dsb_index',
+    'executable': './target/dsb_index',
+    'resources_dir': 'resources_test/'
+
+}
+## VIASH END
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler(stdout)
+logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+console_handler.setFormatter(logFormatter)
+logger.addHandler(console_handler)
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
+cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
+output_cell_idx = "dsb_output/cell_idx.csv"
+output_empty_idx = "dsb_output/empty_idx.csv"
+cmd_pars = [
+    meta["executable"],
+    "--data_raw", input_file,
+    "--output", "dsb_output",
+    "--cell_index", cell_index,
+    "--empty_counts_range", "1.5:2.8"
+]
+try:
+    subprocess.check_output(cmd_pars)
+except subprocess.CalledProcessError as e:
+    print(e.stdout.decode("utf-8"))
+    raise e
+
+logger.info("> Check if output was created.")
+assert path.exists(output_cell_idx), "No output for cell index was created."
+assert path.exists(output_empty_idx), "No output for empty index was created."
+
+# No --cell_counts_range is passed, so the cell barcodes are written as read from --cell_index.
+logger.info("> Check whether output cell index has the same length as the cell_index input.")
+cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
+output_cell_index = pd.read_csv(output_cell_idx, header=None).iloc[:, 0].tolist()
+assert len(output_cell_index) == len(cell_index), 'Output cell index does not have the same length as the cell_index input.'
+
diff --git a/src/protein_processing/dsb_normalize/config.vsh.yaml b/src/protein_processing/dsb_normalize/config.vsh.yaml
new file mode 100644
index 00000000000..72d020abb95
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/config.vsh.yaml
@@ -0,0 +1,61 @@
+functionality:
+  name: dsb_normalize
+  namespace: protein_processing
+  description: "Normalize protein expression with DSB (Denoised and Scaled by Background)."
+  authors:
+    - name: Xichen Wu
+  arguments:
+    - name: "--data_raw"
+      type: file
+      required: true
+      description: "AnnData object with protein expression counts or MuData object with 'prot' modality containing raw (unfiltered, including empty droplets) data."
+    - name: "--cell_index"
+      type: file
+      description: "A csv file containing filtered cell barcodes."
+    - name: "--empty_index"
+      type: file
+      description: "A csv file containing empty droplet barcodes."
+    - name: "--pseudocount"
+      type: integer
+      default: 10
+      description: "Pseudocount to add before log-transform."
+    - name: "--denoise_counts"
+      type: boolean_true
+      description: "Whether to perform denoising."
+    - name: "--isotype_controls"
+      type: string
+      multiple: true
+      description: "Names of the isotype controls. If ``None``, isotype controls will not be used."
+    - name: "--add_layer"
+      type: boolean_true
+      description: "Whether to add a ``'dsb'`` layer instead of assigning to the X matrix."
+    - name: "--random_state"
+      type: integer
+      default: 1
+      description: "Random seed."
+    - name: "--output"
+      type: file
+      direction: output
+      description: dsb_normalize output directory
+      example: "dsb_output"
+  resources:
+    - type: python_script
+      path: script.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: ../../../resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.8
+    setup:
+      - type: python
+        packages:
+          - scanpy~=1.9.1
+          - muon
+          - numpy
+          - mudata~=0.2.0
+          - anndata~=0.8.0
+  - type: nextflow
+    directives:
+      label: midcpu
diff --git a/src/protein_processing/dsb_normalize/script.py b/src/protein_processing/dsb_normalize/script.py
new file mode 100644
index 00000000000..bb47e17e2d0
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/script.py
@@ -0,0 +1,73 @@
+from warnings import warn
+import os
+import scanpy as sc
+import muon as mu
+import numpy as np
+from mudata import MuData
+import pandas as pd
+from anndata import AnnData
+from muon import prot as pt
+
+## VIASH START
+par = {
+    "data_raw": "../dsb_index/mudata_raw.h5mu",
+    "output": "dsb_processed",
+    "cell_index": "../dsb_index/dsb_processed/cell_idx.csv",
+    "empty_index": "../dsb_index/dsb_processed/empty_idx.csv",
+    "pseudocount": 10,
+    "denoise_counts": True,
+    "isotype_controls": None,
+    "add_layer": False,
+    "random_state": None
+}
+## VIASH END
+
+# Normalize the protein counts of the cell-containing droplets with DSB, using the
+# empty droplets as background, and write the normalized object to `output`.
+
+if par['data_raw'] is not None:
+    if par['data_raw'].endswith('h5ad'):
+        raw_data = sc.read_h5ad(par['data_raw'])
+    elif par['data_raw'].endswith('h5mu'):
+        raw_data = mu.read_h5mu(par['data_raw'])
+    elif par['data_raw'].endswith('h5'):
+        raw_data = mu.read_10x_h5(par['data_raw'])
+    else:
+        raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+    # Only MuData objects expose `.mod`; an AnnData object read from .h5ad is
+    # taken to hold the protein counts itself, so there is no modality to check.
+    if isinstance(raw_data, MuData) and "prot" not in raw_data.mod:
+        raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
+else:
+    raise ValueError( "Raw data is not available.")
+
+if par['cell_index'] is None and par['empty_index'] is None:
+    raise ValueError( "Given the unfiltered object data_raw, at least one index file must be "
+                      "provided for foreground and background signals.")
+
+cells = None
+empty = None
+
+if par['empty_index'] is not None:
+    empty_idx = pd.read_csv(par["empty_index"], header=None).iloc[:, 0].tolist()
+    empty = raw_data[raw_data.obs_names.isin(empty_idx)]
+if par['cell_index'] is not None:
+    cell_idx = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
+    cells = raw_data[raw_data.obs_names.isin(cell_idx)]
+
+# When only one index file is given, the other set is its complement.
+if empty is None:
+    empty = raw_data[~raw_data.obs_names.isin(cell_idx)]
+if cells is None:
+    cells = raw_data[~raw_data.obs_names.isin(empty_idx)]
+
+pt.pp.dsb(cells, empty, isotype_controls=par['isotype_controls'], pseudocount=par["pseudocount"],
+          denoise_counts=par["denoise_counts"], add_layer=par["add_layer"],
+          random_state=par["random_state"])
+
+os.makedirs(par["output"], exist_ok=True)
+if isinstance(cells, MuData):
+    cells.write(os.path.join(par["output"], "normalized.h5mu"))
+else:
+    cells.write(os.path.join(par["output"], "normalized.h5ad"))
+
diff --git a/src/protein_processing/dsb_normalize/test.py b/src/protein_processing/dsb_normalize/test.py
new file mode 100644
index 00000000000..e84703581b0
--- /dev/null
+++ b/src/protein_processing/dsb_normalize/test.py
@@ -0,0 +1,47 @@
+import subprocess
+from os import path
+import muon as mu
+import logging
+from sys import stdout
+import pandas as pd
+
+## VIASH START
+meta = {
+    'functionality_name': 'dsb_normalize',
+    'executable': './target/dsb_normalize',
+    'resources_dir': 'resources_test/'
+
+}
+## VIASH END
+
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler(stdout)
+logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+console_handler.setFormatter(logFormatter)
+logger.addHandler(console_handler)
+
+input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
+cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
+output = "dsb_output/normalized.h5mu"
+cmd_pars = [
+    meta["executable"],
+    "--data_raw", input_file,
+    "--output", "dsb_output",
+    "--cell_index", cell_index,
+    "--denoise_counts"
+]
+try:
+    subprocess.check_output(cmd_pars)
+except subprocess.CalledProcessError as e:
+    print(e.stdout.decode("utf-8"))
+    raise e
+
+logger.info("> Check if output was created.")
+assert path.exists(output), "No output was created."
+
+logger.info("> Check whether output has the same number of cells as the cell_index input.")
+cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
+mdata = mu.read_h5mu(output)
+assert len(mdata.obs) == len(cell_index), 'Output does not have the same number of cells as the cell_index input.'
+