Add dsb component #588

Draft · wants to merge 5 commits into base: main
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -121,6 +121,10 @@ Implementing this changes involved breaking some existing functionality:

* `qc/calculate_qc_metrics`: fix calculating mitochondrial gene related QC metrics when only or no mitochondrial genes were found (PR #564).

## NEW FUNCTIONALITY

* Added `protein_processing/dsb_index` and `protein_processing/dsb_normalize` components (PR #588).

# openpipelines 0.10.1

## MINOR CHANGES
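The two new components are designed to chain: `dsb_index` selects cell and empty-droplet barcodes from the raw matrix, and `dsb_normalize` consumes those barcode lists to run DSB. Below is a minimal sketch of such a run, mirroring the tests further down in this PR; the executable and file paths are assumptions, not part of the diff.

```python
import subprocess

# Step 1: select cell / empty-droplet barcodes from the raw, unfiltered matrix.
subprocess.check_output([
    "./target/dsb_index",                      # assumed build location
    "--data_raw", "raw_feature_bc_matrix.h5",  # assumed input path
    "--empty_counts_range", "1.5:2.8",
    "--output", "dsb_index_out",
])

# Step 2: DSB-normalize protein counts using the selected barcodes.
subprocess.check_output([
    "./target/dsb_normalize",
    "--data_raw", "raw_feature_bc_matrix.h5",
    "--cell_index", "dsb_index_out/cell_idx.csv",
    "--empty_index", "dsb_index_out/empty_idx.csv",
    "--denoise_counts",
    "--output", "dsb_normalize_out",
])
```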
48 changes: 48 additions & 0 deletions src/protein_processing/dsb_index/config.vsh.yaml
@@ -0,0 +1,48 @@
functionality:
  name: dsb_index
  namespace: protein_processing
  description: "Filter background and foreground signals for normalising protein expression with DSB (Denoised and Scaled by Background)."
  authors:
    - name: Xichen Wu
Reviewer (Contributor):
Use the author yaml to insert author information

Suggested change:
-    - name: Xichen Wu
+    - __merge__: /src/authors/xichen_wu.yaml
      roles: [ maintainer, author ]

  arguments:
    - name: "--data_raw"
      type: file
      required: true
      description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities."
Reviewer (Contributor), on lines +8 to +11:
We typically use --input to denote an input mudata object in openpipelines

Suggested change:
-    - name: "--data_raw"
-      type: file
-      required: true
-      description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities."
+    - name: "--input"
+      type: file
+      required: true
+      description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities."
+      example: input.h5mu

- name: "--cell_index"
type: file
description: "A csv file containing filtered cell barcodes."
- name: "--empty_counts_range"
type: double
description: "Specify the minimum and maximum log10-counts for a droplet to be considered empty."
multiple: true
- name: "--cell_counts_range"
type: double
description: "Specify the minimum and maximum log10-counts for a droplet to be considered not empty."
multiple: true
- name: "--output"
type: file
direction: output
Reviewer (Contributor):
I'd be inclined to make the output an h5mu file instead of a directory, because then it plays much nicer with the other components in OpenPipeline. If you want, we can discuss this in the next OpenPipelines working meeting.

      description: dsb_index output directory
      example: "dsb_output"
Reviewer (Contributor):
Make this a required argument and add a slash to the example to denote it's a directory

Suggested change:
-      example: "dsb_output"
+      example: "dsb_output/"
+      required: true

  resources:
    - type: python_script
      path: script.py
  test_resources:
    - type: python_script
      path: test.py
    - path: ../../../resources_test/pbmc_1k_protein_v3
platforms:
  - type: docker
    image: python:3.8
    setup:
      - type: python
        packages:
          - scanpy~=1.9.1
          - muon
          - numpy
          - mudata~=0.2.0
          - anndata~=0.8.0
  - type: nextflow
    directives:
      label: lowcpu
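Regarding the reviewer suggestion above to emit an `.h5mu` file instead of an output directory, a rough sketch of what that could look like follows. The `dsb_is_cell`/`dsb_is_empty` column names and the toy data are hypothetical, not part of this PR.

```python
import numpy as np
from anndata import AnnData
from mudata import MuData

# Toy stand-in for the raw, unfiltered input (hypothetical data).
prot = AnnData(np.random.poisson(2.0, size=(6, 3)).astype(float))
prot.var_names = ["CD3", "CD4", "CD8"]
rna = AnnData(np.random.poisson(5.0, size=(6, 4)).astype(float))
rna.var_names = ["GENE1", "GENE2", "GENE3", "GENE4"]
mdata = MuData({"prot": prot, "rna": rna})

# Barcodes selected by the index step (hard-coded here for illustration).
cell_barcode = mdata.obs_names[:3]
empty_barcode = mdata.obs_names[3:]

# Instead of cell_idx.csv / empty_idx.csv, flag droplets in .obs and write one file.
mdata.obs["dsb_is_cell"] = mdata.obs_names.isin(cell_barcode)
mdata.obs["dsb_is_empty"] = mdata.obs_names.isin(empty_barcode)
mdata.write("dsb_index_output.h5mu")
```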
79 changes: 79 additions & 0 deletions src/protein_processing/dsb_index/script.py
@@ -0,0 +1,79 @@
from warnings import warn
import scanpy as sc
import muon as mu
import numpy as np
from mudata import MuData
import pandas as pd
import os
from anndata import AnnData

## VIASH START
par = {
    "data_raw": "mudata_raw.h5mu",
    "output": "dsb_processed",
    "cell_index": None,
    "empty_counts_range": [1.5, 2.8],
    "cell_counts_range": None
}
## VIASH END

# Read the raw (unfiltered) input, either as a MuData .h5mu or a 10x .h5 file.
if par['data_raw'].endswith('h5mu'):
    raw_data = mu.read_h5mu(par['data_raw'])
elif par['data_raw'].endswith('h5'):
    raw_data = mu.read_10x_h5(par['data_raw'])
else:
    raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities")

if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
    raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
    raise ValueError("Different numbers of cells in 'rna' and 'prot' modalities.")

droplet_barcode = raw_data.mod["prot"].obs_names
if par["cell_index"] is not None:
    cell_barcode = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
else:
    cell_barcode = None
    empty_barcode = None

# log10 of total RNA counts per droplet, used to select background and cell droplets.
log10umi = np.log10(np.asarray(raw_data.mod["rna"].X.sum(axis=1)).squeeze() + 1)

if par['empty_counts_range'] is not None:
    if len(par['empty_counts_range']) != 2:
        raise ValueError("Invalid count ranges provided for the empty droplets.")
    if par['cell_counts_range'] is not None and max(*par['empty_counts_range']) > min(*par['cell_counts_range']):
        raise ValueError("Overlapping count ranges")
    empty_idx = np.where(
        (log10umi >= min(*par['empty_counts_range'])) & (log10umi < max(*par['empty_counts_range'])))[0]
    empty_idx = droplet_barcode[empty_idx]
    if empty_barcode is not None:
        empty_barcode = list(set(empty_barcode) & set(empty_idx))
    else:
        empty_barcode = empty_idx

if par['cell_counts_range'] is not None:
    if len(par['cell_counts_range']) != 2:
        raise ValueError("Invalid count ranges provided for true cells.")
    cell_idx = np.where(
        (log10umi >= min(*par['cell_counts_range'])) & (log10umi < max(*par['cell_counts_range'])))[0]
    cell_idx = droplet_barcode[cell_idx]
    if cell_barcode is not None:
        cell_barcode = list(set(cell_barcode) & set(cell_idx))
    else:
        cell_barcode = cell_idx

# Fall back to set differences when only one of the two selections was provided.
if empty_barcode is None:
    if cell_barcode is None:
        raise ValueError("Neither cell_index nor count ranges for empty droplets "
                         "or cells were provided for filtering empty droplets.")
    empty_barcode = list(set(droplet_barcode).difference(cell_barcode))
elif cell_barcode is None:
    cell_barcode = list(set(droplet_barcode).difference(empty_barcode))

if not os.path.exists(par["output"]):
    os.makedirs(par["output"])
pd.DataFrame(cell_barcode).to_csv(os.path.join(par["output"], "cell_idx.csv"), header=None, index=None)
pd.DataFrame(empty_barcode).to_csv(os.path.join(par["output"], "empty_idx.csv"), header=None, index=None)
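To illustrate the range-based selection above: with the `empty_counts_range` of `[1.5, 2.8]` used in the tests, droplets with roughly 32 to 630 total RNA counts are treated as background. A small self-contained example with made-up counts:

```python
import numpy as np

# Toy illustration of the range-based droplet selection (values are made up).
total_counts = np.array([20, 150, 400, 2500, 9000])   # total RNA UMIs per droplet
log10umi = np.log10(total_counts + 1)

empty_counts_range = (1.5, 2.8)   # roughly 32 .. 630 UMIs
cell_counts_range = (3.0, 4.5)    # roughly 1000 .. 31600 UMIs

empty_mask = (log10umi >= empty_counts_range[0]) & (log10umi < empty_counts_range[1])
cell_mask = (log10umi >= cell_counts_range[0]) & (log10umi < cell_counts_range[1])

print(np.where(empty_mask)[0])  # -> [1 2]  (150 and 400 UMIs fall in the empty window)
print(np.where(cell_mask)[0])   # -> [3 4]  (2500 and 9000 UMIs fall in the cell window)
```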
50 changes: 50 additions & 0 deletions src/protein_processing/dsb_index/test.py
@@ -0,0 +1,50 @@
import subprocess
from os import path
import muon as mu
import logging
from sys import stdout
import pandas as pd

## VIASH START
meta = {
    'functionality_name': 'dsb_index',
    'executable': './target/dsb_index',
    'resources_dir': 'resources_test/'
}
## VIASH END

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
output_cell_idx = "dsb_output/cell_idx.csv"
output_empty_idx = "dsb_output/empty_idx.csv"
cmd_pars = [
    meta["executable"],
    "--data_raw", input,
    "--output", "dsb_output",
    "--cell_index", cell_index,
    "--empty_counts_range", "1.5:2.8"
]
try:
    subprocess.check_output(cmd_pars)
except subprocess.CalledProcessError as e:
    print(e.stdout.decode("utf-8"))
    raise e

logger.info("> Check if output was created.")
assert path.exists(output_cell_idx), "No output for cell index was created."
assert path.exists(output_empty_idx), "No output for empty index was created."

logger.info("> Check whether the output cell index has the same shape as the cell_index input.")
cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
output_cell_index = pd.read_csv(output_cell_idx, header=None).iloc[:, 0].tolist()
assert len(output_cell_index) == len(cell_index), 'Output cell index does not have the same shape as the cell_index input.'

61 changes: 61 additions & 0 deletions src/protein_processing/dsb_normalize/config.vsh.yaml
@@ -0,0 +1,61 @@
functionality:
  name: dsb_normalize
  namespace: protein_processing
  description: "Normalize protein expression with DSB (Denoised and Scaled by Background)."
  authors:
    - name: Xichen Wu
Reviewer (Contributor):
Use the author yaml to insert author information

Suggested change:
-    - name: Xichen Wu
+    - __merge__: /src/authors/xichen_wu.yaml
      roles: [ maintainer, author ]

  arguments:
    - name: "--data_raw"
      type: file
      required: true
      description: "AnnData object with protein expression counts or MuData object with 'prot' modality containing raw (unfiltered, including empty droplets) data."
    - name: "--cell_index"
      type: file
      description: "A csv file containing filtered cell barcodes."
    - name: "--empty_index"
      type: file
      description: "A csv file containing empty cell barcodes."
    - name: "--pseudocount"
      type: integer
      default: 10
      description: "Pseudocount to add before log-transform."
    - name: "--denoise_counts"
      type: boolean_true
      description: "Whether to perform denoising."
    - name: "--isotype_controls"
      type: string
      multiple: true
      description: "Names of the isotype controls. If ``None``, isotype controls will not be used."
    - name: "--add_layer"
      type: boolean_true
      description: "Whether to add a ``'dsb'`` layer instead of assigning to the X matrix."
    - name: "--random_state"
      type: integer
      default: 1
      description: "Random seed."
    - name: "--output"
      type: file
      direction: output
      description: dsb_normalize output directory
      example: "dsb_output"
  resources:
    - type: python_script
      path: script.py
  test_resources:
    - type: python_script
      path: test.py
    - path: ../../../resources_test/pbmc_1k_protein_v3
platforms:
  - type: docker
    image: python:3.8
    setup:
      - type: python
        packages:
          - scanpy~=1.9.1
          - muon
          - numpy
          - mudata~=0.2.0
          - anndata~=0.8.0
  - type: nextflow
    directives:
      label: midcpu
68 changes: 68 additions & 0 deletions src/protein_processing/dsb_normalize/script.py
@@ -0,0 +1,68 @@
from warnings import warn
import os
import scanpy as sc
import muon as mu
import numpy as np
from mudata import MuData
import pandas as pd
from anndata import AnnData
from muon import prot as pt

## VIASH START
par = {
    "data_raw": "../dsb_index/mudata_raw.h5mu",
    "output": "dsb_processed",
    "cell_index": "../dsb_index/dsb_processed/cell_idx.csv",
    "empty_index": "../dsb_index/dsb_processed/empty_idx.csv",
    "pseudocount": 10,
    "denoise_counts": True,
    "isotype_controls": None,
    "add_layer": False,
    "random_state": None
}
## VIASH END

# Read the raw (unfiltered) input: AnnData (.h5ad), MuData (.h5mu) or 10x .h5.
if par['data_raw'] is not None:
    if par['data_raw'].endswith('h5ad'):
        raw_data = sc.read_h5ad(par['data_raw'])
    elif par['data_raw'].endswith('h5mu'):
        raw_data = mu.read_h5mu(par['data_raw'])
    elif par['data_raw'].endswith('h5'):
        raw_data = mu.read_10x_h5(par['data_raw'])
    else:
        raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
    if isinstance(raw_data, MuData) and "prot" not in raw_data.mod:
        raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
else:
    raise ValueError("Raw data is not available.")

if par['cell_index'] is None and par['empty_index'] is None:
    raise ValueError("Given the unfiltered object data_raw, at least one index file must be "
                     "provided for foreground and background signals.")

cells = None
empty = None

# Split the raw object into foreground (cells) and background (empty droplets).
if par['empty_index'] is not None:
    empty_idx = pd.read_csv(par["empty_index"], header=None).iloc[:, 0].tolist()
    empty = raw_data[raw_data.obs_names.isin(empty_idx)]
if par['cell_index'] is not None:
    cell_idx = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
    cells = raw_data[raw_data.obs_names.isin(cell_idx)]

if empty is None:
    empty = raw_data[~raw_data.obs_names.isin(cell_idx)]
if cells is None:
    cells = raw_data[~raw_data.obs_names.isin(empty_idx)]

pt.pp.dsb(cells, empty, isotype_controls=par['isotype_controls'], pseudocount=par["pseudocount"],
          denoise_counts=par["denoise_counts"], add_layer=par["add_layer"],
          random_state=par["random_state"])

if not os.path.exists(par["output"]):
    os.makedirs(par["output"])
if isinstance(cells, MuData):
    cells.write(os.path.join(par["output"], "normalized.h5mu"))
else:
    cells.write(os.path.join(par["output"], "normalized.h5ad"))
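For context, `pt.pp.dsb` applies the DSB method: a log transform with a pseudocount, followed by z-scoring each protein against the empty-droplet background, with optional per-cell denoising. A simplified sketch of that core transform (an illustration, not muon's implementation):

```python
import numpy as np

def dsb_sketch(cells: np.ndarray, empty: np.ndarray, pseudocount: int = 10) -> np.ndarray:
    """Simplified core DSB transform: log with pseudocount, then z-score
    each protein against the empty-droplet background distribution."""
    cells_log = np.log(cells + pseudocount)
    empty_log = np.log(empty + pseudocount)
    mu_bg = empty_log.mean(axis=0)   # per-protein background mean
    sd_bg = empty_log.std(axis=0)    # per-protein background spread
    return (cells_log - mu_bg) / sd_bg

cells = np.random.poisson(50, size=(5, 3)).astype(float)    # toy protein counts
empty = np.random.poisson(5, size=(100, 3)).astype(float)   # toy background droplets
print(dsb_sketch(cells, empty).shape)   # (5, 3)
```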

48 changes: 48 additions & 0 deletions src/protein_processing/dsb_normalize/test.py
@@ -0,0 +1,48 @@
import subprocess
from os import path
import muon as mu
import logging
from sys import stdout
import pandas as pd

## VIASH START
meta = {
    'functionality_name': 'dsb_normalize',
    'executable': './target/dsb_normalize',
    'resources_dir': 'resources_test/'
}
## VIASH END

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

input = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
cell_index = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
output = "dsb_output/normalized.h5mu"
cmd_pars = [
    meta["executable"],
    "--data_raw", input,
    "--output", "dsb_output",
    "--cell_index", cell_index,
    "--denoise_counts"
]
try:
    subprocess.check_output(cmd_pars)
except subprocess.CalledProcessError as e:
    print(e.stdout.decode("utf-8"))
    raise e

logger.info("> Check if output was created.")
assert path.exists(output), "No output was created."

logger.info("> Check whether the output has the same shape as the cell_index input.")
cell_index = pd.read_csv(cell_index, header=None).iloc[:, 0].tolist()
mdata = mu.read_h5mu(output)
assert len(mdata.obs) == len(cell_index), 'Output does not have the same shape as the cell_index input.'
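A possible additional check, not part of this PR: because DSB z-scores against the background, the normalized protein matrix is expected to contain values below zero.

```python
import muon as mu
import numpy as np

# Hypothetical extra assertion on the normalized output produced above.
mdata = mu.read_h5mu("dsb_output/normalized.h5mu")
x = mdata.mod["prot"].X
x = x.toarray() if hasattr(x, "toarray") else x   # handle sparse matrices
assert np.min(x) < 0, "Expected some negative DSB-normalized protein values."
```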