-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add dsb component #588
base: main
Are you sure you want to change the base?
Add dsb component #588
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,48 @@ | ||||||||||||||||||||
# Viash component config for `dsb_index` (namespace `protein_processing`):
# component metadata, followed by the start of its argument list.
functionality: | ||||||||||||||||||||
name: dsb_index | ||||||||||||||||||||
namespace: protein_processing | ||||||||||||||||||||
description: "Filter background and foreground signals for normalising protein expression with DSB (Denoised and Scaled by Background)." | ||||||||||||||||||||
authors: | ||||||||||||||||||||
- name: Xichen Wu | ||||||||||||||||||||
arguments: | ||||||||||||||||||||
# Required input file; the script accepts paths ending in `h5mu` or `h5`.
- name: "--data_raw" | ||||||||||||||||||||
type: file | ||||||||||||||||||||
required: true | ||||||||||||||||||||
description: "A ``MuData`` object containing raw (unfiltered, including empty droplets) data for both ``prot`` and ``rna`` modalities." | ||||||||||||||||||||
Comment on lines
+8
to
+11
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We typically use
Suggested change
|
||||||||||||||||||||
# Optional csv file with the barcodes of filtered cells (first column is read by the script).
- name: "--cell_index" | ||||||||||||||||||||
type: file | ||||||||||||||||||||
description: "A csv file containing filtered cell barcodes." | ||||||||||||||||||||
# Multi-valued: the script requires exactly two values and uses their min/max as range bounds.
- name: "--empty_counts_range" | ||||||||||||||||||||
type: double | ||||||||||||||||||||
description: "Specify the minimum and maximum log10-counts for a droplet to be considered empty." | ||||||||||||||||||||
multiple: true | ||||||||||||||||||||
# Multi-valued, same two-value convention as --empty_counts_range.
- name: "--cell_counts_range" | ||||||||||||||||||||
type: double | ||||||||||||||||||||
description: "Specify the minimum and maximum log10-counts for a droplet to be considered not empty." | ||||||||||||||||||||
multiple: true | ||||||||||||||||||||
# Output argument; its description and example follow below.
- name: "--output" | ||||||||||||||||||||
type: file | ||||||||||||||||||||
direction: output | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd be inclined to make the output an h5mu file instead of a directory, because then it plays much nicer with the other components in OpenPipeline. If you want, we can discuss this in the next OpenPipelines working meeting. |
||||||||||||||||||||
# Remaining fields of the `--output` argument. The script creates this directory and
# writes cell_idx.csv / empty_idx.csv into it, so the path must always be supplied;
# the trailing slash in the example marks it as a directory.
description: dsb_index output directory
required: true
example: "dsb_output/"
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make this a required argument and add a slash to the example to denote that it's a directory.
Suggested change
|
||||||||||||||||||||
# Component implementation: a single Python script.
resources: | ||||||||||||||||||||
- type: python_script | ||||||||||||||||||||
path: script.py | ||||||||||||||||||||
# Test script plus the pbmc_1k_protein_v3 test data it reads.
test_resources: | ||||||||||||||||||||
- type: python_script | ||||||||||||||||||||
path: test.py | ||||||||||||||||||||
- path: ../../../resources_test/pbmc_1k_protein_v3 | ||||||||||||||||||||
# Target platforms: a docker image with the listed Python packages, and nextflow.
platforms: | ||||||||||||||||||||
- type: docker | ||||||||||||||||||||
image: python:3.8 | ||||||||||||||||||||
setup: | ||||||||||||||||||||
- type: python | ||||||||||||||||||||
packages: | ||||||||||||||||||||
- scanpy~=1.9.1 | ||||||||||||||||||||
- muon | ||||||||||||||||||||
- numpy | ||||||||||||||||||||
- mudata~=0.2.0 | ||||||||||||||||||||
- anndata~=0.8.0 | ||||||||||||||||||||
- type: nextflow | ||||||||||||||||||||
directives: | ||||||||||||||||||||
label: lowcpu |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from warnings import warn | ||
import scanpy as sc | ||
import muon as mu | ||
import numpy as np | ||
from mudata import MuData | ||
import pandas as pd | ||
import os | ||
from anndata import AnnData | ||
|
||
## VIASH START
par = {
    "data_raw": "mudata_raw.h5mu",
    "output": "dsb_processed",
    "cell_index": None,
    "empty_counts_range": [1.5, 2.8],
    "cell_counts_range": None,
}
## VIASH END

# Split the droplets of a raw (unfiltered) MuData object into "cell" and "empty"
# barcodes, using an optional list of known cell barcodes and/or log10 RNA count
# ranges, and write both barcode lists as header-less csv files into par["output"].


def _checked_range(bounds, error_message):
    """Return ``(low, high)`` for a two-element range, or ``None`` when unset.

    Raises:
        ValueError: with ``error_message`` if ``bounds`` does not hold exactly
            two values.
    """
    if bounds is None:
        return None
    if len(bounds) != 2:
        raise ValueError(error_message)
    return min(bounds), max(bounds)


def _barcodes_in_range(barcodes, log10_counts, bounds):
    """Return the barcodes whose log10 count lies in the half-open ``[low, high)``."""
    low, high = bounds
    selected = (log10_counts >= low) & (log10_counts < high)
    return barcodes[selected].tolist()


# Validate both ranges up front, so that a malformed --cell_counts_range is
# reported as such instead of failing inside the overlap comparison.
empty_range = _checked_range(
    par["empty_counts_range"], "Invalid count ranges provided for the empty droplets."
)
cell_range = _checked_range(
    par["cell_counts_range"], "Invalid count ranges provided for true cells."
)
if empty_range is not None and cell_range is not None and empty_range[1] > cell_range[0]:
    raise ValueError("Overlapping count ranges")

if par["data_raw"].endswith("h5mu"):
    raw_data = mu.read_h5mu(par["data_raw"])
elif par["data_raw"].endswith("h5"):
    raw_data = mu.read_10x_h5(par["data_raw"])
else:
    raise TypeError("data_raw must be a MuData object with 'prot' and 'rna' modalities")

if "prot" not in raw_data.mod or "rna" not in raw_data.mod:
    raise TypeError("Raw data does not contain 'prot' or 'rna' modalities")
if raw_data.mod["rna"].n_obs != raw_data.mod["prot"].n_obs:
    raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")

droplet_barcode = raw_data.mod["prot"].obs_names
if par["cell_index"] is not None:
    # First column of the header-less file holds the known cell barcodes;
    # every other droplet starts out as a candidate empty droplet.
    cell_barcode = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
    # sorted() makes the set-derived output order reproducible between runs.
    empty_barcode = sorted(set(droplet_barcode).difference(cell_barcode))
else:
    cell_barcode = None
    empty_barcode = None

# log10(total RNA count + 1) per droplet; ravel() keeps the result 1-D even
# when there is only a single droplet.
# NOTE(review): these counts come from 'rna' but are used to index the 'prot'
# barcodes - this assumes both modalities list droplets in the same order
# (only equal n_obs is checked above); confirm.
log10umi = np.log10(np.asarray(raw_data.mod["rna"].X.sum(axis=1)).ravel() + 1)

if empty_range is not None:
    in_empty_range = _barcodes_in_range(droplet_barcode, log10umi, empty_range)
    if empty_barcode is not None:
        empty_barcode = sorted(set(empty_barcode).intersection(in_empty_range))
    else:
        empty_barcode = in_empty_range

if cell_range is not None:
    in_cell_range = _barcodes_in_range(droplet_barcode, log10umi, cell_range)
    if cell_barcode is not None:
        cell_barcode = sorted(set(cell_barcode).intersection(in_cell_range))
    else:
        cell_barcode = in_cell_range

# Whichever side is still undefined becomes the complement of the other.
if empty_barcode is None:
    if cell_barcode is None:
        raise ValueError("Neither cell_index nor counts ranges for empty droplets "
                         "or cells provided for filtering empty droplets.")
    empty_barcode = sorted(set(droplet_barcode).difference(cell_barcode))
elif cell_barcode is None:
    cell_barcode = sorted(set(droplet_barcode).difference(empty_barcode))

os.makedirs(par["output"], exist_ok=True)
pd.DataFrame(cell_barcode).to_csv(os.path.join(par["output"], "cell_idx.csv"), header=None, index=None)
pd.DataFrame(empty_barcode).to_csv(os.path.join(par["output"], "empty_idx.csv"), header=None, index=None)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import subprocess | ||
from os import path | ||
import muon as mu | ||
import logging | ||
from sys import stdout | ||
import pandas as pd | ||
|
||
## VIASH START
meta = {
    'functionality_name': 'dsb_index',
    'executable': './target/dsb_index',
    'resources_dir': 'resources_test/',
}
## VIASH END

# Component test: run the dsb_index executable on the pbmc_1k_protein_v3 test
# data and check that both barcode files are written and that the cell index
# has as many entries as the cell_index input.

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

# Named *_path to avoid shadowing the builtin `input` and to keep the path
# distinct from the barcode list read from it further down.
input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
cell_index_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
output_dir = "dsb_output"
output_cell_idx = path.join(output_dir, "cell_idx.csv")
output_empty_idx = path.join(output_dir, "empty_idx.csv")
cmd_pars = [
    meta["executable"],
    "--data_raw", input_path,
    "--output", output_dir,
    "--cell_index", cell_index_path,
    "--empty_counts_range", "1.5:2.8",
]
try:
    subprocess.check_output(cmd_pars)
except subprocess.CalledProcessError as e:
    # Show what the component printed before the failure propagates.
    print(e.stdout.decode("utf-8"))
    raise

logger.info("> Check if output was created.")
assert path.exists(output_cell_idx), "No output for cell index was created."
assert path.exists(output_empty_idx), "No output for empty index was created."

logger.info("> Check whether output cell index has the same shape as the cell_index input.")
expected_cell_index = pd.read_csv(cell_index_path, header=None).iloc[:, 0].tolist()
output_cell_index = pd.read_csv(output_cell_idx, header=None).iloc[:, 0].tolist()
assert len(output_cell_index) == len(expected_cell_index), \
    'Output cell index does not have the same shape as the cell_index input.'
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,61 @@ | ||||||||
# Viash component config for `dsb_normalize` (namespace `protein_processing`): component metadata.
functionality: | ||||||||
name: dsb_normalize | ||||||||
namespace: protein_processing | ||||||||
description: "Normalize protein expression with DSB (Denoised and Scaled by Background)." | ||||||||
authors: | ||||||||
- name: Xichen Wu | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use the author yaml to insert author information.
Suggested change
|
||||||||
# Command-line arguments of dsb_normalize (the --output argument follows this list).
arguments: | ||||||||
# Required input file; the script dispatches on the suffix (h5ad, h5mu or h5).
- name: "--data_raw" | ||||||||
type: file | ||||||||
required: true | ||||||||
description: "AnnData object with protein expression counts or MuData object with 'prot' modality containing raw (unfiltered, including empty droplets) data." | ||||||||
# Barcode lists; the script raises an error when both --cell_index and --empty_index are missing.
- name: "--cell_index" | ||||||||
type: file | ||||||||
description: "A csv file containing filtered cell barcodes." | ||||||||
- name: "--empty_index" | ||||||||
type: file | ||||||||
description: "A csv file containing empty cell barcodes." | ||||||||
# The remaining options are passed by the script to muon's `pt.pp.dsb` call.
- name: "--pseudocount" | ||||||||
type: integer | ||||||||
default: 10 | ||||||||
description: "Pseudocount to add before log-transform." | ||||||||
- name: "--denoise_counts" | ||||||||
type: boolean_true | ||||||||
description: "Whether to perform denoising." | ||||||||
- name: "--isotype_controls" | ||||||||
type: string | ||||||||
multiple: true | ||||||||
description: "Names of the isotype controls. If ``None``, isotype controls will not be used." | ||||||||
- name: "--add_layer" | ||||||||
type: boolean_true | ||||||||
description: "Whether to add a ``'dsb'`` layer instead of assigning to the X matrix." | ||||||||
- name: "--random_state" | ||||||||
type: integer | ||||||||
default: 1 | ||||||||
description: "Random seed." | ||||||||
# Output directory. The script creates it and writes normalized.h5mu or
# normalized.h5ad inside, so the path must always be supplied; the trailing
# slash in the example marks it as a directory.
- name: "--output"
  type: file
  direction: output
  required: true
  description: dsb_normalize output directory
  example: "dsb_output/"
# Component implementation: a single Python script.
resources: | ||||||||
- type: python_script | ||||||||
path: script.py | ||||||||
# Test script plus the pbmc_1k_protein_v3 test data it reads.
test_resources: | ||||||||
- type: python_script | ||||||||
path: test.py | ||||||||
- path: ../../../resources_test/pbmc_1k_protein_v3 | ||||||||
# Target platforms: a docker image with the listed Python packages, and nextflow.
platforms: | ||||||||
- type: docker | ||||||||
image: python:3.8 | ||||||||
setup: | ||||||||
- type: python | ||||||||
packages: | ||||||||
- scanpy~=1.9.1 | ||||||||
- muon | ||||||||
- numpy | ||||||||
- mudata~=0.2.0 | ||||||||
- anndata~=0.8.0 | ||||||||
- type: nextflow | ||||||||
directives: | ||||||||
label: midcpu |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
from warnings import warn | ||
import os | ||
import scanpy as sc | ||
import muon as mu | ||
import numpy as np | ||
from mudata import MuData | ||
import pandas as pd | ||
from anndata import AnnData | ||
from muon import prot as pt | ||
|
||
## VIASH START
par = {
    "data_raw": "../dsb_index/mudata_raw.h5mu",
    "output": "dsb_processed",
    "cell_index": "../dsb_index/dsb_processed/cell_idx.csv",
    "empty_index": "../dsb_index/dsb_processed/empty_idx.csv",
    "pseudocount": 10,
    "denoise_counts": True,
    "isotype_controls": None,
    "add_layer": False,
    "random_state": None,
}
## VIASH END

# Normalize protein counts with DSB: split the raw object into cell and empty
# droplets using the provided barcode files, run muon's `pt.pp.dsb` on them and
# write the normalized cells into the par["output"] directory.

if par["data_raw"] is None:
    raise ValueError("Raw data is not available.")

if par["data_raw"].endswith("h5ad"):
    raw_data = sc.read_h5ad(par["data_raw"])
elif par["data_raw"].endswith("h5mu"):
    raw_data = mu.read_h5mu(par["data_raw"])
elif par["data_raw"].endswith("h5"):
    raw_data = mu.read_10x_h5(par["data_raw"])
else:
    raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")

# Only MuData has `.mod`; checking it on an AnnData loaded from .h5ad would
# raise AttributeError and make the documented AnnData input unusable.
if isinstance(raw_data, MuData) and "prot" not in raw_data.mod:
    raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")

if par["cell_index"] is None and par["empty_index"] is None:
    raise ValueError("Given the unfiltered object data_raw, at least one index file must be "
                     "provided for foreground and background signals.")

cells = None
empty = None

# Barcodes are read from the first column of the header-less index files.
if par["empty_index"] is not None:
    empty_idx = pd.read_csv(par["empty_index"], header=None).iloc[:, 0].tolist()
    empty = raw_data[raw_data.obs_names.isin(empty_idx)]
if par["cell_index"] is not None:
    cell_idx = pd.read_csv(par["cell_index"], header=None).iloc[:, 0].tolist()
    cells = raw_data[raw_data.obs_names.isin(cell_idx)]

# At least one index was given (checked above); the missing side is taken as
# the complement of the other.
if empty is None:
    empty = raw_data[~raw_data.obs_names.isin(cell_idx)]
if cells is None:
    cells = raw_data[~raw_data.obs_names.isin(empty_idx)]

pt.pp.dsb(cells, empty, isotype_controls=par["isotype_controls"], pseudocount=par["pseudocount"],
          denoise_counts=par["denoise_counts"], add_layer=par["add_layer"],
          random_state=par["random_state"])

os.makedirs(par["output"], exist_ok=True)
# The output file name follows the container type of the input.
if isinstance(cells, MuData):
    cells.write(os.path.join(par["output"], "normalized.h5mu"))
else:
    cells.write(os.path.join(par["output"], "normalized.h5ad"))
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import subprocess | ||
from os import path | ||
import muon as mu | ||
import logging | ||
from sys import stdout | ||
import pandas as pd | ||
|
||
## VIASH START
meta = {
    'functionality_name': 'dsb_normalize',
    'executable': './target/dsb_normalize',
    'resources_dir': 'resources_test/',
}
## VIASH END

# Component test: run the dsb_normalize executable on the pbmc_1k_protein_v3
# test data and check that a normalized MuData file is written with as many
# observations as the cell_index input has barcodes.

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

# Named *_path to avoid shadowing the builtin `input` and to keep the path
# distinct from the barcode list read from it further down.
input_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_raw_feature_bc_matrix.h5"
cell_index_path = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix/barcodes.tsv.gz"
output_dir = "dsb_output"
output = path.join(output_dir, "normalized.h5mu")
# Only arguments declared in the dsb_normalize config are passed;
# --empty_counts_range belongs to dsb_index and is not declared here.
cmd_pars = [
    meta["executable"],
    "--data_raw", input_path,
    "--output", output_dir,
    "--cell_index", cell_index_path,
    "--denoise_counts",
]
try:
    subprocess.check_output(cmd_pars)
except subprocess.CalledProcessError as e:
    # Show what the component printed before the failure propagates.
    print(e.stdout.decode("utf-8"))
    raise

logger.info("> Check if output was created.")
assert path.exists(output), "No output was created."

logger.info("> Check whether output has the same shape as the cell_index input.")
expected_cell_index = pd.read_csv(cell_index_path, header=None).iloc[:, 0].tolist()
mdata = mu.read_h5mu(output)
assert len(mdata.obs) == len(expected_cell_index), \
    'Output does not have the same number of observations as the cell_index input.'
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the author yaml to insert author information