diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 73915b5..1013a20 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -34,4 +34,5 @@ jobs: - name: Test with pytest run: | pip install pytest + pip install pysam pytest diff --git a/docs/source/notebooks/quickstart_mudata.ipynb b/docs/source/notebooks/quickstart_mudata.ipynb index 27c6d71..0445860 100644 --- a/docs/source/notebooks/quickstart_mudata.ipynb +++ b/docs/source/notebooks/quickstart_mudata.ipynb @@ -67,12 +67,13 @@ ], "source": [ "import numpy as np\n", + "\n", "np.random.seed(1)\n", "\n", "n, d, k = 1000, 100, 10\n", "\n", - "z = np.random.normal(loc=np.arange(k), scale=np.arange(k)*2, size=(n,k))\n", - "w = np.random.normal(size=(d,k))\n", + "z = np.random.normal(loc=np.arange(k), scale=np.arange(k) * 2, size=(n, k))\n", + "w = np.random.normal(size=(d, k))\n", "y = np.dot(z, w.T)\n", "y.shape" ] @@ -134,7 +135,7 @@ ], "source": [ "d2 = 50\n", - "w2 = np.random.normal(size=(d2,k))\n", + "w2 = np.random.normal(size=(d2, k))\n", "y2 = np.dot(z, w2.T)\n", "\n", "adata2 = AnnData(y2)\n", @@ -265,7 +266,7 @@ } ], "source": [ - "mdata.varm['A']" + "mdata.varm[\"A\"]" ] }, { @@ -301,7 +302,7 @@ "source": [ "# Only keep variables with value > 1 in obs_1\n", "# with in-place filtering for the variables\n", - "mu.pp.filter_var(adata, adata[\"obs_1\",:].X.flatten() > 1)\n", + "mu.pp.filter_var(adata, adata[\"obs_1\", :].X.flatten() > 1)\n", "adata" ] }, @@ -375,7 +376,7 @@ "source": [ "# Throw away the last sample in the modality 'B'\n", "# with in-place filtering for the observations\n", - "mu.pp.filter_obs(mdata.mod[\"B\"], [True for _ in range(n-1)] + [False])" + "mu.pp.filter_obs(mdata.mod[\"B\"], [True for _ in range(n - 1)] + [False])" ] }, { @@ -568,7 +569,7 @@ } ], "source": [ - "with mu.set_options(display_style = \"html\", display_html_expand = 0b000):\n", + "with mu.set_options(display_style=\"html\", 
display_html_expand=0b000):\n", " display(mdata)" ] }, @@ -771,7 +772,7 @@ } ], "source": [ - "with mu.set_options(display_style = \"html\", display_html_expand = 0b000):\n", + "with mu.set_options(display_style=\"html\", display_html_expand=0b000):\n", " display(mdata_r)" ] }, @@ -825,7 +826,7 @@ "source": [ "def simple_pca(mdata):\n", " from sklearn import decomposition\n", - " \n", + "\n", " x = np.hstack([m.X for m in mdata.mod.values()])\n", "\n", " pca = decomposition.PCA(n_components=2)\n", @@ -834,8 +835,8 @@ " # By default, methods operate in-place\n", " # and embeddings are stored in the .obsm slot\n", " mdata.obsm[\"X_pca\"] = components\n", - " \n", - " return " + "\n", + " return" ] }, { diff --git a/muon/_atac/__init__.py b/muon/_atac/__init__.py index 2dd2631..d27d20e 100644 --- a/muon/_atac/__init__.py +++ b/muon/_atac/__init__.py @@ -1,4 +1,5 @@ from . import preproc as pp from . import tools as tl from . import plot as pl +from . import fragments as fr from .io import * diff --git a/muon/_atac/fragments.py b/muon/_atac/fragments.py new file mode 100644 index 0000000..cb25bdf --- /dev/null +++ b/muon/_atac/fragments.py @@ -0,0 +1,385 @@ +from collections import OrderedDict +from typing import Optional, Union +import numpy as np +import pandas as pd +import logging +from datetime import datetime +from tqdm import tqdm +from scipy.sparse import lil_matrix +from anndata import AnnData +from mudata import MuData +from . import utils as atacutils +from .._rna.utils import get_gene_annotation_from_rna + +# +# Fragments +# +# Fragments file is a BED-like file describing individual fragments. 
# A single record in such a file typically includes 5 tab-separated fields:
#
# chr1    10000    11000    GTCAGTCAGTCAGTCA-1    1
# ^       ^        ^        ^                     ^
# |       |        |        |                     |
# |       |        |        4: name (cell barcode)
# |       |        3: end (3' fragment position, exclusive)
# |       2: start (5' fragment position, inclusive)
# 1: contig (chromosome)                          5: score (number of cuts per fragment)
#
# Fragments file is compressed (.gz) and has to be indexed
# with Tabix in order to be used (.gz.tbi).
#


def import_pysam():
    """
    Import and return the ``pysam`` module.

    Raises
    ------
    ImportError
        With installation instructions when pysam is not installed.
    """
    try:
        import pysam
    except ImportError as e:
        # Adjacent string literals are used instead of backslash continuations
        # so that the message does not contain the source indentation.
        raise ImportError(
            "pysam is not available. It is required to work with the fragments file. "
            "Install pysam from PyPI (`pip install pysam`) "
            "or from GitHub (`pip install git+https://github.com/pysam-developers/pysam`)"
        ) from e

    return pysam


def open_fragment_connection(fragment_path):
    """
    Open a Tabix connection to a fragments file using the BED parser.

    The caller owns the returned connection and has to close it.
    """
    pysam = import_pysam()
    return pysam.TabixFile(fragment_path, parser=pysam.asBed())


def locate_fragments(
    data: Union[AnnData, MuData], fragments: Optional[str] = None, return_fragments: bool = False
):
    """
    Check that the fragments file can be opened and remember its path
    in .uns["files"]["fragments"].

    The fragments file is never read to memory. Unless ``return_fragments``
    is True, the connection to the file is closed upon function completion.

    Parameters
    ----------
    data
        AnnData object with peak counts or multimodal MuData object with 'atac' modality.
    fragments
        A path to the compressed tab-separated fragments file (e.g. atac_fragments.tsv.gz).
        If None, the path already stored in .uns["files"]["fragments"] is used.
    return_fragments
        Whether to return the open Tabix connection to the fragments file
        (the caller then has to close it). False by default.

    Raises
    ------
    ValueError
        If ``fragments`` is None and no path is stored in .uns["files"]["fragments"].
    ImportError
        If pysam is not installed.
    """
    adata = atacutils.fetch_atac_mod(data)

    if fragments is None:
        # Fall back to a previously stored path; "files" itself may be missing
        if "files" in adata.uns and "fragments" in adata.uns["files"]:
            fragments = adata.uns["files"]["fragments"]
        else:
            raise ValueError(
                "No filepath found in .uns['files']['fragments'] and `fragments` argument is None. Please specify one of the two."
            )

    # Here we make sure we can create a connection to the fragments file.
    # Errors are propagated: callers use the connection right away and
    # cannot do anything useful with a silently returned None.
    frag = open_fragment_connection(fragments)

    try:
        if "files" not in adata.uns:
            adata.uns["files"] = OrderedDict()
        adata.uns["files"]["fragments"] = fragments
    except Exception:
        frag.close()
        raise

    if return_fragments:
        return frag

    # The connection has to be closed
    frag.close()
    return None


def count_fragments_features(
    data: Union[AnnData, MuData],
    features: Optional[pd.DataFrame] = None,
    extend_upstream: int = 2000,
    extend_downstream: int = 0,
) -> AnnData:
    """
    Count fragments overlapping given features. Returns cells x features matrix.

    Parameters
    ----------
    data
        AnnData object with peak counts or multimodal MuData object with 'atac' modality.
    features
        A DataFrame with feature annotation, e.g. genes.
        Annotation has to contain columns: Chromosome, Start, End.
        If None, gene annotation is taken from data.mod['rna'] when available.
    extend_upstream
        Number of nucleotides to extend every gene upstream
        (2000 by default to extend gene coordinates to promoter regions)
    extend_downstream
        Number of nucleotides to extend every gene downstream (0 by default)

    Raises
    ------
    ValueError
        If ``features`` is None and no gene annotation can be found in the 'rna' modality.
    """
    adata = atacutils.fetch_atac_mod(data)

    if features is None:
        # Try to get gene annotation from data.mod['rna']
        if (
            isinstance(data, MuData)
            and "rna" in data.mod
            and "interval" in data.mod["rna"].var.columns
        ):
            features = get_gene_annotation_from_rna(data)
        else:
            raise ValueError(
                "Argument `features` is required. It should be a BED-like DataFrame with gene coordinates and names."
            )

    n = adata.n_obs
    n_features = features.shape[0]

    # Dictionary with matrix positions (cell barcode -> column in the features x cells matrix)
    d = {barcode: i for i, barcode in enumerate(adata.obs.index)}

    # Open connection to fragments file
    fragments = locate_fragments(adata, return_fragments=True)

    try:
        # List of lists matrix is quick and convenient to fill by row
        mx = lil_matrix((n_features, n), dtype=int)

        logging.info(
            f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Counting fragments in {n} cells for {features.shape[0]} features..."
        )

        regions = zip(features.Chromosome, features.Start, features.End)
        for i, (chrom, f_start, f_end) in enumerate(
            tqdm(regions, total=n_features)
        ):  # iterate over features (e.g. genes)
            # Features close to the contig start must not produce a negative coordinate
            f_from = max(int(f_start - extend_upstream), 0)
            f_to = int(f_end + extend_downstream)
            for fr in fragments.fetch(chrom, f_from, f_to):
                ind = d.get(fr.name)  # cell barcode (e.g. GTCAGTCAGTCAGTCA-1)
                if ind is None:
                    # Fragment belongs to a barcode that is not in the data
                    continue
                # Parse the score first so that rows and data always stay in sync
                score = int(fr.score)  # number of cuts per fragment (e.g. 2)
                mx.rows[i].append(ind)
                mx.data[i].append(score)

        # Faster to convert to csr first and then transpose
        mx = mx.tocsr().transpose()

        return AnnData(X=mx, obs=adata.obs, var=features)

    except Exception as e:
        logging.error(e)
        raise

    finally:
        # The connection has to be closed
        fragments.close()


def _region_pileup(mx, fragments, d, chromosome, start, end):
    """
    Add fragments from the region [start, end) to the existing matrix in place.

    Column j of ``mx`` corresponds to the genomic position ``start + j``.
    Fragments with barcodes missing from ``d`` are ignored.
    """
    n_features = mx.shape[1]

    # The file is queried from position 0 at the earliest, while column offsets
    # are still computed relative to the requested (possibly negative) start.
    for fr in fragments.fetch(chromosome, max(start, 0), end):
        rowind = d.get(fr.name)  # cell barcode (e.g. GTCAGTCAGTCAGTCA-1)
        if rowind is None:
            continue
        score = int(fr.score)  # number of cuts per fragment (e.g. 2)
        colind_start = max(fr.start - start, 0)
        colind_end = min(fr.end - start, n_features)  # ends are non-inclusive in bed
        mx[rowind, colind_start:colind_end] += score


# TODO maybe better to pass in connection to tabix file?
def region_pileup(
    fragments: str,
    cells: np.array,
    chromosome: str,
    start: int,
    end: int,
) -> AnnData:
    """
    Pile up reads in regions. Returns a cell x position `AnnData` object that can be used for QC.

    Parameters
    ----------
    fragments
        Path to a tabix indexed fragments file.
    cells
        Array or list of cell barcodes to fetch
    chromosome
        Name of the chromosome to extract
    start
        Start position
    end
        End position

    Raises
    ------
    ValueError
        If ``end`` is smaller than ``start`` or the chromosome
        is not present in the fragments file.
    """
    # Validate the arguments before any file is opened
    n_features = end - start
    if n_features < 0:
        raise ValueError(f"Start must be smaller than end. (Start = {start}, End = {end})")

    cells = np.asarray(cells)
    n = cells.shape[0]

    # Dictionary with matrix positions
    d = {barcode: i for i, barcode in enumerate(cells)}

    mx = np.zeros((n, n_features), dtype=int)

    connection = open_fragment_connection(fragments)
    try:
        # Check if chromosome present in the fragments file
        if chromosome not in connection.contigs:
            raise ValueError(
                f"Chromosome {chromosome} is not present in fragments file chromosomes: {connection.contigs}"
            )

        # Actually fetch the fragments
        _region_pileup(mx, connection, d, chromosome, start, end)
    finally:
        # The connection has to be closed, also when an error is raised
        connection.close()

    anno = pd.DataFrame(
        {"position": range(start, end)},
    )
    anno.index = anno.index.astype(str)

    return AnnData(X=mx, obs=pd.DataFrame(index=cells), var=anno, dtype=int)


def _tss_pileup(
    adata: AnnData,
    features: pd.DataFrame,
    extend_upstream: int = 1000,
    extend_downstream: int = 1000,
) -> AnnData:
    """
    Pile up reads in TSS regions. Returns a cell x position matrix that can be used for QC.

    The Start coordinate of every feature is used as its TSS.

    Parameters
    ----------
    adata
        AnnData object with associated fragments file.
    features
        A DataFrame with feature annotation, e.g. genes.
        Annotation has to contain columns: Chromosome, Start, End.
    extend_upstream
        Number of nucleotides to include upstream of the TSS (1000 by default)
    extend_downstream
        Number of nucleotides to include downstream of the TSS (1000 by default)

    Raises
    ------
    KeyError
        If no fragments file has been located for ``adata`` yet.
    """
    if "files" not in adata.uns or "fragments" not in adata.uns["files"]:
        raise KeyError(
            "There is no fragments file located yet. Run muon.atac.tl.locate_fragments first."
        )

    n = adata.n_obs
    n_features = extend_downstream + extend_upstream + 1
    # Not sparse since we expect most positions to be filled
    mx = np.zeros((n, n_features), dtype=int)

    # Dictionary with matrix positions
    d = {barcode: i for i, barcode in enumerate(adata.obs.index)}

    fragments = locate_fragments(adata, return_fragments=True)
    try:
        # Subset the features to the chromosomes present in the fragments file
        chromosomes = fragments.contigs
        features = features[features.Chromosome.isin(chromosomes)]

        for chrom, tss in tqdm(
            zip(features.Chromosome, features.Start),
            total=features.shape[0],
            desc="Fetching Regions...",
        ):  # iterate over features (e.g. genes)
            tss_start = tss - extend_upstream  # First position of the TSS region
            tss_end = tss + extend_downstream  # Last position of the TSS region

            # Actually fetch the fragments
            _region_pileup(mx, fragments, d, chrom, tss_start, tss_end)
    finally:
        # The connection has to be closed, also when an error is raised
        fragments.close()

    anno = pd.DataFrame(
        {"TSS_position": range(-extend_upstream, extend_downstream + 1)},
    )
    anno.index = anno.index.astype(str)

    return AnnData(X=mx, obs=adata.obs, var=anno, dtype=int)


def fetch_regions_to_df(
    fragment_path: str,
    features: Union[pd.DataFrame, str],
    extend_upstream: int = 0,
    extend_downstream: int = 0,
    relative_coordinates=False,
) -> pd.DataFrame:
    """
    Fetch fragments overlapping the given regions and return them as a DataFrame
    with columns Chromosome, Start, End, Cell, Score, Feature.

    Regions that cannot be fetched (e.g. the chromosome is not present
    in the fragments file) are reported and skipped.

    Parameters
    ----------
    fragment_path
        Location of the fragments file (must be tabix indexed).
    features
        A DataFrame with feature annotation, e.g. genes,
        or a string of format `chr1:1-2000000` or `chr1-1-2000000`.
        Annotation has to contain columns: Chromosome, Start, End.
    extend_upstream
        Number of nucleotides to extend every feature upstream (0 by default)
    extend_downstream
        Number of nucleotides to extend every feature downstream (0 by default)
    relative_coordinates
        Return the coordinates with their relative position to the middle of the features.
    """
    if isinstance(features, str):
        features = atacutils.parse_region_string(features)
    n_features = features.shape[0]

    columns = ["Chromosome", "Start", "End", "Cell", "Score"]

    fragments = open_fragment_connection(fragment_path)

    dfs = []
    try:
        regions = zip(features.Chromosome, features.Start, features.End)
        for chrom, f_start, f_end in tqdm(
            regions, total=n_features, desc="Fetching Regions..."
        ):  # iterate over features (e.g. genes)
            try:
                fr = fragments.fetch(chrom, f_start - extend_upstream, f_end + extend_downstream)
                df = pd.DataFrame(
                    [(x.contig, x.start, x.end, x.name, x.score) for x in fr],
                    columns=columns,
                )
            except ValueError as e:
                # TODO this mostly happens when the chromosome is not present
                # could add explicit check
                print(e)
                print("Skipping this region...")
                continue

            if df.shape[0] == 0:
                continue

            df["Feature"] = chrom + "_" + str(f_start) + "_" + str(f_end)

            if relative_coordinates:
                middle = int(f_start + (f_end - f_start) / 2)
                df.Start = df.Start - middle
                df.End = df.End - middle

            dfs.append(df)
    finally:
        # The connection has to be closed, also when an error is raised
        fragments.close()

    if not dfs:
        # pd.concat raises on an empty list; return an empty table with the same columns
        return pd.DataFrame(columns=columns + ["Feature"])

    return pd.concat(dfs, axis=0, ignore_index=True)
scipy.sparse.linalg import svds + from scipy.sparse import csr_matrix from scipy.sparse import lil_matrix from scanpy import logging + from anndata import AnnData -from . import utils from mudata import MuData + +from . import utils as atacutils from .._rna.utils import get_gene_annotation_from_rna +from .fragments import locate_fragments, _tss_pileup + # # Computational methods for transforming and analysing count data @@ -38,12 +44,7 @@ def lsi(data: Union[AnnData, MuData], scale_embeddings=True, n_comps=50): n_comps: int (default: 50) Number of components to calculate with SVD """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) # In an unlikely scnenario when there are less 50 features, set n_comps to that value n_comps = min(n_comps, adata.X.shape[1]) @@ -101,12 +102,7 @@ def add_peak_annotation( return_annotation If return adata.uns['atac']['peak_annotation']. False by default. """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if isinstance(annotation, str): pa = pd.read_csv(annotation, sep=sep) @@ -275,12 +271,7 @@ def add_genes_peaks_groups( add_distance : bool (False by default) If to add distance to the ranked peaks per group. 
""" - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if "rank_genes_groups" not in adata.uns: raise KeyError( @@ -363,12 +354,7 @@ def rank_peaks_groups( If to add distance to the ranked peaks per group """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData): - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) sc.tl.rank_genes_groups(adata, groupby, **kwargs) @@ -528,12 +514,7 @@ def get_sequences(data: Union[AnnData, MuData], bed: str, fasta_file: str, bed_f "Pybedtools is not available. Install pybedtools from PyPI (`pip install pybedtools`) or from GitHub (`pip install git+https://github.com/daler/pybedtools`)" ) - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if "files" not in adata.uns or "genome" not in adata.uns["files"]: if fasta_file is not None: @@ -584,12 +565,7 @@ def locate_file(data: Union[AnnData, MuData], key: str, file: str): file A path to the file (e.g. ./atac_fragments.tsv.gz). """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if not os.path.exists(file): raise FileNotFoundError(f"File {file} does not exist") @@ -615,9 +591,6 @@ def locate_genome(data: Union[AnnData, MuData], fasta_file: str): fasta_file A path to the file (e.g. ./atac_fragments.tsv.gz). 
""" - if not isinstance(data, AnnData) and not (isinstance(data, MuData) and "atac" in data.mod): - raise TypeError("Expected AnnData or MuData object with 'atac' modality") - locate_file(data, "genome", fasta_file) @@ -702,12 +675,7 @@ def initialise_default_files(data: Union[AnnData, MuData], path: Union[str, Path - attempt to locate fragments file (atac_fragments.tsv.gz) """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) # 2) Add peak annotation @@ -749,6 +717,7 @@ def initialise_default_files(data: Union[AnnData, MuData], path: Union[str, Path def count_fragments_features( data: Union[AnnData, MuData], features: Optional[pd.DataFrame] = None, + stranded: bool = False, extend_upstream: int = 2e3, extend_downstream: int = 0, ) -> AnnData: @@ -761,7 +730,13 @@ def count_fragments_features( AnnData object with peak counts or multimodal MuData object with 'atac' modality. features A DataFrame with feature annotation, e.g. genes. - Annotation has to contain columns: Chromosome, Start, End. + Annotation should contain columns (case-insensitive): + chr/chrom/chromosome (longer takes precedence), start, end. + stranded + Use strand information for each feature. + Has to be encoded as a "strand" (case-insensitive) column in features. + When stranded=True, extend_upsteam and extend_downstream will be used + according to each feature's strand information. 
extend_upsteam Number of nucleotides to extend every gene upstream (2000 by default to extend gene coordinates to promoter regions) extend_downstream @@ -802,8 +777,31 @@ def count_fragments_features( n = adata.n_obs n_features = features.shape[0] - # Dictionary with matrix positions - d = {k: v for k, v in zip(adata.obs.index, range(n))} + # TODO: refactor and reuse this code + # TODO: write tests (see #59, #68) + + f_cols = np.array([col.lower() for col in features.columns.values]) + for col in ("start", "end"): + if col not in f_cols: + raise ValueError(f"No column with feature {col}s could be found") + + chrom_col: Optional[str] = None + for col in ("chromosome", "chrom", "chr"): + if col in f_cols: + chrom_col = col + break + if chrom_col is None: + raise ValueError("No column with chromosome for features could be found") + + start_col = features.columns.values[np.where(f_cols == "start")[0][0]] + end_col = features.columns.values[np.where(f_cols == "end")[0][0]] + chr_col = features.columns.values[np.where(f_cols == chrom_col)[0][0]] + + strand_col: Optional[str] = None + if stranded: + if "strand" not in f_cols: + raise ValueError("No column with strand for features could be found") + strand_col = features.columns.values[np.where(f_cols == chrom_col)[0][0]] fragments = pysam.TabixFile(adata.uns["files"]["fragments"], parser=pysam.asBed()) try: @@ -817,16 +815,16 @@ def count_fragments_features( stranded = "Strand" in features.columns for i in tqdm(range(n_features)): # iterate over features (e.g. 
genes) f = features.iloc[i] - if stranded and f.Strand == "-": - f_from = f.Start - extend_downstream - f_to = f.End + extend_upstream + if stranded and f[strand_col] == "-": + f_from = f[start_col] - extend_downstream + f_to = f[end_col] + extend_upstream else: - f_from = f.Start - extend_upstream - f_to = f.End + extend_downstream + f_from = f[start_col] - extend_upstream + f_to = f[end_col] + extend_downstream for fr in fragments.fetch(f.Chromosome, f_from, f_to): try: - ind = d[fr.name] # cell barcode (e.g. GTCAGTCAGTCAGTCA-1) + ind = adata.obs.index.get_loc(fr.name) # cell barcode (e.g. GTCAGTCAGTCAGTCA-1) mx.rows[i].append(ind) mx.data[i].append(int(fr.score)) # number of cuts per fragment (e.g. 2) except: @@ -888,12 +886,7 @@ def tss_enrichment( """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if features is None: # Try to gene gene annotation in the data.mod['rna'] @@ -1088,12 +1081,7 @@ def nucleosome_signal( Column name in the .obs of the AnnData with barcodes corresponding to the ones in the fragments file. """ - if isinstance(data, AnnData): - adata = data - elif isinstance(data, MuData) and "atac" in data.mod: - adata = data.mod["atac"] - else: - raise TypeError("Expected AnnData or MuData object with 'atac' modality") + adata = atacutils.fetch_atac_mod(data) if "files" not in adata.uns or "fragments" not in adata.uns["files"]: raise KeyError( @@ -1155,65 +1143,3 @@ def nucleosome_signal( ) return None - - -def fetch_regions_to_df( - fragment_path: str, - features: Union[pd.DataFrame, str], - extend_upstream: int = 0, - extend_downstream: int = 0, - relative_coordinates=False, -) -> pd.DataFrame: - """ - Parse peak annotation file and return it as DataFrame. 
- - Parameters - ---------- - fragment_path - Location of the fragments file (must be tabix indexed). - features - A DataFrame with feature annotation, e.g. genes or a string of format `chr1:1-2000000` or`chr1-1-2000000`. - Annotation has to contain columns: Chromosome, Start, End. - extend_upsteam - Number of nucleotides to extend every gene upstream (2000 by default to extend gene coordinates to promoter regions) - extend_downstream - Number of nucleotides to extend every gene downstream (0 by default) - relative_coordinates - Return the coordinates with their relative position to the middle of the features. - """ - - try: - import pysam - except ImportError: - raise ImportError( - "pysam is not available. It is required to work with the fragments file. Install pysam from PyPI (`pip install pysam`) or from GitHub (`pip install git+https://github.com/pysam-developers/pysam`)" - ) - - if isinstance(features, str): - features = utils.parse_region_string(features) - - fragments = pysam.TabixFile(fragment_path, parser=pysam.asBed()) - n_features = features.shape[0] - - dfs = [] - for i in tqdm( - range(n_features), desc="Fetching Regions..." - ): # iterate over features (e.g. 
def fetch_atac_mod(data: Union[AnnData, MuData]):
    """
    Return the AnnData object holding the ATAC modality.

    An AnnData object is returned as is; for a MuData object
    its 'atac' modality is returned.

    Raises
    ------
    TypeError
        If ``data`` is neither an AnnData object nor a MuData object with an 'atac' modality.
    """
    if isinstance(data, AnnData):
        return data

    # TODO: check that ATAC-seq slot is present with this name
    if isinstance(data, MuData) and "atac" in data.mod:
        return data.mod["atac"]

    raise TypeError("Expected AnnData or MuData object with 'atac' modality")
# Create a minimal dataset that can be used for testing functions
# that are difficult to test with generated data.
#
# The script subsets a 10x multiome dataset to 1000 quality cells, a set of
# marker genes, the peaks around these genes and the fragments overlapping them.
#
# NOTE(review): output paths are relative ("data/atac/") while the stored
# fragments path is prefixed with "tests/" -- presumably the script is run
# from within tests/ and the tests from the repository root; confirm.

import os

import pandas as pd
import pyranges as pr
import pysam
import scanpy as sc

import muon as mu


# Location of the source dataset; can be overridden with the environment
# variable so the script is usable outside of the original machine
data_dir = os.environ.get("MUON_TEST_DATA_DIR", "/home/max/projects/pbmc_multimodal/PBMC_rep1/")
# muon_dir = "/home/max/code/muon/"

outdir = "data/atac/"


mdata = mu.read_10x_h5(os.path.join(data_dir, "filtered_feature_bc_matrix.h5"))

rna = mdata.mod["rna"]
atac = mdata.mod["atac"]


###########################
# RNA
###########################

# Filter cells to 1000 quality cells
rna.var["mt"] = rna.var_names.str.startswith(
    "MT-"
)  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(rna, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True)


mu.pp.filter_obs(rna, "n_genes_by_counts", lambda x: (x >= 200) & (x < 5000))
mu.pp.filter_obs(rna, "pct_counts_mt", lambda x: x < 20)

mu.pp.filter_obs(rna, "total_counts", lambda x: x < 15000)

filter_cells = rna.obs.index[:1000]
mu.pp.filter_obs(rna, filter_cells)

# Filter marker genes
# (the list is kept exactly as it was used to generate the committed test data)
marker_genes = [
    "IL7R",
    "TRAC",
    "ITGB1",
    "CD2",
    "SLC4A10",
    "CD8A",
    "CD8B",
    "CCL5",
    "GNLY",
    "NKG7",
    "CD79A",
    "MS4A1",
    "IGHM",
    "IGHD",
    "IL4R",
    "TCL1A",
    "KLF4",
    "LYZ",
    "S100A8",
    "ITGAM",
    "CD14",
    "FCGR3A",
    "MS4A7",
    "CST3",
    "CLEC10A",
    "IRF8",
    "TCF4",
    "INPP4B",
    "IL32",
    "LTB",
    "SYNE2",
    "ANK3",
    "CDC14A",
    "IL7R",
    "ITGB1",
    "BCL11B",
    "LEF1",
    "SLC8A1",
    "VCAN",
    "BANK1",
    "NEAT1",
    "TCF7L2",
    "CD74",
    "RPS27",
    "CDK6",
    "MAML3",
    "SOX4",
]

mu.pp.filter_var(rna, marker_genes)
rna.obs = pd.DataFrame(index=rna.obs.index)
rna.var = rna.var[["gene_ids", "feature_types", "genome", "interval"]]


# Quick check that data still has some signal
# sc.pp.normalize_total(rna, target_sum=1e4)
# sc.pp.log1p(rna)
# rna.raw = rna
# sc.pp.scale(rna, max_value=10)

# sc.tl.pca(rna, svd_solver='arpack')
# sc.pl.pca(rna, color=['CD2', 'CD79A', 'KLF4', 'IRF8'])
# sc.pl.pca_variance_ratio(rna, log=True)

# sc.pp.neighbors(rna, n_neighbors=10, n_pcs=20)
# sc.tl.leiden(rna, resolution=0.5)
# sc.tl.umap(rna, spread=1., min_dist=.5, random_state=11)
# sc.pl.umap(rna, color="leiden", legend_loc="on data")
# sc.pl.umap(rna, color=['CD2', 'CD79A', 'KLF4', 'IRF8'], legend_loc="on data")

###########################
## ATAC
###########################

# Filter cells to 1000 quality cells
mu.pp.filter_obs(atac, filter_cells)  # From RNA


# Filter peaks around the interesting genes
extension = 5000

regions = mu.atac.tl.get_gene_annotation_from_rna(rna)
regions.Start = regions.Start - extension
regions.End = regions.End + extension

peaks = pd.DataFrame([s.replace(":", "-", 1).split("-") for s in atac.var.interval])
peaks.columns = ["Chromosome", "Start", "End"]
peaks["id"] = atac.var.gene_ids.values

peaks = pr.PyRanges(peaks)
genes = pr.PyRanges(regions)
# NOTE(review): regions are already extended above, so peaks are selected
# within 2 x extension of the genes -- confirm this is intended
genes = genes.slack(extension)
p2 = peaks.overlap(genes)

mu.pp.filter_var(atac, p2.id)
peakann = atac.uns["atac"]["peak_annotation"]
atac.uns["atac"]["peak_annotation"] = peakann[peakann.peak.isin(atac.var.index)]

# Filter fragments around the interesting genes
# and write subsetted fragments file
fragments_file = atac.uns["files"]["fragments"]
os.makedirs(outdir, exist_ok=True)
outfile = os.path.join(outdir, "test_rna_atac_fragments.tsv")

tbx = pysam.TabixFile(fragments_file)

with open(outfile, "w") as file:
    for region in regions.itertuples():
        for f in tbx.fetch(region.Chromosome, region.Start, region.End):
            # Every fetched record is a single line of the fragments file
            file.write(f"{f}\n")

# Compress and create tabix index
pysam.tabix_index(outfile, force=True, seq_col=0, start_col=1, end_col=2)

atac.uns["files"]["fragments"] = "tests/" + outfile + ".gz"

mdata.update()


print(rna.obs.index)
mdata.write(os.path.join(outdir, "test_rna_atac.h5mu"))
rna.write(os.path.join(outdir, "test_rna.h5ad"))

# Make sure file can be read

# mu2 = mu.read_h5mu(os.path.join(outdir, "test_rna_atac.h5mu"))
"Start": [1000, 1, 23642777], + "End": [10000, 9, 23643653], + } + ) + self.mdata = mdata + self.atac = atac + self.test_regions = test_regions + + def test_fetch_regions_to_df(self): + df = ac.fr.fetch_regions_to_df(self.atac.uns["files"]["fragments"], self.test_regions) + + # Fragments should be sorted by start position + np.testing.assert_array_equal(df.Start.sort_values().values, df.Start.values) + assert df.iloc[4, 0] == "chr20" + assert df.iloc[4, 1] == 23642556 + assert df.iloc[4, 2] == 23642844 + assert df.iloc[4, 3] == "GGGCGAATCCTCGATC-1" + + def test_region_pileup(self): + adata = ac.fr.region_pileup( + fragments=self.atac.uns["files"]["fragments"], + cells=self.atac.obs.index.values, + chromosome="chr20", + start=23642777, + end=23643653, + ) + assert adata.X.sum() == 1177 + assert adata.X.sum(axis=0)[111] == 2 + + def test_tss_pileup(self): + + # genes = pd.read_csv( + # io.StringIO( + # """Chromosome,Start,End,gene_id,gene_name + # chr7,92833916,92836594,ENSG00000105810,CDK6 + # chr9,107489765,107489769,ENSG00000136826,KLF4 + # chr10,32935557,32958230,ENSG00000150093,ITGB1""" + # ) + # ) + genes = mu._rna.utils.get_gene_annotation_from_rna(self.mdata) + + adata = ac.fr._tss_pileup( + adata=self.atac, features=genes, extend_upstream=1000, extend_downstream=1000 + ) + + assert adata.X.sum() == 1239536 + assert adata.X.sum(axis=0)[111] == 304 + + def test_count_fragments_features(self): + adata = ac.fr.count_fragments_features(self.mdata) + assert adata.X.sum() == 23852.0 + assert adata.X.sum(axis=0)[0, 42] == 384.0 + + if __name__ == "__main__": unittest.main()