Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data downloading URL, actions, and dependencies. #132

Merged
merged 6 commits into from
Aug 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7,739 changes: 2,106 additions & 5,633 deletions docs/notebooks/single_cell/02_2_1_scatac_multiome_pancreas_priors_train.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dentate gyrus (scRNA-seq) | Training with a RNA-dynamics kNN-graph"
"## Dentate gyrus (scRNA-seq) | Training with an RNA-dynamics kNN-graph"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with a RNA-dynamics kNN-graph\n"
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with an RNA-dynamics kNN-graph\n"
]
},
{
Expand Down
3 changes: 2 additions & 1 deletion mubind/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
simulate_data,
simulate_xy,
cisbp_hs, genre,
archetypes, archetypes_anno, archetypes_clu, # pwm datasets
archetypes, archetypes_anno, archetypes_clu, archetypes_pickle, # pwm datasets
pancreas_multiome,
pancreas_rna,
pancreas_rna_pytest,
pancreas_atac,
)
84 changes: 71 additions & 13 deletions mubind/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import pandas as pd
import os
import pickle

import urllib.request

# Class for reading training/testing SELEX dataset files.
class SelexDataset(tdata.Dataset):
def __init__(self, df, n_rounds=None, enr_series=True, single_encoding_step=False, store_rev=False,
Expand Down Expand Up @@ -476,23 +477,46 @@ def genre(**kwargs):
return pwms

def archetypes_anno(**kwargs):
    """Load the archetype cluster annotation table, downloading it on first use.

    Parameters
    ----------
    kwargs : must contain 'url', the download location of motif_annotations.xlsx.

    Returns
    -------
    pandas.DataFrame with the 'Archetype clusters' sheet of motif_annotations.xlsx.
    """
    url = kwargs['url']
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # cache the workbook on disk to avoid re-downloading on every call
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(url, archetypes_path)

    anno = pd.read_excel(archetypes_path, sheet_name='Archetype clusters')
    return anno

def archetypes_clu(**kwargs):
    """Load the per-motif table ('Motifs' sheet) of motif_annotations.xlsx.

    Parameters
    ----------
    kwargs : must contain 'url', the download location of motif_annotations.xlsx.

    Returns
    -------
    pandas.DataFrame with the 'Motifs' sheet.
    """
    url = kwargs['url']
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # bug fix: previously this function assumed archetypes_anno() had already
    # downloaded the workbook and crashed otherwise; mirror its caching logic
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(url, archetypes_path)

    clu = pd.read_excel(archetypes_path, sheet_name='Motifs')
    return clu

def archetypes_pickle(**kwargs):
    """Load the precomputed archetype PWM dictionary, downloading the pickle on first use.

    Parameters
    ----------
    kwargs : must contain 'url', the download location of archetypes_data.pkl.

    Returns
    -------
    dict mapping motif name -> PWM, exactly as stored in the pickle.
    """
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'archetypes_data.pkl')

    # cache on disk to avoid future redownloads
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    # NOTE(review): unpickling downloaded data executes arbitrary code if the
    # source is ever compromised — acceptable only because the URL is pinned.
    # Removed the dead anno/clu calls that invoked archetypes_anno() with the
    # pickle URL (wrong function, wrong URL); their results were never used.
    with open(archetypes_path, 'rb') as fh:
        ppm_by_name = pickle.load(fh)
    return ppm_by_name


def archetypes_meme(**kwargs):
ppm_by_name = {}
archetypes_dir = 'data/archetypes'
# read PFM across meme files
for f in os.listdir(archetypes_dir):
if f.endswith('.meme'):
Expand All @@ -513,7 +537,22 @@ def archetypes(**kwargs):
ppm.index = 'A', 'C', 'G', 'T'
ppm_by_name[name] = ppm
print('# motifs loaded %i' % (len(ppm_by_name)))
return ppm_by_name

def archetypes(**kwargs):
# annotation table
url = 'https://www.dropbox.com/scl/fi/odxcg72nj3djbfz6r9nq8/motif_annotations.xlsx?rlkey=qlbyx9m7dj6qqui9ct80q9ejc&dl=1'
kwargs['url'] = url
archetypes_dir = 'data/archetypes'
anno = archetypes_anno(**kwargs)
clu = archetypes_anno(**kwargs)

# PWM weights
url = 'https://www.dropbox.com/scl/fi/gytniua2uay1p6st0svh9/archetypes_data.pkl?rlkey=qe7mzhwaiqfpkjbdj31ijx193&dl=1'
kwargs['url'] = url
ppm_by_name = archetypes_pickle(**kwargs)

# print(clu)
# return non-redundant groups
reduced_groups = []
for k in anno['Seed_motif']:
Expand All @@ -530,10 +569,27 @@ def pancreas_rna(
):
from scanpy import read
# rna
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=0'
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=1'
print(os.path.exists(file_path), file_path)
# print('reading RNA')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
adata.var_names_make_unique()
# print('opening RNA successful')
return adata

def pancreas_rna_pytest(
    file_path: Optional[
        Union[str, Path]
    ] = "data/scatac/pancreas_multiome/pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad"
):
    """Download (if missing) and open the pytest-sized pancreas multiome RNA dataset.

    Parameters
    ----------
    file_path : local cache path for the .h5ad file; scanpy re-downloads from
        the Dropbox backup URL only when this file does not exist.

    Returns
    -------
    The object returned by scanpy.read (an AnnData) with unique var names.
    """
    from scanpy import read
    # dl=1 forces a direct file download instead of the Dropbox preview page
    url = 'https://www.dropbox.com/scl/fi/93hw0wru56ljryo6m17d9/pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad?rlkey=x8r14un3gu8ahyipcylwxytns&dl=1'
    adata = read(file_path, backup_url=url, sparse=True, cache=True)
    adata.var_names_make_unique()
    return adata

def pancreas_atac(
Expand All @@ -543,9 +599,11 @@ def pancreas_atac(
):
from scanpy import read
# atac
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=0'
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=1'
print(os.path.exists(file_path), file_path)
print('reading ATAC')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
print('opening ATAC successful')
adata.var_names_make_unique()
return adata

Expand Down
8 changes: 4 additions & 4 deletions mubind/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,10 +828,10 @@ def closure():
self.r2_history += r2_history

def corr_etas_libsizes(self, train):
    """Spearman correlation between the model's log-etas and per-round library sizes.

    Parameters
    ----------
    train : a DataLoader whose dataset exposes a `rounds` tensor of counts
        (library size per round = column sum). -- assumes torch tensors when
        self.device is not 'cpu'; TODO confirm against callers.

    Returns
    -------
    (label, result) tuple where result is scipy.stats.spearmanr's output.
    """
    if self.device != 'cpu':
        # move both arrays to host before handing them to scipy;
        # bug fix: lib_sizes previously stayed on the GPU, and the old code
        # also called .numpy() before .cpu(), which fails on CUDA tensors
        etas = self.get_log_etas().detach().cpu().numpy().flatten()
        lib_sizes = train.dataset.rounds.sum(axis=0).detach().cpu().numpy().flatten()
    else:
        etas = self.get_log_etas().detach().flatten()
        lib_sizes = train.dataset.rounds.sum(axis=0).flatten()
    return 'etas corr with lib_sizes (before refinement)', spearmanr(etas, lib_sizes)

def optimize_iterative(self,
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ maintainers = [
urls.Documentation = "https://mubind.readthedocs.io/"
urls.Source = "https://github.com/theislab/mubind"
urls.Home-page = "https://github.com/theislab/mubind"
version = "0.2.1"
version = "0.2.2"
requires-python = ">=3.9" # for GPU-rapids
license = {file = "LICENSE"}
readme = "README.md"
dependencies = ["seaborn", "scikit-learn", "pandas", "unidecode", "matplotlib", "scipy", "numpy>=1.22", "torch",
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "anndata"]
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "scanpy"]

[project.optional-dependencies]
dev = [
Expand Down
11 changes: 11 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import torch.optim as topti
import torch.utils.data as tdata
import mubind as mb
import pytest

def test_dataset_index_int():
import warnings
Expand Down Expand Up @@ -49,7 +50,17 @@ def test_seq_conversion():

assert (x2 == strs).all()

def test_download_and_load_dataset():
    """Smoke test: the pytest-sized pancreas RNA dataset downloads and opens."""
    # dropped the unused `import warnings` and the pointless `return None`;
    # a test with no assertion passes vacuously, so at least check the result
    ad = mb.datasets.pancreas_rna_pytest()
    assert ad is not None

@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
def test_archetypes():
    """Smoke test: archetype annotations and PWM pickle download and load."""
    # dropped the unused `import warnings` and the pointless `return None`;
    # assert so the test actually fails if loading silently returns nothing
    data = mb.datasets.archetypes()
    assert data is not None

def test_dataset_memory_increase():
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Expand Down
Loading