Skip to content

Commit

Permalink
Refactoring to speed up loading / import
Browse files Browse the repository at this point in the history
  • Loading branch information
shz9 committed Jun 11, 2024
1 parent 45c9462 commit 8a566e0
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ not work well for very large datasets with millions of variants and it causes ov
- Fixed major bug in how LD window thresholds that are passed to `plink1.9` are computed.
- Fixed in-place `fillna` in `from_plink_table` in `LDMatrix` to conform to latest `pandas` API.
- Updated `run_shell_script` to check for and capture errors.
- Refactored code to slightly reduce import/load times.

### Added

Expand Down
4 changes: 2 additions & 2 deletions bin/magenpy_ld
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ print(fr"""
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Compute LD matrix and output in Zarr format >
< Compute LD matrix and store in Zarr format >
""")

parser = argparse.ArgumentParser(description="""
Commandline arguments for LD matrix computation
Commandline arguments for LD matrix computation and storage
""")

# General options:
Expand Down
3 changes: 2 additions & 1 deletion magenpy/GenotypeMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import tempfile
import pandas as pd
import numpy as np
from .parsers.plink_parsers import parse_fam_file, parse_bim_file
from .SampleTable import SampleTable


Expand Down Expand Up @@ -821,6 +820,8 @@ def __init__(self,
genome_build=genome_build,
threads=threads)

from .parsers.plink_parsers import parse_fam_file, parse_bim_file

if self.bed_file is not None:
self.bed_file = self.bed_file.replace('.bed', '')

Expand Down
8 changes: 7 additions & 1 deletion magenpy/LDMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np
import pandas as pd
import warnings
from scipy.sparse import csr_matrix, identity, triu, diags
from .utils.model_utils import quantize, dequantize


Expand Down Expand Up @@ -133,6 +132,8 @@ def from_csr(cls,
:param compression_level: The compression level to use with the compressor (1-9).
"""

from scipy.sparse import triu

dtype = np.dtype(dtype)

# Get the upper triangular part of the matrix:
Expand Down Expand Up @@ -1004,6 +1005,8 @@ def compute_ld_scores(self,
ld_scores += mat_sq.dot(annotation_matrix)
ld_scores += mat_sq.T.dot(annotation_matrix)

from scipy.sparse import identity

# Add the contribution of the diagonal:
ld_scores += identity(self.n_snps, dtype=np.float32).dot(annotation_matrix)

Expand Down Expand Up @@ -1249,6 +1252,7 @@ def load_data(self,
if return_as_csr:

from .stats.ld.c_utils import expand_ranges
from scipy.sparse import csr_matrix

indices = expand_ranges(leftmost_idx,
(np.diff(indptr) + leftmost_idx).astype(np.int32),
Expand Down Expand Up @@ -1355,6 +1359,8 @@ def load_rows(self,
else:
indices = self.indices

from scipy.sparse import csr_matrix, diags

mat = csr_matrix(
(
csr_data,
Expand Down
6 changes: 3 additions & 3 deletions magenpy/utils/executors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .system_utils import is_cmd_tool, run_shell_script, available_cpu
import magenpy as mgp
from magenpy import get_option


class plink2Executor(object):
Expand All @@ -22,7 +22,7 @@ def __init__(self, threads='auto', verbose=True):
else:
self.threads = threads

self.plink2_path = mgp.get_option('plink2_path')
self.plink2_path = get_option('plink2_path')

if not is_cmd_tool(self.plink2_path):
raise Exception(f"Did not find the executable for plink2 at: {self.plink2_path}")
Expand Down Expand Up @@ -71,7 +71,7 @@ def __init__(self, threads='auto', verbose=True):
else:
self.threads = threads

self.plink1_path = mgp.get_option('plink1.9_path')
self.plink1_path = get_option('plink1.9_path')

if not is_cmd_tool(self.plink1_path):
raise Exception(f"Did not find the executable for plink at: {self.plink1_path}")
Expand Down
5 changes: 4 additions & 1 deletion magenpy/utils/model_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy import stats


def match_chromosomes(chrom_1, chrom_2, check_patterns=('chr_', 'chr:', 'chr'), return_both=False):
Expand Down Expand Up @@ -204,6 +203,10 @@ def identify_mismatched_snps(gdl,
:param max_removed_per_iter: The maximum proportion of variants removed in each iteration
"""

# Import required modules / functions:
from scipy import stats

# Data preparation:
if chrom is None:
chromosomes = gdl.chromosomes
else:
Expand Down
2 changes: 1 addition & 1 deletion magenpy/utils/system_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import errno
import os
import os.path as osp
import subprocess
Expand Down Expand Up @@ -113,6 +112,7 @@ def makedir(dirs):
try:
os.makedirs(dir_l)
except OSError as e:
import errno
if e.errno != errno.EEXIST:
raise

Expand Down
2 changes: 1 addition & 1 deletion tests/conda_manual_testing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ python_versions=("3.8" "3.9" "3.10" "3.11" "3.12")
for version in "${python_versions[@]}"
do
# Create a new conda environment for the Python version
conda create --name "magenpy$version" python="$version" -y
conda create --name "magenpy$version" python="$version" -y || return 1

# Activate the conda environment
conda activate "magenpy$version"
Expand Down

0 comments on commit 8a566e0

Please sign in to comment.