Fixes

broadinstitute · Jan 8, 2024 · aabfe8a · aabfe8a
1 parent f453a66
commit aabfe8a
Show file tree

Hide file tree

Showing 8 changed files with 411 additions and 367 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2019, Broad Institute
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.rst b/README.rst
@@ -0,0 +1,43 @@
+variantreviewparty
+""""""""""""""""""
+
+|GitHub release| |Generic badge| |PyPI version variantreviewparty|
+
+.. |GitHub release| image:: https://img.shields.io/github/release/broadinstitute/variantreviewparty.svg
+   :target: https://github.com/broadinstitute/variantreviewparty/releases/
+
+.. |Generic badge| image:: https://img.shields.io/badge/Docker-v0.0.1-blue.svg
+   :target: https://console.cloud.google.com/gcr/images/broad-dsp-lrma/US/lr-variantreviewparty
+
+.. |PyPI version variantreviewparty| image:: https://img.shields.io/pypi/v/variantreviewparty.svg
+   :target: https://pypi.python.org/pypi/variantreviewparty/
+
+VariantReviewParty is a Python library for viewing read-level data spanning variants across thousands of samples.
+
+Documentation for the ``VariantReviewParty`` API can be found on the `documentation page <https://broadinstitute.github.io/variantreviewparty/>`_.
+
+Installation
+------------
+
+``pip`` is recommended for VariantReviewParty installation.
+
+::
+
+   pip install variantreviewparty
+
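+A pre-built Docker image including all dependencies is also available (see the Docker badge above). To install the development version from source instead, clone the repository and install it in editable mode.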
+
+::
+
+   git clone https://github.com/broadinstitute/variantreviewparty.git
+   pip install -e variantreviewparty/
+
+Getting help
+------------
+
+If you encounter bugs or have questions/comments/concerns, please file an issue on our `GitHub page <https://github.com/broadinstitute/variantreviewparty/issues>`_.
+
+Developers' guide
+-----------------
+
+For information on contributing to VariantReviewParty development, visit our `developer documentation <DEVELOP.md>`_.
diff --git a/playground.ipynb b/playground.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,11 @@
+[build-system]
+requires = ["maturin>=1.0,<2.0"]
+build-backend = "maturin"
+
+[project]
+name = "genomeshader"
+
+[tool.maturin]
+python-source = "python"
+# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
+features = ["pyo3/extension-module"]
diff --git a/python/genomeshader/view.py b/python/genomeshader/view.py
@@ -2,6 +2,7 @@
 import re
 import warnings
 from enum import Enum
+from typing import Union, List
 
 import polars as pl
 import holoviews as hv
@@ -37,16 +38,21 @@ def __init__(self,
         self.session_name = session_name
 
         if gcs_session_dir is None:
-            bucket = os.environ['GOOGLE_BUCKET']
-            gcs_session_dir = f"{bucket}/GenomeShader/{session_name}"
+            if 'GOOGLE_BUCKET' in os.environ:
+                bucket = os.environ['GOOGLE_BUCKET']
+                gcs_session_dir = f"{bucket}/GenomeShader/{session_name}"
+            else:
+                raise ValueError(
+                    "gcs_session_dir is None and "
+                    "GOOGLE_BUCKET is not set in environment variables"
+                )
 
         self._validate_gcs_session_dir(gcs_session_dir)
         self.gcs_session_dir = gcs_session_dir
 
         self.genome_build: GenomeBuild = genome_build
 
-        self.reads = set()
-        self.loci = set()
+        self._session = _init()
 
     def _validate_gcs_session_dir(self, gcs_session_dir: str):
         gcs_pattern = re.compile(
@@ -68,38 +74,37 @@ def _validate_session_name(self, session_name: str):
 
     def __str__(self):
         return (
-            f'GenomeShader: '
-            f'session_name={self.session_name}, '
-            f'gcs_session_dir={self.gcs_session_dir}, '
-            f'genome_build={self.genome_build}'
+            f'GenomeShader:\n'
+            f' - session_name: {self.session_name}\n'
+            f' - gcs_session_dir: {self.gcs_session_dir}\n'
+            f' - genome_build: {self.genome_build}\n'
         )
 
     def get_session_name(self):
         return self.session_name
 
-    def attach_reads(self, gcs_path: str):
-        if gcs_path.endswith('.bam') or gcs_path.endswith('.cram'):
-            self.reads.add(gcs_path)
-        else:
-            bams = gcs_list_files_of_type(gcs_path, ".bam")
-            crams = gcs_list_files_of_type(gcs_path, ".cram")
+    def attach_reads(self, gcs_paths: Union[str, List[str]]):
+        if isinstance(gcs_paths, str):
+            gcs_paths = [gcs_paths]  # Convert single string to list
 
-            self.reads.update(bams)
-            self.reads.update(crams)
+        for gcs_path in gcs_paths:
+            if gcs_path.endswith(".bam") or gcs_path.endswith(".cram"):
+                self._session.attach_reads([gcs_path])
+            else:
+                bams = _gcs_list_files_of_type(gcs_path, ".bam")
+                crams = _gcs_list_files_of_type(gcs_path, ".cram")
 
-    def attach_locus(self, locus: str):
-        pieces = re.split("[:-]", re.sub(",", "", locus))
+                self._session.attach_reads(bams)
+                self._session.attach_reads(crams)
 
-        chr = pieces[0]
-        start = int(pieces[1])
-        stop = int(pieces[2]) if len(pieces) > 2 else start
-
-        self.loci.add((chr, start, stop))
+    def attach_loci(self, loci: Union[str, List[str]]):
+        if isinstance(loci, str):
+            self._session.attach_loci([loci])
+        else:
+            self._session.attach_loci(loci)
 
     def stage(self):
-        df = stage_data(self.gcs_session_dir, self.reads, self.loci)
-
-        return df
+        self._session.stage()
 
     def show(self,
              locus: str,
@@ -116,6 +121,8 @@ def show(self,
         df = pl.read_parquet(filename)
         df = df.sort(["sample_name", "query_name", "reference_start"])
 
+        print(filename)
+
         y0s = []
         y0 = 0
         if collapse:
@@ -144,6 +151,7 @@ def show(self,
                 y0s.append(y0)
 
         df = df.with_columns(pl.Series(name="read_num", values=y0s))
+        df = df.with_columns(pl.Series(name="height", values=[1.0]*len(y0s)))
 
         df = df.with_columns(
             pl.col("read_num").alias("y0") * -1 - pl.col("height") / 2
@@ -180,8 +188,11 @@ def show(self,
             default_tools=['reset', 'save']
         )
 
+    def print(self):
+        self._session.print()
+
 
-def init(session_name,
+def init(session_name: str,
          gcs_session_dir: str = None,
          genome_build: GenomeBuild = GenomeBuild.GRCh38) -> GenomeShader:
     session = GenomeShader(session_name=session_name,

diff --git a/src/alignment.rs b/src/alignment.rs
@@ -260,8 +260,6 @@ fn extract_reads(bam_path: &String, chr: String, start: u64, stop: u64) -> DataF
 }
 
 pub fn stage_data(cache_path: PathBuf, bam_paths: &HashSet<String>, loci: &HashSet<(String, u64, u64)>) -> Result<HashMap<(String, u64, u64), PathBuf>, Box<dyn std::error::Error>> {
-    gcs_authorize_data_access();
-
     loci.par_iter()
         .progress_count(loci.len() as u64)
         .for_each(|l| {

diff --git a/src/lib.rs b/src/lib.rs
@@ -8,7 +8,7 @@ pub mod layout;
 use app::{model, update, exit};
 use events::raw_window_event;
 use alignment::stage_data;
-use storage::gcs_list_files_of_type;
+use storage::{_gcs_list_files_of_type,gcs_authorize_data_access};
 use layout::*;
 
 use std::{collections::{HashSet, HashMap}, path::PathBuf, cell::RefCell};
@@ -23,7 +23,7 @@ thread_local!(static GLOBAL_DATA: RefCell<PyDataFrame> = RefCell::new(PyDataFram
 
 #[pyclass]
 pub struct Session {
-    bams: HashSet<String>,
+    reads: HashSet<String>,
     loci: HashSet<(String, u64, u64)>,
     staged_data: HashMap<(String, u64, u64), PathBuf>
 }
@@ -33,14 +33,24 @@ impl Session {
     #[new]
     fn new() -> Self {
         Session {
-            bams: HashSet::new(),
+            reads: HashSet::new(),
             loci: HashSet::new(),
             staged_data: HashMap::new()
         }
     }
 
-    fn attach_bams(&mut self, bams: Vec<String>) {
-        self.bams = bams.into_iter().collect();
+    fn attach_reads(&mut self, reads: Vec<String>) -> PyResult<()> {
+        for read in &reads {
+            if !read.ends_with(".bam") && !read.ends_with(".cram") {
+                return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                    format!("File '{}' is not a .bam or .cram file.", read)
+                ));
+            }
+        }
+
+        self.reads = reads.into_iter().collect();
+
+        Ok(())
     }
 
     fn parse_locus(&self, locus: String) -> PyResult<(String, u64, u64)> {
@@ -100,9 +110,11 @@ impl Session {
     }
 
     fn stage(&mut self) -> PyResult<()> {
+        gcs_authorize_data_access();
+
         let cache_path = std::env::temp_dir();
 
-        match stage_data(cache_path, &self.bams, &self.loci) {
+        match stage_data(cache_path, &self.reads, &self.loci) {
             Ok(staged_data) => { self.staged_data = staged_data; },
             Err(_) => {
                 return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
@@ -158,9 +170,9 @@ impl Session {
     }
 
     fn print(&self) {
-        println!("BAMs:");
-        for bam in &self.bams {
-            println!(" - {}", bam);
+        println!("Reads:");
+        for reads in &self.reads {
+            println!(" - {}", reads);
         }
 
         println!("Loci:");
@@ -176,7 +188,7 @@ impl Session {
 }
 
 #[pyfunction]
-fn init() -> PyResult<Session> {
+fn _init() -> PyResult<Session> {
     Ok(Session::new())
 }
 
@@ -185,8 +197,8 @@ fn init() -> PyResult<Session> {
 /// import the module.
 #[pymodule]
 fn genomeshader(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(gcs_list_files_of_type, m)?)?;
-    m.add_function(wrap_pyfunction!(init, m)?)?;
+    m.add_function(wrap_pyfunction!(_gcs_list_files_of_type, m)?)?;
+    m.add_function(wrap_pyfunction!(_init, m)?)?;
 
     Ok(())
 }
diff --git a/src/storage.rs b/src/storage.rs
@@ -29,6 +29,10 @@ pub fn gcs_authorize_data_access() {
         .output()
         .expect("Failed to execute command");
 
+    if !output.status.success() {
+        panic!("{}", String::from_utf8_lossy(&output.stderr));
+    }
+
     // Decode the output and remove trailing newline
     let token = String::from_utf8(output.stdout)
         .expect("Failed to decode output")
@@ -40,7 +44,7 @@ pub fn gcs_authorize_data_access() {
 }
 
 #[pyfunction]
-pub fn gcs_list_files_of_type(path: String, suffix: &str) -> PyResult<Vec<String>> {
+pub fn _gcs_list_files_of_type(path: String, suffix: &str) -> PyResult<Vec<String>> {
     let file_list = gcs_list_files(&path).unwrap();
 
     let bam_files: Vec<_> = file_list.iter().flat_map(|fs| {