Skip to content

Commit

Permalink
185 add flag for download for ggetrs ensembl ref (#186)
Browse files Browse the repository at this point in the history
* update blast testing since server updated their sorting

* added functions for downloading urls async

* download utils exposed publicly

* add stream feature to reqwest

* include download utility for ensembl ref which downloads files in place

* ran cargo formatting

* include download instructions on site

* bump version
  • Loading branch information
noamteyssier authored Jun 16, 2023
1 parent b5694a1 commit a3081d3
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 11 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ggetrs"
version = "0.1.70"
version = "0.1.71"
edition = "2021"
license = "MIT"
description = "Efficient querying of biological databases from the command line"
Expand All @@ -26,10 +26,11 @@ clap = { version = "4.0.18", features = ["derive"] }
clap_complete = "4.0.3"
ftp = "3.0.1"
futures = "0.3.24"
indicatif = "0.17.5"
mysql = "23.0.1"
pyo3 = { version = "0.16.5", features = ["extension-module", "anyhow"] }
regex = "1.6.0"
reqwest = { version = "0.11.11", features = ["json", "multipart", "blocking"] }
reqwest = { version = "0.11.11", features = ["json", "multipart", "blocking", "stream"] }
serde = { version = "1.0.144", features = ["derive"] }
serde-xml-rs = "0.6.0"
serde_json = "1.0.85"
Expand Down
4 changes: 4 additions & 0 deletions docs/src/ensembl/ref.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Retrieves reference files from the Ensembl FTP site.
| Release | `-r` | `--release` | Release to use - will default to latest release |
| Data Type | `-d` | `--datatype` | Datatype to query for - provided as a comma-separated list |
| Output | `-o` | `--output` | optional filepath to write output to [default=stdout] |
| Download | `-D` | `--download` | Download all requested files to the current working directory |

## Command Line Interface

Expand All @@ -25,6 +26,9 @@ ggetrs ensembl ref -d cdna,dna

# returns the url for the mouse cdna transcriptome and genome
ggetrs ensembl ref -d cdna,dna -s mus_musculus

# downloads the requested files to the current directory
ggetrs ensembl ref -D -d cdna,dna,gtf -s homo_sapiens
```

## Python
Expand Down
12 changes: 6 additions & 6 deletions src/blast/functions/blast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,17 @@ mod testing {
assert_eq!(result.query(), sequence);
assert_eq!(result.results().len(), 1);
assert_eq!(result.results()[0].num, 1);
assert_eq!(result.results()[0].id, "gi|2310188890|ref|NG_029005.2|");
assert_eq!(result.results()[0].definition, "Homo sapiens CASP8 and FADD like apoptosis regulator (CFLAR), RefSeqGene on chromosome 2");
assert_eq!(result.results()[0].accession, "NG_029005");
assert_eq!(result.results()[0].length, 67524);
assert_eq!(result.results()[0].id, "gi|2505182875|ref|NG_168413.1|");
assert_eq!(result.results()[0].definition, "Homo sapiens ATAC-STARR-seq lymphoblastoid active region 16974 (LOC129935398) on chromosome 2");
assert_eq!(result.results()[0].accession, "NG_168413");
assert_eq!(result.results()[0].length, 460);
assert_eq!(result.results()[0].bit_score, 222.718);
assert_eq!(result.results()[0].score, 120);
assert_eq!(result.results()[0].gap_opens, 0);
assert_eq!(result.results()[0].alignment_length, 120);
assert_eq!(result.results()[0].query_start, 1);
assert_eq!(result.results()[0].query_end, 120);
assert_eq!(result.results()[0].subject_start, 4992);
assert_eq!(result.results()[0].subject_end, 5111);
assert_eq!(result.results()[0].subject_start, 105);
assert_eq!(result.results()[0].subject_end, 224);
}
}
4 changes: 4 additions & 0 deletions src/cli/ensembl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ pub enum ModEnsembl {
)]
datatype: Vec<DataType>,

/// Download all files to current directory
#[clap(short = 'D', long, value_parser)]
download: bool,

/// Optional filepath to write output to [default=stdout]
#[clap(short, long)]
output: Option<String>,
Expand Down
10 changes: 10 additions & 0 deletions src/ensembl/cli.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use crate::utils::download_multiple;
use futures::executor::block_on;

use super::{
database, functions::lookup_id, list_species, lookup_symbol, reference, release, search,
DataType,
Expand Down Expand Up @@ -67,9 +70,11 @@ pub fn launch_ensembl_reference(
species: &str,
release: usize,
datatype: &[DataType],
download: bool,
output: &Option<String>,
) -> anyhow::Result<()> {
let files = reference(species, release, datatype)?;

let repr = serde_json::to_string_pretty(&files)?;
match output {
Some(path) => {
Expand All @@ -83,6 +88,11 @@ pub fn launch_ensembl_reference(
println!("{}", repr);
}
}
if download {
eprintln!("Downloading {} files:", files.len());
let urls = files.iter().map(|f| f.url.as_str()).collect::<Vec<&str>>();
block_on(download_multiple(&urls))?;
}
Ok(())
}

Expand Down
6 changes: 4 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ use ggetrs::{
RequestError,
};

fn main() -> Result<(), RequestError> {
#[tokio::main]
async fn main() -> Result<(), RequestError> {
let cli = Cli::parse();
match &cli.command {
Commands::Enrichr(sub) => match sub {
Expand Down Expand Up @@ -140,9 +141,10 @@ fn main() -> Result<(), RequestError> {
species,
release,
datatype,
download,
output,
} => {
launch_ensembl_reference(species, *release, datatype, output)?;
launch_ensembl_reference(species, *release, datatype, *download, output)?;
}
ModEnsembl::Species {
release,
Expand Down
2 changes: 1 addition & 1 deletion src/seq/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub fn launch_seq(
species: &Option<String>,
output: &Option<String>,
) -> Result<()> {
if translate{
if translate {
let results = query(search_terms, false, &None)?;
match output {
Some(path) => {
Expand Down
38 changes: 38 additions & 0 deletions src/utils/download.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
use anyhow::Result;
use futures::{future::join_all, StreamExt};
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use reqwest::Client;
use std::{fs::File, io::Write};

/// Download a file from a URL asynchronously
async fn download_url(url: &str, pb: ProgressBar) -> Result<()> {
let filename = url.split('/').last().unwrap_or("");
let client = Client::new().get(url).send().await?.error_for_status()?;

let size = client.content_length().unwrap_or(0);
pb.set_style(ProgressStyle::default_bar()
.template(
"{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta}) {msg}")?
.progress_chars("#>-"));
pb.set_length(size);
pb.set_message(format!("{}", filename));

let mut file = File::create(filename)?;
let mut stream = client.bytes_stream();
while let Some(item) = stream.next().await {
let chunk = item?;
pb.inc(chunk.len() as u64);
file.write_all(&chunk)?;
}
pb.finish();
Ok(())
}

/// Download multiple URLs concurrently into the current working directory.
///
/// One progress bar per URL is registered with a shared `MultiProgress`, and
/// each URL is fetched by `download_url`. All downloads are driven to
/// completion before the outcome is reported, so one failing URL does not
/// cancel the others.
///
/// # Errors
///
/// Returns the first error, in `urls` order, produced by any individual
/// download. Previously these errors were discarded and the function always
/// returned `Ok(())`.
pub async fn download_multiple(urls: &[&str]) -> Result<()> {
    let mpb = MultiProgress::new();
    let downloads = urls
        .iter()
        .map(|url| download_url(url, mpb.add(ProgressBar::new(0))));

    // `join_all` yields one `Result` per download; collecting into a
    // `Result` surfaces the first failure instead of silently dropping it.
    join_all(downloads)
        .await
        .into_iter()
        .collect::<Result<Vec<()>>>()?;
    Ok(())
}
2 changes: 2 additions & 0 deletions src/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
pub mod autocomplete;
mod download;
mod fasta;
pub mod parsing;
mod ping;
pub use download::download_multiple;
pub use fasta::{FastaRecord, FastaRecords};
pub use ping::ping;

0 comments on commit a3081d3

Please sign in to comment.