Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add configurable protocol support to ensembl reference download #2649

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions bio/reference/ensembl-annotation/test/ensembl_annotation_https.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Test rule: fetch the gzipped GTF annotation for human GRCh37 (release 105)
# with the download scheme overridden to HTTPS.
rule get_annotation_https_protocol_gz:
    output:
        "refs/annotation.gtf.gz",
    params:
        species="homo_sapiens",
        release="105",
        build="GRCh37",
        flavor="",
        # Scheme placed in front of "://ftp.ensembl.org/..." by the wrapper;
        # the wrapper uses "ftp" when this parameter is absent.
        protocol="https",
    log:
        "logs/get_annotation.log",
    cache: "omit-software"  # between-workflow caching saves space and time (see docs)
    wrapper:
        "master/bio/reference/ensembl-annotation"
4 changes: 3 additions & 1 deletion bio/reference/ensembl-annotation/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")
protocol = snakemake.params.get("protocol", "ftp")


branch = ""
Expand Down Expand Up @@ -48,7 +49,7 @@
)


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{gtf_release}.{flavor}{suffix}".format(
url = "{protocol}://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{gtf_release}.{flavor}{suffix}".format(
release=release,
gtf_release=gtf_release,
build=build,
Expand All @@ -58,6 +59,7 @@
suffix=suffix,
flavor=flavor,
branch=branch,
protocol=protocol,
)


Expand Down
1 change: 1 addition & 0 deletions bio/reference/ensembl-sequence/test/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ rule get_single_chromosome:
wrapper:
"master/bio/reference/ensembl-sequence"


rule get_multiple_chromosome:
output:
"refs/chr1_and_chr2.fasta",
Expand Down
14 changes: 14 additions & 0 deletions bio/reference/ensembl-sequence/test/ensembl_sequence_https.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Test rule: fetch the S. cerevisiae R64-1-1 DNA FASTA (release 98) with the
# download scheme overridden from the wrapper's "ftp" default.
# NOTE(review): this file is named ensembl_sequence_https.smk, yet the rule
# name and the protocol value both say plain "http" -- confirm which scheme
# this test is meant to cover.
rule get_genome_http_protocol:
    output:
        "refs/genome.fasta",
    params:
        species="saccharomyces_cerevisiae",
        datatype="dna",
        build="R64-1-1",
        release="98",
        # Scheme placed in front of "://ftp.ensembl.org/..." by the wrapper.
        protocol="http",
    log:
        "logs/get_genome.log",
    cache: "omit-software"  # between-workflow caching saves space and time (see docs)
    wrapper:
        "master/bio/reference/ensembl-sequence"
3 changes: 2 additions & 1 deletion bio/reference/ensembl-sequence/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build
protocol = snakemake.params.get("protocol", "ftp")

branch = ""
if release >= 81 and build == "GRCh37":
Expand Down Expand Up @@ -51,7 +52,7 @@
)

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"
url_prefix = f"{protocol}://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
Expand Down
6 changes: 3 additions & 3 deletions bio/reference/ensembl-variation/test/chrom_wise.smk
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Test rule: request the release-104 human GRCh38 variation VCF with the
# `chromosome` parameter set to 21.
# Every entry ends with a trailing comma (snakefmt style) and each keyword
# appears exactly once.
rule get_variation:
    output:
        vcf="refs/variation.vcf.gz",
    params:
        species="homo_sapiens",
        release="104",
        build="GRCh38",
        type="all",  # one of "all", "somatic", "structural_variation"
        chromosome="21",
    log:
        "logs/get_variation.log",
    wrapper:
        "master/bio/reference/ensembl-variation"
14 changes: 14 additions & 0 deletions bio/reference/ensembl-variation/test/ensembl_variation_https.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Test rule: fetch the S. cerevisiae R64-1-1 variation VCF (release 98) with
# the download scheme overridden to HTTPS.
rule get_variation_https_protocol:
    output:
        vcf="refs/variation.vcf.gz",
    params:
        species="saccharomyces_cerevisiae",
        release="98",
        build="R64-1-1",
        type="all",
        # Scheme placed in front of "://ftp.ensembl.org/..." by the wrapper;
        # the wrapper uses "ftp" when this parameter is absent.
        protocol="https",
    log:
        "logs/get_variation.log",
    cache: "omit-software"  # between-workflow caching saves space and time (see docs)
    wrapper:
        "master/bio/reference/ensembl-variation"
8 changes: 4 additions & 4 deletions bio/reference/ensembl-variation/test/grch37.smk
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Test rule: request the release-100 human GRCh37 variation VCF, passing a
# FASTA index (.fai) as input alongside the download parameters.
# Every entry ends with a trailing comma (snakefmt style) and each keyword
# appears exactly once.
rule get_variation_with_contig_lengths:
    input:
        fai="refs/grch37.fasta.fai",
    output:
        vcf="refs/variation.vcf.gz",
    params:
        species="homo_sapiens",
        release="100",
        build="GRCh37",
        type="all",  # one of "all", "somatic", "structural_variation"
    log:
        "logs/get_variation.log",
    wrapper:
        "master/bio/reference/ensembl-variation"
8 changes: 3 additions & 5 deletions bio/reference/ensembl-variation/test/old_release.smk
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
# Test rule: request the release-98 S. cerevisiae R64-1-1 variation VCF.
# Every entry ends with a trailing comma (snakefmt style) and each keyword
# appears exactly once.
rule get_variation:
    output:
        vcf="refs/variation.vcf.gz",
        # optional: add fai to get VCF with annotated contig lengths (as required by GATK)
        # fai="refs/genome.fasta.fai"
    params:
        species="saccharomyces_cerevisiae",
        release="98",
        build="R64-1-1",
        type="all",  # one of "all", "somatic", "structural_variation"
    log:
        "logs/get_variation.log",
    wrapper:
        "master/bio/reference/ensembl-variation"


8 changes: 4 additions & 4 deletions bio/reference/ensembl-variation/test/with_fai.smk
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Test rule: request the release-98 S. cerevisiae R64-1-1 variation VCF,
# passing a FASTA index (.fai) as input alongside the download parameters.
# Every entry ends with a trailing comma (snakefmt style) and each keyword
# appears exactly once.
rule get_variation_with_contig_lengths:
    input:
        fai="refs/genome.fasta.fai",
    output:
        vcf="refs/variation.vcf.gz",
    params:
        species="saccharomyces_cerevisiae",
        release="98",
        build="R64-1-1",
        type="all",  # one of "all", "somatic", "structural_variation"
    log:
        "logs/get_variation.log",
    wrapper:
        "master/bio/reference/ensembl-variation"
4 changes: 3 additions & 1 deletion bio/reference/ensembl-variation/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
build = snakemake.params.build
type = snakemake.params.type
chromosome = snakemake.params.get("chromosome", "")
protocol = snakemake.params.get("protocol", "ftp")


branch = ""
Expand Down Expand Up @@ -63,12 +64,13 @@
species_filename = species if release >= 91 else species.capitalize()

urls = [
"ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.vcf.gz".format(
"{protocol}://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.vcf.gz".format(
release=release,
species=species,
suffix=suffix,
species_filename=species_filename,
branch=branch,
protocol=protocol,
)
for suffix in suffixes
]
Expand Down
14 changes: 14 additions & 0 deletions bio/vep/cache/test/vep_cache_https.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Test rule: download the VEP cache for S. cerevisiae R64-1-1 (release 98)
# into a directory output, with the download scheme overridden to HTTPS.
rule get_vep_cache_https_protocol:
    output:
        directory("resources/vep/cache"),
    params:
        species="saccharomyces_cerevisiae",
        build="R64-1-1",
        release="98",
        # Scheme used in the wrapper's curl URL; the wrapper uses "ftp" when
        # this parameter is absent.
        protocol="https",
    log:
        "logs/vep/cache.log",
    cache: "omit-software"  # between-workflow caching saves space and time (see docs)
    wrapper:
        "master/bio/vep/cache"
3 changes: 2 additions & 1 deletion bio/vep/cache/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


extra = snakemake.params.get("extra", "")
protocol = snakemake.params.get("protocol", "ftp")

try:
release = int(snakemake.params.release)
Expand All @@ -24,7 +25,7 @@
)
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
shell(
"curl -L ftp://ftp.ensembl.org/pub/release-{snakemake.params.release}/"
"curl -L {protocol}://ftp.ensembl.org/pub/release-{snakemake.params.release}/"
"variation/{vep_dir}/{cache_tarball} "
"-o {tmpdir}/{cache_tarball} {log}"
)
Expand Down
69 changes: 64 additions & 5 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5329,6 +5329,22 @@ def test_ensembl_sequence_old_release():
)


@skip_if_not_modified
def test_ensembl_sequence_https():
    """Run the ensembl-sequence wrapper with its protocol-override Snakefile."""
    snakefile = "ensembl_sequence_https.smk"
    # No explicit target is given, so Snakemake builds the Snakefile's default target.
    command = ["snakemake", "-s", snakefile, "--cores", "1", "--use-conda", "-F"]
    run("bio/reference/ensembl-sequence", command)
)


@skip_if_not_modified
def test_ensembl_sequence_chromosome():
run(
Expand Down Expand Up @@ -5378,6 +5394,23 @@ def test_ensembl_annotation_gtf_gz():
)


@skip_if_not_modified
def test_ensembl_annotation_gtf_https_gz():
    """Build the gzipped GTF target from the HTTPS annotation Snakefile."""
    target = "refs/annotation.gtf.gz"
    snakefile = "ensembl_annotation_https.smk"
    command = ["snakemake", "--cores", "1", target, "--use-conda", "-F", "-s", snakefile]
    run("bio/reference/ensembl-annotation", command)


@skip_if_not_modified
def test_ensembl_variation():
run(
Expand Down Expand Up @@ -5428,17 +5461,26 @@ def test_ensembl_variation_with_contig_lengths():


@skip_if_not_modified
def test_ensembl_variation_old_release_https_protocol():
    """Run the ensembl-variation wrapper with the HTTPS Snakefile.

    The Snakefile used here requests release 98 with ``protocol="https"``,
    so the test name states both facts instead of reusing the generic
    ``old_release`` name.
    """
    run(
        "bio/reference/ensembl-variation",
        [
            "snakemake",
            "-s",
            "ensembl_variation_https.smk",
            "--cores",
            "1",
            "--use-conda",
            "-F",
        ],
    )


@skip_if_not_modified
def test_ega_fetch():
    """Run the ega/fetch wrapper test workflow for a single CRAM target."""
    target = "data/EGAF00007243774.cram"
    command = ["snakemake", "--cores", "1", "--use-conda", "-F", target]
    run("bio/ega/fetch", command)


Expand Down Expand Up @@ -5801,11 +5843,28 @@ def test_vep_cache():
)


@skip_if_not_modified
def test_vep_cache_https_protocol():
    """Build the VEP cache directory using the HTTPS test Snakefile."""
    run(
        "bio/vep/cache",
        [
            "snakemake",
            "--cores",
            "1",
            "resources/vep/cache",
            "--use-conda",
            "-F",
            # Without -s the default Snakefile is used and the rule that sets
            # protocol="https" is never executed.
            "-s",
            "vep_cache_https.smk",
        ],
    )


@skip_if_not_modified
def test_vep_plugins():
    """Build the VEP plugins directory from the plugins wrapper's default Snakefile."""
    # No -s override here: vep_cache_https.smk is a test file of the
    # bio/vep/cache wrapper, not of bio/vep/plugins.
    run(
        "bio/vep/plugins",
        ["snakemake", "--cores", "1", "resources/vep/plugins", "--use-conda", "-F"],
    )


Expand Down
Loading