diff --git a/bio/reference/ensembl-annotation/meta.yaml b/bio/reference/ensembl-annotation/meta.yaml index b8fd0924a79..1c649e15bb3 100644 --- a/bio/reference/ensembl-annotation/meta.yaml +++ b/bio/reference/ensembl-annotation/meta.yaml @@ -6,3 +6,5 @@ output: - Ensemble GTF or GFF3 anotation file params: - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) + - branch: branch of ftp server to download cache data if required (optional; e.g. "plants") + - collection: collection of ftp server to download cache data if required (optional; e.g. "bacteria_0_collection") \ No newline at end of file diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index 3a30ca70bde..b97cdb86ab0 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -6,7 +6,6 @@ rule get_annotation: release="105", build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. - # branch="plants", # optional: specify branch log: "logs/get_annotation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) @@ -22,11 +21,25 @@ rule get_annotation_gz: release="105", build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. 
- # branch="plants", # optional: specify branch log: - "logs/get_annotation.log", + "logs/get_annotation_gz.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "master/bio/reference/ensembl-annotation" + + +rule get_off_branch_annotation: + output: + "refs/off_branch_annotation.gtf", params: - url="http://ftp.ensembl.org/pub", + species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", + release="59", # note latest release varies with url + build="ASM904v1", + branch="bacteria", # optional for off branch genomes + url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source + collection="bacteria_0_collection", # optional set collection source for genome + log: + "logs/get_off_branch_annotation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-annotation" diff --git a/bio/reference/ensembl-annotation/wrapper.py b/bio/reference/ensembl-annotation/wrapper.py index c3d655cbb25..b61f2b67531 100644 --- a/bio/reference/ensembl-annotation/wrapper.py +++ b/bio/reference/ensembl-annotation/wrapper.py @@ -31,6 +31,9 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" +collection = snakemake.params.get("collection", 
"") +if collection: + collection = f"{collection}/" flavor = snakemake.params.get("flavor", "") if flavor: @@ -49,7 +52,7 @@ url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") -url = f"{url}/{branch}release-{release}/{out_fmt}/{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}" +url = f"{url}/{branch}release-{release}/{out_fmt}/{collection}{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}" try: diff --git a/bio/reference/ensembl-sequence/meta.yaml b/bio/reference/ensembl-sequence/meta.yaml index 20c769a0d1d..8b703c9b2bc 100644 --- a/bio/reference/ensembl-sequence/meta.yaml +++ b/bio/reference/ensembl-sequence/meta.yaml @@ -6,3 +6,5 @@ output: - fasta file params: - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) + - branch: branch of ftp server to download cache data if required (optional; e.g. "plants") + - collection: collection of ftp server to download cache data if required (optional; e.g. 
"bacteria_0_collection") \ No newline at end of file diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index fec1c746a4c..aa4961c5027 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -22,11 +22,8 @@ rule get_single_chromosome: build="R64-1-1", release="101", chromosome=["II"], # optional: restrict to one or multiple chromosomes, for multiple see below - # branch="plants", # optional: specify branch log: - "logs/get_genome.log", - params: - url="http://ftp.ensembl.org/pub", + "logs/get_single_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" @@ -40,9 +37,26 @@ rule get_multiple_chromosome: build="R64-1-1", release="101", chromosome=["I", "II"], # optional: restrict to one or multiple chromosomes - # branch="plants", # optional: specify branch log: - "logs/get_genome.log", + "logs/get_multiple_chromosome.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "master/bio/reference/ensembl-sequence" + + +rule get_off_branch_genome: + output: + "refs/off_branch_genome.fasta", + params: + species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", + datatype="dna", + build="ASM904v1", + release="59", # note latest release varies with url + branch="bacteria", # optional for off branch genomes + url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source + collection="bacteria_0_collection", # optional set collection source for genome + log: + "logs/get_off_branch_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index cb2956a6c04..874c9219100 100644 --- 
a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -19,11 +19,16 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" +collection = snakemake.params.get("collection", "") +if collection: + collection = f"{collection}/" + log = snakemake.log_fmt_shell(stdout=False, stderr=True) -spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( - build=build, release=release -) +if branch == "" or branch == "grch37/": + spec = f"{build}" if int(release) > 75 else f"{build}.{release}" +else: + spec = f"{build}" if int(release) > 30 else f"{build}.{release}" suffixes = "" datatype = snakemake.params.get("datatype", "") @@ -52,7 +57,7 @@ url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") spec = spec.format(build=build, release=release) -url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" +url_prefix = f"{url}/{branch}release-{release}/fasta/{collection}{species}/{datatype}/{species.capitalize()}.{spec}" success = False for suffix in suffixes: diff --git a/test.py b/test.py index 605462538c0..e4b62e00c03 100644 --- a/test.py +++ b/test.py @@ -5565,6 +5565,14 @@ def test_ensembl_sequence_chromosomes(): ) +@skip_if_not_modified +def test_ensembl_sequence_off_branch(): + run( + "bio/reference/ensembl-sequence", + ["snakemake", "--cores", "1", "refs/off_branch_genome.fasta", "--use-conda", "-F"], + ) + + @skip_if_not_modified def test_ensembl_sequence_chromosome_old_release(): run( @@ -5598,6 +5606,14 @@ def test_ensembl_annotation_gtf_gz(): ) +@skip_if_not_modified +def test_ensembl_off_branch_annotation_gtf(): + run( + "bio/reference/ensembl-annotation", + ["snakemake", "--cores", "1", "refs/off_branch_annotation.gtf", "--use-conda", "-F"], + ) + + @skip_if_not_modified def test_ensembl_variation(): run(