From 413517fb5a6a856a6d9e1b4a8a15ed672e2ba85e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 15 Sep 2021 17:44:02 +0100 Subject: [PATCH 001/106] Bump pipeline version to 1.4dev --- CHANGELOG.md | 4 ++++ nextflow.config | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eae33fae..2d633f30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unpublished Version / DEV] + +### Enhancements & fixes + ## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15 ### Enhancements & fixes diff --git a/nextflow.config b/nextflow.config index 52382140..131151b9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -147,7 +147,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.3' + version = '1.4dev' } // Function to ensure that resource requirements don't go beyond From a8ed0ad40dfb138352f4f62b69d931cbb02b7814 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 28 Sep 2021 22:28:42 +0100 Subject: [PATCH 002/106] Add support for DDBJ ids --- CHANGELOG.md | 12 ++++++++++++ CITATIONS.md | 3 +++ README.md | 2 +- assets/schema_input.json | 2 +- bin/sra_ids_to_runinfo.py | 10 ++++++---- docs/output.md | 6 +++--- docs/usage.md | 22 +++++++++++----------- nextflow_schema.json | 2 +- 8 files changed, 38 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d633f30..e6dcc246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes +* Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). See examples below: + +! `DDBJ` | +|---------------| +| PRJDB4176 | +| SAMD00114846 | +| DRA008156 | +| DRP004793 | +| DRR171822 | +| DRS090921 | +| DRX162434 | + ## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15 ### Enhancements & fixes diff --git a/CITATIONS.md b/CITATIONS.md index 00f85a25..4a1f2752 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -22,6 +22,9 @@ * [SRA](https://pubmed.ncbi.nlm.nih.gov/21062823/) > Leinonen R, Sugawara H, Shumway M, International Nucleotide Sequence Database Collaboration. The sequence read archive. Nucleic Acids Res. 2011 Jan;39 (Database issue):D19-21. doi: 10.1093/nar/gkq1019. Epub 2010 Nov 9. PubMed PMID: 21062823; PubMed Central PMCID: PMC3013647. +* [DDBJ](https://pubmed.ncbi.nlm.nih.gov/33156332/) + > Fukuda A, Kodama Y, Mashima J, Fujisawa T, Ogasawara O. DDBJ update: streamlining submission and access of human data. Nucleic Acids Res. 2021 Jan 8;49(D1):D71-D75. doi: 10.1093/nar/gkaa982. PubMed PMID: 33156332; PubMed Central PMCID: PMC7779041. + * [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/) > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084. 
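The DDBJ accessions introduced by this patch are routed purely by their letter prefix, alongside the existing SRA / ENA / GEO prefixes. As a rough illustration of that classification rule (a standalone sketch, not part of the patch; the helper name and prefix sets are inferred from the id tables in this series):

```python
import re

# Mirrors the prefix matching used in bin/sra_ids_to_runinfo.py (illustrative only).
ID_REGEX = re.compile(r'[A-Z]+')

def classify(identifier):
    """Guess the source database for a supported accession by its letter prefix."""
    prefix = ID_REGEX.match(identifier).group()
    if prefix in {'PRJDB', 'SAMD', 'DRA', 'DRP', 'DRR', 'DRS', 'DRX'}:
        return 'DDBJ'
    if prefix in {'PRJEB', 'SAMEA', 'ERA', 'ERP', 'ERR', 'ERS', 'ERX'}:
        return 'ENA'
    if prefix in {'GSE', 'GSM'}:
        return 'GEO'
    return 'SRA'

assert classify('PRJDB4176') == 'DDBJ'
assert classify('DRR171822') == 'DDBJ'
assert classify('SRR390278') == 'SRA'
```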
diff --git a/README.md b/README.md index 544862c0..e378eacc 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports SRA / ENA / GEO ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. diff --git a/assets/schema_input.json b/assets/schema_input.json index b4141272..224e5cf7 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,7 +9,7 @@ "items": { "type": "string", "pattern": "^[SEPG][RAS][RXSMPAJXE][EN]?[AB]?\\d{4,9}$", - "errorMessage": "Please provide a valid SRA, GEO or ENA identifier" + "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" } } } diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index ae60f545..6de6fc28 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -18,10 +18,12 @@ ## Example ids supported by this script SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814') -ENA_IDS = ('ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702', 'PRJEB7743', 'SAMEA3121481') +ENA_IDS = ('PRJEB7743', 'SAMEA3121481', 'ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702') +DDBJ_IDS = ('PRJDB4176', 'SAMD00114846', 'DRA008156', 'DRP004793', 'DRR171822', 'DRS090921', 'DRX162434') GEO_IDS = ('GSE18729', 'GSM465244') ID_REGEX = re.compile(r'[A-Z]+') -PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + GEO_IDS}) +PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) + ## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields ## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run @@ -111,7 +113,7 @@ def text(self, encoding=None): def parse_args(args=None): - Description = 'Download and create a run information metadata file from SRA/ENA/GEO identifiers.' + Description = 'Download and create a run information metadata file from SRA / ENA / DDBJ / GEO identifiers.' 
Epilog = 'Example usage: python fetch_sra_runinfo.py ' parser = argparse.ArgumentParser(description=Description, epilog=Epilog) @@ -228,7 +230,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS ids = gse_to_srx(db_id) ## Resolve/expand these ids against SRA URL - elif prefix in ['GSM', 'PRJNA', 'SAMN', 'SRR']: + elif prefix in ['GSM', 'PRJNA', 'SAMN', 'SRR', 'DRA', 'DRP', 'DRR', 'DRS', 'DRX', 'PRJDB', 'SAMD']: ids = id_to_srx(db_id) ## Resolve/expand these ids against ENA URL diff --git a/docs/output.md b/docs/output.md index c0916e6d..4ffc507e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / GEO ids +* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO ids * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### FastQ download @@ -17,9 +17,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d Output files * `fastq/` - * `*.fastq.gz`: Paired-end/single-end reads downloaded from the ENA / SRA. + * `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. * `fastq/md5/` - * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA / SRA. + * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. * `samplesheet/` * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. * `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. diff --git a/docs/usage.md b/docs/usage.md index 398c739a..1e6066db 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,17 +8,17 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from public repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `GEO` | -|--------------|--------------|------------| -| SRR11605097 | ERR4007730 | GSM4432381 | -| SRX8171613 | ERX4009132 | GSE147507 | -| SRS6531847 | ERS4399630 | | -| SAMN14689442 | SAMEA6638373 | | -| SRP256957 | ERP120836 | | -| SRA1068758 | ERA2420837 | | -| PRJNA625551 | PRJEB37513 | | - -If `SRR`/`ERR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. 
+| `SRA` | `ENA` | `DDBJ` | `GEO` |
+|--------------|--------------|--------------|------------|
+| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 |
+| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 |
+| SRS6531847 | ERS4399630 | DRS090921 | |
+| SAMN14689442 | SAMEA6638373 | SAMD00114846 | |
+| SRP256957 | ERP120836 | DRP004793 | |
+| SRA1068758 | ERA2420837 | DRA008156 | |
+| PRJNA625551 | PRJEB37513 | PRJDB4176 | |
+
+If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample.

 The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP, otherwise they will NOT be downloaded. This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc. require pre-existing configuration files in the user's home directory, which makes automation tricky across different platforms and containerisation. We may add this functionality in later releases.

diff --git a/nextflow_schema.json b/nextflow_schema.json
index a391f35d..03612ed2 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -21,7 +21,7 @@
 "pattern": "^\\S+\\.txt$",
 "schema": "assets/schema_input.json",
 "fa_icon": "fas fa-file-excel",
- "description": "File containing SRA/ENA/GEO identifiers one per line to download their associated metadata and FastQ files."
+ "description": "File containing SRA/ENA/DDBJ/GEO identifiers one per line to download their associated metadata and FastQ files."
 },
 "ena_metadata_fields": {
 "type": "string",

From 284f3be60be5bad8df9d1b989e05e84409562249 Mon Sep 17 00:00:00 2001
From: Harshil Patel
Date: Tue, 28 Sep 2021 22:35:34 +0100
Subject: [PATCH 003/106] Fix funny pipe

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6dcc246..94968c8c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). See examples below:

-! `DDBJ` |
+| `DDBJ` |
 |---------------|
 | PRJDB4176 |

From 7cb6af67567d971ca6b25eb082b10ab43e6f9c49 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber"
Date: Mon, 27 Sep 2021 21:53:26 +0200
Subject: [PATCH 004/106] refactor: simplify decoding method

---
 bin/sra_ids_to_runinfo.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py
index 6de6fc28..714a0196 100755
--- a/bin/sra_ids_to_runinfo.py
+++ b/bin/sra_ids_to_runinfo.py
@@ -104,11 +104,9 @@ def body(self):

     def text(self, encoding=None):
         """Return the response's body as a decoded string."""
-        if encoding is not None:
-            return self.body.decode(encoding)
-
-        _, params = cgi.parse_header(self._response.getheader("Content-Type", ""))
-        encoding = params.get("charset", "utf-8")
+        if encoding is None:
+            _, params = cgi.parse_header(self._response.getheader("Content-Type", ""))
+            encoding = params.get("charset", "utf-8")
         return self.body.decode(encoding)


From d56a9ee76261edf692d71e21d59ac56a9c2e9216 Mon Sep 17 00:00:00 2001
From: "Moritz E.
Beber" Date: Mon, 27 Sep 2021 21:59:34 +0200 Subject: [PATCH 005/106] refactor: simplify makedirs --- bin/sra_ids_to_runinfo.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 714a0196..75b944ac 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -134,11 +134,7 @@ def validate_csv_param(param, valid_vals, param_desc): def make_dir(path): if not len(path) == 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise + os.makedirs(path, exist_ok=True) def fetch_url(url): try: From 608908d50af70859cb8a84b6787f35eaf7859c72 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Oct 2021 15:19:38 +0200 Subject: [PATCH 006/106] refactor: check valid prefix in set --- bin/sra_ids_to_runinfo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 75b944ac..c46abaf9 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -22,7 +22,8 @@ DDBJ_IDS = ('PRJDB4176', 'SAMD00114846', 'DRA008156', 'DRP004793', 'DRR171822', 'DRS090921', 'DRX162434') GEO_IDS = ('GSE18729', 'GSM465244') ID_REGEX = re.compile(r'[A-Z]+') -PREFIX_LIST = sorted({ID_REGEX.match(x).group() for x in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) +PREFIX_LIST = sorted({ID_REGEX.match(id).group() for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) +VALID_PREFIX = frozenset(PREFIX_LIST) ## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields @@ -215,7 +216,7 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS match = ID_REGEX.match(db_id) if match: prefix = match.group() - if prefix in PREFIX_LIST: + if prefix in VALID_PREFIX: if db_id not in seen_ids: ids = [db_id] From 93a2dbcc8ff9e2b940a00cb99e485fcb327410d9 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Oct 2021 22:10:04 +0200 Subject: [PATCH 007/106] refactor: handle missing content Refactor **all** the code :/ --- bin/sra_ids_to_runinfo.py | 297 +++++++++++++++++++++++++------------- 1 file changed, 195 insertions(+), 102 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index c46abaf9..eb4f2ac6 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -3,7 +3,6 @@ import argparse import cgi import csv -import errno import gzip import logging import os @@ -21,9 +20,8 @@ ENA_IDS = ('PRJEB7743', 'SAMEA3121481', 'ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702') DDBJ_IDS = ('PRJDB4176', 'SAMD00114846', 'DRA008156', 'DRP004793', 'DRR171822', 'DRS090921', 'DRX162434') GEO_IDS = ('GSE18729', 'GSM465244') -ID_REGEX = re.compile(r'[A-Z]+') -PREFIX_LIST = sorted({ID_REGEX.match(id).group() for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) -VALID_PREFIX = frozenset(PREFIX_LIST) +ID_REGEX = re.compile(r'^([A-Z]+)([0-9]+)$') +PREFIX_LIST = sorted({ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) ## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields @@ -111,6 +109,178 @@ def text(self, encoding=None): return self.body.decode(encoding) +class DatabaseIdentifierChecker: + """Define a service class for validating database identifiers.""" + + _VALID_PREFIX = frozenset(PREFIX_LIST) + + @classmethod + def is_valid(cls, identifier): + """ + Check the validity of the given database identifier. 
+ + Args: + identifier (str): A short identifier presumably belonging to one of the + supported databases. + + Returns: + bool: Whether or not the identifier is valid. + + """ + match = ID_REGEX.match(identifier) + if match is None: + return False + return match.group(1) in cls._VALID_PREFIX + + +class DatabaseResolver: + """Define a service class for resolving various identifiers to experiments.""" + + _GEO_PREFIXES = {'GSE'} + _SRA_PREFIXES = {'GSM', 'PRJNA', 'SAMN', 'SRR', 'DRA', 'DRP', 'DRR', 'DRS', 'DRX', 'PRJDB', 'SAMD'} + _ENA_PREFIXES = {'ERR'} + + @classmethod + def expand_identifier(cls, identifier): + """ + Expand the given identifier to potentially multiple experiment identifiers. + + Args: + identifier (str): A short identifier presumably belonging to one of the + supported databases. + + Returns: + list: A list of one or more SRA/ENA experiment identifiers. + + """ + prefix = ID_REGEX.match(identifier).group(1) + if prefix in cls._GEO_PREFIXES: + return cls._gse_to_srx(identifier) + elif prefix in cls._SRA_PREFIXES: + return cls._id_to_srx(identifier) + elif prefix in cls._ENA_PREFIXES: + return cls._id_to_erx(identifier) + else: + return [identifier] + + @classmethod + def _content_check(cls, response, identifier): + """Check that the response has content or terminate.""" + if response.status == 204: + logger.error(f"There is no content for id {identifier}. Maybe you lack the right permissions?") + sys.exit(1) + + @classmethod + def _id_to_srx(cls, identifier): + """Resolve the identifier to SRA experiments.""" + params = { + "save": "efetch", + "db": "sra", + "rettype": "runinfo", + "term": identifier + } + response = fetch_url( + f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}' + ) + cls._content_check(response, identifier) + return [ + row['Experiment'] for row in open_table(response, delimiter=',') + ] + + @classmethod + def _gse_to_srx(cls, identifier): + """Resolve the identifier to SRA experiments.""" + ids = [] + params = { + "acc": identifier, + "targ": "gsm", + "view": "data", + "form": "text" + } + response = fetch_url(f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}') + cls._content_check(response, identifier) + gsm_ids = [line.split('=')[1].strip() for line in response.text().splitlines() if line.startswith('GSM')] + for gsm_id in gsm_ids: + ids += cls._id_to_srx(gsm_id) + return ids + + @classmethod + def _id_to_erx(cls, identifier): + """Resolve the identifier to ENA experiments.""" + fields = ['run_accession', 'experiment_accession'] + params = { + "accession": identifier, + "result": "read_run", + "fields": ",".join(fields) + } + response = fetch_url(f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}') + cls._content_check(response, identifier) + return [ + row['experiment_accession'] for row in open_table(response, delimiter='\t') + ] + + +class ENAMetadataFetcher: + """Define a service class for fetching metadata from ENA.""" + + def __init__(self, ena_metadata_fields, **kwargs): + """ + Initialize the service with the desired metadata fields. + + Args: + ena_metadata_fields (iterable): An iterable of the desired fields. + **kwargs: Passed to parent constructor. + """ + super().__init__(**kwargs) + self._params = { + "result": "read_run", + "fields": ','.join(ena_metadata_fields) + } + + def open_experiment_table(self, accession): + """ + Open the metadata table belonging to the given experiment accession. + + Args: + accession (str): An ENA experiment accession. 
+ + Returns: + csv.DictReader: A CSV reader instance of the metadata. + + """ + params = { + **self._params, + "accession": accession + } + response = fetch_url( + f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' + ) + self._content_check(response, accession) + return open_table(response, delimiter='\t') + + @classmethod + def _content_check(cls, response, identifier): + """Check that the response has content or terminate.""" + if response.status == 204: + logger.error(f"There is no content for id {identifier}. Maybe you lack the right permissions?") + sys.exit(1) + + +def open_table(response, delimiter=","): + """ + Return a CSV reader instance from the given response. + + Args: + response (Response): An instance of the local HTTP response class. + delimiter (str): The delimiter separating the table fields. + + Returns: + csv.DictReader: A CSV reader instance of the response body. + + """ + return csv.DictReader(response.text().splitlines(), delimiter=delimiter) + + def parse_args(args=None): Description = 'Download and create a run information metadata file from SRA / ENA / DDBJ / GEO identifiers.' Epilog = 'Example usage: python fetch_sra_runinfo.py ' @@ -138,9 +308,10 @@ def make_dir(path): os.makedirs(path, exist_ok=True) def fetch_url(url): + """Return a response object for the given URL and handle errors appropriately.""" try: with urlopen(url) as response: - result = Response(response=response).text().splitlines() + return Response(response=response) except HTTPError as e: logger.error("The server couldn't fulfill the request.") logger.error(f"Status: {e.code} {e.reason}") @@ -149,45 +320,6 @@ def fetch_url(url): logger.error('We failed to reach a server.') logger.error(f"Reason: {e.reason}") sys.exit(1) - return result - -def id_to_srx(db_id): - params = { - "save": "efetch", - "db": "sra", - "rettype": "runinfo", - "term": db_id - } - url = f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}' - return [ - row['Experiment'] for row in csv.DictReader(fetch_url(url), delimiter=',') - ] - -def id_to_erx(db_id): - fields = ['run_accession', 'experiment_accession'] - params = { - "accession": db_id, - "result": "read_run", - "fields": ",".join(fields) - } - url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' - return [ - row['experiment_accession'] for row in csv.DictReader(fetch_url(url), delimiter='\t') - ] - -def gse_to_srx(db_id): - ids = [] - params = { - "acc": db_id, - "targ": "gsm", - "view": "data", - "form": "text" - } - url = f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}' - gsm_ids = [x.split('=')[1].strip() for x in fetch_url(url) if x.startswith('GSM')] - for gsm_id in gsm_ids: - ids += id_to_srx(gsm_id) - return ids def get_ena_fields(): params = { @@ -195,77 +327,38 @@ def get_ena_fields(): "format": "tsv", "result": "read_run" } - url = f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}' return [ - row['columnId'] for row in csv.DictReader(fetch_url(url), delimiter='\t') + row['columnId'] for row in open_table(fetch_url(f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}'), delimiter='\t') ] + def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS): - total_out = 0 seen_ids = set() run_ids = set() - header = [] make_dir(os.path.dirname(file_out)) - params = { - "result": "read_run", - "fields": ','.join(ena_metadata_fields) - } + ena_fetcher = ENAMetadataFetcher(ena_metadata_fields) with open(file_in,"r") as fin, 
open(file_out,"w") as fout: + writer = csv.DictWriter(fout, fieldnames=ena_metadata_fields, delimiter='\t') + writer.writeheader() for line in fin: db_id = line.strip() - match = ID_REGEX.match(db_id) - if match: - prefix = match.group() - if prefix in VALID_PREFIX: - if db_id not in seen_ids: - - ids = [db_id] - ## Resolve/expand these ids against GEO URL - if prefix in ['GSE']: - ids = gse_to_srx(db_id) - - ## Resolve/expand these ids against SRA URL - elif prefix in ['GSM', 'PRJNA', 'SAMN', 'SRR', 'DRA', 'DRP', 'DRR', 'DRS', 'DRX', 'PRJDB', 'SAMD']: - ids = id_to_srx(db_id) - - ## Resolve/expand these ids against ENA URL - elif prefix in ['ERR']: - ids = id_to_erx(db_id) - - ## Resolve/expand to get run identifier from ENA and write to file - for id in ids: - params["accession"] = id - url = f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' - for row in csv.DictReader(fetch_url(url), delimiter='\t'): - run_id = row['run_accession'] - if run_id not in run_ids: - if total_out == 0: - header = row.keys() - header_line = '\t'.join(header) - fout.write(f"{header_line}\n") - else: - if header != row.keys(): - logger.error(f"Metadata columns do not match for id {run_id}!\nLine: '{line.strip()}'") - sys.exit(1) - - ordered_row = '\t'.join([row[x] for x in header]) - fout.write(f'{ordered_row}\n') - total_out += 1 - run_ids.add(run_id) - seen_ids.add(db_id) - - if not ids: - logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") - sys.exit(1) - - else: - id_str = ', '.join([x + "*" for x in PREFIX_LIST]) - logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'") - sys.exit(1) - else: + if db_id in seen_ids: + continue + seen_ids.add(db_id) + if not DatabaseIdentifierChecker.is_valid(db_id): id_str = ', '.join([x + "*" for x in PREFIX_LIST]) logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'") sys.exit(1) + ids = DatabaseResolver.expand_identifier(db_id) + if not ids: + logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") + sys.exit(1) + for accession in ids: + for row in ena_fetcher.open_experiment_table(accession): + run_accession = row['run_accession'] + if run_accession not in run_ids: + writer.writerow(row) + run_ids.add(run_accession) def main(args=None): args = parse_args(args) From dba08dcb3fee792a79ed049a939df4d08a7268d6 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 2 Oct 2021 22:18:48 +0200 Subject: [PATCH 008/106] style: apply black --- bin/sra_ids_to_runinfo.py | 259 ++++++++++++++++++++++++++------------ 1 file changed, 180 insertions(+), 79 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index eb4f2ac6..5b1b1866 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -1,5 +1,6 @@ #!/usr/bin/env python + import argparse import cgi import csv @@ -9,33 +10,88 @@ import re import sys import zlib -from urllib.error import URLError, HTTPError +from urllib.error import HTTPError, URLError from urllib.parse import urlencode from urllib.request import urlopen + logger = logging.getLogger() -## Example ids supported by this script -SRA_IDS = ('PRJNA63463', 'SAMN00765663', 'SRA023522', 'SRP003255', 'SRR390278', 'SRS282569', 'SRX111814') -ENA_IDS = ('PRJEB7743', 'SAMEA3121481', 'ERA2421642', 'ERP120836', 'ERR674736', 'ERS4399631', 'ERX629702') -DDBJ_IDS = ('PRJDB4176', 'SAMD00114846', 'DRA008156', 'DRP004793', 'DRR171822', 'DRS090921', 'DRX162434') -GEO_IDS = ('GSE18729', 'GSM465244') -ID_REGEX = re.compile(r'^([A-Z]+)([0-9]+)$') -PREFIX_LIST = sorted({ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) + +# Example ids supported by this script +SRA_IDS = ( + "PRJNA63463", + "SAMN00765663", + "SRA023522", + "SRP003255", + "SRR390278", + "SRS282569", + "SRX111814", +) +ENA_IDS = ( + "PRJEB7743", + "SAMEA3121481", + "ERA2421642", + "ERP120836", + "ERR674736", + "ERS4399631", + "ERX629702", +) +DDBJ_IDS = ( + "PRJDB4176", + "SAMD00114846", + "DRA008156", + "DRP004793", + "DRR171822", + "DRS090921", + "DRX162434", +) +GEO_IDS = ("GSE18729", "GSM465244") +ID_REGEX = re.compile(r"^([A-Z]+)([0-9]+)$") +PREFIX_LIST = sorted( + {ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS} +) -## List of meta fields fetched from the ENA API - can be overriden by --ena_metadata_fields -## Full list of accepted fields can be obtained here: https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run +# List of metadata fields fetched from the ENA API - can be overriden by options +# `-ef` or `--ena_metadata_fields`. 
+# Full list of accepted fields can be obtained here: +# https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run ENA_METADATA_FIELDS = ( - 'accession', 'run_accession', 'experiment_accession', 'sample_accession', 'secondary_sample_accession', 'study_accession', 'secondary_study_accession', 'parent_study', 'submission_accession', - 'run_alias', 'experiment_alias', 'sample_alias', 'study_alias', - 'library_layout', 'library_selection', 'library_source', 'library_strategy', 'library_name', - 'instrument_model', 'instrument_platform', - 'base_count', 'read_count', - 'tax_id', 'scientific_name', - 'sample_title', 'experiment_title', 'study_title', - 'description', 'sample_description', - 'fastq_md5', 'fastq_bytes', 'fastq_ftp', 'fastq_galaxy', 'fastq_aspera' + "accession", + "run_accession", + "experiment_accession", + "sample_accession", + "secondary_sample_accession", + "study_accession", + "secondary_study_accession", + "parent_study", + "submission_accession", + "run_alias", + "experiment_alias", + "sample_alias", + "study_alias", + "library_layout", + "library_selection", + "library_source", + "library_strategy", + "library_name", + "instrument_model", + "instrument_platform", + "base_count", + "read_count", + "tax_id", + "scientific_name", + "sample_title", + "experiment_title", + "study_title", + "description", + "sample_description", + "fastq_md5", + "fastq_bytes", + "fastq_ftp", + "fastq_galaxy", + "fastq_aspera", ) @@ -112,7 +168,7 @@ def text(self, encoding=None): class DatabaseIdentifierChecker: """Define a service class for validating database identifiers.""" - _VALID_PREFIX = frozenset(PREFIX_LIST) + _VALID_PREFIXES = frozenset(PREFIX_LIST) @classmethod def is_valid(cls, identifier): @@ -130,15 +186,27 @@ def is_valid(cls, identifier): match = ID_REGEX.match(identifier) if match is None: return False - return match.group(1) in cls._VALID_PREFIX + return match.group(1) in cls._VALID_PREFIXES class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = {'GSE'} - _SRA_PREFIXES = {'GSM', 'PRJNA', 'SAMN', 'SRR', 'DRA', 'DRP', 'DRR', 'DRS', 'DRX', 'PRJDB', 'SAMD'} - _ENA_PREFIXES = {'ERR'} + _GEO_PREFIXES = {"GSE"} + _SRA_PREFIXES = { + "GSM", + "PRJNA", + "SAMN", + "SRR", + "DRA", + "DRP", + "DRR", + "DRS", + "DRX", + "PRJDB", + "SAMD", + } + _ENA_PREFIXES = {"ERR"} @classmethod def expand_identifier(cls, identifier): @@ -167,7 +235,10 @@ def expand_identifier(cls, identifier): def _content_check(cls, response, identifier): """Check that the response has content or terminate.""" if response.status == 204: - logger.error(f"There is no content for id {identifier}. Maybe you lack the right permissions?") + logger.error( + f"There is no content for id {identifier}. Maybe you lack the right " + f"permissions?" 
+ ) sys.exit(1) @classmethod @@ -177,29 +248,28 @@ def _id_to_srx(cls, identifier): "save": "efetch", "db": "sra", "rettype": "runinfo", - "term": identifier + "term": identifier, } response = fetch_url( - f'https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}' + f"https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}" ) cls._content_check(response, identifier) - return [ - row['Experiment'] for row in open_table(response, delimiter=',') - ] + return [row["Experiment"] for row in open_table(response, delimiter=",")] @classmethod def _gse_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" ids = [] - params = { - "acc": identifier, - "targ": "gsm", - "view": "data", - "form": "text" - } - response = fetch_url(f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}') + params = {"acc": identifier, "targ": "gsm", "view": "data", "form": "text"} + response = fetch_url( + f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}" + ) cls._content_check(response, identifier) - gsm_ids = [line.split('=')[1].strip() for line in response.text().splitlines() if line.startswith('GSM')] + gsm_ids = [ + line.split("=")[1].strip() + for line in response.text().splitlines() + if line.startswith("GSM") + ] for gsm_id in gsm_ids: ids += cls._id_to_srx(gsm_id) return ids @@ -207,16 +277,18 @@ def _gse_to_srx(cls, identifier): @classmethod def _id_to_erx(cls, identifier): """Resolve the identifier to ENA experiments.""" - fields = ['run_accession', 'experiment_accession'] + fields = ["run_accession", "experiment_accession"] params = { "accession": identifier, "result": "read_run", - "fields": ",".join(fields) + "fields": ",".join(fields), } - response = fetch_url(f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}') + response = fetch_url( + f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}" + ) cls._content_check(response, identifier) return [ - row['experiment_accession'] for row in open_table(response, delimiter='\t') + row["experiment_accession"] for row in open_table(response, delimiter="\t") ] @@ -232,10 +304,7 @@ def __init__(self, ena_metadata_fields, **kwargs): **kwargs: Passed to parent constructor. """ super().__init__(**kwargs) - self._params = { - "result": "read_run", - "fields": ','.join(ena_metadata_fields) - } + self._params = {"result": "read_run", "fields": ",".join(ena_metadata_fields)} def open_experiment_table(self, accession): """ @@ -248,21 +317,21 @@ def open_experiment_table(self, accession): csv.DictReader: A CSV reader instance of the metadata. """ - params = { - **self._params, - "accession": accession - } + params = {**self._params, "accession": accession} response = fetch_url( - f'https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}' + f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}" ) self._content_check(response, accession) - return open_table(response, delimiter='\t') + return open_table(response, delimiter="\t") @classmethod def _content_check(cls, response, identifier): """Check that the response has content or terminate.""" if response.status == 204: - logger.error(f"There is no content for id {identifier}. Maybe you lack the right permissions?") + logger.error( + f"There is no content for id {identifier}. Maybe you lack the right " + f"permissions?" 
+ ) sys.exit(1) @@ -282,31 +351,49 @@ def open_table(response, delimiter=","): def parse_args(args=None): - Description = 'Download and create a run information metadata file from SRA / ENA / DDBJ / GEO identifiers.' - Epilog = 'Example usage: python fetch_sra_runinfo.py ' - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument('FILE_IN', help="File containing database identifiers, one per line.") - parser.add_argument('FILE_OUT', help="Output file in tab-delimited format.") - parser.add_argument('-ef', '--ena_metadata_fields', type=str, dest="ENA_METADATA_FIELDS", default='', help=f"Comma-separated list of ENA metadata fields to fetch. (default: {','.join(ENA_METADATA_FIELDS)}).") + parser = argparse.ArgumentParser( + description="Download and create a run information metadata file from SRA / " + "ENA / DDBJ / GEO identifiers.", + epilog="Example usage: python fetch_sra_runinfo.py ", + ) + parser.add_argument( + "FILE_IN", help="File containing database identifiers, one per line." + ) + parser.add_argument("FILE_OUT", help="Output file in tab-delimited format.") + parser.add_argument( + "-ef", + "--ena_metadata_fields", + type=str, + dest="ENA_METADATA_FIELDS", + default="", + help=f"Comma-separated list of ENA metadata fields to fetch. " + f"(default: {','.join(ENA_METADATA_FIELDS)}).", + ) return parser.parse_args(args) + def validate_csv_param(param, valid_vals, param_desc): valid_list = [] if param: - user_vals = param.split(',') + user_vals = param.split(",") intersect = [i for i in user_vals if i in valid_vals] if len(intersect) == len(user_vals): valid_list = intersect else: - logger.error(f"Please provide a valid value for {param_desc}!\nProvided values = {param}\nAccepted values = {','.join(valid_vals)}") + logger.error( + f"Please provide a valid value for {param_desc}!\n" + f"Provided values = {param}\n" + f"Accepted values = {','.join(valid_vals)}" + ) sys.exit(1) return valid_list + def make_dir(path): if not len(path) == 0: os.makedirs(path, exist_ok=True) + def fetch_url(url): """Return a response object for the given URL and handle errors appropriately.""" try: @@ -317,18 +404,21 @@ def fetch_url(url): logger.error(f"Status: {e.code} {e.reason}") sys.exit(1) except URLError as e: - logger.error('We failed to reach a server.') + logger.error("We failed to reach a server.") logger.error(f"Reason: {e.reason}") sys.exit(1) + def get_ena_fields(): - params = { - "dataPortal": "ena", - "format": "tsv", - "result": "read_run" - } + params = {"dataPortal": "ena", "format": "tsv", "result": "read_run"} return [ - row['columnId'] for row in open_table(fetch_url(f'https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}'), delimiter='\t') + row["columnId"] + for row in open_table( + fetch_url( + f"https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}" + ), + delimiter="\t", + ) ] @@ -337,8 +427,8 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS run_ids = set() make_dir(os.path.dirname(file_out)) ena_fetcher = ENAMetadataFetcher(ena_metadata_fields) - with open(file_in,"r") as fin, open(file_out,"w") as fout: - writer = csv.DictWriter(fout, fieldnames=ena_metadata_fields, delimiter='\t') + with open(file_in, "r") as fin, open(file_out, "w") as fout: + writer = csv.DictWriter(fout, fieldnames=ena_metadata_fields, delimiter="\t") writer.writeheader() for line in fin: db_id = line.strip() @@ -346,28 +436,39 @@ def fetch_sra_runinfo(file_in, file_out, 
ena_metadata_fields=ENA_METADATA_FIELDS continue seen_ids.add(db_id) if not DatabaseIdentifierChecker.is_valid(db_id): - id_str = ', '.join([x + "*" for x in PREFIX_LIST]) - logger.error(f"Please provide a valid database id starting with {id_str}!\nLine: '{line.strip()}'") + id_str = ", ".join([x + "*" for x in PREFIX_LIST]) + logger.error( + f"Please provide a valid database id starting with {id_str}!\n" + f"Line: '{line.strip()}'" + ) sys.exit(1) ids = DatabaseResolver.expand_identifier(db_id) if not ids: - logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") + logger.error( + f"No matches found for database id {db_id}!\nLine: '{line.strip()}'" + ) sys.exit(1) for accession in ids: for row in ena_fetcher.open_experiment_table(accession): - run_accession = row['run_accession'] + run_accession = row["run_accession"] if run_accession not in run_ids: writer.writerow(row) run_ids.add(run_accession) + def main(args=None): args = parse_args(args) ena_metadata_fields = args.ENA_METADATA_FIELDS if not args.ENA_METADATA_FIELDS: - ena_metadata_fields = ','.join(ENA_METADATA_FIELDS) - ena_metadata_fields = validate_csv_param(ena_metadata_fields, valid_vals=get_ena_fields(), param_desc='--ena_metadata_fields') + ena_metadata_fields = ",".join(ENA_METADATA_FIELDS) + ena_metadata_fields = validate_csv_param( + ena_metadata_fields, + valid_vals=get_ena_fields(), + param_desc="--ena_metadata_fields", + ) fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields) -if __name__ == '__main__': - logging.basicConfig(level='INFO', format='[%(levelname)s] %(message)s') + +if __name__ == "__main__": + logging.basicConfig(level="INFO", format="[%(levelname)s] %(message)s") sys.exit(main()) From f3b87a78b6a1e5bcb791f0a4a90d840bef7809dc Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Oct 2021 22:42:36 +0200 Subject: [PATCH 009/106] refactor: improve arguments parsing --- bin/sra_ids_to_runinfo.py | 71 +++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 5b1b1866..d42081c5 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -10,6 +10,7 @@ import re import sys import zlib +from pathlib import Path from urllib.error import HTTPError, URLError from urllib.parse import urlencode from urllib.request import urlopen @@ -357,36 +358,48 @@ def parse_args(args=None): epilog="Example usage: python fetch_sra_runinfo.py ", ) parser.add_argument( - "FILE_IN", help="File containing database identifiers, one per line." + "file_in", + metavar="FILE_IN", + type=Path, + help="File containing database identifiers, one per line.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Output file in tab-delimited format.", ) - parser.add_argument("FILE_OUT", help="Output file in tab-delimited format.") parser.add_argument( "-ef", "--ena_metadata_fields", type=str, - dest="ENA_METADATA_FIELDS", - default="", - help=f"Comma-separated list of ENA metadata fields to fetch. 
" + default=",".join(ENA_METADATA_FIELDS), + help=f"Comma-separated list of ENA metadata fields to fetch " f"(default: {','.join(ENA_METADATA_FIELDS)}).", ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) return parser.parse_args(args) -def validate_csv_param(param, valid_vals, param_desc): - valid_list = [] - if param: - user_vals = param.split(",") - intersect = [i for i in user_vals if i in valid_vals] - if len(intersect) == len(user_vals): - valid_list = intersect - else: - logger.error( - f"Please provide a valid value for {param_desc}!\n" - f"Provided values = {param}\n" - f"Accepted values = {','.join(valid_vals)}" - ) - sys.exit(1) - return valid_list +def validate_fields_parameter(param, valid_vals, param_desc): + if not param: + return [] + user_vals = param.split(",") + if len(set(user_vals) & set(valid_vals)) == len(user_vals): + return user_vals + else: + logger.error( + f"Please provide a valid value for {param_desc}!\n" + f"Provided values = {param}\n" + f"Accepted values = {','.join(valid_vals)}" + ) + sys.exit(1) def make_dir(path): @@ -422,10 +435,9 @@ def get_ena_fields(): ] -def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS): +def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields): seen_ids = set() run_ids = set() - make_dir(os.path.dirname(file_out)) ena_fetcher = ENAMetadataFetcher(ena_metadata_fields) with open(file_in, "r") as fin, open(file_out, "w") as fout: writer = csv.DictWriter(fout, fieldnames=ena_metadata_fields, delimiter="\t") @@ -458,17 +470,18 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields=ENA_METADATA_FIELDS def main(args=None): args = parse_args(args) - ena_metadata_fields = args.ENA_METADATA_FIELDS - if not args.ENA_METADATA_FIELDS: - ena_metadata_fields = ",".join(ENA_METADATA_FIELDS) - ena_metadata_fields = validate_csv_param( - ena_metadata_fields, + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(1) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + ena_metadata_fields = validate_fields_parameter( + args.ena_metadata_fields, valid_vals=get_ena_fields(), param_desc="--ena_metadata_fields", ) - fetch_sra_runinfo(args.FILE_IN, args.FILE_OUT, ena_metadata_fields) + fetch_sra_runinfo(args.file_in, args.file_out, ena_metadata_fields) if __name__ == "__main__": - logging.basicConfig(level="INFO", format="[%(levelname)s] %(message)s") sys.exit(main()) From d5560d391512e903c27821e8282378e953fc37e7 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 2 Oct 2021 22:49:06 +0200 Subject: [PATCH 010/106] docs: make changelog entry --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94968c8c..0cc2d511 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes +* Handle identifiers that do **not** return metadata, for example, due to being private. + * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). See examples below: | `DDBJ` | From 5bba41093791ffd716794677a6aeed56d162f1ff Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 2 Oct 2021 22:56:04 +0200 Subject: [PATCH 011/106] refactor: remove unused function --- bin/sra_ids_to_runinfo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index d42081c5..0f81c125 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -402,11 +402,6 @@ def validate_fields_parameter(param, valid_vals, param_desc): sys.exit(1) -def make_dir(path): - if not len(path) == 0: - os.makedirs(path, exist_ok=True) - - def fetch_url(url): """Return a response object for the given URL and handle errors appropriately.""" try: From 2ba307b760f94eff442064caad911ab13a937bf2 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 12:45:48 -0700 Subject: [PATCH 012/106] Upload Synapse Modules --- modules/local/synapse_get.nf | 36 +++++++++++++++++++ modules/local/synapse_list.nf | 36 +++++++++++++++++++ modules/local/synapse_merge_samplesheet.nf | 41 ++++++++++++++++++++++ modules/local/synapse_metadata_mapping.nf | 36 +++++++++++++++++++ modules/local/synapse_show.nf | 36 +++++++++++++++++++ modules/local/synapse_to_samplesheet.nf | 39 ++++++++++++++++++++ 6 files changed, 224 insertions(+) create mode 100644 modules/local/synapse_get.nf create mode 100644 modules/local/synapse_list.nf create mode 100644 modules/local/synapse_merge_samplesheet.nf create mode 100644 modules/local/synapse_metadata_mapping.nf create mode 100644 modules/local/synapse_show.nf create mode 100644 modules/local/synapse_to_samplesheet.nf diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf new file mode 100644 index 00000000..bdea8fd0 --- /dev/null +++ b/modules/local/synapse_get.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSE_GET { + tag '$synid' + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"bioconda::synapseclient=2.2.2" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity + } else { + container "sagebionetworks/synapsepythonclient:v2.4.0" + } + + input: + val synid // synapse ID for individual FastQ files + path synapseconfig // path to synapse.Config file + + output: + path "*.fastq*" , emit: fastq + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + + """ + synapse -c $synapseconfig get $synid + echo \$(synapse --version) > ${software}.version.txt + """ +} \ No newline at end of file diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf new file mode 100644 index 00000000..2d1cb2f8 --- /dev/null +++ b/modules/local/synapse_list.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSE_LIST { + tag '$synid' + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "bioconda::synapseclient=2.2.2" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity + } else { + container "sagebionetworks/synapsepythonclient:v2.4.0" + } + + input: + val synid // synapse ID for individual FastQ files + path synapseconfig // path to synapse.Config file + + output: + path "*.synlist.csv", emit: synlist_csv + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + + """ + synapse -c $synapseconfig list -l $synid | cut -c-11 > ${synid}.synlist.csv + echo \$(synapse --version) > ${software}.version.txt + """ +} diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf new file mode 100644 index 00000000..d31d17ef --- /dev/null +++ b/modules/local/synapse_merge_samplesheet.nf @@ -0,0 +1,41 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSE_MERGE_SAMPLESHEET { + tag 'merge_samplesheet' + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"conda-forge::sed=4.7" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" + } else { + container "biocontainers/biocontainers:v1.2.0_cv1" + } + + input: + path ('samplesheets/*') + path ('metasheet/*') + + output: + path "samplesheet.csv", emit: samplesheet + path "metasheet.csv", emit: metasheet + + script: + """ + head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv + for fileid in `ls ./samplesheets/*`; do + awk 'NR>1' \$fileid >> samplesheet.csv + done + + head -n 1 `ls ./metasheet/* | head -n 1` > metasheet.csv + for fileid in `ls ./metasheet/*`; do + awk 'NR>1' \$fileid >> metasheet.csv + done + """ +} diff --git a/modules/local/synapse_metadata_mapping.nf b/modules/local/synapse_metadata_mapping.nf new file mode 100644 index 00000000..0981a1d9 --- /dev/null +++ b/modules/local/synapse_metadata_mapping.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSEMETADATAMAPPING { + tag "${data[3]}" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + input: + val data + + output: + path("*metasheet.csv"), emit: metasheet + + exec: + meta_map = [ + md5 : "${data[0]}", + fileSize : "${data[1]}", + etag : "${data[2]}", + id : "${data[3]}", + fileName : "${data[4]}", + fileVersion : "${data[5]}" + ] + + // Create Metadata Sheet + metasheet = meta_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + metasheet += meta_map.values().collect{ '"' + it + '"'}.join(",") + + def metasheet_file = task.workDir.resolve("${meta_map.id}.metasheet.csv") + metasheet_file.text = metasheet +} diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf new file mode 100644 index 00000000..50a3405f --- /dev/null +++ b/modules/local/synapse_show.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSE_SHOW { + tag '$synid' + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"bioconda::synapseclient=2.2.2" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity + } else { + container "sagebionetworks/synapsepythonclient:v2.4.0" + } + + input: + val synid // synapse ID for individual FastQ files + path synapseconfig // path to synapse.Config file + + output: + path "*.metadata.txt", emit: metadata + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + + """ + synapse -c $synapseconfig show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt + echo \$(synapse --version) > ${software}.version.txt + """ +} diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf new file mode 100644 index 00000000..1e50a080 --- /dev/null +++ b/modules/local/synapse_to_samplesheet.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SYNAPSE_TO_SAMPLESHEET { + tag '$id' + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + input: + tuple val(id), val(files) + val strandedness + + output: + path("*samplesheet.csv"), emit: samplesheet + + exec: + + // Add fields to the beginning of the map + pipeline_map = [ + sample : "${id}", + fastq_1 : "${params.outdir}/${params.results_dir}/${files[0].getBaseName()}", + fastq_2 : "${params.outdir}/${params.results_dir}/${files[1].getBaseName()}" + ] + // Add Strandedness + pipeline_map << [ strandedness: "${strandedness}" ] + + // Create Samplesheet + samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + + def samplesheet_file2 = task.workDir.resolve("${pipeline_map.sample}.samplesheet.csv") + samplesheet_file2.text = samplesheet + +} From 9bfaf2ffbed4326e7136941ac16e8712904fe09a Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 12:51:43 -0700 Subject: [PATCH 013/106] Upload Synapse Workflow --- workflows/synapse.nf | 154 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 workflows/synapse.nf diff --git a/workflows/synapse.nf b/workflows/synapse.nf new file mode 100644 index 00000000..0bf8109e --- /dev/null +++ b/workflows/synapse.nf @@ -0,0 +1,154 @@ +/* +======================================================================================== + VALIDATE INPUTS +======================================================================================== +*/ + +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) + +// Validate input parameters +WorkflowFetchngs.initialise(params, log) + +// Check mandatory parameters +if (params.input) { + Channel + .from(file(params.input, checkIfExists: true)) + .splitCsv(header:false, sep:'', strip:true) + .map { it[0] } + .unique() + .set { ch_ids } +} else { + exit 1, 'Input file with Synapse IDs not specified!' 
+} + +/* +======================================================================================== + IMPORT LOCAL MODULES/SUBWORKFLOWS +======================================================================================== +*/ + +// Don't overwrite global params.modules, create a copy instead and use that within the main script. +def modules = params.modules.clone() + +include { SYNAPSE_LIST } from '../modules/local/synapse_list' addParams( options: modules['synapse_list'] ) +include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) +include { SYNAPSE_SHOW } from '../modules/local/synapse_show' addParams( options: modules['synapse_show'] ) +include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' addParams( options: modules['synapse_to_samplesheet'] ) +include { SYNAPSE_METADATA_MAPPING } from '../modules/local/synapse_metadata_mapping' addParams( options: modules['synapse_metadata_mapping'] ) +include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' addParams( options: modules['synapse_merge_samplesheet']) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) + +/* +======================================================================================== + RUN MAIN WORKFLOW +======================================================================================== +*/ + +workflow FETCHNGS_SYNAPSE { + + ch_software_versions = Channel.empty() + + Channel + .fromPath(params.synapseconfig) + .set { ch_synapseConfig } + + // MODULE: Get individual FastQ SynapseIDs from Directory SynapseID(s) + SYNAPSE_LIST ( + ch_ids, + ch_synapseConfig + ) + ch_software_versions = ch_software_versions.mix(SYNAPSE_LIST.out.version.first().ifEmpty(null)) + + // CHANNEL: Create channel for FQ SynapseIDs + SYNAPSE_LIST + .out + .synlist_csv + .splitCsv(header:false, strip:true).flatten() + .set { ch_samples } + + // MODULE: Download FastQ Files by SynapseID + SYNAPSE_GET ( + ch_samples, + ch_synapseConfig + ) + ch_software_versions = ch_software_versions.mix(SYNAPSE_GET.out.version.first().ifEmpty(null)) + + // CHANNEL: Create Read Pairs Channel - Creates format [sampleId, [fastq_1, fastq_2]] + SYNAPSE_GET + .out + .fastq + .collect().flatten() + .toSortedList().flatten() + .map { meta -> + def sampleId = meta.name.toString().tokenize('_').get(0) + [sampleId, meta] + } + .groupTuple() + .set{ ch_read_pairs } + + // MODULE: Download FQ Metadata by SynapseID + SYNAPSE_SHOW ( + ch_samples, + ch_synapseConfig + ) + ch_software_versions = ch_software_versions.mix(SYNAPSE_SHOW.out.version.first().ifEmpty(null)) + + // Clean Metadata + SYNAPSE_SHOW + .out + .metadata + .splitCsv(strip:true, sep:"=", skip:1) + .map { it[1] } + .collate( 6 ) + .set { ch_meta } + + // Compile Metadata + SYNAPSE_METADATA_MAPPING ( + ch_meta + ) + + // MODULE: Create Samplesheet + SYNAPSE_TO_SAMPLESHEET ( + ch_read_pairs, + params.strandedness + ) + + // MODULE: Merge Samplesheets + SYNAPSE_MERGE_SAMPLESHEET ( + READ_PAIRS_TO_SAMPLESHEET.out.samplesheet.collect(), + METADATA_TO_METAMAP.out.metasheet.collect() + ) + + // MODULE: Pipeline reporting + ch_software_versions + .map { it -> if (it) [ it.baseName, it ] } + .groupTuple() + .map { it[1][0] } + .flatten() + .collect() + .set { ch_software_versions } + + GET_SOFTWARE_VERSIONS ( + ch_software_versions.map { it }.collect() + ) +} + +/* 
+======================================================================================== + COMPLETION EMAIL AND SUMMARY +======================================================================================== +*/ + +workflow.onComplete { + if (params.email || params.email_on_fail) { + NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) + } + NfcoreTemplate.summary(workflow, params, log) + WorkflowFetchngs.curateSamplesheetWarn(log) +} + +/* +======================================================================================== + THE END +======================================================================================== +*/ From 4c2fc9b3516227216f92dc9765d3af61782afeaf Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 12:54:10 -0700 Subject: [PATCH 014/106] Update Synapse Workflow Labels --- workflows/synapse.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 0bf8109e..3fdd367f 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -44,10 +44,11 @@ include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versi ======================================================================================== */ -workflow FETCHNGS_SYNAPSE { +workflow SYNAPSE { ch_software_versions = Channel.empty() + // CHANNEL: Stage Synapse Config File Channel .fromPath(params.synapseconfig) .set { ch_synapseConfig } @@ -93,7 +94,7 @@ workflow FETCHNGS_SYNAPSE { ) ch_software_versions = ch_software_versions.mix(SYNAPSE_SHOW.out.version.first().ifEmpty(null)) - // Clean Metadata + // CHANNEL: Clean Metadata SYNAPSE_SHOW .out .metadata @@ -102,7 +103,7 @@ workflow FETCHNGS_SYNAPSE { .collate( 6 ) .set { ch_meta } - // Compile Metadata + // MODULE: Compile Metadata SYNAPSE_METADATA_MAPPING ( ch_meta ) @@ -128,6 +129,7 @@ workflow FETCHNGS_SYNAPSE { .collect() .set { ch_software_versions } + // MODULE: Get Software Versions GET_SOFTWARE_VERSIONS ( ch_software_versions.map { it }.collect() ) From 37cad77f4249ed09e0423a0781977fef76d43523 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 14:20:48 -0700 Subject: [PATCH 015/106] Add SynapseID Option to main.nf --- conf/modules.config | 20 ++++++++++++++++++++ main.nf | 18 +++++++++++++++--- nextflow_schema.json | 22 +++++++++++++++++++--- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3ae76b68..f3da4f94 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -42,6 +42,26 @@ params { 'sra_merge_samplesheet' { publish_dir = 'samplesheet' } + 'synapse_list' { + publish_dir = 'synapse' + } + 'synapse_get' { + publish_dir = 'fastq' + } + 'synapse_show' { + publish_dir = 'metadata' + } + 'synapse_metadata_mapping' { + publish_dir = 'metadata' + publish_files = false + } + 'synapse_to_samplesheet' { + publish_dir = 'samplesheet' + publish_files = false + } + 'synapse_merge_samplesheet' { + publish_dir = 'samplesheet' + } 'multiqc_mappings_config' { publish_dir = 'samplesheet' } diff --git a/main.nf b/main.nf index 475c87a8..7d4735f8 100644 --- a/main.nf +++ b/main.nf @@ -25,13 +25,25 @@ WorkflowMain.initialise(workflow, params, log) ======================================================================================== */ -include { FETCHNGS } from './workflows/fetchngs' +if (params.input_type == 'Synapse') { + include { SYNAPSE } from 
'./workflows/synapse' +} else { + include { FETCHNGS } from './workflows/fetchngs' +} // -// WORKFLOW: Run main nf-core/fetchngs analysis pipeline +// WORKFLOW: Run main nf-core/fetchngs analysis pipeline, depending on Identifier Type provided // workflow NFCORE_FETCHNGS { - FETCHNGS () + + // Workflow for SynapseIDs + if (params.input_type == 'Synapse') { + SYNAPSE () + } else { + // Workflow for SRA/ENA/GEO IDs + FETCHNGS () + } + } /* diff --git a/nextflow_schema.json b/nextflow_schema.json index a391f35d..1c0dc196 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,7 +11,8 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "input" + "input", + "synapse_input" ], "properties": { "input": { @@ -23,6 +24,17 @@ "fa_icon": "fas fa-file-excel", "description": "File containing SRA/ENA/GEO identifiers one per line to download their associated metadata and FastQ files." }, + "input_type": { + "type": "string", + "description": "Type of Sample Identifiers Provided (SRA/ENA/GE/Synapse)", + "enum": [ + "SRA", + "ENA", + "GEO", + "Synapse" + ], + "fa_icon": "fas fa-copy" + }, "ena_metadata_fields": { "type": "string", "fa_icon": "fas fa-columns", @@ -111,6 +123,11 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" + }, + "synapse_config": { + "type": "string", + "description": "Path to Synapse configuration file", + "fa_icon": "fas fa-users-cog" } } }, @@ -249,5 +266,4 @@ "$ref": "#/definitions/generic_options" } ] -} - +} \ No newline at end of file From 1436b795f7f0fd846ce069e26a608598fa099064 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 14:49:23 -0700 Subject: [PATCH 016/106] Fix Typos --- modules/local/synapse_metadata_mapping.nf | 2 +- workflows/synapse.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/synapse_metadata_mapping.nf b/modules/local/synapse_metadata_mapping.nf index 0981a1d9..5b799326 100644 --- a/modules/local/synapse_metadata_mapping.nf +++ b/modules/local/synapse_metadata_mapping.nf @@ -4,7 +4,7 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' params.options = [:] options = initOptions(params.options) -process SYNAPSEMETADATAMAPPING { +process SYNAPSE_METADATA_MAPPING { tag "${data[3]}" label 'process_low' publishDir "${params.outdir}", diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 3fdd367f..f5355023 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -116,8 +116,8 @@ workflow SYNAPSE { // MODULE: Merge Samplesheets SYNAPSE_MERGE_SAMPLESHEET ( - READ_PAIRS_TO_SAMPLESHEET.out.samplesheet.collect(), - METADATA_TO_METAMAP.out.metasheet.collect() + SYNAPSE_TO_SAMPLESHEET.out.samplesheet.collect(), + SYNAPSE_METADATA_MAPPING.out.metasheet.collect() ) // MODULE: Pipeline reporting From ca5752edc3f93a81c8bd6d3002ff11982523b76a Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 5 Oct 2021 20:20:17 -0700 Subject: [PATCH 017/106] Remove trailing whitespace --- modules/local/synapse_get.nf | 6 +++--- modules/local/synapse_list.nf | 2 +- modules/local/synapse_metadata_mapping.nf | 2 +- modules/local/synapse_to_samplesheet.nf | 2 +- workflows/synapse.nf | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index bdea8fd0..9b7990a2 100644 --- 
a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -28,9 +28,9 @@ process SYNAPSE_GET { script: def software = getSoftwareName(task.process) - + """ - synapse -c $synapseconfig get $synid + synapse -c $synapseconfig get $synid echo \$(synapse --version) > ${software}.version.txt """ -} \ No newline at end of file +} diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index 2d1cb2f8..62a1c16d 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -30,7 +30,7 @@ process SYNAPSE_LIST { def software = getSoftwareName(task.process) """ - synapse -c $synapseconfig list -l $synid | cut -c-11 > ${synid}.synlist.csv + synapse -c $synapseconfig list -l $synid | cut -c-11 > ${synid}.synlist.csv echo \$(synapse --version) > ${software}.version.txt """ } diff --git a/modules/local/synapse_metadata_mapping.nf b/modules/local/synapse_metadata_mapping.nf index 5b799326..6d3c87ac 100644 --- a/modules/local/synapse_metadata_mapping.nf +++ b/modules/local/synapse_metadata_mapping.nf @@ -11,7 +11,7 @@ process SYNAPSE_METADATA_MAPPING { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - input: + input: val data output: diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 1e50a080..374a07ab 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -28,7 +28,7 @@ process SYNAPSE_TO_SAMPLESHEET { ] // Add Strandedness pipeline_map << [ strandedness: "${strandedness}" ] - + // Create Samplesheet samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") diff --git a/workflows/synapse.nf b/workflows/synapse.nf index f5355023..a00ca5ec 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -73,14 +73,14 @@ workflow SYNAPSE { ch_synapseConfig ) ch_software_versions = ch_software_versions.mix(SYNAPSE_GET.out.version.first().ifEmpty(null)) - + // CHANNEL: Create Read Pairs Channel - Creates format [sampleId, [fastq_1, fastq_2]] SYNAPSE_GET .out .fastq .collect().flatten() .toSortedList().flatten() - .map { meta -> + .map { meta -> def sampleId = meta.name.toString().tokenize('_').get(0) [sampleId, meta] } @@ -107,7 +107,7 @@ workflow SYNAPSE { SYNAPSE_METADATA_MAPPING ( ch_meta ) - + // MODULE: Create Samplesheet SYNAPSE_TO_SAMPLESHEET ( ch_read_pairs, From a7c3c822e47d898cb8a4d113966bada31551c164 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Wed, 6 Oct 2021 11:55:04 -0700 Subject: [PATCH 018/106] Add input_type param to configs --- conf/test.config | 1 + conf/test_full.config | 1 + modules/local/synapse_to_samplesheet.nf | 3 +-- nextflow.config | 2 ++ nextflow_schema.json | 9 +++++---- workflows/synapse.nf | 11 +++++++---- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/conf/test.config b/conf/test.config index ee909409..75a1ce09 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,4 +21,5 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' + input_type = 'SRA' } diff --git a/conf/test_full.config b/conf/test_full.config index a5aea2dc..4fc58a6b 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,4 +16,5 @@ params { // Input data for full size test input = 
'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' + input_type = 'SRA' } diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 374a07ab..e51d46ca 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -13,7 +13,6 @@ process SYNAPSE_TO_SAMPLESHEET { input: tuple val(id), val(files) - val strandedness output: path("*samplesheet.csv"), emit: samplesheet @@ -27,7 +26,7 @@ process SYNAPSE_TO_SAMPLESHEET { fastq_2 : "${params.outdir}/${params.results_dir}/${files[1].getBaseName()}" ] // Add Strandedness - pipeline_map << [ strandedness: "${strandedness}" ] + pipeline_map << [ strandedness: "unstranded" ] // Create Samplesheet samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' diff --git a/nextflow.config b/nextflow.config index 52382140..4bdacfba 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params { // Input options input = null + input_type = 'SRA' nf_core_pipeline = null ena_metadata_fields = null sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' @@ -39,6 +40,7 @@ params { config_profile_contact = null config_profile_url = null config_profile_name = null + synapse_config = null // Max resource options // Defaults only, expecting to be overwritten diff --git a/nextflow_schema.json b/nextflow_schema.json index 1c0dc196..2f0df31f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -11,8 +11,7 @@ "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", "required": [ - "input", - "synapse_input" + "input" ], "properties": { "input": { @@ -33,7 +32,8 @@ "GEO", "Synapse" ], - "fa_icon": "fas fa-copy" + "fa_icon": "fas fa-copy", + "default": "SRA" }, "ena_metadata_fields": { "type": "string", @@ -127,7 +127,8 @@ "synapse_config": { "type": "string", "description": "Path to Synapse configuration file", - "fa_icon": "fas fa-users-cog" + "fa_icon": "fas fa-users-cog", + "hidden": true } } }, diff --git a/workflows/synapse.nf b/workflows/synapse.nf index a00ca5ec..24dcd9dd 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -4,10 +4,14 @@ ======================================================================================== */ +def valid_params = [ + ena_metadata_fields : ['run_accession', 'experiment_accession', 'library_layout', 'fastq_ftp', 'fastq_md5'] +] + def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters -WorkflowFetchngs.initialise(params, log) +WorkflowFetchngs.initialise(params, log, valid_params) // Check mandatory parameters if (params.input) { @@ -50,7 +54,7 @@ workflow SYNAPSE { // CHANNEL: Stage Synapse Config File Channel - .fromPath(params.synapseconfig) + .fromPath(params.synapse_config) .set { ch_synapseConfig } // MODULE: Get individual FastQ SynapseIDs from Directory SynapseID(s) @@ -80,7 +84,7 @@ workflow SYNAPSE { .fastq .collect().flatten() .toSortedList().flatten() - .map { meta -> + .map { meta -> def sampleId = meta.name.toString().tokenize('_').get(0) [sampleId, meta] } @@ -111,7 +115,6 @@ workflow SYNAPSE { // MODULE: Create Samplesheet SYNAPSE_TO_SAMPLESHEET ( ch_read_pairs, - params.strandedness ) // MODULE: Merge Samplesheets From 2a00191fa8b97b113a7d46feb1befaf69aa80c08 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han 
<60151111+daisyhan97@users.noreply.github.com> Date: Wed, 6 Oct 2021 14:56:28 -0700 Subject: [PATCH 019/106] Remove SynapseConfig flag --- modules/local/synapse_get.nf | 11 ++++------- modules/local/synapse_list.nf | 2 +- modules/local/synapse_show.nf | 7 ++----- modules/local/synapse_to_samplesheet.nf | 6 +++--- workflows/synapse.nf | 6 +----- 5 files changed, 11 insertions(+), 21 deletions(-) diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 9b7990a2..0f083141 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -5,8 +5,8 @@ params.options = [:] options = initOptions(params.options) process SYNAPSE_GET { - tag '$synid' - label 'process_high' + tag "$synid" + label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } @@ -20,17 +20,14 @@ process SYNAPSE_GET { input: val synid // synapse ID for individual FastQ files - path synapseconfig // path to synapse.Config file output: - path "*.fastq*" , emit: fastq - path "*.version.txt" , emit: version + path "*" , emit: fastq script: def software = getSoftwareName(task.process) """ - synapse -c $synapseconfig get $synid - echo \$(synapse --version) > ${software}.version.txt + synapse get $synid """ } diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index 62a1c16d..5e94f06d 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -5,7 +5,7 @@ params.options = [:] options = initOptions(params.options) process SYNAPSE_LIST { - tag '$synid' + tag "$synid" label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 50a3405f..bd93e621 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -5,7 +5,7 @@ params.options = [:] options = initOptions(params.options) process SYNAPSE_SHOW { - tag '$synid' + tag "$synid" label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, @@ -20,17 +20,14 @@ process SYNAPSE_SHOW { input: val synid // synapse ID for individual FastQ files - path synapseconfig // path to synapse.Config file output: path "*.metadata.txt", emit: metadata - path "*.version.txt", emit: version script: def software = getSoftwareName(task.process) """ - synapse -c $synapseconfig show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt - echo \$(synapse --version) > ${software}.version.txt + synapse show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt """ } diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index e51d46ca..20bae1c1 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -5,7 +5,7 @@ params.options = [:] options = initOptions(params.options) process SYNAPSE_TO_SAMPLESHEET { - tag '$id' + tag "$id" label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, @@ -22,8 +22,8 @@ process SYNAPSE_TO_SAMPLESHEET { // Add fields to the beginning of the map pipeline_map = [ sample : "${id}", - fastq_1 : "${params.outdir}/${params.results_dir}/${files[0].getBaseName()}", - fastq_2 : "${params.outdir}/${params.results_dir}/${files[1].getBaseName()}" + fastq_1 : "${params.outdir}/fastq/${files[0]}", + fastq_2 : "${params.outdir}/fastq/${files[1]}" ] // Add Strandedness pipeline_map << [ 
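+        // Hard-coding "unstranded" below is an assumption baked into this
+        // patch: the val strandedness input was removed, so auto-generated
+        // samplesheets fall back to a neutral default that downstream
+        // pipelines can override or re-infer.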
strandedness: "unstranded" ] diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 24dcd9dd..9820ce09 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -68,15 +68,13 @@ workflow SYNAPSE { SYNAPSE_LIST .out .synlist_csv - .splitCsv(header:false, strip:true).flatten() + .splitCsv(header:false).flatten() .set { ch_samples } // MODULE: Download FastQ Files by SynapseID SYNAPSE_GET ( ch_samples, - ch_synapseConfig ) - ch_software_versions = ch_software_versions.mix(SYNAPSE_GET.out.version.first().ifEmpty(null)) // CHANNEL: Create Read Pairs Channel - Creates format [sampleId, [fastq_1, fastq_2]] SYNAPSE_GET @@ -94,9 +92,7 @@ workflow SYNAPSE { // MODULE: Download FQ Metadata by SynapseID SYNAPSE_SHOW ( ch_samples, - ch_synapseConfig ) - ch_software_versions = ch_software_versions.mix(SYNAPSE_SHOW.out.version.first().ifEmpty(null)) // CHANNEL: Clean Metadata SYNAPSE_SHOW From c749172925f8cd29602f2674c5e0eaffb89a3e26 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Thu, 7 Oct 2021 14:04:09 -0700 Subject: [PATCH 020/106] Fix SynapseConfig Bug --- modules/local/synapse_get.nf | 3 ++- modules/local/synapse_show.nf | 3 ++- workflows/synapse.nf | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 0f083141..47f39def 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -20,6 +20,7 @@ process SYNAPSE_GET { input: val synid // synapse ID for individual FastQ files + path synapseconfig // path to synapse.Config file output: path "*" , emit: fastq @@ -28,6 +29,6 @@ process SYNAPSE_GET { def software = getSoftwareName(task.process) """ - synapse get $synid + synapse -c $synapseconfig get $synid """ } diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index bd93e621..dd78d1a8 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -20,6 +20,7 @@ process SYNAPSE_SHOW { input: val synid // synapse ID for individual FastQ files + path synapseconfig // path to synapse.Config file output: path "*.metadata.txt", emit: metadata @@ -28,6 +29,6 @@ process SYNAPSE_SHOW { def software = getSoftwareName(task.process) """ - synapse show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt + synapse -c $synapseconfig show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt """ } diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 9820ce09..fd024af8 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -53,9 +53,7 @@ workflow SYNAPSE { ch_software_versions = Channel.empty() // CHANNEL: Stage Synapse Config File - Channel - .fromPath(params.synapse_config) - .set { ch_synapseConfig } + ch_synapseConfig = file( params.synapse_config ) // MODULE: Get individual FastQ SynapseIDs from Directory SynapseID(s) SYNAPSE_LIST ( @@ -74,6 +72,7 @@ workflow SYNAPSE { // MODULE: Download FastQ Files by SynapseID SYNAPSE_GET ( ch_samples, + ch_synapseConfig ) // CHANNEL: Create Read Pairs Channel - Creates format [sampleId, [fastq_1, fastq_2]] @@ -92,6 +91,7 @@ workflow SYNAPSE { // MODULE: Download FQ Metadata by SynapseID SYNAPSE_SHOW ( ch_samples, + ch_synapseConfig ) // CHANNEL: Clean Metadata From 94c8963ae7379fff8c4ce989ee0ef8dbe879db2d Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Thu, 7 Oct 2021 14:40:06 -0700 Subject: [PATCH 021/106] Remove input_type parameter --- lib/WorkflowMain.groovy | 12 
++++++++++++ main.nf | 5 +++-- nextflow.config | 1 - nextflow_schema.json | 12 ------------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index a7fb8c70..59c3e8a4 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -77,4 +77,16 @@ class WorkflowMain { System.exit(1) } } + + // Check input type + public static String getIdentifierType(workflow, params, log) { + def input_type = "" + params.input.eachLine { line -> + if (line.contains("syn")) { + input_type = "Synapse" + } else { + input_type = "SRA" + }} + return input_type + } } diff --git a/main.nf b/main.nf index 7d4735f8..96e38f15 100644 --- a/main.nf +++ b/main.nf @@ -18,6 +18,7 @@ nextflow.enable.dsl = 2 */ WorkflowMain.initialise(workflow, params, log) +input_type = WorkflowMain.getIdentifierType(workflow, params, log) /* ======================================================================================== @@ -25,7 +26,7 @@ WorkflowMain.initialise(workflow, params, log) ======================================================================================== */ -if (params.input_type == 'Synapse') { +if (input_type == 'Synapse') { include { SYNAPSE } from './workflows/synapse' } else { include { FETCHNGS } from './workflows/fetchngs' @@ -37,7 +38,7 @@ if (params.input_type == 'Synapse') { workflow NFCORE_FETCHNGS { // Workflow for SynapseIDs - if (params.input_type == 'Synapse') { + if (input_type == 'Synapse') { SYNAPSE () } else { // Workflow for SRA/ENA/GEO IDs diff --git a/nextflow.config b/nextflow.config index 4bdacfba..5e3c7fd2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,7 +11,6 @@ params { // Input options input = null - input_type = 'SRA' nf_core_pipeline = null ena_metadata_fields = null sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' diff --git a/nextflow_schema.json b/nextflow_schema.json index 2f0df31f..836d00e9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,18 +23,6 @@ "fa_icon": "fas fa-file-excel", "description": "File containing SRA/ENA/GEO identifiers one per line to download their associated metadata and FastQ files." 
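(A note on the getIdentifierType() helper added above: params.input is a path
string, and Groovy's String.eachLine iterates over the lines of the string
itself rather than the contents of the file it points to, so the decision
appears to be made by whether the input file path contains "syn". A hedged
sketch of the effect, with hypothetical paths:

    --input sra_ids_test.txt     -> input_type = 'SRA'
    --input synapse_ids_test.txt -> input_type = 'Synapse'
    --input my_runs/ids.txt      -> input_type = 'SRA'   (even if the file lists syn IDs)

so input files need to be named accordingly.)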
}, - "input_type": { - "type": "string", - "description": "Type of Sample Identifiers Provided (SRA/ENA/GE/Synapse)", - "enum": [ - "SRA", - "ENA", - "GEO", - "Synapse" - ], - "fa_icon": "fas fa-copy", - "default": "SRA" - }, "ena_metadata_fields": { "type": "string", "fa_icon": "fas fa-columns", From a1d055de571c634fb726ccb71be24ec59d9884bd Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Thu, 7 Oct 2021 14:45:57 -0700 Subject: [PATCH 022/106] Remove trailing whitespace --- lib/WorkflowMain.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 59c3e8a4..52cfdaea 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -77,7 +77,7 @@ class WorkflowMain { System.exit(1) } } - + // Check input type public static String getIdentifierType(workflow, params, log) { def input_type = "" From 1aa1a1ecdfc00cfb5b68b64b369bc5e8631d08b8 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Thu, 7 Oct 2021 15:04:03 -0700 Subject: [PATCH 023/106] Remove remaining input_type parameters --- conf/test.config | 1 - conf/test_full.config | 1 - 2 files changed, 2 deletions(-) diff --git a/conf/test.config b/conf/test.config index 75a1ce09..ee909409 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,5 +21,4 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' - input_type = 'SRA' } diff --git a/conf/test_full.config b/conf/test_full.config index 4fc58a6b..a5aea2dc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,5 +16,4 @@ params { // Input data for full size test input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' - input_type = 'SRA' } From 8b636823da5e598ec5b0d27ceabe33d62cb0ebc7 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 8 Oct 2021 15:57:31 +0100 Subject: [PATCH 024/106] Quick fire re-factor of modules, CI and test configs --- .github/workflows/ci.yml | 24 +++- conf/test.config | 2 +- conf/test_synapse.config | 24 ++++ modules.json | 11 ++ modules/local/functions.nf | 54 +++++---- modules/local/get_software_versions.nf | 33 ------ modules/local/multiqc_mappings_config.nf | 11 +- modules/local/sra_fastq_ftp.nf | 13 ++- modules/local/sra_ids_to_runinfo.nf | 10 +- modules/local/sra_merge_samplesheet.nf | 10 +- modules/local/sra_runinfo_to_ftp.nf | 11 +- modules/local/synapse_get.nf | 29 +++-- modules/local/synapse_list.nf | 32 +++--- modules/local/synapse_merge_samplesheet.nf | 10 +- modules/local/synapse_metadata_mapping.nf | 2 +- modules/local/synapse_show.nf | 30 +++-- modules/local/synapse_to_samplesheet.nf | 2 +- .../custom/dumpsoftwareversions/functions.nf | 78 +++++++++++++ .../custom/dumpsoftwareversions/main.nf | 106 ++++++++++++++++++ .../custom/dumpsoftwareversions/meta.yml | 33 ++++++ nextflow.config | 7 +- nextflow_schema.json | 12 +- 22 files changed, 426 insertions(+), 118 deletions(-) create mode 100644 conf/test_synapse.config create mode 100644 modules.json delete mode 100644 modules/local/get_software_versions.nf create mode 100644 modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf create mode 100644 modules/nf-core/modules/custom/dumpsoftwareversions/main.nf create mode 100644 modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
26785ea1..7788b5de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,10 +35,32 @@ jobs: wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - name: Run pipeline with test data + - name: Run pipeline with SRA test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker + test_synapse: + name: Test Synapse workflow + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} + runs-on: ubuntu-latest + env: + NXF_VER: ${{ matrix.nxf_ver }} + NXF_ANSI_LOG: false + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Run pipeline with synapse test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_synapse,docker + parameters: name: Test workflow parameters if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} diff --git a/conf/test.config b/conf/test.config index ee909409..d0d7eb53 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,5 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt' } diff --git a/conf/test_synapse.config b/conf/test_synapse.config new file mode 100644 index 00000000..d68d4098 --- /dev/null +++ b/conf/test_synapse.config @@ -0,0 +1,24 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
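+    The Synapse workflow needs valid credentials at run time. A minimal
+    sketch of the file passed via --synapse_config (section and key names as
+    used by recent versions of the Synapse Python client; the token value is
+    a placeholder):
+
+        [authentication]
+        username  = my-synapse-user
+        authtoken = <personal-access-token>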
+ + Use as follows: + nextflow run nf-core/fetchngs -profile test_synapse, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile using Synapse ids' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 6.h + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/synapse_ids_test.txt' +} diff --git a/modules.json b/modules.json new file mode 100644 index 00000000..ff841d00 --- /dev/null +++ b/modules.json @@ -0,0 +1,11 @@ +{ + "name": "nf-core/fetchngs", + "homePage": "https://github.com/nf-core/fetchngs", + "repos": { + "nf-core/modules": { + "custom/dumpsoftwareversions": { + "git_sha": "84f2302920078b0cf7716b2a2e5fcc0be5c4531d" + } + } + } +} \ No newline at end of file diff --git a/modules/local/functions.nf b/modules/local/functions.nf index da9da093..85628ee0 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -9,6 +9,13 @@ def getSoftwareName(task_process) { return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() } +// +// Extract name of module from process name using $task.process +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + // // Function to initialise default values and to generate a Groovy Map of available options for nf-core modules // @@ -37,32 +44,35 @@ def getPathFromList(path_list) { // Function to save/publish module results // def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] } + path = path instanceof String ? 
path : '' + path_list.add(path) } } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" } } diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf deleted file mode 100644 index 65b7f340..00000000 --- a/modules/local/get_software_versions.nf +++ /dev/null @@ -1,33 +0,0 @@ -// Import generic module functions -include { saveFiles } from './functions' - -params.options = [:] - -process GET_SOFTWARE_VERSIONS { - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } - - conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.9--1" - } else { - container "quay.io/biocontainers/python:3.9--1" - } - - cache false - - input: - path versions - - output: - path "software_versions.tsv" , emit: tsv - path 'software_versions_mqc.yaml', emit: yaml - - script: // This script is bundled with the pipeline, in nf-core/fetchngs/bin/ - """ - echo $workflow.manifest.version > pipeline.version.txt - echo $workflow.nextflow.version > nextflow.version.txt - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 8360cd34..857c859c 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { saveFiles; getSoftwareName } from './functions' +include { saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] @@ -19,8 +19,8 @@ process MULTIQC_MAPPINGS_CONFIG { path csv output: - path "*yml" , emit: yml - path "*.version.txt", emit: version + path "*yml" , emit: yml + path "versions.yml", emit: versions script: """ @@ -28,6 +28,9 @@ process MULTIQC_MAPPINGS_CONFIG { $csv \\ multiqc_config.yml - python --version | sed -e "s/Python //g" > python.version.txt + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + python: \$(python --version | sed 's/Python //g') + END_VERSIONS """ } diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 126c0647..39d3659d 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] options = initOptions(params.options) @@ -25,6 +25,7 @@ process SRA_FASTQ_FTP { output: tuple val(meta), path("*fastq.gz"), emit: fastq tuple val(meta), path("*md5") , emit: md5 + path 
"versions.yml" , emit: versions script: if (meta.single_end) { @@ -33,6 +34,11 @@ process SRA_FASTQ_FTP { echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 md5sum -c ${meta.id}.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS """ } else { """ @@ -45,6 +51,11 @@ process SRA_FASTQ_FTP { echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 md5sum -c ${meta.id}_2.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS """ } } diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index 3d3fc063..8200271e 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { saveFiles; getSoftwareName } from './functions' +include { saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] @@ -22,7 +22,8 @@ process SRA_IDS_TO_RUNINFO { val fields output: - path "*.tsv", emit: tsv + path "*.tsv" , emit: tsv + path "versions.yml", emit: versions script: def metadata_fields = fields ? "--ena_metadata_fields ${fields}" : '' @@ -32,5 +33,10 @@ process SRA_IDS_TO_RUNINFO { id.txt \\ ${id}.runinfo.tsv \\ $metadata_fields + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + python: \$(python --version | sed 's/Python //g') + END_VERSIONS """ } diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index e3c1314a..914bac00 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { saveFiles; getSoftwareName } from './functions' +include { saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] @@ -21,7 +21,8 @@ process SRA_MERGE_SAMPLESHEET { output: path "samplesheet.csv", emit: samplesheet - path "id_mappings.csv" , emit: mappings + path "id_mappings.csv", emit: mappings + path "versions.yml" , emit: versions script: """ @@ -34,5 +35,10 @@ process SRA_MERGE_SAMPLESHEET { for fileid in `ls ./mappings/*`; do awk 'NR>1' \$fileid >> id_mappings.csv done + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS """ } diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index f426f4ab..80028705 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { saveFiles; getSoftwareName } from './functions' +include { saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] @@ -19,8 +19,8 @@ process SRA_RUNINFO_TO_FTP { path runinfo output: - path "*.tsv" , emit: tsv - path "*.version.txt", emit: version + path "*.tsv" , emit: tsv + path "versions.yml", emit: versions script: """ @@ -28,6 +28,9 @@ process SRA_RUNINFO_TO_FTP { ${runinfo.join(',')} \\ ${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv - python --version | sed -e "s/Python //g" > python.version.txt + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + python: \$(python --version | sed 's/Python //g') + END_VERSIONS """ } diff --git a/modules/local/synapse_get.nf 
b/modules/local/synapse_get.nf
index 47f39def..5061e4d8 100644
--- a/modules/local/synapse_get.nf
+++ b/modules/local/synapse_get.nf
@@ -1,34 +1,41 @@
 // Import generic module functions
-include { initOptions; saveFiles; getSoftwareName } from './functions'
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
 
 params.options = [:]
 options        = initOptions(params.options)
 
 process SYNAPSE_GET {
-    tag "$synid"
+    tag "$id"
     label 'process_low'
     publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }
 
-    conda (params.enable_conda ? "bioconda::synapseclient=2.2.2" : null)
+    conda (params.enable_conda ? "bioconda::synapseclient=2.4.0" : null)
     if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
-        container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity
+        container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0"
     } else {
-        container "sagebionetworks/synapsepythonclient:v2.4.0"
+        container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0"
     }
 
     input:
-    val synid
-    path synapseconfig
+    val id
+    path config
 
     output:
-    path "*"           , emit: fastq
+    path "*.fastq.gz"  , emit: fastq
+    path "versions.yml", emit: versions
 
     script:
-    def software = getSoftwareName(task.process)
-
     """
-    synapse -c $synapseconfig get $synid
+    synapse \\
+        -c $config \\
+        get \\
+        $id
+
+    cat <<-END_VERSIONS > versions.yml
+    ${getProcessName(task.process)}:
+        ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) )
+    END_VERSIONS
     """
 }
diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf
index 5e94f06d..de8ee867 100644
--- a/modules/local/synapse_list.nf
+++ b/modules/local/synapse_list.nf
@@ -1,36 +1,42 @@
 // Import generic module functions
-include { initOptions; saveFiles; getSoftwareName } from './functions'
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
 
 params.options = [:]
 options        = initOptions(params.options)
 
 process SYNAPSE_LIST {
-    tag "$synid"
+    tag "$id"
     label 'process_low'
     publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }
 
-    conda (params.enable_conda ? "bioconda::synapseclient=2.2.2" : null)
+    conda (params.enable_conda ?
"bioconda::synapseclient=2.4.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity + container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0" } else { - container "sagebionetworks/synapsepythonclient:v2.4.0" + container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0" } input: - val synid // synapse ID for individual FastQ files - path synapseconfig // path to synapse.Config file + val id + path config output: - path "*.synlist.csv", emit: synlist_csv - path "*.version.txt", emit: version + path "*.csv" , emit: csv + path "versions.yml", emit: versions script: - def software = getSoftwareName(task.process) - """ - synapse -c $synapseconfig list -l $synid | cut -c-11 > ${synid}.synlist.csv - echo \$(synapse --version) > ${software}.version.txt + synapse \\ + -c $config \\ + list \\ + -l $id \\ + | cut -c-11 > ${id}.synlist.csv + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) ) + END_VERSIONS """ } diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index d31d17ef..f634118b 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -1,5 +1,5 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] options = initOptions(params.options) @@ -24,7 +24,8 @@ process SYNAPSE_MERGE_SAMPLESHEET { output: path "samplesheet.csv", emit: samplesheet - path "metasheet.csv", emit: metasheet + path "metasheet.csv" , emit: metasheet + path "versions.yml" , emit: versions script: """ @@ -37,5 +38,10 @@ process SYNAPSE_MERGE_SAMPLESHEET { for fileid in `ls ./metasheet/*`; do awk 'NR>1' \$fileid >> metasheet.csv done + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS """ } diff --git a/modules/local/synapse_metadata_mapping.nf b/modules/local/synapse_metadata_mapping.nf index 6d3c87ac..a513da09 100644 --- a/modules/local/synapse_metadata_mapping.nf +++ b/modules/local/synapse_metadata_mapping.nf @@ -15,7 +15,7 @@ process SYNAPSE_METADATA_MAPPING { val data output: - path("*metasheet.csv"), emit: metasheet + path "*.csv", emit: metasheet exec: meta_map = [ diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index dd78d1a8..6f0f0b1c 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -1,34 +1,42 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] options = initOptions(params.options) process SYNAPSE_SHOW { - tag "$synid" + tag "$id" label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - conda (params.enable_conda ? "bioconda::synapseclient=2.2.2" : null) + conda (params.enable_conda ? 
"bioconda::synapseclient=2.4.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" // TODO: Add Singularity + container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0" } else { - container "sagebionetworks/synapsepythonclient:v2.4.0" + container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0" } input: - val synid // synapse ID for individual FastQ files - path synapseconfig // path to synapse.Config file + val id + path config output: - path "*.metadata.txt", emit: metadata + path "*.txt" , emit: metadata + path "versions.yml", emit: versions script: - def software = getSoftwareName(task.process) - """ - synapse -c $synapseconfig show $synid | sed -n '1,3p;15,16p;20p;23p' > ${synid}.metadata.txt + synapse \\ + -c $config \\ + show \\ + $id \\ + | sed -n '1,3p;15,16p;20p;23p' > ${id}.metadata.txt + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) ) + END_VERSIONS """ } diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 20bae1c1..7b301ddf 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -15,7 +15,7 @@ process SYNAPSE_TO_SAMPLESHEET { tuple val(id), val(files) output: - path("*samplesheet.csv"), emit: samplesheet + path "*.csv", emit: samplesheet exec: diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf new file mode 100644 index 00000000..85628ee0 --- /dev/null +++ b/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf @@ -0,0 +1,78 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Extract name of module from process name using $task.process +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? 
ioptions.publish_by_meta : args.publish_by_meta
+        for (key in key_list) {
+            if (args.meta && key instanceof String) {
+                def path = key
+                if (args.meta.containsKey(key)) {
+                    path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
+                }
+                path = path instanceof String ? path : ''
+                path_list.add(path)
+            }
+        }
+    }
+    if (ioptions.publish_files instanceof Map) {
+        for (ext in ioptions.publish_files) {
+            if (args.filename.endsWith(ext.key)) {
+                def ext_list = path_list.collect()
+                ext_list.add(ext.value)
+                return "${getPathFromList(ext_list)}/$args.filename"
+            }
+        }
+    } else if (ioptions.publish_files == null) {
+        return "${getPathFromList(path_list)}/$args.filename"
+    }
+}
diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf
new file mode 100644
index 00000000..faf2073f
--- /dev/null
+++ b/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf
@@ -0,0 +1,106 @@
+// Import generic module functions
+include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions'
+
+params.options = [:]
+options        = initOptions(params.options)
+
+process CUSTOM_DUMPSOFTWAREVERSIONS {
+    label 'process_low'
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) }
+
+    // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container
+    conda (params.enable_conda ? "bioconda::multiqc=1.11" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0"
+    } else {
+        container "quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0"
+    }
+
+    input:
+    path versions
+
+    output:
+    path "software_versions.yml"    , emit: yml
+    path "software_versions_mqc.yml", emit: mqc_yml
+    path "versions.yml"             , emit: versions
+
+    script:
+    """
+    #!/usr/bin/env python
+
+    import yaml
+    import platform
+    from textwrap import dedent
+
+    def _make_versions_html(versions):
+        html = [
+            dedent(
+                '''\\
+                <style>
+                #nf-core-versions tbody:nth-child(even) {
+                    background-color: #f2f2f2;
+                }
+                </style>
+                <table class="table" style="width:100%" id="nf-core-versions">
+                    <thead>
+                        <tr>
+                            <th> Process Name </th>
+                            <th> Software </th>
+                            <th> Version </th>
+                        </tr>
+                    </thead>
+                '''
+            )
+        ]
+        for process, tmp_versions in sorted(versions.items()):
+            html.append("<tbody>")
+            for i, (tool, version) in enumerate(sorted(tmp_versions.items())):
+                html.append(
+                    dedent(
+                        f'''\\
+                        <tr>
+                            <td><samp>{process if (i == 0) else ''}</samp></td>
+                            <td><samp>{tool}</samp></td>
+                            <td><samp>{version}</samp></td>
+                        </tr>
+                        '''
+                    )
+                )
+            html.append("</tbody>")
+        html.append("</table>
") + return "\\n".join(html) + + module_versions = {} + module_versions["${getProcessName(task.process)}"] = { + 'python': platform.python_version(), + 'yaml': yaml.__version__ + } + + with open("$versions") as f: + workflow_versions = yaml.load(f, Loader=yaml.BaseLoader) | module_versions + + workflow_versions["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version" + } + + versions_mqc = { + 'id': 'software_versions', + 'section_name': '${workflow.manifest.name} Software Versions', + 'section_href': 'https://github.com/${workflow.manifest.name}', + 'plot_type': 'html', + 'description': 'are collected at run time from the software output.', + 'data': _make_versions_html(workflow_versions) + } + + with open("software_versions.yml", 'w') as f: + yaml.dump(workflow_versions, f, default_flow_style=False) + with open("software_versions_mqc.yml", 'w') as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open('versions.yml', 'w') as f: + yaml.dump(module_versions, f, default_flow_style=False) + """ +} diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml new file mode 100644 index 00000000..8d4a6ed4 --- /dev/null +++ b/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml @@ -0,0 +1,33 @@ +name: custom_dumpsoftwareversions +description: Custom module used to dump software versions within the nf-core pipeline template +keywords: + - custom + - version +tools: + - custom: + description: Custom module used to dump software versions within the nf-core pipeline template + homepage: https://github.com/nf-core/tools + documentation: https://github.com/nf-core/tools + +input: + - versions: + type: file + description: YML file containing software versions + pattern: "*.yml" + +output: + - yml: + type: file + description: Standard YML file containing software versions + pattern: "software_versions.yml" + - mqc_yml: + type: file + description: MultiQC custom content YML file containing software versions + pattern: "software_versions_mqc.yml" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index b4e5dde7..45dfb946 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { nf_core_pipeline = null ena_metadata_fields = null sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' + synapse_config = null skip_fastq_download = false // Boilerplate options @@ -39,7 +40,6 @@ params { config_profile_contact = null config_profile_url = null config_profile_name = null - synapse_config = null // Max resource options // Defaults only, expecting to be overwritten @@ -109,8 +109,9 @@ profiles { podman.enabled = false shifter.enabled = false } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_synapse { includeConfig 'conf/test_synapse.config' } + test_full { includeConfig 'conf/test_full.config' } } // Export these variables to prevent local Python/R libraries from conflicting with those in the container diff --git a/nextflow_schema.json b/nextflow_schema.json index db607cc4..b2f26972 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -57,6 +57,12 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this 
parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "synapse_config": { + "type": "string", + "description": "Path to Synapse configuration file", + "fa_icon": "fas fa-users-cog", + "hidden": true } } }, @@ -111,12 +117,6 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" - }, - "synapse_config": { - "type": "string", - "description": "Path to Synapse configuration file", - "fa_icon": "fas fa-users-cog", - "hidden": true } } }, From b9d5e86184f123313c8bd14957b5e277aac93949 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 9 Oct 2021 09:24:59 +0100 Subject: [PATCH 025/106] Fix tests for SRA workflow --- ...flowFetchngs.groovy => WorkflowSra.groovy} | 4 +-- main.nf | 6 ++-- workflows/{fetchngs.nf => sra.nf} | 36 ++++++++++--------- 3 files changed, 25 insertions(+), 21 deletions(-) rename lib/{WorkflowFetchngs.groovy => WorkflowSra.groovy} (91%) rename workflows/{fetchngs.nf => sra.nf} (82%) diff --git a/lib/WorkflowFetchngs.groovy b/lib/WorkflowSra.groovy similarity index 91% rename from lib/WorkflowFetchngs.groovy rename to lib/WorkflowSra.groovy index ccff938b..a2c16219 100755 --- a/lib/WorkflowFetchngs.groovy +++ b/lib/WorkflowSra.groovy @@ -1,8 +1,8 @@ // -// This file holds several functions specific to the workflow/fetchngs.nf in the nf-core/fetchngs pipeline +// This file holds several functions specific to the workflow/sra.nf in the nf-core/fetchngs pipeline // -class WorkflowFetchngs { +class WorkflowSra { // // Check and validate parameters diff --git a/main.nf b/main.nf index 96e38f15..0b76d9da 100644 --- a/main.nf +++ b/main.nf @@ -29,11 +29,11 @@ input_type = WorkflowMain.getIdentifierType(workflow, params, log) if (input_type == 'Synapse') { include { SYNAPSE } from './workflows/synapse' } else { - include { FETCHNGS } from './workflows/fetchngs' + include { SRA } from './workflows/sra' } // -// WORKFLOW: Run main nf-core/fetchngs analysis pipeline, depending on Identifier Type provided +// WORKFLOW: Run main nf-core/fetchngs analysis pipeline, depending on type of dentifier provided // workflow NFCORE_FETCHNGS { @@ -42,7 +42,7 @@ workflow NFCORE_FETCHNGS { SYNAPSE () } else { // Workflow for SRA/ENA/GEO IDs - FETCHNGS () + SRA () } } diff --git a/workflows/fetchngs.nf b/workflows/sra.nf similarity index 82% rename from workflows/fetchngs.nf rename to workflows/sra.nf index 12733f07..25e214df 100644 --- a/workflows/fetchngs.nf +++ b/workflows/sra.nf @@ -11,7 +11,7 @@ def valid_params = [ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters -WorkflowFetchngs.initialise(params, log, valid_params) +WorkflowSra.initialise(params, log, valid_params) // Check mandatory parameters if (params.input) { @@ -40,7 +40,14 @@ include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: 
modules['multiqc_mappings_config'] ) -include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) + +/* +======================================================================================== + IMPORT NF-CORE MODULES/SUBWORKFLOWS +======================================================================================== +*/ + +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' addParams( options: [publish_files : ['_versions.yml':'']] ) /* ======================================================================================== RUN MAIN WORKFLOW ======================================================================================== */ -workflow FETCHNGS { +workflow SRA { - ch_software_versions = Channel.empty() + ch_versions = Channel.empty() // // MODULE: Get SRA run information for public database ids @@ -59,6 +66,7 @@ ch_ids, params.ena_metadata_fields ?: '' ) + ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) // // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ] @@ -66,6 +74,7 @@ SRA_RUNINFO_TO_FTP ( SRA_IDS_TO_RUNINFO.out.tsv ) + ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) SRA_RUNINFO_TO_FTP .out @@ -78,7 +87,7 @@ } .unique() .set { ch_sra_reads } - ch_software_versions = ch_software_versions.mix(SRA_RUNINFO_TO_FTP.out.version.first().ifEmpty(null)) + ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) if (!params.skip_fastq_download) { // @@ -87,6 +96,7 @@ SRA_FASTQ_FTP ( ch_sra_reads.map { meta, reads -> if (meta.fastq_1) [ meta, reads ] } ) + ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) // // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet @@ -104,6 +114,7 @@ SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} ) + ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions) // // MODULE: Create a MultiQC config file with sample name mappings @@ -112,6 +123,7 @@ MULTIQC_MAPPINGS_CONFIG ( SRA_MERGE_SAMPLESHEET.out.mappings ) + ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) } // @@ -127,16 +139,8 @@ // - // MODULE: Pipeline reporting + // MODULE: Dump software versions for all tools used in the workflow // - ch_software_versions - .map { it -> if (it) [ it.baseName, it ] } - .groupTuple() - .map { it[1][0] } - .flatten() - .collect() - .set { ch_software_versions } - - GET_SOFTWARE_VERSIONS ( - ch_software_versions.map { it }.collect() + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') ) } @@ -151,7 +155,7 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) } NfcoreTemplate.summary(workflow, params, log) - WorkflowFetchngs.curateSamplesheetWarn(log) + WorkflowSra.curateSamplesheetWarn(log) } /* From 5f2179847e90703cd2b6d25bbced90b2924076f7 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 9 Oct 2021 09:28:48 +0100 Subject: [PATCH 026/106] Fix nf-core lint --- .nf-core.yml | 3 +++ bin/scrape_software_versions.py | 36 -------------------------- subworkflows/local/input_check.nf | 42 ------------------------------- 3 files changed, 3 insertions(+), 78 deletions(-) 
delete mode 100755 bin/scrape_software_versions.py delete mode 100644 subworkflows/local/input_check.nf diff --git a/.nf-core.yml b/.nf-core.yml index 12fe7aa8..92e07317 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -3,3 +3,6 @@ lint: - .github/CONTRIBUTING.md - assets/sendmail_template.txt - lib/NfcoreTemplate.groovy + files_exist: + - bin/scrape_software_versions.py + - modules/local/get_software_versions.nf diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py deleted file mode 100755 index a4e45493..00000000 --- a/bin/scrape_software_versions.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -import os - -results = {} -version_files = [x for x in os.listdir(".") if x.endswith(".version.txt")] -for version_file in version_files: - - software = version_file.replace(".version.txt", "") - if software == "pipeline": - software = "nf-core/fetchngs" - - with open(version_file) as fin: - version = fin.read().strip() - results[software] = version - -# Dump to YAML -print( - """ -id: 'software_versions' -section_name: 'nf-core/fetchngs Software Versions' -section_href: 'https://github.com/nf-core/fetchngs' -plot_type: 'html' -description: 'are collected at run time from the software output.' -data: | -
<dl class="dl-horizontal"> -""" -) -for k, v in sorted(results.items()): -    print("    <dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v)) -print("    </dl>
") - -# Write out as tsv file: -with open("software_versions.tsv", "w") as f: - for k, v in sorted(results.items()): - f.write("{}\t{}\n".format(k, v)) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index b664bc8c..00000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,42 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -params.options = [:] - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channels(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channels(LinkedHashMap row) { - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - def array = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - array = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return array -} From 335481c640aa088e035a0ad410f32ae82262244d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sat, 9 Oct 2021 16:02:00 +0100 Subject: [PATCH 027/106] Get Synapse workflow running - tests still failing --- lib/WorkflowMain.groovy | 2 +- modules/local/synapse_get.nf | 5 +- modules/local/synapse_list.nf | 3 +- modules/local/synapse_show.nf | 3 +- workflows/sra.nf | 20 +++--- workflows/synapse.nf | 130 ++++++++++++++++++---------------- 6 files changed, 86 insertions(+), 77 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 52cfdaea..f1804f51 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -73,7 +73,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input ids.txt'" + log.error "Please provide an input file containing ids to the pipeline - one per line e.g. 
'--input ids.txt'" System.exit(1) } } diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 5061e4d8..1581d41a 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -31,11 +31,12 @@ process SYNAPSE_GET { synapse \\ -c $config \\ get \\ - $synid + $options.args \\ + $id \\ cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) ) + ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index de8ee867..367d1e16 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -31,12 +31,13 @@ process SYNAPSE_LIST { synapse \\ -c $config \\ list \\ + $options.args \\ -l $id \\ | cut -c-11 > ${id}.synlist.csv cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) ) + ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 6f0f0b1c..51bb5420 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -31,12 +31,13 @@ process SYNAPSE_SHOW { synapse \\ -c $config \\ show \\ + $options.args \\ $id \\ | sed -n '1,3p;15,16p;20p;23p' > ${id}.metadata.txt cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( echo \$(synapse --version) ) + ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/workflows/sra.nf b/workflows/sra.nf index 25e214df..835bd874 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -13,17 +13,13 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters WorkflowSra.initialise(params, log, valid_params) -// Check mandatory parameters -if (params.input) { - Channel - .from(file(params.input, checkIfExists: true)) - .splitCsv(header:false, sep:'', strip:true) - .map { it[0] } - .unique() - .set { ch_ids } -} else { - exit 1, 'Input file with public database ids not specified!' 
-} +// Read in ids from --input file +Channel + .from(file(params.input, checkIfExists: true)) + .splitCsv(header:false, sep:'', strip:true) + .map { it[0] } + .unique() + .set { ch_ids } /* ======================================================================================== @@ -137,7 +133,7 @@ workflow SRA { } // - // MODULE: Pipeline reporting + // MODULE: Dump software versions for all tools used in the workflow // CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') diff --git a/workflows/synapse.nf b/workflows/synapse.nf index fd024af8..93f281e7 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -4,25 +4,21 @@ ======================================================================================== */ -def valid_params = [ - ena_metadata_fields : ['run_accession', 'experiment_accession', 'library_layout', 'fastq_ftp', 'fastq_md5'] -] - def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) -// Validate input parameters -WorkflowFetchngs.initialise(params, log, valid_params) - -// Check mandatory parameters -if (params.input) { - Channel - .from(file(params.input, checkIfExists: true)) - .splitCsv(header:false, sep:'', strip:true) - .map { it[0] } - .unique() - .set { ch_ids } +// Read in ids from --input file +Channel + .from(file(params.input, checkIfExists: true)) + .splitCsv(header:false, sep:'', strip:true) + .map { it[0] } + .unique() + .set { ch_ids } + +// Create channel for synapse config +if (params.synapse_config) { + ch_synapse_config = file(params.synapse_config, checkIfExists: true) } else { - exit 1, 'Input file with Synapse IDs not specified!' + exit 1, 'Please provide a Synapse config file for download authentication!' } /* @@ -34,13 +30,20 @@ if (params.input) { // Don't overwrite global params.modules, create a copy instead and use that within the main script. 
def modules = params.modules.clone() -include { SYNAPSE_LIST } from '../modules/local/synapse_list' addParams( options: modules['synapse_list'] ) -include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) -include { SYNAPSE_SHOW } from '../modules/local/synapse_show' addParams( options: modules['synapse_show'] ) -include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' addParams( options: modules['synapse_to_samplesheet'] ) -include { SYNAPSE_METADATA_MAPPING } from '../modules/local/synapse_metadata_mapping' addParams( options: modules['synapse_metadata_mapping'] ) -include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' addParams( options: modules['synapse_merge_samplesheet']) -include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['tsv':'']] ) +include { SYNAPSE_LIST } from '../modules/local/synapse_list' addParams( options: modules['synapse_list'] ) +include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) +include { SYNAPSE_SHOW } from '../modules/local/synapse_show' addParams( options: modules['synapse_show'] ) +include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' addParams( options: modules['synapse_to_samplesheet'] ) +include { SYNAPSE_METADATA_MAPPING } from '../modules/local/synapse_metadata_mapping' addParams( options: modules['synapse_metadata_mapping'] ) +include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' addParams( options: modules['synapse_merge_samplesheet'] ) + +/* +======================================================================================== + IMPORT NF-CORE MODULES/SUBWORKFLOWS +======================================================================================== +*/ + +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' addParams( options: [publish_files : ['_versions.yml':'']] ) /* ======================================================================================== @@ -50,51 +53,58 @@ include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versi workflow SYNAPSE { - ch_software_versions = Channel.empty() - - // CHANNEL: Stage Synapse Config File - ch_synapseConfig = file( params.synapse_config ) + ch_versions = Channel.empty() - // MODULE: Get individual FastQ SynapseIDs from Directory SynapseID(s) + // + // MODULE: Get individual FastQ synapse ids from directory based synapse ids + // SYNAPSE_LIST ( ch_ids, - ch_synapseConfig + ch_synapse_config ) - ch_software_versions = ch_software_versions.mix(SYNAPSE_LIST.out.version.first().ifEmpty(null)) + ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) - // CHANNEL: Create channel for FQ SynapseIDs + // Create channel for FastQ synapse ids SYNAPSE_LIST .out - .synlist_csv + .csv .splitCsv(header:false).flatten() .set { ch_samples } - // MODULE: Download FastQ Files by SynapseID + // + // MODULE: Download FastQs by synapse id + // SYNAPSE_GET ( ch_samples, - ch_synapseConfig + ch_synapse_config ) + ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) - // CHANNEL: Create Read Pairs Channel - Creates format [sampleId, [fastq_1, fastq_2]] + // Create read pair channel: [ sampleId, [ fastq_1, fastq_2 ] ] SYNAPSE_GET .out .fastq - .collect().flatten() - .toSortedList().flatten() + .collect() + .flatten() + .toSortedList() + .flatten() .map { 
meta -> - def sampleId = meta.name.toString().tokenize('_').get(0) - [sampleId, meta] + def id = meta.name.toString().tokenize('_').get(0) + [ id, meta ] } .groupTuple() - .set{ ch_read_pairs } + .set { ch_read_pairs } - // MODULE: Download FQ Metadata by SynapseID + // + // MODULE: Download FastQ metadata by synapse id + // SYNAPSE_SHOW ( ch_samples, - ch_synapseConfig + ch_synapse_config ) + ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) - // CHANNEL: Clean Metadata + // Clean metadata in channels SYNAPSE_SHOW .out .metadata @@ -103,34 +113,34 @@ .collate( 6 ) .set { ch_meta } - // MODULE: Compile Metadata + // + // MODULE: Compile metadata + // SYNAPSE_METADATA_MAPPING ( ch_meta ) - // MODULE: Create Samplesheet + // + // MODULE: Create samplesheet + // SYNAPSE_TO_SAMPLESHEET ( - ch_read_pairs, + ch_read_pairs ) - // MODULE: Merge Samplesheets + // + // MODULE: Merge samplesheets + // SYNAPSE_MERGE_SAMPLESHEET ( SYNAPSE_TO_SAMPLESHEET.out.samplesheet.collect(), SYNAPSE_METADATA_MAPPING.out.metasheet.collect() ) + ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) - // MODULE: Pipeline reporting - ch_software_versions - .map { it -> if (it) [ it.baseName, it ] } - .groupTuple() - .map { it[1][0] } - .flatten() - .collect() - .set { ch_software_versions } - - // MODULE: Get Software Versions - GET_SOFTWARE_VERSIONS ( - ch_software_versions.map { it }.collect() + // + // MODULE: Dump software versions for all tools used in the workflow + // + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') ) } @@ -145,7 +155,7 @@ workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) } NfcoreTemplate.summary(workflow, params, log) - WorkflowFetchngs.curateSamplesheetWarn(log) + //WorkflowSynapse.curateSamplesheetWarn(log) } /* From b2dacad49396b4de6dbc4c836f132b4e86f55255 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 14 Oct 2021 22:28:28 +0100 Subject: [PATCH 028/106] Fix all the things --- .github/workflows/ci.yml | 38 +++++----- conf/modules.config | 8 +- lib/WorkflowSynapse.groovy | 38 ++++++++++ modules/local/synapse_get.nf | 15 ++-- modules/local/synapse_list.nf | 7 +- modules/local/synapse_merge_samplesheet.nf | 12 +-- modules/local/synapse_metadata_mapping.nf | 36 --------- modules/local/synapse_show.nf | 3 +- modules/local/synapse_to_samplesheet.nf | 55 +++++++++----- workflows/synapse.nf | 87 +++++++++++----------- 10 files changed, 156 insertions(+), 143 deletions(-) create mode 100755 lib/WorkflowSynapse.groovy delete mode 100644 modules/local/synapse_metadata_mapping.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7788b5de..a9c0d4c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,27 +39,27 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker - test_synapse: - name: Test Synapse workflow - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} - runs-on: ubuntu-latest - env: - NXF_VER: ${{ matrix.nxf_ver }} - NXF_ANSI_LOG: false - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 + # test_synapse: + # name: Test Synapse workflow + # if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} + # runs-on: ubuntu-latest + # env: + # NXF_VER: ${{ matrix.nxf_ver }} + # NXF_ANSI_LOG: false + # steps: + # - name: Check out pipeline code + # uses: 
actions/checkout@v2 - - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + # - name: Install Nextflow + # env: + # CAPSULE_LOG: none + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ - - name: Run pipeline with synapse test data - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_synapse,docker + # - name: Run pipeline with synapse test data + # run: | + # nextflow run ${GITHUB_WORKSPACE} -profile test_synapse,docker parameters: name: Test workflow parameters diff --git a/conf/modules.config b/conf/modules.config index f3da4f94..0d44e9ee 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,18 +43,16 @@ params { publish_dir = 'samplesheet' } 'synapse_list' { - publish_dir = 'synapse' + args = '--long' + publish_dir = 'metadata' } 'synapse_get' { publish_dir = 'fastq' + publish_files = ['fastq.gz':'', 'md5':'md5'] } 'synapse_show' { publish_dir = 'metadata' } - 'synapse_metadata_mapping' { - publish_dir = 'metadata' - publish_files = false - } 'synapse_to_samplesheet' { publish_dir = 'samplesheet' publish_files = false diff --git a/lib/WorkflowSynapse.groovy b/lib/WorkflowSynapse.groovy new file mode 100755 index 00000000..1ac249a1 --- /dev/null +++ b/lib/WorkflowSynapse.groovy @@ -0,0 +1,38 @@ +// +// This file holds several functions specific to the workflow/synapse.nf in the nf-core/fetchngs pipeline +// + +class WorkflowSynapse { + + // + // Convert metadata obtained from the 'synapse show' command to a Groovy map + // + public static Map synapseShowToMap(synapse_file) { + def meta = [:] + def category = '' + synapse_file.eachLine { line -> + def entries = [null, null] + if (!line.startsWith(' ') && !line.trim().isEmpty()) { + category = line.tokenize(':')[0] + } else { + entries = line.trim().tokenize('=') + } + meta["${category}|${entries[0]}"] = entries[1] + } + meta.id = meta['properties|id'] + meta.md5 = meta['File|md5'] + return meta.findAll{ it.value != null } + } + + // + // Print a warning after pipeline has completed + // + public static void curateSamplesheetWarn(log) { + log.warn "=============================================================================\n" + + " Please double-check the samplesheet that has been auto-created by the pipeline.\n\n" + + " Where applicable, default values will be used for sample-specific metadata\n" + + " such as strandedness, controls etc as this information is not provided\n" + + " in a standardised manner when uploading data to Synapse.\n" + + "===================================================================================" + } +} diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 1581d41a..3f4d0655 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -5,8 +5,9 @@ params.options = [:] options = initOptions(params.options) process SYNAPSE_GET { - tag "$id" + tag "$meta.id" label 'process_low' + label 'error_retry' publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } @@ -19,12 +20,13 @@ process SYNAPSE_GET { } input: - val id + val meta path config output: - path "*.fastq.gz" , emit: fastq - path "versions.yml", emit: versions + tuple val(meta), path("*.fastq.gz"), emit: fastq + tuple val(meta), path("*md5") , emit: md5 + path "versions.yml" , emit: versions script: """ @@ -32,7 +34,10 @@ process 
SYNAPSE_GET { -c $config \\ get \\ $options.args \\ - $id \\ + $meta.id + + find ./ -type f -name "*.fastq.gz" -exec echo "${meta.md5} " {} \\; > ${meta.id}.md5 + md5sum -c ${meta.id}.md5 cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index 367d1e16..20275de2 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -23,7 +23,7 @@ process SYNAPSE_LIST { path config output: - path "*.csv" , emit: csv + path "*.txt" , emit: txt path "versions.yml", emit: versions script: @@ -32,8 +32,9 @@ process SYNAPSE_LIST { -c $config \\ list \\ $options.args \\ - -l $id \\ - | cut -c-11 > ${id}.synlist.csv + $id \\ + $options.args2 \\ + > ${id}.list.txt cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index f634118b..8027818c 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -1,12 +1,9 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' +include { saveFiles; getSoftwareName; getProcessName } from './functions' params.options = [:] -options = initOptions(params.options) process SYNAPSE_MERGE_SAMPLESHEET { - tag 'merge_samplesheet' - label 'process_low' publishDir "${params.outdir}", mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } @@ -20,11 +17,9 @@ process SYNAPSE_MERGE_SAMPLESHEET { input: path ('samplesheets/*') - path ('metasheet/*') output: path "samplesheet.csv", emit: samplesheet - path "metasheet.csv" , emit: metasheet path "versions.yml" , emit: versions script: @@ -34,11 +29,6 @@ process SYNAPSE_MERGE_SAMPLESHEET { awk 'NR>1' \$fileid >> samplesheet.csv done - head -n 1 `ls ./metasheet/* | head -n 1` > metasheet.csv - for fileid in `ls ./metasheet/*`; do - awk 'NR>1' \$fileid >> metasheet.csv - done - cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') diff --git a/modules/local/synapse_metadata_mapping.nf b/modules/local/synapse_metadata_mapping.nf deleted file mode 100644 index a513da09..00000000 --- a/modules/local/synapse_metadata_mapping.nf +++ /dev/null @@ -1,36 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process SYNAPSE_METADATA_MAPPING { - tag "${data[3]}" - label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - - input: - val data - - output: - path "*.csv", emit: metasheet - - exec: - meta_map = [ - md5 : "${data[0]}", - fileSize : "${data[1]}", - etag : "${data[2]}", - id : "${data[3]}", - fileName : "${data[4]}", - fileVersion : "${data[5]}" - ] - - // Create Metadata Sheet - metasheet = meta_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' - metasheet += meta_map.values().collect{ '"' + it + '"'}.join(",") - - def metasheet_file = task.workDir.resolve("${meta_map.id}.metasheet.csv") - metasheet_file.text = metasheet -} diff --git a/modules/local/synapse_show.nf 
b/modules/local/synapse_show.nf index 51bb5420..4edc4207 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -33,7 +33,8 @@ process SYNAPSE_SHOW { show \\ $options.args \\ $id \\ - | sed -n '1,3p;15,16p;20p;23p' > ${id}.metadata.txt + $options.args2 \\ + > ${id}.metadata.txt cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 7b301ddf..28571005 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -1,38 +1,57 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { saveFiles; getSoftwareName } from './functions' -params.options = [:] -options = initOptions(params.options) +params.options = [:] +params.results_dir = '' process SYNAPSE_TO_SAMPLESHEET { - tag "$id" - label 'process_low' + tag "$meta.id" publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + memory 100.MB input: - tuple val(id), val(files) + tuple val(meta), path(fastq) + val pipeline output: - path "*.csv", emit: samplesheet + tuple val(meta), path("*.csv"), emit: samplesheet exec: - // Add fields to the beginning of the map + // Remove custom keys + def meta_map = meta.clone() + meta_map.remove("id") + + def fastq_1 = "${params.outdir}/${params.results_dir}/${fastq}" + def fastq_2 = '' + if (fastq instanceof List && fastq.size() == 2) { + fastq_1 = "${params.outdir}/${params.results_dir}/${fastq[0]}" + fastq_2 = "${params.outdir}/${params.results_dir}/${fastq[1]}" + } + + // Add relevant fields to the beginning of the map pipeline_map = [ - sample : "${id}", - fastq_1 : "${params.outdir}/fastq/${files[0]}", - fastq_2 : "${params.outdir}/fastq/${files[1]}" + sample : "${meta.id}", + fastq_1 : fastq_1, + fastq_2 : fastq_2 ] - // Add Strandedness - pipeline_map << [ strandedness: "unstranded" ] - // Create Samplesheet + // Add nf-core pipeline specific entries + if (pipeline) { + if (pipeline == 'rnaseq') { + pipeline_map << [ strandedness: 'unstranded' ] + } + } + pipeline_map << meta_map + + // Create a samplesheet samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") - def samplesheet_file2 = task.workDir.resolve("${pipeline_map.sample}.samplesheet.csv") - samplesheet_file2.text = samplesheet - + // Write samplesheet to file + def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") + samplesheet_file.text = samplesheet } diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 93f281e7..3d993789 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -31,10 +31,9 @@ if (params.synapse_config) { def modules = params.modules.clone() include { SYNAPSE_LIST } from '../modules/local/synapse_list' addParams( options: modules['synapse_list'] ) -include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) include { SYNAPSE_SHOW } from '../modules/local/synapse_show' addParams( options: modules['synapse_show'] ) -include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' 
addParams( options: modules['synapse_to_samplesheet'] ) -include { SYNAPSE_METADATA_MAPPING } from '../modules/local/synapse_metadata_mapping' addParams( options: modules['synapse_metadata_mapping'] ) +include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) +include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' addParams( options: modules['synapse_to_samplesheet'], results_dir: modules['synapse_get'].publish_dir ) include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' addParams( options: modules['synapse_merge_samplesheet'] ) /* @@ -56,7 +55,7 @@ workflow SYNAPSE { ch_versions = Channel.empty() // - // MODULE: Get individual FastQ synapse ids from directory based synapse ids + // MODULE: Expand synapse ids for individual FastQ files // SYNAPSE_LIST ( ch_ids, @@ -67,74 +66,72 @@ workflow SYNAPSE { // Create channel for FastQ synapse ids SYNAPSE_LIST .out - .csv - .splitCsv(header:false).flatten() + .txt + .splitCsv(header:false, sep:' ') + .map { it[0] } + .unique() .set { ch_samples } // - // MODULE: Download FastQs by synapse id + // MODULE: Download metadata for each synapse id // - SYNAPSE_GET ( + SYNAPSE_SHOW ( ch_samples, ch_synapse_config ) - ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) + ch_versions = ch_versions.mix(SYNAPSE_SHOW.out.versions.first()) - // Create read pair channel: [ sampleId, [ fastq_1, fastq_2 ] ] - SYNAPSE_GET + // Get metadata into channels + SYNAPSE_SHOW .out - .fastq - .collect() - .flatten() - .toSortedList() - .flatten() - .map { meta -> - def id = meta.name.toString().tokenize('_').get(0) - [ id, meta ] - } - .groupTuple() - .set { ch_read_pairs } + .metadata + .map { it -> WorkflowSynapse.synapseShowToMap(it) } + .set { ch_samples_meta } // - // MODULE: Download FastQ metadata by synapse id + // MODULE: Download FastQs by synapse id // - SYNAPSE_SHOW ( - ch_samples, + SYNAPSE_GET ( + ch_samples_meta, ch_synapse_config ) - ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) + ch_versions = ch_versions.mix(SYNAPSE_GET.out.versions.first()) - // Clean metadata in channels - SYNAPSE_SHOW + // Combine channels for PE/SE FastQs: [ [ id:SRR6357070, synapse_ids:syn26240474;syn26240477 ], [ fastq_1, fastq_2 ] ] + SYNAPSE_GET .out - .metadata - .splitCsv(strip:true, sep:"=", skip:1) - .map { it[1] } - .collate( 6 ) - .set { ch_meta } + .fastq + .map { meta, fastq -> [ fastq.baseName.tokenize('_')[0], fastq ] } + .groupTuple(sort: { it -> it.baseName }) + .set { ch_fastq } - // - // MODULE: Compile metadata - // - SYNAPSE_METADATA_MAPPING ( - ch_meta - ) + SYNAPSE_GET + .out + .fastq + .map { meta, fastq -> [ fastq.baseName.tokenize('_')[0], meta.id ] } + .groupTuple() + .join(ch_fastq) + .map { id, synids, fastq -> + def meta = [ id:id, synapse_ids:synids.join(';') ] + [ meta, fastq ] + } + .set { ch_fastq } // - // MODULE: Create samplesheet + // MODULE: Create samplesheet per sample // SYNAPSE_TO_SAMPLESHEET ( - ch_read_pairs + ch_fastq, + params.nf_core_pipeline ?: '' ) // // MODULE: Merge samplesheets // SYNAPSE_MERGE_SAMPLESHEET ( - SYNAPSE_TO_SAMPLESHEET.out.samplesheet.collect(), - SYNAPSE_METADATA_MAPPING.out.metasheet.collect() + SYNAPSE_TO_SAMPLESHEET.out.samplesheet.collect{ it[1] } ) - ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) + ch_versions = ch_versions.mix(SYNAPSE_MERGE_SAMPLESHEET.out.versions) // // MODULE: Dump software versions for all tools used in the workflow @@ -155,7 +152,7 @@ 
workflow.onComplete { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log) } NfcoreTemplate.summary(workflow, params, log) - //WorkflowSynapse.curateSamplesheetWarn(log) + WorkflowSynapse.curateSamplesheetWarn(log) } /* From 82b68e1e95faf8961616d4933f900a1ec1ef8a28 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 14 Oct 2021 22:31:38 +0100 Subject: [PATCH 029/106] Add meta entries to saveAs --- modules/local/synapse_get.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 3f4d0655..1e959296 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -10,7 +10,7 @@ process SYNAPSE_GET { label 'error_retry' publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } conda (params.enable_conda ? "bioconda::synapseclient=2.4.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { From 968f2fc9380153c87e8b5263c5cf5912267194d3 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 14 Oct 2021 22:35:39 +0100 Subject: [PATCH 030/106] Update CHANGELOG --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cc2d511..9e9a0cb2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes -* Handle identifiers that do **not** return metadata, for example, due to being private. - +* Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). +* Handle SRA identifiers that do **not** return metadata, for example, due to being private. * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). 
See examples below: | `DDBJ` | From 87fa87e1496ee28a53ac751f65272cd3d89d4de0 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 14 Oct 2021 22:37:41 +0100 Subject: [PATCH 031/106] Remove Synapse CI --- .github/workflows/ci.yml | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9c0d4c9..08ac1cfb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,28 +39,6 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker - # test_synapse: - # name: Test Synapse workflow - # if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} - # runs-on: ubuntu-latest - # env: - # NXF_VER: ${{ matrix.nxf_ver }} - # NXF_ANSI_LOG: false - # steps: - # - name: Check out pipeline code - # uses: actions/checkout@v2 - - # - name: Install Nextflow - # env: - # CAPSULE_LOG: none - # run: | - # wget -qO- get.nextflow.io | bash - # sudo mv nextflow /usr/local/bin/ - - # - name: Run pipeline with synapse test data - # run: | - # nextflow run ${GITHUB_WORKSPACE} -profile test_synapse,docker - parameters: name: Test workflow parameters if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} From 4093bba59aa7ab00696fa799037f1563f3a76a2b Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Fri, 15 Oct 2021 12:47:29 -0700 Subject: [PATCH 032/106] Update Docs --- CITATIONS.md | 3 +++ README.md | 8 ++++---- docs/output.md | 10 +++++----- docs/usage.md | 20 +++++++++++--------- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 4a1f2752..01819d3e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -28,6 +28,9 @@ * [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/) > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084. +* [Synapse](https://pubmed.ncbi.nlm.nih.gov/24071850/) + > Omberg L, Ellrott K, Yuan Y, Kandoth C, Wong C, Kellen MR, Friend SH, Stuart J, Liang H, Margolin AA. Enabling transparent and collaborative computational analysis of 12 tumor types within The Cancer Genome Atlas. Nat Genet. 2013 Oct;45(10):1121-6. doi: 10.1038/ng.2761. PMID: 24071850; PMCID: PMC3950337. + ## Software packaging/containerisation tools * [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index e378eacc..cb60455e 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports [SRA](https://www.ncbi.nlm.nih.gov/sra) / [ENA](https://www.ebi.ac.uk/ena/browser/home) / [DDBJ](https://www.ddbj.nig.ac.jp/index-e.html) / [GEO](https://www.ncbi.nlm.nih.gov/geo/) / [Synapse](https://www.synapse.org/#) ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). 
The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -26,9 +26,9 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)) the pipeline performs the following steps: -1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) -2. Fetch extensive id metadata including direct download links to FastQ files via ENA API -3. Download FastQ files in parallel via `curl` and perform `md5sum` check +1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html), or appropriate database +2. Fetch extensive id metadata including direct download links to FastQ files via ENA API or appropriate database +3. Download FastQ files in parallel and perform `md5sum` check 4. Collate id metadata and paths to FastQ files in a single samplesheet The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. diff --git a/docs/output.md b/docs/output.md index 4ffc507e..167c1f69 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO ids +* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO / Synapse ids * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### FastQ download @@ -19,18 +19,18 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d * `fastq/` * `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. * `fastq/md5/` - * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. + * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA or appropriate database. * `samplesheet/` * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. * `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. 
* `multiqc_config.yml`: [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) config file that can be passed to most nf-core pipelines via the `--multiqc_config` parameter for bulk renaming of sample names from database ids; [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. * `metadata/` - * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA. - * `*.runinfo.tsv`: Original metadata file downloaded from the ENA. + * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA or appropriate database. + * `*.runinfo.tsv`: Original metadata file downloaded from the ENA or appropriate database. -Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. +Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for `SRA`, `GEO`, `DDBJ`, and `ENA` identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. Sample information for `Synapse` identifiers are downloaded in parallel directly from the [Synapse](https://www.synapse.org/#) platform. A [configuration file](http://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig) containing valid login credentials is required for Synapse downloads. ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index 1e6066db..e1a096f4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,20 +8,22 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from public repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. 
Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `GEO` | -|--------------|--------------|--------------|------------| -| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | -| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | -| SRS6531847 | ERS4399630 | DRS090921 | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | -| SRP256957 | ERP120836 | DRP004793 | | -| SRA1068758 | ERA2420837 | DRA008156 | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | +| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | +|--------------|--------------|--------------|------------|-------------| +| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | +| SRS6531847 | ERS4399630 | DRS090921 | | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | +| SRP256957 | ERP120836 | DRP004793 | | | +| SRA1068758 | ERA2420837 | DRA008156 | | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. We may add this functionality in later releases. +To download data from `Synapse`, the SynapseID of the directory containing all files to be downloaded should be provided. The directory ID will then be resolved to the SynapseIDs of the corresponding FastQ files, which are then downloaded in parallel using the `synapse get` command. In order to download data from Synapse, an account must be created and a user configuration file provided via the parameter `--synapse_config`. All Synapse metadata, annotations and provenance are also downloaded using the `synapse show` command, and are output to a separate metadata file. By default, only the md5sums, file sizes, etags, Synapse IDs, file names, and file versions are shown. + As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`. 
All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run]). From c2029b3f8a704175a0d4b1ef4b41fcf43bd483cf Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Fri, 15 Oct 2021 12:48:24 -0700 Subject: [PATCH 033/106] Add RegEx for SampleIDs --- lib/WorkflowSynapse.groovy | 53 ++++++++++++++++++++++++++++++++++++++ workflows/synapse.nf | 4 +-- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/lib/WorkflowSynapse.groovy b/lib/WorkflowSynapse.groovy index 1ac249a1..d36d1e55 100755 --- a/lib/WorkflowSynapse.groovy +++ b/lib/WorkflowSynapse.groovy @@ -35,4 +35,57 @@ class WorkflowSynapse { " in a standardised manner when uploading data to Synapse.\n" + "===================================================================================" } + + // + // Obtain Sample ID from File Name + // + public static String getSampleID(input_file, pattern) { + + def sampleids = "" + + def filePattern = pattern.toString() + int p = filePattern.lastIndexOf('/') + if( p != -1 ) + filePattern = filePattern.substring(p+1) + + input_file.each { + String fileName = input_file.getFileName().toString() + + String indexOfWildcards = filePattern.findIndexOf { it=='*' || it=='?' } + String indexOfBrackets = filePattern.findIndexOf { it=='{' || it=='[' } + if( indexOfWildcards==-1 && indexOfBrackets==-1 ) { + if( fileName == filePattern ) + return actual.getSimpleName() + throw new IllegalArgumentException("Not a valid file pair globbing pattern: pattern=$filePattern file=$fileName") + } + + int groupCount = 0 + for( int i=0; i [ fastq.baseName.tokenize('_')[0], fastq ] } + .map { meta, fastq -> [ WorkflowSynapse.getSampleID( fastq , "*{1,2}*"), fastq ] } .groupTuple(sort: { it -> it.baseName }) .set { ch_fastq } SYNAPSE_GET .out .fastq - .map { meta, fastq -> [ fastq.baseName.tokenize('_')[0], meta.id ] } + .map { meta, fastq -> [ WorkflowSynapse.getSampleID( fastq , "*{1,2}*"), meta.id ] } .groupTuple() .join(ch_fastq) .map { id, synids, fastq -> From 6a4ae0aca6f266cb013140a789eefe831a4ffc2f Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:06:28 -0700 Subject: [PATCH 034/106] Separate SRA/Synapse Docs --- README.md | 16 ++++++++++++---- docs/output.md | 51 ++++++++++++++++++++++++++++++++++++++++++++------ docs/usage.md | 33 ++++++++++++++++++++++++++------ 3 files changed, 84 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index cb60455e..889d4a87 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. 
At present, the pipeline supports [SRA](https://www.ncbi.nlm.nih.gov/sra) / [ENA](https://www.ebi.ac.uk/ena/browser/home) / [DDBJ](https://www.ddbj.nig.ac.jp/index-e.html) / [GEO](https://www.ncbi.nlm.nih.gov/geo/) / [Synapse](https://www.synapse.org/#) ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -26,11 +26,19 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)) the pipeline performs the following steps: -1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html), or appropriate database -2. Fetch extensive id metadata including direct download links to FastQ files via ENA API or appropriate database -3. Download FastQ files in parallel and perform `md5sum` check +**SRA / ENA / DDBJ / GEO Identifiers** +1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) +2. Fetch extensive id metadata including direct download links to FastQ files via ENA API +3. Download FastQ files in parallel via `curl` and perform `md5sum` check 4. Collate id metadata and paths to FastQ files in a single samplesheet +**Synapse Identifiers** +1. Resolve Synapse directory SynapseIDs to the SynapseIDs of the FastQ files contained, using the `synapse list` command. +2. Retrieve FastQ file metadata including FastQ file names, md5sums, etags, annotations, and data provenance via the `synapse show` command. +3. Download FastQ files in parallel via `synapse get` and perform `md5sum` check +4. Collate paths to FastQ files in a single samplesheet + +**Samplesheet Generation** The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. ## Quick Start diff --git a/docs/output.md b/docs/output.md index 167c1f69..acc1d956 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,11 +4,11 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. 
All paths are relative to the top-level results directory. -## Pipeline overview +## SRA / ENA / DDBJ / GEO - Pipeline Overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO / Synapse ids +* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO ids * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution ### FastQ download @@ -19,18 +19,18 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d * `fastq/` * `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. * `fastq/md5/` - * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA or appropriate database. + * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. * `samplesheet/` * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. * `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. * `multiqc_config.yml`: [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) config file that can be passed to most nf-core pipelines via the `--multiqc_config` parameter for bulk renaming of sample names from database ids; [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. * `metadata/` - * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA or appropriate database. - * `*.runinfo.tsv`: Original metadata file downloaded from the ENA or appropriate database. + * `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA. + * `*.runinfo.tsv`: Original metadata file downloaded from the ENA. -Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for `SRA`, `GEO`, `DDBJ`, and `ENA` identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. Sample information for `Synapse` identifiers are downloaded in parallel directly from the [Synapse](https://www.synapse.org/#) platform. A [configuration file](http://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig) containing valid login credentials is required for Synapse downloads. +Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. 
This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump` and `prefetch` require pre-existing configuration files in the user's home directory, which makes automation tricky across different platforms and containerisation.

### Pipeline information

@@ -43,4 +43,43 @@ Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introductio

+## Synapse - Pipeline Overview
+
+The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
+
+* [FastQ download](#fastq-Download) - Download FastQ files via Synapse ids
+* [Pipeline information](#pipeline-Information) - Report metrics generated during the workflow execution
+
+### FastQ Download
+
+<details markdown="1">
+Output files + +* `fastq/` + * `*.fastq.gz`: Paired-end/single-end reads downloaded from Synapse. +* `fastq/md5/` + * `*.md5`: Files containing `md5` sum for FastQ files downloaded from the Synapse platform. +* `samplesheet/` + * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. +* `metadata/` + * `*.metadata.txt`: Original metadata file, generated using the `synapse show` command. + * `*.list.txt`: Original output of the `synapse list` command, containing the SynapseIDs, file version numbers, file names, and other file-specific data of the Synapse directory ID provided. + +
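The `*.list.txt` and `*.metadata.txt` outputs above are essentially captured output of the Synapse command-line client. As a rough, hand-run sketch (assuming `syn12345678` is a made-up directory id and that the Synapse client is already configured):

```bash
# Capture the directory listing (file ids, versions, names) and the
# directory-level metadata, mirroring the *.list.txt and *.metadata.txt files.
synapse list syn12345678 > syn12345678.list.txt
synapse show syn12345678 > syn12345678.metadata.txt
```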
+ +Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for detailed instructions on how to correctly provide Synapse IDs to the pipeline. FastQ files and corresponding sample information for `Synapse` identifiers are downloaded in parallel directly from the [Synapse](https://www.synapse.org/#) platform. A [configuration file](http://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig) containing valid login credentials is required for Synapse downloads. + +The final sample information for the FastQ files downloaded from `Synapse` is obtained from the file name itself. The file names are parsed according to the glob pattern `*{1,2}*`. This returns the sample name, presumed to be the longest possible string matching the glob pattern, with the fewest number of wildcard insertions. Further information on sample name parsing can be found in the [usage documentation](https://nf-co.re/fetchngs/usage#introduction). + +### Pipeline Information + +
+Output files + +* `pipeline_info/` + * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`. + +
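The reports listed above come from Nextflow's built-in tracing rather than from pipeline code. Purely as an illustration (the pipeline already enables these in its own configuration), the equivalent command-line flags would look something like:

```bash
# Assumed invocation; the flags are standard Nextflow tracing options.
nextflow run nf-core/fetchngs \
    --input ids.txt \
    -profile docker \
    -with-report execution_report.html \
    -with-timeline execution_timeline.html \
    -with-trace execution_trace.txt \
    -with-dag pipeline_dag.svg
```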
+ [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/usage.md b/docs/usage.md index e1a096f4..6e9787ad 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,7 @@ ## Introduction -The pipeline has been set-up to automatically download and process the raw FastQ files from public repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: +The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: | `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | |--------------|--------------|--------------|------------|-------------| @@ -18,15 +18,13 @@ The pipeline has been set-up to automatically download and process the raw FastQ | SRA1068758 | ERA2420837 | DRA008156 | | | | PRJNA625551 | PRJEB37513 | PRJDB4176 | | | +**SRR / ERR / DRR Sample Identifiers** + If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. We may add this functionality in later releases. -To download data from `Synapse`, the SynapseID of the directory containing all files to be downloaded should be provided. The directory ID will then be resolved to the SynapseIDs of the corresponding FastQ files, which are then downloaded in parellel using the `synapse get` command. In order to download data from Synapse, an account must be created and a user configuration file provided via the parameter `--synapse_config`. All Synapse metadata, annotations and provenance are also downloaded using the `synapse show` command, and are outputted to a separate metadata file. By default, only the md5sums, file sizes, etags, Synapse IDs, file names, and file versions are shown. - -As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. 
It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`.
-
-All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run]).
+All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run).

If you have a GEO accession (found in the data availability section of published papers) you can directly download a text file containing the appropriate SRA ids to pass to the pipeline:

@@ -36,6 +34,29 @@ If you have a GEO accession (found in the data availability section of published

This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`.

+**Synapse Sample Identifiers**
+
+[Synapse](https://www.synapse.org/#) is a collaborative research platform created by [Sage Bionetworks](https://sagebionetworks.org/). Its aim is to promote reproducible research and responsible data sharing throughout the biomedical community. To download data from `Synapse`, the SynapseID of the _directory_ containing all files to be downloaded should be provided. The SynapseID should be an eleven-character identifier, beginning with `syn`.
+
+This SynapseID will then be resolved to the SynapseIDs of the corresponding FastQ files contained within the directory. The individual FastQ files are then downloaded in parallel using the `synapse get` command. All Synapse metadata, annotations and provenance are also downloaded using the `synapse show` command, and are written to a separate metadata file. By default, only the md5sums, file sizes, etags, Synapse IDs, file names, and file versions are shown.
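A rough bash equivalent of that resolve-then-download step, assuming `syn12345678` is a made-up directory id and that the first whitespace-delimited column of the `synapse list` output is the file id:

```bash
# Resolve the directory id to individual file ids, then fetch metadata and files.
synapse list syn12345678 | awk '{print $1}' | while read -r file_id; do
    synapse show "$file_id"   # metadata: md5sum, file size, etag, annotations, provenance
    synapse get "$file_id"    # download the FastQ file itself
done
```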
+ +In order to download data from Synapse, an account must be created and a user configuration file provided via the parameter `--synapse_config`. For more information about Synapse configuration, please see the [Synapse client configuration](https://help.synapse.org/docs/Client-Configuration.1985446156.html) documentation. + +The final sample information for the FastQ files used for samplesheet generation is obtained from the file name itself. The file names are parsed according to the glob pattern `*{1,2}*`, which returns the sample name, presumed to be the longest possible string matching the glob pattern, with the fewest number of wildcard insertions. + +
+Supported File Names + +* Files named `SRR493366_1.fastq` and `SRR493366_2.fastq` will have a sample name of `SRR493366` +* Files named `SRR_493_367_1.fastq` and `SRR_493_367_2.fastq` will have a sample name of `SRR_493_367` +* Files named `filename12_1.fastq` and `filename12_2.fastq` will have a sample name of `filename12` + +
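For the common `_1`/`_2` suffix layout shown above, the behaviour can be approximated in plain bash; the pipeline's actual Groovy implementation is more general about where the `1`/`2` read markers sit in the file name:

```bash
# Strip a trailing _1/_2 read marker plus extension to recover the sample name.
for f in SRR493366_1.fastq SRR493366_2.fastq SRR_493_367_1.fastq filename12_1.fastq; do
    echo "${f%_[12].fastq}"
done
# Prints, one per line: SRR493366, SRR493366, SRR_493_367, filename12
```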
+ +**Samplesheet Generation** + +As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`. + ## Running the pipeline The typical command for running the pipeline is as follows: From 884dafa7b455fe29aa38f44fc5058c7c8a2ddc10 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:06:46 -0700 Subject: [PATCH 035/106] Change SampleName function name --- lib/WorkflowSynapse.groovy | 2 +- workflows/synapse.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/WorkflowSynapse.groovy b/lib/WorkflowSynapse.groovy index d36d1e55..557d58bf 100755 --- a/lib/WorkflowSynapse.groovy +++ b/lib/WorkflowSynapse.groovy @@ -39,7 +39,7 @@ class WorkflowSynapse { // // Obtain Sample ID from File Name // - public static String getSampleID(input_file, pattern) { + public static String sampleNameFromFastQ(input_file, pattern) { def sampleids = "" diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 1f535f60..82125050 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -101,14 +101,14 @@ workflow SYNAPSE { SYNAPSE_GET .out .fastq - .map { meta, fastq -> [ WorkflowSynapse.getSampleID( fastq , "*{1,2}*"), fastq ] } + .map { meta, fastq -> [ WorkflowSynapse.sampleNameFromFastQ( fastq , "*{1,2}*"), fastq ] } .groupTuple(sort: { it -> it.baseName }) .set { ch_fastq } SYNAPSE_GET .out .fastq - .map { meta, fastq -> [ WorkflowSynapse.getSampleID( fastq , "*{1,2}*"), meta.id ] } + .map { meta, fastq -> [ WorkflowSynapse.sampleNameFromFastQ( fastq , "*{1,2}*"), meta.id ] } .groupTuple() .join(ch_fastq) .map { id, synids, fastq -> From fc071ecc4306c87f0b2943b5c1c652002db00d67 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:25:26 -0700 Subject: [PATCH 036/106] Fix md --- README.md | 10 ++++++---- docs/usage.md | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 889d4a87..312db468 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). 
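In practice this boils down to a plain-text file of ids and a single command; a minimal sketch using example accessions quoted elsewhere in this document:

```bash
# Any mix of SRA / ENA / DDBJ accessions is fine; Synapse ids must be
# supplied in a separate run.
cat > ids.txt <<'EOF'
SRR390278
ERX629702
PRJDB4176
EOF

nextflow run nf-core/fetchngs --input ids.txt -profile docker
```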
The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -26,19 +26,21 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)) the pipeline performs the following steps: -**SRA / ENA / DDBJ / GEO Identifiers** +### SRA / ENA / DDBJ / GEO Identifiers + 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata including direct download links to FastQ files via ENA API 3. Download FastQ files in parallel via `curl` and perform `md5sum` check 4. Collate id metadata and paths to FastQ files in a single samplesheet -**Synapse Identifiers** +### Synapse Identifiers + 1. Resolve Synapse directory SynapseIDs to the SynapseIDs of the FastQ files contained, using the `synapse list` command. 2. Retrieve FastQ file metadata including FastQ file names, md5sums, etags, annotations, and data provenance via the `synapse show` command. 3. Download FastQ files in parallel via `synapse get` and perform `md5sum` check 4. Collate paths to FastQ files in a single samplesheet -**Samplesheet Generation** +### Samplesheet Generation The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. ## Quick Start diff --git a/docs/usage.md b/docs/usage.md index 6e9787ad..0b2bfa44 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,7 +18,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ | SRA1068758 | ERA2420837 | DRA008156 | | | | PRJNA625551 | PRJEB37513 | PRJDB4176 | | | -**SRR / ERR / DRR Sample Identifiers** +### SRR / ERR / DRR Sample Identifiers If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. @@ -34,7 +34,7 @@ If you have a GEO accession (found in the data availability section of published This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. -**Synapse Sample Identifiers** +### Synapse Sample Identifiers [Synapse](https://www.synapse.org/#) is a collaborative research platform created by [Sage Bionetworks](https://sagebionetworks.org/). Its aim is to promote reproducible research and responsible data sharing throughout the biomedical community. 
To download data from `Synapse`, the SynapseID of the _directory_ containing all files to be downloaded should be provided. The SynapseID should be an eleven-character identifier, beginning with `syn`. @@ -53,7 +53,7 @@ The final sample information for the FastQ files used for samplesheet generation -**Samplesheet Generation** +### Samplesheet Generation As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`. From 89ae0b7600cd46c53a29af0248867793ba46f09a Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:27:11 -0700 Subject: [PATCH 037/106] Whitespace whitespace --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 312db468..209a239b 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Via a single file of ids, provided one-per-line (see [example input file](https: 4. Collate paths to FastQ files in a single samplesheet ### Samplesheet Generation + The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. 
## Quick Start From 3909540c682a1170d88d8e7777130e5a769f50b5 Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 19 Oct 2021 08:44:08 -0700 Subject: [PATCH 038/106] Remove md5sum check --- lib/WorkflowSynapse.groovy | 1 + modules/local/synapse_get.nf | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/WorkflowSynapse.groovy b/lib/WorkflowSynapse.groovy index 557d58bf..8a79c6a6 100755 --- a/lib/WorkflowSynapse.groovy +++ b/lib/WorkflowSynapse.groovy @@ -20,6 +20,7 @@ class WorkflowSynapse { meta["${category}|${entries[0]}"] = entries[1] } meta.id = meta['properties|id'] + meta.name = meta['properties|name'] meta.md5 = meta['File|md5'] return meta.findAll{ it.value != null } } diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 1e959296..99985f48 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -36,8 +36,7 @@ process SYNAPSE_GET { $options.args \\ $meta.id - find ./ -type f -name "*.fastq.gz" -exec echo "${meta.md5} " {} \\; > ${meta.id}.md5 - md5sum -c ${meta.id}.md5 + echo "${meta.md5} \t ${meta.name}" > ${meta.id}.md5 cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: From fad83a8ba5d3ed75c334a6d09277fbc23020609e Mon Sep 17 00:00:00 2001 From: Daisy Wenyan Han <60151111+daisyhan97@users.noreply.github.com> Date: Tue, 19 Oct 2021 11:09:21 -0500 Subject: [PATCH 039/106] Syntax Edits Co-authored-by: Harshil Patel --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 209a239b..9bf7db8f 100644 --- a/README.md +++ b/README.md @@ -26,21 +26,21 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)) the pipeline performs the following steps: -### SRA / ENA / DDBJ / GEO Identifiers +### SRA / ENA / DDBJ / GEO ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata including direct download links to FastQ files via ENA API 3. Download FastQ files in parallel via `curl` and perform `md5sum` check 4. Collate id metadata and paths to FastQ files in a single samplesheet -### Synapse Identifiers +### Synapse ids -1. Resolve Synapse directory SynapseIDs to the SynapseIDs of the FastQ files contained, using the `synapse list` command. -2. Retrieve FastQ file metadata including FastQ file names, md5sums, etags, annotations, and data provenance via the `synapse show` command. -3. Download FastQ files in parallel via `synapse get` and perform `md5sum` check +1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command. +2. Retrieve FastQ file metadata including FastQ file names, md5sums, etags, annotations and other data provenance via the `synapse show` command. +3. Download FastQ files in parallel via `synapse get` 4. 
Collate paths to FastQ files in a single samplesheet -### Samplesheet Generation +### Samplesheet format The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. From bf020ac1fc4528267e13b9de2f29b7e38893a281 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 19 Oct 2021 17:21:20 +0100 Subject: [PATCH 040/106] Update output.md --- docs/output.md | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/docs/output.md b/docs/output.md index acc1d956..575bd84f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,14 +4,16 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -## SRA / ENA / DDBJ / GEO - Pipeline Overview +## Pipeline Overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using one of the following workflows: -* [FastQ download](#fastq-download) - Download FastQ files via SRA / ENA / DDBJ / GEO ids +* Download FastQ files and create samplesheet from: + 1. [SRA / ENA / DDBJ / GEO ids](#sra-ena-ddbj-geo-ids) + 2. [Synapse ids](#synapse-ids) * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQ download +### SRA / ENA / DDBJ / GEO ids
Output files

@@ -32,25 +34,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump` and `prefetch` require pre-existing configuration files in the user's home directory, which makes automation tricky across different platforms and containerisation.

-### Pipeline information
-
-<details markdown="1">
-Output files - -* `pipeline_info/` - * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.tsv`. - -
- -## Synapse - Pipeline Overview - -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - -* [FastQ download](#fastq-Download) - Download FastQ files via Synapse ids -* [Pipeline information](#pipeline-Information) - Report metrics generated during the workflow execution - -### FastQ Download +### Synapse ids
Output files @@ -62,8 +46,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d * `samplesheet/` * `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. * `metadata/` - * `*.metadata.txt`: Original metadata file, generated using the `synapse show` command. - * `*.list.txt`: Original output of the `synapse list` command, containing the SynapseIDs, file version numbers, file names, and other file-specific data of the Synapse directory ID provided. + * `*.metadata.txt`: Original metadata file generated using the `synapse show` command. + * `*.list.txt`: Original output of the `synapse list` command, containing the Synapse ids, file version numbers, file names, and other file-specific data for the Synapse directory ID provided.
From 522cab37d805fa7afc08ae75071a6b9cf0277e89 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 19 Oct 2021 17:25:36 +0100 Subject: [PATCH 041/106] Update output.md --- docs/output.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/output.md b/docs/output.md index 575bd84f..510edaf0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,15 +4,17 @@ This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -## Pipeline Overview +## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using one of the following workflows: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: * Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ / GEO ids](#sra-ena-ddbj-geo-ids) + 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) 2. [Synapse ids](#synapse-ids) * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. + ### SRA / ENA / DDBJ / GEO ids
@@ -32,7 +34,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. ### Synapse ids @@ -51,7 +53,7 @@ Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introductio
-Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for detailed instructions on how to correctly provide Synapse IDs to the pipeline. FastQ files and corresponding sample information for `Synapse` identifiers are downloaded in parallel directly from the [Synapse](https://www.synapse.org/#) platform. A [configuration file](http://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig) containing valid login credentials is required for Synapse downloads. +FastQ files and corresponding sample information for `Synapse` identifiers are downloaded in parallel directly from the [Synapse](https://www.synapse.org/#) platform. A [configuration file](http://python-docs.synapse.org/build/html/Credentials.html#use-synapseconfig) containing valid login credentials is required for Synapse downloads. The final sample information for the FastQ files downloaded from `Synapse` is obtained from the file name itself. The file names are parsed according to the glob pattern `*{1,2}*`. This returns the sample name, presumed to be the longest possible string matching the glob pattern, with the fewest number of wildcard insertions. Further information on sample name parsing can be found in the [usage documentation](https://nf-co.re/fetchngs/usage#introduction). From 3b52a76f4bf343582c22d833f6017be1cf21614a Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 19 Oct 2021 17:32:29 +0100 Subject: [PATCH 042/106] Update usage.md --- docs/usage.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 0b2bfa44..2301a040 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,7 +18,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ | SRA1068758 | ERA2420837 | DRA008156 | | | | PRJNA625551 | PRJEB37513 | PRJDB4176 | | | -### SRR / ERR / DRR Sample Identifiers +### SRR / ERR / DRR Sample ids If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. @@ -34,11 +34,11 @@ If you have a GEO accession (found in the data availability section of published This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. -### Synapse Sample Identifiers +### Synapse ids -[Synapse](https://www.synapse.org/#) is a collaborative research platform created by [Sage Bionetworks](https://sagebionetworks.org/). Its aim is to promote reproducible research and responsible data sharing throughout the biomedical community. To download data from `Synapse`, the SynapseID of the _directory_ containing all files to be downloaded should be provided. The SynapseID should be an eleven-character identifier, beginning with `syn`. +[Synapse](https://www.synapse.org/#) is a collaborative research platform created by [Sage Bionetworks](https://sagebionetworks.org/). Its aim is to promote reproducible research and responsible data sharing throughout the biomedical community. To download data from `Synapse`, the Synapse id of the _directory_ containing all files to be downloaded should be provided. The Synapse id should be an eleven-characters beginning with `syn`. -This SynapseID will then be resolved to the SynapseIDs of the corresponding FastQ files contained within the directory. 
The individual FastQ files are then downloaded in parellel using the `synapse get` command. All Synapse metadata, annotations and provenance are also downloaded using the `synapse show` command, and are outputted to a separate metadata file. By default, only the md5sums, file sizes, etags, Synapse IDs, file names, and file versions are shown.
+This Synapse id will then be resolved to the Synapse id of the corresponding FastQ files contained within the directory. The individual FastQ files are then downloaded in parallel using the `synapse get` command. All Synapse metadata, annotations and data provenance are also downloaded using the `synapse show` command, and are written to a separate metadata file. By default, only the md5sums, file sizes, etags, Synapse ids, file names, and file versions are shown.

In order to download data from Synapse, an account must be created and a user configuration file provided via the parameter `--synapse_config`. For more information about Synapse configuration, please see the [Synapse client configuration](https://help.synapse.org/docs/Client-Configuration.1985446156.html) documentation.

@@ -53,7 +53,7 @@ The final sample information for the FastQ files used for samplesheet generation

 </details>

-### Samplesheet Generation
+### Samplesheet format

As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines; these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`.

From 8d4f321ffccce6593c4628a8f383f8bbafced0d3 Mon Sep 17 00:00:00 2001
From: Harshil Patel
Date: Tue, 19 Oct 2021 17:32:53 +0100
Subject: [PATCH 043/106] Update docs/output.md

---
 docs/output.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/output.md b/docs/output.md
index 510edaf0..c292f563 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -57,7 +57,7 @@ FastQ files and corresponding sample information for `Synapse` identifiers are d

The final sample information for the FastQ files downloaded from `Synapse` is obtained from the file name itself. The file names are parsed according to the glob pattern `*{1,2}*`. This returns the sample name, presumed to be the longest possible string matching the glob pattern, with the fewest number of wildcard insertions. Further information on sample name parsing can be found in the [usage documentation](https://nf-co.re/fetchngs/usage#introduction).

-### Pipeline Information
+### Pipeline information

<details markdown="1">
Output files From aa54f0917f49971a44e99a7d057a795bc661ff82 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 19 Oct 2021 17:36:07 +0100 Subject: [PATCH 044/106] Update docs/usage.md --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 2301a040..5040552b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -18,7 +18,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ | SRA1068758 | ERA2420837 | DRA008156 | | | | PRJNA625551 | PRJEB37513 | PRJDB4176 | | | -### SRR / ERR / DRR Sample ids +### SRR / ERR / DRR ids If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. From e9303e6d4c6bdf3ff5a1b48a6bb73cec76feb61e Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 19 Oct 2021 17:37:10 +0100 Subject: [PATCH 045/106] Update output.md --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index c292f563..df7fec02 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,8 +9,8 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: * Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) - 2. [Synapse ids](#synapse-ids) + 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) + 2. [Synapse ids](#synapse-ids) * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. 
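For quick local validation of an `--input` file, the accepted accession formats can be checked with `grep`. A sketch using example ids quoted elsewhere in this document (`syn12345678` is made up; the pattern mirrors the id regex in `assets/schema_input.json`):

```bash
# Extended-regex version of the pipeline's id pattern.
pattern='^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))([0-9]+)$'
printf '%s\n' SRR390278 ERX629702 DRP004793 GSE18729 syn12345678 not_an_id |
    grep -E "$pattern"
# Every line except "not_an_id" is printed.
```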
From 349224fb3fe2ac6fa7b264cf5f51044d69bf7321 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 20 Oct 2021 09:38:52 +0100 Subject: [PATCH 046/106] Update regex for DDBJ and synapse ids --- assets/schema_input.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 224e5cf7..00c0d47f 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,7 +8,7 @@ "type": "array", "items": { "type": "string", - "pattern": "^[SEPG][RAS][RXSMPAJXE][EN]?[AB]?\\d{4,9}$", + "pattern": "^[SEPGDs][RASy][RXSMPAJXEn][END]?[AB]?\\d{4,9}$", "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" } } From 756b598cd9800bc6c87f52cea06aaa50974420ad Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 20 Oct 2021 09:39:12 +0100 Subject: [PATCH 047/106] Add functions to check if SRA or Synapse ids have been provided --- lib/WorkflowMain.groovy | 60 ++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index f1804f51..6f1fcb2d 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -78,15 +78,55 @@ class WorkflowMain { } } - // Check input type - public static String getIdentifierType(workflow, params, log) { - def input_type = "" - params.input.eachLine { line -> - if (line.contains("syn")) { - input_type = "Synapse" - } else { - input_type = "SRA" - }} - return input_type + // Check if input ids are from the SRA + public static Boolean isSraId(input, log) { + def is_sra = false + def total_ids = 0 + def no_match_ids = [] + def pattern = /^[SEPGD][RAS][RXSMPAJXE][END]?[AB]?\d{4,9}$/ + input.eachLine { line -> + total_ids += 1 + def matcher = line =~ pattern + if (!matcher) { + no_match_ids << line + } + } + + def num_match = total_ids - no_match_ids.size() + if (num_match > 0) { + if (num_match == total_ids) { + is_sra = true + } else { + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + System.exit(1) + } + } + return is_sra + } + + // Check if input ids are from the Synapse platform + public static Boolean isSynapseId(input, log) { + def is_synapse = false + def total_ids = 0 + def no_match_ids = [] + def pattern = /^syn\d{8}$/ + input.eachLine { line -> + total_ids += 1 + def matcher = line =~ pattern + if (!matcher) { + no_match_ids << line + } + } + + def num_match = total_ids - no_match_ids.size() + if (num_match > 0) { + if (num_match == total_ids) { + is_synapse = true + } else { + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" 
+ System.exit(1) + } + } + return is_synapse } } From 68385ddcf173b22b2479c4e8948c87c9b4af65ae Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 20 Oct 2021 09:39:29 +0100 Subject: [PATCH 048/106] Auto-detect id type and invoke relevant workflow --- main.nf | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 0b76d9da..abb0d5fb 100644 --- a/main.nf +++ b/main.nf @@ -18,7 +18,18 @@ nextflow.enable.dsl = 2 */ WorkflowMain.initialise(workflow, params, log) -input_type = WorkflowMain.getIdentifierType(workflow, params, log) + +// Check if --input file is empty +ch_input = file(params.input, checkIfExists: true) +if (ch_input.isEmpty()) {exit 1, "File provided with --input is empty: ${ch_input.getName()}!"} + +// Read in ids from --input file +Channel + .from(file(params.input, checkIfExists: true)) + .splitCsv(header:false, sep:'', strip:true) + .map { it[0] } + .unique() + .set { ch_ids } /* ======================================================================================== @@ -26,10 +37,15 @@ input_type = WorkflowMain.getIdentifierType(workflow, params, log) ======================================================================================== */ -if (input_type == 'Synapse') { +def id_type = '' +if (WorkflowMain.isSraId(ch_input, log)) { + id_type = 'sra' + include { SRA } from './workflows/sra' +} else if (WorkflowMain.isSynapseId(ch_input, log)) { + id_type = 'synapse' include { SYNAPSE } from './workflows/synapse' } else { - include { SRA } from './workflows/sra' + exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ / GEO or Synapse ids!' } // @@ -37,14 +53,18 @@ if (input_type == 'Synapse') { // workflow NFCORE_FETCHNGS { - // Workflow for SynapseIDs - if (input_type == 'Synapse') { - SYNAPSE () - } else { - // Workflow for SRA/ENA/GEO IDs - SRA () - } + // + // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ / GEO ids + // + if (id_type == 'sra') { + SRA ( ch_ids ) + // + // WORKFLOW: Download FastQ files for Synapse ids + // + } else if (id_type == 'synapse') { + SYNAPSE ( ch_ids ) + } } /* From fe3c09742de5758fbf645d88ec096810c83f9798 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 20 Oct 2021 09:39:54 +0100 Subject: [PATCH 049/106] Pass input ids as a channel directly to workflows --- workflows/sra.nf | 14 +++++--------- workflows/synapse.nf | 14 +++++--------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index 835bd874..dd8e13df 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -13,14 +13,6 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters WorkflowSra.initialise(params, log, valid_params) -// Read in ids from --input file -Channel - .from(file(params.input, checkIfExists: true)) - .splitCsv(header:false, sep:'', strip:true) - .map { it[0] } - .unique() - .set { ch_ids } - /* ======================================================================================== IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -53,13 +45,17 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ workflow SRA { + take: + ids // channel: [ ids ] + + main: ch_versions = Channel.empty() // // MODULE: Get SRA run information for public database ids // SRA_IDS_TO_RUNINFO ( - ch_ids, + ids, params.ena_metadata_fields ?: '' ) ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) diff --git a/workflows/synapse.nf 
b/workflows/synapse.nf index 3d993789..b308372e 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -6,14 +6,6 @@ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) -// Read in ids from --input file -Channel - .from(file(params.input, checkIfExists: true)) - .splitCsv(header:false, sep:'', strip:true) - .map { it[0] } - .unique() - .set { ch_ids } - // Create channel for synapse config if (params.synapse_config) { ch_synapse_config = file(params.synapse_config, checkIfExists: true) @@ -52,13 +44,17 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/ workflow SYNAPSE { + take: + ids // channel: [ ids ] + + main: ch_versions = Channel.empty() // // MODULE: Expand synapse ids for individual FastQ files // SYNAPSE_LIST ( - ch_ids, + ids, ch_synapse_config ) ch_versions = ch_versions.mix(SYNAPSE_LIST.out.versions.first()) From 153d1ab92e342c75501ae04c5719af4e889c5b76 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 20 Oct 2021 09:41:33 +0100 Subject: [PATCH 050/106] Tweak comments --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index abb0d5fb..98b25cb1 100644 --- a/main.nf +++ b/main.nf @@ -37,6 +37,7 @@ Channel ======================================================================================== */ +// Auto-detect id type def id_type = '' if (WorkflowMain.isSraId(ch_input, log)) { id_type = 'sra' @@ -49,7 +50,7 @@ if (WorkflowMain.isSraId(ch_input, log)) { } // -// WORKFLOW: Run main nf-core/fetchngs analysis pipeline, depending on type of dentifier provided +// WORKFLOW: Run main nf-core/fetchngs analysis pipeline depending on type of identifier provided // workflow NFCORE_FETCHNGS { From b7a86fd8b066ecd2313a95e2fe76c44e3c014b14 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 18 Oct 2021 13:33:49 +0200 Subject: [PATCH 051/106] refactor: make curl retry with exponential back-off --- modules/local/sra_fastq_ftp.nf | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 39d3659d..713a9f81 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -30,7 +30,10 @@ process SRA_FASTQ_FTP { script: if (meta.single_end) { """ - bash -c 'until curl $options.args -L ${fastq[0]} -o ${meta.id}.fastq.gz; do sleep 1; done'; + curl $options.args \\ + --retry 5 \\ + -L ${fastq[0]} \\ + -o ${meta.id}.fastq.gz echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 md5sum -c ${meta.id}.fastq.gz.md5 @@ -42,12 +45,18 @@ process SRA_FASTQ_FTP { """ } else { """ - bash -c 'until curl $options.args -L ${fastq[0]} -o ${meta.id}_1.fastq.gz; do sleep 1; done'; + curl $options.args \\ + --retry 5 \\ + -L ${fastq[0]} \\ + -o ${meta.id}_1.fastq.gz echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 md5sum -c ${meta.id}_1.fastq.gz.md5 - bash -c 'until curl $options.args -L ${fastq[1]} -o ${meta.id}_2.fastq.gz; do sleep 1; done'; + curl $options.args \\ + --retry 5 \\ + -L ${fastq[1]} \\ + -o ${meta.id}_2.fastq.gz echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 md5sum -c ${meta.id}_2.fastq.gz.md5 From fc5200ceb5be32de249608e05210ea86b759736a Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 18 Oct 2021 15:27:50 +0200 Subject: [PATCH 052/106] refactor: report curl version instead of sed --- modules/local/sra_fastq_ftp.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 713a9f81..ce08d602 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -40,7 +40,7 @@ process SRA_FASTQ_FTP { cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: - sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + curl: \$(echo \$(curl --version | head -n 1 | sed 's/^curl //; s/ .*\$//')) END_VERSIONS """ } else { @@ -63,7 +63,7 @@ process SRA_FASTQ_FTP { cat <<-END_VERSIONS > versions.yml ${getProcessName(task.process)}: - sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + curl: \$(echo \$(curl --version | head -n 1 | sed 's/^curl //; s/ .*\$//')) END_VERSIONS """ } From 1336b06ce12aad19a50535fe51fa0779dddd5bb2 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Fri, 22 Oct 2021 21:50:31 +0200 Subject: [PATCH 053/106] refactor: move retry to module options args --- conf/modules.config | 2 +- modules/local/sra_fastq_ftp.nf | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0d44e9ee..5824a876 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -33,7 +33,7 @@ params { 'sra_fastq_ftp' { publish_dir = 'fastq' publish_files = ['fastq.gz':'', 'md5':'md5'] - args = '-C - --max-time 1200' + args = '--retry 5 --continue-at - --max-time 1200' } 'sra_to_samplesheet' { publish_dir = 'samplesheet' diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index ce08d602..8d5adc2f 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -31,7 +31,6 @@ process SRA_FASTQ_FTP { if (meta.single_end) { """ curl $options.args \\ - --retry 5 \\ -L ${fastq[0]} \\ -o ${meta.id}.fastq.gz @@ -46,7 +45,6 @@ process SRA_FASTQ_FTP { } else { """ curl $options.args \\ - --retry 5 \\ -L ${fastq[0]} \\ -o ${meta.id}_1.fastq.gz @@ -54,7 +52,6 @@ process SRA_FASTQ_FTP { md5sum -c ${meta.id}_1.fastq.gz.md5 curl $options.args \\ - --retry 5 \\ -L ${fastq[1]} \\ -o ${meta.id}_2.fastq.gz From 1acd7ea2f15e91da33c56195204758ca44866b9f Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 24 Oct 2021 01:41:42 +0100 Subject: [PATCH 054/106] Apply suggestions from code review --- modules/local/sra_fastq_ftp.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 8d5adc2f..cccdd8ac 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -30,7 +30,8 @@ process SRA_FASTQ_FTP { script: if (meta.single_end) { """ - curl $options.args \\ + curl \\ + $options.args \\ -L ${fastq[0]} \\ -o ${meta.id}.fastq.gz @@ -44,14 +45,16 @@ process SRA_FASTQ_FTP { """ } else { """ - curl $options.args \\ + curl \\ + $options.args \\ -L ${fastq[0]} \\ -o ${meta.id}_1.fastq.gz echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 md5sum -c ${meta.id}_1.fastq.gz.md5 - curl $options.args \\ + curl \\ + $options.args \\ -L ${fastq[1]} \\ -o ${meta.id}_2.fastq.gz From e4e1d4f6d4910083e684ab07680547d4b7950c2c Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 24 Oct 2021 01:48:25 +0100 Subject: [PATCH 055/106] Change contributors section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/README.md b/README.md index 9bf7db8f..43dcf30d 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage ## Credits -nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute, London](https://www.crick.ac.uk/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). +nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). ## Contributions and Support From 1a01b8ccd846e6e010920b46f54032637dcc97ff Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 24 Oct 2021 01:50:44 +0100 Subject: [PATCH 056/106] Simplify matcher logic --- lib/WorkflowMain.groovy | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 6f1fcb2d..d5814c09 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -86,8 +86,7 @@ class WorkflowMain { def pattern = /^[SEPGD][RAS][RXSMPAJXE][END]?[AB]?\d{4,9}$/ input.eachLine { line -> total_ids += 1 - def matcher = line =~ pattern - if (!matcher) { + if (!(line =~ pattern)) { no_match_ids << line } } @@ -112,8 +111,7 @@ class WorkflowMain { def pattern = /^syn\d{8}$/ input.eachLine { line -> total_ids += 1 - def matcher = line =~ pattern - if (!matcher) { + if (!(line =~ pattern)) { no_match_ids << line } } From c133d36a27c7bbe697da50fa56fb0ad6f9cf27db Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Sun, 24 Oct 2021 01:59:32 +0100 Subject: [PATCH 057/106] Use better and more concise regex for ids --- assets/schema_input.json | 2 +- lib/WorkflowMain.groovy | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 00c0d47f..25006d9b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,7 +8,7 @@ "type": "array", "items": { "type": "string", - "pattern": "^[SEPGDs][RASy][RXSMPAJXEn][END]?[AB]?\\d{4,9}$", + "pattern":"^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index d5814c09..09264303 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -83,7 +83,7 @@ class WorkflowMain { def is_sra = false def total_ids = 0 def no_match_ids = [] - def pattern = /^[SEPGD][RAS][RXSMPAJXE][END]?[AB]?\d{4,9}$/ + def pattern = /^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM]))(\d+)$/ input.eachLine { line -> total_ids += 1 if (!(line =~ pattern)) { From b8398ef6983a8ccd4cc09c79afb8561c6eea869b Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 16 Oct 2021 14:19:59 +0200 Subject: [PATCH 058/106] chore: install prefetch module --- modules.json | 3 + .../modules/sratools/prefetch/functions.nf | 78 +++++++++++++++++++ .../nf-core/modules/sratools/prefetch/main.nf | 50 ++++++++++++ .../modules/sratools/prefetch/meta.yml | 43 ++++++++++ 4 files changed, 174 insertions(+) create mode 100644 modules/nf-core/modules/sratools/prefetch/functions.nf create mode 100644 modules/nf-core/modules/sratools/prefetch/main.nf create mode 100644 modules/nf-core/modules/sratools/prefetch/meta.yml diff --git a/modules.json b/modules.json index ff841d00..38d23a23 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,9 @@ "nf-core/modules": { "custom/dumpsoftwareversions": { "git_sha": "84f2302920078b0cf7716b2a2e5fcc0be5c4531d" + }, + "sratools/prefetch": { + "git_sha": "07c0830057cc655de113d84499c7c1499460bb55" } } } diff --git a/modules/nf-core/modules/sratools/prefetch/functions.nf b/modules/nf-core/modules/sratools/prefetch/functions.nf new file mode 100644 index 00000000..85628ee0 --- /dev/null +++ b/modules/nf-core/modules/sratools/prefetch/functions.nf @@ -0,0 +1,78 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Extract name of module from process name using $task.process +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? 
path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } +} diff --git a/modules/nf-core/modules/sratools/prefetch/main.nf b/modules/nf-core/modules/sratools/prefetch/main.nf new file mode 100644 index 00000000..207d1e10 --- /dev/null +++ b/modules/nf-core/modules/sratools/prefetch/main.nf @@ -0,0 +1,50 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SRATOOLS_PREFETCH { + tag "$id" + label 'process_low' + label 'error_retry' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 'bioconda::sra-tools=2.11.0' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5262h314213e_0' + } else { + container 'quay.io/biocontainers/sra-tools:2.11.0--pl5262h314213e_0' + } + + input: + tuple val(meta), val(id) + + output: + tuple val(meta), path("$id"), emit: sra + path "versions.yml" , emit: versions + + script: + def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" + """ + eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" + if [[ ! -f "\${NCBI_SETTINGS}" ]]; then + mkdir -p "\$(dirname "\${NCBI_SETTINGS}")" + printf '${config}' > "\${NCBI_SETTINGS}" + fi + + prefetch \\ + $options.args \\ + --progress \\ + $id + + vdb-validate $id + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$(prefetch --version 2>&1 | grep -Eo '[0-9.]+') + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/sratools/prefetch/meta.yml b/modules/nf-core/modules/sratools/prefetch/meta.yml new file mode 100644 index 00000000..ab0a5ce5 --- /dev/null +++ b/modules/nf-core/modules/sratools/prefetch/meta.yml @@ -0,0 +1,43 @@ +name: sratools_prefetch +description: Download sequencing data from the NCBI Sequence Read Archive (SRA). +keywords: + - sequencing + - fastq + - prefetch +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ['Public Domain'] + +input: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - id: + type: val + description: > + A string denoting an SRA id. + +output: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sra: + type: directory + description: > + Directory containing the ETL data for the given SRA id. + pattern: "*/*.sra" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@Midnighter" From 8f665946b54633e4fc3024306152444c552e7954 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Sat, 16 Oct 2021 14:23:14 +0200 Subject: [PATCH 059/106] chore: install fasterqdump module --- conf/modules.config | 3 + modules.json | 3 + .../modules/sratools/fasterqdump/functions.nf | 78 +++++++++++++++++++ .../modules/sratools/fasterqdump/main.nf | 58 ++++++++++++++ .../modules/sratools/fasterqdump/meta.yml | 42 ++++++++++ 5 files changed, 184 insertions(+) create mode 100644 modules/nf-core/modules/sratools/fasterqdump/functions.nf create mode 100644 modules/nf-core/modules/sratools/fasterqdump/main.nf create mode 100644 modules/nf-core/modules/sratools/fasterqdump/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 5824a876..434a5c59 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,6 +35,9 @@ params { publish_files = ['fastq.gz':'', 'md5':'md5'] args = '--retry 5 --continue-at - --max-time 1200' } + 'sratools/fasterqdump' { + publish_dir = 'fastq' + } 'sra_to_samplesheet' { publish_dir = 'samplesheet' publish_files = false diff --git a/modules.json b/modules.json index 38d23a23..f004808d 100644 --- a/modules.json +++ b/modules.json @@ -6,6 +6,9 @@ "custom/dumpsoftwareversions": { "git_sha": "84f2302920078b0cf7716b2a2e5fcc0be5c4531d" }, + "sratools/fasterqdump": { + "git_sha": "de997825de788fe2210db16d9426f10342a1ba1d" + }, "sratools/prefetch": { "git_sha": "07c0830057cc655de113d84499c7c1499460bb55" } diff --git a/modules/nf-core/modules/sratools/fasterqdump/functions.nf b/modules/nf-core/modules/sratools/fasterqdump/functions.nf new file mode 100644 index 00000000..85628ee0 --- /dev/null +++ b/modules/nf-core/modules/sratools/fasterqdump/functions.nf @@ -0,0 +1,78 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Extract name of module from process name using $task.process +// +def getProcessName(task_process) { + return task_process.tokenize(':')[-1] +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + + // Do not publish versions.yml unless running from pytest workflow + if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { + return null + } + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? 
"${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } +} diff --git a/modules/nf-core/modules/sratools/fasterqdump/main.nf b/modules/nf-core/modules/sratools/fasterqdump/main.nf new file mode 100644 index 00000000..08ef9045 --- /dev/null +++ b/modules/nf-core/modules/sratools/fasterqdump/main.nf @@ -0,0 +1,58 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SRATOOLS_FASTERQDUMP { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 'bioconda::sra-tools=2.11.0 conda-forge::pigz=2.6' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' + } else { + container 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' + } + + input: + tuple val(meta), path(sra) + + output: + tuple val(meta), path(output), emit: reads + path "versions.yml" , emit: versions + + script: + def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" + // Paired-end data extracted by fasterq-dump (--split-3 the default) always creates + // *_1.fastq *_2.fastq files but sometimes also an additional *.fastq file + // for unpaired reads which we ignore here. + output = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' + """ + eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" + if [[ ! -f "\${NCBI_SETTINGS}" ]]; then + mkdir -p "\$(dirname "\${NCBI_SETTINGS}")" + printf '${config}' > "\${NCBI_SETTINGS}" + fi + + fasterq-dump \\ + ${options.args} \\ + --threads $task.cpus \\ + ${sra.name} + + pigz \\ + ${options.args2} \\ + --no-name \\ + --processes $task.cpus \\ + *.fastq + + cat <<-END_VERSIONS > versions.yml + ${getProcessName(task.process)}: + ${getSoftwareName(task.process)}: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/sratools/fasterqdump/meta.yml b/modules/nf-core/modules/sratools/fasterqdump/meta.yml new file mode 100644 index 00000000..ac61e71f --- /dev/null +++ b/modules/nf-core/modules/sratools/fasterqdump/meta.yml @@ -0,0 +1,42 @@ +name: sratools_fasterqdump +description: Extract sequencing reads in FASTQ format from a given NCBI Sequence Read Archive (SRA). 
+keywords: + - sequencing + - FASTQ + - dump +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ['Public Domain'] + +input: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - sra: + type: directory + description: Directory containing ETL data for the given SRA. + pattern: "*/*.sra" + +output: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. + pattern: "*.fastq.gz" + +authors: + - "@Midnighter" From d999b7f7487b82d0c06fe6587c8d0b8aaaaa2818 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 16 Oct 2021 14:44:26 +0200 Subject: [PATCH 060/106] feat: add sra-tools subworkflow --- conf/modules.config | 3 ++ subworkflows/local/sra_fastq/main.nf | 34 ++++++++++++++++++ subworkflows/local/sra_fastq/meta.yml | 36 ++++++++++++++++++++ subworkflows/local/sra_fastq/nextflow.config | 2 ++ workflows/sra.nf | 18 +++++----- 5 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 subworkflows/local/sra_fastq/main.nf create mode 100644 subworkflows/local/sra_fastq/meta.yml create mode 100644 subworkflows/local/sra_fastq/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index 434a5c59..e253262e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,6 +35,9 @@ params { publish_files = ['fastq.gz':'', 'md5':'md5'] args = '--retry 5 --continue-at - --max-time 1200' } + 'sratools/prefetch' { + publish_dir = 'sra' + } 'sratools/fasterqdump' { publish_dir = 'fastq' } diff --git a/subworkflows/local/sra_fastq/main.nf b/subworkflows/local/sra_fastq/main.nf new file mode 100644 index 00000000..c6590b59 --- /dev/null +++ b/subworkflows/local/sra_fastq/main.nf @@ -0,0 +1,34 @@ +// +// Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +// + +params.prefetch_options = [:] +params.fasterqdump_options = [:] + +include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/modules/sratools/prefetch/main' addParams( options: params.prefetch_options ) +include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/modules/sratools/fasterqdump/main' addParams( options: params.fasterqdump_options ) + +workflow SRA_FASTQ { + take: + sra_ids // channel: [ val(meta), val(id) ] + + main: + + ch_versions = Channel.empty() + + // + // Prefetch sequencing reads in SRA format. + // + SRATOOLS_PREFETCH ( sra_ids ) + ch_versions = ch_versions.mix( SRATOOLS_PREFETCH.out.versions.first() ) + + // + // Convert the SRA format into one or more compressed FASTQ files. 
+ // + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra ) + ch_versions = ch_versions.mix( SRATOOLS_FASTERQDUMP.out.versions.first() ) + + emit: + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/sra_fastq/meta.yml b/subworkflows/local/sra_fastq/meta.yml new file mode 100644 index 00000000..146176ee --- /dev/null +++ b/subworkflows/local/sra_fastq/meta.yml @@ -0,0 +1,36 @@ +name: sra_fastq +description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +keywords: + - sequencing + - FASTQ + - prefetch + - dump +modules: + - sratools/prefetch + - sratools/fasterqdump +input: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - id: + type: string + description: > + SRA identifier. +output: + - meta: + type: map + description: > + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. + pattern: "*.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - '@Midnighter' diff --git a/subworkflows/local/sra_fastq/nextflow.config b/subworkflows/local/sra_fastq/nextflow.config new file mode 100644 index 00000000..07448834 --- /dev/null +++ b/subworkflows/local/sra_fastq/nextflow.config @@ -0,0 +1,2 @@ +params.prefetch_options = [:] +params.fasterqdump_options = [:] diff --git a/workflows/sra.nf b/workflows/sra.nf index dd8e13df..efc08482 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -25,6 +25,7 @@ def modules = params.modules.clone() include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] ) include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] ) include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] ) +include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' addParams( prefetch_options: modules['sratools/prefetch'], fasterqdump_options: modules['sratools/fasterqdump'] ) include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: modules['multiqc_mappings_config'] ) @@ -90,11 +91,17 @@ workflow SRA { ) ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) + // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. 
+ SRA_FASTQ ( + ch_sra_reads.map { meta, reads -> if (!meta.fastq_1) [meta, meta.run_accession] } + ) + ch_versions = ch_versions.mix(SRA_FASTQ.out.versions.first()) + // // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet // SRA_TO_SAMPLESHEET ( - SRA_FASTQ_FTP.out.fastq, + SRA_FASTQ_FTP.out.fastq.mix(SRA_FASTQ.out.reads), params.nf_core_pipeline ?: '', params.sample_mapping_fields ) @@ -117,15 +124,6 @@ workflow SRA { ) ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) } - - // - // If ids don't have a direct FTP download link write them to file for download outside of the pipeline - // - def no_ids_file = ["${params.outdir}", "${modules['sra_fastq_ftp'].publish_dir}", "IDS_NOT_DOWNLOADED.txt" ].join(File.separator) - ch_sra_reads - .map { meta, reads -> if (!meta.fastq_1) "${meta.id.split('_')[0..-2].join('_')}" } - .unique() - .collectFile(name: no_ids_file, sort: true, newLine: true) } // From b37bf865e6f2a2d02d502403261eb3a20e50ee96 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Fri, 22 Oct 2021 22:03:28 +0200 Subject: [PATCH 061/106] refactor: rename module options keys --- conf/modules.config | 4 ++-- workflows/sra.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index e253262e..534b8249 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,10 +35,10 @@ params { publish_files = ['fastq.gz':'', 'md5':'md5'] args = '--retry 5 --continue-at - --max-time 1200' } - 'sratools/prefetch' { + 'sratools_prefetch' { publish_dir = 'sra' } - 'sratools/fasterqdump' { + 'sratools_fasterqdump' { publish_dir = 'fastq' } 'sra_to_samplesheet' { diff --git a/workflows/sra.nf b/workflows/sra.nf index efc08482..b3d357e4 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -25,7 +25,7 @@ def modules = params.modules.clone() include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] ) include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] ) include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] ) -include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' addParams( prefetch_options: modules['sratools/prefetch'], fasterqdump_options: modules['sratools/fasterqdump'] ) +include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' addParams( prefetch_options: modules['sratools_prefetch'], fasterqdump_options: modules['sratools_fasterqdump'] ) include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: modules['multiqc_mappings_config'] ) From 0d0ba2445ef918a5564f2f8b37fdf83b39cd62f6 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 25 Oct 2021 16:11:49 +0200 Subject: [PATCH 062/106] refactor: make script accept missing fastq_ftp --- bin/sra_runinfo_to_ftp.py | 236 +++++++++++++++++++++++++------------- 1 file changed, 154 insertions(+), 82 deletions(-) diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py index a6f02136..130007b6 100755 --- a/bin/sra_runinfo_to_ftp.py +++ b/bin/sra_runinfo_to_ftp.py @@ -1,107 +1,179 @@ #!/usr/bin/env python -import os -import sys -import errno + import argparse -import collections +import csv +import logging +import sys +from itertools import chain +from pathlib import Path + + +logger = logging.getLogger() + def parse_args(args=None): Description = "Create samplesheet with FTP download links and md5ums from sample information obtained via 'sra_ids_to_runinfo.py' script." - Epilog = 'Example usage: python sra_runinfo_to_ftp.py ' + Epilog = "Example usage: python sra_runinfo_to_ftp.py " parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument('FILES_IN', help="Comma-separated list of metadata file created from 'sra_ids_to_runinfo.py' script.") - parser.add_argument('FILE_OUT', help="Output file containing paths to download FastQ files along with their associated md5sums.") + parser.add_argument( + "files_in", + metavar="FILES_IN", + help="Comma-separated list of metadata file created from 'sra_ids_to_runinfo.py' script.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Output file containing paths to download FastQ files along with their associated md5sums.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) return parser.parse_args(args) -def make_dir(path): - if not len(path) == 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise + +def valid_fastq_extension(fastq): + return fastq.endswith("fastq.gz") + def parse_sra_runinfo(file_in): - runinfo_dict = {} - with open(file_in, "r") as fin: - header = fin.readline().strip().split('\t') - for line in fin: - line_dict = dict(zip(header,line.strip().split('\t'))) - line_dict = collections.OrderedDict(list(line_dict.items())) - run_id = line_dict['run_accession'] - exp_id = line_dict['experiment_accession'] - library = line_dict['library_layout'] - fastq_files = line_dict['fastq_ftp'] - fastq_md5 = line_dict['fastq_md5'] - - db_id = exp_id - sample_dict = collections.OrderedDict() - if library == 'SINGLE': - sample_dict = collections.OrderedDict([('fastq_1',''), ('fastq_2',''), ('md5_1',''), ('md5_2',''), ('single_end','true')]) - if fastq_files: - sample_dict['fastq_1'] = fastq_files - sample_dict['md5_1'] = fastq_md5 + runinfo = {} + columns = [ + "run_accession", + "experiment_accession", + "library_layout", + "fastq_ftp", + "fastq_md5", + ] + extensions = [ + "fastq_1", + "fastq_2", + "md5_1", + "md5_2", + "single_end", + ] + with open(file_in, "r", newline="") as fin: + reader = csv.DictReader(fin, delimiter="\t", skipinitialspace=True) + header = list(reader.fieldnames) + if missing := frozenset(columns).difference(frozenset(header)): + logger.critical( + f"The following expected columns are missing from {file_in}: " + f"{', '.join(missing)}." 
+ ) + sys.exit(1) + for row in reader: + db_id = row["experiment_accession"] + if row["fastq_ftp"]: + fq_files = row["fastq_ftp"].split(";")[-2:] + fq_md5 = row["fastq_md5"].split(";")[-2:] + if len(fq_files) == 1: + assert fq_files[0].endswith( + ".fastq.gz" + ), f"Unexpected FastQ file format {file_in.name}." + if row["library_layout"] != "SINGLE": + logger.warning( + f"The library layout '{row['library_layout']}' should be " + f"'SINGLE'." + ) + sample = { + "fastq_1": fq_files[0], + "fastq_2": None, + "md5_1": fq_md5[0], + "md5_2": None, + "single_end": "true", + } + elif len(fq_files) == 2: + assert fq_files[0].endswith( + "_1.fastq.gz" + ), f"Unexpected FastQ file format {file_in.name}." + assert fq_files[1].endswith( + "_2.fastq.gz" + ), f"Unexpected FastQ file format {file_in.name}." + if row["library_layout"] != "PAIRED": + logger.warning( + f"The library layout '{row['library_layout']}' should be " + f"'PAIRED'." + ) + sample = { + "fastq_1": fq_files[0], + "fastq_2": fq_files[1], + "md5_1": fq_md5[0], + "md5_2": fq_md5[1], + "single_end": "false", + } else: - ## In some instances FTP links don't exist for FastQ files - ## These have to be downloaded via fastq-dump / fasterq-dump / parallel-fastq-dump via the run id - db_id = run_id - - elif library == 'PAIRED': - sample_dict = collections.OrderedDict([('fastq_1',''), ('fastq_2',''), ('md5_1',''), ('md5_2',''), ('single_end','false')]) - if fastq_files: - fq_files = fastq_files.split(';')[-2:] - fq_md5 = fastq_md5.split(';')[-2:] - if len(fq_files) == 2: - if fq_files[0].find('_1.fastq.gz') != -1 and fq_files[1].find('_2.fastq.gz') != -1: - sample_dict['fastq_1'] = fq_files[0] - sample_dict['fastq_2'] = fq_files[1] - sample_dict['md5_1'] = fq_md5[0] - sample_dict['md5_2'] = fq_md5[1] - else: - print("Invalid FastQ files found for database id:'{}'!.".format(run_id)) - else: - print("Invalid number of FastQ files ({}) found for paired-end database id:'{}'!.".format(len(fq_files), run_id)) - else: - db_id = run_id + raise RuntimeError(f"Unexpected number of FastQ files: {fq_files}.") + else: + # In some instances, FTP links don't exist for FastQ files. + # These have to be downloaded with the run accession using sra-tools. 
+ db_id = row["run_accession"] + sample = dict.fromkeys(extensions, None) + if row["library_layout"] == "SINGLE": + sample["single_end"] = "true" + elif row["library_layout"] == "PAIRED": + sample["single_end"] = "false" - if sample_dict: - sample_dict.update(line_dict) - if db_id not in runinfo_dict: - runinfo_dict[db_id] = [sample_dict] + sample.update(row) + if db_id not in runinfo: + runinfo[db_id] = [sample] + else: + if sample in runinfo[db_id]: + logger.error( + f"Input run info file contains duplicate rows!\n" + f"{', '.join([row[col] for col in header])}" + ) else: - if sample_dict in runinfo_dict[db_id]: - print("Input run info file contains duplicate rows!\nLine: '{}'".format(line)) - else: - runinfo_dict[db_id].append(sample_dict) + runinfo[db_id].append(sample) + + return runinfo, header + extensions - return runinfo_dict def sra_runinfo_to_ftp(files_in, file_out): - samplesheet_dict = {} + samplesheet = {} + header = [] for file_in in files_in: - runinfo_dict = parse_sra_runinfo(file_in) - for db_id in runinfo_dict.keys(): - if db_id not in samplesheet_dict: - samplesheet_dict[db_id] = runinfo_dict[db_id] + runinfo, sample_header = parse_sra_runinfo(file_in) + header.append(sample_header) + for db_id, rows in runinfo.items(): + if db_id not in samplesheet: + samplesheet[db_id] = rows else: - print("Duplicate sample identifier found!\nID: '{}'".format(db_id)) - - ## Write samplesheet with paths to FastQ files and md5 sums - if samplesheet_dict: - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - with open(file_out, "w") as fout: - header = ['id'] + list(samplesheet_dict[list(samplesheet_dict.keys())[0]][0].keys()) - fout.write("\t".join(header) + "\n") - for db_id in sorted(samplesheet_dict.keys()): - for idx,val in enumerate(samplesheet_dict[db_id]): - fout.write('\t'.join(["{}_T{}".format(db_id,idx+1)] + [val[x] for x in header[1:]]) + '\n') + logger.warning(f"Duplicate sample identifier found!\nID: '{db_id}'") + + # Create a combined header from all input files. + combined_header = header[0] + list( + set().union(chain.from_iterable(header)).difference(header[0]) + ) + combined_header.insert(0, "id") + + # Write samplesheet with paths to FastQ files and md5 sums. + if samplesheet: + with file_out.open("w", newline="") as fout: + writer = csv.DictWriter(fout, fieldnames=combined_header, delimiter="\t") + writer.writeheader() + for db_id in sorted(samplesheet): + for idx, row in enumerate(samplesheet[db_id], start=1): + row["id"] = f"{db_id}_T{idx}" + writer.writerow(row) + def main(args=None): args = parse_args(args) - sra_runinfo_to_ftp([x.strip() for x in args.FILES_IN.split(',')], args.FILE_OUT) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + files = [Path(x.strip()) for x in args.files_in.split(",")] + for path in files: + if not path.is_file(): + logger.critical(f"The given input file {path} was not found!") + sys.exit(1) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + sra_runinfo_to_ftp(files, args.file_out) + -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) From 8b5847a6a1e2e85c760c2f32c4307b3a4316ef9e Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Mon, 25 Oct 2021 16:21:31 +0200 Subject: [PATCH 063/106] feat: include SRA subworkflow --- workflows/sra.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index b3d357e4..530bf91a 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -69,6 +69,8 @@ workflow SRA { ) ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) + + SRA_RUNINFO_TO_FTP .out .tsv @@ -79,6 +81,10 @@ workflow SRA { [ meta, [ meta.fastq_1, meta.fastq_2 ] ] } .unique() + .branch { + ftp: it[0].fastq_1 + raw: !it[0].fastq_1 + } .set { ch_sra_reads } ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) @@ -87,13 +93,13 @@ workflow SRA { // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums // SRA_FASTQ_FTP ( - ch_sra_reads.map { meta, reads -> if (meta.fastq_1) [ meta, reads ] } + ch_sra_reads.ftp ) ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. SRA_FASTQ ( - ch_sra_reads.map { meta, reads -> if (!meta.fastq_1) [meta, meta.run_accession] } + ch_sra_reads.raw.map { meta, reads -> [ meta, meta.run_accession ] } ) ch_versions = ch_versions.mix(SRA_FASTQ.out.versions.first()) From 8deb5f18e287e7f7082315b699462570166c4b96 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 25 Oct 2021 16:35:20 +0200 Subject: [PATCH 064/106] docs: add credits and usage --- README.md | 4 ++-- docs/usage.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 43dcf30d..b51867ed 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Via a single file of ids, provided one-per-line (see [example input file](https: 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata including direct download links to FastQ files via ENA API -3. Download FastQ files in parallel via `curl` and perform `md5sum` check +3. Download FastQ files in parallel via `curl` if available from FTP and perform `md5sum` check. Otherwise use sra-tools to download SRAs and convert them to FastQ. 4. Collate id metadata and paths to FastQ files in a single samplesheet ### Synapse ids @@ -72,7 +72,7 @@ The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage ## Credits -nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). +nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). Support for download of sequencing reads without FTP links via sra-tools was added by Moritz E. Beber ([@Midnighter](https://github.com/Midnighter)) from [Unseen Bio ApS, Denmark](https://unseenbio.com). 
## Contributions and Support diff --git a/docs/usage.md b/docs/usage.md index 5040552b..6af310a6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ The pipeline has been set-up to automatically download and process the raw FastQ If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. -The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. We may add this functionality in later releases. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). From 36753b3d91469c81926ed0cc5d89d84abc85e0bc Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Mon, 25 Oct 2021 16:37:03 +0200 Subject: [PATCH 065/106] chore: add changelog entry --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e9a0cb2..5c2178bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unpublished Version / DEV] +* Add support for run accessions not available on FTP via sra-tools. + ### Enhancements & fixes * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/).
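The usage documentation above points to the ENA `returnFields` endpoint for the full list of metadata fields accepted by `--ena_metadata_fields`. As a quick way to inspect that list outside of the pipeline, the sketch below fetches and prints the available field names using only the Python standard library. It is an illustration of the documented endpoint rather than pipeline code, and it assumes network access to the EBI servers and that the endpoint returns a TSV whose first column holds the field name.

```python
#!/usr/bin/env python
# Minimal sketch (not part of nf-core/fetchngs): list the ENA 'read_run'
# metadata fields that can be passed via --ena_metadata_fields, using the
# endpoint referenced in docs/usage.md.
import urllib.request

URL = (
    "https://www.ebi.ac.uk/ena/portal/api/returnFields"
    "?dataPortal=ena&format=tsv&result=read_run"
)

with urllib.request.urlopen(URL) as response:
    lines = response.read().decode().splitlines()

# Skip the TSV header row; the first column of each remaining row is the
# field name (assumed layout: columnId<TAB>description).
fields = [line.split("\t")[0] for line in lines[1:] if line.strip()]
print(f"{len(fields)} fields available, e.g. {', '.join(fields[:5])}")
```

Any comma-separated subset of these names can be supplied via `--ena_metadata_fields`, provided it includes the minimal set `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'` required to download FastQ files.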
From 408f1bc0add9ce4d8c0aa987cbe8f5a6adf3fdf1 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 16:49:30 +0100 Subject: [PATCH 066/106] Fix #46 --- CHANGELOG.md | 1 + bin/sra_ids_to_runinfo.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e9a0cb2..a03c0774 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). * Handle SRA identifiers that do **not** return metadata, for example, due to being private. +* [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). See examples below: | `DDBJ` | diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 0f81c125..bbd1e6f0 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -269,7 +269,7 @@ def _gse_to_srx(cls, identifier): gsm_ids = [ line.split("=")[1].strip() for line in response.text().splitlines() - if line.startswith("GSM") + if line.split('=')[1].strip().startswith('GSM') ] for gsm_id in gsm_ids: ids += cls._id_to_srx(gsm_id) From 0d82d986b699ce9c30b2c3e1b4407a53d58aa7c2 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 17:21:31 +0100 Subject: [PATCH 067/106] Change default process requirements for sra_fastq_ftp.nf --- modules/local/sra_fastq_ftp.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index cccdd8ac..23ea5dd5 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -6,7 +6,7 @@ options = initOptions(params.options) process SRA_FASTQ_FTP { tag "$meta.id" - label 'process_medium' + label 'process_low' label 'error_retry' publishDir "${params.outdir}", mode: params.publish_dir_mode, From 8c6231ca0a5364514e79673cb447f96320de39d8 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 22:58:47 +0100 Subject: [PATCH 068/106] Update CHANGELOG --- CHANGELOG.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49d2c5f1..0153327e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unpublished Version / DEV] - -* Add support for run accessions not available on FTP via sra-tools. +## [[1.4](https://github.com/nf-core/fetchngs/releases/tag/1.4)] - 2021-10-25 ### Enhancements & fixes * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). -* Handle SRA identifiers that do **not** return metadata, for example, due to being private. +* SRA identifiers not available for direct download via the ENA FTP will now be downloaded via sra-tools. +* Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. * [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). 
See examples below: From 5c910777374498f92fdaa5dd826893b804fe08db Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 22:59:02 +0100 Subject: [PATCH 069/106] Update listing in README --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b51867ed..ff3829ef 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,10 @@ Via a single file of ids, provided one-per-line (see [example input file](https: ### SRA / ENA / DDBJ / GEO ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) -2. Fetch extensive id metadata including direct download links to FastQ files via ENA API -3. Download FastQ files in parallel via `curl` if available from FTP and perform `md5sum` check. Otherwise use sra-tools to download SRAs and convert them to FastQ. +2. Fetch extensive id metadata via ENA API +3. Download FastQ files: + - If direct download links are available from the ENA API, fetch in parallel via `curl` and perform `md5sum` check + - Otherwise use `sra-tools` to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet ### Synapse ids From 97e8c32ca447a0be7e02cdcfb47f05f760dc6cc2 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 22:59:18 +0100 Subject: [PATCH 070/106] Don't publish .sra files by default --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 534b8249..2a2bcca5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,6 +37,7 @@ params { } 'sratools_prefetch' { publish_dir = 'sra' + publish_files = false } 'sratools_fasterqdump' { publish_dir = 'fastq' From 2fcce24cbbd49bc0f3ee335887627322c3bcff45 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 22:59:31 +0100 Subject: [PATCH 071/106] Bump pipeline version to 1.4 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 45dfb946..066fb0b3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -149,7 +149,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.04.0' - version = '1.4dev' + version = '1.4' } // Function to ensure that resource requirements don't go beyond From ef931d38e8fba8a8db26e0cdc621458cc30da2cb Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 23:10:11 +0100 Subject: [PATCH 072/106] Update output docs with sra-tools functionality --- docs/output.md | 2 +- workflows/sra.nf | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/output.md b/docs/output.md index df7fec02..6e6c610b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -34,7 +34,7 @@ Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introductio
-The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5sums. If download links exist, the files will be downloaded in parallel by FTP otherwise they will NOT be downloaded. This is intentional because the tools such as `parallel-fastq-dump`, `fasterq-dump`, `prefetch` etc require pre-existing configuration files in the users home directory which makes automation tricky across different platforms and containerisation. +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. ### Synapse ids diff --git a/workflows/sra.nf b/workflows/sra.nf index 530bf91a..4fb41ccd 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -69,8 +69,6 @@ workflow SRA { ) ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) - - SRA_RUNINFO_TO_FTP .out .tsv @@ -83,7 +81,7 @@ workflow SRA { .unique() .branch { ftp: it[0].fastq_1 - raw: !it[0].fastq_1 + sra: !it[0].fastq_1 } .set { ch_sra_reads } ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) @@ -99,7 +97,7 @@ workflow SRA { // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. SRA_FASTQ ( - ch_sra_reads.raw.map { meta, reads -> [ meta, meta.run_accession ] } + ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) ch_versions = ch_versions.mix(SRA_FASTQ.out.versions.first()) From 78c55d9f6a505771c50eb9d3c021379399c81681 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 23:34:27 +0100 Subject: [PATCH 073/106] Fix markdownlint --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ff3829ef..e21a94a9 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,8 @@ Via a single file of ids, provided one-per-line (see [example input file](https: 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API 3. Download FastQ files: - - If direct download links are available from the ENA API, fetch in parallel via `curl` and perform `md5sum` check - - Otherwise use `sra-tools` to download `.sra` files and convert them to FastQ + - If direct download links are available from the ENA API, fetch in parallel via `curl` and perform `md5sum` check + - Otherwise use `sra-tools` to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet ### Synapse ids From 767f02e6007372a5d2c05057b4051024aaa3ab29 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 25 Oct 2021 23:37:01 +0100 Subject: [PATCH 074/106] Replace asterisks with dashes for markdownlint --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e21a94a9..636ba12f 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,9 @@ The columns in the auto-created samplesheet can be tailored to be accepted out-o nextflow run nf-core/fetchngs -profile test, ``` - > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute.
If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > * If you are using `singularity` then the pipeline will auto-detect this and attempt to download the Singularity images directly as opposed to performing a conversion from Docker images. If you are persistently observing issues downloading Singularity images directly due to timeout or network issues then please use the `--singularity_pull_docker_container` parameter to pull and convert the Docker image instead. Alternatively, it is highly recommended to use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to pre-download all of the required containers before running the pipeline and to set the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options to be able to store and re-use the images from a central location for future pipeline runs. - > * If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. + > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + > - If you are using `singularity` then the pipeline will auto-detect this and attempt to download the Singularity images directly as opposed to performing a conversion from Docker images. If you are persistently observing issues downloading Singularity images directly due to timeout or network issues then please use the `--singularity_pull_docker_container` parameter to pull and convert the Docker image instead. Alternatively, it is highly recommended to use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to pre-download all of the required containers before running the pipeline and to set the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options to be able to store and re-use the images from a central location for future pipeline runs. + > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. 4. Start running your own analysis! From d739f3718be5f92e5f10e165c4fe75ee9ca9fea4 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 26 Oct 2021 11:26:40 +0200 Subject: [PATCH 075/106] feat: add bash script to retry prefetch --- bin/retry_with_backoff.sh | 100 ++++++++++++++++++ .../nf-core/modules/sratools/prefetch/main.nf | 2 +- 2 files changed, 101 insertions(+), 1 deletion(-) create mode 100755 bin/retry_with_backoff.sh diff --git a/bin/retry_with_backoff.sh b/bin/retry_with_backoff.sh new file mode 100755 index 00000000..4bdbf1e0 --- /dev/null +++ b/bin/retry_with_backoff.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash + +set -u + +retry_with_backoff() { + local max_attempts=${1} + local delay=${2} + local max_time=${3} + local attempt=1 + local output= + local status= + + shift 3 + + while [ ${attempt} -le ${max_attempts} ]; do + output=$("${@}") + status=${?} + + if [ ${status} -eq 0 ]; then + break + fi + + if [ ${attempt} -lt ${max_attempts} ]; then + echo "Failed attempt ${attempt} of ${max_attempts}. Retrying in ${delay} s." >&2 + sleep ${delay} + elif [ ${attempt} -eq ${max_attempts} ]; then + echo "Failed after ${attempt} attempts." >&2 + return ${status} + fi + + attempt=$(( ${attempt} + 1 )) + delay=$(( ${delay} * 2 )) + if [ ${delay} -ge ${max_time} ]; then + delay=${max_time} + fi + done + + echo "${output}" +} + +RETRY=5 +DELAY=1 +MAX_TIME=60 + +usage() { + echo "Usage:" >&2 + echo "$(basename ${0}) [-h] [-r NUM] [-d NUM] [-m NUM] COMMAND" >&2 + echo "Call the given command with retries and exponential backoff." >&2 + echo "" >&2 + echo " -r NUM Set the number of retry attempts (default ${RETRY})." >&2 + echo " -d NUM Set the base number of seconds to delay (default ${DELAY})." >&2 + echo " -m NUM Set the maximum delay in seconds (default ${MAX_TIME})." >&2 + echo "" >&2 +} + +check_numeric() { + local arg=${1} + if [[ ! ${arg} =~ ^[0-9]+$ ]]; then + echo "Illegal argument: ${arg}" >&2 + echo "Expected a number." >&2 + echo "" >&2 + usage + exit 2 + fi +} + +while getopts ":hr:d:m:" arg; do + case ${arg} in + h) + usage + exit 0 + ;; + r) + check_numeric ${OPTARG} + RETRY=${OPTARG} + ;; + d) + check_numeric ${OPTARG} + DELAY=${OPTARG} + ;; + m) + check_numeric ${OPTARG} + MAX_TIME=${OPTARG} + ;; + ?) + echo "Invalid option: -${OPTARG}" >&2 + echo "" >&2 + usage + exit 2 + ;; + :) + echo "Missing argument for: -${OPTARG}" >&2 + echo "" >&2 + usage + exit 2 + ;; + esac +done + +retry_with_backoff ${RETRY} ${DELAY} ${MAX_TIME} ${@:OPTIND} diff --git a/modules/nf-core/modules/sratools/prefetch/main.nf b/modules/nf-core/modules/sratools/prefetch/main.nf index 207d1e10..c57255f5 100644 --- a/modules/nf-core/modules/sratools/prefetch/main.nf +++ b/modules/nf-core/modules/sratools/prefetch/main.nf @@ -35,7 +35,7 @@ process SRATOOLS_PREFETCH { printf '${config}' > "\${NCBI_SETTINGS}" fi - prefetch \\ + retry_with_backoff.sh prefetch \\ $options.args \\ --progress \\ $id From 753e7e2d795a6d8f3c8736b2fe9490f9d85a2ffd Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Tue, 26 Oct 2021 11:32:20 +0200 Subject: [PATCH 076/106] chore: note change --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49d2c5f1..6e417235 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unpublished Version / DEV] * Add support for run accessions not available on FTP via sra-tools. +* Resume interrupted downloads with prefetch. ### Enhancements & fixes From c973452f56967e8061736935a03d3cf11b163cd9 Mon Sep 17 00:00:00 2001 From: "Moritz E. 
Beber" Date: Tue, 26 Oct 2021 11:37:43 +0200 Subject: [PATCH 077/106] chore: reword changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e417235..7dc6402a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unpublished Version / DEV] * Add support for run accessions not available on FTP via sra-tools. -* Resume interrupted downloads with prefetch. +* Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. ### Enhancements & fixes From 01cc0fbe6a588f5a55cc0afd09ee65b9f4ec8092 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 26 Oct 2021 10:50:08 +0100 Subject: [PATCH 078/106] Update test_full data and credits for synapse --- README.md | 2 +- conf/test_full.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 636ba12f..a5ffb8a6 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage ## Credits -nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). Support for download of sequencing reads without FTP links via sra-tools was added by Moritz E. Beber ([@Midnighter](https://github.com/Midnighter)) from [Unseen Bio ApS, Denmark](https://unseenbio.com). +nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). Support for download of sequencing reads without FTP links via sra-tools was added by Moritz E. Beber ([@Midnighter](https://github.com/Midnighter)) from [Unseen Bio ApS, Denmark](https://unseenbio.com). The Synapse workflow was added by Daisy Han [@daisyhan97](https://github.com/daisyhan97) and Bruno Grande [@BrunoGrandePhD](https://github.com/BrunoGrandePhD) from [Sage Bionetworks, Seattle](https://sagebionetworks.org/). 
## Contributions and Support diff --git a/conf/test_full.config b/conf/test_full.config index a5aea2dc..16398e23 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt' } From cf374eab1187b8f35426f8807ed0fd10112eb33a Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 3 Nov 2021 15:23:59 +0000 Subject: [PATCH 079/106] Port pipeline to new DSL2 syntax --- conf/modules.config | 200 ++++++++++++------ modules.json | 14 +- .../main.nf => local/dumpsoftwareversions.nf} | 20 +- modules/local/functions.nf | 78 ------- modules/local/multiqc_mappings_config.nf | 17 +- modules/local/sra_fastq_ftp.nf | 27 +-- modules/local/sra_ids_to_runinfo.nf | 17 +- modules/local/sra_merge_samplesheet.nf | 17 +- modules/local/sra_runinfo_to_ftp.nf | 17 +- modules/local/sra_to_samplesheet.nf | 6 - .../main.nf => local/sratools_fasterqdump.nf} | 26 +-- modules/local/sratools_prefetch.nf | 41 ++++ modules/local/synapse_get.nf | 23 +- modules/local/synapse_list.nf | 26 +-- modules/local/synapse_merge_samplesheet.nf | 17 +- modules/local/synapse_show.nf | 26 +-- modules/local/synapse_to_samplesheet.nf | 7 - .../custom/dumpsoftwareversions/functions.nf | 78 ------- .../custom/dumpsoftwareversions/meta.yml | 33 --- .../modules/sratools/fasterqdump/functions.nf | 78 ------- .../modules/sratools/fasterqdump/meta.yml | 42 ---- .../modules/sratools/prefetch/functions.nf | 78 ------- .../nf-core/modules/sratools/prefetch/main.nf | 50 ----- .../modules/sratools/prefetch/meta.yml | 43 ---- nextflow.config | 10 +- subworkflows/local/sra_fastq/main.nf | 7 +- workflows/sra.nf | 27 +-- workflows/synapse.nf | 23 +- 28 files changed, 269 insertions(+), 779 deletions(-) rename modules/{nf-core/modules/custom/dumpsoftwareversions/main.nf => local/dumpsoftwareversions.nf} (79%) delete mode 100644 modules/local/functions.nf rename modules/{nf-core/modules/sratools/fasterqdump/main.nf => local/sratools_fasterqdump.nf} (54%) create mode 100644 modules/local/sratools_prefetch.nf delete mode 100644 modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf delete mode 100644 modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml delete mode 100644 modules/nf-core/modules/sratools/fasterqdump/functions.nf delete mode 100644 modules/nf-core/modules/sratools/fasterqdump/meta.yml delete mode 100644 modules/nf-core/modules/sratools/prefetch/functions.nf delete mode 100644 modules/nf-core/modules/sratools/prefetch/main.nf delete mode 100644 modules/nf-core/modules/sratools/prefetch/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 2a2bcca5..88a1a72b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,74 +1,144 @@ /* ======================================================================================== - Config file for defining DSL2 per module options + Config file for defining DSL2 per module options and publishing paths ======================================================================================== Available keys to override module options: - args = Additional arguments appended to command in module. - args2 = Second set of arguments appended to command in module (multi-tool modules). 
- args3 = Third set of arguments appended to command in module (multi-tool modules). - publish_dir = Directory to publish results. - publish_by_meta = Groovy list of keys available in meta map to append as directories to "publish_dir" path - If publish_by_meta = true - Value of ${meta['id']} is appended as a directory to "publish_dir" path - If publish_by_meta = ['id', 'custompath'] - If "id" is in meta map and "custompath" isn't then "${meta['id']}/custompath/" - is appended as a directory to "publish_dir" path - If publish_by_meta = false / null - No directories are appended to "publish_dir" path - publish_files = Groovy map where key = "file_ext" and value = "directory" to publish results for that file extension - The value of "directory" is appended to the standard "publish_dir" path as defined above. - If publish_files = null (unspecified) - All files are published. - If publish_files = false - No files are published. - suffix = File name suffix for output files. + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.suffix = File name suffix for output files. ---------------------------------------------------------------------------------------- */ -params { - modules { - 'sra_ids_to_runinfo' { - publish_dir = 'metadata' - publish_files = false - } - 'sra_runinfo_to_ftp' { - publish_dir = 'metadata' - publish_files = ['tsv':''] - } - 'sra_fastq_ftp' { - publish_dir = 'fastq' - publish_files = ['fastq.gz':'', 'md5':'md5'] - args = '--retry 5 --continue-at - --max-time 1200' - } - 'sratools_prefetch' { - publish_dir = 'sra' - publish_files = false - } - 'sratools_fasterqdump' { - publish_dir = 'fastq' - } - 'sra_to_samplesheet' { - publish_dir = 'samplesheet' - publish_files = false - } - 'sra_merge_samplesheet' { - publish_dir = 'samplesheet' - } - 'synapse_list' { - args = '--long' - publish_dir = 'metadata' - } - 'synapse_get' { - publish_dir = 'fastq' - publish_files = ['fastq.gz':'', 'md5':'md5'] - } - 'synapse_show' { - publish_dir = 'metadata' - } - 'synapse_to_samplesheet' { - publish_dir = 'samplesheet' - publish_files = false - } - 'synapse_merge_samplesheet' { - publish_dir = 'samplesheet' - } - 'multiqc_mappings_config' { - publish_dir = 'samplesheet' - } +process { + + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + withName: SRA_IDS_TO_RUNINFO { + publishDir = [ + path: { "${params.outdir}/metadata" }, + enabled: false + ] + } + + withName: SRA_RUNINFO_TO_FTP { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: SRA_FASTQ_FTP { + ext.args = '--retry 5 --continue-at - --max-time 1200' + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } + + withName: SRATOOLS_PREFETCH { + publishDir = [ + path: { "${params.outdir}/sra" }, + enabled: false + ] + } + + withName: SRATOOLS_FASTERQDUMP { + publishDir = [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SRA_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } + + withName: SRA_MERGE_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SYNAPSE_LIST { + ext.args = '--long' + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SYNAPSE_GET { + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } + + withName: SYNAPSE_SHOW { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SYNAPSE_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } + + withName: SYNAPSE_MERGE_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MULTIQC_MAPPINGS_CONFIG { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] } } diff --git a/modules.json b/modules.json index f004808d..0e4143ef 100644 --- a/modules.json +++ b/modules.json @@ -1,17 +1,5 @@ { "name": "nf-core/fetchngs", "homePage": "https://github.com/nf-core/fetchngs", - "repos": { - "nf-core/modules": { - "custom/dumpsoftwareversions": { - "git_sha": "84f2302920078b0cf7716b2a2e5fcc0be5c4531d" - }, - "sratools/fasterqdump": { - "git_sha": "de997825de788fe2210db16d9426f10342a1ba1d" - }, - "sratools/prefetch": { - "git_sha": "07c0830057cc655de113d84499c7c1499460bb55" - } - } - } + "repos": {} } \ No newline at end of file diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/local/dumpsoftwareversions.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/local/dumpsoftwareversions.nf index faf2073f..69ebcbaa 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/local/dumpsoftwareversions.nf @@ -1,22 +1,12 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' -params.options = [:] -options = initOptions(params.options) - -process CUSTOM_DUMPSOFTWAREVERSIONS { +process DUMPSOFTWAREVERSIONS { label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0" - } else { - container "quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" input: path versions @@ -73,7 +63,7 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { return "\\n".join(html) module_versions = {} - module_versions["${getProcessName(task.process)}"] = { + module_versions["${task.process.tokenize(':').last()}"] = { 'python': platform.python_version(), 'yaml': yaml.__version__ } diff --git a/modules/local/functions.nf b/modules/local/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/local/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 857c859c..15c30846 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -1,19 +1,10 @@ -// Import generic module functions -include { saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] process MULTIQC_MAPPINGS_CONFIG { - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.9--1" - } else { - container "quay.io/biocontainers/python:3.9--1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: path csv @@ -29,7 +20,7 @@ process MULTIQC_MAPPINGS_CONFIG { multiqc_config.yml cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: python: \$(python --version | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 23ea5dd5..a775849c 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -1,23 +1,13 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) process SRA_FASTQ_FTP { tag "$meta.id" label 'process_low' label 'error_retry' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } conda (params.enable_conda ? "conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" input: tuple val(meta), val(fastq) @@ -28,10 +18,11 @@ process SRA_FASTQ_FTP { path "versions.yml" , emit: versions script: + def args = task.ext.args ?: '' if (meta.single_end) { """ curl \\ - $options.args \\ + $args \\ -L ${fastq[0]} \\ -o ${meta.id}.fastq.gz @@ -39,14 +30,14 @@ process SRA_FASTQ_FTP { md5sum -c ${meta.id}.fastq.gz.md5 cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: curl: \$(echo \$(curl --version | head -n 1 | sed 's/^curl //; s/ .*\$//')) END_VERSIONS """ } else { """ curl \\ - $options.args \\ + $args \\ -L ${fastq[0]} \\ -o ${meta.id}_1.fastq.gz @@ -54,7 +45,7 @@ process SRA_FASTQ_FTP { md5sum -c ${meta.id}_1.fastq.gz.md5 curl \\ - $options.args \\ + $args \\ -L ${fastq[1]} \\ -o ${meta.id}_2.fastq.gz @@ -62,7 +53,7 @@ process SRA_FASTQ_FTP { md5sum -c ${meta.id}_2.fastq.gz.md5 cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: curl: \$(echo \$(curl --version | head -n 1 | sed 's/^curl //; s/ .*\$//')) END_VERSIONS """ diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index 8200271e..c8209690 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -1,21 +1,12 @@ -// Import generic module functions -include { saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] process SRA_IDS_TO_RUNINFO { tag "$id" label 'error_retry' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.9--1" - } else { - container "quay.io/biocontainers/python:3.9--1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: val id @@ -35,7 +26,7 @@ process SRA_IDS_TO_RUNINFO { $metadata_fields cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: python: \$(python --version | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index 914bac00..be3f1087 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -1,19 +1,10 @@ -// Import generic module functions -include { saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] process SRA_MERGE_SAMPLESHEET { - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? 
"conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" input: path ('samplesheets/*') @@ -37,7 +28,7 @@ process SRA_MERGE_SAMPLESHEET { done cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') END_VERSIONS """ diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index 80028705..a6b6a829 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -1,19 +1,10 @@ -// Import generic module functions -include { saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] process SRA_RUNINFO_TO_FTP { - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "conda-forge::python=3.9.5" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.9--1" - } else { - container "quay.io/biocontainers/python:3.9--1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'quay.io/biocontainers/python:3.9--1' }" input: path runinfo @@ -29,7 +20,7 @@ process SRA_RUNINFO_TO_FTP { ${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: python: \$(python --version | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/sra_to_samplesheet.nf b/modules/local/sra_to_samplesheet.nf index c4fc9af7..230ba4ca 100644 --- a/modules/local/sra_to_samplesheet.nf +++ b/modules/local/sra_to_samplesheet.nf @@ -1,14 +1,8 @@ -// Import generic module functions -include { saveFiles; getSoftwareName } from './functions' -params.options = [:] params.results_dir = '' process SRA_TO_SAMPLESHEET { tag "$meta.id" - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } memory 100.MB diff --git a/modules/nf-core/modules/sratools/fasterqdump/main.nf b/modules/local/sratools_fasterqdump.nf similarity index 54% rename from modules/nf-core/modules/sratools/fasterqdump/main.nf rename to modules/local/sratools_fasterqdump.nf index 08ef9045..cc8fb6fa 100644 --- a/modules/nf-core/modules/sratools/fasterqdump/main.nf +++ b/modules/local/sratools_fasterqdump.nf @@ -1,22 +1,12 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) process SRATOOLS_FASTERQDUMP { tag "$meta.id" label 'process_medium' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } conda (params.enable_conda ? 'bioconda::sra-tools=2.11.0 conda-forge::pigz=2.6' : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' - } else { - container 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : + 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) @@ -26,6 +16,8 @@ process SRATOOLS_FASTERQDUMP { path "versions.yml" , emit: versions script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" // Paired-end data extracted by fasterq-dump (--split-3 the default) always creates // *_1.fastq *_2.fastq files but sometimes also an additional *.fastq file @@ -39,19 +31,19 @@ process SRATOOLS_FASTERQDUMP { fi fasterq-dump \\ - ${options.args} \\ + $args \\ --threads $task.cpus \\ ${sra.name} pigz \\ - ${options.args2} \\ + $args2 \\ --no-name \\ --processes $task.cpus \\ *.fastq cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') + ${task.process.tokenize(':').last()}: + sratools: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) END_VERSIONS """ diff --git a/modules/local/sratools_prefetch.nf b/modules/local/sratools_prefetch.nf new file mode 100644 index 00000000..07be740b --- /dev/null +++ b/modules/local/sratools_prefetch.nf @@ -0,0 +1,41 @@ + +process SRATOOLS_PREFETCH { + tag "$id" + label 'process_low' + label 'error_retry' + + conda (params.enable_conda ? 'bioconda::sra-tools=2.11.0' : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5262h314213e_0' : + 'quay.io/biocontainers/sra-tools:2.11.0--pl5262h314213e_0' }" + + input: + tuple val(meta), val(id) + + output: + tuple val(meta), path("$id"), emit: sra + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" + """ + eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" + if [[ ! -f "\${NCBI_SETTINGS}" ]]; then + mkdir -p "\$(dirname "\${NCBI_SETTINGS}")" + printf '${config}' > "\${NCBI_SETTINGS}" + fi + + prefetch \\ + $args \\ + --progress \\ + $id + + vdb-validate $id + + cat <<-END_VERSIONS > versions.yml + ${task.process.tokenize(':').last()}: + sratools: \$(prefetch --version 2>&1 | grep -Eo '[0-9.]+') + END_VERSIONS + """ +} diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index 99985f48..5269286c 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -1,23 +1,13 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) process SYNAPSE_GET { tag "$meta.id" label 'process_low' label 'error_retry' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } conda (params.enable_conda ? 
"bioconda::synapseclient=2.4.0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0" - } else { - container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0' }" input: val meta @@ -29,18 +19,19 @@ process SYNAPSE_GET { path "versions.yml" , emit: versions script: + def args = task.ext.args ?: '' """ synapse \\ -c $config \\ get \\ - $options.args \\ + $args \\ $meta.id echo "${meta.md5} \t ${meta.name}" > ${meta.id}.md5 cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") + ${task.process.tokenize(':').last()}: + synapse: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index 20275de2..f001a186 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -1,22 +1,12 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) process SYNAPSE_LIST { tag "$id" label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "bioconda::synapseclient=2.4.0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0" - } else { - container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0' }" input: val id @@ -27,18 +17,20 @@ process SYNAPSE_LIST { path "versions.yml", emit: versions script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' """ synapse \\ -c $config \\ list \\ - $options.args \\ + $args \\ $id \\ - $options.args2 \\ + $args2 \\ > ${id}.list.txt cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") + ${task.process.tokenize(':').last()}: + synapse: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index 8027818c..e3f97fa8 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -1,19 +1,10 @@ -// Import generic module functions -include { saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] process SYNAPSE_MERGE_SAMPLESHEET { - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "conda-forge::sed=4.7" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img" - } else { - container "biocontainers/biocontainers:v1.2.0_cv1" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'biocontainers/biocontainers:v1.2.0_cv1' }" input: path ('samplesheets/*') @@ -30,7 +21,7 @@ done cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + ${task.process.tokenize(':').last()}: sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') END_VERSIONS """ diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 4edc4207..78593861 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -1,22 +1,12 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) process SYNAPSE_SHOW { tag "$id" label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } conda (params.enable_conda ? "bioconda::synapseclient=2.4.0" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0" - } else { - container "quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0" - } + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/synapseclient:2.4.0--pyh5e36f6f_0' : + 'quay.io/biocontainers/synapseclient:2.4.0--pyh5e36f6f_0' }" input: val id @@ -27,18 +17,20 @@ process SYNAPSE_SHOW { path "versions.yml", emit: versions script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' """ synapse \\ -c $config \\ show \\ - $options.args \\ + $args \\ $id \\ - $options.args2 \\ + $args2 \\ > ${id}.metadata.txt cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$(synapse --version | sed -e "s/Synapse Client //g") + ${task.process.tokenize(':').last()}: + synapse: \$(synapse --version | sed -e "s/Synapse Client //g") END_VERSIONS """ } diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 28571005..976d7ad7 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -1,14 +1,7 @@ -// Import generic module functions -include { saveFiles; getSoftwareName } from './functions' - -params.options = [:] params.results_dir = '' process SYNAPSE_TO_SAMPLESHEET { tag "$meta.id" - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } memory 100.MB diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? 
ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml deleted file mode 100644 index 8d4a6ed4..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: custom_dumpsoftwareversions -description: Custom module used to dump software versions within the nf-core pipeline template -keywords: - - custom - - version -tools: - - custom: - description: Custom module used to dump software versions within the nf-core pipeline template - homepage: https://github.com/nf-core/tools - documentation: https://github.com/nf-core/tools - -input: - - versions: - type: file - description: YML file containing software versions - pattern: "*.yml" - -output: - - yml: - type: file - description: Standard YML file containing software versions - pattern: "software_versions.yml" - - mqc_yml: - type: file - description: MultiQC custom content YML file containing software versions - pattern: "software_versions_mqc.yml" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@drpatelh" diff --git a/modules/nf-core/modules/sratools/fasterqdump/functions.nf b/modules/nf-core/modules/sratools/fasterqdump/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/sratools/fasterqdump/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ 
ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/sratools/fasterqdump/meta.yml b/modules/nf-core/modules/sratools/fasterqdump/meta.yml deleted file mode 100644 index ac61e71f..00000000 --- a/modules/nf-core/modules/sratools/fasterqdump/meta.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: sratools_fasterqdump -description: Extract sequencing reads in FASTQ format from a given NCBI Sequence Read Archive (SRA). -keywords: - - sequencing - - FASTQ - - dump -tools: - - sratools: - description: SRA Toolkit and SDK from NCBI - homepage: https://github.com/ncbi/sra-tools - documentation: https://github.com/ncbi/sra-tools/wiki - tool_dev_url: https://github.com/ncbi/sra-tools - licence: ['Public Domain'] - -input: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - sra: - type: directory - description: Directory containing ETL data for the given SRA. - pattern: "*/*.sra" - -output: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - reads: - type: file - description: Extracted FASTQ file or files if the sequencing reads are paired-end. 
- pattern: "*.fastq.gz" - -authors: - - "@Midnighter" diff --git a/modules/nf-core/modules/sratools/prefetch/functions.nf b/modules/nf-core/modules/sratools/prefetch/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/sratools/prefetch/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/sratools/prefetch/main.nf b/modules/nf-core/modules/sratools/prefetch/main.nf deleted file mode 100644 index 207d1e10..00000000 --- a/modules/nf-core/modules/sratools/prefetch/main.nf +++ /dev/null @@ -1,50 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process SRATOOLS_PREFETCH { - tag "$id" - label 'process_low' - label 'error_retry' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } - - conda (params.enable_conda ? 
'bioconda::sra-tools=2.11.0' : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5262h314213e_0' - } else { - container 'quay.io/biocontainers/sra-tools:2.11.0--pl5262h314213e_0' - } - - input: - tuple val(meta), val(id) - - output: - tuple val(meta), path("$id"), emit: sra - path "versions.yml" , emit: versions - - script: - def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" - """ - eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" - if [[ ! -f "\${NCBI_SETTINGS}" ]]; then - mkdir -p "\$(dirname "\${NCBI_SETTINGS}")" - printf '${config}' > "\${NCBI_SETTINGS}" - fi - - prefetch \\ - $options.args \\ - --progress \\ - $id - - vdb-validate $id - - cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$(prefetch --version 2>&1 | grep -Eo '[0-9.]+') - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/sratools/prefetch/meta.yml b/modules/nf-core/modules/sratools/prefetch/meta.yml deleted file mode 100644 index ab0a5ce5..00000000 --- a/modules/nf-core/modules/sratools/prefetch/meta.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: sratools_prefetch -description: Download sequencing data from the NCBI Sequence Read Archive (SRA). -keywords: - - sequencing - - fastq - - prefetch -tools: - - sratools: - description: SRA Toolkit and SDK from NCBI - homepage: https://github.com/ncbi/sra-tools - documentation: https://github.com/ncbi/sra-tools/wiki - tool_dev_url: https://github.com/ncbi/sra-tools - licence: ['Public Domain'] - -input: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - id: - type: val - description: > - A string denoting an SRA id. - -output: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - sra: - type: directory - description: > - Directory containing the ETL data for the given SRA id. 
- pattern: "*/*.sra" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@Midnighter" diff --git a/nextflow.config b/nextflow.config index 066fb0b3..cd5e3891 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,14 +28,12 @@ params { help = false validate_params = true show_hidden_params = false - schema_ignore_params = 'modules,igenomes_base' + schema_ignore_params = 'igenomes_base' enable_conda = false - singularity_pull_docker_container = false // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - hostnames = [:] config_profile_description = null config_profile_contact = null config_profile_url = null @@ -52,9 +50,6 @@ params { // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Load modules.config for DSL2 module specific options -includeConfig 'conf/modules.config' - // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" @@ -152,6 +147,9 @@ manifest { version = '1.4' } +// Load modules.config for DSL2 module specific options +includeConfig 'conf/modules.config' + // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { diff --git a/subworkflows/local/sra_fastq/main.nf b/subworkflows/local/sra_fastq/main.nf index c6590b59..84db4b77 100644 --- a/subworkflows/local/sra_fastq/main.nf +++ b/subworkflows/local/sra_fastq/main.nf @@ -2,11 +2,8 @@ // Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). // -params.prefetch_options = [:] -params.fasterqdump_options = [:] - -include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/modules/sratools/prefetch/main' addParams( options: params.prefetch_options ) -include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/modules/sratools/fasterqdump/main' addParams( options: params.fasterqdump_options ) +include { SRATOOLS_PREFETCH } from '../../../modules/local/sratools_prefetch.nf' +include { SRATOOLS_FASTERQDUMP } from '../../../modules/local/sratools_fasterqdump.nf' workflow SRA_FASTQ { take: diff --git a/workflows/sra.nf b/workflows/sra.nf index 4fb41ccd..40eeab9d 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -20,23 +20,16 @@ WorkflowSra.initialise(params, log, valid_params) */ // Don't overwrite global params.modules, create a copy instead and use that within the main script. 
-def modules = params.modules.clone() +//def modules = params.modules.clone() -include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' addParams( options: modules['sra_ids_to_runinfo'] ) -include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' addParams( options: modules['sra_runinfo_to_ftp'] ) -include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' addParams( options: modules['sra_fastq_ftp'] ) -include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' addParams( prefetch_options: modules['sratools_prefetch'], fasterqdump_options: modules['sratools_fasterqdump'] ) -include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' addParams( options: modules['sra_to_samplesheet'], results_dir: modules['sra_fastq_ftp'].publish_dir ) -include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' addParams( options: modules['sra_merge_samplesheet'] ) -include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' addParams( options: modules['multiqc_mappings_config'] ) - -/* -======================================================================================== - IMPORT NF-CORE MODULES/SUBWORKFLOWS -======================================================================================== -*/ - -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' addParams( options: [publish_files : ['_versions.yml':'']] ) +include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' +include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' +include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' +include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' +include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' // addParams( results_dir: modules['sra_fastq_ftp'].publish_dir ) +include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' +include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' +include { DUMPSOFTWAREVERSIONS } from '../modules/local/dumpsoftwareversions' /* ======================================================================================== @@ -133,7 +126,7 @@ workflow SRA { // // MODULE: Dump software versions for all tools used in the workflow // - CUSTOM_DUMPSOFTWAREVERSIONS ( + DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) } diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 2fc3cf69..06ce6319 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -20,21 +20,14 @@ if (params.synapse_config) { */ // Don't overwrite global params.modules, create a copy instead and use that within the main script. 
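As in the SRA workflow above, each module in the Synapse workflow emits a `versions.yml`, and the workflow mixes these into a single channel before handing it to `DUMPSOFTWAREVERSIONS` in the hunks that follow. A minimal sketch of that collation pattern — the module names are illustrative, while the `collectFile` call mirrors the one used in both workflows; `.first()` is the usual way to keep a single versions file from a per-sample process:

ch_versions = Channel.empty()

SOME_MODULE ( ch_input )
ch_versions = ch_versions.mix(SOME_MODULE.out.versions.first())

ANOTHER_MODULE ( SOME_MODULE.out.results )
ch_versions = ch_versions.mix(ANOTHER_MODULE.out.versions)

DUMPSOFTWAREVERSIONS (
    ch_versions.unique().collectFile(name: 'collated_versions.yml')
)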
-def modules = params.modules.clone() +//def modules = params.modules.clone() -include { SYNAPSE_LIST } from '../modules/local/synapse_list' addParams( options: modules['synapse_list'] ) -include { SYNAPSE_SHOW } from '../modules/local/synapse_show' addParams( options: modules['synapse_show'] ) -include { SYNAPSE_GET } from '../modules/local/synapse_get' addParams( options: modules['synapse_get'] ) -include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' addParams( options: modules['synapse_to_samplesheet'], results_dir: modules['synapse_get'].publish_dir ) -include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' addParams( options: modules['synapse_merge_samplesheet'] ) - -/* -======================================================================================== - IMPORT NF-CORE MODULES/SUBWORKFLOWS -======================================================================================== -*/ - -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' addParams( options: [publish_files : ['_versions.yml':'']] ) +include { SYNAPSE_LIST } from '../modules/local/synapse_list' +include { SYNAPSE_SHOW } from '../modules/local/synapse_show' +include { SYNAPSE_GET } from '../modules/local/synapse_get' +include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' // addParams( results_dir: modules['synapse_get'].publish_dir ) +include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' +include { DUMPSOFTWAREVERSIONS } from '../modules/local/dumpsoftwareversions' /* ======================================================================================== @@ -132,7 +125,7 @@ workflow SYNAPSE { // // MODULE: Dump software versions for all tools used in the workflow // - CUSTOM_DUMPSOFTWAREVERSIONS ( + DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) } From 44844e3be78d15848dfe41bc77c380bf3c6cfb17 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 3 Nov 2021 15:24:28 +0000 Subject: [PATCH 080/106] Run CI for pipeline NXF and latest edge version --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08ac1cfb..6b67751c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,12 +18,11 @@ jobs: if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }} runs-on: ubuntu-latest env: - NXF_VER: ${{ matrix.nxf_ver }} NXF_ANSI_LOG: false strategy: matrix: - # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ["21.04.0", ""] + # Nextflow versions: check pipeline minimum and latest edge version + nxf_ver: ["NXF_VER=21.04.0", "NXF_EDGE=1"] steps: - name: Check out pipeline code uses: actions/checkout@v2 @@ -34,6 +33,8 @@ jobs: run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ + export ${{ matrix.nxf_ver }} + nextflow self-update - name: Run pipeline with SRA test data run: | From fa4be0b9cd8551b179d8e281d08cd4d9830884de Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 3 Nov 2021 15:24:49 +0000 Subject: [PATCH 081/106] Remove params.hostnames --- lib/NfcoreSchema.groovy | 3 +-- lib/NfcoreTemplate.groovy | 30 +++++++++--------------------- lib/WorkflowMain.groovy | 6 +++--- nextflow_schema.json | 13 ------------- 4 files changed, 13 insertions(+), 39 deletions(-) diff --git 
a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 8d6920dd..dcb39c83 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -260,13 +260,12 @@ class NfcoreSchema { // Get pipeline parameters defined in JSON Schema def Map params_summary = [:] - def blacklist = ['hostnames'] def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) for (group in params_map.keySet()) { def sub_params = new LinkedHashMap() def group_params = params_map.get(group) // This gets the parameters of that particular group for (param in group_params.keySet()) { - if (params.containsKey(param) && !blacklist.contains(param)) { + if (params.containsKey(param)) { def params_value = params.get(param) def schema_value = group_params.get(param).default def param_type = group_params.get(param).type diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 3dd46e64..4592007f 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -19,27 +19,16 @@ class NfcoreTemplate { } // - // Check params.hostnames + // Warn if a -profile or Nextflow config has not been provided to run the pipeline // - public static void hostName(workflow, params, log) { - Map colors = logColours(params.monochrome_logs) - if (params.hostnames) { - try { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.info "=${colors.yellow}====================================================${colors.reset}=\n" + - "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + - " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + - "=${colors.yellow}====================================================${colors.reset}=" - } - } - } - } catch (Exception e) { - log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." - } + public static void checkConfigProvided(workflow, log) { + if (workflow.profile == 'standard' && workflow.configFiles.size() <= 1) { + log.warn "[$workflow.manifest.name] You are attempting to run the pipeline without any custom configuration!\n\n" + + "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + + " (1) Using an existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + + " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + + " (3) Using your own local custom config e.g. 
`-c /path/to/your/custom.config`\n\n" + + "Please refer to the quick start section and usage docs for the pipeline.\n " } } @@ -146,7 +135,6 @@ class NfcoreTemplate { log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" } } else { - hostName(workflow, params, log) log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 09264303..e8f3c20d 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -60,6 +60,9 @@ class WorkflowMain { // Print parameter summary log to screen log.info paramsSummaryLog(workflow, params, log) + // Check that a -profile or Nextflow config has been provided to run the pipeline + NfcoreTemplate.checkConfigProvided(workflow, log) + // Check that conda channels are set-up correctly if (params.enable_conda) { Utils.checkCondaChannels(log) @@ -68,9 +71,6 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - // Check the hostnames against configured profiles - NfcoreTemplate.hostName(workflow, params, log) - // Check input has been provided if (!params.input) { log.error "Please provide an input file containing ids to the pipeline - one per line e.g. '--input ids.txt'" diff --git a/nextflow_schema.json b/nextflow_schema.json index b2f26972..be10a344 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -88,12 +88,6 @@ "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", "fa_icon": "fas fa-users-cog" }, - "hostnames": { - "type": "string", - "description": "Institutional configs hostname.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, "config_profile_name": { "type": "string", "description": "Institutional config name.", @@ -230,13 +224,6 @@ "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", "hidden": true, "fa_icon": "fas fa-bacon" - }, - "singularity_pull_docker_container": { - "type": "boolean", - "description": "Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead.", - "hidden": true, - "fa_icon": "fas fa-toolbox", - "help_text": "This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues." 
} } } From ef640c724e1b69466829576844f885e77104b619 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Wed, 3 Nov 2021 15:55:13 +0000 Subject: [PATCH 082/106] Fix nf-core lint --- .nf-core.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.nf-core.yml b/.nf-core.yml index 92e07317..b8e4d216 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,7 +2,9 @@ lint: files_unchanged: - .github/CONTRIBUTING.md - assets/sendmail_template.txt + - lib/NfcoreSchema.groovy - lib/NfcoreTemplate.groovy files_exist: - bin/scrape_software_versions.py - modules/local/get_software_versions.nf + actions_ci: False From c7cbd2f9dcf3212f7483c26a709054c01e00e801 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 Nov 2021 11:49:52 +0000 Subject: [PATCH 083/106] Add params.input_type to split out config options for each workflow --- conf/modules.config | 224 ++++++++++++++++++++++----------------- conf/test_synapse.config | 3 +- lib/WorkflowMain.groovy | 8 ++ main.nf | 24 +++-- nextflow.config | 1 + nextflow_schema.json | 10 ++ 6 files changed, 162 insertions(+), 108 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 88a1a72b..0f6c8138 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,6 +10,9 @@ ---------------------------------------------------------------------------------------- */ +// +// Generic process options for all workflows +// process { publishDir = [ @@ -18,127 +21,150 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SRA_IDS_TO_RUNINFO { - publishDir = [ - path: { "${params.outdir}/metadata" }, - enabled: false - ] - } - - withName: SRA_RUNINFO_TO_FTP { + withName: DUMPSOFTWAREVERSIONS { publishDir = [ - path: { "${params.outdir}/metadata" }, + path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + pattern: '*_versions.yml' ] } - withName: SRA_FASTQ_FTP { - ext.args = '--retry 5 --continue-at - --max-time 1200' - publishDir = [ - [ +} + +// +// Process options for the SRA workflow +// +if (params.input_type == 'sra') { + + process { + + withName: SRA_IDS_TO_RUNINFO { + publishDir = [ + path: { "${params.outdir}/metadata" }, + enabled: false + ] + } + + withName: SRA_RUNINFO_TO_FTP { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SRA_FASTQ_FTP { + ext.args = '--retry 5 --continue-at - --max-time 1200' + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } + + withName: SRATOOLS_PREFETCH { + publishDir = [ + path: { "${params.outdir}/sra" }, + enabled: false + ] + } + + withName: SRATOOLS_FASTERQDUMP { + publishDir = [ path: { "${params.outdir}/fastq" }, mode: params.publish_dir_mode, - pattern: "*gz" - ], - [ - path: { "${params.outdir}/fastq/md5" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SRA_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } + + withName: SRA_MERGE_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, mode: params.publish_dir_mode, - pattern: "*.md5" + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] - ] - } + } - withName: SRATOOLS_PREFETCH { - publishDir = [ - path: { "${params.outdir}/sra" }, - enabled: false - ] - } + withName: MULTIQC_MAPPINGS_CONFIG { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: SRATOOLS_FASTERQDUMP { - publishDir = [ - path: { "${params.outdir}/fastq" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] } - withName: SRA_TO_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - enabled: false - ] - } +} - withName: SRA_MERGE_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } +// +// Process options for the Synapse workflow +// +if (params.input_type == 'synapse') { - withName: SYNAPSE_LIST { - ext.args = '--long' - publishDir = [ - path: { "${params.outdir}/metadata" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + process { - withName: SYNAPSE_GET { - publishDir = [ - [ - path: { "${params.outdir}/fastq" }, - mode: params.publish_dir_mode, - pattern: "*gz" - ], - [ - path: { "${params.outdir}/fastq/md5" }, + withName: SYNAPSE_LIST { + ext.args = '--long' + publishDir = [ + path: { "${params.outdir}/metadata" }, mode: params.publish_dir_mode, - pattern: "*.md5" + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - ] - } + } + + withName: SYNAPSE_GET { + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } - withName: SYNAPSE_SHOW { - publishDir = [ - path: { "${params.outdir}/metadata" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: SYNAPSE_SHOW { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: SYNAPSE_TO_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - enabled: false - ] - } + withName: SYNAPSE_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } - withName: SYNAPSE_MERGE_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: SYNAPSE_MERGE_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: MULTIQC_MAPPINGS_CONFIG { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] } - withName: DUMPSOFTWAREVERSIONS { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - pattern: '*_versions.yml' - ] - } } diff --git a/conf/test_synapse.config b/conf/test_synapse.config index d68d4098..d9433493 100644 --- a/conf/test_synapse.config +++ b/conf/test_synapse.config @@ -20,5 +20,6 @@ params { max_time = 6.h // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/synapse_ids_test.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/synapse_ids_test.txt' + input_type = 'synapse' } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index e8f3c20d..f64fa80f 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -46,6 +46,7 @@ class WorkflowMain { // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { + // Print help to screen if required if (params.help) { log.info help(workflow, params, log) @@ -76,6 +77,13 @@ class WorkflowMain { log.error "Please provide an input file containing ids to the pipeline - one per line e.g. '--input ids.txt'" System.exit(1) } + + // Check valid input_type has been provided + def input_types = ['sra', 'synapse'] + if (!input_types.contains(params.input_type)) { + log.error "Invalid option: '${params.input_type}'. Valid options for '--input_type': ${input_types.join(', ')}." + System.exit(1) + } } // Check if input ids are from the SRA diff --git a/main.nf b/main.nf index 98b25cb1..a2ae62f1 100644 --- a/main.nf +++ b/main.nf @@ -37,18 +37,26 @@ Channel ======================================================================================== */ -// Auto-detect id type -def id_type = '' +// Auto-detect input id type +def input_type = '' if (WorkflowMain.isSraId(ch_input, log)) { - id_type = 'sra' - include { SRA } from './workflows/sra' + input_type = 'sra' } else if (WorkflowMain.isSynapseId(ch_input, log)) { - id_type = 'synapse' - include { SYNAPSE } from './workflows/synapse' + input_type = 'synapse' } else { exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ / GEO or Synapse ids!' } +if (params.input_type == input_type) { + if (params.input_type == 'sra') { + include { SRA } from './workflows/sra' + } else if (params.input_type == 'synapse') { + include { SYNAPSE } from './workflows/synapse' + } +} else { + exit 1, "Ids auto-detected as ${input_type}. Please provide '--input_type ${input_type}' as a parameter to the pipeline!" 
+} + // // WORKFLOW: Run main nf-core/fetchngs analysis pipeline depending on type of identifier provided // @@ -57,13 +65,13 @@ workflow NFCORE_FETCHNGS { // // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ / GEO ids // - if (id_type == 'sra') { + if (params.input_type == 'sra') { SRA ( ch_ids ) // // WORKFLOW: Download FastQ files for Synapse ids // - } else if (id_type == 'synapse') { + } else if (params.input_type == 'synapse') { SYNAPSE ( ch_ids ) } } diff --git a/nextflow.config b/nextflow.config index cd5e3891..6ae6a8da 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params { // Input options input = null + input_type = 'sra' nf_core_pipeline = null ena_metadata_fields = null sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' diff --git a/nextflow_schema.json b/nextflow_schema.json index be10a344..1f923f28 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,6 +23,16 @@ "fa_icon": "fas fa-file-excel", "description": "File containing SRA/ENA/DDBJ/GEO identifiers one per line to download their associated metadata and FastQ files." }, + "input_type": { + "type": "string", + "default": "sra", + "description": "Specifies the type of identifier provided via `--input` - available options are 'sra' and 'synapse'.", + "fa_icon": "fas fa-keyboard", + "enum": [ + "sra", + "synapse" + ] + }, "ena_metadata_fields": { "type": "string", "fa_icon": "fas fa-columns", From 107e3e5dc96ae9394f058385d51342041be8ffbc Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 Nov 2021 12:26:14 +0000 Subject: [PATCH 084/106] Refactor sra_fastq subworkflow to be local --- .../local/{sra_fastq/main.nf => sra_fastq.nf} | 4 +-- subworkflows/local/sra_fastq/meta.yml | 36 ------------------- subworkflows/local/sra_fastq/nextflow.config | 2 -- workflows/sra.nf | 2 +- 4 files changed, 3 insertions(+), 41 deletions(-) rename subworkflows/local/{sra_fastq/main.nf => sra_fastq.nf} (82%) delete mode 100644 subworkflows/local/sra_fastq/meta.yml delete mode 100644 subworkflows/local/sra_fastq/nextflow.config diff --git a/subworkflows/local/sra_fastq/main.nf b/subworkflows/local/sra_fastq.nf similarity index 82% rename from subworkflows/local/sra_fastq/main.nf rename to subworkflows/local/sra_fastq.nf index 84db4b77..04039f79 100644 --- a/subworkflows/local/sra_fastq/main.nf +++ b/subworkflows/local/sra_fastq.nf @@ -2,8 +2,8 @@ // Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). // -include { SRATOOLS_PREFETCH } from '../../../modules/local/sratools_prefetch.nf' -include { SRATOOLS_FASTERQDUMP } from '../../../modules/local/sratools_fasterqdump.nf' +include { SRATOOLS_PREFETCH } from '../../modules/local/sratools_prefetch.nf' +include { SRATOOLS_FASTERQDUMP } from '../../modules/local/sratools_fasterqdump.nf' workflow SRA_FASTQ { take: diff --git a/subworkflows/local/sra_fastq/meta.yml b/subworkflows/local/sra_fastq/meta.yml deleted file mode 100644 index 146176ee..00000000 --- a/subworkflows/local/sra_fastq/meta.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: sra_fastq -description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). -keywords: - - sequencing - - FASTQ - - prefetch - - dump -modules: - - sratools/prefetch - - sratools/fasterqdump -input: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - id: - type: string - description: > - SRA identifier. -output: - - meta: - type: map - description: > - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: Extracted FASTQ file or files if the sequencing reads are paired-end. - pattern: "*.fastq.gz" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - '@Midnighter' diff --git a/subworkflows/local/sra_fastq/nextflow.config b/subworkflows/local/sra_fastq/nextflow.config deleted file mode 100644 index 07448834..00000000 --- a/subworkflows/local/sra_fastq/nextflow.config +++ /dev/null @@ -1,2 +0,0 @@ -params.prefetch_options = [:] -params.fasterqdump_options = [:] diff --git a/workflows/sra.nf b/workflows/sra.nf index 40eeab9d..e23d968e 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -25,7 +25,7 @@ WorkflowSra.initialise(params, log, valid_params) include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' -include { SRA_FASTQ } from '../subworkflows/local/sra_fastq/main' +include { SRA_FASTQ } from '../subworkflows/local/sra_fastq' include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' // addParams( results_dir: modules['sra_fastq_ftp'].publish_dir ) include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' From 4be83f200d661a24ec074752702cffd9ab7bb329 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 Nov 2021 12:51:55 +0000 Subject: [PATCH 085/106] Remove last remaining addParams call --- modules/local/sra_to_samplesheet.nf | 6 ++---- modules/local/synapse_to_samplesheet.nf | 7 +++---- workflows/sra.nf | 5 +---- workflows/synapse.nf | 5 +---- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/modules/local/sra_to_samplesheet.nf b/modules/local/sra_to_samplesheet.nf index 230ba4ca..dbbc6613 100644 --- a/modules/local/sra_to_samplesheet.nf +++ b/modules/local/sra_to_samplesheet.nf @@ -1,6 +1,4 @@ -params.results_dir = '' - process SRA_TO_SAMPLESHEET { tag "$meta.id" @@ -32,8 +30,8 @@ process SRA_TO_SAMPLESHEET { // Add relevant fields to the beginning of the map pipeline_map = [ sample : "${meta.id.split('_')[0..-2].join('_')}", - fastq_1 : "${params.outdir}/${params.results_dir}/${fastq[0]}", - fastq_2 : meta.single_end ? '' : "${params.outdir}/${params.results_dir}/${fastq[1]}" + fastq_1 : "${params.outdir}/fastq/${fastq[0]}", + fastq_2 : meta.single_end ? 
'' : "${params.outdir}/fastq/${fastq[1]}" ] // Add nf-core pipeline specific entries diff --git a/modules/local/synapse_to_samplesheet.nf b/modules/local/synapse_to_samplesheet.nf index 976d7ad7..40519ec0 100644 --- a/modules/local/synapse_to_samplesheet.nf +++ b/modules/local/synapse_to_samplesheet.nf @@ -1,4 +1,3 @@ -params.results_dir = '' process SYNAPSE_TO_SAMPLESHEET { tag "$meta.id" @@ -18,11 +17,11 @@ process SYNAPSE_TO_SAMPLESHEET { def meta_map = meta.clone() meta_map.remove("id") - def fastq_1 = "${params.outdir}/${params.results_dir}/${fastq}" + def fastq_1 = "${params.outdir}/fastq/${fastq}" def fastq_2 = '' if (fastq instanceof List && fastq.size() == 2) { - fastq_1 = "${params.outdir}/${params.results_dir}/${fastq[0]}" - fastq_2 = "${params.outdir}/${params.results_dir}/${fastq[1]}" + fastq_1 = "${params.outdir}/fastq/${fastq[0]}" + fastq_2 = "${params.outdir}/fastq/${fastq[1]}" } // Add relevant fields to the beginning of the map diff --git a/workflows/sra.nf b/workflows/sra.nf index e23d968e..c108812f 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -19,14 +19,11 @@ WorkflowSra.initialise(params, log, valid_params) ======================================================================================== */ -// Don't overwrite global params.modules, create a copy instead and use that within the main script. -//def modules = params.modules.clone() - include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' include { SRA_FASTQ } from '../subworkflows/local/sra_fastq' -include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' // addParams( results_dir: modules['sra_fastq_ftp'].publish_dir ) +include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' include { DUMPSOFTWAREVERSIONS } from '../modules/local/dumpsoftwareversions' diff --git a/workflows/synapse.nf b/workflows/synapse.nf index 06ce6319..5a7e3fe7 100644 --- a/workflows/synapse.nf +++ b/workflows/synapse.nf @@ -19,13 +19,10 @@ if (params.synapse_config) { ======================================================================================== */ -// Don't overwrite global params.modules, create a copy instead and use that within the main script. 
-//def modules = params.modules.clone() - include { SYNAPSE_LIST } from '../modules/local/synapse_list' include { SYNAPSE_SHOW } from '../modules/local/synapse_show' include { SYNAPSE_GET } from '../modules/local/synapse_get' -include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' // addParams( results_dir: modules['synapse_get'].publish_dir ) +include { SYNAPSE_TO_SAMPLESHEET } from '../modules/local/synapse_to_samplesheet' include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_samplesheet' include { DUMPSOFTWAREVERSIONS } from '../modules/local/dumpsoftwareversions' From eb5816e9c1e5a9f55a8369fdaba66f2c2d7f592d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 Nov 2021 12:55:00 +0000 Subject: [PATCH 086/106] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0153327e..65d06ea6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes +* Convert pipeline to updated Nextflow DSL2 syntax for future adoption across nf-core * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). * SRA identifiers not available for direct download via the ENA FTP will now be downloaded via sra-tools. * Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. From ddcb96d7ca0c89b9276b5b2b1ce3f6a86b0c9e72 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 4 Nov 2021 15:39:54 +0000 Subject: [PATCH 087/106] Fix assignment alignment --- modules/local/sratools_fasterqdump.nf | 4 ++-- modules/local/sratools_prefetch.nf | 2 +- modules/local/synapse_list.nf | 2 +- modules/local/synapse_show.nf | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/sratools_fasterqdump.nf b/modules/local/sratools_fasterqdump.nf index cc8fb6fa..ae69b5e6 100644 --- a/modules/local/sratools_fasterqdump.nf +++ b/modules/local/sratools_fasterqdump.nf @@ -16,8 +16,8 @@ process SRATOOLS_FASTERQDUMP { path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" // Paired-end data extracted by fasterq-dump (--split-3 the default) always creates // *_1.fastq *_2.fastq files but sometimes also an additional *.fastq file diff --git a/modules/local/sratools_prefetch.nf b/modules/local/sratools_prefetch.nf index 07be740b..033aeedf 100644 --- a/modules/local/sratools_prefetch.nf +++ b/modules/local/sratools_prefetch.nf @@ -17,7 +17,7 @@ process SRATOOLS_PREFETCH { path "versions.yml" , emit: versions script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' def config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" """ eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index f001a186..0255c925 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -17,7 +17,7 @@ process SYNAPSE_LIST { path "versions.yml", emit: versions script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: 
'' def args2 = task.ext.args2 ?: '' """ synapse \\ diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 78593861..9a81493a 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -17,7 +17,7 @@ process SYNAPSE_SHOW { path "versions.yml", emit: versions script: - def args = task.ext.args ?: '' + def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' """ synapse \\ From 4b2e0f82775d0475c0fb51948ee0405fa3d69605 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 11:23:21 +0000 Subject: [PATCH 088/106] Update main.nf --- modules/nf-core/modules/sratools/prefetch/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/modules/sratools/prefetch/main.nf b/modules/nf-core/modules/sratools/prefetch/main.nf index c57255f5..207d1e10 100644 --- a/modules/nf-core/modules/sratools/prefetch/main.nf +++ b/modules/nf-core/modules/sratools/prefetch/main.nf @@ -35,7 +35,7 @@ process SRATOOLS_PREFETCH { printf '${config}' > "\${NCBI_SETTINGS}" fi - retry_with_backoff.sh prefetch \\ + prefetch \\ $options.args \\ --progress \\ $id From 0238704dce66dd698f2c85168f9684b3e529d1e4 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 12:38:52 +0000 Subject: [PATCH 089/106] Name output files with EXPID_RUNID_T1 --- bin/sra_runinfo_to_ftp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py index 130007b6..094782f8 100755 --- a/bin/sra_runinfo_to_ftp.py +++ b/bin/sra_runinfo_to_ftp.py @@ -112,7 +112,6 @@ def parse_sra_runinfo(file_in): else: # In some instances, FTP links don't exist for FastQ files. # These have to be downloaded with the run accession using sra-tools. - db_id = row["run_accession"] sample = dict.fromkeys(extensions, None) if row["library_layout"] == "SINGLE": sample["single_end"] = "true" @@ -160,6 +159,8 @@ def sra_runinfo_to_ftp(files_in, file_out): for db_id in sorted(samplesheet): for idx, row in enumerate(samplesheet[db_id], start=1): row["id"] = f"{db_id}_T{idx}" + if 'run_accession' in row: + row["id"] = f"{db_id}_{row['run_accession']}_T{idx}" writer.writerow(row) From 5909b7587842a6ce4fe0ce074c7382b7e779847c Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 12:39:09 +0000 Subject: [PATCH 090/106] Output md5sums for fasterqdump --- conf/modules.config | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0f6c8138..753ab27d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -78,9 +78,16 @@ if (params.input_type == 'sra') { withName: SRATOOLS_FASTERQDUMP { publishDir = [ - path: { "${params.outdir}/fastq" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] ] } From a9b05fd83681123e230553d402605750180fddb9 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 12:39:27 +0000 Subject: [PATCH 091/106] Rename FastQs by meta.id and output md5sums for fasterqdump --- modules/local/sratools_fasterqdump.nf | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/modules/local/sratools_fasterqdump.nf b/modules/local/sratools_fasterqdump.nf index ae69b5e6..f834eb8b 100644 --- a/modules/local/sratools_fasterqdump.nf +++ b/modules/local/sratools_fasterqdump.nf @@ -12,8 +12,9 @@ process SRATOOLS_FASTERQDUMP { tuple val(meta), path(sra) output: - tuple val(meta), path(output), emit: reads - path "versions.yml" , emit: versions + tuple val(meta), path(fastq_output), emit: reads + tuple val(meta), path(md5_output) , emit: md5 + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' @@ -22,7 +23,8 @@ process SRATOOLS_FASTERQDUMP { // Paired-end data extracted by fasterq-dump (--split-3 the default) always creates // *_1.fastq *_2.fastq files but sometimes also an additional *.fastq file // for unpaired reads which we ignore here. - output = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' + fastq_output = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' + md5_output = meta.single_end ? '*.fastq.gz.md5' : '*_{1,2}.fastq.gz.md5' """ eval "\$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" if [[ ! -f "\${NCBI_SETTINGS}" ]]; then @@ -41,6 +43,22 @@ process SRATOOLS_FASTERQDUMP { --processes $task.cpus \\ *.fastq + ## Rename FastQ files by meta.id + if [ -f ${sra.name}.fastq.gz ]; then + mv ${sra.name}.fastq.gz ${meta.id}.fastq.gz + md5sum ${meta.id}.fastq.gz > ${meta.id}.fastq.gz.md5 + fi + + if [ -f ${sra.name}_1.fastq.gz ]; then + mv ${sra.name}_1.fastq.gz ${meta.id}_1.fastq.gz + md5sum ${meta.id}_1.fastq.gz > ${meta.id}_1.fastq.gz.md5 + fi + + if [ -f ${sra.name}_2.fastq.gz ]; then + mv ${sra.name}_2.fastq.gz ${meta.id}_2.fastq.gz + md5sum ${meta.id}_2.fastq.gz > ${meta.id}_2.fastq.gz.md5 + fi + cat <<-END_VERSIONS > versions.yml ${task.process.tokenize(':').last()}: sratools: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') From e4384a3a759ebae2d423610dc2c57b72d649c35b Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 12:40:02 +0000 Subject: [PATCH 092/106] Run custom bash script for retry with prefetch --- modules/local/sratools_prefetch.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/sratools_prefetch.nf b/modules/local/sratools_prefetch.nf index 033aeedf..1edb16a3 100644 --- a/modules/local/sratools_prefetch.nf +++ b/modules/local/sratools_prefetch.nf @@ -26,7 +26,7 @@ process SRATOOLS_PREFETCH { printf '${config}' > "\${NCBI_SETTINGS}" fi - prefetch \\ + retry_with_backoff.sh prefetch \\ $args \\ --progress \\ $id From 8c895a47cbcd39f60ac689084f3cdbdf655d45c6 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 12:57:12 +0000 Subject: [PATCH 093/106] Dump experiment accession by default in mappings files --- nextflow.config | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 6ae6a8da..9e9e2c16 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ params { input_type = 'sra' nf_core_pipeline = null 
ena_metadata_fields = null - sample_mapping_fields = 'run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' + sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' synapse_config = null skip_fastq_download = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 1f923f28..c033f6c4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -43,7 +43,7 @@ "type": "string", "fa_icon": "fas fa-globe-americas", "description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.", - "default": "run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description" + "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description" }, "nf_core_pipeline": { "type": "string", From 4b0c836744613f7f6ee6da80474937fede8079eb Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 13:05:40 +0000 Subject: [PATCH 094/106] Update CHANGELOG and add links to sra-tools --- CHANGELOG.md | 2 +- CITATIONS.md | 2 ++ README.md | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 327cd99b..dde659d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Convert pipeline to updated Nextflow DSL2 syntax for future adoption across nf-core * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). -* SRA identifiers not available for direct download via the ENA FTP will now be downloaded via sra-tools. +* SRA identifiers not available for direct download via the ENA FTP will now be downloaded via [`sra-tools`](https://github.com/ncbi/sra-tools). * Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. * Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. * [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py diff --git a/CITATIONS.md b/CITATIONS.md index 01819d3e..88db364c 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -28,6 +28,8 @@ * [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/) > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084. +* [sra-tools](https://github.com/ncbi/sra-tools) + * [Synapse](https://pubmed.ncbi.nlm.nih.gov/24071850/) > Omberg L, Ellrott K, Yuan Y, Kandoth C, Wong C, Kellen MR, Friend SH, Stuart J, Liang H, Margolin AA. Enabling transparent and collaborative computational analysis of 12 tumor types within The Cancer Genome Atlas. Nat Genet. 2013 Oct;45(10):1121-6. doi: 10.1038/ng.2761. 
PMID: 24071850; PMCID: PMC3950337. diff --git a/README.md b/README.md index a5ffb8a6..cb694fd5 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Via a single file of ids, provided one-per-line (see [example input file](https: 2. Fetch extensive id metadata via ENA API 3. Download FastQ files: - If direct download links are available from the ENA API, fetch in parallel via `curl` and perform `md5sum` check - - Otherwise use `sra-tools` to download `.sra` files and convert them to FastQ + - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet ### Synapse ids From b9f954766aadadcd927717d788205e12819f211b Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 13:08:19 +0000 Subject: [PATCH 095/106] Move sra-tools citation --- CITATIONS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 88db364c..a1d9d91b 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,6 +14,8 @@ * [Requests](https://docs.python-requests.org/) +* [sra-tools](https://github.com/ncbi/sra-tools) + ## Pipeline resources * [ENA](https://pubmed.ncbi.nlm.nih.gov/33175160/) @@ -28,8 +30,6 @@ * [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/) > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084. -* [sra-tools](https://github.com/ncbi/sra-tools) - * [Synapse](https://pubmed.ncbi.nlm.nih.gov/24071850/) > Omberg L, Ellrott K, Yuan Y, Kandoth C, Wong C, Kellen MR, Friend SH, Stuart J, Liang H, Margolin AA. Enabling transparent and collaborative computational analysis of 12 tumor types within The Cancer Genome Atlas. Nat Genet. 2013 Oct;45(10):1121-6. doi: 10.1038/ng.2761. PMID: 24071850; PMCID: PMC3950337. From 7d582eeca51b1e9f224e6ca8f28d25dc0b1df1cf Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 13:12:34 +0000 Subject: [PATCH 096/106] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dde659d5..ff5301b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * SRA identifiers not available for direct download via the ENA FTP will now be downloaded via [`sra-tools`](https://github.com/ncbi/sra-tools). * Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. * Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. +* Rename output FastQ files using `{EXP_ACC}_{RUN_ACC}_T1*fastq.gz` convention for run id provenance * [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). 
See examples below: From 6675b96ca282bf2c0d102048793ca85b652b1ec9 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 14:42:25 +0000 Subject: [PATCH 097/106] Replace *_T* suffix with run accession --- bin/sra_runinfo_to_ftp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py index 094782f8..d9400b2f 100755 --- a/bin/sra_runinfo_to_ftp.py +++ b/bin/sra_runinfo_to_ftp.py @@ -158,9 +158,9 @@ def sra_runinfo_to_ftp(files_in, file_out): writer.writeheader() for db_id in sorted(samplesheet): for idx, row in enumerate(samplesheet[db_id], start=1): - row["id"] = f"{db_id}_T{idx}" + row["id"] = f"{db_id}" if 'run_accession' in row: - row["id"] = f"{db_id}_{row['run_accession']}_T{idx}" + row["id"] = f"{db_id}_{row['run_accession']}" writer.writerow(row) From d11da5602fe65f2c37a43e5d4be418c4408f5fae Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 14:42:46 +0000 Subject: [PATCH 098/106] Fix bug in md5sum output format --- modules/local/sra_fastq_ftp.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index a775849c..dfbe72dd 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -26,7 +26,7 @@ process SRA_FASTQ_FTP { -L ${fastq[0]} \\ -o ${meta.id}.fastq.gz - echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 + echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 md5sum -c ${meta.id}.fastq.gz.md5 cat <<-END_VERSIONS > versions.yml @@ -41,7 +41,7 @@ process SRA_FASTQ_FTP { -L ${fastq[0]} \\ -o ${meta.id}_1.fastq.gz - echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 + echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 md5sum -c ${meta.id}_1.fastq.gz.md5 curl \\ @@ -49,7 +49,7 @@ process SRA_FASTQ_FTP { -L ${fastq[1]} \\ -o ${meta.id}_2.fastq.gz - echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 + echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 md5sum -c ${meta.id}_2.fastq.gz.md5 cat <<-END_VERSIONS > versions.yml From 73eb06ce77366a5d36b5e3088f3d83806a639a87 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 14:44:44 +0000 Subject: [PATCH 099/106] Update CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff5301b4..04e4c46b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * SRA identifiers not available for direct download via the ENA FTP will now be downloaded via [`sra-tools`](https://github.com/ncbi/sra-tools). * Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. * Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. -* Rename output FastQ files using `{EXP_ACC}_{RUN_ACC}_T1*fastq.gz` convention for run id provenance +* Name output FastQ files by `{EXP_ACC}_{RUN_ACC}*fastq.gz` instead of `{EXP_ACC}_{T*}*fastq.gz` for run id provenance * [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py * Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). 
See examples below: From 04d0de247bd0284c492e698475950923e7de6fa1 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:05:32 +0000 Subject: [PATCH 100/106] Add --force_sratools_download parameter --- CHANGELOG.md | 3 ++- nextflow.config | 1 + nextflow_schema.json | 5 +++++ workflows/sra.nf | 7 +++++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04e4c46b..0b89d9ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,13 +3,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.4](https://github.com/nf-core/fetchngs/releases/tag/1.4)] - 2021-10-25 +## [[1.4](https://github.com/nf-core/fetchngs/releases/tag/1.4)] - 2021-11-09 ### Enhancements & fixes * Convert pipeline to updated Nextflow DSL2 syntax for future adoption across nf-core * Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). * SRA identifiers not available for direct download via the ENA FTP will now be downloaded via [`sra-tools`](https://github.com/ncbi/sra-tools). +* Added `--force_sratools_download` parameter to preferentially download all FastQ files via `sra-tools` instead of ENA FTP. * Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. * Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. * Name output FastQ files by `{EXP_ACC}_{RUN_ACC}*fastq.gz` instead of `{EXP_ACC}_{T*}*fastq.gz` for run id provenance diff --git a/nextflow.config b/nextflow.config index 9e9e2c16..1fb304d3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { ena_metadata_fields = null sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' synapse_config = null + force_sratools_download = false skip_fastq_download = false // Boilerplate options diff --git a/nextflow_schema.json b/nextflow_schema.json index c033f6c4..2831dac7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -50,6 +50,11 @@ "fa_icon": "fab fa-apple", "description": "Name of supported nf-core pipeline e.g. 'rnaseq'. A samplesheet for direct use with the pipeline will be created with the appropriate columns." }, + "force_sratools_download": { + "type": "boolean", + "fa_icon": "fas fa-tools", + "description": "Force download FastQ files via sra-tools instead of via the ENA FTP." 
+ }, "skip_fastq_download": { "type": "boolean", "fa_icon": "fas fa-fast-forward", diff --git a/workflows/sra.nf b/workflows/sra.nf index c108812f..c5fec775 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -70,13 +70,14 @@ workflow SRA { } .unique() .branch { - ftp: it[0].fastq_1 - sra: !it[0].fastq_1 + ftp: it[0].fastq_1 && !params.force_sratools_download + sra: !it[0].fastq_1 || params.force_sratools_download } .set { ch_sra_reads } ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) if (!params.skip_fastq_download) { + // // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums // @@ -85,7 +86,9 @@ workflow SRA { ) ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) + // // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. + // SRA_FASTQ ( ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) From c1e8040905fc0e5886d425177ef9ae1803d25caa Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:06:46 +0000 Subject: [PATCH 101/106] Add CI test for --force_sratools_download --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6b67751c..4b3fe636 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,7 @@ jobs: "--nf_core_pipeline rnaseq", "--ena_metadata_fields run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5 --sample_mapping_fields run_accession,library_layout", --skip_fastq_download, + --force_sratools_download ] steps: - name: Check out pipeline code From 4068b2949a0fe0dfcafe407e68a7e2e5ac48d61f Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:22:59 +0000 Subject: [PATCH 102/106] Remove tool specific options section for now --- docs/usage.md | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 6af310a6..3cb17fcd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -186,42 +186,6 @@ process { > **NB:** We specify just the process name i.e. `STAR_ALIGN` in the config file and not the full task name string that is printed to screen in the error message or on the terminal whilst the pipeline is running i.e. `RNASEQ:ALIGN_STAR:STAR_ALIGN`. You may get a warning suggesting that the process selector isn't recognised but you can ignore that if the process name has been specified correctly. This is something that needs to be fixed upstream in core Nextflow. -### Tool-specific options - -For the ultimate flexibility, we have implemented and are using Nextflow DSL2 modules in a way where it is possible for both developers and users to change tool-specific command-line arguments (e.g. providing an additional command-line argument to the `STAR_ALIGN` process) as well as publishing options (e.g. saving files produced by the `STAR_ALIGN` process that aren't saved by default by the pipeline). In the majority of instances, as a user you won't have to change the default options set by the pipeline developer(s), however, there may be edge cases where creating a simple custom config file can improve the behaviour of the pipeline if for example it is failing due to a weird error that requires setting a tool-specific parameter to deal with smaller / larger genomes. 
- -The command-line arguments passed to STAR in the `STAR_ALIGN` module are a combination of: - -* Mandatory arguments or those that need to be evaluated within the scope of the module, as supplied in the [`script`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L49-L55) section of the module file. - -* An [`options.args`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L56) string of non-mandatory parameters that is set to be empty by default in the module but can be overwritten when including the module in the sub-workflow / workflow context via the `addParams` Nextflow option. - -The nf-core/rnaseq pipeline has a sub-workflow (see [terminology](https://github.com/nf-core/modules#terminology)) specifically to align reads with STAR and to sort, index and generate some basic stats on the resulting BAM files using SAMtools. At the top of this file we import the `STAR_ALIGN` module via the Nextflow [`include`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L10) keyword and by default the options passed to the module via the `addParams` option are set as an empty Groovy map [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L5); this in turn means `options.args` will be set to empty by default in the module file too. This is an intentional design choice and allows us to implement well-written sub-workflows composed of a chain of tools that by default run with the bare minimum parameter set for any given tool in order to make it much easier to share across pipelines and to provide the flexibility for users and developers to customise any non-mandatory arguments. - -When including the sub-workflow above in the main pipeline workflow we use the same `include` statement, however, we now have the ability to overwrite options for each of the tools in the sub-workflow including the [`align_options`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L225) variable that will be used specifically to overwrite the optional arguments passed to the `STAR_ALIGN` module. In this case, the options to be provided to `STAR_ALIGN` have been assigned sensible defaults by the developer(s) in the pipeline's [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L70-L74) and can be accessed and customised in the [workflow context](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L201-L204) too before eventually passing them to the sub-workflow as a Groovy map called `star_align_options`. These options will then be propagated from `workflow -> sub-workflow -> module`. - -As mentioned at the beginning of this section it may also be necessary for users to overwrite the options passed to modules to be able to customise specific aspects of the way in which a particular tool is executed by the pipeline. Given that all of the default module options are stored in the pipeline's `modules.config` as a [`params` variable](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L24-L25) it is also possible to overwrite any of these options via a custom config file. 
- -Say for example we want to append an additional, non-mandatory parameter (i.e. `--outFilterMismatchNmax 16`) to the arguments passed to the `STAR_ALIGN` module. Firstly, we need to copy across the default `args` specified in the [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L71) and create a custom config file that is a composite of the default `args` as well as the additional options you would like to provide. This is very important because Nextflow will overwrite the default value of `args` that you provide via the custom config. - -As you will see in the example below, we have: - -* appended `--outFilterMismatchNmax 16` to the default `args` used by the module. -* changed the default `publish_dir` value to where the files will eventually be published in the main results directory. -* appended `'bam':''` to the default value of `publish_files` so that the BAM files generated by the process will also be saved in the top-level results directory for the module. Note: `'out':'log'` means any file/directory ending in `out` will now be saved in a separate directory called `my_star_directory/log/`. - -```nextflow -params { - modules { - 'star_align' { - args = "--quantMode TranscriptomeSAM --twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat --runRNGseed 0 --outFilterMultimapNmax 20 --alignSJDBoverhangMin 1 --outSAMattributes NH HI AS NM MD --quantTranscriptomeBan Singleend --outFilterMismatchNmax 16" - publish_dir = "my_star_directory" - publish_files = ['out':'log', 'tab':'log', 'bam':''] - } - } -} -``` - ### Updating containers The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. 
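To make the container-override pattern described above concrete, a minimal custom config might look like the sketch below. The process selector name and the container tag are illustrative assumptions for this example rather than values taken from this patch series; check the actual process name in the pipeline and the tags available on the registry before using it.

```nextflow
// custom.config: a minimal sketch of overriding the container for a single process.
// `PANGOLIN` and the image tag below are hypothetical/illustrative values.
process {
    withName: PANGOLIN {
        container = 'quay.io/biocontainers/pangolin:3.1.14--pyhdfd78af_0'
    }
}
```

The file would then be supplied at runtime, e.g. `nextflow run nf-core/viralrecon -profile docker -c custom.config`.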
From cacf5aaf0a8763862892c68b7b86b80444c36634 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:33:43 +0000 Subject: [PATCH 103/106] Update test ids link in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb694fd5..58e4a8a7 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ## Pipeline summary -Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/public_database_ids.txt)) the pipeline performs the following steps: +Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt)) the pipeline performs the following steps: ### SRA / ENA / DDBJ / GEO ids From 021bdad14c0d7af9c0b864aa1514729778f7f616 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:43:01 +0000 Subject: [PATCH 104/106] Rename subworkflow to sra_fastq_sratools.nf --- subworkflows/local/{sra_fastq.nf => sra_fastq_sratools.nf} | 2 +- workflows/sra.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename subworkflows/local/{sra_fastq.nf => sra_fastq_sratools.nf} (96%) diff --git a/subworkflows/local/sra_fastq.nf b/subworkflows/local/sra_fastq_sratools.nf similarity index 96% rename from subworkflows/local/sra_fastq.nf rename to subworkflows/local/sra_fastq_sratools.nf index 04039f79..e093fe2f 100644 --- a/subworkflows/local/sra_fastq.nf +++ b/subworkflows/local/sra_fastq_sratools.nf @@ -5,7 +5,7 @@ include { SRATOOLS_PREFETCH } from '../../modules/local/sratools_prefetch.nf' include { SRATOOLS_FASTERQDUMP } from '../../modules/local/sratools_fasterqdump.nf' -workflow SRA_FASTQ { +workflow SRA_FASTQ_SRATOOLS { take: sra_ids // channel: [ val(meta), val(id) ] diff --git a/workflows/sra.nf b/workflows/sra.nf index c5fec775..8600aa00 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -22,7 +22,7 @@ WorkflowSra.initialise(params, log, valid_params) include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' -include { SRA_FASTQ } from '../subworkflows/local/sra_fastq' +include { SRA_FASTQ_SRATOOLS } from '../subworkflows/local/sra_fastq_sratools' include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet' include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet' include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config' @@ -89,7 +89,7 @@ workflow SRA { // // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. 
// - SRA_FASTQ ( + SRA_FASTQ_SRATOOLS ( ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) ch_versions = ch_versions.mix(SRA_FASTQ.out.versions.first()) From e5e59e982ceb0322c62ec5975cfbb3f790d80121 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 8 Nov 2021 15:46:25 +0000 Subject: [PATCH 105/106] Fix tests --- workflows/sra.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index 8600aa00..4012a571 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -92,13 +92,13 @@ workflow SRA { SRA_FASTQ_SRATOOLS ( ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } ) - ch_versions = ch_versions.mix(SRA_FASTQ.out.versions.first()) + ch_versions = ch_versions.mix(SRA_FASTQ_SRATOOLS.out.versions.first()) // // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet // SRA_TO_SAMPLESHEET ( - SRA_FASTQ_FTP.out.fastq.mix(SRA_FASTQ.out.reads), + SRA_FASTQ_FTP.out.fastq.mix(SRA_FASTQ_SRATOOLS.out.reads), params.nf_core_pipeline ?: '', params.sample_mapping_fields ) From b0363de5626ce526a889b2c5ef6e6f2d9bb49991 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 9 Nov 2021 12:11:18 +0000 Subject: [PATCH 106/106] Update DSL2 docs in CONTRIBUTING.md --- .github/CONTRIBUTING.md | 54 ++++++++++------------------------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 888cff34..f294492d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -61,21 +61,16 @@ For further information/help, please consult the [nf-core/fetchngs documentation To make the nf-core/fetchngs code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. -### Adding a new step - -If you wish to contribute a new step, please use the following coding standards: - -1. Define the corresponding input channel into your new process from the expected previous process channel -2. Write the process block (see below). -3. Define the output channel if needed (see below). -4. Add any new flags/options to `nextflow.config` with a default (see below). -5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build`). -6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter). -7. Add sanity checks for all relevant parameters. -8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`. -9. Do local tests that the new code works properly and as expected. -10. Add a new test command in `.github/workflow/ci.yml`. -11. Add any descriptions of output files to `docs/output.md`. +### Adding a new step or module + +If you wish to contribute a new step or module, please see the [official guidelines](https://nf-co.re/developers/adding_modules#new-module-guidelines-and-pr-review-checklist) and use the following coding standards: + +1. Add any new flags/options to `nextflow.config` with a default (see section below). +2. Add any new flags/options to `nextflow_schema.json` with help text via `nf-core schema build`. +3. Add sanity checks for all relevant parameters. +4. Perform local tests to validate that the new code works as expected. +5. If applicable, add a new test command in `.github/workflow/ci.yml`. +6. 
Add any descriptions of output files to `docs/output.md`. ### Default values Once there, use `nf-core schema build` to add to `nextflow_schema.json`. Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generically with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. An nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/conf/base.config), which has the default process as a single-core process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels (a minimal sketch is shown at the end of this document). -The process resources can be passed on to the tool dynamically within the process with the `${task.cpu}` and `${task.memory}` variables in the `script:` block. - -### Naming schemes +### Channel naming convention Please use the following naming schemes, to make it easy to understand what is going where. -* initial process channel: `ch_output_from_` -* intermediate and terminal channels: `ch__for_` +* Initial process channel: `ch_output_from_` +* Intermediate and terminal channels: `ch__for_` ### Nextflow version bumping If you are using a new feature from core Nextflow, you may bump the minimum required version of Nextflow in the pipeline with: `nf-core bump-version --nextflow . [min-nf-version]` -### Software version reporting - -If you add a new tool to the pipeline, please ensure you add the information of the tool to the `get_software_version` process. - -Add to the script block of the process, something like the following: - -```bash - --version &> v_.txt 2>&1 || true -``` - -or - -```bash - --help | head -n 1 &> v_.txt 2>&1 || true -``` - -You then need to edit the script `bin/scrape_software_versions.py` to: - -1. Add a Python regex for your tool's `--version` output (as in stored in the `v_.txt` file), to ensure the version is reported as a `v` and the version number e.g. `v2.1.1` -2. Add a HTML entry to the `OrderedDict` for formatting in MultiQC. - ### Images and figures For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines).
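As referenced in the resource-requirements note above, here is the minimal `conf/base.config` sketch. The `process_medium` label follows the nf-core template convention, but the specific CPU/memory/time values are illustrative assumptions, not values copied from this repository.

```nextflow
// conf/base.config: sketch of generic resource defaults shared via labels.
process {
    // Default: a single-core process
    cpus   = 1
    memory = 6.GB
    time   = 4.h

    // Applied to any process that declares `label 'process_medium'`
    withLabel: process_medium {
        cpus   = 6
        memory = 36.GB
        time   = 8.h
    }
}
```

A module opts in with `label 'process_medium'` in its process definition, which keeps tool code decoupled from site-specific resource settings.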