diff --git a/README.md b/README.md index 1809a54..6220c6e 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,17 @@ To run on paired or unpaired Ion Torrent reads, use either of the above commands, but with the option `--tech iontorrent`. +You can run on a batch of reads, using a file +`runs.txt` of INSDC run accessions. +This will download each run from the ENA and run Viridian on it: +``` +viridian dl_and_run --acc_file runs.txt --outdir OUT +``` +The sequencing tech and unpaired/paired is taken from the ENA metadata +for each run. + + + ## Output files The default files in the output directory are: @@ -95,6 +106,6 @@ to the reference will also be present, called used to build trees. It has the consensus sequence aligned to the reference genome, but with insertions in the consensus ignored and deletions replaced with the reference sequence. -* `--ena_run RUN_ID`: using this option will download the specified reads +* `--run_accession RUN_ID`: using this option will download the specified reads from the ENA, and infer the `--tech` option from the ENA metadata diff --git a/tests/utils_test.py b/tests/utils_test.py index 6e3de25..e742bd2 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -48,13 +48,13 @@ def test_check_tech_and_reads_options(): options.reads1 = None options.reads2 = None options.tech = None - options.ena_run = None + options.run_accession = None with pytest.raises(Exception): f(options) - options.ena_run = "ERR12345" + options.run_accession = "ERR12345" assert f(options) - options.ena_run = None + options.run_accession = None options.tech = "ont" with pytest.raises(Exception): @@ -79,7 +79,7 @@ def test_check_tech_and_reads_options(): options.reads1 = None options.reads2 = None options.tech = None - options.ena_run = None + options.run_accession = None with pytest.raises(Exception): f(options) options.tech = tech diff --git a/viridian/__main__.py b/viridian/__main__.py index ef968a7..ff5b6c4 100644 --- a/viridian/__main__.py +++ 
b/viridian/__main__.py @@ -24,16 +24,19 @@ def main(args=None): action="store_true", ) common_parser.add_argument( + "--force", + action="store_true", + help="Overwrite output directory, if it already exists. Use with caution!", + ) + + # -------------- outdir -------------------------------------------------- + outdir_parser = argparse.ArgumentParser(add_help=False) + outdir_parser.add_argument( "--outdir", help="REQUIRED. Name of output directory (will be created). This must not exist already, unless the --force option is used to overwrite", required=True, metavar="FILENAME", ) - common_parser.add_argument( - "--force", - action="store_true", - help="Overwrite output directory, if it already exists. Use with caution!", - ) # -------------- amplicons options --------------------------------------- amplicons_parser = argparse.ArgumentParser(add_help=False) @@ -54,13 +57,13 @@ def main(args=None): # --------------------- reads options ------------------------------------ reads_parser = argparse.ArgumentParser(add_help=False) reads_parser.add_argument( - "--ena_run", - help="ENA run accession. Reads will be downloaded and --tech determined from ENA metadata", + "--run_accession", + help="INSDC run accession. Reads will be downloaded from the ENA and --tech determined from ENA metadata", metavar="RUN_ID", ) reads_parser.add_argument( - "--keep_ena_reads", - help="Keep reads downloaded from the ENA. Only relevant if --ena_run is used, otherwise is ignored", + "--keep_downloaded_reads", + help="Keep reads downloaded from the ENA. Only relevant if --run_accession is used, otherwise is ignored", action="store_true", ) reads_parser.add_argument( @@ -95,71 +98,55 @@ def main(args=None): choices=tech_choices, help=f"Sequencing technology, currently supported: {','.join(tech_choices)}", ) - reads_epilog = "IMPORTANT: --tech, --outdir are REQUIRED. Reads files are required, and depend on the --tech option. 
Use one of: 1) '--tech illumina|ont|iontorrent --reads reads.fq'; 2) '--tech illumina|iontorrent --reads1 reads1.fq --reads2 reads2.fq'; 3) (with any tech) --reads_bam reads.bam" + reads_epilog = "IMPORTANT: --outdir is REQUIRED. Unless --run_accession is used, reads files are required, and depend on the --tech option. Use one of: 1) '--tech illumina|ont|iontorrent --reads reads.fq'; 2) '--tech illumina|iontorrent --reads1 reads1.fq --reads2 reads2.fq'; 3) (with any tech) --reads_bam reads.bam" - # --------------------------- ref options -------------------------------- - ref_parser = argparse.ArgumentParser(add_help=False) - ref_parser.add_argument( - "--ref_fasta", - help="FASTA file of reference genome. Default is to use built-in MN908947.3 SARS-CoV-2 reference genome. Only use this if you know what you are doing, since the amplicon scheme(s) must match this reference [%(default)s]", - metavar="FILENAME", - default=viridian.amplicon_schemes.REF_FASTA, - ) - - # ------------------------ run_one_sample ---------------------------- - subparser_run_one_sample = subparsers.add_parser( - "run_one_sample", - parents=[common_parser, amplicons_parser, reads_parser, ref_parser], - help="Run the complete pipeline on one sample", - usage=f"viridian run_one_sample [options] --tech {'|'.join(tech_choices)} --outdir out ", - description="Run the complete pipeline on one sample", - epilog=reads_epilog, - ) - subparser_run_one_sample.add_argument( + # --------------------advanced options ----------------------------------- + advanced_parser = argparse.ArgumentParser(add_help=False) + advanced_parser.add_argument( "--decontam", help="Decontaminate input reads at start of pipeline, using ReadItAndKeep with the provided reference genome FASTA file (use '--decontam COVID' to use the MN908947.3 SARS-CoV-2 reference genome with poly-A removed). 
Incompatible with --reads_bam", metavar="FILENAME", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--keep_bam", action="store_true", help="Keep BAM file of original reads mapped to reference genome (it is deleted by default)", ) msa_choices = sorted(list(viridian.constants.MSA_CHOICES)) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--write_msa", action="append", choices=msa_choices, help=f"MSA(s) to write. Use this option once for each MSA type to write. Each written to separate FASTA file. Choose from: {';'.join(msa_choices)}", metavar="MSA_TYPE", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--force_amp_scheme", help="Force choice of amplicon scheme. The value provided must exactly match a built-in name or a name in file given by --amp_schemes_tsv", metavar="STRING", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--min_scheme_score", type=int, default=250, help="Minimum score required for a matching scheme. If all schemes are less than this cutoff, the pipeline is stopped unless --force_amp_scheme is used [%(default)s]", metavar="INT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--max_scheme_ratio", type=float, default=0.5, help="Maximum allowed value of (second best scheme score) / (best scheme score). 
See also --min_scheme_score [%(default)s]", metavar="FLOAT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--assemble_depth", type=int, help="Target read depth for each amplicon when assembling [%(default)s]", metavar="INT", default=100, ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--qc_depth", type=int, default=1000, @@ -169,80 +156,137 @@ def main(args=None): frs_default_str = "; ".join( [f"{k}:{v}" for k, v in viridian.constants.TECH2FRS.items()] ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--masking_min_frs", type=float, help=f"Consensus positions with a fraction of read support less than the given number are masked. Default depends on the --tech option [{frs_default_str}]", metavar="FLOAT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--het_min_pc", type=float, help="Cutoff for when to use ambiguous IUPAC codes. With --het_min_pc X, if two or more alleles each have support of more than X percent of reads, then an ambiguous IUPAC code is used and the position is given the 'HET' filter [%(default)s]", metavar="FLOAT", default=20.0, ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--masking_min_depth", type=int, default=20, help="Consensus positions with total read depth less than the given number are masked [%(default)s]", metavar="INT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--coverage_min_x", default=20, type=int, help="With options --coverage_min_x X --coverage_min_pc Y, after initial read mapping to the reference, if less than Y percent of the genome has at least X read coverage, then the pipeline is stopped. 
Default is to require at least 50 percent of the genome with at least 20X read depth", metavar="INT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--coverage_min_pc", default=50, type=float, help="Please see the help for --coverage_min_x", metavar="FLOAT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--max_percent_amps_fail", type=float, default=50.0, help="Maximum percent of amplicons allowed to fail during read sampling or making consensus for each amplicon. The pipeline is stopped as soon as too many failed amplicons are detected [%(default)s]", metavar="FLOAT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--max_cons_n_percent", type=float, default=50.0, help="Maximum allowed percentage of Ns in the consensus sequence. Pipeline is stopped as soon as too many Ns in current consensus [%(default)s]", metavar="FLOAT", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--detect_scheme_only", action="store_true", help="Map the reads and detect the amplicon scheme, then stop the pipeline instead of making a consensus etc", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--force_consensus", help="Use the provided FASTA for the consensus, instead of inferring it from the reads. This option is useful to evaluate the quality of an existing consensus sequence using the reads that made it", metavar="FILENAME", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--tmp_dir", help="Directory in which to put a temporary directory for intermediate files that are not kept. The dir given by --tmp_dir must already exist. Default is platform dependent, and uses python's tempdir default locations (eg for linux, looks for /tmp first). 
If you use the --debug option, then the --tmp_dir option is ignored and a directory called 'Processing' is used in the output directory", metavar="DIRNAME", ) - subparser_run_one_sample.add_argument( + advanced_parser.add_argument( "--no_gzip", help="Use this to not gzip the final output FASTA file and log.json file", action="store_true", ) + + # --------------------------- ref options -------------------------------- + ref_parser = argparse.ArgumentParser(add_help=False) + ref_parser.add_argument( + "--ref_fasta", + help="FASTA file of reference genome. Default is to use built-in MN908947.3 SARS-CoV-2 reference genome. Only use this if you know what you are doing, since the amplicon scheme(s) must match this reference [%(default)s]", + metavar="FILENAME", + default=viridian.amplicon_schemes.REF_FASTA, + ) + + # ------------------------- dl_and_run options ----------------------- + dl_and_run_parser = argparse.ArgumentParser(add_help=False) + dl_and_run_parser.add_argument( + "--acc_file", + help="REQUIRED. File containing run accessions, one run per line", + metavar="FILENAME", + required=True, + ) + dl_and_run_parser.add_argument( + "--outdir", + help="REQUIRED. Name of output directory. 
If it already exists, will assume it is from a previous run of `viridian dl_and_run`, keep existing runs of viridian and only run samples that did not finish", + required=True, + metavar="FILENAME", + ) + + # ------------------------ run_one_sample ---------------------------- + subparser_run_one_sample = subparsers.add_parser( + "run_one_sample", + parents=[ + common_parser, + outdir_parser, + amplicons_parser, + reads_parser, + ref_parser, + advanced_parser, + ], + help="Run the complete pipeline on one sample", + usage=f"viridian run_one_sample [options] --tech {'|'.join(tech_choices)} --outdir out ", + description="Run the complete pipeline on one sample", + epilog=reads_epilog, + ) subparser_run_one_sample.set_defaults(func=viridian.tasks.run_one_sample.run) + # ------------------------ dl_and_run -------------------------------- + subparser_dl_and_run = subparsers.add_parser( + "dl_and_run", + parents=[ + dl_and_run_parser, + common_parser, + amplicons_parser, + ref_parser, + advanced_parser, + ], + help="Download reads from the ENA and run the pipeline, for multiple sequencing runs", + usage="viridian dl_and_run [options] --acc_file foo.txt --outdir out", + description="Download reads from the ENA and run the pipeline, for multiple sequencing runs", + ) + subparser_dl_and_run.set_defaults(func=viridian.tasks.dl_and_run.run) + # ------------------------ sim_schemes ------------------------------- subparser_sim_schemes = subparsers.add_parser( "sim_schemes", - parents=[amplicons_parser, common_parser, ref_parser], + parents=[amplicons_parser, common_parser, outdir_parser, ref_parser], help="For each scheme, simulate fragments and run scheme id code", usage="viridian sim_schemes [options] --outdir ", description="For each scheme, simulate fragments and run scheme id code", diff --git a/viridian/tasks/__init__.py b/viridian/tasks/__init__.py index 04dba80..ddbd441 100644 --- a/viridian/tasks/__init__.py +++ b/viridian/tasks/__init__.py @@ -1,4 +1,5 @@ __all__ = 
[ + "dl_and_run", "run_one_sample", "sim_schemes", ]
diff --git a/viridian/tasks/dl_and_run.py b/viridian/tasks/dl_and_run.py
new file mode 100644
index 0000000..de010c1
--- /dev/null
+++ b/viridian/tasks/dl_and_run.py
@@ -0,0 +1,84 @@
+import logging
+import os
+
+from viridian import tasks, utils
+
+
+def viridian_finished(indir):
+    """Return True if a viridian run in directory `indir` completed.
+
+    Reads the run log from `indir` and checks that the last completed
+    stage recorded in it is "Finished". A missing, unreadable or
+    incomplete log counts as not finished.
+    """
+    # log.json.gz is the default log name. The --no_gzip option leaves the
+    # log uncompressed, presumably as log.json, so try that name as well.
+    for log_name in ("log.json.gz", "log.json"):
+        try:
+            log = utils.load_json(os.path.join(indir, log_name))
+            last_stage = log["run_summary"]["last_stage_completed"]
+        except Exception:
+            continue
+        if last_stage == "Finished":
+            return True
+    return False
+
+
+def run(options):
+    """Download and run the pipeline on each run accession in a file.
+
+    Reads run accessions (one per line) from options.acc_file, then runs
+    the single-sample pipeline on each one, writing results to a
+    directory named after the accession inside options.outdir. Runs that
+    already finished in an existing output directory are skipped. A
+    failure of one run is logged and does not stop the remaining runs.
+
+    Note that `options` is modified in place.
+    """
+    with open(options.acc_file) as f:
+        # Ignore blank lines, so an empty string is never used as an
+        # accession (and therefore as an output directory name)
+        run_accessions = [x.strip() for x in f if x.strip()]
+    logging.info(f"{len(run_accessions)} runs found in file {options.acc_file}")
+
+    main_outdir = options.outdir
+    if not os.path.exists(main_outdir):
+        logging.info(f"Making main output directory {main_outdir}")
+        os.mkdir(main_outdir)
+    else:
+        logging.info(f"Existing output directory found, using it: {main_outdir}")
+
+    # Reads always come from the ENA, so clear all local reads options and
+    # let the tech be taken from the ENA metadata. Using force means an
+    # unfinished run directory from a previous attempt gets overwritten
+    options.keep_downloaded_reads = False
+    options.reads1 = None
+    options.reads2 = None
+    options.reads = None
+    options.reads_bam = None
+    options.tech = None
+    options.force = True
+
+    for i, run_accession in enumerate(run_accessions):
+        message = f" STARTING {i+1}/{len(run_accessions)} {run_accession} "
+        logging.info(f"{message:#^60}")
+        # Use a path inside the main output directory instead of changing
+        # the working directory, so that relative paths given to other
+        # options (eg --ref_fasta) still work
+        options.outdir = os.path.join(main_outdir, run_accession)
+        options.run_accession = run_accession
+        options.sample_name = run_accession
+        if viridian_finished(options.outdir):
+            logging.info(f"Already done, skipping: {run_accession}")
+        else:
+            try:
+                tasks.run_one_sample.run(options)
+            except (Exception, SystemExit):
+                # Keep going with the other runs, but let the user stop the
+                # whole batch with ctrl-c (KeyboardInterrupt is not caught)
+                logging.warning(
+                    f"Something went wrong: {run_accession}", exc_info=True
+                )
+
+        message = f" FINISHED {run_accession} "
+        logging.info(f"{message:#^60}")
diff --git a/viridian/tasks/run_one_sample.py b/viridian/tasks/run_one_sample.py index f858ece..76d6b60 100644 --- a/viridian/tasks/run_one_sample.py +++ b/viridian/tasks/run_one_sample.py @@ -19,8 +19,8 @@ def run(options): options.tech, options.outdir, options.ref_fasta, - ena_run=options.ena_run, - keep_ena_reads=options.keep_ena_reads, + ena_run=options.run_accession, +
keep_ena_reads=options.keep_downloaded_reads, reads_file1=options.reads1, reads_file2=options.reads2, reads_file=options.reads, diff --git a/viridian/utils.py b/viridian/utils.py index 1434834..8b9033d 100644 --- a/viridian/utils.py +++ b/viridian/utils.py @@ -3,7 +3,6 @@ import logging import os import subprocess -import sys import pyfastaq @@ -84,7 +83,7 @@ def load_single_seq_fasta(infile): def check_tech_and_reads_options(args): - if args.ena_run is not None: + if args.run_accession is not None: return True if getattr(args, "tech") is None: