Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add task dl_and_run; rename ena options #110

Merged
merged 2 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ To run on paired or unpaired Ion Torrent reads, use either of the
above commands, but with the option `--tech iontorrent`.


You can run on a batch of reads, using a file
`runs.txt` of INSDC run accessions.
This will download each run from the ENA and run Viridian on it:
```
viridian dl_and_run --acc_file runs.txt --outdir OUT
```
The sequencing technology, and whether the reads are paired or unpaired,
are taken from the ENA metadata for each run.



## Output files

The default files in the output directory are:
Expand Down Expand Up @@ -95,6 +106,6 @@ to the reference will also be present, called
used to build trees. It has the consensus sequence aligned to the
reference genome, but with insertions in the consensus ignored and
deletions replaced with the reference sequence.
* `--ena_run RUN_ID`: using this option will download the specified reads
* `--run_accession RUN_ID`: using this option will download the specified reads
from the ENA, and infer the `--tech` option from the ENA metadata

8 changes: 4 additions & 4 deletions tests/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ def test_check_tech_and_reads_options():
options.reads1 = None
options.reads2 = None
options.tech = None
options.ena_run = None
options.run_accession = None
with pytest.raises(Exception):
f(options)

options.ena_run = "ERR12345"
options.run_accession = "ERR12345"
assert f(options)
options.ena_run = None
options.run_accession = None

options.tech = "ont"
with pytest.raises(Exception):
Expand All @@ -79,7 +79,7 @@ def test_check_tech_and_reads_options():
options.reads1 = None
options.reads2 = None
options.tech = None
options.ena_run = None
options.run_accession = None
with pytest.raises(Exception):
f(options)
options.tech = tech
Expand Down
140 changes: 92 additions & 48 deletions viridian/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,19 @@ def main(args=None):
action="store_true",
)
common_parser.add_argument(
"--force",
action="store_true",
help="Overwrite output directory, if it already exists. Use with caution!",
)

# -------------- outdir --------------------------------------------------
outdir_parser = argparse.ArgumentParser(add_help=False)
outdir_parser.add_argument(
"--outdir",
help="REQUIRED. Name of output directory (will be created). This must not exist already, unless the --force option is used to overwrite",
required=True,
metavar="FILENAME",
)
common_parser.add_argument(
"--force",
action="store_true",
help="Overwrite output directory, if it already exists. Use with caution!",
)

# -------------- amplicons options ---------------------------------------
amplicons_parser = argparse.ArgumentParser(add_help=False)
Expand All @@ -54,13 +57,13 @@ def main(args=None):
# --------------------- reads options ------------------------------------
reads_parser = argparse.ArgumentParser(add_help=False)
reads_parser.add_argument(
"--ena_run",
help="ENA run accession. Reads will be downloaded and --tech determined from ENA metadata",
"--run_accession",
help="INSDC run accession. Reads will be downloaded from the ENA and --tech determined from ENA metadata",
metavar="RUN_ID",
)
reads_parser.add_argument(
"--keep_ena_reads",
help="Keep reads downloaded from the ENA. Only relevant if --ena_run is used, otherwise is ignored",
"--keep_downloaded_reads",
help="Keep reads downloaded from the ENA. Only relevant if --run_accession is used, otherwise is ignored",
action="store_true",
)
reads_parser.add_argument(
Expand Down Expand Up @@ -95,71 +98,55 @@ def main(args=None):
choices=tech_choices,
help=f"Sequencing technology, currently supported: {','.join(tech_choices)}",
)
reads_epilog = "IMPORTANT: --tech, --outdir are REQUIRED. Reads files are required, and depend on the --tech option. Use one of: 1) '--tech illumina|ont|iontorrent --reads reads.fq'; 2) '--tech illumina|iontorrent --reads1 reads1.fq --reads2 reads2.fq'; 3) (with any tech) --reads_bam reads.bam"
reads_epilog = "IMPORTANT: --outdir is REQUIRED. Unless --run_accession is used, reads files are required, and depend on the --tech option. Use one of: 1) '--tech illumina|ont|iontorrent --reads reads.fq'; 2) '--tech illumina|iontorrent --reads1 reads1.fq --reads2 reads2.fq'; 3) (with any tech) --reads_bam reads.bam"

# --------------------------- ref options --------------------------------
ref_parser = argparse.ArgumentParser(add_help=False)
ref_parser.add_argument(
"--ref_fasta",
help="FASTA file of reference genome. Default is to use built-in MN908947.3 SARS-CoV-2 reference genome. Only use this if you know what you are doing, since the amplicon scheme(s) must match this reference [%(default)s]",
metavar="FILENAME",
default=viridian.amplicon_schemes.REF_FASTA,
)

# ------------------------ run_one_sample ----------------------------
subparser_run_one_sample = subparsers.add_parser(
"run_one_sample",
parents=[common_parser, amplicons_parser, reads_parser, ref_parser],
help="Run the complete pipeline on one sample",
usage=f"viridian run_one_sample [options] --tech {'|'.join(tech_choices)} --outdir out <reads options (see help)>",
description="Run the complete pipeline on one sample",
epilog=reads_epilog,
)
subparser_run_one_sample.add_argument(
# --------------------advanced options -----------------------------------
advanced_parser = argparse.ArgumentParser(add_help=False)
advanced_parser.add_argument(
"--decontam",
help="Decontaminate input reads at start of pipeline, using ReadItAndKeep with the provided reference genome FASTA file (use '--decontam COVID' to use the MN908947.3 SARS-CoV-2 reference genome with poly-A removed). Incompatible with --reads_bam",
metavar="FILENAME",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--keep_bam",
action="store_true",
help="Keep BAM file of original reads mapped to reference genome (it is deleted by default)",
)
msa_choices = sorted(list(viridian.constants.MSA_CHOICES))
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--write_msa",
action="append",
choices=msa_choices,
help=f"MSA(s) to write. Use this option once for each MSA type to write. Each written to separate FASTA file. Choose from: {';'.join(msa_choices)}",
metavar="MSA_TYPE",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--force_amp_scheme",
help="Force choice of amplicon scheme. The value provided must exactly match a built-in name or a name in file given by --amp_schemes_tsv",
metavar="STRING",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--min_scheme_score",
type=int,
default=250,
help="Minimum score required for a matching scheme. If all schemes are less than this cutoff, the pipeline is stopped unless --force_amp_scheme is used [%(default)s]",
metavar="INT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--max_scheme_ratio",
type=float,
default=0.5,
help="Maximum allowed value of (second best scheme score) / (best scheme score). See also --min_scheme_score [%(default)s]",
metavar="FLOAT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--assemble_depth",
type=int,
help="Target read depth for each amplicon when assembling [%(default)s]",
metavar="INT",
default=100,
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--qc_depth",
type=int,
default=1000,
Expand All @@ -169,80 +156,137 @@ def main(args=None):
frs_default_str = "; ".join(
[f"{k}:{v}" for k, v in viridian.constants.TECH2FRS.items()]
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--masking_min_frs",
type=float,
help=f"Consensus positions with a fraction of read support less than the given number are masked. Default depends on the --tech option [{frs_default_str}]",
metavar="FLOAT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--het_min_pc",
type=float,
help="Cutoff for when to use ambiguous IUPAC codes. With --het_min_pc X, if two or more alleles each have support of more than X percent of reads, then an ambiguous IUPAC code is used and the position is given the 'HET' filter [%(default)s]",
metavar="FLOAT",
default=20.0,
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--masking_min_depth",
type=int,
default=20,
help="Consensus positions with total read depth less than the given number are masked [%(default)s]",
metavar="INT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--coverage_min_x",
default=20,
type=int,
help="With options --coverage_min_x X --coverage_min_pc Y, after initial read mapping to the reference, if less than Y percent of the genome has at least X read coverage, then the pipeline is stopped. Default is to require at least 50 percent of the genome with at least 20X read depth",
metavar="INT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--coverage_min_pc",
default=50,
type=float,
help="Please see the help for --coverage_min_x",
metavar="FLOAT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--max_percent_amps_fail",
type=float,
default=50.0,
help="Maximum percent of amplicons allowed to fail during read sampling or making consensus for each amplicon. The pipeline is stopped as soon as too many failed amplicons are detected [%(default)s]",
metavar="FLOAT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--max_cons_n_percent",
type=float,
default=50.0,
help="Maximum allowed percentage of Ns in the consensus sequence. Pipeline is stopped as soon as too many Ns in current consensus [%(default)s]",
metavar="FLOAT",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--detect_scheme_only",
action="store_true",
help="Map the reads and detect the amplicon scheme, then stop the pipeline instead of making a consensus etc",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--force_consensus",
help="Use the provided FASTA for the consensus, instead of inferring it from the reads. This option is useful to evaluate the quality of an existing consensus sequence using the reads that made it",
metavar="FILENAME",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--tmp_dir",
help="Directory in which to put a temporary directory for intermediate files that are not kept. The dir given by --tmp_dir must already exist. Default is platform dependent, and uses python's tempdir default locations (eg for linux, looks for /tmp first). If you use the --debug option, then the --tmp_dir option is ignored and a directory called 'Processing' is used in the output directory",
metavar="DIRNAME",
)
subparser_run_one_sample.add_argument(
advanced_parser.add_argument(
"--no_gzip",
help="Use this to not gzip the final output FASTA file and log.json file",
action="store_true",
)

# --------------------------- ref options --------------------------------
ref_parser = argparse.ArgumentParser(add_help=False)
ref_parser.add_argument(
"--ref_fasta",
help="FASTA file of reference genome. Default is to use built-in MN908947.3 SARS-CoV-2 reference genome. Only use this if you know what you are doing, since the amplicon scheme(s) must match this reference [%(default)s]",
metavar="FILENAME",
default=viridian.amplicon_schemes.REF_FASTA,
)

# ------------------------- dl_and_run options -----------------------
dl_and_run_parser = argparse.ArgumentParser(add_help=False)
dl_and_run_parser.add_argument(
"--acc_file",
help="REQUIRED. File containing run accessions, one run per line",
metavar="FILENAME",
required=True,
)
dl_and_run_parser.add_argument(
"--outdir",
help="REQUIRED. Name of output directory. If it already exists, will assume it is from a previous run of `viridian dl_and_run`, keep existing runs of viridian and only run samples that did not finish",
required=True,
metavar="FILENAME",
)

# ------------------------ run_one_sample ----------------------------
subparser_run_one_sample = subparsers.add_parser(
"run_one_sample",
parents=[
common_parser,
outdir_parser,
amplicons_parser,
reads_parser,
ref_parser,
advanced_parser,
],
help="Run the complete pipeline on one sample",
usage=f"viridian run_one_sample [options] --tech {'|'.join(tech_choices)} --outdir out <reads options (see help)>",
description="Run the complete pipeline on one sample",
epilog=reads_epilog,
)
subparser_run_one_sample.set_defaults(func=viridian.tasks.run_one_sample.run)

# ------------------------ dl_and_run --------------------------------
subparser_dl_and_run = subparsers.add_parser(
"dl_and_run",
parents=[
dl_and_run_parser,
common_parser,
amplicons_parser,
ref_parser,
advanced_parser,
],
help="Download reads from the ENA and run the pipeline, for multiple sequencing runs",
usage="viridian dl_and_run [options] --acc_file foo.txt --outdir out",
description="Download reads from the ENA and run the pipeline, for multiple sequencing runs",
)
subparser_dl_and_run.set_defaults(func=viridian.tasks.dl_and_run.run)

# ------------------------ sim_schemes -------------------------------
subparser_sim_schemes = subparsers.add_parser(
"sim_schemes",
parents=[amplicons_parser, common_parser, ref_parser],
parents=[amplicons_parser, common_parser, outdir_parser, ref_parser],
help="For each scheme, simulate fragments and run scheme id code",
usage="viridian sim_schemes [options] --outdir <out>",
description="For each scheme, simulate fragments and run scheme id code",
Expand Down
1 change: 1 addition & 0 deletions viridian/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Names exported by `from viridian.tasks import *`.
__all__ = [
"dl_and_run",
"run_one_sample",
"sim_schemes",
]
Expand Down
53 changes: 53 additions & 0 deletions viridian/tasks/dl_and_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
import os

from viridian import tasks, utils


def viridian_finished(indir):
    """Return True if the Viridian run in directory ``indir`` completed.

    Loads ``log.json.gz`` from ``indir`` and checks whether the
    ``last_stage_completed`` entry of its ``run_summary`` is ``"Finished"``.
    If the log cannot be loaded, or does not have the expected keys, the
    run is treated as not finished and False is returned.
    """
    try:
        log = utils.load_json(os.path.join(indir, "log.json.gz"))
        last_stage = log["run_summary"]["last_stage_completed"]
    except Exception:
        # Any failure to read the log means "not finished". Catch Exception
        # rather than using a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed here.
        return False

    return last_stage == "Finished"


def run(options):
    """Run the single-sample pipeline on every run accession in a file.

    Reads run accessions from ``options.acc_file`` (one per line; blank
    lines are ignored). Each accession is processed by
    ``tasks.run_one_sample.run`` with its own output directory
    ``<options.outdir>/<accession>``. Accessions for which
    ``viridian_finished`` reports a completed run are skipped, so an
    existing output directory can be reused to resume a batch.

    ``options`` is modified in place: the reads/tech options are reset to
    None, ``keep_downloaded_reads`` is set to False, ``force`` is set to
    True, and ``outdir``, ``run_accession`` and ``sample_name`` are set
    for each accession in turn.

    A failure on one accession is logged and the batch carries on with the
    next one. KeyboardInterrupt is not caught, so the batch can be stopped.
    """
    with open(options.acc_file) as f:
        stripped_lines = (line.strip() for line in f)
        # Skip blank lines: an empty accession would otherwise be "run"
        # with the main output directory as its output directory.
        run_accessions = [x for x in stripped_lines if x]
    logging.info(f"{len(run_accessions)} runs found in file {options.acc_file}")

    main_outdir = options.outdir
    if not os.path.exists(main_outdir):
        logging.info(f"Making main output directory {main_outdir}")
        os.mkdir(main_outdir)
    else:
        logging.info(f"Existing output directory found, using it: {main_outdir}")

    options.keep_downloaded_reads = False
    options.reads1 = None
    options.reads2 = None
    options.reads = None
    options.reads_bam = None
    options.tech = None
    options.force = True

    # Do not chdir into the main output directory: that would silently break
    # any relative path given in other options. Build each sample's output
    # path from the main output directory instead.
    total = len(run_accessions)
    for i, run_accession in enumerate(run_accessions, start=1):
        message = f" STARTING {i}/{total} {run_accession} "
        logging.info(f"{message:#^60}")
        options.outdir = os.path.join(main_outdir, run_accession)
        options.run_accession = run_accession
        options.sample_name = run_accession
        if viridian_finished(options.outdir):
            logging.info(f"Already done, skipping: {run_accession}")
        else:
            try:
                tasks.run_one_sample.run(options)
            except (Exception, SystemExit):
                # Best effort: one failed sample (including one that ends by
                # exiting) must not stop the batch. KeyboardInterrupt is
                # deliberately left uncaught so the user can abort.
                logging.warning(
                    f"Something went wrong: {run_accession}", exc_info=True
                )

        message = f" FINISHED {run_accession} "
        logging.info(f"{message:#^60}")
4 changes: 2 additions & 2 deletions viridian/tasks/run_one_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ def run(options):
options.tech,
options.outdir,
options.ref_fasta,
ena_run=options.ena_run,
keep_ena_reads=options.keep_ena_reads,
ena_run=options.run_accession,
keep_ena_reads=options.keep_downloaded_reads,
reads_file1=options.reads1,
reads_file2=options.reads2,
reads_file=options.reads,
Expand Down
Loading
Loading