diff --git a/singlem/lyrebird.py b/singlem/lyrebird.py index 92d8d661..22ab4330 100644 --- a/singlem/lyrebird.py +++ b/singlem/lyrebird.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__author__ = "Ben Woodcroft" +__author__ = "Rossen Zhao" __copyright__ = "Copyright 2015-2024" __credits__ = ["Ben Woodcroft", "Samuel Aroney", "Rossen Zhao"] __license__ = "GPL3+" @@ -22,6 +22,8 @@ import singlem import singlem.pipe as pipe + +from singlem.main import add_common_pipe_arguments, add_less_common_pipe_arguments, validate_pipe_args, generate_streaming_otu_table_from_args, add_condense_arguments from singlem.pipe import SearchPipe from singlem.condense import Condenser from singlem.metapackage import DATA_ENVIRONMENT_VARIABLE, CUSTOM_TAXONOMY_DATABASE_NAME @@ -31,9 +33,6 @@ from singlem.condense import DEFAULT_GENOME_MIN_TAXON_COVERAGE as CONDENSE_DEFAULT_GENOME_MIN_TAXON_COVERAGE from singlem.condense import DEFAULT_TRIM_PERCENT as CONDENSE_DEFAULT_TRIM_PERCENT -DEFAULT_WINDOW_SIZE = 60 -SPECIES_LEVEL_AVERAGE_IDENTITY = float(DEFAULT_WINDOW_SIZE - SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE) / DEFAULT_WINDOW_SIZE - def main(): bird_argparser = BirdArgparser( program='Lyrebird', @@ -42,7 +41,8 @@ def main(): "Samuel Aroney, "+CMR, "Raphael Eisenhofer, Centre for Evolutionary Hologenomics, University of Copenhagen, Denmark", "Rossen Zhao, "+CMR], - version=singlem.__version__, # TODO: change to lyrebird version 0.1.0 + # version=singlem.__lyrebird_version__, + version='0.1.0', raw_format=True, examples={'pipe': [ Example( @@ -62,141 +62,15 @@ def main(): # TODO: Could make pipe invocation faster by moving DATA_ENVIRONMENT to a separate file data_parser.add_argument('--output-directory', help="Output directory [required unless {} is specified]".format(DATA_ENVIRONMENT_VARIABLE)) data_parser.add_argument('--verify-only', help="Check that the data is up to date and each file has the correct checksum", action='store_true', default=False) - + #TODO: import functions from singlem main.py pipe_description = 'Generate a taxonomic profile or OTU table for dsDNA phages from raw sequences' pipe_parser = bird_argparser.new_subparser('pipe', pipe_description, parser_group='Tools') - # Make a function here so the code can be re-used between pipe and renew - def add_common_pipe_arguments(argument_group): - argument_group.add_argument('-p', '--taxonomic-profile', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table. Taxonomic profiles output can be further converted to other formats using singlem summarise.") - argument_group.add_argument('--taxonomic-profile-krona', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table") - argument_group.add_argument('--otu-table', metavar='filename', help='output OTU table') - current_default = pipe.DEFAULT_THREADS - argument_group.add_argument('--threads', type=int, metavar='num_threads', help='number of CPUS to use [default: %i]' % current_default, default=current_default) - current_default = SearchPipe.DEFAULT_TAXONOMY_ASSIGNMENT_METHOD - argument_group.add_argument( - '--assignment-method', '--assignment_method', - choices=( - pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.DIAMOND_ASSIGNMENT_METHOD, - pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, - pipe.ANNOY_ASSIGNMENT_METHOD, - pipe.PPLACER_ASSIGNMENT_METHOD), - help='Method of assigning taxonomy to OTUs and taxonomic profiles [default: %s]\n\n' % (current_default) + - table_roff([ - ["Method", "Description"], - [pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm (using the smafa implementation) over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], - [pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], - [pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using ANNOY rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], - [pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using SCANN rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], - [pipe.DIAMOND_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU."], - [pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU, but report the best hit as a sequence ID instead of a taxonomy."], - [pipe.ANNOY_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using ANNOY, otherwise no taxonomy is assigned. Requires a non-standard metapackage."], - [pipe.PPLACER_ASSIGNMENT_METHOD, "Use pplacer to assign taxonomy of each read in each OTU. Requires a non-standard metapackage."] - ]), - default=current_default) - - argument_group.add_argument('--output-extras', action='store_true', - help='give extra output for each sequence identified (e.g. the read(s) each OTU was generated from) in the output OTU table [default: not set]', - default=False) common_pipe_arguments = pipe_parser.add_argument_group('Common options') - sequence_input_group = common_pipe_arguments.add_mutually_exclusive_group(required=True) - # Keep parity of these arguments with the 'read_fraction' command - sequence_input_group.add_argument('-1','--forward','--reads','--sequences', - nargs='+', - metavar='sequence_file', - help='nucleotide read sequence(s) (forward or unpaired) to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - common_pipe_arguments.add_argument('-2', '--reverse', - nargs='+', - metavar='sequence_file', - help='reverse reads to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - sequence_input_group.add_argument('--genome-fasta-files', - nargs='+', - metavar='sequence_file', - help='nucleotide genome sequence(s) to be searched') - sequence_input_group.add_argument('--sra-files', - nargs='+', - metavar='sra_file', - help='"sra" format files (usually from NCBI SRA) to be searched') - add_common_pipe_arguments(common_pipe_arguments) - - def add_less_common_pipe_arguments(argument_group): - argument_group.add_argument('--archive-otu-table', metavar='filename', help='output OTU table in archive format for making DBs etc. [default: unused]') - argument_group.add_argument('--output-jplace', metavar='filename', help='Output a jplace format file for each singlem package to a file starting with this string, each with one entry per OTU. Requires \'%s\' as the --assignment_method [default: unused]' % pipe.PPLACER_ASSIGNMENT_METHOD) - argument_group.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') - argument_group.add_argument('--singlem-packages', nargs='+', help='SingleM packages to use [default: use the set from the default metapackage]') - argument_group.add_argument('--assignment-singlem-db', '--assignment_singlem_db', help='Use this SingleM DB when assigning taxonomy [default: not set, use the default]') - argument_group.add_argument('--diamond-taxonomy-assignment-performance-parameters', - help='Performance-type arguments to use when calling \'diamond blastx\' during the taxonomy assignment step. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_DIAMOND_ASSIGN_TAXONOMY_PERFORMANCE_PARAMETERS, - default=None) - argument_group.add_argument('--evalue', - help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) - argument_group.add_argument('--min-orf-length', - metavar='length', - help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), - type=int) - argument_group.add_argument('--restrict-read-length', - metavar='length', - help='Only use this many base pairs at the start of each sequence searched [default: no restriction]', - type=int) - argument_group.add_argument('--translation-table', - metavar='number', - type=int, - help='Codon table for translation. By default, translation table 4 is used, which is the same as translation table 11 (the usual bacterial/archaeal one), except that the TGA codon is translated as tryptophan, not as a stop codon. Using table 4 means that the minority of organisms which use table 4 are not biased against, without a significant effect on the majority of bacteria and archaea that use table 11. See http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes for details on specific tables. [default: %i]' % SearchPipe.DEFAULT_TRANSLATION_TABLE, - default=SearchPipe.DEFAULT_TRANSLATION_TABLE) - argument_group.add_argument('--filter-minimum-protein', - metavar='length', - help='Ignore reads aligning in less than this many positions to each protein HMM when using --no-diamond-prefilter [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN, - type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN) - argument_group.add_argument('--max-species-divergence', metavar='INT', - help='Maximum number of different bases acids to allow between a sequence and the best hit in the database so that it is assigned to the species level. [default: %i]' % SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE, - type=int, default=SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE) - argument_group.add_argument('--exclude-off-target-hits', action='store_true', help="Exclude hits that are not in the target domain of each SingleM package") - argument_group.add_argument('--min-taxon-coverage', - metavar='FLOAT', - help='Minimum coverage to report in a taxonomic profile. [default: {} for reads, {} for genomes]'.format(CONDENSE_DEFAULT_MIN_TAXON_COVERAGE, CONDENSE_DEFAULT_GENOME_MIN_TAXON_COVERAGE), - type=float) + add_common_pipe_arguments(common_pipe_arguments, extra_args=True) less_common_pipe_arguments = pipe_parser.add_argument_group('Less common options') - add_less_common_pipe_arguments(less_common_pipe_arguments) - - less_common_pipe_arguments.add_argument('--working-directory', metavar='directory', help='use intermediate working directory at a specified location, and do not delete it upon completion [default: not set, use a temporary directory]') - less_common_pipe_arguments.add_argument('--working-directory-dev-shm', default=False, action='store_true', help='use an intermediate results temporary working directory in /dev/shm rather than the default [default: the usual temporary working directory, currently {}]'.format( - tempfile.gettempdir() - )) - less_common_pipe_arguments.add_argument('--force', action='store_true', help='overwrite working directory if required [default: not set]') - less_common_pipe_arguments.add_argument('--filter-minimum-nucleotide', - metavar='length', - help='Ignore reads aligning in less than this many positions to each nucleotide HMM [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE, - type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE) - less_common_pipe_arguments.add_argument('--include-inserts', action='store_true', - help='print the entirety of the sequences in the OTU table, not just the aligned nucleotides [default: not set]', default=False) - less_common_pipe_arguments.add_argument('--known-otu-tables', nargs='+', - help='OTU tables previously generated with trusted taxonomies for each sequence [default: unused]') - less_common_pipe_arguments.add_argument('--no-assign-taxonomy', action='store_true', - help='Do not assign any taxonomy except for those already known [default: not set]', - default=False) - less_common_pipe_arguments.add_argument('--known-sequence-taxonomy', metavar='FILE', - help='A 2-column "sequencetaxonomy" file specifying some sequences that have known taxonomy [default: unused]') - less_common_pipe_arguments.add_argument('--no-diamond-prefilter', action='store_true', - help='Do not parse sequence data through DIAMOND blastx using a database constructed from the set of singlem packages. Should be used with --hmmsearch-package-assignment. NOTE: ignored for nucleotide packages [default: protein packages: use the prefilter, nucleotide packages: do not use the prefilter]', - default=False) - less_common_pipe_arguments.add_argument('--diamond-prefilter-performance-parameters', - help='Performance-type arguments to use when calling \'diamond blastx\' during the prefiltering. By default, SingleM should run in <4GB of RAM except in very large (>100Gbp) metagenomes. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_PREFILTER_PERFORMANCE_PARAMETERS, - default=None) - less_common_pipe_arguments.add_argument('--hmmsearch-package-assignment', '--hmmsearch_package_assignment', action='store_true', - help='Assign each sequence to a SingleM package using HMMSEARCH, and a sequence may then be assigned to multiple packages. [default: not set]', - default=False) - less_common_pipe_arguments.add_argument('--diamond-prefilter-db', - help='Use this DB when running DIAMOND prefilter [default: use the one in the metapackage, or generate one from the SingleM packages]') - less_common_pipe_arguments.add_argument('--assignment-threads',type=int, - help='Use this many processes in parallel while assigning taxonomy [default: %i]' % SearchPipe.DEFAULT_ASSIGNMENT_THREADS, - default=SearchPipe.DEFAULT_ASSIGNMENT_THREADS) - less_common_pipe_arguments.add_argument('--sleep-after-mkfifo', type=int, - help='Sleep for this many seconds after running os.mkfifo [default: None]') + add_less_common_pipe_arguments(less_common_pipe_arguments, extra_args=True) renew_description = 'Reannotate an OTU table with an updated taxonomy' renew_parser = bird_argparser.new_subparser('renew', renew_description, parser_group='Tools') @@ -209,114 +83,7 @@ def add_less_common_pipe_arguments(argument_group): condense_description = 'Combine OTU tables across different markers into a single taxonomic profile. Modified for non-universal markers and requires a Lyrebird metapackage. Note that while this mode can be run independently, it is often more straightforward to invoke its methodology by specifying -p / --taxonomic-profile when running pipe mode.' condense_parser = bird_argparser.new_subparser('condense', condense_description) - - input_condense_arguments = condense_parser.add_argument_group("Input arguments (1+ required)") - - input_condense_arguments.add_argument('--input-archive-otu-tables', '--input-archive-otu-table', nargs='+', help="Condense from these archive tables") - input_condense_arguments.add_argument('--input-archive-otu-table-list', - help="Condense from the archive tables newline separated in this file") - input_condense_arguments.add_argument('--input-gzip-archive-otu-table-list', - help="Condense from the gzip'd archive tables newline separated in this file") - - output_condense_arguments = condense_parser.add_argument_group("Output arguments (1+ required)") - output_condense_arguments.add_argument('-p', '--taxonomic-profile', metavar='filename', help="output OTU table") - output_condense_arguments.add_argument('--taxonomic-profile-krona', metavar='filename', help='name of krona file to generate.') - output_condense_arguments.add_argument('--output-after-em-otu-table', metavar='filename', help="output OTU table after expectation maximisation has been applied. Note that this table usually contains multiple rows with the same window sequence.") - - optional_condense_arguments = condense_parser.add_argument_group("Other options") - optional_condense_arguments.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') - current_default = CONDENSE_DEFAULT_MIN_TAXON_COVERAGE - optional_condense_arguments.add_argument('--min-taxon-coverage',metavar='FRACTION', - help='Set taxons with less coverage to coverage=0. [default: {}]'.format(current_default), default=current_default, type=float) - current_default = CONDENSE_DEFAULT_TRIM_PERCENT - optional_condense_arguments.add_argument('--trim-percent', type=float, default=current_default, help="percentage of markers to be trimmed for each taxonomy [default: {}]".format(current_default)) - - def validate_pipe_args(args, subparser='pipe'): - if not args.otu_table and not args.archive_otu_table and not args.taxonomic_profile and not args.taxonomic_profile_krona: - raise Exception("At least one of --output-taxonomic-profile, --output-taxonomic-profile-krona, --otu-table, or --archive-otu-table must be specified") - if args.output_jplace and args.assignment_method != pipe.PPLACER_ASSIGNMENT_METHOD: - raise Exception("If --output-jplace is specified, then --assignment-method must be set to %s" % pipe.PPLACER_ASSIGNMENT_METHOD) - if args.metapackage and args.singlem_packages: - raise Exception("Can only specify a metapackage or a singlem package set, not both") - if args.output_extras and not args.otu_table: - raise Exception("Can't use --output-extras without --otu-table") - if subparser == 'pipe': - if args.include_inserts and not args.otu_table and not args.archive_otu_table: - raise Exception("Can't use --include-inserts without --otu-table or --archive-otu-table") - if args.metapackage and args.diamond_prefilter_db: - raise Exception("Can't use a metapackage with --diamond-prefilter-db") - if args.output_jplace and args.known_otu_tables: - raise Exception("Currently --output-jplace and --known-otu-tables are incompatible") - if args.output_jplace and args.no_assign_taxonomy: - raise Exception("Currently --output-jplace and --no-assign-taxonomy are incompatible") - if args.known_sequence_taxonomy and not args.no_assign_taxonomy: - raise Exception( - "Currently --known-sequence-taxonomy requires --no-assign-taxonomy to be set also") - if args.reverse and args.output_jplace: - raise Exception("Currently --jplace-output cannot be used with --reverse") - if args.working_directory and args.working_directory_dev_shm: - raise Exception("Cannot specify both --working-directory and --working-directory-dev-shm") - if args.sra_files and args.no_diamond_prefilter: - raise Exception("SRA input data requires a DIAMOND prefilter step, currently") - if args.no_assign_taxonomy and (args.taxonomic_profile or args.taxonomic_profile_krona): - raise Exception("Can't use --no-assign-taxonomy with --output-taxonomic-profile or --output-taxonomic-profile-krona") - - def generate_streaming_otu_table_from_args(args, - input_prefix=False, query_prefix=False, archive_only=False, min_archive_otu_table_version=None): - - if archive_only: - otu_tables = False - otu_tables_list = False - if input_prefix: - if not archive_only: - otu_tables = args.input_otu_tables - otu_tables_list = args.input_otu_tables_list - archive_otu_tables = args.input_archive_otu_tables - archive_otu_table_list = args.input_archive_otu_table_list - gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list - elif query_prefix: - otu_tables = args.query_otu_table - otu_tables_list = args.query_otu_tables_list - archive_otu_tables = args.query_archive_otu_tables - archive_otu_table_list = args.query_archive_otu_table_list - gzip_archive_otu_table_list = args.query_gzip_archive_otu_table_list - else: - if not archive_only: - otu_tables = args.otu_tables - otu_tables_list = args.otu_tables_list - archive_otu_tables = args.archive_otu_tables - archive_otu_table_list = args.archive_otu_table_list - gzip_archive_otu_table_list = args.gzip_archive_otu_table_list - - if archive_only: - if not archive_otu_tables and not archive_otu_table_list and not gzip_archive_otu_table_list: - raise Exception("{} requires input archive OTU tables".format(args.subparser_name)) - else: - if not otu_tables and not otu_tables_list and not archive_otu_tables and \ - not archive_otu_table_list and not gzip_archive_otu_table_list: - raise Exception("{} requires input OTU tables or archive OTU tables".format(args.subparser_name)) - otus = StreamingOtuTableCollection() - if min_archive_otu_table_version: - otus.min_archive_otu_table_version = min_archive_otu_table_version - if otu_tables: - for o in otu_tables: - otus.add_otu_table_file(o) - if otu_tables_list: - with open(otu_tables_list) as f: - for o in f: - otus.add_otu_table_file(o.strip()) - if archive_otu_tables: - for o in archive_otu_tables: - otus.add_archive_otu_table_file(o.strip()) - if archive_otu_table_list: - with open(archive_otu_table_list) as f: - for o in f.readlines(): - otus.add_archive_otu_table_file(o) - if gzip_archive_otu_table_list: - with open(gzip_archive_otu_table_list) as f: - for arc in f.readlines(): - otus.add_gzip_archive_otu_table_file(arc.strip()) - return otus + add_condense_arguments(condense_parser) args = bird_argparser.parse_the_args() @@ -333,7 +100,9 @@ def generate_streaming_otu_table_from_args(args, os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') - logging.info("SingleM v{}".format(singlem.__version__)) + # logging.info("SingleM v{}".format(singlem.__version__)) + logging.info("Lyrebird v{}".format('0.1.0')) + # logging.info("Lyrebird v{}".format(singlem.__lyrebird_version__)) #TODO: change to lyrebird version 0.1.0 def get_min_orf_length(args, subparser='pipe'): if args.min_orf_length: diff --git a/singlem/main.py b/singlem/main.py index 3cde4128..5ff94e03 100755 --- a/singlem/main.py +++ b/singlem/main.py @@ -64,6 +64,242 @@ def seqs(args): logging.info("Found best start position %i" % best_position) print(best_position) +# Make pipe argument functions here so the code can be re-used between pipe and renew +def add_common_pipe_arguments(argument_group, extra_args=False): + if extra_args: + sequence_input_group = argument_group.add_mutually_exclusive_group(required=True) + # Keep parity of these arguments with the 'read_fraction' command + sequence_input_group.add_argument('-1','--forward','--reads','--sequences', + nargs='+', + metavar='sequence_file', + help='nucleotide read sequence(s) (forward or unpaired) to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') + argument_group.add_argument('-2', '--reverse', + nargs='+', + metavar='sequence_file', + help='reverse reads to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') + sequence_input_group.add_argument('--genome-fasta-files', + nargs='+', + metavar='sequence_file', + help='nucleotide genome sequence(s) to be searched') + sequence_input_group.add_argument('--sra-files', + nargs='+', + metavar='sra_file', + help='"sra" format files (usually from NCBI SRA) to be searched') + argument_group.add_argument('-p', '--taxonomic-profile', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table. Taxonomic profiles output can be further converted to other formats using singlem summarise.") + argument_group.add_argument('--taxonomic-profile-krona', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table") + argument_group.add_argument('--otu-table', metavar='filename', help='output OTU table') + current_default = pipe.DEFAULT_THREADS + argument_group.add_argument('--threads', type=int, metavar='num_threads', help='number of CPUS to use [default: %i]' % current_default, default=current_default) + current_default = SearchPipe.DEFAULT_TAXONOMY_ASSIGNMENT_METHOD + argument_group.add_argument( + '--assignment-method', '--assignment_method', + choices=( + pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, + pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, + pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, + pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, + pipe.DIAMOND_ASSIGNMENT_METHOD, + pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, + pipe.ANNOY_ASSIGNMENT_METHOD, + pipe.PPLACER_ASSIGNMENT_METHOD), + help='Method of assigning taxonomy to OTUs and taxonomic profiles [default: %s]\n\n' % (current_default) + + table_roff([ + ["Method", "Description"], + [pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm (using the smafa implementation) over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], + [pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], + [pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using ANNOY rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], + [pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using SCANN rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], + [pipe.DIAMOND_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU."], + [pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU, but report the best hit as a sequence ID instead of a taxonomy."], + [pipe.ANNOY_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using ANNOY, otherwise no taxonomy is assigned. Requires a non-standard metapackage."], + [pipe.PPLACER_ASSIGNMENT_METHOD, "Use pplacer to assign taxonomy of each read in each OTU. Requires a non-standard metapackage."] + ]), + default=current_default) + + argument_group.add_argument('--output-extras', action='store_true', + help='give extra output for each sequence identified (e.g. the read(s) each OTU was generated from) in the output OTU table [default: not set]', + default=False) + +def add_less_common_pipe_arguments(argument_group, extra_args=False): + argument_group.add_argument('--archive-otu-table', metavar='filename', help='output OTU table in archive format for making DBs etc. [default: unused]') + argument_group.add_argument('--output-jplace', metavar='filename', help='Output a jplace format file for each singlem package to a file starting with this string, each with one entry per OTU. Requires \'%s\' as the --assignment_method [default: unused]' % pipe.PPLACER_ASSIGNMENT_METHOD) + argument_group.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') + argument_group.add_argument('--singlem-packages', nargs='+', help='SingleM packages to use [default: use the set from the default metapackage]') + argument_group.add_argument('--assignment-singlem-db', '--assignment_singlem_db', help='Use this SingleM DB when assigning taxonomy [default: not set, use the default]') + argument_group.add_argument('--diamond-taxonomy-assignment-performance-parameters', + help='Performance-type arguments to use when calling \'diamond blastx\' during the taxonomy assignment step. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_DIAMOND_ASSIGN_TAXONOMY_PERFORMANCE_PARAMETERS, + default=None) + argument_group.add_argument('--evalue', + help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) + argument_group.add_argument('--min-orf-length', + metavar='length', + help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), + type=int) + argument_group.add_argument('--restrict-read-length', + metavar='length', + help='Only use this many base pairs at the start of each sequence searched [default: no restriction]', + type=int) + argument_group.add_argument('--translation-table', + metavar='number', + type=int, + help='Codon table for translation. By default, translation table 4 is used, which is the same as translation table 11 (the usual bacterial/archaeal one), except that the TGA codon is translated as tryptophan, not as a stop codon. Using table 4 means that the minority of organisms which use table 4 are not biased against, without a significant effect on the majority of bacteria and archaea that use table 11. See http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes for details on specific tables. [default: %i]' % SearchPipe.DEFAULT_TRANSLATION_TABLE, + default=SearchPipe.DEFAULT_TRANSLATION_TABLE) + argument_group.add_argument('--filter-minimum-protein', + metavar='length', + help='Ignore reads aligning in less than this many positions to each protein HMM when using --no-diamond-prefilter [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN, + type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN) + argument_group.add_argument('--max-species-divergence', metavar='INT', + help='Maximum number of different bases acids to allow between a sequence and the best hit in the database so that it is assigned to the species level. [default: %i]' % SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE, + type=int, default=SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE) + argument_group.add_argument('--exclude-off-target-hits', action='store_true', help="Exclude hits that are not in the target domain of each SingleM package") + argument_group.add_argument('--min-taxon-coverage', + metavar='FLOAT', + help='Minimum coverage to report in a taxonomic profile. [default: {} for reads, {} for genomes]'.format(CONDENSE_DEFAULT_MIN_TAXON_COVERAGE, CONDENSE_DEFAULT_GENOME_MIN_TAXON_COVERAGE), + type=float) + if extra_args: + argument_group.add_argument('--working-directory', metavar='directory', help='use intermediate working directory at a specified location, and do not delete it upon completion [default: not set, use a temporary directory]') + argument_group.add_argument('--working-directory-dev-shm', default=False, action='store_true', help='use an intermediate results temporary working directory in /dev/shm rather than the default [default: the usual temporary working directory, currently {}]'.format( + tempfile.gettempdir() + )) + argument_group.add_argument('--force', action='store_true', help='overwrite working directory if required [default: not set]') + argument_group.add_argument('--filter-minimum-nucleotide', + metavar='length', + help='Ignore reads aligning in less than this many positions to each nucleotide HMM [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE, + type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE) + argument_group.add_argument('--include-inserts', action='store_true', + help='print the entirety of the sequences in the OTU table, not just the aligned nucleotides [default: not set]', default=False) + argument_group.add_argument('--known-otu-tables', nargs='+', + help='OTU tables previously generated with trusted taxonomies for each sequence [default: unused]') + argument_group.add_argument('--no-assign-taxonomy', action='store_true', + help='Do not assign any taxonomy except for those already known [default: not set]', + default=False) + argument_group.add_argument('--known-sequence-taxonomy', metavar='FILE', + help='A 2-column "sequencetaxonomy" file specifying some sequences that have known taxonomy [default: unused]') + argument_group.add_argument('--no-diamond-prefilter', action='store_true', + help='Do not parse sequence data through DIAMOND blastx using a database constructed from the set of singlem packages. Should be used with --hmmsearch-package-assignment. NOTE: ignored for nucleotide packages [default: protein packages: use the prefilter, nucleotide packages: do not use the prefilter]', + default=False) + argument_group.add_argument('--diamond-prefilter-performance-parameters', + help='Performance-type arguments to use when calling \'diamond blastx\' during the prefiltering. By default, SingleM should run in <4GB of RAM except in very large (>100Gbp) metagenomes. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_PREFILTER_PERFORMANCE_PARAMETERS, + default=None) + argument_group.add_argument('--hmmsearch-package-assignment', '--hmmsearch_package_assignment', action='store_true', + help='Assign each sequence to a SingleM package using HMMSEARCH, and a sequence may then be assigned to multiple packages. [default: not set]', + default=False) + argument_group.add_argument('--diamond-prefilter-db', + help='Use this DB when running DIAMOND prefilter [default: use the one in the metapackage, or generate one from the SingleM packages]') + argument_group.add_argument('--assignment-threads',type=int, + help='Use this many processes in parallel while assigning taxonomy [default: %i]' % SearchPipe.DEFAULT_ASSIGNMENT_THREADS, + default=SearchPipe.DEFAULT_ASSIGNMENT_THREADS) + argument_group.add_argument('--sleep-after-mkfifo', type=int, + help='Sleep for this many seconds after running os.mkfifo [default: None]') + +def validate_pipe_args(args, subparser='pipe'): + if not args.otu_table and not args.archive_otu_table and not args.taxonomic_profile and not args.taxonomic_profile_krona: + raise Exception("At least one of --output-taxonomic-profile, --output-taxonomic-profile-krona, --otu-table, or --archive-otu-table must be specified") + if args.output_jplace and args.assignment_method != pipe.PPLACER_ASSIGNMENT_METHOD: + raise Exception("If --output-jplace is specified, then --assignment-method must be set to %s" % pipe.PPLACER_ASSIGNMENT_METHOD) + if args.metapackage and args.singlem_packages: + raise Exception("Can only specify a metapackage or a singlem package set, not both") + if args.output_extras and not args.otu_table: + raise Exception("Can't use --output-extras without --otu-table") + if subparser == 'pipe': + if args.include_inserts and not args.otu_table and not args.archive_otu_table: + raise Exception("Can't use --include-inserts without --otu-table or --archive-otu-table") + if args.metapackage and args.diamond_prefilter_db: + raise Exception("Can't use a metapackage with --diamond-prefilter-db") + if args.output_jplace and args.known_otu_tables: + raise Exception("Currently --output-jplace and --known-otu-tables are incompatible") + if args.output_jplace and args.no_assign_taxonomy: + raise Exception("Currently --output-jplace and --no-assign-taxonomy are incompatible") + if args.known_sequence_taxonomy and not args.no_assign_taxonomy: + raise Exception( + "Currently --known-sequence-taxonomy requires --no-assign-taxonomy to be set also") + if args.reverse and args.output_jplace: + raise Exception("Currently --jplace-output cannot be used with --reverse") + if args.working_directory and args.working_directory_dev_shm: + raise Exception("Cannot specify both --working-directory and --working-directory-dev-shm") + if args.sra_files and args.no_diamond_prefilter: + raise Exception("SRA input data requires a DIAMOND prefilter step, currently") + if args.no_assign_taxonomy and (args.taxonomic_profile or args.taxonomic_profile_krona): + raise Exception("Can't use --no-assign-taxonomy with --output-taxonomic-profile or --output-taxonomic-profile-krona") + +def generate_streaming_otu_table_from_args(args, + input_prefix=False, query_prefix=False, archive_only=False, min_archive_otu_table_version=None): + + if archive_only: + otu_tables = False + otu_tables_list = False + if input_prefix: + if not archive_only: + otu_tables = args.input_otu_tables + otu_tables_list = args.input_otu_tables_list + archive_otu_tables = args.input_archive_otu_tables + archive_otu_table_list = args.input_archive_otu_table_list + gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list + elif query_prefix: + otu_tables = args.query_otu_table + otu_tables_list = args.query_otu_tables_list + archive_otu_tables = args.query_archive_otu_tables + archive_otu_table_list = args.query_archive_otu_table_list + gzip_archive_otu_table_list = args.query_gzip_archive_otu_table_list + else: + if not archive_only: + otu_tables = args.otu_tables + otu_tables_list = args.otu_tables_list + archive_otu_tables = args.archive_otu_tables + archive_otu_table_list = args.archive_otu_table_list + gzip_archive_otu_table_list = args.gzip_archive_otu_table_list + + if archive_only: + if not archive_otu_tables and not archive_otu_table_list and not gzip_archive_otu_table_list: + raise Exception("{} requires input archive OTU tables".format(args.subparser_name)) + else: + if not otu_tables and not otu_tables_list and not archive_otu_tables and \ + not archive_otu_table_list and not gzip_archive_otu_table_list: + raise Exception("{} requires input OTU tables or archive OTU tables".format(args.subparser_name)) + otus = StreamingOtuTableCollection() + if min_archive_otu_table_version: + otus.min_archive_otu_table_version = min_archive_otu_table_version + if otu_tables: + for o in otu_tables: + otus.add_otu_table_file(o) + if otu_tables_list: + with open(otu_tables_list) as f: + for o in f: + otus.add_otu_table_file(o.strip()) + if archive_otu_tables: + for o in archive_otu_tables: + otus.add_archive_otu_table_file(o.strip()) + if archive_otu_table_list: + with open(archive_otu_table_list) as f: + for o in f.readlines(): + otus.add_archive_otu_table_file(o) + if gzip_archive_otu_table_list: + with open(gzip_archive_otu_table_list) as f: + for arc in f.readlines(): + otus.add_gzip_archive_otu_table_file(arc.strip()) + return otus + + +def add_condense_arguments(parser): + input_condense_arguments = parser.add_argument_group("Input arguments (1+ required)") + input_condense_arguments.add_argument('--input-archive-otu-tables', '--input-archive-otu-table', nargs='+', help="Condense from these archive tables") + input_condense_arguments.add_argument('--input-archive-otu-table-list', + help="Condense from the archive tables newline separated in this file") + input_condense_arguments.add_argument('--input-gzip-archive-otu-table-list', + help="Condense from the gzip'd archive tables newline separated in this file") + + output_condense_arguments = parser.add_argument_group("Output arguments (1+ required)") + output_condense_arguments.add_argument('-p', '--taxonomic-profile', metavar='filename', help="output OTU table") + output_condense_arguments.add_argument('--taxonomic-profile-krona', metavar='filename', help='name of krona file to generate.') + output_condense_arguments.add_argument('--output-after-em-otu-table', metavar='filename', help="output OTU table after expectation maximisation has been applied. Note that this table usually contains multiple rows with the same window sequence.") + + optional_condense_arguments = parser.add_argument_group("Other options") + optional_condense_arguments.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') + current_default = CONDENSE_DEFAULT_MIN_TAXON_COVERAGE + optional_condense_arguments.add_argument('--min-taxon-coverage',metavar='FRACTION', + help='Set taxons with less coverage to coverage=0. [default: {}]'.format(current_default), default=current_default, type=float) + current_default = CONDENSE_DEFAULT_TRIM_PERCENT + optional_condense_arguments.add_argument('--trim-percent', type=float, default=current_default, help="percentage of markers to be trimmed for each taxonomy [default: {}]".format(current_default)) def main(): bird_argparser = BirdArgparser( @@ -112,137 +348,11 @@ def main(): pipe_description = 'Generate a taxonomic profile or OTU table from raw sequences' pipe_parser = bird_argparser.new_subparser('pipe', pipe_description, parser_group='Tools') - # Make a function here so the code can be re-used between pipe and renew - def add_common_pipe_arguments(argument_group): - argument_group.add_argument('-p', '--taxonomic-profile', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table. Taxonomic profiles output can be further converted to other formats using singlem summarise.") - argument_group.add_argument('--taxonomic-profile-krona', metavar='FILE', help="output a 'condensed' taxonomic profile for each sample based on the OTU table") - argument_group.add_argument('--otu-table', metavar='filename', help='output OTU table') - current_default = pipe.DEFAULT_THREADS - argument_group.add_argument('--threads', type=int, metavar='num_threads', help='number of CPUS to use [default: %i]' % current_default, default=current_default) - current_default = SearchPipe.DEFAULT_TAXONOMY_ASSIGNMENT_METHOD - argument_group.add_argument( - '--assignment-method', '--assignment_method', - choices=( - pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, - pipe.DIAMOND_ASSIGNMENT_METHOD, - pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, - pipe.ANNOY_ASSIGNMENT_METHOD, - pipe.PPLACER_ASSIGNMENT_METHOD), - help='Method of assigning taxonomy to OTUs and taxonomic profiles [default: %s]\n\n' % (current_default) + - table_roff([ - ["Method", "Description"], - [pipe.SMAFA_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm (using the smafa implementation) over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], - [pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using a brute force algorithm over all window sequences in the database, and if none are found use DIAMOND blastx of all reads from each OTU."], - [pipe.ANNOY_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using ANNOY rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], - [pipe.SCANN_THEN_DIAMOND_ASSIGNMENT_METHOD, "Same as {}, except search using SCANN rather than using brute force. Requires a non-standard metapackage.".format(pipe.SCANN_NAIVE_THEN_DIAMOND_ASSIGNMENT_METHOD)], - [pipe.DIAMOND_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU."], - [pipe.DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD, "DIAMOND blastx best hit(s) of all reads from each OTU, but report the best hit as a sequence ID instead of a taxonomy."], - [pipe.ANNOY_ASSIGNMENT_METHOD, "Search for the most similar window sequences <= 3bp different using ANNOY, otherwise no taxonomy is assigned. Requires a non-standard metapackage."], - [pipe.PPLACER_ASSIGNMENT_METHOD, "Use pplacer to assign taxonomy of each read in each OTU. Requires a non-standard metapackage."] - ]), - default=current_default) - - argument_group.add_argument('--output-extras', action='store_true', - help='give extra output for each sequence identified (e.g. the read(s) each OTU was generated from) in the output OTU table [default: not set]', - default=False) common_pipe_arguments = pipe_parser.add_argument_group('Common options') - sequence_input_group = common_pipe_arguments.add_mutually_exclusive_group(required=True) - # Keep parity of these arguments with the 'read_fraction' command - sequence_input_group.add_argument('-1','--forward','--reads','--sequences', - nargs='+', - metavar='sequence_file', - help='nucleotide read sequence(s) (forward or unpaired) to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - common_pipe_arguments.add_argument('-2', '--reverse', - nargs='+', - metavar='sequence_file', - help='reverse reads to be searched. Can be FASTA or FASTQ format, GZIP-compressed or not.') - sequence_input_group.add_argument('--genome-fasta-files', - nargs='+', - metavar='sequence_file', - help='nucleotide genome sequence(s) to be searched') - sequence_input_group.add_argument('--sra-files', - nargs='+', - metavar='sra_file', - help='"sra" format files (usually from NCBI SRA) to be searched') - add_common_pipe_arguments(common_pipe_arguments) - - def add_less_common_pipe_arguments(argument_group): - argument_group.add_argument('--archive-otu-table', metavar='filename', help='output OTU table in archive format for making DBs etc. [default: unused]') - argument_group.add_argument('--output-jplace', metavar='filename', help='Output a jplace format file for each singlem package to a file starting with this string, each with one entry per OTU. Requires \'%s\' as the --assignment_method [default: unused]' % pipe.PPLACER_ASSIGNMENT_METHOD) - argument_group.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') - argument_group.add_argument('--singlem-packages', nargs='+', help='SingleM packages to use [default: use the set from the default metapackage]') - argument_group.add_argument('--assignment-singlem-db', '--assignment_singlem_db', help='Use this SingleM DB when assigning taxonomy [default: not set, use the default]') - argument_group.add_argument('--diamond-taxonomy-assignment-performance-parameters', - help='Performance-type arguments to use when calling \'diamond blastx\' during the taxonomy assignment step. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_DIAMOND_ASSIGN_TAXONOMY_PERFORMANCE_PARAMETERS, - default=None) - argument_group.add_argument('--evalue', - help='HMMSEARCH e-value cutoff to use for sequence gathering [default: %s]' % SearchPipe.DEFAULT_HMMSEARCH_EVALUE, default=SearchPipe.DEFAULT_HMMSEARCH_EVALUE) - argument_group.add_argument('--min-orf-length', - metavar='length', - help='When predicting ORFs require this many base pairs uninterrupted by a stop codon [default: %i when input is reads, %i when input is genomes]' % (SearchPipe.DEFAULT_MIN_ORF_LENGTH, SearchPipe.DEFAULT_GENOME_MIN_ORF_LENGTH), - type=int) - argument_group.add_argument('--restrict-read-length', - metavar='length', - help='Only use this many base pairs at the start of each sequence searched [default: no restriction]', - type=int) - argument_group.add_argument('--translation-table', - metavar='number', - type=int, - help='Codon table for translation. By default, translation table 4 is used, which is the same as translation table 11 (the usual bacterial/archaeal one), except that the TGA codon is translated as tryptophan, not as a stop codon. Using table 4 means that the minority of organisms which use table 4 are not biased against, without a significant effect on the majority of bacteria and archaea that use table 11. See http://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes for details on specific tables. [default: %i]' % SearchPipe.DEFAULT_TRANSLATION_TABLE, - default=SearchPipe.DEFAULT_TRANSLATION_TABLE) - argument_group.add_argument('--filter-minimum-protein', - metavar='length', - help='Ignore reads aligning in less than this many positions to each protein HMM when using --no-diamond-prefilter [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN, - type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_PROTEIN) - argument_group.add_argument('--max-species-divergence', metavar='INT', - help='Maximum number of different bases acids to allow between a sequence and the best hit in the database so that it is assigned to the species level. [default: %i]' % SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE, - type=int, default=SearchPipe.DEFAULT_MAX_SPECIES_DIVERGENCE) - argument_group.add_argument('--exclude-off-target-hits', action='store_true', help="Exclude hits that are not in the target domain of each SingleM package") - argument_group.add_argument('--min-taxon-coverage', - metavar='FLOAT', - help='Minimum coverage to report in a taxonomic profile. [default: {} for reads, {} for genomes]'.format(CONDENSE_DEFAULT_MIN_TAXON_COVERAGE, CONDENSE_DEFAULT_GENOME_MIN_TAXON_COVERAGE), - type=float) + add_common_pipe_arguments(common_pipe_arguments, extra_args=True) less_common_pipe_arguments = pipe_parser.add_argument_group('Less common options') - add_less_common_pipe_arguments(less_common_pipe_arguments) - - less_common_pipe_arguments.add_argument('--working-directory', metavar='directory', help='use intermediate working directory at a specified location, and do not delete it upon completion [default: not set, use a temporary directory]') - less_common_pipe_arguments.add_argument('--working-directory-dev-shm', default=False, action='store_true', help='use an intermediate results temporary working directory in /dev/shm rather than the default [default: the usual temporary working directory, currently {}]'.format( - tempfile.gettempdir() - )) - less_common_pipe_arguments.add_argument('--force', action='store_true', help='overwrite working directory if required [default: not set]') - less_common_pipe_arguments.add_argument('--filter-minimum-nucleotide', - metavar='length', - help='Ignore reads aligning in less than this many positions to each nucleotide HMM [default: %i]' % SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE, - type=int, default=SearchPipe.DEFAULT_FILTER_MINIMUM_NUCLEOTIDE) - less_common_pipe_arguments.add_argument('--include-inserts', action='store_true', - help='print the entirety of the sequences in the OTU table, not just the aligned nucleotides [default: not set]', default=False) - less_common_pipe_arguments.add_argument('--known-otu-tables', nargs='+', - help='OTU tables previously generated with trusted taxonomies for each sequence [default: unused]') - less_common_pipe_arguments.add_argument('--no-assign-taxonomy', action='store_true', - help='Do not assign any taxonomy except for those already known [default: not set]', - default=False) - less_common_pipe_arguments.add_argument('--known-sequence-taxonomy', metavar='FILE', - help='A 2-column "sequencetaxonomy" file specifying some sequences that have known taxonomy [default: unused]') - less_common_pipe_arguments.add_argument('--no-diamond-prefilter', action='store_true', - help='Do not parse sequence data through DIAMOND blastx using a database constructed from the set of singlem packages. Should be used with --hmmsearch-package-assignment. NOTE: ignored for nucleotide packages [default: protein packages: use the prefilter, nucleotide packages: do not use the prefilter]', - default=False) - less_common_pipe_arguments.add_argument('--diamond-prefilter-performance-parameters', - help='Performance-type arguments to use when calling \'diamond blastx\' during the prefiltering. By default, SingleM should run in <4GB of RAM except in very large (>100Gbp) metagenomes. [default: use setting defined in metapackage when set, otherwise use \'%s\']' % SearchPipe.DEFAULT_PREFILTER_PERFORMANCE_PARAMETERS, - default=None) - less_common_pipe_arguments.add_argument('--hmmsearch-package-assignment', '--hmmsearch_package_assignment', action='store_true', - help='Assign each sequence to a SingleM package using HMMSEARCH, and a sequence may then be assigned to multiple packages. [default: not set]', - default=False) - less_common_pipe_arguments.add_argument('--diamond-prefilter-db', - help='Use this DB when running DIAMOND prefilter [default: use the one in the metapackage, or generate one from the SingleM packages]') - less_common_pipe_arguments.add_argument('--assignment-threads',type=int, - help='Use this many processes in parallel while assigning taxonomy [default: %i]' % SearchPipe.DEFAULT_ASSIGNMENT_THREADS, - default=SearchPipe.DEFAULT_ASSIGNMENT_THREADS) - less_common_pipe_arguments.add_argument('--sleep-after-mkfifo', type=int, - help='Sleep for this many seconds after running os.mkfifo [default: None]') + add_less_common_pipe_arguments(less_common_pipe_arguments, extra_args=True) appraise_description = 'How much of the metagenome do the genomes or assembly represent?' appraise_parser = bird_argparser.new_subparser('appraise', appraise_description, parser_group='Tools') @@ -500,27 +610,7 @@ def add_less_common_pipe_arguments(argument_group): condense_description = 'Combine OTU tables across different markers into a single taxonomic profile. Note that while this mode can be run independently, it is often more straightforward to invoke its methodology by specifying -p / --taxonomic-profile when running pipe mode.' condense_parser = bird_argparser.new_subparser('condense', condense_description) - - input_condense_arguments = condense_parser.add_argument_group("Input arguments (1+ required)") - - input_condense_arguments.add_argument('--input-archive-otu-tables', '--input-archive-otu-table', nargs='+', help="Condense from these archive tables") - input_condense_arguments.add_argument('--input-archive-otu-table-list', - help="Condense from the archive tables newline separated in this file") - input_condense_arguments.add_argument('--input-gzip-archive-otu-table-list', - help="Condense from the gzip'd archive tables newline separated in this file") - - output_condense_arguments = condense_parser.add_argument_group("Output arguments (1+ required)") - output_condense_arguments.add_argument('-p', '--taxonomic-profile', metavar='filename', help="output OTU table") - output_condense_arguments.add_argument('--taxonomic-profile-krona', metavar='filename', help='name of krona file to generate.') - output_condense_arguments.add_argument('--output-after-em-otu-table', metavar='filename', help="output OTU table after expectation maximisation has been applied. Note that this table usually contains multiple rows with the same window sequence.") - - optional_condense_arguments = condense_parser.add_argument_group("Other options") - optional_condense_arguments.add_argument('--metapackage', help='Set of SingleM packages to use [default: use the default set]') - current_default = CONDENSE_DEFAULT_MIN_TAXON_COVERAGE - optional_condense_arguments.add_argument('--min-taxon-coverage',metavar='FRACTION', - help='Set taxons with less coverage to coverage=0. [default: {}]'.format(current_default), default=current_default, type=float) - current_default = CONDENSE_DEFAULT_TRIM_PERCENT - optional_condense_arguments.add_argument('--trim-percent', type=float, default=current_default, help="percentage of markers to be trimmed for each taxonomy [default: {}]".format(current_default)) + add_condense_arguments(condense_parser) trim_package_hmms_description = 'Trim the width of HMMs to increase speed (expert mode)' trim_package_hmms_parser = bird_argparser.new_subparser('trim_package_hmms', trim_package_hmms_description) @@ -586,95 +676,6 @@ def add_less_common_pipe_arguments(argument_group): supplement_rare_group.add_argument('--new-taxonomy-database-name', help='Name of the taxonomy database to record in the created metapackage [default: %s]' % CUSTOM_TAXONOMY_DATABASE_NAME, default=CUSTOM_TAXONOMY_DATABASE_NAME) supplement_rare_group.add_argument('--new-taxonomy-database-version', help='Version of the taxonomy database to use [default: None]') - def validate_pipe_args(args, subparser='pipe'): - if not args.otu_table and not args.archive_otu_table and not args.taxonomic_profile and not args.taxonomic_profile_krona: - raise Exception("At least one of --output-taxonomic-profile, --output-taxonomic-profile-krona, --otu-table, or --archive-otu-table must be specified") - if args.output_jplace and args.assignment_method != pipe.PPLACER_ASSIGNMENT_METHOD: - raise Exception("If --output-jplace is specified, then --assignment-method must be set to %s" % pipe.PPLACER_ASSIGNMENT_METHOD) - if args.metapackage and args.singlem_packages: - raise Exception("Can only specify a metapackage or a singlem package set, not both") - if args.output_extras and not args.otu_table: - raise Exception("Can't use --output-extras without --otu-table") - if subparser == 'pipe': - if args.include_inserts and not args.otu_table and not args.archive_otu_table: - raise Exception("Can't use --include-inserts without --otu-table or --archive-otu-table") - if args.metapackage and args.diamond_prefilter_db: - raise Exception("Can't use a metapackage with --diamond-prefilter-db") - if args.output_jplace and args.known_otu_tables: - raise Exception("Currently --output-jplace and --known-otu-tables are incompatible") - if args.output_jplace and args.no_assign_taxonomy: - raise Exception("Currently --output-jplace and --no-assign-taxonomy are incompatible") - if args.known_sequence_taxonomy and not args.no_assign_taxonomy: - raise Exception( - "Currently --known-sequence-taxonomy requires --no-assign-taxonomy to be set also") - if args.reverse and args.output_jplace: - raise Exception("Currently --jplace-output cannot be used with --reverse") - if args.working_directory and args.working_directory_dev_shm: - raise Exception("Cannot specify both --working-directory and --working-directory-dev-shm") - if args.sra_files and args.no_diamond_prefilter: - raise Exception("SRA input data requires a DIAMOND prefilter step, currently") - if args.no_assign_taxonomy and (args.taxonomic_profile or args.taxonomic_profile_krona): - raise Exception("Can't use --no-assign-taxonomy with --output-taxonomic-profile or --output-taxonomic-profile-krona") - - - - def generate_streaming_otu_table_from_args(args, - input_prefix=False, query_prefix=False, archive_only=False, min_archive_otu_table_version=None): - - if archive_only: - otu_tables = False - otu_tables_list = False - if input_prefix: - if not archive_only: - otu_tables = args.input_otu_tables - otu_tables_list = args.input_otu_tables_list - archive_otu_tables = args.input_archive_otu_tables - archive_otu_table_list = args.input_archive_otu_table_list - gzip_archive_otu_table_list = args.input_gzip_archive_otu_table_list - elif query_prefix: - otu_tables = args.query_otu_table - otu_tables_list = args.query_otu_tables_list - archive_otu_tables = args.query_archive_otu_tables - archive_otu_table_list = args.query_archive_otu_table_list - gzip_archive_otu_table_list = args.query_gzip_archive_otu_table_list - else: - if not archive_only: - otu_tables = args.otu_tables - otu_tables_list = args.otu_tables_list - archive_otu_tables = args.archive_otu_tables - archive_otu_table_list = args.archive_otu_table_list - gzip_archive_otu_table_list = args.gzip_archive_otu_table_list - - if archive_only: - if not archive_otu_tables and not archive_otu_table_list and not gzip_archive_otu_table_list: - raise Exception("{} requires input archive OTU tables".format(args.subparser_name)) - else: - if not otu_tables and not otu_tables_list and not archive_otu_tables and \ - not archive_otu_table_list and not gzip_archive_otu_table_list: - raise Exception("{} requires input OTU tables or archive OTU tables".format(args.subparser_name)) - otus = StreamingOtuTableCollection() - if min_archive_otu_table_version: - otus.min_archive_otu_table_version = min_archive_otu_table_version - if otu_tables: - for o in otu_tables: - otus.add_otu_table_file(o) - if otu_tables_list: - with open(otu_tables_list) as f: - for o in f: - otus.add_otu_table_file(o.strip()) - if archive_otu_tables: - for o in archive_otu_tables: - otus.add_archive_otu_table_file(o.strip()) - if archive_otu_table_list: - with open(archive_otu_table_list) as f: - for o in f.readlines(): - otus.add_archive_otu_table_file(o) - if gzip_archive_otu_table_list: - with open(gzip_archive_otu_table_list) as f: - for arc in f.readlines(): - otus.add_gzip_archive_otu_table_file(arc.strip()) - return otus - args = bird_argparser.parse_the_args() if args.debug: