From d8d7f2820a9acca70939dbdbf78a0483dad1b7ab Mon Sep 17 00:00:00 2001 From: liberjul Date: Mon, 23 Aug 2021 14:22:53 -0400 Subject: [PATCH] Update to 2.0.15 fixing BLAST getting stuck on 0 hit queries. --- CombineTaxonomy.py | 69 +++++++++++++++++++++++--------------------- blast_to_df.py | 2 +- check_input_names.py | 2 ++ constax.sh | 12 ++++++-- constax_no_inputs.sh | 13 +++++++-- constax_wrapper.py | 2 +- split_inputs.py | 40 +++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 39 deletions(-) create mode 100644 split_inputs.py diff --git a/CombineTaxonomy.py b/CombineTaxonomy.py index 51515f7..7113823 100644 --- a/CombineTaxonomy.py +++ b/CombineTaxonomy.py @@ -157,7 +157,7 @@ def reformat_UTAX(utax_file, output_dir, confidence, ranks): ################################################################################ def reformat_SINTAX(sintax_file, output_dir, confidence, ranks): - input = open (sintax_file) + input = open(sintax_file) all_lines = input.readlines() input.close() @@ -612,6 +612,7 @@ def str2bool(v): print("\tDone\n") if args.hl != "null": print("\nReformatting high level tax result file\n") + print(F"User set query coverage minimum: {args.hl_qc} User set percent identity minimum: {args.hl_id}") hl_dict = build_iso_hl_dict(F"{args.tax}/hl_blast.out", hl=True, hl_fmt=args.hl, hl_qc=args.hl_qc, hl_id=args.hl_id) print("\tDone\n") print("\nGenerating consensus taxonomy & combined taxonomy table\n") @@ -629,41 +630,42 @@ def str2bool(v): consensus.write("\n") if args.blast: - combined = open(F"{args.output_dir}combined_taxonomy.txt", "w") - combined.write("OTU_ID\tKingdom_RDP\tKingdom_BLAST\tKingdom_SINTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_BLAST\tPhylum_SINTAX") - combined.write("\tPhylum_Consensus\tClass_RDP\tClass_BLAST\tClass_SINTAX\tClass_Consensus\tOrder_RDP\tOrder_BLAST\tOrder_SINTAX") - combined.write("\tOrder_Consensus\tFamily_RDP\tFamily_BLAST\tFamily_SINTAX\tFamily_Consensus\tGenus_RDP\tGenus_BLAST\tGenus_SINTAX") - combined.write("\tGenus_Consensus\tSpecies_RDP\tSpecies_BLAST\tSpecies_SINTAX\tSpecies_Consensus\n") - - for otu in rdp_dict.keys(): - consensus.write(otu+"\t") - combined.write(otu) - levels = [] - for m in range(0,len(ranks)*2,2): - level = vote(rdp_dict[otu][m:m+2], blast_dict[otu][m:m+2], sin_dict[otu][m:m+2], args.conservative) - combined.write("\t"+rdp_dict[otu][m]+"\t"+blast_dict[otu][m]+"\t"+sin_dict[otu][m]+"\t") - if level != "": - levels.append(level) - combined.write(level) - consensus.write('\t'.join(levels+[""]*(len(ranks)-len(levels)))) - if args.isolates == "True": - consensus.write(F"\t{iso_dict[otu][0]}\t{iso_dict[otu][1]}\t{iso_dict[otu][2]}") - if args.hl != "null": - consensus.write(F"\t{hl_dict[otu][0]}\t{hl_dict[otu][1]}\t{hl_dict[otu][2]}") - if args.consistent: - tax_string = '\t'.join(levels) - consensus.write(F"\t{int(tax_string.replace(' ', '_').strip('_') in taxa_set)}\n") - else: - consensus.write("\n") - combined.write("\n") - print("\tDone\n") + print(hl_dict) + combined = open(F"{args.output_dir}combined_taxonomy.txt", "w") + combined.write("OTU_ID\tKingdom_RDP\tKingdom_BLAST\tKingdom_SINTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_BLAST\tPhylum_SINTAX") + combined.write("\tPhylum_Consensus\tClass_RDP\tClass_BLAST\tClass_SINTAX\tClass_Consensus\tOrder_RDP\tOrder_BLAST\tOrder_SINTAX") + combined.write("\tOrder_Consensus\tFamily_RDP\tFamily_BLAST\tFamily_SINTAX\tFamily_Consensus\tGenus_RDP\tGenus_BLAST\tGenus_SINTAX") + 
combined.write("\tGenus_Consensus\tSpecies_RDP\tSpecies_BLAST\tSpecies_SINTAX\tSpecies_Consensus\n") + + for otu in rdp_dict.keys(): + consensus.write(otu+"\t") + combined.write(otu) + levels = [] + for m in range(0,len(ranks)*2,2): + level = vote(rdp_dict[otu][m:m+2], blast_dict[otu][m:m+2], sin_dict[otu][m:m+2], args.conservative) + combined.write("\t"+rdp_dict[otu][m]+"\t"+blast_dict[otu][m]+"\t"+sin_dict[otu][m]+"\t") + if level != "": + levels.append(level) + combined.write(level) + consensus.write('\t'.join(levels+[""]*(len(ranks)-len(levels)))) + if args.isolates == "True": + consensus.write(F"\t{iso_dict[otu][0]}\t{iso_dict[otu][1]}\t{iso_dict[otu][2]}") + if args.hl != "null": + consensus.write(F"\t{hl_dict[otu][0]}\t{hl_dict[otu][1]}\t{hl_dict[otu][2]}") + if args.consistent: + tax_string = '\t'.join(levels) + consensus.write(F"\t{int(tax_string.replace(' ', '_').strip('_') in taxa_set)}\n") + else: + consensus.write("\n") + combined.write("\n") + print("\tDone\n") - consensus.close() - combined.close() + consensus.close() + combined.close() - print("\nGenerating classification counts & summary table\n") - count_classifications([rdp_file, sin_file, blast_file, consensus_file], args.output_dir, "UNITE", len(ranks), use_blast=True) + print("\nGenerating classification counts & summary table\n") + count_classifications([rdp_file, sin_file, blast_file, consensus_file], args.output_dir, "UNITE", len(ranks), use_blast=True) else: combined = open(F"{args.output_dir}combined_taxonomy.txt", "w") combined.write("OTU_ID\tKingdom_RDP\tKingdom_SINTAX\tKingdom_UTAX\tKingdom_Consensus\tPhylum_RDP\tPhylum_SINTAX\tPhylum_UTAX") @@ -761,6 +763,7 @@ def str2bool(v): print("\tDone\n") if args.hl != "null": print("\nReformatting high level tax result file\n") + print(F"User set query coverage minimum: {args.hl_qc} User set percent identity minimum: {args.hl_id}") hl_dict = build_iso_hl_dict(F"{args.tax}/hl_blast.out", hl=True, hl_fmt=args.hl, hl_qc=args.hl_qc, hl_id=args.hl_id) print("\tDone\n") print("\nGenerating consensus taxonomy & combined taxonomy table\n") diff --git a/blast_to_df.py b/blast_to_df.py index 9d2b9ad..35c9af6 100644 --- a/blast_to_df.py +++ b/blast_to_df.py @@ -44,7 +44,7 @@ line = ifile.readline() if line == "# 0 hits found\n": # If no hits found buffer = F"{buffer}{quer},{'__'},{1},{0},{0.0},{0},{','.join(['unidentified']*len(ranks))}\n" # Add dummy row to be cut out later - elif line[0] != "#": # BLAST hit lines + elif line[0] != "#" and line != "\n": # BLAST hit lines spl = line.strip().split("\t") sub = spl[1] eval, bitscore, id, qcov = spl[2:] diff --git a/check_input_names.py b/check_input_names.py index efb3550..ed50800 100644 --- a/check_input_names.py +++ b/check_input_names.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + ''' Script to convert invalid input names to compliant ASCII names. Uses a random hex string for file name. 
diff --git a/constax.sh b/constax.sh index e74b46f..91b730e 100644 --- a/constax.sh +++ b/constax.sh @@ -1,6 +1,6 @@ #!/bin/bash -login -VERSION=2.0.14; BUILD=0; PREFIX=placehold +VERSION=2.0.15; BUILD=0; PREFIX=placehold TRAIN=false BLAST=false HELP=false @@ -507,7 +507,15 @@ then module load BLAST fi - blastn -query "$FRM_INPUT" -db "${TFILES}/${base}"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS > "$TAX"/blast.out + # workaround code for blast getting stuck + python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT" + echo > "$TAX"/blast.out + for i in ${FRM_INPUT%.fasta}_*".fasta" + do + blastn -query $i -db "$TFILES"/"$base"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS >> "$TAX"/blast.out + rm $i + done + python "$CONSTAXPATH"/blast_to_df.py -i "$TAX"/blast.out -o "$TAX"/otu_taxonomy.blast -d "$DB" -t "$TFILES" -f $FORMAT else "$UTAXPATH" -utax "$FRM_INPUT" -db "${TFILES}"/utax.db -strand both -utaxout "$TAX"/otu_taxonomy.utax -utax_cutoff $CONF -threads $NTHREADS diff --git a/constax_no_inputs.sh b/constax_no_inputs.sh index b937662..46db662 100644 --- a/constax_no_inputs.sh +++ b/constax_no_inputs.sh @@ -1,6 +1,6 @@ #!/bin/bash -login -VERSION=2.0.14; BUILD=0; PREFIX=placehold +VERSION=2.0.15; BUILD=0; PREFIX=placehold echo "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier" echo "This software is distributed under MIT License" @@ -352,8 +352,15 @@ then then module load BLAST fi + # workaround code for blast getting stuck + python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT" + echo > "$TAX"/blast.out + for i in ${FRM_INPUT%.fasta}_*".fasta" + do + blastn -query $i -db "$TFILES"/"$base"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS >> "$TAX"/blast.out + rm $i + done - blastn -query "$FRM_INPUT" -db "${TFILES}/${base}"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs $MAX_HITS > "$TAX"/blast.out python "$CONSTAXPATH"/blast_to_df.py -i "$TAX"/blast.out -o "$TAX"/otu_taxonomy.blast -d "$DB" -t "$TFILES" -f $FORMAT else "$UTAXPATH" -utax "$FRM_INPUT" -db "${TFILES}"/utax.db -strand both -utaxout "$TAX"/otu_taxonomy.utax -utax_cutoff $CONF -threads $NTHREADS @@ -402,6 +409,8 @@ then rm "$TAX/"hl_formatted.fasta blastn -query "$FRM_INPUT" -db "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out rm "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST.n* + else + echo "" fi rm "$FRM_INPUT" fi diff --git a/constax_wrapper.py b/constax_wrapper.py index 2b5fbf1..ee01cf0 100644 --- a/constax_wrapper.py +++ b/constax_wrapper.py @@ -80,7 +80,7 @@ def false_to_null(arg): env["RDPPATH_USER"]=args.rdp_path.lower() if args.rdp_path == "False" else args.rdp_path env["CONSTAXPATH_USER"]=args.constax_path.lower() if args.constax_path == "False" else args.constax_path -version="2.0.14"; build="0"; prefix="placehold" +version="2.0.15"; build="0"; prefix="placehold" if os.path.isfile(args.pathfile): with open(args.pathfile, "r") as pathfile: diff --git a/split_inputs.py b/split_inputs.py new file mode 100644 index 0000000..8681193 --- /dev/null +++ b/split_inputs.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +''' +Splits the formatted input/query FASTA into 100 sequence file to +work around BLAST getting stuck on 0 hit queries, 
where the
+0 hit query is preceded by >100 seqs.
+'''
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-i", "--input", type=str, help="input query file")
+args = parser.parse_args()
+
+buffer = ""
+total_rec_count = 0
+file_rec_count = 0
+file_count = 0
+prefix = args.input.split(".fasta")[0]
+print("Input FASTA: ", args.input)
+with open(args.input, "r") as ifile:
+    line = ifile.readline()
+    while line != "":
+        header = line
+        line = ifile.readline()
+        seq = ""
+        while line != "" and line[0] != ">":
+            seq += line.strip()
+            line = ifile.readline()
+        buffer += F"{header}{seq}\n"
+        total_rec_count += 1
+        file_rec_count += 1
+        if file_rec_count > 99:
+            file_rec_count = 0
+            with open(F"{prefix}_{file_count:04d}.fasta", "w") as ofile:
+                ofile.write(buffer)
+            file_count += 1
+            buffer = ""
+if buffer != "":
+    with open(F"{prefix}_{file_count:04d}.fasta", "w") as ofile:
+        ofile.write(buffer)
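
For reference, the workaround added to constax.sh and constax_no_inputs.sh reduces to the pattern below: split the formatted query into 100-sequence chunks, BLAST each chunk, and append every tabular report to a single blast.out. This is a minimal standalone sketch; query.fasta and example__BLAST are placeholder names, the thread count and -max_target_seqs values are illustrative, and split_inputs.py and blast_to_df.py from this patch are assumed to be in the working directory.

#!/bin/bash
# Split the query FASTA into 100-sequence chunks (query_0000.fasta,
# query_0001.fasta, ...) so that a zero-hit query is never preceded by a
# long run of sequences within a single blastn invocation.
python split_inputs.py -i query.fasta
echo > blast.out                 # seed the combined output file
for i in query_*.fasta
do
    # Append this chunk's tabular report to the combined output,
    # then discard the chunk.
    blastn -query "$i" -db example__BLAST -num_threads 4 \
        -outfmt "7 qacc sacc evalue bitscore pident qcovs" \
        -max_target_seqs 5 >> blast.out
    rm "$i"
done
# blast_to_df.py then parses blast.out into otu_taxonomy.blast as before.

The initial echo > blast.out seeds the file with a blank line, which appears to be what the added line != "\n" check in blast_to_df.py guards against.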