From 3dc184a278197bf23517bab90a59de7908f8a4ca Mon Sep 17 00:00:00 2001
From: liberjul <liberjul@msu.edu>
Date: Tue, 30 Nov 2021 13:48:47 -0500
Subject: [PATCH] Version, docs, message updates.

---
 constax.sh                        | 53 +++++++++++++++++++------------
 constax_no_inputs.sh              |  4 +--
 docs/source/conf.py               |  4 +--
 docs/source/index.rst             |  4 +--
 docs/source/installation.rst      |  8 ++---
 docs/source/referenceDB.rst       | 15 +++++----
 docs/source/tutorial1.rst         |  2 +-
 docs/source/tutorial2.rst         |  2 +-
 docs/source/tutorial5.rst         |  8 +++--
 subscript_fasta_addFullLineage.py | 19 +++++++----
 10 files changed, 72 insertions(+), 47 deletions(-)

diff --git a/constax.sh b/constax.sh
index 91b730e..4177031 100644
--- a/constax.sh
+++ b/constax.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -login
 
-VERSION=2.0.15; BUILD=0; PREFIX=placehold
+VERSION=2.0.16; BUILD=0; PREFIX=placehold
 TRAIN=false
 BLAST=false
 HELP=false
@@ -45,7 +45,7 @@ echo ""
 echo "Please cite us as:"
 echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers"
 echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci"
-echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803"
+echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347"
 
 ### Parse variable inputs
 TEMP=`getopt -o c:n:m:e:p:d:i:o:x:tbhvf: --long conf:,num_threads:,max_hits:,evalue:,p_iden:,db:,input:,output:,tax:,train,blast,select_by_keyword:,msu_hpcc,help,version,conservative,make_plot,check,trainfile:,mem:,sintax_path:,utax_path:,rdp_path:,constax_path:,pathfile:,isolates:,isolates_query_coverage:,isolates_percent_identity:,high_level_db:,high_level_query_coverage:,high_level_percent_identity: \
@@ -180,6 +180,10 @@ then
   echo "CONSTAX version $VERSION build $BUILD"
   exit 1
 fi
+#Check Python version
+python -V > ver_python.txt 2>&1
+if grep -Fq "Python 2" ver_python.txt; then exit 2; fi
+
 if [ $MAX_HITS -eq 0 ]
 then
   echo "Set -m/--max_hits to an integer greater than zero."
@@ -302,20 +306,17 @@ then
   source "$PATHFILE"
 elif [ -f "pathfile.txt" ] # Next try in local directory
 then
-  echo "Using local pathfile $PATHFILE"
-  source "$PATHFILE"
+  echo "Using local pathfile.txt"
+  source pathfile.txt
 else # Then try in package directory.
   echo "Pathfile input not found in local directory ..."
   DIR=$(conda list | head -n 1 | rev | cut -d' ' -f1 | rev | cut -d: -f1)
   PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD/opt/constax-$VERSION/pathfile.txt"
-  if [ -f "$PATHFILE" ]
-  then
-    sed -i'' -e "s|=.*/opt/constax|=$DIR/pkgs/constax-$VERSION-$BUILD/opt/constax|g" "$PATHFILE" > "$PATHFILE".tmp
-    source "$PATHFILE".tmp
-    rm "$PATHFILE".tmp
-  else
-    echo "Pathfile input not found at $PATHFILE ..."
-  fi
+  if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
+  PATHFILE=$DIR"/pkgs/constax-$VERSION-$BUILD_STRING/opt/constax-$VERSION/pathfile.txt"
+  if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
+  PATHFILE=$DIR"/opt/constax-$VERSION/pathfile.txt"
+  if [ -f "$PATHFILE" ]; then source $PATHFILE; echo "Pathfile input found at $PATHFILE ..."; else echo "Pathfile input not found at $PATHFILE ..."; fi
 fi
 # Check for user input paths
 if [ $(command -v "$SINTAXPATH_USER") ] && [[ "$SINTAXPATH_USER" != false ]]
@@ -363,13 +364,13 @@ else
   echo "RDP: $RDPPATH"
   if ! [ $(command -v java -jar "$RDPPATH") ] && ! [ $(command -v "$RDPPATH") ] ; then echo "RDP not executable alone or by java -jar" ; fi
   echo "UTAX: $UTAXPATH"
-  if ! $BLAST &&  ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable" ; fi
+  if ! $BLAST &&  ! [ $(command -v "$UTAXPATH") ] ; then echo "UTAX not executable. Did you mean to use -b/--blast flag?" ; fi
   if $BLAST &&  ! [ $(command -v blastn) ] ; then echo "BLAST not executable" ; fi
   echo "CONSTAX: $CONSTAXPATH"
   if [ -d "$CONSTAXPATH" ] ; then echo "CONSTAX directory not found" ; fi
   exit 1
 fi
-if ! $BLAST  && [ $(echo "$UTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -gt 9 ]
+if ! $BLAST  && [ $(echo "$UTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -gt 9 ]
 then
   echo "USEARCH executable must be version 9.X or lower to use UTAX"
   exit 1
@@ -384,7 +385,11 @@ fi
 base=$(basename -- ${DB%.fasta})
 
 FORMAT=$(python "$CONSTAXPATH"/detect_format.py -d "$DB" 2>&1)
-
+if [[ $FORMAT == "INVALID" ]]
+then
+  echo "Database file $DB must be in UNITE or SILVA format, exiting..."
+  exit 1
+fi
 echo "Memory size: "$MEM"mb"
 
 if [[ "$FORMAT" == "null" ]]
@@ -397,7 +402,6 @@ then
   echo "All checks passed, rerun without --check flag."
   exit 0
 fi
-
 if ! $COMBINE_ONLY
 then
   if $TRAIN
@@ -406,7 +410,7 @@ then
 
     echo "__________________________________________________________________________"
   	echo "Training SINTAX Classifier"
-    if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ]
+    if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ]
     then
     	"$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
     else
@@ -459,15 +463,18 @@ then
         exit 1
       else
         echo "RDP training error overcome, continuing with classification after SINTAX is retrained"
-        if [ $(echo "$SINTAXPATH" | grep -oP "(?<=usearch).*?(?=\.)") -lt 11 2> /dev/null ]
+        if [ $(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/') -lt 11 2> /dev/null ]
         then
           "$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
         else
           "$SINTAXPATH" -makeudb_usearch "${TFILES}/${base}"__UTAX.fasta -output ${TFILES}/sintax.db
         fi
       fi
+      if [ -f rdp_train.out ]
+      then
+        rm rdp_train.out
+      fi
     fi
-
     # The rRNAClassifier.properties file should be in one of these two places
     if [ -f "$CONSTAXPATH"/rRNAClassifier.properties ]
     then
@@ -506,7 +513,6 @@ then
     then
       module load BLAST
     fi
-
     # workaround code for blast getting stuck
     python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT"
     echo > "$TAX"/blast.out
@@ -550,6 +556,11 @@ then
   then
     echo "High Level Taxonomy Assignment"
     HL_FMT=$(python "$CONSTAXPATH"/detect_format.py -d "$HL_DB" 2>&1)
+    if [[ $HL_FMT == "INVALID" ]]
+    then
+      echo "High-level taxonomy database file $HL_DB must be in UNITE or SILVA format, exiting..."
+      exit 1
+    fi
     if $MSU_HPCC && ! $BLAST
     then
       module load BLAST
@@ -559,6 +570,8 @@ then
     rm "$TAX/"hl_formatted.fasta
     blastn -query "$FRM_INPUT" -db "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST -num_threads $NTHREADS -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out
     rm "$TAX/$(basename -- ${HL_DB%.fasta})"__BLAST.n*
+  else
+    echo ""
   fi
   rm "$FRM_INPUT"
 fi
diff --git a/constax_no_inputs.sh b/constax_no_inputs.sh
index ab831e5..0466a04 100644
--- a/constax_no_inputs.sh
+++ b/constax_no_inputs.sh
@@ -1,6 +1,6 @@
 #!/bin/bash -login
 
-VERSION=2.0.15; BUILD=0; PREFIX=placehold
+VERSION=2.0.16; BUILD=0; PREFIX=placehold
 
 echo "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier"
 echo "This software is distributed under MIT License"
@@ -12,7 +12,7 @@ echo ""
 echo "Please cite us as:"
 echo "CONSTAX2: Improved taxonomic classification of environmental DNA markers"
 echo "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci"
-echo "bioRxiv 2021.02.15.430803; doi: https://doi.org/10.1101/2021.02.15.430803"
+echo "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347"
 
 if $SHOW_VERSION
 then
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7728251..bfb56b1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,11 +18,11 @@
 # -- Project information -----------------------------------------------------
 
 project = 'CONSTAXv2'
-copyright = '2020, Julian A. Liber and Gian M. N. Benucci'
+copyright = '2021, Julian A. Liber and Gian M. N. Benucci'
 author = 'Julian A. Liber and Gian M. N. Benucci'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0.14'
+release = '2.0.16'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8e1a38c..1484a23 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -12,7 +12,7 @@ taxonomic units at a given taxonomic rank) is realized
 by generating consensus taxonomy of available classifiers
 with CONSTAX.
 
-CONSTAX 2.0.14 improves upon 1.0.0 with the following features:
+CONSTAX 2.0.16 improves upon 1.0.0 with the following features:
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 * **Updated software requirements, including Python 3 and Java 8**
@@ -56,7 +56,7 @@ CONSTAX 1.0.0 was authored by
 Reference
 ^^^^^^^^^
 
-`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 <https://doi.org/10.1093/bioinformatics/btab347>`_
+`Liber JA, Bonito G, Benucci GMN (2021) CONSTAX2: improved taxonomic classification of environmental DNA markers. Bioinformatics doi: 10.1093/bioinformatics/btab347 <https://academic.oup.com/bioinformatics/article/37/21/3941/6271412>`_
 
 `Gdanetz K, Benucci GMN, Vande Pol N, Bonito G (2017) CONSTAX: a tool for improved taxonomic resolution of environmental fungal ITS sequences. BMC Bioinformatics 18:538 doi 10.1186/s12859-017-1952-x <https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/s12859-017-1952-x>`_
 
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 93c3e0f..fe98fdc 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -25,15 +25,15 @@ If conda is not installed (you get an error which might include ``command not fo
 
           .. code-block:: text
 
-              wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-Linux-x86_64.sh
-              bash Miniconda3-py39_4.9.2-Linux-x86_64.sh
+              wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh
+              bash Miniconda3-py39_4.10.3-Linux-x86_64.sh
 
         .. tab:: OSX
 
           .. code-block:: text
 
-              curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-MacOSX-x86_64.sh
-              bash Miniconda3-py39_4.9.2-MacOSX-x86_64.sh
+              curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-MacOSX-x86_64.sh
+              bash Miniconda3-py39_4.10.3-MacOSX-x86_64.sh
 
 2. Follow the prompts.
 
diff --git a/docs/source/referenceDB.rst b/docs/source/referenceDB.rst
index bd2c5fd..7c9739d 100644
--- a/docs/source/referenceDB.rst
+++ b/docs/source/referenceDB.rst
@@ -3,18 +3,19 @@
 Suggested Reference Databases
 =============================
 
-Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate 
+Dependent on where your sequences originate (e.g. ITS, 16S, LSU), you will need to have an appropriate
 database with which to classify them.
 
-For Fungi or all Eukaryotes, the `UNITE <https://unite.ut.ee/>`_ database is preferred. The format of the reference database to use with 
-CONSTAX is one of those under the `General <https://unite.ut.ee/repository.php>`_ fasta format.
+For Fungi or all Eukaryotes, the `UNITE <https://unite.ut.ee/>`_ database is preferred. The format of the reference database to use with
+CONSTAX is one of those under the `General <https://unite.ut.ee/repository.php>`_ fasta format. For the latest release (10.05.2021),
+training with 32GB of RAM for Fungi only or 40GB for all Eukaryotes should be sufficient.
 
-For Bacteria and Archaea, we recommend the `SILVA <https://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_ reference database. 
+For Bacteria and Archaea, we recommend the `SILVA <https://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_ reference database.
 The ``SILVA_XXX_SSURef_tax_silva.fasta.gz`` file can be gunzip-ped and used.
 
 .. Note::
-   SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used. 
-   Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue 
-   128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using 
+   SILVA taxonomy is not assigned by Linnean ranks (*Kingdom*, *Phylum*, etc.), so instead placeholder ranks 1-n are used.
+   Also, the size of the SILVA database means that a server/cluster is required to train the classifier becasue
+   128GB RAM for the RDP training are required. If you have a computer with 32GB of RAM, you may be able to train using
    the UNITE database. If you cannot train locally for UNITE, the RDP files can be downloaded from `here <https://github.com/liberjul/CONSTAXv2_data/tree/master/sh_general_release_fungi_35077_RepS_04.02.2020>`_.
    The ``genus_wordConditionalProbList.txt.gz`` file should be gunzip-ped after downloading.
diff --git a/docs/source/tutorial1.rst b/docs/source/tutorial1.rst
index 5c6491d..1113df4 100644
--- a/docs/source/tutorial1.rst
+++ b/docs/source/tutorial1.rst
@@ -60,7 +60,7 @@ where you will add the absolute PATHs for the required software. VSEARCH, BLAST,
    :align: center
 
 .. warning::
-    Remember to navigate through your anaconda installation and find the ``constax-2.0.14/`` folder.
+    Remember to navigate through your anaconda installation and find the ``constax-2.0.16/`` folder.
     This is the only way to make CONSTAX locate the needed python scripts.
 
 Before you can run CONSTAX you need to activate your anaconda environment (alternatively,
diff --git a/docs/source/tutorial2.rst b/docs/source/tutorial2.rst
index a74ea7d..68cbce7 100644
--- a/docs/source/tutorial2.rst
+++ b/docs/source/tutorial2.rst
@@ -17,7 +17,7 @@ The code will look like as below
     #SBATCH --time=10:00:00
     #SBATCH --ntasks=1
     #SBATCH --cpus-per-task=20
-    #SBATCH --mem=128G
+    #SBATCH --mem=32G
     #SBATCH --job-name constax_fungi
     #SBACTH -A shade-cole-bonito
 
diff --git a/docs/source/tutorial5.rst b/docs/source/tutorial5.rst
index e76b355..36ba4ad 100644
--- a/docs/source/tutorial5.rst
+++ b/docs/source/tutorial5.rst
@@ -4,7 +4,11 @@ Downloading the UNITE database
 This tutorial is about how to obtain a reference database for classification of fungi or
 eukaryotes in general. These will be downloaded from `UNITE <https://unite.ut.ee/repository.php>`_.
 
-For classification of fungi, we have had tested with the `RepS 35077 General Release FASTA <https://plutof.ut.ee/#/doi/10.15156/BIO/786368>`_.
+For classification of fungi, we have had tested with the `RepS 44343 General Release FASTA <https://plutof.ut.ee/#/doi/10.15156/BIO/1280049>`_.
+
+The eukaryote database with `96423 RepS sequences <https://doi.org/10.15156/BIO/1280127>`_ provides better information about the kingdom
+classification of the sequence, but requires slightly more RAM (~40GB). Using the ``--high_level_taxonomy`` option can provide a similar result
+but with reduced RAM requirements.
 
 .. code-block:: language
 
@@ -14,6 +18,6 @@ For classification of fungi, we have had tested with the `RepS 35077 General Rel
 Use the FASTA called ``sh_general_release_fungi_35077_RepS_04.02.2020.fasta`` within the expanded directory for
 your fungal reference database, specified with ``-d`` or ``--db`` in your ``constax`` command.
 
-For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/786370 <https://plutof.ut.ee/#/doi/10.15156/BIO/786370>`_.
+For the ``--high_level_db`` option, the eukaryotes database found here `https://plutof.ut.ee/#/doi/10.15156/BIO/1280127 <https://plutof.ut.ee/#/doi/10.15156/BIO/1280127>`_.
 can be used. This will help to remove non-fungal OTUs from your dataset, or can be used as the main database (``-d, --db``)
 for projects amplifying other eukaryotes.
diff --git a/subscript_fasta_addFullLineage.py b/subscript_fasta_addFullLineage.py
index 11e16a7..7aa5dd6 100644
--- a/subscript_fasta_addFullLineage.py
+++ b/subscript_fasta_addFullLineage.py
@@ -1,15 +1,21 @@
 #!/usr/bin/python
 import sys, os
 
-def RDP_head_to_UTAX(lineage):
+def RDP_head_to_UTAX(lineage, format):
 	taxa = lineage.split(";")[1:]
 	out = ""
 	for r in range(len(taxa)):
-		out = F"{out}{'dkpcofgs'[r]}:{taxa[r]},"
+		if format == "UNITE":
+			out = F"{out}{'dkpcofg'[r]}:{taxa[r]},"
+		else:
+			out = F"{out}{'dkpcofgs'[r]}:{taxa[r]},"
+
 	return out[:-1]
-def addFullLineage(filebase):
+def addFullLineage(filebase, format):
 	print("\n\tAdding Full Lineage\n\n")
-	f1 = open(filebase+"__RDP_taxonomy_headers.txt", 'r').readlines()
+
+	with open(filebase+"__RDP_taxonomy_headers.txt", 'r') as f:
+		f1 = f.readlines()
 	hash = {} #lineage map
 
 	output_RDP = open(filebase+"__RDP_trained.fasta", 'w')
@@ -20,7 +26,8 @@ def addFullLineage(filebase):
 		ID = ID.strip(">")
 		hash[ID] = lineage
 
-	f2 = open(filebase+"__RDP.fasta", 'r').readlines()
+	with open(filebase+"__RDP.fasta", 'r') as f:
+		f2 = f.readlines()
 	for line in f2:
 		if line[0] == '>':
 			ID = line.strip().replace('>', '')
@@ -30,7 +37,7 @@ def addFullLineage(filebase):
 				print(ID, 'not in taxonomy file')
 				sys.exit()
 			output_RDP.write(F"{line.strip()}\t{lineage}\n")
-			output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage)};\n")
+			output_UTAX.write(F"{line.strip()};tax={RDP_head_to_UTAX(lineage, format)};\n")
 		else:
 			output_RDP.write(line.strip()+"\n")
 			output_UTAX.write(line.strip()+"\n")