Merge remote-tracking branch 'origin/master' into remotes/origin/issue55

Gaius-Augustus · Sep 13, 2019 · 3582797 · 3582797
2 parents 37967be + 40beddc
commit 3582797
Show file tree

Hide file tree

Showing 87 changed files with 2,029 additions and 516 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -54,6 +54,7 @@ RUN make
 WORKDIR "/root/augustus"
 RUN make
 RUN make install
+ENV PATH="/root/augustus/bin:${PATH}"
 
 # Test AUGUSTUS
 RUN make test
diff --git a/README.md b/README.md
@@ -113,13 +113,15 @@ If this environment variable is not set, then the programs will look in the path
 
 AUGUSTUS can also be run through a web-interface at http://bioinf.uni-greifswald.de/augustus/ and a web service at http://bioinf.uni-greifswald.de/webaugustus/index.gsp.
 
-# REFERENCES
+# REFERENCES AND DOCUMENTATION
 
 Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008).
 [Using native and syntenically mapped cDNA alignments to improve de novo gene finding](https://academic.oup.com/bioinformatics/article/24/5/637/202844). Bioinformatics, 24(5), pages 637–644, doi: 10.1093/bioinformatics/btn013
 
 For further references see [docs/REFERENCES.md](docs/REFERENCES.md)
 
+[3 book chapters with command line walkthroughs](https://math-inf.uni-greifswald.de/en/department/about-us/employees/prof-dr-mario-stanke-english/publications/#c302071)
+
 # LICENSES
 
 All source code, i.e.

diff --git a/docs/REFERENCES.md b/docs/REFERENCES.md
@@ -1,5 +1,11 @@
 # REFERENCES
 
+Stefanie Nachtweide and Mario Stanke (2019), [Multi-Genome Annotation with AUGUSTUS](https://www.ncbi.nlm.nih.gov/pubmed/31020558). Methods Mol Biol., 1962:139-160. doi: 10.1007/978-1-4939-9173-0_8. PubMed PMID: 31020558.
+
+Hoff KJ, Lomsadze A, Borodovsky M, Stanke M. (2019), [Whole-Genome Annotation with BRAKER](https://www.ncbi.nlm.nih.gov/pubmed/31020555). Methods Mol Biol., 1962:65-95. doi: 10.1007/978-1-4939-9173-0_5. PubMed PMID: 31020555.
+
+Hoff KJ, Stanke M. (2018). [Predicting genes in single genomes with AUGUSTUS](https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.57). Current Protocols in Bioinformatics, e57. doi: 10.1002/cpbi.57. [manuscript (PDF)](https://math-inf.uni-greifswald.de/fileadmin/uni-greifswald/fakultaet/mnf/mathinf/stanke/augustus_wrp.pdf)
+
 Stefanie König, Lars Romoth, Lizzy Gerischer, and Mario Stanke (2016)
 [Simultaneous gene finding in multiple genomes](https://academic.oup.com/bioinformatics/article/32/22/3388/2525611). Bioinformatics, 32 (22): 3388-3395, doi: 10.1093/bioinformatics/btw494
 

diff --git a/scripts/README.autoAug b/scripts/README.autoAug
@@ -57,7 +57,7 @@ can then submit them manually using your batch job submission system.
 		3. Software Components
 		-----------------------------------
 
-AUGUSTUS: Download from http://augustus.gobics.de/
+AUGUSTUS: Download from https://github.com/Gaius-Augustus/Augustus
 
 BLAT:     You get Jim Kent's alignment program at http://users.soe.ucsc.edu/~kent/src/
           Also install pslCDnaFilter which is part of the Jim Kent src tree
@@ -68,8 +68,8 @@ PASA:     Download the Program to Assemble Spliced Alignments from
           You do not need PASA if you already have a training set of genes for your species.
 
 SCIPIO:   Install SCIPIO if you have training genes in the form of protein sequences.
-	  SCIPIO can then find their gene structures in the genome. Most users will not need this.
-	  http://www.webscipio.org/
+          SCIPIO can then find their gene structures in the genome. Most users will not need this.
+          http://www.webscipio.org/
 
 
 		4. Installation and Configuration
@@ -144,7 +144,7 @@ Situation 2:
 You can just input a cDNA sequence file in FASTA format, and let PASA extract
 a training set of genes. You need to install PASA for this separately.
 
->autoAug.pl -g genome.fa --species=yourSpecies -c cdna.fa -v -v --pasa --useexisting
+>autoAug.pl -g genome.fa --species=yourSpecies -c cdna.fa -v -v --pasa --useGMAPforPASA --useexisting
 
 Then follow the output prompts. If your cluster is set as in section 4 you can also use the option --noninteractive.
 
@@ -173,7 +173,10 @@ The keywords (such as genome) can be abbreviated (e.g. "ge") to the extend that
   cdna.fa is a cdna sequence file in the FASTA format (typically EST sequences or 454)
 
 --pasa
-  This argument swithes the PASA function on for creating a training set of genes from the cDNA, default: off.
+  This argument switches the PASA function on for creating a training set of genes from the cDNA, default: off.
+
+--useGMAPforPASA
+  use GMAP instead of BLAT in the PASA run
 
 --hints=hintsfile
   hintsfile is a file with extrinsic evidence about the location and structure of genes, see documentation of AUGUSTUS.
@@ -258,7 +261,7 @@ autoAugTrain.pl [OPTIONS] --species=sname --genome=genome.fa --species=sname --u
 
 --trainingset=genes.gb      
   genes.gb is a file with training genes a Genbank format. For examples for the exact format look at 
-  the files in http://augustus.gobics.de/datasets/
+  the files in http://bioinf.uni-greifswald.de/webaugustus/datasets.gsp
 
 --trainingset=genes.gff     
   genes.gff is a file with training genes in GFF format

diff --git a/scripts/aa2nonred.pl b/scripts/aa2nonred.pl
@@ -11,7 +11,7 @@
 # pipeline may fail upon custom modification of this script.
 # In case of doubt, contact [email protected]
 #
-# Mario Stanke & Katharina Hoff, last modification on Feb 19th 2018
+# Mario Stanke & Katharina Hoff, last modification on May 31 2019
 
 use strict;
 use Getopt::Long;
@@ -30,8 +30,11 @@
 #
 my $max_percent_id = 0.8;
 my $BLAST_PATH;
+my $DIAMOND_PATH;
 my $blast_path;
+my $diamond_path;
 my $CPU = 1;
+my $diamond;
 my $v = 0;
 my $help;
 
@@ -40,17 +43,21 @@
 $usage .= "In output.fa the percent identity value between each pair of \n";
 $usage .= "When removing redundant sequences, priority is given to the sequence occuring last.\n";
 $usage .= "Options:\n";
-$usage .= "--maxid=f       maximum percent identity between to sequences\n";
-$usage .= "                (#identical aa) / (length of shorter sequence) default: 0.8\n";
-$usage .= "--BLAST_PATH=s  path to blast (only implemented for NCBI BLAST)\n";
-$usage .= "--cores=n       number of cores to be used by NCBI BLAST\n";
-$usage .= "--verbosity=n   verbosity level for information printed to stdout\n";
-$usage .= "--help          print this help message\n";
+$usage .= "--maxid=f         maximum percent identity between to sequences\n";
+$usage .= "                  (#identical aa) / (length of shorter sequence) default: 0.8\n";
+$usage .= "--BLAST_PATH=s    path to blast (only implemented for NCBI BLAST)\n";
+$usage .= "--DIAMOND_PATH=s  path to diamond\n"; 
+$usage .= "--cores=n         number of cores to be used by NCBI BLAST or DIAMOND\n";
+$usage .= "--diamond         use DIAMOND istead of NCBI BLAST\n";
+$usage .= "--verbosity=n     verbosity level for information printed to stdout\n";
+$usage .= "--help            print this help message\n";
 
 GetOptions(
     'maxid:f'  => \$max_percent_id,
     'BLAST_PATH=s' => \$blast_path,
+    'DIAMOND_PATH=s' => \$diamond_path,
     'cores=i'  => \$CPU,
+    'diamond!' => \$diamond,
     'verbosity=i' => \$v,
     'help!'    => \$help
 );
@@ -66,8 +73,11 @@
     exit(1);
 }
 
-
-set_BLAST_PATH();
+if($diamond){
+    set_DIAMOND_PATH();
+}else{
+    set_BLAST_PATH();
+}
 
 
 my $inputfilename  = $ARGV[0];
@@ -82,7 +92,7 @@
 my $splitDir; # for parallelization
 my @splitFiles;
 my $SPLITF;
-if ( $CPU > 1 ) {
+if ( $CPU > 1 && not($diamond)) {
 
     my $nFastaEntries = 0;
     # counter number of fasta entries
@@ -165,36 +175,56 @@
 
 my $tempoutfile = "$inputfilename.blastout";
 
-## NCBI blast
-system("$BLAST_PATH/makeblastdb -in $tempdbname -dbtype prot -parse_seqids -out $tempdbname");
-if ( $CPU == 1 ) {
-    system("$BLAST_PATH/blastp -query $tempdbname -db $tempdbname > $tempoutfile");
+if($diamond){
+    if($CPU > 1){
+        system("$DIAMOND_PATH/diamond makedb --in $tempdbname -d $tempdbname --threads $CPU");
+    }else{
+        system("$DIAMOND_PATH/diamond makedb --in $tempdbname -d $tempdbname --threads 1");
+    }
 }else{
-    my $pm = new Parallel::ForkManager($CPU);
-    foreach ( @splitFiles ) {
-        my $pid = $pm->start and next;
-        system("$BLAST_PATH/blastp -query $tempdbname -db $tempdbname > $_.blastout");
-        $pm->finish;
+    ## NCBI blast
+    system("$BLAST_PATH/makeblastdb -in $tempdbname -dbtype prot -parse_seqids -out $tempdbname");
+}
+
+if ( $CPU == 1 ) {
+    if($diamond){
+        system("$DIAMOND_PATH/diamond blastp --db $tempdbname --outfmt 0 --query $tempdbname --out $tempoutfile")
+
+    }else{
+        system("$BLAST_PATH/blastp -query $tempdbname -db $tempdbname > $tempoutfile");
     }
-    $pm->wait_all_children;
-    foreach ( @splitFiles ) {
-        system("cat $_.blastout >> $tempoutfile");
+}else{
+    if($diamond){
+        system("$DIAMOND_PATH/diamond blastp --db $tempdbname --outfmt 0 --query $tempdbname --threads $CPU --out $tempoutfile");
+    }else{
+        my $pm = new Parallel::ForkManager($CPU);
+        foreach ( @splitFiles ) {
+            my $pid = $pm->start and next;
+            system("$BLAST_PATH/blastp -query $tempdbname -db $tempdbname > $_.blastout");
+            $pm->finish;
+        }
+        $pm->wait_all_children;
+        foreach ( @splitFiles ) {
+            system("cat $_.blastout >> $tempoutfile");
+        }
     }
 }
 
 
 ###########################################################################################
 #
-# parse the blast output
+# parse the blast/diamond output
 #
 ###########################################################################################
 
 open( BLASTOUT, "<$tempoutfile" ) or die("ERROR in file " . __FILE__ ." at line ". __LINE__ ."\nCould not open $tempoutfile!\n");
 $/ = "\nQuery= ";
 my ( $query, $target, $qlen, $tlen, $numid, $minlen );
 while (<BLASTOUT>) {
-    next unless / producing /;
-    $_ =~ m/(\S+)\n\nLength=(\d+)/;
+    if(not($diamond)){
+        next unless / producing /;
+    }
+    $_ =~ m/(\S+)\n+Length=(\d+)/;
     $query = $1;
     $qlen  = $2;
     print STDOUT "query=$query, qlen=$qlen\n" if ($v>0);
@@ -217,6 +247,7 @@
             }
         }
     }
+
 }
 close (BLASTOUT) or die("ERROR in file " . __FILE__ ." at line ". __LINE__ ."\nCould not close $tempoutfile!\n");
 
@@ -238,14 +269,18 @@
 #
 ###########################################################################################
 unlink ( rel2abs($tempdbname) );
-unlink ( rel2abs($tempdbname).".phr" );
-unlink ( rel2abs($tempdbname).".pin" );
-unlink ( rel2abs($tempdbname).".pog" );
-unlink ( rel2abs($tempdbname).".psd" );
-unlink ( rel2abs($tempdbname).".psi" );
-unlink ( rel2abs($tempdbname).".psq" );
+if(not($diamond)){
+    unlink ( rel2abs($tempdbname).".phr" );
+    unlink ( rel2abs($tempdbname).".pin" );
+    unlink ( rel2abs($tempdbname).".pog" );
+    unlink ( rel2abs($tempdbname).".psd" );
+    unlink ( rel2abs($tempdbname).".psi" );
+    unlink ( rel2abs($tempdbname).".psq" );
+}else{
+    unlink (rel2abs($tempdbname).".dmnd" );
+}
 unlink ( rel2abs($tempoutfile) );
-if ($CPU > 1) {
+if ($CPU > 1 && not($diamond)) {
     rmtree( ["$splitDir"] );
 }
 
@@ -369,3 +404,121 @@ sub set_BLAST_PATH {
         exit(1);
     }
 }
+
+###########################################################################################
+# set_DIAMOND_PATH: find the directory of the diamond executable and store it in $DIAMOND_PATH.
+# Sources tried: $ENV{DIAMOND_PATH}, then --DIAMOND_PATH (replaces the ENV value), then 'which diamond'.
+# Calls exit(1) if $DIAMOND_PATH stays undefined or $DIAMOND_PATH/diamond is not executable.
+###########################################################################################
+sub set_DIAMOND_PATH {
+    my $prtStr;    # scratch buffer for the status/error messages printed below
+    # 1) try to get path from ENV; a defined but non-existent path is skipped without any message
+    if ( defined( $ENV{'DIAMOND_PATH'} ) ) {
+        if ( -e $ENV{'DIAMOND_PATH'} ) {    # -e only tests existence, not that it is a directory
+            $prtStr
+                = "\# "
+                . (localtime)
+                . ": Found environment variable \$DIAMOND_PATH.\n";
+            print STDOUT $prtStr;
+            $DIAMOND_PATH = $ENV{'DIAMOND_PATH'};
+        }
+    }
+    else {
+        $prtStr
+            = "\# "
+            . (localtime)
+            . ": Did not find environment variable \$DIAMOND_PATH\n";
+        print STDOUT $prtStr;
+    }
+
+    # 2) try to get path from command line; a valid directory here replaces the value taken from ENV
+    if ( defined($diamond_path) ) {
+        my $last_char = substr( $diamond_path, -1 );
+        if ( $last_char eq "\/" ) {
+            chop($diamond_path);    # drop a single trailing slash
+        }
+        if ( -d $diamond_path ) {
+            $prtStr
+                = "\# "
+                . (localtime)
+                . ": Setting \$DIAMOND_PATH to command line argument ";
+            $prtStr .= "--DIAMOND_PATH value $diamond_path.\n";
+            print STDOUT $prtStr;
+            $DIAMOND_PATH = $diamond_path;
+        }
+        else {
+            $prtStr
+                = "\# "
+                . (localtime)
+                . ": WARNING: Command line argument --DIAMOND_PATH was ";
+            $prtStr
+                .= "supplied but value $diamond_path is not a directory. Will not set ";
+            $prtStr .= "\$DIAMOND_PATH to $diamond_path!\n";
+            print STDOUT $prtStr;
+        }
+    }
+
+    # 3) try to guess from $PATH, only if neither source above produced a non-empty value
+    if ( not( defined($DIAMOND_PATH) )
+        || length($DIAMOND_PATH) == 0 )
+    {
+        $prtStr
+            = "\# "
+            . (localtime)
+            . ": Trying to guess \$DIAMOND_PATH from location of diamond";
+        $prtStr .= " executable that is available in your \$PATH.\n";
+        print STDOUT $prtStr;
+        my $epath = which 'diamond';    # NOTE(review): presumably File::Which; $epath may be undef if diamond is not on $PATH - confirm dirname() copes with that
+        if ( -d dirname($epath) ) {
+            $prtStr
+                = "\# "
+                . (localtime)
+                . ": Setting \$DIAMOND_PATH to "
+                . dirname($epath) . "\n";
+            print STDOUT $prtStr;
+            $DIAMOND_PATH = dirname($epath);
+        }
+        else {
+            $prtStr
+                = "\# "
+                . (localtime)
+                . ": WARNING: Guessing the location of \$DIAMOND_PATH ";
+            $prtStr
+                .= "failed. " . dirname($epath) . " is not a directory!\n";
+            print STDOUT $prtStr;
+        }
+    }
+
+    if ( not( defined($DIAMOND_PATH) ) ) {    # all three sources failed: explain the options on STDERR and abort
+        my $diamond_err;
+        $diamond_err .= "There are 3 alternative ways to set this variable for "
+                     .  " aa2nonred.pl:\n"
+                     .  "   a) provide command-line argument "
+                     .  "--DIAMOND_PATH=/your/path\n"
+                     .  "   b) use an existing environment variable "
+                     .  "\$DIAMOND_PATH\n"
+                     .  "      for setting the environment variable, run\n"
+                     .  "           export DIAMOND_PATH=/your/path\n"
+                     .  "      in your shell. You may append this to your "
+                     .  ".bashrc or .profile file in\n"
+                     .  "      order to make the variable available to all your "
+                     .  "bash sessions.\n"
+                     .  "   c) aa2nonred.pl can try guessing the location of "
+                     .  "\$DIAMOND_PATH from the\n"
+                     .  "      location of a diamond executable that is "
+                     .  "available in your \$PATH variable.\n"
+                     .  "      If you try to rely on this option, you can check "
+                     .  "by typing\n"
+                     .  "           which diamond\n"
+                     .  "      in your shell, whether there is a diamond "
+                     .  "executable in your \$PATH\n";
+        $prtStr = "\# " . (localtime) . " ERROR in file " . __FILE__ ." at line ". __LINE__ ."\n\$DIAMOND_PATH not set!\n";
+        print STDERR $prtStr;
+        print STDERR $diamond_err;
+        exit(1);
+    }
+    if ( not ( -x "$DIAMOND_PATH/diamond" ) ) {    # the chosen directory must actually contain an executable diamond
+        print STDERR "\# " . (localtime) . " ERROR in file " . __FILE__ ." at line ". __LINE__ ."\n$DIAMOND_PATH/diamond is not an executable file!\n";
+        exit(1);
+    }
+}
diff --git a/scripts/augustus2browser.pl b/scripts/augustus2browser.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 #
 # This takes the AUGUSTUS output in the standard input
 # and outputs to standard output a file with UCSC browser gtf format

diff --git a/scripts/augustus2gbrowse.pl b/scripts/augustus2gbrowse.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 # convert AUGUSTUS output to Gbrowse format GFF file
 # Mario Stanke