VSEARCH 1.10.0: Improved fastq_mergepairs and more

torognes · Feb 11, 2016 · 32a72c8 · 32a72c8
1 parent 50eebf2
commit 32a72c8
Show file tree

Hide file tree

Showing 20 changed files with 1,433 additions and 5,609 deletions.
diff --git a/README.md b/README.md
@@ -215,6 +215,7 @@ File | Description
 **dbindex.cc** | Indexes the database by identifying unique kmers in the sequences
 **derep.cc** | Dereplication
 **dynlib.cc** | Dynamic loading of compression libraries
+**eestats.cc** | Produce statistics for fastq_eestats command
 **fasta.cc** | FASTA file parser
 **fastq.cc** | FASTQ file parser
 **fastqops.cc** | FASTQ file statistics etc
@@ -226,7 +227,7 @@ File | Description
 **mergepairs.cc** | Paired-end read merging
 **minheap.cc** | A minheap implementation for the list of top kmer matches
 **msa.cc** | Simple multiple sequence alignment and consensus sequence computation for clusters
-**pvalue.h** | Statistical data (from PEAR) used for significance testing of merged paired-end reads
+**rerep.cc** | Rereplication
 **results.cc** | Output results in various formats (alnout, userout, blast6, uc)
 **search.cc** | Implements search using global alignment
 **searchcore.cc** | Core search functions for searching, clustering and chimera detection

diff --git a/configure b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for vsearch 1.10.1.
+# Generated by GNU Autoconf 2.69 for vsearch 1.10.0.
 #
 # Report bugs to <[email protected]>.
 #
@@ -580,8 +580,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='vsearch'
 PACKAGE_TARNAME='vsearch'
-PACKAGE_VERSION='1.10.1'
-PACKAGE_STRING='vsearch 1.10.1'
+PACKAGE_VERSION='1.10.0'
+PACKAGE_STRING='vsearch 1.10.0'
 PACKAGE_BUGREPORT='[email protected]'
 PACKAGE_URL=''
 
@@ -1291,7 +1291,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures vsearch 1.10.1 to adapt to many kinds of systems.
+\`configure' configures vsearch 1.10.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1357,7 +1357,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of vsearch 1.10.1:";;
+     short | recursive ) echo "Configuration of vsearch 1.10.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1453,7 +1453,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-vsearch configure 1.10.1
+vsearch configure 1.10.0
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2044,7 +2044,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by vsearch $as_me 1.10.1, which was
+It was created by vsearch $as_me 1.10.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2907,7 +2907,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='vsearch'
- VERSION='1.10.1'
+ VERSION='1.10.0'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -6364,7 +6364,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by vsearch $as_me 1.10.1, which was
+This file was extended by vsearch $as_me 1.10.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6430,7 +6430,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-vsearch config.status 1.10.1
+vsearch config.status 1.10.0
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 

diff --git a/configure.ac b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.63])
-AC_INIT([vsearch], [1.10.1], [[email protected]])
+AC_INIT([vsearch], [1.10.0], [[email protected]])
 AM_INIT_AUTOMAKE([subdir-objects])
 AC_LANG([C++])
 AC_CONFIG_SRCDIR([src/vsearch.cc])

diff --git a/man/Makefile.in b/man/Makefile.in
@@ -263,9 +263,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign man/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu man/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --foreign man/Makefile
+	  $(AUTOMAKE) --gnu man/Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
 	  *config.status*) \

diff --git a/man/vsearch.1 b/man/vsearch.1
@@ -1,10 +1,11 @@
 .\" ============================================================================
-.TH vsearch 1 "January 26, 2016" "version 1.10.1" "USER COMMANDS"
+.TH vsearch 1 "February 11, 2016" "version 1.10.0" "USER COMMANDS"
 .\" ============================================================================
 .SH NAME
-vsearch \(em chimera detection, clustering, dereplication, FASTA/FASTQ
-file processing, masking, pairwise alignment, searching, shuffling,
-sorting and subsampling of amplicons from metagenomic projects.
+vsearch \(em chimera detection, clustering, dereplication and
+rereplication, FASTA/FASTQ file processing, masking, pairwise
+alignment, searching, shuffling, sorting and subsampling of amplicons
+from metagenomic projects.
 .\" ============================================================================
 .SH SYNOPSIS
 .\" left justified, ragged right
@@ -29,11 +30,14 @@ Clustering:
 \-\-id \fIreal\fR [\fIoptions\fR]
 .PP
 .RE
-Dereplication:
+Dereplication and rereplication:
 .RS
 \fBvsearch\fR (\-\-derep_fulllength | \-\-derep_prefix)
 \fIfastafile\fR (\-\-output | \-\-uc) \fIoutputfile\fR [\fIoptions\fR]
 .PP
+\fBvsearch\fR (\-\-rereplicate) \fIfastafile\fR
+\-\-output \fIoutputfile\fR [\fIoptions\fR]
+.PP
 .RE
 FASTA/FASTQ file processing:
 .RS
@@ -42,6 +46,9 @@ FASTA/FASTQ file processing:
 \fBvsearch\fR \-\-fastq_convert \fIfastqfile\fR \-\-fastqout
 \fIoutputfile\fR [\fIoptions\fR]
 .PP
+\fBvsearch\fR \-\-fastq_eestats \fIfastqfile\fR
+\-\-output \fIoutputfile\fR [\fIoptions\fR]
+.PP
 \fBvsearch\fR \-\-fastq_filter \fIfastqfile\fR (\-\-fastaout |
 \-\-fastaout_discarded | \-\-fastqout | \-\-fastqout_discarded)
 \fIoutputfile\fR [\fIoptions\fR]
@@ -127,9 +134,8 @@ memory requirements.
 several nucleotide sequences. For each sequence, the sequence
 identifier is defined as the string comprised between the ">" (or "@")
 symbol and the first space, tab or the end of the line, whichever
-comes first. Additionally, if the fasta header line starts with
-">[;]size=\fIinteger\fR;label", contains
-">label;size=\fIinteger\fR;label" or ends with
+comes first. Additionally, if the fasta header line matches
+">[;]size=\fIinteger\fR;label", ">label;size=\fIinteger\fR;label" or
 ">label;size=\fIinteger\fR[;]", \fBvsearch\fR will remove the pattern
 [;]size=\fIinteger\fR[;] from the header and interpret \fIinteger\fR
 as the number of occurrences (or abundance) of the sequence in the
@@ -173,8 +179,10 @@ be a regular file, not a stream.
 .SS Options
 \fBvsearch\fR recognizes a large number of command-line options. For
 easier navigation, options are grouped below by theme (chimera
-detection, clustering, dereplication, masking, shuffling, sorting, and
-searching). We start with general options that apply to all themes.
+detection, clustering, dereplication and rereplication, FASTA/FASTQ
+file processing, masking, pairwise alignment, searching, shuffling,
+sorting, and subsampling). We start with the general options that
+apply to all themes.
 .PP
 General options:
 .RS
@@ -543,19 +551,22 @@ Mask regions in sequences using the
 becomes case sensitive. The default is to mask using \fIdust\fR.
 .TP
 .BI \-\-relabel \0string
-Please see the description of the same option under Chimera detection
-for details.
+Relabel sequence identifiers in the output files produced by
+\-\-consout, \-\-profile and \-\-centroids options. Please see the
+description of the same option under Chimera detection for details.
 .TP
 .B \-\-relabel_keep
 When relabelling, keep the old identifier in the header after a space.
 .TP
 .BI \-\-relabel_md5
-Please see the description of the same option under Chimera detection
-for details.
+Relabel sequence identifiers in the output files produced by
+\-\-consout, \-\-profile and \-\-centroids options. Please see the
+description of the same option under Chimera detection for details.
 .TP
 .BI \-\-relabel_sha1
-Please see the description of the same option under Chimera detection
-for details.
+Relabel sequence identifiers in the output files produced by
+\-\-consout, \-\-profile and \-\-centroids options. Please see the
+description of the same option under Chimera detection for details.
 .TP
 .B \-\-sizein
 Take into account the abundance annotations present in the input fasta
@@ -607,7 +618,7 @@ Most searching options as well as score filtering, gap penalties and masking als
 .RE
 .PP
 .\" ----------------------------------------------------------------------------
-Dereplication options:
+Dereplication and rereplication options:
 .RS
 .TP 9
 .BI \-\-derep_fulllength \0filename
@@ -656,6 +667,15 @@ for details.
 Please see the description of the same option under Chimera detection
 for details.
 .TP
+.BI \-\-rereplicate \0filename
+Duplicate each sequence the number of times indicated by the abundance
+of each sequence in the specified file. The sequence labels will be
+identical for the same sequence, unless \-\-relabel, \-\-relabel_sha1
+or \-\-relabel_md5 is used to create unique labels. Output will be
+written to the file specified with the \-\-output option, in FASTA
+format. The output file will not contain abundance information unless
+\-\-sizeout is specified, in which case an abundance of 1 is used.
+.TP
 .B \-\-sizein
 Take into account the abundance annotations present in the input fasta
 file (search for the pattern "[>;]size=\fIinteger\fR[;]" in sequence
@@ -697,12 +717,13 @@ or reverse complement sequences in FASTA or FASTQ files. The
 \-\-fastq_chars command can be used to analyse FASTQ files to identify
 the type of FASTQ file and the range of quality score values used. To
 convert between different FASTQ file variants, use the
-\-\-fastq_convert command. A statistical analysis of the quality and
-length of the sequences in a FASTQ file is performed with the
-\-\-fastq_stats command. Sequences may be shortened, filtered and
-converted by the \-\-fastq_filter command. Paired-end reads can be
-merged using the \-\-fastq_mergepairs command. Finally, the
-\-\-fastx_revcomp command will reverse complement sequences.
+\-\-fastq_convert command. Statistical analysis of the quality and
+length of the sequences in a FASTQ file may be performed with the
+\-\-fastq_stats and \-\-fastq_eestats commands. Sequences may be
+shortened, filtered and converted by the \-\-fastq_filter
+command. Paired-end reads can be merged using the \-\-fastq_mergepairs
+command. Finally, the \-\-fastx_revcomp command will reverse
+complement sequences.
 .PP
 .TP 9
 .B \-\-eeout
@@ -785,6 +806,23 @@ specified with the \-\-fastqout option.
 When writing FASTQ file output, include the number of expected errors
 in the sequence header. Used with \-\-fastq_mergepairs.
 .TP
+.BI \-\-fastq_eestats \0filename
+Analyse a FASTQ file and report statistics on the sequence lengths,
+distribution of quality scores, error probabilities and expected
+accumulated errors. The output is a file of tab-separated values with
+one line for each position. The values included are the 1-based
+position, number of sequences that include the position, percentage
+of sequences that include the position, followed by columns that
+include information about the distribution of quality scores in each
+position, error probabilities in each position, and finally the
+expected number of accumulated errors from the beginning of the
+sequence up to the current position.  For each of these
+distributions, the following statistics are included: minimum value,
+lower quartile, median, mean, upper quartile, and maximum value. The
+type of FASTQ file may be specified with \-\-fastq_ascii,
+\-\-fastq_qmin and \-\-fastq_qmax. The output is written to the output
+file specified with the \-\-output option.
+.TP
 .BI \-\-fastq_filter \0filename
 Shorten and/or filter the sequences in the given FASTQ file and output
 the remaining sequences to the FASTQ file specified with the
@@ -806,8 +844,8 @@ errors in each sequence.
 .TP
 .BI \-\-fastq_maxdiffs\~ "positive integer"
 Specify the maximum number of non-matching nucleotides allowed in the
-overlap region with the \-\-fastq_mergepairs command. By default there
-is no limit.
+overlap region with the \-\-fastq_mergepairs command. The default
+limit is 5.
 .TP
 .BI \-\-fastq_maxee\~ real
 With the \-\-fastq_filter and \-\-fastq_mergepairs commands, discard
@@ -826,32 +864,30 @@ With the \-\-fastq_filter and \-\-fastq_mergepairs commands, discard
 sequences with more than the specified number of N's.
 .TP
 .BI \-\-fastq_mergepairs\0 filename
-Merge paired-end sequence reads into one sequence using a method
-similar to the PEAR algorithm (Zhang, Kobert, Flouri & Stamatakis,
-2014), except that base frequencies are not empirically estimated. The
-algorithm employs a statistical test to identify merged reads that
-have a significantly good alignment score. The forward reads are
-specified as the argument to this option and the reverse reads are
-specified with the \-\-reverse option. The merged sequences are output
-to the file(s) specified with the \-\-fastaout or \-\-fastqout
-options. The non-merged reads can be output to the files specified
-with the \-\-fastaout_notmerged_fwd, \-\-fastaout_notmerged_rev,
-\-\-fastqout_notmerged_fwd and \-\-fastqout_notmerged_rev
-options. Statistics may be output to the file specified with the
-\-\-eetabbedout option. Sequences will be truncated as specified with
-the \-\-fastq_truncqual option to remove low-quality bases in the 3'
-end. Sequences shorter than specified with \-\-fastq_minlen (after
-truncation) will be discarded (1 by default). Sequences with too many
-ambiguous bases (N's), as specified with the \-\-fastq_maxns will also
-be discarded (no limit by default). Staggered reads will not be merged
-unless the \-\-fastq_allowmergestagger option is specified. The
-minimum length of the overlap region between the reads may be specfied
-with the \-\-minovlen option (default 1), and the overlap region may
-not include more mismatches than specified with the \-\-maxdiffs
-option (no limit by default), otherwise the read pair will be
-discarded. The mimimum and maximum length of the merged sequence may
-be specified with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen
-options, respectively. Other relevant options are: \-\-fastq_ascii,
+Merge paired-end sequence reads into one sequence. The method has some
+similarities to the PEAR algorithm (Zhang, Kobert, Flouri &
+Stamatakis, 2014). The forward reads are specified as the argument to
+this option and the reverse reads are specified with the \-\-reverse
+option. The merged sequences are output to the file(s) specified with
+the \-\-fastaout or \-\-fastqout options. The non-merged reads can be
+output to the files specified with the \-\-fastaout_notmerged_fwd,
+\-\-fastaout_notmerged_rev, \-\-fastqout_notmerged_fwd and
+\-\-fastqout_notmerged_rev options. Statistics may be output to the
+file specified with the \-\-eetabbedout option. Sequences will be
+truncated as specified with the \-\-fastq_truncqual option to remove
+low-quality bases in the 3' end. Sequences shorter than specified with
+\-\-fastq_minlen (after truncation) will be discarded (1 by
+default). Sequences with too many ambiguous bases (N's), as specified
+with the \-\-fastq_maxns will also be discarded (no limit by
+default). Staggered reads will not be merged unless the
+\-\-fastq_allowmergestagger option is specified. The minimum length of
+the overlap region between the reads may be specified with the
+\-\-fastq_minovlen option (default 10), and the overlap region may not
+include more mismatches than specified with the \-\-fastq_maxdiffs option
+(5 by default), otherwise the read pair will be discarded. The
+minimum and maximum length of the merged sequence may be specified
+with the \-\-fastq_minmergelen and \-\-fastq_maxmergelen options,
+respectively. Other relevant options are: \-\-fastq_ascii,
 \-\-fastq_maxee, \-\-fastq_nostagger, \-\-fastq_qmax, \-\-fastq_qmin,
 and \-\-label_suffix.
 .TP
@@ -865,12 +901,12 @@ Specifity the minimum length of the merged sequence with the
 .TP
 .BI \-\-fastq_minovlen\~ "positive integer"
 Specify the minimum overlap between the merged reads with the
-\-\-fastq_mergepairs command. The default is 1.
+\-\-fastq_mergepairs command. The default is 10.
 .TP
 .B \-\-fastq_nostagger
 Disallow the \-\-fastq_mergepairs command to merge staggered read
-pairs. This is the default. See the \-\-fastq_allowmergestagger option
-for details.
+pairs. This is the default, so this option is ignored. See the
+\-\-fastq_allowmergestagger option for details.
 .TP
 .BI \-\-fastq_qmax\~ "positive integer"
 Specify the maximum quality score accepted when reading FASTQ
@@ -991,7 +1027,7 @@ The argument to the \-\-qmask and \-\-dbmask option may be none, soft
 or dust. If the argument is none, then no masking is performed. If the
 argument is soft the lower case symbols will be masked. Finally, if
 the argument is dust, the sequence will be masked using the DUST
-algorith by Tatusov and Lipman to mask low-complexity regions.
+algorithm by Tatusov and Lipman to mask low-complexity regions.
 
 If the \-\-hardmask option is specified, all masked regions will be
 converted to N's, otherwise masked regions will be indicated by lower
@@ -2388,10 +2424,13 @@ improved performance.
 .BR v1.9.10\~ "released January 25th, 2016"
 Fixed bug related to masking and lower case database sequences.
 .TP
-.BR v1.10.1\~ "released January 26, 2016"
-Improved merging of paired-end reads and adjusted defaults slightly.
-Removed progress indicator when stderr is not a terminal.  Added
-\-\-fasta_score option to report chimera scores in FASTA files.
+.BR v1.10.0\~ "released February 11th, 2016"
+Parallelized and improved merging of paired-end reads and adjusted
+some defaults. Removed progress indicator when stderr is not a
+terminal. Added \-\-fasta_score option to report chimera scores in
+FASTA files. Added rereplicate and fastq_eestats commands. Fixed
+typos. Added relabelling to files produced with \-\-consout and
+\-\-profile options.
 .RE
 .LP
 .\" ============================================================================