Skip to content

Commit 511f884

Browse files
author
travis-ci
committed
updated documentation. Added more FermiAssembler options
1 parent b2d57d9 commit 511f884

13 files changed

+158
-96
lines changed

Doxyfile

+14-21
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,7 @@ WARN_LOGFILE =
463463
# directories like "/usr/src/myproject". Separate the files or directories
464464
# with spaces.
465465

466-
INPUT =
466+
INPUT = /xchip/gistic/Jeremiah/GIT/SeqLib
467467

468468
# If the value of the INPUT tag contains directories, you can use the
469469
# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
@@ -484,26 +484,19 @@ RECURSIVE = YES
484484
# excluded from the INPUT source files. This way you can easily exclude a
485485
# subdirectory from a directory tree whose root is specified with the INPUT tag.
486486

487-
EXCLUDE = bwa \
488-
figs \
489-
htslib \
490-
multifast-v1.4.2 \
491-
blat \
492-
src/non_api \
493-
fermi-lite \
494-
src/BLATWrapper.cpp \
495-
json \
496-
src/old \
497-
src/ssw.c \
498-
src/ssw_cpp.cpp \
499-
SeqLib/ssw.h \
500-
SeqLib/ssw_cpp.h \
501-
src/gzstream.C \
502-
SGA \
503-
seq_test \
504-
.git \
505-
.travis.scripts \
506-
README.md
487+
EXCLUDE = /xchip/gistic/Jeremiah/GIT/SeqLib/bwa \
488+
/xchip/gistic/Jeremiah/GIT/SeqLib/figs \
489+
/xchip/gistic/Jeremiah/GIT/SeqLib/htslib \
490+
/xchip/gistic/Jeremiah/GIT/SeqLib/blat \
491+
/xchip/gistic/Jeremiah/GIT/SeqLib/src/non_api \
492+
/xchip/gistic/Jeremiah/GIT/SeqLib/fermi-lite \
493+
/xchip/gistic/Jeremiah/GIT/SeqLib/json \
494+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/ssw.h \
495+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/ssw_cpp.h \
496+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/seq_test \
497+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/.git \
498+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/.travis.scripts \
499+
/xchip/gistic/Jeremiah/GIT/SeqLib/SeqLib/README.md
507500

508501
# The EXCLUDE_SYMLINKS tag can be used select whether or not files or
509502
# directories that are symbolic links (a Unix filesystem feature) are excluded

SeqLib/BFC.h

+2
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,10 @@ namespace SeqLib {
4848
*/
4949
bool AllocateMemory(size_t n);
5050

51+
/** Perform BFC error correction on the sequences stored in this object */
5152
bool ErrorCorrect();
5253

54+
/** Train the error corrector using the reads stored in this object */
5355
bool Train();
5456

5557
/** Add a sequence for either training or correction */

SeqLib/BWAWrapper.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ class BWAWrapper {
105105
void SetGapOpen(int gap_open);
106106

107107
/** Set the gap extension penalty
108-
* @param gap_open Gap extension penalty. Default 1
108+
* @param gap_ext Gap extension penalty. Default 1
109109
* @exception Throws invalid_argument if gap_ext < 0
110110
*/
111111
void SetGapExtension(int gap_ext);

SeqLib/BamHeader.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
namespace SeqLib {
1414

1515
/** Store a reference chromosome and its length
16-
* @note This parallels the data found in @SQ tag of BAM header
16+
* @note This parallels the data found in SQ tag of BAM header
1717
*/
1818
struct HeaderSequence {
1919

2020
/** Make a new header sequence
21-
* @param Name of the chromosome
22-
* @param Length of the chromosome
21+
* @param n Name of the chromosome
22+
* @param l Length of the chromosome
2323
*/
2424
HeaderSequence(const std::string& n, uint32_t l) : Name(n), Length(l) {}
2525

@@ -50,8 +50,8 @@ namespace SeqLib {
5050
BamHeader(const HeaderSequenceVector& hsv);
5151

5252
/** Initialize a BamHeader from a string containing
53-
* a BAM header in human-readable form (e.g. @PG ... )
54-
* @param Text of a BAM header, with newlines separating lines
53+
* a BAM header in human-readable form (e.g. PG ... )
54+
* @param hdr Text of a BAM header, with newlines separating lines
5555
*/
5656
BamHeader(const std::string& hdr);
5757

SeqLib/BamReader.h

+5-4
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,9 @@ class BamReader {
122122

123123
/** Explicitly set a reference genome to be used to decode CRAM file.
124124
* If no reference is specified, will automatically load from
125-
* file pointed to in CRAM header using the @SQ tags.
125+
* file pointed to in CRAM header using the SQ tags.
126126
* @note This function is useful if the reference path pointed
127-
* to by the UR field of @SQ is not on your system, and you would
127+
* to by the UR field of SQ is not on your system, and you would
128128
* like to explicitly provide one.
129129
* @param ref Path to an index reference genome
130130
*/
@@ -262,8 +262,9 @@ class BamReader {
262262

263263
protected:
264264

265-
// regions to walk
266-
GRC m_region;
265+
GRC m_region; ///< Regions to access
266+
267+
private:
267268

268269
// store the file pointers etc to BAM files
269270
_BamMap m_bams;

SeqLib/BamRecord.h

+38-19
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ class BamRecord {
222222
void init();
223223

224224
/** Check if a read is empty (not initialized)
225-
* @value true if read was not initialized with any values
225+
* @return true if read was not initialized with any values
226226
*/
227227
bool isEmpty() const { return !b; }
228228

@@ -427,13 +427,24 @@ class BamRecord {
427427
/** Set the query name */
428428
void SetQname(const std::string& n);
429429

430-
//Set the quality scores
430+
/** Set the quality scores
431+
* @param n String of quality scores or empty string
432+
* @param offset Offset parameter for encoding (eg 33)
433+
* @exception Throws an invalid_argument if n is non-empty
434+
* and different length than sequence
435+
*/
431436
void SetQualities(const std::string& n, int offset);
432437

433-
/** Set the sequence name */
438+
/** Set the sequence
439+
* @param seq Sequence in upper-case (ACTGN) letters.
440+
*/
434441
void SetSequence(const std::string& seq);
435442

436-
/** Set the cigar field explicitly */
443+
/** Set the cigar field explicitly
444+
* @param c Cigar operation to set
445+
* @note Will not check if the cigar ops are consistent with
446+
* the length of the sequence.
447+
*/
437448
void SetCigar(const Cigar& c);
438449

439450
/** Print a SAM-lite record for this alignment */
@@ -515,10 +526,7 @@ class BamRecord {
515526
uint32_t* c = bam_get_cigar(b);
516527
Cigar cig;
517528
for (int k = b->core.n_cigar - 1; k >= 0; --k)
518-
//cig.add(CigarField(c[k]));
519529
cig.add(CigarField(c[k]));
520-
//cig.push_back(CigarField(c[k]));
521-
//cig.add(CigarField("MIDSSHP=XB"[c[k]&BAM_CIGAR_MASK], bam_cigar_oplen(c[k])));
522530
return cig;
523531
}
524532

@@ -527,14 +535,19 @@ class BamRecord {
527535
*/
528536
void ClearSeqQualAndTags();
529537

530-
/** Get the sequence of this read as a string */
531-
/*inline */std::string Sequence() const;
538+
/** Retrieve the sequence of this read as a string (ACTGN) */
539+
std::string Sequence() const;
532540

533-
/** Return the mean phred score
541+
/** Return the mean quality score
534542
*/
535543
double MeanPhred() const;
536544

537-
/** Do a smith waterman alignment
545+
/** Perform a Smith-Waterman alignment between two strings
546+
* @param name Name of the query sequence to align
547+
* @param seq Sequence (ACTGN) of the query string
548+
* @param ref Sequence (ACTGN) of the reference string
549+
* @param gr Location of the reference string. The alignment record after Smith-Waterman alignment
550+
* will be relative to this location.
538551
*/
539552
BamRecord(const std::string& name, const std::string& seq, const std::string& ref, const GenomicRegion * gr);
540553

@@ -651,20 +664,26 @@ class BamRecord {
651664
std::string GetZTag(const std::string& tag) const;
652665

653666
/** Get a vector of type int from a Z tag delimited by "^"
667+
* Smart-tags allow one to store vectors of strings, ints or doubles in the alignment tags, and
668+
* do not require an additional data structure on top of bseq1_t.
654669
* @param tag Name of the tag eg "AL"
655670
* @return A vector of ints, retrieved from the x delimited Z tag
656671
* @exception Throws an invalid_argument if cannot convert delimited field val to int
657672
*/
658673
std::vector<int> GetSmartIntTag(const std::string& tag) const;
659674

660675
/** Get a vector of type double from a Z tag delimited by "x"
676+
* Smart-tags allow one to store vectors of string, ints or doubles in the alignment tags, and
677+
* do not require an additional data structure on top of bseq1_t.
661678
* @param tag Name of the tag eg "AL"
662679
* @return A vector of double elems, retrieved from the "^" delimited Z tag
663680
* @exception Throws an invalid_argument if cannot convert delimited field val to double
664681
*/
665682
std::vector<double> GetSmartDoubleTag(const std::string& tag) const;
666683

667684
/** Get a vector of strings from a Z tag delimited by "^"
685+
* Smart-tags allow one to store vectors of strings, ints or doubles in the alignment tags, and
686+
* do not require an additional data structure on top of bseq1_t.
668687
* @param tag Name of the tag eg "CN"
669688
* @return A vector of strings, retrieved from the x delimited Z tag
670689
*/
@@ -681,7 +700,6 @@ class BamRecord {
681700
return bam_aux2i(p);
682701
}
683702

684-
685703
/** Add a string (Z) tag
686704
* @param tag Name of the tag. eg "XP"
687705
* @param val Value for the tag
@@ -722,6 +740,9 @@ class BamRecord {
722740

723741
/** Return a human readable chromosome name assuming chr is indexed
724742
* from 0 (eg id 0 return "1")
743+
* @note This is a quick convenience function, and is not robust to non-numbered
744+
* chromosomes (eg chrX becomes 23). For accurate string representation of
745+
* any chromosomes, use the full ChrName with BamHeader input.
725746
*/
726747
inline std::string ChrName() const {
727748
std::stringstream ss;
@@ -732,9 +753,7 @@ class BamRecord {
732753
}
733754

734755
/** Retrieve the human readable chromosome name.
735-
*
736-
* Note that this requires that the header not be empty. If
737-
* it is empty, assumes this ia chr1 based reference
756+
* @param h Dictionary for chr name lookup. If it is empty, assumes this is chr1 based reference.
738757
* @exception Throws an out_of_range exception if chr id is not in dictionary
739758
* @return Empty string if chr id < 0, otherwise chromosome name from dictionary.
740759
*/
@@ -812,11 +831,11 @@ class BamRecord {
812831

813832
};
814833

815-
typedef std::vector<BamRecord> BamRecordVector;
834+
typedef std::vector<BamRecord> BamRecordVector; ///< Store a vector of alignment records
816835

817-
typedef std::vector<BamRecordVector> BamRecordClusterVector;
836+
typedef std::vector<BamRecordVector> BamRecordClusterVector; ///< Store a vector of alignment vectors
818837

819-
/** @brief Sort methods for reads
838+
/** @brief Sort methods for alignment records
820839
*/
821840
namespace BamRecordSort {
822841

@@ -829,7 +848,7 @@ class BamRecord {
829848
}
830849
};
831850

832-
/** @brief Sort by read-mate position
851+
/** @brief Sort by mate position
833852
*/
834853
struct ByMatePosition
835854
{

SeqLib/BamWalker.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#include <stdint.h>
77
#include "SeqLib/BamRecord.h"
88

9-
// dont understand this
9+
// not sure what is going on here...
1010
#ifndef INT32_MAX
1111
#define INT32_MAX 0x7fffffffL
1212
#endif
@@ -58,13 +58,13 @@ inline char *samfaipath(const char *fn_ref)
5858

5959
namespace SeqLib {
6060

61-
/** Small class to store a counter to measure BamWalker progress.
62-
*
61+
/** Small class to store a counter to measure BamReader progress.
6362
* Currently only stores number of reads seen / kept.
6463
*/
6564
struct ReadCount {
6665

67-
uint32_t keep, total;
66+
uint32_t keep; ///< Store total number of reads kept
67+
uint32_t total; ///< Store total number of reads seen
6868

6969
ReadCount() : keep(0), total(0) {}
7070

SeqLib/BamWriter.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ class BamWriter {
7676

7777
/** Explicitly set a reference genome to be used to decode CRAM file.
7878
* If no reference is specified, will automatically load from
79-
* file pointed to in CRAM header using the @SQ tags.
79+
* file pointed to in CRAM header using the SQ tags.
8080
* @note This function is useful if the reference path pointed
81-
* to by the UR field of @SQ is not on your system, and you would
81+
* to by the UR field of SQ is not on your system, and you would
8282
* like to explicitly provide one.
8383
* @param ref Path to an index reference genome
8484
* @return Returns true if reference loaded.
@@ -88,7 +88,7 @@ class BamWriter {
8888
/** Return the BAM header */
8989
BamHeader Header() const { return hdr; };
9090

91-
protected:
91+
private:
9292

9393
// path to output file
9494
std::string m_out;

SeqLib/FermiAssembler.h

+25-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace SeqLib {
2929
~FermiAssembler();
3030

3131
/** Provide a set of reads to be assembled
32-
* @param Reads with or without quality scores
32+
* @param brv Reads with or without quality scores
3333
* @note This will copy the reads and quality scores
3434
* into this object. Deallocation is automatic with object
3535
* destruction, or with ClearReads.
@@ -66,7 +66,8 @@ namespace SeqLib {
6666
*/
6767
void PerformAssembly();
6868

69-
/** Return the assembled contigs.
69+
/** Return the assembled contigs
70+
* @return Assembled contigs in upper case strings (ACTGN)
7071
*/
7172
std::vector<std::string> GetContigs() const;
7273

@@ -76,6 +77,28 @@ namespace SeqLib {
7677
/** Set the minimum overlap between reads during string graph construction */
7778
void SetMinOverlap(uint32_t m) { opt.min_asm_ovlp = m; }
7879

80+
/** Aggressively trim graph to discard heterozygotes.
81+
* Suggested by lh3 for bacterial assembly
82+
* @note See: https://github.com/lh3/fermi-lite/blob/master/example.c
83+
*/
84+
void SetAggressiveTrim() { opt.mag_opt.flag |= MAG_F_AGGRESSIVE; }
85+
86+
/** From lh3: Drop an overlap if its length is below max_overlap * ratio
87+
* @param ratio Overlaps below ratio * max_overlap will be removed
88+
*/
89+
void SetDropOverlapRatio(double ratio) { opt.mag_opt.min_dratio1 = ratio; }
90+
91+
/** From lh3: Min k-mer & read count thresholds for ec and graph cleaning
92+
*/
93+
void SetKmerMinThreshold(int min) { opt.min_cnt = min; }
94+
95+
/** From lh3: Max k-mer & read count thresholds for ec and graph cleaning
96+
*/
97+
void SetKmerMaxThreshold(int max) { opt.max_cnt = max; }
98+
99+
// From lh3: retain a bubble if one side is longer than the other side by >INT-bp
100+
//void SetBubbleDifference(int bdiff) { opt.mag_opt.max_bdiff; }
101+
79102
/** Return the minimum overlap parameter for this assembler */
80103
uint32_t GetMinOverlap() const { return opt.min_asm_ovlp; }
81104

SeqLib/GenomicRegion.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class GenomicRegion {
3030
* @param t_chr Chromosome id (chr1 = 0, etc)
3131
* @param t_pos1 Start position
3232
* @param t_pos2 End position. Must be >= start position.
33-
* @param strand. +, -, or * (default is *)
33+
* @param t_strand +, -, or * (default is *)
3434
* @exception throws an invalid_argument exception if pos2 < pos1
3535
* @exception throws an invalid_argument exception if char not one of +, - , *
3636
*/
@@ -54,7 +54,7 @@ class GenomicRegion {
5454
* Note that this requires that a BamHeader be provided as well
5555
* to convert the text representation of the chr to the id number.
5656
* @param reg Samtools-style string (e.g. "1:1,000,000-2,000,000") or single chr
57-
* @param h Pointer to BAM header that will be used to convert chr string to ref id
57+
* @param hdr Pointer to BAM header that will be used to convert chr string to ref id
5858
* @exception throws an invalid_argument exception if cannot parse correctly
5959
*/
6060
GenomicRegion(const std::string& reg, const BamHeader& hdr);

SeqLib/GenomicRegionCollection.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -729,10 +729,11 @@ GenomicRegionCollection<T>::GenomicRegionCollection(const T& gr)
729729
}
730730

731731
template<class T>
732-
GRC GenomicRegionCollection<T>::Intersection(GRC& subject, bool ignore_strand) const
732+
template<class K>
733+
GRC GenomicRegionCollection<T>::Intersection(GenomicRegionCollection<K>& subject, bool ignore_strand) const
733734
{
734735
std::vector<int32_t> sub, que;
735-
GRC out = this->FindOverlaps(subject, que, sub, ignore_strand);
736+
GRC out = this->FindOverlaps<K>(subject, que, sub, ignore_strand);
736737
return out;
737738
}
738739

0 commit comments

Comments
 (0)