Skip to content

Commit 6d57739

Browse files
author
walaj
committed
updated benchmarking and README to improve comparison with SeqAn
1 parent d1a6484 commit 6d57739

File tree

3 files changed

+77
-11
lines changed

3 files changed

+77
-11
lines changed

README.md

+7-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Citation
1313
--------
1414
If you use SeqLib in your applications, please cite: http://bioinformatics.oxfordjournals.org/content/early/2016/12/21/bioinformatics.btw741.full.pdf+html
1515

16+
Note that the values for the SeqAn benchmarking in Table 2 should be corrected to 7.7 Gb memory and 33.92 seconds in CPU time, when compiling SeqAn with ``-O3 -DNDEBUG``. SeqAn also does full string decompression.
17+
Wall times for SeqAn may be shorter than CPU time because it uses embedded multi-threading during BAM IO.
18+
1619
Table of contents
1720
=================
1821

@@ -107,11 +110,12 @@ bioinformatics problems.
107110
Some differences:
108111
* SeqLib has ~2-4x faster read/write speed over BamTools and SeqAn, and lower memory footprint.
109112
* SeqLib has support for CRAM files
110-
* SeqLib provides in memory access to BWA-MEM, BLAT, a chromosome aware interval tree and range operations, and to read correction and sequence assembly with Fermi. BamTools has more support currently for network access.
111-
* SeqAn provide a substantial amount of additional capabilites not in SeqLib, including graph operations and a more expanded suite of multi-sequence alignments.
113+
* SeqLib provides in-memory access to BWA-MEM, BLAT, a chromosome-aware interval tree, read correction, and sequence assembly with Fermi.
114+
* SeqAn provides a substantial amount of additional capabilities not in SeqLib, including graph operations and an expanded suite of multi-sequence alignments.
115+
* SeqAn embeds multi-threading into some functionality like BAM IO to improve wall times.
112116

113117
For your particular application, our hope is that SeqLib will provide a comprehensive and powerful environment to develop
114-
bioinformatics tools. Feature requests and comments are welcomed.
118+
bioinformatics tools, or to be used in conjunction with the capabilities in SeqAn and BamTools. Feature requests and comments are welcomed.
115119

116120
Command Line Usage
117121
------------------

benchmark/Makefile

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
##INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.0.2/include -I /xchip/gistic/Jeremiah/GIT/SeqLib/src -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include
2-
INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.2.0/include -I /xchip/gistic/Jeremiah/GIT/SeqLib/src -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include -I/xchip/gistic/Jeremiah/software/bzip2-1.0.6
2+
INCLUDES=-I/xchip/gistic/Jeremiah/software/seqan-library-2.2.0/include -I /xchip/gistic/Jeremiah/GIT/SeqLib -I/xchip/gistic/Jeremiah/GIT/SeqLib/htslib -I/xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1 -I/xchip/gistic/Jeremiah/software/bamtools-2.4.0/include -I/xchip/gistic/Jeremiah/software/bzip2-1.0.6
33
LIBS=/xchip/gistic/Jeremiah/software/bamtools-2.4.0/lib/libbamtools.a /xchip/gistic/Jeremiah/GIT/SeqLib/src/libseqlib.a /xchip/gistic/Jeremiah/GIT/SeqLib/htslib/libhts.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_timer.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_chrono.a /xchip/gistic/Jeremiah/software/boost_1.61.0_gcc5.1/stage/lib/libboost_system.a /xchip/gistic/Jeremiah/software/bzip2-1.0.6/libbz2.a
4-
CFLAGS=-W -Wall -pedantic -std=c++14 -DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1
4+
CXXFLAGS=-W -Wall -pedantic -std=c++14 -DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1 -O3 -DNDEBUG -DSEQAN_ENABLE_DEBUG=0 -DSEQAN_ENABLE_TESTING=0
55

66
binaries=benchmark
77

88
all: benchmark.o
9-
g++ benchmark.o -g -o benchmark $(LIBS) -lrt -lpthread -lz -lm
9+
g++ benchmark.o -o benchmark $(LIBS) -lrt -lpthread -lz -lm
1010

1111
benchmark.o: benchmark.cpp
12-
g++ -g -c benchmark.cpp $(INCLUDES) $(CFLAGS)
12+
g++ -c benchmark.cpp $(INCLUDES) $(CXXFLAGS)
1313

1414
.PHONY: clean
1515

benchmark/benchmark.cpp

+66-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#define USE_BOOST
22

3-
#define JUMPING_TEST 1
4-
//#define READ_TEST 1
3+
//#define JUMPING_TEST 1
4+
#define READ_TEST 1
55

66
#include "SeqLib/SeqLibUtils.h"
77

@@ -14,6 +14,7 @@
1414
//#define RUN_SEQAN 1
1515
//#define RUN_BAMTOOLS 1
1616
#define RUN_SEQLIB 1
17+
//#define RUN_HTSLIB 1
1718

1819
#ifdef RUN_SEQAN
1920
#include <seqan/bam_io.h>
@@ -26,7 +27,18 @@ using namespace seqan;
2627
#include "SeqLib/BamWriter.h"
2728
#endif
2829

29-
#define BAMTOOLS_GET_CORE 1
30+
#ifdef RUN_HTSLIB
31+
#include <iostream>
32+
extern "C" {
33+
#include "htslib/htslib/hts.h"
34+
#include "htslib/htslib/sam.h"
35+
#include "htslib/htslib/bgzf.h"
36+
#include "htslib/htslib/kstring.h"
37+
#include "htslib/htslib/faidx.h"
38+
}
39+
#endif
40+
41+
//#define BAMTOOLS_GET_CORE 1
3042

3143
#ifdef RUN_BAMTOOLS
3244
#include "api/BamReader.h"
@@ -37,7 +49,9 @@ int main()
3749

3850
const size_t limit = 5000000;
3951
const size_t print_limit = 1000000;
52+
#ifdef JUMPING_TEST
4053
const size_t jump_limit = 1000;
54+
#endif
4155
size_t count = 0;
4256

4357
//std::string bam = "/xchip/gistic/Jeremiah/GIT/SeqLib/seq_test/test_data/small.bam";
@@ -99,13 +113,18 @@ int main()
99113

100114
SeqLib::BamRecord rec;
101115
SeqLib::BamRecordVector bav;
116+
std::string dummy;
117+
std::stringstream ss;
102118
#ifdef READ_TEST
103119
std::vector<std::string> sq;
104120
while(r.GetNextRecord(rec) && count++ < limit) {
105121
if (count % print_limit == 0)
106122
std::cerr << "...at read " << SeqLib::AddCommas(count) << std::endl;
107123
bav.push_back(rec);
124+
//dummy = rec.Sequence();
108125
//sq.push_back(rec.Sequence());
126+
//sq.push_back(rec.Qname());
127+
//sq.push_back(rec.CigarString());
109128
}
110129
#endif
111130

@@ -162,6 +181,7 @@ int main()
162181
}
163182

164183
bool hasAlignments = false;
184+
long l = 0;
165185
for (int i = 0; i < jump_limit; ++i) {
166186
int chr = rand() % 22;
167187
int pos = rand() % 1000000 + 1000000;
@@ -171,7 +191,8 @@ int main()
171191
}
172192
if (hasAlignments) {
173193
readRecord(record, bamFileIn);
174-
bav.push_back(record);
194+
l += getAlignmentLengthInRef(record);
195+
//bav.push_back(record);
175196
} else {
176197
std::cerr << "no alignments here " << std::endl;
177198
}
@@ -200,6 +221,47 @@ int main()
200221
}
201222
#endif
202223

224+
#endif
225+
226+
#ifdef RUN_HTSLIB
227+
228+
std::cerr << " **** RUNNING HTSLIB **** " << std::endl;
229+
bam1_t* b = bam_init1();
230+
htsFile* in = hts_open(bam.c_str(), "r");
231+
bam_hdr_t* header;
232+
if (in == NULL)
233+
return -1;
234+
if (b == NULL)
235+
return -1;
236+
header = sam_hdr_read(in);
237+
int i = 0;
238+
std::vector<bam1_t*> bav;
239+
while (sam_read1(in, header, b) >= 0 && count++ < limit) {
240+
if (count % print_limit == 0)
241+
std::cerr << "...at read " << SeqLib::AddCommas(count) << std::endl;
242+
bav.push_back(b);
243+
/*
244+
int i;
245+
const bam1_core_t* c = &b->core;
246+
uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
247+
fwrite(bam1_qname(b), c->l_qname - 1, sizeof(char), stdout);
248+
fputc('\t', stdout);
249+
for (i = 0; i < c->l_qseq; ++i)
250+
fputc(bam_nt16_rev_table[bam1_seqi(s, i)], stdout);
251+
fputc('\t', stdout);
252+
if (t[0] == 0xff) {
253+
fputs("*", stdout);
254+
}
255+
else {
256+
for (i = 0; i < c->l_qseq; ++i)
257+
fputc(t[i] + 33, stdout);
258+
}
259+
fputc('\n', stdout);
260+
*/
261+
}
262+
printf("%d", i);
263+
bam_hdr_destroy(header);
264+
hts_close(in);
203265
#endif
204266

205267
std::cerr << " Copied " << bav.size() << " records " << std::endl;

0 commit comments

Comments
 (0)