Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizations #20

Merged
merged 22 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f35e399
Added support for k-mers up to 128.
OndrejSladky Mar 21, 2024
23638a8
Fixed unittests on linux.
OndrejSladky Mar 21, 2024
4c095bb
Minor cleanups.
OndrejSladky Mar 21, 2024
fd05155
Removed duplicit definition of __LITTLE_ENDIAN__
OndrejSladky Mar 21, 2024
a6d8618
Verifying for k up to 128.
OndrejSladky Mar 21, 2024
0df8dd8
1s compilation speedup
OndrejSladky Mar 21, 2024
c301493
Another 1s compilation speedup.
OndrejSladky Mar 21, 2024
81c710b
Unittests for large reverse complements.
OndrejSladky Mar 21, 2024
afa9915
Fixed error in reverse complements for extra large k-mers.
OndrejSladky Apr 25, 2024
fbdd75a
Printing forward extension directly to output.
OndrejSladky Apr 25, 2024
1fd6051
Using stack to store simplitig instead of linked list.
OndrejSladky Apr 25, 2024
939e7a6
Khash modified to support large k-mers sets; wang hash function used.
OndrejSladky Apr 26, 2024
13b41e9
Merge branch 'main' into extra_large_kmers
OndrejSladky Apr 26, 2024
26c030e
README: sizes up to 128.
OndrejSladky Apr 26, 2024
7718e3f
Merge branch 'wang-hash' into optimizations
OndrejSladky Apr 26, 2024
c3ab59b
Running reverse complements & inlining.
OndrejSladky Apr 26, 2024
b5afc1e
Reusing the complementary k-mers.
OndrejSladky Apr 26, 2024
5789fec
Fixed error and reusing canonical k-mers.
OndrejSladky Apr 26, 2024
3671f20
Running reverse complements in parser.
OndrejSladky Apr 26, 2024
9619497
Optimization of intersection.
OndrejSladky Apr 26, 2024
c901662
Usage in README.
OndrejSladky Apr 26, 2024
d7b48f2
03 optimization performs the best.
OndrejSladky Apr 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
.PHONY: all clean test cpptest verify quick-verify

CXX= g++
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O3
LDFLAGS= -lz -lpthread
SRC= src
UINT256= $(SRC)/uint256_t
SCRIPTS= scripts
DATA= data
TESTS= tests
Expand All @@ -24,13 +25,13 @@ verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
quick-verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
python $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa --quick --interpath $(DATA)/spyogenes.fa

$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h $(wildcard $(UINT256)/*.cpp $(UINT256)/*.h $(UINT256)/*.include)
./create-version.sh
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)


prophasmtest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)

gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ProphAsm2 is a versatile tool for computing simplitigs/SPSS
from *k-mer sets* and for *k-mer set operations*.
The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm)
include a largely speed and memory optimization, parallelization,
support for k-mer sizes up to 64 and support for minimum abundances.
support for k-mer sizes up to 128 and support for minimum abundances.

Various types of sequencing datasets can be used as the input for
ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads.
Expand Down Expand Up @@ -74,8 +74,19 @@ Set operations:
USAGE-BEGIN
-->
```


Usage: prophasm2 [options]
Command-line parameters:
-k INT K-mer size.
-i FILE Input FASTA file (can be used multiple times).
-o FILE Output FASTA file (if used, must be used as many times as -i).
-x FILE Compute intersection, subtract it, save it.
-s FILE Output file with k-mer statistics.
-t INT Number of threads (default 1).
-m INT Minimum abundance of k-mers to appear in the assembly (default 1).
-S Silent mode.
-u Do not consider k-mer and its reverse complement as equivalent.

Note that '-' can be used for standard input/output.
```
<!---
USAGE-END
Expand Down
4 changes: 2 additions & 2 deletions scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ def main():
print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath)
for complements in [True]:
for m in range(1, 3):
for k in range(2, 65, 23 if args.quick else 1):
for k in range(2, 129, 23 if args.quick else 1):
success &= verify_intersection(args.path, args.interpath, k, complements, m)
print("")

print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path)
for complements in [True, False]:
for m in range(1, 4):
for k in range(2, 65, 11 if args.quick else 1):
for k in range(2, 129, 11 if args.quick else 1):
success &= verify_instance(args.path, k, complements, m)
print("")

Expand Down
11 changes: 8 additions & 3 deletions src/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ typedef unsigned long long khint64_t;
#endif
#endif /* klib_unused */

typedef khint32_t khint_t;
typedef khint64_t khint_t;
typedef khint_t khiter_t;

#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
Expand All @@ -176,6 +176,10 @@ typedef khint_t khiter_t;
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

#ifndef kroundup64
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
#endif

#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
Expand Down Expand Up @@ -246,7 +250,7 @@ static const double __ac_HASH_UPPER = 0.77;
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
kroundup32(new_n_buckets); \
kroundup64(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
Expand Down Expand Up @@ -382,7 +386,8 @@ static const double __ac_HASH_UPPER = 0.77;
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
//#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_func(key) (khint64_t)(__ac_Wang_hash(key))
/*! @function
@abstract 64-bit integer comparison function
*/
Expand Down
50 changes: 33 additions & 17 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
#include "kmers.h"
#include "khash.h"

typedef unsigned char byte;

#define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
#define kh_int128_hash_equal(a, b) ((a) == (b))
#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
#define kh_int256_hash_equal(a, b) ((a) == (b))

#define KHASH_MAP_INIT_INT128(name, khval_t) \
KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT128(name) \
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)

typedef unsigned char byte;
#define KHASH_MAP_INIT_INT256(name, khval_t) \
KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int128_hash_equal)

KHASH_MAP_INIT_INT256(S256M, byte)
KHASH_MAP_INIT_INT128(S128M, byte)
KHASH_MAP_INIT_INT64(S64M, byte)
KHASH_SET_INIT_INT128(S128S)
KHASH_SET_INIT_INT64(S64S)

byte MINIMUM_ABUNDANCE = 1;
Expand All @@ -40,26 +41,35 @@ struct DifferenceInPlaceData##variant {
}; \
\
/* Determine whether the canonical k-mer is present.*/ \
bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
inline bool containsCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) { \
bool contains_key = kh_get_S##variant(kMers, kMer) != kh_end(kMers); \
if (MINIMUM_ABUNDANCE == 1) return contains_key; \
if (!contains_key) return false; \
return kh_val(kMers, kh_get_S##variant(kMers, kMer)) >= MINIMUM_ABUNDANCE; \
} \
\
/* Remove the canonical k-mer from the set.*/ \
void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
/* Determine whether the canonical form of a k-mer is present.*/ \
inline bool containsKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
return containsCanonicalKMer(kMers, kMer); \
} \
\
/* Remove the canonical k-mer from the set.*/ \
inline void eraseCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer) { \
auto key = kh_get_S##variant(kMers, kMer); \
if (key != kh_end(kMers)) { \
kh_del_S##variant(kMers, key); \
} \
} \
\
/* Insert the canonical k-mer into the set. */ \
void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force = false) { \
/* Remove the canonical form of a k-mer from the set.*/ \
inline void eraseKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
eraseCanonicalKMer(kMers, kMer); \
} \
\
/* Insert the canonical k-mer into the set. */ \
inline void insertCanonicalKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, bool force=false) { \
int ret; \
if (MINIMUM_ABUNDANCE == (byte)1) { \
kh_put_S##variant(kMers, kMer, &ret); \
Expand All @@ -76,6 +86,12 @@ void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool compl
} \
} \
\
/* Insert the canonical k-mer into the set. */ \
inline void insertKMer(kh_S##variant##_t *kMers, kmer##type##_t kMer, int k, bool complements, bool force=false) { \
if (complements) kMer = CanonicalKMer(kMer, k); \
insertCanonicalKMer(kMers, kMer, force); \
} \
\
/* Parallel wrapper for differenceInPlace. */ \
void DifferenceInPlaceThread##variant(void *arg, long i, int _) { \
auto data = (DifferenceInPlaceData##variant*)arg; \
Expand All @@ -84,12 +100,12 @@ void DifferenceInPlaceThread##variant(void *arg, long i, int _) {

INIT_KHASH_UTILS(64, 64S)
INIT_KHASH_UTILS(64, 64M)
INIT_KHASH_UTILS(128, 128S)
INIT_KHASH_UTILS(128, 128M)
INIT_KHASH_UTILS(256, 256M)

/// Return the next k-mer in the k-mer set and update the index.
template <typename KHT, typename kmer_t>
kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
inline kmer_t nextKMer(KHT *kMers, size_t &lastIndex, kmer_t &kMer) {
for (size_t i = kh_begin(kMers) + lastIndex; i != kh_end(kMers); ++i, ++lastIndex) {
if (!kh_exist(kMers, i)) continue;
kMer = kh_key(kMers, i);
Expand All @@ -116,7 +132,7 @@ std::vector<kmer64_t> kMersToVec(kh_S64M_t *kMers) {

/// Compute the intersection of several k-mer sets.
template <typename KHT>
KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool complements) {
KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets) {
if (kMerSets.size() < 2) return result;
KHT* smallestSet = kMerSets[0];
for (size_t i = 1; i < kMerSets.size(); ++i) {
Expand All @@ -125,16 +141,16 @@ KHT *getIntersection(KHT* result, std::vector<KHT*> &kMerSets, int k, bool compl
for (auto i = kh_begin(smallestSet); i != kh_end(smallestSet); ++i) {
if (!kh_exist(smallestSet, i)) continue;
auto kMer = kh_key(smallestSet, i);
if (!containsKMer(smallestSet, kMer, k, complements)) continue;
if (!containsCanonicalKMer(smallestSet, kMer)) continue;
bool everywhere = true;
for (size_t i = 0; i < kMerSets.size(); ++i) if (kMerSets[i] != smallestSet) {
if (!containsKMer(kMerSets[i], kMer, k, complements)) {
if (!containsCanonicalKMer(kMerSets[i], kMer)) {
everywhere = false;
break;
}
}
if (everywhere) {
insertKMer(result, kMer, k, complements, true);
insertCanonicalKMer(result, kMer, true);
}
}
return result;
Expand Down
22 changes: 17 additions & 5 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
#include <iostream>
#include <cstdint>

#include "uint256_t/uint256_t.h"

typedef uint64_t kmer64_t;
typedef __uint128_t kmer128_t;
typedef uint256_t kmer256_t;

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
int NucleotideToInt (char c) {
inline int NucleotideToInt (char c) {
switch (c) {
case 'A': return 0;
case 'C': return 1;
Expand All @@ -25,13 +28,13 @@ int NucleotideToInt (char c) {

/// Compute the prefix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitPrefix(kmer_t kMer, int k, int d) {
inline kmer_t BitPrefix(kmer_t kMer, int k, int d) {
return kMer >> ((k - d) << kmer_t(1));
}

/// Compute the suffix of size d of the given k-mer.
template <typename kmer_t>
kmer_t BitSuffix(kmer_t kMer, int d) {
inline kmer_t BitSuffix(kmer_t kMer, int d) {
return kMer & ((kmer_t(1) << (d << kmer_t(1))) - kmer_t(1));
}

Expand Down Expand Up @@ -71,9 +74,17 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
w = ( w >> 64 ) | ( w << 64);
return ((U)-1) - w;
}
/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer256_t word_reverse_complement(kmer256_t w) {
kmer128_t low = word_reverse_complement(w.lower());
kmer128_t high = word_reverse_complement(w.upper());
return kmer256_t(low, high);
}

constexpr int KMER_SIZE_64 = 64;
constexpr int KMER_SIZE_128 = 128;
constexpr int KMER_SIZE_256 = 256;
#define INIT_KMERS(type) \
\
/* Get the mask to mask k-mers. */ \
Expand All @@ -89,6 +100,7 @@ inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) {

INIT_KMERS(64)
INIT_KMERS(128)
INIT_KMERS(256)

/// Return the lexicographically smaller of the k-mer and its reverse complement.
template <typename kmer_t>
Expand All @@ -102,7 +114,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
return letters[(uint32_t)(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
}

/// Convert the encoded KMer representation to string.
Expand All @@ -111,7 +123,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
// The last two bits correspond to one nucleotide.
ret[length - i -1] = letters[encoded & 3];
ret[length - i -1] = letters[(uint32_t)(encoded & 3)];
// Move to the next letter.
encoded >>= 2;
}
Expand Down
Loading
Loading