Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extra large kmers #17

Merged
merged 11 commits into from
Sep 27, 2024
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ CXX= g++
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2
LDFLAGS= -lz -lpthread
SRC= src
UINT256= $(SRC)/uint256_t
SCRIPTS= scripts
DATA= data
TESTS= tests
Expand All @@ -24,13 +25,13 @@ verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
quick-verify: $(PROG) $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa
python $(SCRIPTS)/verify.py $(DATA)/spneumoniae.fa --quick --interpath $(DATA)/spyogenes.fa

$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h
$(PROG): $(SRC)/main.cpp $(SRC)/$(wildcard *.cpp *.h *.hpp) src/version.h $(wildcard $(UINT256)/*.cpp $(UINT256)/*.h $(UINT256)/*.include)
./create-version.sh
$(CXX) $(CXXFLAGS) $(SRC)/main.cpp $(SRC)/kthread.c -o $@ $(LDFLAGS)


prophasmtest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS)

gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ ProphAsm2 is a versatile tool for computing simplitigs/SPSS
from *k-mer sets* and for *k-mer set operations*.
The new features compared to the original [ProphAsm](https://github.com/prophyle/prophasm)
include a largely speed and memory optimization, parallelization,
support for k-mer sizes up to 64 and support for minimum abundances.
support for k-mer sizes up to 128 and support for minimum abundances.

Various types of sequencing datasets can be used as the input for
ProphAsm, including genomes, pan-genomes, metagenomes or sequencing reads.
Expand Down
4 changes: 2 additions & 2 deletions scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,14 @@ def main():
print("Testing ProphAsm2 outputs valid intersection on files " + args.path + " and " + args.interpath)
for complements in [True]:
for m in range(1, 3):
for k in range(2, 65, 23 if args.quick else 1):
for k in range(2, 129, 23 if args.quick else 1):
success &= verify_intersection(args.path, args.interpath, k, complements, m)
print("")

print("Testing ProphAsm2 outputs valid simplitigs on file " + args.path)
for complements in [True, False]:
for m in range(1, 4):
for k in range(2, 65, 11 if args.quick else 1):
for k in range(2, 129, 11 if args.quick else 1):
success &= verify_instance(args.path, k, complements, m)
print("")

Expand Down
13 changes: 7 additions & 6 deletions src/khash_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
#include "kmers.h"
#include "khash.h"

typedef unsigned char byte;

#define kh_int128_hash_func(key) kh_int64_hash_func((khint64_t)((key)>>65^(key)^(key)<<21))
#define kh_int128_hash_equal(a, b) ((a) == (b))
#define kh_int256_hash_func(key) kh_int128_hash_func((__uint128_t)((key)>>129^(key)^(key)<<35))
#define kh_int256_hash_equal(a, b) ((a) == (b))

#define KHASH_MAP_INIT_INT128(name, khval_t) \
KHASH_INIT(name, __uint128_t, khval_t, 1, kh_int128_hash_func, kh_int128_hash_equal)

#define KHASH_SET_INIT_INT128(name) \
KHASH_INIT(name, __uint128_t, char, 0, kh_int128_hash_func, kh_int128_hash_equal)

typedef unsigned char byte;
#define KHASH_MAP_INIT_INT256(name, khval_t) \
KHASH_INIT(name, uint256_t, khval_t, 1, kh_int256_hash_func, kh_int128_hash_equal)

KHASH_MAP_INIT_INT256(S256M, byte)
KHASH_MAP_INIT_INT128(S128M, byte)
KHASH_MAP_INIT_INT64(S64M, byte)
KHASH_SET_INIT_INT128(S128S)
KHASH_SET_INIT_INT64(S64S)

byte MINIMUM_ABUNDANCE = 1;
Expand Down Expand Up @@ -84,8 +85,8 @@ void DifferenceInPlaceThread##variant(void *arg, long i, int _) {

INIT_KHASH_UTILS(64, 64S)
INIT_KHASH_UTILS(64, 64M)
INIT_KHASH_UTILS(128, 128S)
INIT_KHASH_UTILS(128, 128M)
INIT_KHASH_UTILS(256, 256M)

/// Return the next k-mer in the k-mer set and update the index.
template <typename KHT, typename kmer_t>
Expand Down
16 changes: 14 additions & 2 deletions src/kmers.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
#include <iostream>
#include <cstdint>

#include "uint256_t/uint256_t.h"

typedef uint64_t kmer64_t;
typedef __uint128_t kmer128_t;
typedef uint256_t kmer256_t;

/// Convert the given basic nucleotide to int so it can be used for indexing in AC.
/// If non-existing nucleotide is given, return -1.
Expand Down Expand Up @@ -71,9 +74,17 @@ inline kmer128_t word_reverse_complement(kmer128_t w) {
w = ( w >> 64 ) | ( w << 64);
return ((U)-1) - w;
}
/// Compute the reverse complement of a word.
/// Copyright: Jellyfish GPL-3.0
inline kmer256_t word_reverse_complement(kmer256_t w) {
kmer128_t low = word_reverse_complement(w.lower());
kmer128_t high = word_reverse_complement(w.upper());
return kmer256_t(low, high);
}

constexpr int KMER_SIZE_64 = 64;
constexpr int KMER_SIZE_128 = 128;
constexpr int KMER_SIZE_256 = 256;
#define INIT_KMERS(type) \
\
/* Get the mask to mask k-mers. */ \
Expand All @@ -89,6 +100,7 @@ inline kmer##type##_t ReverseComplement(kmer##type##_t kMer, int k) {

INIT_KMERS(64)
INIT_KMERS(128)
INIT_KMERS(256)

/// Return the lexicographically smaller of the k-mer and its reverse complement.
template <typename kmer_t>
Expand All @@ -102,7 +114,7 @@ const char letters[4] {'A', 'C', 'G', 'T'};
/// Return the index-th nucleotide from the encoded k-mer.
template <typename kmer_t>
inline char NucleotideAtIndex(kmer_t encoded, int k, int index) {
return letters[(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
return letters[(uint32_t)(encoded >> ((k - index - kmer_t(1)) << kmer_t(1))) & kmer_t(3)];
}

/// Convert the encoded KMer representation to string.
Expand All @@ -111,7 +123,7 @@ std::string NumberToKMer(kmer_t encoded, int length) {
std::string ret(length, 'N');
for (int i = 0; i < length; ++i) {
// The last two bits correspond to one nucleotide.
ret[length - i -1] = letters[encoded & 3];
ret[length - i -1] = letters[(uint32_t)(encoded & 3)];
// Move to the next letter.
encoded >>= 2;
}
Expand Down
93 changes: 5 additions & 88 deletions src/kthread.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
/* The MIT License

Copyright (c) 2008, 2009, 2011 by Attractive Chaos <[email protected]>
*/
#include <pthread.h>
#include <stdlib.h>
#include <limits.h>
Expand Down Expand Up @@ -69,91 +73,4 @@ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
long j;
for (j = 0; j < n; ++j) func(data, j, 0);
}
}

/*****************
* kt_pipeline() *
*****************/

struct ktp_t;

typedef struct {
struct ktp_t *pl;
int64_t index;
int step;
void *data;
} ktp_worker_t;

typedef struct ktp_t {
void *shared;
void *(*func)(void*, int, void*);
int64_t index;
int n_workers, n_steps;
ktp_worker_t *workers;
pthread_mutex_t mutex;
pthread_cond_t cv;
} ktp_t;

static void *ktp_worker(void *data)
{
ktp_worker_t *w = (ktp_worker_t*)data;
ktp_t *p = w->pl;
while (w->step < p->n_steps) {
// test whether we can kick off the job with this worker
pthread_mutex_lock(&p->mutex);
for (;;) {
int i;
// test whether another worker is doing the same step
for (i = 0; i < p->n_workers; ++i) {
if (w == &p->workers[i]) continue; // ignore itself
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
break;
}
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
pthread_cond_wait(&p->cv, &p->mutex);
}
pthread_mutex_unlock(&p->mutex);

// working on w->step
w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL

// update step and let other workers know
pthread_mutex_lock(&p->mutex);
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
if (w->step == 0) w->index = p->index++;
pthread_cond_broadcast(&p->cv);
pthread_mutex_unlock(&p->mutex);
}
pthread_exit(0);
}

void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
{
ktp_t aux;
pthread_t *tid;
int i;

if (n_threads < 1) n_threads = 1;
aux.n_workers = n_threads;
aux.n_steps = n_steps;
aux.func = func;
aux.shared = shared_data;
aux.index = 0;
pthread_mutex_init(&aux.mutex, 0);
pthread_cond_init(&aux.cv, 0);

aux.workers = (ktp_worker_t*)calloc(n_threads, sizeof(ktp_worker_t));
for (i = 0; i < n_threads; ++i) {
ktp_worker_t *w = &aux.workers[i];
w->step = 0; w->pl = &aux; w->data = 0;
w->index = aux.index++;
}

tid = (pthread_t*)calloc(n_threads, sizeof(pthread_t));
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
free(tid); free(aux.workers);

pthread_mutex_destroy(&aux.mutex);
pthread_cond_destroy(&aux.cv);
}
}
1 change: 0 additions & 1 deletion src/kthread.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ extern "C" {
#endif

void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);

#ifdef __cplusplus
}
Expand Down
12 changes: 5 additions & 7 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include "khash_utils.h"


constexpr int MAX_K = 64;
constexpr int MAX_K = 128;


int Help() {
Expand Down Expand Up @@ -198,8 +198,8 @@ int run##version(int32_t k,

INIT_RUN(64, 64S)
INIT_RUN(64, 64M)
INIT_RUN(128, 128S)
INIT_RUN(128, 128M)
INIT_RUN(256, 256M)

int main(int argc, char **argv) {
int32_t k = -1;
Expand Down Expand Up @@ -329,11 +329,9 @@ int main(int argc, char **argv) {
} else {
return run64M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
} else if (k <= 64) {
return run128M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
} else {
if (MINIMUM_ABUNDANCE == (byte)1) {
return run128S(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
} else {
return run128M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
return run256M(k, intersectionPath, inPaths, outPaths, statsPath, fstats, computeIntersection, computeOutput, verbose, complements, threads, setCount);
}
}
2 changes: 1 addition & 1 deletion src/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ void ReadKMersThread##variant(void *arg, long i, int _) {

INIT_PARSER(64, 64S)
INIT_PARSER(64, 64M)
INIT_PARSER(128, 128S)
INIT_PARSER(128, 128M)
INIT_PARSER(256, 256M)
6 changes: 3 additions & 3 deletions src/prophasm.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void NextSimplitig(KHT *kMers, kmer_t begin, std::ostream& of, int k, bool comp
} else {
// Extend the simplitig to the right.
eraseKMer(kMers, next, k, complements);
simplitig.emplace_back(letters[ext]);
simplitig.emplace_back(letters[(uint32_t)ext]);
last = next;
}
} else {
Expand All @@ -71,7 +71,7 @@ void NextSimplitig(KHT *kMers, kmer_t begin, std::ostream& of, int k, bool comp
} else {
// Extend the simplitig to the left.
eraseKMer(kMers, next, k, complements);
simplitig.emplace_front(letters[ext]);
simplitig.emplace_front(letters[(uint32_t)ext]);
first = next;
}
}
Expand Down Expand Up @@ -119,5 +119,5 @@ void ComputeSimplitigsThread##variant(void *arg, long i, int _) {

INIT_PROPHASM(64, 64S)
INIT_PROPHASM(64, 64M)
INIT_PROPHASM(128, 128S)
INIT_PROPHASM(128, 128M)
INIT_PROPHASM(256, 256M)
17 changes: 17 additions & 0 deletions src/uint256_t/uint256_t.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// IMPLEMENTATION BUILD HEADER

// We need uint128_t symbols as plain "extern", neither import nor export
// because we're linking the 128 and 256 object files into a single library
// So we can only have one export for symbol in any translation unit
#define UINT256_T_EXTERN
typedef __uint128_t uint128_t;
#undef UINT256_T_EXTERN

#ifndef _UNIT256_T_BUILD
#define _UINT256_T_BUILD
#include "uint256_t_config.include"
const uint128_t uint128_0(0);
const uint128_t uint128_1(1);
#define UINT256_T_EXTERN _UINT256_T_EXPORT
#endif
#include "uint256_t.include"
Loading
Loading