Skip to content

Commit

Permalink
Mash-like distances (initial implementation)
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Jul 24, 2023
1 parent f37704a commit 7074f74
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CFLAGS= -g -Wall -O2
CFLAGS= -g -Wall -O3
CPPFLAGS=
INCLUDES=
OBJS= kthread.o bbf.o htab.o bseq.o misc.o sys.o 6gjdn.o \
Expand Down
152 changes: 152 additions & 0 deletions htab.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <stdarg.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include "kthread.h"
#include "yak-priv.h"

Expand Down Expand Up @@ -296,3 +297,154 @@ yak_ch_t *yak_ch_restore(const char *fn)
{
return yak_ch_restore_core(0, fn, YAK_LOAD_ALL);
}


//SET OPERATIONS

/*
 * State shared by main_setops() and every worker_setops() invocation.
 * The result arrays hold one slot per unordered pair of input files and are
 * indexed by the same triangle index that tri2rowcol() decodes.
 */
struct setops_dat {
	char **filenames;    /* input paths; a pair's row/col index into this array */
	yak_ch_t **hts;      /* only referenced by commented-out caching code */
	volatile int *lhts;  /* only referenced by commented-out caching code */
	int n_files;         /* not set or read anywhere in the visible code */
	int n_tables;        /* number of leading sub-tables compared per input */
	uint64_t *ca;        /* per pair: entries counted for A */
	uint64_t *cb;        /* per pair: entries counted for B */
	uint64_t *cab;       /* per pair: entries present in both A and B */
	uint64_t *aob;       /* per pair: entries present in A or B */
	double *mashd;       /* per pair: Mash-like distance derived from cab/aob */
};

/**
 * Decode a linear index into the strictly-lower triangle of a square matrix.
 *
 * Pairs are numbered row by row, so 0 -> (1,0), 1 -> (2,0), 2 -> (2,1),
 * 3 -> (3,0), and so on; the result always satisfies *row > *col.
 *
 * The row is located with an integer binary search rather than sqrt() on a
 * double: the answer is therefore exact for every non-negative long, not only
 * while 2*i + 0.25 happens to be exactly representable.
 *
 * @param i    triangle index; must be non-negative
 * @param row  receives the row of the pair (always >= 1)
 * @param col  receives the column of the pair (always < *row)
 */
void tri2rowcol(long i, size_t *row, size_t *col)
{
	assert(i >= 0);

	const uint64_t idx = (uint64_t)i;
	uint64_t lo = 0;
	uint64_t hi = UINT32_MAX; /* r <= 2^32-1 keeps r*(r+1) within 64 bits */

	/* Find the largest r with r*(r+1)/2 <= idx. */
	while (lo < hi) {
		const uint64_t mid = lo + (hi - lo + 1) / 2;
		if (mid * (mid + 1) / 2 <= idx)
			lo = mid;
		else
			hi = mid - 1;
	}

	*row = (size_t)(lo + 1);                    /* skip the diagonal */
	*col = (size_t)(idx - lo * (lo + 1) / 2);   /* offset inside that row */
}

/**
 * kt_for() callback: compare one pair of .yak tables and record the set sizes.
 *
 * The pair is selected by decoding triangle index `i` with tri2rowcol(); both
 * files are loaded with yak_ch_restore(), compared over the first
 * so->n_tables sub-tables, and destroyed again before returning.
 *
 * Results written to slot `i` of the shared arrays:
 *   ca    - entries counted in A
 *   cb    - entries counted in B
 *   cab   - entries found in both
 *   aob   - entries found in either
 *   mashd - 0 when cab == aob, otherwise -(1/k) * log(2j / (1 + j)) with
 *           j = cab / aob and k taken from table A
 *
 * @param data  pointer to the shared struct setops_dat
 * @param i     triangle index of the pair to process
 * @param tid   worker thread id supplied by kt_for(); unused
 */
static void worker_setops(void *data, long i, int tid) // callback for kt_for()
{
	struct setops_dat *so = data;
	khint_t k;

	size_t row, col;
	tri2rowcol(i, &row, &col);

	yak_ch_t *A = yak_ch_restore(so->filenames[row]);
	yak_ch_t *B = yak_ch_restore(so->filenames[col]);
	// NOTE(review): assumes yak_ch_restore() returns NULL when a file cannot
	// be loaded -- confirm. Without this check a bad path is dereferenced below.
	if (A == NULL || B == NULL) {
		fprintf(stderr, "[E::%s] failed to load %s\n", __func__,
				so->filenames[A == NULL ? row : col]);
		exit(EXIT_FAILURE);
	}

	// NOTE(review): assumes both inputs hold at least so->n_tables sub-tables
	// and share the same `pre`; neither is verified here -- confirm.
	uint64_t ca = 0, cb = 0, cab = 0, aob = 0;
	for (int t = 0; t < so->n_tables; t++) {
		yak_ht_t *ah = A->h[t].h;
		yak_ht_t *bh = B->h[t].h;

		// Pass 1: every occupied slot of A counts toward A and the union;
		// it additionally counts toward B and the intersection if B has it.
		for (k = 0; k < kh_end(ah); ++k) {
			if (!kh_exist(ah, k)) continue;
			uint64_t x = kh_key(ah, k);
			// NOTE(review): the key read back from the table is shifted
			// again before the lookup; verify against the key encoding
			// used in yak-priv.h.
			khint_t bk = yak_ht_get(bh, x >> A->pre << YAK_COUNTER_BITS);
			ca++;
			aob++;
			// A miss is reported as kh_end(); that index is not a valid
			// bucket, so it must not be passed to kh_exist().
			if (bk != kh_end(bh)) {
				cab++;
				cb++;
			}
		}

		// Pass 2: add the entries that only B has.
		for (k = 0; k < kh_end(bh); ++k) {
			if (!kh_exist(bh, k)) continue;
			uint64_t x = kh_key(bh, k);
			khint_t ak = yak_ht_get(ah, x >> A->pre << YAK_COUNTER_BITS);
			if (ak == kh_end(ah)) {
				cb++;
				aob++;
			}
		}
	}

	so->ca[i] = ca;
	so->cb[i] = cb;
	so->cab[i] = cab;
	so->aob[i] = aob;

	if (cab == aob) {
		// Identical sets (including two empty ones): distance is zero and
		// the 0/0 Jaccard ratio is never formed.
		so->mashd[i] = 0;
	} else {
		double j = (double)cab / (double)aob;
		so->mashd[i] = -(1.0 / A->k) * log(2 * j / (1 + j));
	}

	yak_ch_destroy(A);
	yak_ch_destroy(B);

	// uint64_t is not `unsigned long` on every platform, so print via %llu.
	fprintf(stderr, "[M::%s::%.3f*%.2f] processed %s vs %s -> %llu\n", __func__,
			yak_realtime(), yak_cputime() / yak_realtime(),
			so->filenames[row], so->filenames[col], (unsigned long long)aob);
}

#include "ketopt.h"
int main_setops(int argc, char *argv[])
{
struct setops_dat so;
ketopt_t o = KETOPT_INIT;
int n_thread = 1;
int sz = 1<<10;

char c;
while ((c = ketopt(&o, argc, argv, 1, "n:t:", 0)) >= 0) {
if (c == 't') n_thread = atoi(o.arg);
if (c == 'n') sz = atoi(o.arg);
}

int ni = argc - o.ind;
if (ni < 2 || sz > 1024 || sz < 0) {
fprintf(stderr, "USAGE: yak setops [options] <a.yak> <b.yak>...\n");
fprintf(stderr, "\n");
fprintf(stderr, "OPTIONS:\n");
fprintf(stderr, "\t-t INT Use INT threads\n");
fprintf(stderr, "\t-n INT Use only first INT/1024 tables\n");
return 1;
}

so.filenames = argv + o.ind;
so.n_tables = sz;

size_t n = (ni * (ni-1)) /2;
so.ca = calloc(n, 8);
so.cb = calloc(n, 8);
so.cab = calloc(n, 8);
so.aob = calloc(n, 8);
//so.hts = calloc(ni, 8);
//so.lhts = calloc(ni, 8);
so.mashd = calloc(n, sizeof(double));

kt_for(n_thread, worker_setops, &so, n);

fprintf(stdout, "A\tB\tn_tbls\tkmers_A\tkmers_B\tkmers_AandB\tkmers_AorB\tmashd\n");
for (size_t i = 0; i < n; i++) {
size_t row, col;
tri2rowcol(i, &row, &col);
fprintf(stdout, "%s\t%s\t%d\t%lu\t%lu\t%lu\t%lu\t%lf\n",
so.filenames[row], so.filenames[col], sz, so.ca[i], so.cb[i], so.cab[i], so.aob[i], so.mashd[i]);
}
free(so.ca);
free(so.cb);
free(so.cab);
free(so.aob);
free(so.mashd);
return EXIT_SUCCESS;
}

4 changes: 4 additions & 0 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "ketopt.h"
#include "yak-priv.h"


int main_count(int argc, char *argv[])
{
yak_ch_t *h;
Expand Down Expand Up @@ -117,6 +118,7 @@ int main_qv(int argc, char *argv[])

int main(int argc, char *argv[])
{
extern int main_setops(int argc, char *argv[]);
extern int main_triobin(int argc, char *argv[]);
extern int main_trioeval(int argc, char *argv[]);
extern int main_inspect(int argc, char *argv[]);
Expand All @@ -134,6 +136,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " inspect k-mer hash tables\n");
fprintf(stderr, " chkerr check errors\n");
fprintf(stderr, " sexchr count sex-chromosome-specific k-mers\n");
fprintf(stderr, " setops calculate set operations between .yak tables\n");
fprintf(stderr, " version print version number\n");
return 1;
}
Expand All @@ -144,6 +147,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "inspect") == 0) ret = main_inspect(argc-1, argv+1);
else if (strcmp(argv[1], "chkerr") == 0) ret = main_chkerr(argc-1, argv+1);
else if (strcmp(argv[1], "sexchr") == 0) ret = main_sexchr(argc-1, argv+1);
else if (strcmp(argv[1], "setops") == 0) ret = main_setops(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0) {
puts(YAKS_VERSION);
return 0;
Expand Down

0 comments on commit 7074f74

Please sign in to comment.