From 2c1dbef55e47157bd03c695cef363557925596a7 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 12:52:58 +0800
Subject: [PATCH 01/43] pileup and full-alignment create tensor c implement,
 pileup followed medaka pileup count calculation style, full-alignment
 integrated read-haplotagging function

---
 src/clair3_full_alignment.c | 910 ++++++++++++++++++++++++++++++++++++
 src/clair3_full_alignment.h | 257 ++++++++++
 src/clair3_pileup.c         | 462 ++++++++++++++++++
 src/clair3_pileup.h         | 105 +++++
 src/khash.h                 | 627 +++++++++++++++++++++++++
 src/kvec.h                  |  90 ++++
 src/levenshtein.c           |  72 +++
 src/levenshtein.h           |  17 +
 src/medaka_bamiter.c        |  72 +++
 src/medaka_bamiter.h        |  37 ++
 src/medaka_common.c         |  99 ++++
 src/medaka_common.h         |  60 +++
 src/medaka_khcounter.c      | 135 ++++++
 src/medaka_khcounter.h      |  53 +++
 14 files changed, 2996 insertions(+)
 create mode 100644 src/clair3_full_alignment.c
 create mode 100644 src/clair3_full_alignment.h
 create mode 100644 src/clair3_pileup.c
 create mode 100644 src/clair3_pileup.h
 create mode 100644 src/khash.h
 create mode 100644 src/kvec.h
 create mode 100644 src/levenshtein.c
 create mode 100644 src/levenshtein.h
 create mode 100644 src/medaka_bamiter.c
 create mode 100644 src/medaka_bamiter.h
 create mode 100644 src/medaka_common.c
 create mode 100644 src/medaka_common.h
 create mode 100644 src/medaka_khcounter.c
 create mode 100644 src/medaka_khcounter.h

diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c
new file mode 100644
index 0000000..06f6d07
--- /dev/null
+++ b/src/clair3_full_alignment.c
@@ -0,0 +1,910 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <math.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "htslib/sam.h"
+#include "htslib/faidx.h"
+#include "khash.h"
+#include "kvec.h"
+#include "medaka_bamiter.h"
+#include "medaka_common.h"
+#include "medaka_khcounter.h"
+#include "clair3_full_alignment.h"
+#include "zlib.h"
+#include "levenshtein.h"
+
+typedef struct Pos_alt_info
+{
+    // per-candidate allele tallies, filled while reads are scanned in calculate_clair3_full_alignment
+    khash_t(KH_COUNTER) * ins_counter;     // inserted sequence (string) -> number of reads carrying it
+    khash_t(KH_INT_COUNTER) * del_counter; // deletion length -> number of reads carrying it
+    size_t acgt_count[4];                  // base counts at the candidate, slot chosen through acgt2num
+    size_t depth;                          // reads with an aligned base or a deletion at the candidate
+
+} Pos_alt_info;
+
+int com_func(const void *a, const void *b)
+{
+    // qsort-style three-way compare of two size_t values; a subtraction would wrap and be truncated to int
+    return (*(const size_t *)a > *(const size_t *)b) - (*(const size_t *)a < *(const size_t *)b);
+}
+
+int hap_cmp(const void *x, const void *y)
+{
+    // qsort comparator for HAP entries: order by haplotype first, then by read_index
+    const HAP *a = (const HAP *)x;
+    const HAP *b = (const HAP *)y;
+    if (a->haplotype < b->haplotype)
+        return -1;
+    else if (a->haplotype > b->haplotype)
+        return 1;
+    else // three-way compare instead of a subtraction, which can wrap or be truncated when converted to int
+        return (a->read_index > b->read_index) - (a->read_index < b->read_index);
+}
+
+void destroy_fa_data(fa_data data)
+{
+    // release the matrix, each of the candidates_num alt-info strings, the string table and finally the container itself
+    free(data->matrix);
+    for (size_t i = 0; i < data->candidates_num; i++)
+    {
+       free(data->all_alt_info[i]);
+    }
+    free(data->all_alt_info);
+    free(data);
+}
+
+void sort_read_name_by_haplotype(HAP *read_hap_array, int *matrix_read_index_array, size_t n)
+{
+    // Fill the matrix_depth slots of matrix_read_index_array with the read indices of up to matrix_depth of the n entries, ordered by hap_cmp.
+    size_t read_num = min(n, matrix_depth);
+    if (n > matrix_depth)
+    {
+        // more reads than rows: shuffle all n entries so the first matrix_depth form a random subset (rand() is not seeded in this function)
+        for (size_t i = 0; i < n - 1; i++)
+        {
+            size_t j = i + rand() / (RAND_MAX / (n - i) + 1); // random index in [i, n)
+            size_t tmp_read_index = read_hap_array[j].read_index;
+            size_t tmp_haplotype = read_hap_array[j].haplotype;
+            read_hap_array[j].read_index = read_hap_array[i].read_index;
+            read_hap_array[j].haplotype = read_hap_array[i].haplotype;
+            read_hap_array[i].read_index = tmp_read_index;
+            read_hap_array[i].haplotype = tmp_haplotype;
+        }
+    }
+
+    qsort(read_hap_array, read_num, sizeof(HAP), hap_cmp); // only the kept entries are sorted
+
+    // fewer reads than rows: place the reads in the middle and pad both ends with -1
+    if (n < matrix_depth)
+    {
+        size_t padding_depth = matrix_depth - read_num;
+        size_t prefix_padding_depth = padding_depth >> 1; // the suffix receives the extra slot when the padding is odd
+        size_t suffix_padding_depth = padding_depth - prefix_padding_depth;
+        for (size_t i = 0; i < prefix_padding_depth; i++)
+            matrix_read_index_array[i] = -1;
+        for (size_t i = 0; i < read_num; i++)
+            matrix_read_index_array[i + prefix_padding_depth] = read_hap_array[i].read_index;
+        for (size_t i = 0; i < suffix_padding_depth; i++)
+            matrix_read_index_array[read_num + prefix_padding_depth + i] = -1;
+    }
+    else
+    {
+        for (size_t i = 0; i < matrix_depth; i++)
+            matrix_read_index_array[i] = read_hap_array[i].read_index;
+    }
+}
+
+void cigar_prefix_length(uint32_t *cigartuples, size_t reference_bases, size_t *ref_bases, size_t *query_bases, size_t left_cigar_index, size_t right_cigar_index, size_t consumed, bool reverse)
+{
+    // Walk the CIGAR ops in [left_cigar_index, right_cigar_index) (last to first when `reverse`) until `reference_bases`
+    // reference bases are covered; *ref_bases / *query_bases receive the reference and query bases spanned by that prefix.
+    size_t ref_pos = 0;
+    size_t query_pos = 0;
+    for (size_t i = left_cigar_index; i < right_cigar_index; i++)
+    {
+        size_t index = reverse ? left_cigar_index + right_cigar_index - i - 1 : i;
+        size_t cigar_op = bam_cigar_op(cigartuples[index]);
+        size_t length = i == left_cigar_index ? consumed : bam_cigar_oplen(cigartuples[index]); // first visited op counts only `consumed` bases
+        if (length == 0)
+            continue;
+
+        if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF)
+        {
+            query_pos += length;
+            ref_pos += length;
+            if (ref_pos >= reference_bases)
+            {
+                *ref_bases = reference_bases;
+                *query_bases = query_pos + reference_bases - ref_pos;
+                return;
+            }
+        }
+        else if (cigar_op == BAM_CDEL)
+        {
+            ref_pos += length;
+            if (ref_pos >= reference_bases)
+            {
+                *ref_bases = reference_bases;
+                *query_bases = query_pos;
+                return;
+            }
+        }
+        else if (cigar_op == BAM_CINS)
+            query_pos += length;
+        else if (cigar_op == BAM_CREF_SKIP)
+        {
+            *ref_bases = reference_bases;
+            *query_bases = query_pos;
+            return;
+        }
+    }
+    // the alignment ended before reference_bases were covered: report the partial prefix instead of leaving the outputs unset
+    *ref_bases = ref_pos;
+    *query_bases = query_pos;
+}
+
+char *get_ref_seq(char *ref_seq, size_t start, size_t end)
+{
+    // return a newly malloc'd, NUL-terminated copy of ref_seq[start, end); the caller frees it (see realign_read)
+    size_t seq_size = end - start;
+    char *sub_seq = malloc((seq_size + 1));
+    strncpy(sub_seq, ref_seq + start, seq_size);
+    sub_seq[seq_size] = '\0';
+    return sub_seq;
+}
+
+size_t get_read_end(uint32_t *cigartuples, size_t n_cigar, size_t read_start)
+{
+    // reference end of an alignment: read_start plus the length of every reference-consuming op (M, =, X, D, N)
+    size_t ref_pos = read_start;
+    for (size_t i = 0; i < n_cigar; i++)
+    {
+        size_t cigar_op = bam_cigar_op(cigartuples[i]);
+        size_t length = bam_cigar_oplen(cigartuples[i]);
+        if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF || cigar_op == BAM_CDEL || cigar_op == BAM_CREF_SKIP)
+        {
+            ref_pos += length;
+        }
+    }
+    return ref_pos; // one past the last reference position covered
+}
+
+char *get_query_seq(uint8_t *seqi, size_t start, size_t end)
+{
+    // decode query bases [start, end) of a BAM-encoded sequence into a newly malloc'd, NUL-terminated string; the caller frees it
+    size_t seq_size = end - start;
+    char *sub_seq = malloc((seq_size + 1));
+    for (size_t i = 0; i < seq_size; i++)
+    {
+        sub_seq[i] = seq_nt16_str[bam_seqi(seqi, start + i)]; // base code -> letter
+    }
+    sub_seq[seq_size] = '\0';
+    return sub_seq;
+}
+
+void update_haplotype_cost(int allele, int phase_set, int genotype, khash_t(KH_INT_COUNTER) * haplotype_cost)
+{
+    // cast one vote for `phase_set`: +1 when the read's allele equals the variant genotype, -1 otherwise
+    if (allele == 0) // realign_read could not decide between the alleles: no vote
+        return;
+
+    if (allele == genotype)
+    {
+        kh_int_counter_add(haplotype_cost, phase_set, 1);
+    }
+    else
+    {
+        kh_int_counter_add(haplotype_cost, phase_set, -1);
+    }
+}
+
+int realign_read(Variant *variant, Read *read, size_t i, size_t consumed, size_t query_pos, char *reference, size_t ref_start)
+{
+    // Compare the read around `variant` (CIGAR op i, `consumed` bases into it) with both alleles: returns 1 if it is closer to the reference, 2 if closer to the alternative, 0 on a tie.
+    uint32_t *cigartuples = read->cigartuples;
+    uint8_t *seqi = read->seqi;
+    size_t n_cigar = read->n_cigar;
+    size_t middle_op = bam_cigar_op(cigartuples[i]);
+    size_t middle_length = bam_cigar_oplen(cigartuples[i]);
+    size_t left_consumed = consumed > 0 ? consumed : 0;
+    size_t right_consumed = consumed < middle_length ? middle_length - consumed : 0;
+    size_t left_ref_bases = 0;
+    size_t left_query_bases = 0;
+    size_t right_ref_bases = 0;
+    size_t right_query_bases = 0;
+    size_t left_cigar_size = i + 1;
+    size_t right_cigar_size = i;
+
+    cigar_prefix_length(cigartuples, overhang, &left_ref_bases, &left_query_bases, 0, left_cigar_size, left_consumed, true);
+    cigar_prefix_length(cigartuples, overhang + 1, &right_ref_bases, &right_query_bases, right_cigar_size, n_cigar, right_consumed, false);
+
+    char *query = get_query_seq(seqi, query_pos - left_query_bases, query_pos + right_query_bases);
+    char *ref = get_ref_seq(reference, variant->position - left_ref_bases - ref_start, variant->position + right_ref_bases - ref_start);
+    // alt = ref with the variant base substituted; the spare zeroed byte keeps it NUL-terminated when right_ref_bases is 0
+    size_t alt_length = left_ref_bases + right_ref_bases + 2;
+    char *alt = calloc(alt_length, sizeof(char));
+    strcpy(alt, ref);
+    alt[left_ref_bases] = variant->alt_base;
+
+    size_t distance_ref = levenshtein(query, ref);
+    size_t distance_alt = levenshtein(query, alt);
+
+    int allele = 0;
+    if (distance_ref < distance_alt)
+    {
+        allele = 1;
+    }
+    else if (distance_ref > distance_alt)
+    {
+        allele = 2;
+    }
+
+    free(query);
+    free(ref);
+    free(alt);
+
+    return allele;
+}
+
+int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_t ref_start)
+{
+    // Realign `read` at every variant it covers, tally per-phase-set votes and return HAP_UNPHASED, HAP_1 or HAP_2; also sets read->read_end.
+    size_t n = variants_info->variant_num;
+    size_t query_pos = 0;
+    size_t v_position = 0;
+    Variant **variants = variants_info->variants;
+    uint8_t *seqi = read->seqi;
+    uint32_t *cigartuples = read->cigartuples;
+    size_t n_cigar = read->n_cigar;
+    size_t j = variants_info->variant_current_pos;
+    size_t ref_pos = read->read_start;
+    khash_t(KH_INT_COUNTER) *haplotype_cost = kh_init(KH_INT_COUNTER);
+    int allele = 0;
+
+    while (j < n && variants[j]->position < ref_pos) // skip variants that lie before the read start
+        j += 1;
+
+    for (size_t i = 0; i < n_cigar; i++)
+    {
+        size_t cigar_op = bam_cigar_op(cigartuples[i]);
+        size_t length = bam_cigar_oplen(cigartuples[i]);
+        if (j < n)
+            v_position = variants[j]->position;
+
+        if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF)
+        { // XM=
+            while (j < n && v_position < ref_pos + length)
+            {
+                allele = realign_read(variants[j], read, i, v_position - ref_pos, query_pos + v_position - ref_pos, ref_seq, ref_start);
+                update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost);
+                j++;
+                if (j < n)
+                    v_position = variants[j]->position;
+            }
+            query_pos += length;
+            ref_pos += length;
+        }
+        else if (cigar_op == BAM_CINS)
+        { // I
+            if (j < n && v_position == ref_pos) // only a variant sitting exactly at the insertion point is evaluated
+            {
+                allele = realign_read(variants[j], read, i, 0, query_pos, ref_seq, ref_start);
+                update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost);
+                j++;
+                if (j < n)
+                    v_position = variants[j]->position;
+            }
+            query_pos += length;
+        }
+        else if (cigar_op == BAM_CDEL)
+        {
+            while (j < n && v_position < ref_pos + length)
+            {
+                allele = realign_read(variants[j], read, i, v_position - ref_pos, query_pos, ref_seq, ref_start);
+                update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost);
+                j++;
+                if (j < n)
+                    v_position = variants[j]->position;
+            }
+            ref_pos += length;
+        }
+        else if (cigar_op == BAM_CREF_SKIP)
+        {
+            while (j < n && v_position < ref_pos + length) // variants inside a reference skip cast no vote
+            {
+                j++;
+                if (j < n)
+                    v_position = variants[j]->position;
+            }
+            ref_pos += length;
+        }
+        else if (cigar_op == BAM_CSOFT_CLIP)
+        {
+            query_pos += length;
+        }
+    }
+
+    read->read_end = ref_pos; // reference position reached after the last CIGAR op
+
+    size_t counter_size = 0;
+    int max_value = 0;
+    int min_value = 0;
+    for (khiter_t k = kh_begin(haplotype_cost); k != kh_end(haplotype_cost); ++k)
+    {
+        if (kh_exist(haplotype_cost, k))
+        {
+            int val = kh_val(haplotype_cost, k);
+            max_value = max(max_value, val);
+            min_value = min(min_value, val);
+            counter_size++;
+        }
+    }
+
+    kh_int_counter_destroy(haplotype_cost);
+
+    if (counter_size == 0 || (max_value == 0 && min_value == 0)) // no vote, or every phase set balanced out
+    {
+        return HAP_UNPHASED;
+    }
+    else if (max_value > abs(min_value)) // strongest positive tally beats the strongest negative one
+    {
+        return HAP_1;
+    }
+    else
+    {
+        return HAP_2;
+    }
+}
+
+size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t candidate_current_index, size_t flanking_candidates_num, size_t *flanking_candidates)
+{ // count the consecutive flanking candidates, starting at candidate_current_index, that lie inside [read_start, read_end)
+    size_t overlap_num = 0;
+    for (size_t i = candidate_current_index; i < flanking_candidates_num; i++)
+    {
+        if (flanking_candidates[i] >= read_start && flanking_candidates[i] < read_end)
+            overlap_num++;
+        else
+            return overlap_num; // stop at the first candidate outside the read
+    }
+    return overlap_num;
+}
+
+fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num)
+{
+    // Build the full-alignment tensor (candidate x matrix_depth x no_of_positions x channel_size, int8_t) and one alt-info
+    // string per candidate for `region`, haplotagging reads against `variants`; release the result with destroy_fa_data().
+    bool need_haplotagging = true;
+    int start, end;
+    char *chr = xalloc(strlen(region) + 1, sizeof(char), "chr");
+    strcpy(chr, region);
+    char *reg_chr = (char *)hts_parse_reg(chr, &start, &end);
+    if (reg_chr)
+        *reg_chr = '\0';
+
+    // open fasta
+    faidx_t *fai = fai_load(fasta_path);
+    int len = 0;
+    char *ref_seq = NULL;
+
+    const size_t offset_can = no_of_positions * matrix_depth * channel_size;
+    const size_t offset_row = no_of_positions * channel_size;
+    const size_t offset_col = channel_size;
+
+    int ref_start = start > (int)expand_reference_region ? start - (int)expand_reference_region : 0; // 0-index; clamped in int because the size_t subtraction would wrap for a small start
+    int ref_end = end + expand_reference_region; // NOTE(review): can exceed int if `end` is close to INT_MAX - confirm callers always pass a bounded region
+    ref_seq = faidx_fetch_seq(fai, chr, ref_start, ref_end, &len);
+
+    // open bam
+    htsFile *hts_file;
+    hts_idx_t *idx;
+    bam_hdr_t *header;
+
+    hts_file = sam_open(bam_path, "r");
+    idx = sam_index_load(hts_file, bam_path);
+    header = sam_hdr_read(hts_file);
+    const int tid = bam_name2id(header, chr);
+    hts_itr_t *iter = sam_itr_queryi(idx, tid, start, end);
+    // initialize an alignment
+    bam1_t *alignment = bam_init1();
+
+    size_t reads_num = 0;
+    size_t variant_current_pos = 0;
+    size_t flanking_candidates_num = 0;
+    size_t candidate_current_index = 0;
+    size_t read_no_overlap_num = 0;
+    Pos_info *pos_info = NULL;
+
+    Variants_info variants_info = {
+        .variants = variants,
+        .variant_num = variant_num,
+        .variant_current_pos = variant_current_pos};
+
+    // dict to store all candidates index
+    khash_t(KH_INT_COUNTER) *candidates_p = kh_init(KH_INT_COUNTER);
+    // dict to store all flanking candidate index
+    khash_t(KH_INT_COUNTER) *flanking_candidates_p = kh_init(KH_INT_COUNTER);
+    // dict to count all read name
+    khash_t(KH_COUNTER) *read_name_set = kh_init(KH_COUNTER);
+    // allocate a position alternative information struct for each candidate
+    Pos_alt_info *pos_alt_info = calloc(candidate_num, sizeof(Pos_alt_info));
+    // a kvec vector to store all read struct
+    kvec_t(Read) read_array;
+    kv_init(read_array);
+
+    for (size_t i = 0; i < candidate_num; i++)
+    {
+        size_t candidate = candidates[i];
+        // each candidate is a new key
+        kh_int_counter_add(candidates_p, candidate, i);
+        pos_alt_info[i].ins_counter = kh_init(KH_COUNTER);
+        pos_alt_info[i].del_counter = kh_init(KH_INT_COUNTER);
+        pos_alt_info[i].depth = 0;
+        for (size_t j = 0; j < 4; j++)
+            pos_alt_info[i].acgt_count[j] = 0;
+
+        for (size_t j = 0; j < no_of_positions; j++)
+        {
+            size_t key = candidate - flanking_base_num + j;
+            if (kh_int_counter_val(flanking_candidates_p, key) == -1)
+            {
+                kh_int_counter_add(flanking_candidates_p, key, flanking_candidates_num++);
+            }
+        }
+    }
+
+    size_t *flanking_candidates = calloc(flanking_candidates_num ? flanking_candidates_num : 1, sizeof(size_t)); // heap, not a runtime-sized stack array
+    for (khiter_t k = kh_begin(flanking_candidates_p); k != kh_end(flanking_candidates_p); ++k)
+    {
+        if (kh_exist(flanking_candidates_p, k))
+        {
+            size_t key = kh_key(flanking_candidates_p, k);
+            int val = kh_val(flanking_candidates_p, k);
+            flanking_candidates[val] = key;
+        }
+    }
+
+    while (sam_itr_next(hts_file, iter, alignment) >= 0)
+    {
+        int flag = alignment->core.flag;
+
+        if (flag & SAMTOOLS_VIEW_FILTER_FLAG)
+            continue;
+
+        if (alignment->core.qual < min_mq)
+            continue;
+
+        const char *q_name = bam_get_qname(alignment);
+        //skip the duplicated read name
+        int ret = 0;
+        khiter_t k = kh_put(KH_COUNTER, read_name_set, q_name, &ret);
+        if (ret == 1)
+        {
+            kh_key(read_name_set, k) = strdup(q_name);
+            kh_value(read_name_set, k) = 1;
+        }
+        else if (ret == 0)
+            continue;
+
+        bool is_reverse_strand = (flag & BAM_FREVERSE) != 0; // flag bit 16; such reads get strand value 50, the others 100
+        int32_t pos = alignment->core.pos;
+        uint32_t l_qseq = alignment->core.l_qseq;
+        uint32_t *cigartuples = bam_get_cigar(alignment);
+        uint8_t *seqi = bam_get_seq(alignment);
+        uint8_t *qual = bam_get_qual(alignment);
+        size_t n_cigar = alignment->core.n_cigar;
+
+        Read read = {
+            .mq = normalize_mq(alignment->core.qual),
+            .read_start = pos,
+            .cigartuples = cigartuples,
+            .seqi = seqi,
+            .qual = qual,
+            .strand = normalize_strand(is_reverse_strand),
+            .n_cigar = n_cigar,
+            .l_qseq = l_qseq,
+            .pos_info = NULL,
+            .haplotype = HAP_UNPHASED};
+
+        while (variant_current_pos < variant_num && variants[variant_current_pos]->position < pos)
+            variant_current_pos++;
+        variants_info.variant_current_pos = variant_current_pos;
+
+        while (candidate_current_index < flanking_candidates_num && flanking_candidates[candidate_current_index] < pos)
+            candidate_current_index++;
+
+        read.read_end = get_read_end(cigartuples, n_cigar, read.read_start);
+
+        // get the overlap candidates number and skip the alignment if no flanking candidate overlapped
+        size_t overlap_candidates_num = get_overlap_candidate_num(pos, read.read_end, candidate_current_index, flanking_candidates_num, flanking_candidates);
+        read.overlap_candidates_num = overlap_candidates_num;
+        if (read.overlap_candidates_num == 0)
+        {
+            read_no_overlap_num++;
+            continue;
+        }
+
+        // haplotag the read following whatshap haplotagging logic
+        if (need_haplotagging && alignment->core.qual >= min_haplotag_mq)
+        {
+            read.haplotype = haplotag_read(&variants_info, &read, ref_seq, ref_start);
+        }
+
+        pos_info = calloc(overlap_candidates_num, sizeof(Pos_info));
+        for (size_t i = 0; i < overlap_candidates_num; i++)
+        {
+            pos_info[i].ins_bases = NULL;
+            pos_info[i].ins_length = 0;
+            pos_info[i].alt_base = 0;
+            pos_info[i].del_length = 0;
+            pos_info[i].bq = 0;
+        }
+
+        // index of current first overlapped flanking candidate
+        size_t flanking_start = kh_int_counter_val(flanking_candidates_p, flanking_candidates[candidate_current_index]);
+        read.flanking_start = flanking_start;
+
+        // store all overlapped flanking candidates information and put all centered candidate information
+        // into pos_alt_info struct
+        size_t ref_pos = read.read_start;
+        size_t query_pos = 0;
+        for (size_t i = 0; i < n_cigar; i++)
+        {
+            size_t cigar_op = bam_cigar_op(cigartuples[i]);
+            size_t length = bam_cigar_oplen(cigartuples[i]);
+            if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF)
+            {
+                for (size_t p = ref_pos; p < ref_pos + length; p++)
+                {
+                    int flanking_index = kh_int_counter_val(flanking_candidates_p, p);
+                    if (flanking_index != -1)
+                    {
+                        size_t offset = flanking_index - flanking_start;
+                        pos_info[offset].alt_base = bam_seqi(seqi, query_pos);
+                        pos_info[offset].bq = normalize_bq(qual[query_pos]);
+
+                        int center_pos_index = kh_int_counter_val(candidates_p, p);
+                        if (center_pos_index != -1)
+                        {
+                            char alt_base = seq_nt16_str[pos_info[offset].alt_base];
+                            pos_alt_info[center_pos_index].acgt_count[acgt2num[alt_base - 'A']]++;
+                            pos_alt_info[center_pos_index].depth++;
+                        }
+                    }
+                    query_pos++;
+                }
+                ref_pos += length;
+            }
+            else if (cigar_op == BAM_CDEL)
+            {
+                int flanking_index = kh_int_counter_val(flanking_candidates_p, ref_pos - 1);
+                if (flanking_index != -1 && (size_t)flanking_index >= flanking_start) // the anchor base may precede this read's first pos_info slot
+                {
+                    size_t offset = flanking_index - flanking_start;
+                    pos_info[offset].del_length = length;
+                    int center_pos_index = kh_int_counter_val(candidates_p, ref_pos - 1);
+                    if (center_pos_index != -1)
+                    {
+                        kh_int_counter_add(pos_alt_info[center_pos_index].del_counter, length, 1);
+                    }
+                }
+                for (size_t p = ref_pos; p < ref_pos + length; p++)
+                {
+                    int flanking_index = kh_int_counter_val(flanking_candidates_p, p);
+                    if (flanking_index != -1)
+                    {
+                        size_t offset = flanking_index - flanking_start;
+                        pos_info[offset].alt_base = -1;
+                        int center_pos_index = kh_int_counter_val(candidates_p, p);
+                        if (center_pos_index != -1)
+                        {
+                            pos_alt_info[center_pos_index].depth++;
+                        }
+                    }
+                }
+                ref_pos += length;
+            }
+            else if (cigar_op == BAM_CINS)
+            {
+                int flanking_index = kh_int_counter_val(flanking_candidates_p, ref_pos - 1);
+                if (flanking_index != -1 && (size_t)flanking_index >= flanking_start) // the anchor base may precede this read's first pos_info slot
+                {
+                    size_t offset = flanking_index - flanking_start;
+                    pos_info[offset].ins_bases = calloc(length + 1, sizeof(char));
+                    for (size_t ins_idx = 0; ins_idx < length; ins_idx++)
+                    {
+                        pos_info[offset].ins_bases[ins_idx] = seq_nt16_str[bam_seqi(read.seqi, query_pos + ins_idx)];
+                    }
+                    pos_info[offset].ins_bases[length] = '\0';
+                    pos_info[offset].ins_length = length;
+
+                    int center_pos_index = kh_int_counter_val(candidates_p, ref_pos - 1);
+                    if (center_pos_index != -1)
+                    {
+                        kh_counter_add(pos_alt_info[center_pos_index].ins_counter, pos_info[offset].ins_bases, 1);
+                    }
+                }
+                query_pos += length;
+            }
+            else if (cigar_op == BAM_CREF_SKIP)
+            {
+                ref_pos += length;
+            }
+            else if (cigar_op == BAM_CSOFT_CLIP)
+            {
+                query_pos += length;
+            }
+        }
+
+        //update the read array
+        read.pos_info = pos_info;
+        reads_num++;
+        kv_push(Read, read_array, read);
+    }
+
+    // allocate memory of the input matrix of all candidates
+    int8_t *matrix = calloc(candidate_num * matrix_depth * no_of_positions * channel_size, sizeof(int8_t));
+
+    HAP *read_hap_array = calloc(reads_num ? reads_num : 1, sizeof(HAP)); // heap, not a runtime-sized stack array
+    int matrix_read_index_array[matrix_depth];
+    Alt_info *alt_info = malloc(matrix_depth * sizeof(Alt_info));
+
+    char **alt_info_p = calloc(candidate_num, sizeof(char*));
+    fa_data data = calloc(1, sizeof(_fa_data));
+
+    // loop each candidate and generate full-alignment input matrix
+    for (size_t i = 0; i < candidate_num; i++)
+    {
+        size_t candidate = candidates[i];
+        size_t start_pos = candidate - flanking_base_num;
+        size_t end_pos = candidate + flanking_base_num + 1;
+        size_t candidate_depth = 0;
+        for (size_t j = 0; j < matrix_depth; j++)
+        {
+            alt_info[j].ins_bases = NULL;
+            alt_info[j].alt_base = '\0';
+            alt_info[j].del_length = 0;
+            alt_info[j].has_alt_info = false;
+        }
+
+        for (size_t j = 0; j < matrix_depth; j++)
+            matrix_read_index_array[j] = -1;
+
+        size_t overlap_read_num = 0;
+        for (size_t j = 0; j < reads_num; j++)
+        {
+            Read read = kv_A(read_array, j);
+            if (read.read_start >= end_pos)
+                break;
+            if (read.read_end <= start_pos)
+                continue;
+            read_hap_array[overlap_read_num].read_index = j;
+            read_hap_array[overlap_read_num++].haplotype = read.haplotype;
+        }
+
+        sort_read_name_by_haplotype(read_hap_array, matrix_read_index_array, overlap_read_num);
+
+        // loop each overlapped read of a candidate
+        for (size_t d = 0; d < matrix_depth; d++)
+        {
+            int read_index = matrix_read_index_array[d];
+            if (read_index == -1)
+                continue;
+            Read read = kv_A(read_array, read_index);
+            int8_t hap_v = normalize_hap(read.haplotype);
+            int8_t strand_v = read.strand;
+            int8_t mq_v = read.mq;
+
+            // loop all flanking position of a read
+            for (size_t p = 0; p < no_of_positions; p++)
+            {
+                size_t cp = p + start_pos;
+                size_t flanking_index = kh_int_counter_val(flanking_candidates_p, cp);
+                int32_t offset = flanking_index - read.flanking_start;
+                bool is_center_pos = p == flanking_base_num;
+
+                if (offset < 0 || offset >= read.overlap_candidates_num) // bounds-check before pos_info is touched: the read may not cover this position
+                    continue;
+
+                if (read.pos_info[offset].alt_base < 0)
+                {
+                    if (is_center_pos)
+                        candidate_depth++;
+                    continue;
+                }
+
+                int8_t alt_v = 0;
+                char ref_base = toupper(ref_seq[cp - ref_start]);
+                int8_t ref_v = num2countbase_fa[ref_base - 'A'];
+                int8_t bq_v = read.pos_info[offset].bq;
+
+                if (is_center_pos)
+                    candidate_depth++;
+                size_t alt_int = read.pos_info[offset].alt_base;
+                char alt_base = seq_nt16_str[read.pos_info[offset].alt_base];
+                if (read.pos_info[offset].ins_length > 0)
+                {
+                    size_t ins_length = read.pos_info[offset].ins_length;
+                    char *ins_bases = read.pos_info[offset].ins_bases;
+                    int8_t ins_v = 0;
+                    size_t max_ins_length = ins_length < no_of_positions - p ? ins_length : no_of_positions - p;
+
+                    for (size_t ins_idx = 0; ins_idx < ins_length; ins_idx++)
+                    {
+                        char ins_alt_base = ins_bases[ins_idx];
+                        if (ins_idx < max_ins_length && p < no_of_positions - 1)
+                        {
+                            ins_v = num2countbase_fa[ins_alt_base - 'A'];
+                            matrix[i * offset_can + d * offset_row + (ins_idx + p) * offset_col + 6] = ins_v;
+                        }
+                    }
+                    if (is_center_pos)
+                    {
+                        alt_info[d].alt_base = alt_base;
+                        alt_info[d].ins_bases = ins_bases;
+                        alt_info[d].has_alt_info = true;
+                    }
+                    alt_v = num2countbase_fa['I' - 'A'];
+                }
+                else if (read.pos_info[offset].del_length > 0)
+                {
+                    if (is_center_pos)
+                    {
+                        alt_info[d].del_length = read.pos_info[offset].del_length;
+                        alt_info[d].has_alt_info = true;
+                    }
+                    alt_v = num2countbase_fa['D' - 'A'];
+                }
+                else if (ref_base - alt_base != 0)
+                {
+                    if (is_center_pos)
+                    {
+                        alt_info[d].alt_base = alt_base;
+                        alt_info[d].has_alt_info = true;
+                    }
+                    alt_v = num2countbase_fa[alt_base - 'A'];
+                }
+
+                // update the matrix
+                matrix[i * offset_can + d * offset_row + p * offset_col + 0] = ref_v;
+                matrix[i * offset_can + d * offset_row + p * offset_col + 1] = alt_v;
+                matrix[i * offset_can + d * offset_row + p * offset_col + 2] = strand_v;
+                matrix[i * offset_can + d * offset_row + p * offset_col + 3] = mq_v;
+                matrix[i * offset_can + d * offset_row + p * offset_col + 4] = bq_v;
+                matrix[i * offset_can + d * offset_row + p * offset_col + 7] = hap_v;
+            }
+        }
+
+        // finish the candidate proportion channel;
+        candidate_depth = pos_alt_info[i].depth;
+        for (size_t j = 0; j < matrix_depth; j++)
+        {
+            int8_t af_v = 0;
+            if (alt_info[j].has_alt_info == false)
+                continue;
+            if (alt_info[j].ins_bases != NULL)
+            {
+                size_t count = kh_counter_val(pos_alt_info[i].ins_counter, alt_info[j].ins_bases);
+                if (count > 0)
+                    af_v = normalize_af(count / (float)candidate_depth);
+            }
+            else if (alt_info[j].del_length > 0)
+            {
+                size_t count = kh_int_counter_val(pos_alt_info[i].del_counter, alt_info[j].del_length);
+                if (count > 0)
+                    af_v = normalize_af(count / (float)candidate_depth);
+            }
+            else if (alt_info[j].alt_base != '\0')
+            {
+                size_t offset = alt_info[j].alt_base - 'A';
+                af_v = normalize_af(pos_alt_info[i].acgt_count[acgt2num[offset]] / (float)candidate_depth);
+            }
+
+            if (af_v > 0)
+            {
+                for (size_t p = 0; p < no_of_positions; p++)
+                {
+                    if (matrix[i * offset_can + j * offset_row + p * offset_col + 0] != 0)
+                        matrix[i * offset_can + j * offset_row + p * offset_col + 5] = af_v;
+                }
+            }
+        }
+
+        // store the alternative information into string
+        size_t max_alt_length = 64;
+        char *alt_info_str = calloc(max_alt_length, sizeof(char));
+        char center_ref_base = toupper(ref_seq[candidate - ref_start]);
+
+        sprintf(alt_info_str, "%zu-%zu-%c-", candidate + 1, candidate_depth, center_ref_base); // %zu: both arguments are size_t
+        for (size_t j = 0; j < 4; j++)
+        {
+            if (j != acgt2num[center_ref_base - 'A'] && pos_alt_info[i].acgt_count[j] > 0)
+                sprintf(alt_info_str + strlen(alt_info_str), "X%c %zu ", ACGT[j], pos_alt_info[i].acgt_count[j]);
+        }
+        for (khiter_t k = kh_begin(pos_alt_info[i].ins_counter); k != kh_end(pos_alt_info[i].ins_counter); k++)
+        {
+            if (kh_exist(pos_alt_info[i].ins_counter, k))
+            {
+                char *key = kh_key(pos_alt_info[i].ins_counter, k);
+                int val = kh_val(pos_alt_info[i].ins_counter, k);
+                if (strlen(key) <= MAX_INDEL_LENGTH)
+                {
+                    if (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length)
+                    {
+                        while (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length)
+                            max_alt_length = max_alt_length << 1;
+                        alt_info_str = realloc(alt_info_str, max_alt_length*sizeof(char));
+                    }
+                    sprintf(alt_info_str + strlen(alt_info_str), "I%c%s %i ", center_ref_base, key, val);
+                }
+            }
+        }
+
+        for (khiter_t k = kh_begin(pos_alt_info[i].del_counter); k != kh_end(pos_alt_info[i].del_counter); k++)
+        {
+            if (kh_exist(pos_alt_info[i].del_counter, k))
+            {
+                int key = kh_key(pos_alt_info[i].del_counter, k);
+                int val = kh_val(pos_alt_info[i].del_counter, k);
+                if (key <= MAX_INDEL_LENGTH)
+                {
+                    if (strlen(alt_info_str) + key + 32 >= max_alt_length)
+                    {
+                        while (strlen(alt_info_str) + key + 32 >= max_alt_length)
+                            max_alt_length = max_alt_length << 1;
+                        alt_info_str = realloc(alt_info_str, max_alt_length*sizeof(char));
+                    }
+                    sprintf(alt_info_str + strlen(alt_info_str), "D%.*s %i ", key, ref_seq + candidate - ref_start + 1, val);
+                }
+            }
+        }
+
+        alt_info_p[i] = alt_info_str;
+    } // end of candidate loop
+
+    data->matrix = matrix;
+    data->all_alt_info = alt_info_p;
+    data->candidates_num = candidate_num;
+
+    // free all allocated memory
+    for (size_t j = 0; j < reads_num; j++)
+    {
+        Read read = kv_A(read_array, j);
+        for (size_t p = 0; p < read.overlap_candidates_num; p++)
+        {
+            if (read.pos_info[p].ins_bases != NULL)
+                free(read.pos_info[p].ins_bases);
+        }
+        free(read.pos_info);
+    }
+
+    for (size_t j = 0; j < candidate_num; j++)
+    {
+        kh_counter_destroy(pos_alt_info[j].ins_counter);
+        kh_int_counter_destroy(pos_alt_info[j].del_counter);
+    }
+
+    free(chr);
+    free(pos_alt_info);
+    free(alt_info);
+    free(read_hap_array);
+    free(flanking_candidates);
+    free(ref_seq); // buffer returned by faidx_fetch_seq
+    kh_counter_destroy(read_name_set);
+    kh_int_counter_destroy(candidates_p);
+    kh_int_counter_destroy(flanking_candidates_p);
+    kv_destroy(read_array);
+    bam_destroy1(alignment);
+    hts_itr_destroy(iter);
+    hts_idx_destroy(idx);
+    bam_hdr_destroy(header);
+    sam_close(hts_file);
+    fai_destroy(fai);
+
+    return data;
+}
diff --git a/src/clair3_full_alignment.h b/src/clair3_full_alignment.h
new file mode 100644
index 0000000..e7485fb
--- /dev/null
+++ b/src/clair3_full_alignment.h
@@ -0,0 +1,257 @@
+#ifndef _CLAIR3_FULL_ALIGNMENT_H
+#define _CLAIR3_FULL_ALIGNMENT_H
+
+/* Haplotype labels; also the valid indices into HAP_TYPE below. */
+#define HAP_UNPHASED 0
+#define HAP_1 1
+#define HAP_2 2
+
+/* Feature normalisation helpers.
+ *
+ * Each macro argument is wrapped in parentheses so that compound expressions
+ * such as `normalize_mq(a + b)` expand correctly. The argument is evaluated
+ * more than once, so do not pass expressions with side effects.
+ *
+ *  normalize_mq:     values below 60 are scaled by 100/60, otherwise 100
+ *  normalize_bq:     values below 40 are scaled by 100/40, otherwise 100
+ *  normalize_af:     values below 1.0 are scaled by 100, otherwise 100
+ *  normalize_strand: 50 when the argument equals true, otherwise 100
+ */
+#define normalize_mq(x) ((int)((x) < 60 ? 100 * (x) / 60.0 : 100))
+#define normalize_bq(x) ((int)((x) < 40 ? 100 * (x) / 40.0 : 100))
+#define normalize_af(x) ((int)((x) < 1.0 ? 100 * (x) : 100))
+#define normalize_strand(x) ((x) == true ? 50 : 100)
+
+/* Channel value per haplotype label: HAP_UNPHASED -> 60, HAP_1 -> 30, HAP_2 -> 90. */
+static const int8_t HAP_TYPE[3] = {60, 30, 90};
+#define normalize_hap(x) (HAP_TYPE[(x)])
+
+/* Constants of the full-alignment feature generator.
+ * Most of them are consumed in clair3_full_alignment.c; notes marked
+ * NOTE(review) are assumptions that should be confirmed against that file. */
+static const size_t overhang = 10;
+static const char *RN = "\0";
+static const size_t min_haplotag_mq = 20;
+static const size_t expand_reference_region = 2000000;
+// NOTE(review): presumably the number of flanking bases on each side of a candidate
+static const size_t flanking_base_num = 16;
+// 33 == 2 * flanking_base_num + 1
+static const size_t no_of_positions = 33;
+static const size_t channel_size = 8;
+static const size_t matrix_depth = 89;
+static const size_t min_coverage = 2;
+static const size_t min_bq = 0;
+static const size_t min_mq = 5;
+// 2316 == 2048 + 256 + 8 + 4, i.e. the SAM flag bits
+// supplementary | secondary | mate unmapped | read unmapped
+static const size_t SAMTOOLS_VIEW_FILTER_FLAG = 2316;
+static const size_t MAX_READ_COUNT = 1000;
+// deletions longer than this are not written to the alternative information string
+static const size_t MAX_INDEL_LENGTH = 50;
+static const char ACGT[] = "ACGT";
+
+// Lookup table with one entry per letter; the trailing legend on each row
+// names the letters that the row covers (entry 0 is 'a', entry 25 is 'z',
+// the last six entries are padding).
+// Non-zero entries: a = 100, c = 25, d = -100, g = 75, i = -50, n = 100, t = 50.
+// NOTE(review): the comment this replaces described the index as a
+// "16bit IUPAC (+16 for strand)" code, which does not match the layout;
+// confirm how callers derive the index.
+static const int8_t num2countbase_fa[32] = {
+    100, 0, 25, -100, 0, 0, 75, 0, // abcdefgh
+    -50, 0, 0, 0, 0, 100, 0, 0,    // ijklmnop
+    0, 0, 0, 50, 0, 0, 0, 0,       // qrstuvwx
+    0, 0, 0, 0, 0, 0, 0, 0,        // yz (+ padding)
+};
+
+// convert A-Z character to 0-index offset
+// ACGT: 0123
+// non-ACGT: 0 (indistinguishable from A)
+static const int8_t acgt2num[32] = {
+    0, 0, 1, 0, 0, 0, 2, 0, // abcdefgh
+    0, 0, 0, 0, 0, 0, 0, 0, // ijklmnop
+    0, 0, 0, 3, 0, 0, 0, 0, // qrstuvwx
+    0, 0, 0, 0, 0, 0, 0, 0, // yz (+ padding)
+};
+
+/*! @typedef
+ @abstract Structure for full-alignment data
+ @field matrix  int8 array of (total candidate number * matrix depth * no of flanking position * feature channel)
+ @field all_alt_info  array with one alternative information string per candidate, covering SNPs, insertions and deletions
+ @field candidates_num  number of candidates, i.e. the number of entries in all_alt_info
+ */
+typedef struct _fa_data
+{
+
+    int8_t *matrix;
+    char **all_alt_info;
+    size_t candidates_num;
+} _fa_data;
+
+// pointer handle returned by calculate_clair3_full_alignment
+typedef _fa_data *fa_data;
+
+/*! @typedef
+ @abstract Structure for matrix level alternative information
+ @field ins_bases  the char string storing all insertion bases in current position of an alignment
+ @field alt_base  alternative base other than reference base in query sequence
+ @field del_length  deletion length in current position of an alignment
+ @field has_alt_info  true if any alternative information exists, false for a reference base and for deleted bases (#*)
+
+ @note by default only a maximum of `matrix depth` structs are allocated and all fields are reset in each
+   candidate iteration, because the proportion of each read candidate has to be calculated within the
+   region overlapping the given candidate
+ */
+typedef struct Alt_info
+{
+    char *ins_bases;
+    char alt_base;
+    size_t del_length;
+    bool has_alt_info;
+} Alt_info;
+
+/*! @typedef
+ @abstract Structure pairing a read with its haplotype, used when ordering reads (see sort_read_name_by_haplotype)
+ @field read_index  the read start offset of each read, the index is sorted by read start
+ @field haplotype  haplotype information of read, 0: unphased or not phasable 1|2: haplotype1|2
+ */
+typedef struct HAP
+{
+    size_t read_index;
+    size_t haplotype;
+} HAP;
+
+/*! @typedef
+ @abstract Structure of a phased heterozygous pileup SNP variant
+ @field position  variant start position 0-index
+ @field ref_base  reference base tag in VCF
+ @field alt_base  alternative base tag in VCF
+ @field genotype  phased heterozygous genotype, 0|1 : 1,  1|0: 2
+ @field phase_set phase set tag in VCF, which is acquired from whatshap or longphase
+
+ @note in this release, only heterozygous SNP information is stored
+ */
+typedef struct Variant
+{
+    int position;
+    char ref_base;
+    char alt_base;
+    int genotype;
+    int phase_set;
+} Variant;
+
+/*! @typedef
+ @abstract Collection of variants handed to haplotag_read
+ @field variants  array of Variant pointers
+ @field variant_num  number of entries in variants
+ @field variant_current_pos  NOTE(review): presumably a cursor into variants that is advanced while reads are processed; confirm in haplotag_read
+ */
+typedef struct Variants_info
+{
+    Variant **variants;
+    size_t variant_num;
+    size_t variant_current_pos;
+} Variants_info;
+
+/*! @typedef
+ @abstract Structure for the per-read information at one flanking candidate position
+ @field ins_bases  the char string storing all insertion bases in current position of an alignment
+ @field ins_length  length of the stored insertion bases
+ @field alt_base  alternative base in htslib int format
+ @field del_length  deletion length in current position of an alignment
+ @field bq  phred quality score of the given base
+
+ @note alt_base uses the htslib int format rather than char because '#*' has to be marked as -1; bq only
+   stores the quality of the reference or alternative base, insertion qualities are skipped as there is
+   only one base quality channel
+ */
+typedef struct Pos_info
+{
+    char *ins_bases;
+    size_t ins_length;
+    int alt_base;
+    size_t del_length;
+    int8_t bq;
+} Pos_info;
+
+/*! @typedef
+ @abstract Structure for the alignment information
+ @field read_start  read start position of alignment, 0-index
+ @field q_name  read name
+ @field read_end  alignment read end compared with the reference sequence, CIGAR length sum of X=MDN
+ @field cigartuples  alignment CIGAR int pointer from htslib bam_get_cigar function
+ @field seqi  query sequence pointer in htslib format
+ @field qual  base quality int pointer from htslib core alignment
+ @field mq  normalized mapping quality value (0-100)
+ @field n_cigar  number of CIGAR operations
+ @field l_qseq  length of the read query sequence
+ @field haplotype  haplotype information of read, 0: unphased or not phasable 1|2: haplotype1|2
+ @field strand  normalized strand value forward: 50 reverse: 100
+ @field pos_info  structure array of overlapped flanking candidates information
+ @field overlap_candidates_num  number of overlapped flanking candidates between read start and read end, including flanking bases
+ @field flanking_start  the first overlapped candidate index (index 0 is the first candidate - 16 by default)
+
+ @note the data behind the seqi and qual pointers is released after each htslib sam_itr_next iteration
+ */
+typedef struct Read
+{
+    size_t read_start;
+    char *q_name;
+    size_t read_end;
+    uint32_t *cigartuples;
+    uint8_t *seqi;
+    uint8_t *qual;
+    int8_t mq;
+    size_t n_cigar;
+    uint32_t l_qseq;
+    size_t haplotype;
+    int8_t strand;
+    Pos_info *pos_info;
+    size_t overlap_candidates_num;
+    size_t flanking_start;
+} Read;
+
+/** Destroys a full-alignment data structure.
+ *
+ *  @param data the full-alignment data object to cleanup.
+ *  @returns void.
+ *
+ */
+void destroy_fa_data(fa_data data);
+
+/** Sort overlapped reads of a candidate based on haplotype information and read start
+ *
+ *  @param read_hap_array  struct array of all overlap reads
+ *  @param matrix_read_index_array  output array receiving the read indices, -1 for padding
+ *  @param n  number of overlapped reads
+ *  @returns void.
+ *
+ */
+void sort_read_name_by_haplotype(HAP *read_hap_array, int *matrix_read_index_array, size_t n);
+
+/** get all overlapped flanking candidates number and start position based on read start and read end
+ *
+ *  @param read_start  read start, 0-index
+ *  @param read_end  read end, 0-index
+ *  @param candidate_current_index  the first flanking candidate index >= read start
+ *  @param flanking_candidates_num  NOTE(review): presumably the number of entries in flanking_candidates; confirm in the implementation
+ *  @param flanking_candidates  int array of all flanking candidates, sorted by start position
+ *  @returns number of the total overlapped flanking candidates within read start and read end.
+ *
+ */
+size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t candidate_current_index, size_t flanking_candidates_num, size_t *flanking_candidates);
+
+/** get the substring of a reference sequence based on start and end
+ *
+ *  @param ref_seq  a string storing the reference sequence from ref_start (0-index)
+ *  @param start  sequence query start, 0-index
+ *  @param end  sequence query end, 0-index
+ *  @returns string of the queried region of reference sequence
+ *
+ */
+char *get_ref_seq(char *ref_seq, size_t start, size_t end);
+
+/** get the substring of a query sequence based on start and end
+ *
+ *  @param seqi  a htslib format pointer storing the whole query sequence (0-index)
+ *  @param start  query start, 0-index
+ *  @param end  query end, 0-index
+ *  @returns string of the queried sequence
+ *
+ */
+char *get_query_seq(uint8_t *seqi, size_t start, size_t end);
+
+/** C implementation of whatshap-style read haplotagging
+ *
+ *  The three functions below belong together; their exact contracts are
+ *  defined by the implementation in clair3_full_alignment.c.
+ *
+ */
+void cigar_prefix_length(uint32_t *cigartuples, size_t reference_bases, size_t *ref_bases, size_t *query_bases, size_t left_cigar_index, size_t right_cigar_index, size_t consumed, bool reverse);
+
+int realign_read(Variant *variant, Read *read, size_t i, size_t consumed, size_t query_pos, char *reference, size_t ref_start);
+
+int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_t ref_start);
+
+/** C implementation of clair3-style full-alignment feature data and alternative information in a given region of a bam.
+ *
+ *  @param region  1-based region string
+ *  @param bam_path  input alignment file
+ *  @param fasta_path  input reference file
+ *  @param variants  C structure pointer of all phased heterozygous pileup SNP variants
+ *  @param variant_num  total variants number
+ *  @param candidates  int array of all low-quality pileup candidates that need to be processed (0-index)
+ *  @param candidate_num total candidates number
+ *  @returns a full-alignment data pointer, including the data matrix and all candidates alternative information
+ *
+ *  The return value can be freed with destroy_fa_data
+ *
+ */
+fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num);
+
+#endif
diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c
new file mode 100644
index 0000000..e3de48c
--- /dev/null
+++ b/src/clair3_pileup.c
@@ -0,0 +1,462 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <math.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "htslib/sam.h"
+#include "htslib/faidx.h"
+#include "kvec.h"
+#include "medaka_bamiter.h"
+#include "medaka_common.h"
+#include "clair3_pileup.h"
+#include "medaka_khcounter.h"
+
+// Aliases that map older samtools-style names onto htslib.
+// bam1_seq: address of the packed query sequence inside a bam1_t record,
+//           located after the query name (l_qname bytes) and the CIGAR (4 bytes per op)
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+// bam1_seqi: base code at index i of a packed sequence (forwards to htslib bam_seqi)
+#define bam1_seqi(s, i) (bam_seqi((s), (i)))
+#define bam_nt16_rev_table seq_nt16_str
+#define bam_nt16_table seq_nt16_table
+
+
+/* Map a base character to its column inside one pileup feature row:
+ * 'A','C','G','T' -> 0..3 and 'a','c','g','t' -> 9..12.
+ * Any other character yields 0, the same value as 'A'. */
+size_t base2_index(char c) {
+    switch (c) {
+        case 'C': return 1;
+        case 'G': return 2;
+        case 'T': return 3;
+        case 'a': return 9;
+        case 'c': return 10;
+        case 'g': return 11;
+        case 't': return 12;
+        case 'A':
+        default:
+            return 0;
+    }
+}
+
+/** Constructs a pileup data structure.
+ *
+ *  @param n_cols number of pileup columns in use.
+ *  @param buffer_cols number of pileup columns to allocate (must be >= n_cols).
+ *  @param feature_length length of feature vector.
+ *  @param num_dtypes number of datatypes in pileup.
+ *  @param num_homop maximum homopolymer length to consider.
+ *  @param fixed_size if not zero data matrix is allocated as fixed_size * n_cols, ignoring other arguments
+ *  @see destroy_plp_data
+ *  @returns a plp_data pointer.
+ *
+ *  The return value can be freed with destroy_plp_data.
+ *
+ */
+plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_length, size_t num_dtypes, size_t num_homop, size_t fixed_size) {
+    assert(buffer_cols >= n_cols);
+    plp_data data = xalloc(1, sizeof(_plp_data), "plp_data");
+    data->buffer_cols = buffer_cols;
+    data->num_dtypes = num_dtypes;
+    data->num_homop = num_homop;
+    data->n_cols = n_cols;
+
+    size_t n_cells;
+    if (fixed_size != 0) {
+        assert(buffer_cols == n_cols);
+        n_cells = fixed_size * n_cols;
+    } else {
+        n_cells = feature_length * num_dtypes * buffer_cols * num_homop;
+    }
+    // size every array from the pointed-to element type: the matrix holds
+    // size_t cells, so sizing it with sizeof(int) would under-allocate
+    data->matrix = xalloc(n_cells, sizeof *data->matrix, "matrix");
+    data->major = xalloc(buffer_cols, sizeof *data->major, "major");
+    data->minor = xalloc(buffer_cols, sizeof *data->minor, "minor");
+
+    // no candidates yet; destroy_plp_data iterates over candidates_num entries
+    data->all_alt_info = NULL;
+    data->candidates_num = 0;
+    return data;
+}
+
+
+/** Grow the internal buffers of a pileup data structure.
+ *
+ *  @param pileup a plp_data pointer.
+ *  @param buffer_cols new number of pileup columns to hold (must exceed the current capacity)
+ *  @param feature_length length of the feature vector of one column
+ *
+ *  Newly added matrix cells are zeroed; major and minor are only resized.
+ *
+ */
+void enlarge_plp_data(plp_data pileup, size_t buffer_cols, size_t feature_length) {
+    assert(buffer_cols > pileup->buffer_cols);
+    const size_t cells_per_col = feature_length * pileup->num_dtypes * pileup->num_homop;
+    const size_t cells_before = cells_per_col * pileup->buffer_cols;
+    const size_t cells_after = cells_per_col * buffer_cols;
+
+    pileup->matrix = xrealloc(pileup->matrix, cells_after * sizeof(size_t), "matrix");
+    pileup->major = xrealloc(pileup->major, buffer_cols * sizeof(size_t), "major");
+    pileup->minor = xrealloc(pileup->minor, buffer_cols * sizeof(size_t), "minor");
+
+    // clear the freshly added tail of the matrix
+    if (cells_after > cells_before) {
+        memset(pileup->matrix + cells_before, 0, (cells_after - cells_before) * sizeof(size_t));
+    }
+    pileup->buffer_cols = buffer_cols;
+}
+
+
+/** Destroys a pileup data structure.
+ *
+ *  @param data the object to cleanup; NULL is accepted and ignored.
+ *  @returns void.
+ *
+ *  Frees the matrix, the major/minor arrays, every alternative information
+ *  string and finally the structure itself.
+ *
+ */
+void destroy_plp_data(plp_data data) {
+    if (data == NULL) {
+        return;
+    }
+    free(data->matrix);
+    free(data->major);
+    free(data->minor);
+    // all_alt_info stays NULL until a pileup has been calculated
+    if (data->all_alt_info != NULL) {
+        for (size_t i = 0; i < data->candidates_num; i++) {
+            free(data->all_alt_info[i]);
+        }
+        free(data->all_alt_info);
+    }
+    free(data);
+}
+
+/* Make sure the NUL-terminated string in `buf` (capacity `*capacity` bytes)
+ * has room for `extra` more bytes, doubling the capacity until it does.
+ * Returns the (possibly moved) buffer; `*capacity` is updated in place. */
+static char *alt_info_reserve(char *buf, size_t *capacity, size_t extra) {
+    size_t needed = strlen(buf) + extra;
+    if (needed >= *capacity) {
+        while (needed >= *capacity)
+            *capacity = *capacity << 1;
+        buf = xrealloc(buf, *capacity * sizeof(char), "alt_info_str");
+    }
+    return buf;
+}
+
+/** Generates clair3-style pileup feature data in a region of a bam.
+ *
+ *  @param region 1-based region string (`chr:start-end`).
+ *  @param bam_set opened alignment file, index and header.
+ *  @param fasta_path input reference file.
+ *  @param min_depth minimum depth for a position to become a candidate.
+ *  @param min_snp_af minimum SNP allele frequency for a candidate.
+ *  @param min_indel_af minimum insertion/deletion allele frequency for a candidate.
+ *  @param min_mq minimum mapping quality, stored in the pileup callback data.
+ *  @param max_indel_length longest indel written to the alternative information string.
+ *  @param call_snp_only when true only the SNP allele frequency test selects candidates.
+ *  @param max_depth maximum pileup depth, passed to bam_mplp_set_maxcnt.
+ *  @returns a pileup data pointer, or NULL when the region cannot be parsed,
+ *           the reference cannot be loaded or the bam iterator cannot be created.
+ *
+ *  The return value can be freed with destroy_plp_data.
+ *
+ */
+
+/**
+ * The pileup input is 594 integers – 33 genome positions wide with 18 features at each position –
+ *
+ * A+, C+, G+, T+, I_S+, I^1_S+, D_S+, D^1_S+, D_R+, A-, C-, G-, T-, I_S-, I^1_S-, D_S-, D^1_S-, and D_R-
+ *
+ * A, C, G, T, I, D, +, - means the count of read support of the four nucleotides: insertion,
+ * deletion, positive strand, and negative strand. Superscript “1” means only the indel with the
+ * highest read support is counted (i.e., all indels are counted if without “1“). Subscript “S”/“R” means
+ * the starting/non-starting position of an indel. For example, a 3bp deletion with the most reads support
+ * will have the first deleted base counted in either D1_S+ or D1_S-, and the second and third deleted bases
+ * counted in either D_R+ or D_R-. The design was determined experimentally, but the rationale is that for
+ * 1bp indels that are easy to call, look into the differences between the “S” counts, but reduce the
+ * quality if the “R” counts and discrepancy between positions increase.
+ *
+ */
+plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth) {
+    // extract `chr`:`start`-`end` from `region`
+    //   (start is one-based and end-inclusive),
+    //   hts_parse_reg below sets return value to point
+    //   at ":", copy the input then set ":" to null terminator
+    //   to get `chr`.
+    int start, end;
+    char *chr = xalloc(strlen(region) + 1, sizeof(char), "chr");
+    strcpy(chr, region);
+    char *reg_chr = (char *) hts_parse_reg(chr, &start, &end);
+    // start and end now zero-based end exclusive
+    if (reg_chr == NULL) {
+        // start/end are undefined here, so nothing sensible can be computed
+        fprintf(stderr, "Failed to parse region: '%s'.\n", region);
+        free(chr);
+        return NULL;
+    }
+    *reg_chr = '\0';
+
+    // load the reference around the region
+    faidx_t* fai = fai_load(fasta_path);
+    if (fai == NULL) {
+        fprintf(stderr, "Failed to load reference: '%s'.\n", fasta_path);
+        free(chr);
+        return NULL;
+    }
+    // clamp at zero explicitly: `start - expand` in unsigned arithmetic would
+    // wrap around for regions close to the contig start
+    const size_t region_start = (size_t) start;
+    size_t ref_start = region_start > mpileup_expand_reference_region ? region_start - mpileup_expand_reference_region : 0;
+    size_t ref_end = (size_t) end + mpileup_expand_reference_region;
+    int len = 0;
+    char *ref_seq = faidx_fetch_seq(fai, chr, (int) ref_start, (int) ref_end, &len);
+    if (ref_seq == NULL) {
+        fprintf(stderr, "Failed to fetch reference sequence for region: '%s'.\n", region);
+        fai_destroy(fai);
+        free(chr);
+        return NULL;
+    }
+
+    // open bam etc.
+    // this is all now deferred to the caller
+    htsFile *fp = bam_set->fp;
+    hts_idx_t *idx = bam_set->idx;
+    sam_hdr_t *hdr = bam_set->hdr;
+    // setup bam iterator
+
+    mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data");
+    data->fp = fp; data->hdr = hdr; data->iter = bam_itr_querys(idx, hdr, region);
+    data->min_mapQ = min_mq;
+    if (data->iter == NULL) {
+        fprintf(stderr, "Failed to create bam iterator for region: '%s'.\n", region);
+        free(data);
+        free(ref_seq);
+        fai_destroy(fai);
+        free(chr);
+        return NULL;
+    }
+
+    bam_mplp_t mplp = bam_mplp_init(1, read_bam, (void **)& data);
+    bam_mplp_set_maxcnt(mplp, max_depth);
+
+    const bam_pileup1_t **plp = xalloc(1, sizeof(bam_pileup1_t *), "pileup");
+    int pos, tid, n_plp;
+
+    int n_cols = 0;
+    size_t buffer_cols = end - start;
+    plp_data pileup = create_plp_data(n_cols, buffer_cols, featlenclair3, 1, 1, 0);
+
+    // get counts
+    size_t major_col = 0;  // index into `pileup` corresponding to pos
+    n_cols = 0;            // number of processed columns
+
+    size_t candidates_num = 0;
+    size_t alt_info_p_size = 512;
+    char ** alt_info_p = xalloc(alt_info_p_size, sizeof(char*), "alt_info_p");
+    for (size_t i = 0; i < alt_info_p_size; i++)
+        alt_info_p[i] = NULL;
+
+    size_t pre_pos = 0;
+    size_t contiguous_flanking_num = 0;
+    while (bam_mplp_auto(mplp, &tid, &pos, &n_plp, plp) > 0) {
+
+        size_t depth = 0;
+        size_t alt_count = 0;
+        size_t ref_count = 0;
+        size_t del_count = 0;
+        size_t ins_count = 0;
+
+        bool pass_af = false;
+
+        const char *c_name = data->hdr->target_name[tid];
+        if (strcmp(c_name, chr) != 0) continue;
+        if (pos < start) continue;
+        if (pos >= end) break;
+        n_cols++;
+
+        // count how many directly preceding positions were covered as well
+        if (pre_pos + 1 != (size_t) pos || pre_pos == 0)
+            contiguous_flanking_num = 0;
+        else
+            contiguous_flanking_num++;
+        pre_pos = pos;
+
+        // update the deletion buffer in each iteration
+        size_t del_buf_size = 32;
+        size_t* dels_f = xalloc(del_buf_size, sizeof(size_t), "dels_f");
+        size_t* dels_r = xalloc(del_buf_size, sizeof(size_t), "dels_r");
+
+        memset(dels_f, 0, del_buf_size * sizeof(size_t));
+        memset(dels_r, 0, del_buf_size * sizeof(size_t));
+
+        // we still need this as positions might not be contiguous
+        pileup->major[major_col / featlenclair3] = pos;
+        pileup->minor[major_col / featlenclair3] = 0;
+
+        // counters for insertion strings
+        khash_t(KH_COUNTER) *ins_counts_f = kh_init(KH_COUNTER);
+        khash_t(KH_COUNTER) *ins_counts_r = kh_init(KH_COUNTER);
+        khash_t(KH_COUNTER) *ins_counts_all = kh_init(KH_COUNTER);
+        // loop through all reads at this position
+        for (int i = 0; i < n_plp; ++i) {
+            const bam_pileup1_t *p = plp[0] + i;
+            if (p->is_refskip) continue;
+
+            if (p->indel < 0) {
+                // there's a deletion starting on next genomic position,
+                // record the length here and finalise after the read loop
+                //  - actually deleted bases get recorded in next block
+                size_t d = (size_t) (-p->indel);
+
+                if (d >= del_buf_size) {
+                    size_t new_size = max(d, 2 * del_buf_size);
+                    dels_f = xrealloc(dels_f, new_size*sizeof(size_t), "dels_f");
+                    memset(dels_f+del_buf_size, 0, (new_size-del_buf_size) * sizeof(size_t));
+                    dels_r = xrealloc(dels_r, new_size*sizeof(size_t), "dels_r");
+                    memset(dels_r+del_buf_size, 0, (new_size-del_buf_size) * sizeof(size_t));
+                    del_buf_size = new_size;
+                }
+                if (bam_is_rev(p->b)) {
+                    dels_r[d - 1] += 1;
+                } else {
+                    dels_f[d - 1] += 1;
+                }
+            }
+
+            // handle ref_base/sub/del
+            int base_i;
+            if (p->is_del) {
+                // we've been deleted, +1 to DR
+                base_i = bam_is_rev(p->b) ? c3_rev_del : c3_fwd_del;
+            } else {
+                // just a base
+                int base_j = bam1_seqi(bam1_seq(p->b), p->qpos);
+                if (bam_is_rev(p->b)) { base_j += 16; }
+                base_i = num2countbaseclair3[base_j];
+            }
+            depth++;
+            // codes other than A/C/G/T map to -1; they still add to the depth
+            // but must not be written to the matrix (index major_col - 1)
+            if (base_i >= 0) {
+                pileup->matrix[major_col + base_i] += 1;
+            }
+
+            // handle insertion
+            //  - build insert string then hash
+            if (p->indel > 0) {
+                size_t first = p->is_del ? 0 : 1;
+                size_t ins_len = (size_t) p->indel;
+                char* indel = (char*) xalloc(ins_len + 1, sizeof(char), "indel");
+                for (size_t k = 0; k < ins_len; ++k) {
+                    indel[k] = seq_nt16_str[bam1_seqi(bam1_seq(p->b), p->qpos + first + k)];
+                }
+                indel[ins_len] = '\0';
+                if (bam_is_rev(p->b)) {
+                    kh_counter_increment(ins_counts_r, indel);
+                } else {
+                    kh_counter_increment(ins_counts_f, indel);
+                }
+                kh_counter_increment(ins_counts_all, indel);
+                free(indel);
+            }
+        }
+
+        // finalise deletions: DS (all) and D1S (best)
+        //
+        // forward
+        size_t best_count = 0;
+        size_t all_count = 0;
+        for (size_t i = 0; i < del_buf_size; ++i) {
+            size_t d = dels_f[i];
+            all_count += d;
+            best_count = max(best_count, d);
+        }
+        pileup->matrix[major_col + c3_fwd_del_all] = all_count;
+        pileup->matrix[major_col + c3_fwd_del_best] = best_count;
+        del_count += all_count;
+        // reverse
+        best_count = 0;
+        all_count = 0;
+        for (size_t i = 0; i < del_buf_size; ++i) {
+            size_t d = dels_r[i];
+            all_count += d;
+            best_count = max(best_count, d);
+        }
+        pileup->matrix[major_col + c3_rev_del_all] = all_count;
+        pileup->matrix[major_col + c3_rev_del_best] = best_count;
+        del_count += all_count;
+
+        // finalise IS and I1S
+        // forward
+        kh_counter_stats_t stats = kh_counter_stats(ins_counts_f);
+        pileup->matrix[major_col + c3_fwd_ins_all] = stats.sum;
+        pileup->matrix[major_col + c3_fwd_ins_best] = stats.max;
+        ins_count += stats.sum;
+
+        kh_counter_destroy(ins_counts_f);
+        // reverse
+        stats = kh_counter_stats(ins_counts_r);
+        pileup->matrix[major_col + c3_rev_ins_all] = stats.sum;
+        pileup->matrix[major_col + c3_rev_ins_best] = stats.max;
+        ins_count += stats.sum;
+
+        kh_counter_destroy(ins_counts_r);
+
+        // reference base at this position; positions outside the fetched
+        // sequence are treated as 'N' and therefore never become candidates
+        int offset = pos - (int) ref_start;
+        char ref_base = 'N';
+        if (offset >= 0 && offset < len) {
+            ref_base = (char) toupper((unsigned char) ref_seq[offset]);
+        }
+        size_t ref_offset_forward = base2_index(ref_base);
+        size_t ref_offset_reverse = ref_offset_forward + reverse_pos_start;
+        char major_alt_base = '\0';
+        size_t forward_sum = 0;
+        size_t reverse_sum = 0;
+        for (size_t i = 0; i < 4; i++) {
+            size_t current_count = pileup->matrix[major_col + i] + pileup->matrix[major_col + i + reverse_pos_start];
+            forward_sum += pileup->matrix[major_col + i];
+            reverse_sum += pileup->matrix[major_col + i + reverse_pos_start];
+            if (i == ref_offset_forward) {
+                ref_count = current_count;
+            } else if (current_count > alt_count) {
+                alt_count = current_count;
+                major_alt_base = plp_bases_clair3[i];
+            }
+        }
+
+        // the reference columns hold the negated strand totals; the matrix
+        // cells are size_t, so the value is stored in wrapped (two's complement) form
+        pileup->matrix[major_col + ref_offset_forward] = (size_t) 0 - forward_sum;
+        pileup->matrix[major_col + ref_offset_reverse] = (size_t) 0 - reverse_sum;
+
+        // calculate candidate allele frequency and apply filtering
+        if (depth == 0) depth = 1;
+        bool pass_min_depth = depth >= min_depth;
+        bool pass_ref_base_in_acgt = ref_base == 'A' || ref_base == 'C' || ref_base == 'G' || ref_base == 'T';
+        bool non_ref_base_majority = ref_count < alt_count || ref_count < ins_count || ref_count < del_count;
+        bool ref_alt_equal_majority = (ref_count > 0 && ref_count == alt_count && ref_base - major_alt_base < 0);
+        if (call_snp_only == true) {
+            pass_af = alt_count / (float)depth >= min_snp_af;
+        } else {
+            pass_af = non_ref_base_majority || ref_alt_equal_majority || (alt_count / (float)depth >= min_snp_af);
+            pass_af = pass_af || (del_count / (float)depth >= min_indel_af) || (ins_count / (float)depth >= min_indel_af);
+        }
+
+        pass_af = pass_af && pass_min_depth && pass_ref_base_in_acgt;
+        pass_af = pass_af && (contiguous_flanking_num >= pileup_flanking_base_num);
+        // move to next position
+        if (pass_af) {
+
+            if (candidates_num + 1 >= alt_info_p_size) {
+                alt_info_p_size = alt_info_p_size << 1;
+                alt_info_p = xrealloc(alt_info_p, alt_info_p_size * sizeof(char*), "alt_info_p");
+            }
+
+            // 64 bytes always fit the "<pos>-<depth>-<ref>-" prefix
+            size_t max_alt_length = 64;
+            char *alt_info_str = xalloc(max_alt_length, sizeof(char), "alt_info_str");
+
+            sprintf(alt_info_str, "%i-%zu-%c-", pos+1, depth, ref_base);
+            //snp
+            for (size_t i = 0; i < 4; i++) {
+                if (i == ref_offset_forward) continue;
+                size_t alt_sum = pileup->matrix[major_col + i] + pileup->matrix[major_col + i + reverse_pos_start];
+                if (alt_sum > 0) {
+                    // 32 bytes is a safe number for integer to string
+                    alt_info_str = alt_info_reserve(alt_info_str, &max_alt_length, 32);
+                    sprintf(alt_info_str + strlen(alt_info_str), "X%c %zu ", plp_bases_clair3[i], alt_sum);
+                }
+            }
+            //del
+            for (size_t i = 0; i < del_buf_size; i++) {
+                size_t d = dels_f[i] + dels_r[i];
+                if (d > 0 && i+1 <= max_indel_length) {
+                    alt_info_str = alt_info_reserve(alt_info_str, &max_alt_length, i + 32);
+                    // the deleted bases start one past the current reference base
+                    sprintf(alt_info_str + strlen(alt_info_str), "D%.*s %zu ", (int) (i + 1), ref_seq + offset + 1, d);
+                }
+
+            }
+            //ins
+            for (khiter_t k = kh_begin(ins_counts_all); k != kh_end(ins_counts_all); ++k) {
+                if (kh_exist(ins_counts_all, k)) {
+                    const char *key = kh_key(ins_counts_all, k);
+                    size_t val = kh_val(ins_counts_all, k);
+                    if (strlen(key) <= max_indel_length) {
+                        alt_info_str = alt_info_reserve(alt_info_str, &max_alt_length, strlen(key) + 32);
+                        sprintf(alt_info_str + strlen(alt_info_str), "I%c%s %zu ", ref_base, key, val);
+                    }
+                }
+            }
+            // update the alternative information for current candidates here
+            alt_info_p[candidates_num++] = alt_info_str;
+        }
+
+        free(dels_f);
+        free(dels_r);
+        kh_counter_destroy(ins_counts_all);
+        major_col += featlenclair3;
+    }
+
+
+    pileup->all_alt_info = alt_info_p;
+    pileup->candidates_num = candidates_num;
+    pileup->n_cols = n_cols;
+
+    bam_itr_destroy(data->iter);
+    bam_mplp_destroy(mplp);
+    fai_destroy(fai);
+    free(ref_seq);  // allocated by faidx_fetch_seq, owned by the caller
+    free(data);
+    free(plp);
+    free(chr);
+
+    return pileup;
+}
+
+// Empty entry point: does nothing and returns 0.
+// NOTE(review): presumably present so that this file can be linked as a
+// stand-alone program; confirm that it does not clash with other entry points.
+int main()
+{
+    return 0;
+}
diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h
new file mode 100644
index 0000000..5cf9283
--- /dev/null
+++ b/src/clair3_pileup.h
@@ -0,0 +1,105 @@
+#ifndef _CLAIR3_PILEUP_H
+#define _CLAIR3_PILEUP_H
+
+// medaka-style feature data
+//   buffer_cols     number of columns the major/minor arrays are allocated for
+//   num_dtypes      number of datatypes, a factor of the matrix size
+//   num_homop       maximum homopolymer length, a factor of the matrix size
+//   n_cols          number of pileup columns in use
+//   matrix          feature counts, one feature vector per column
+//   major           genomic position of each column
+//   minor           minor (insertion) index of each column
+//   all_alt_info    one alternative information string per candidate
+//   candidates_num  number of entries in all_alt_info
+typedef struct _plp_data {
+    size_t buffer_cols;
+    size_t num_dtypes;
+    size_t num_homop;
+    size_t n_cols;
+    size_t *matrix;
+    size_t *major;
+    size_t *minor;
+    char **all_alt_info;
+    size_t candidates_num;
+} _plp_data;
+typedef _plp_data *plp_data;
+
+
+// convert 16bit IUPAC (+16 for strand) to plp_bases index
+// {
+//  ,  A,  C,   ,  G,   ,   ,   ,
+// T,   ,   ,   ,   ,   ,   ,   ,
+//  ,  a,  c,   ,  g,   ,   ,   ,
+// t,  ,    ,   ,   ,   ,   ,   ,
+// }
+// entries of -1 mark codes without a column
+static const int num2countbase[32] = {
+  -1,  4,  5, -1,  6, -1, -1, -1,
+   7, -1, -1, -1, -1, -1, -1, -1,
+  -1,  0,  1, -1,  2, -1, -1, -1,
+   3, -1, -1, -1, -1, -1, -1, -1,
+};
+
+
+// layout of one clair3 pileup feature vector (forward strand first, then reverse)
+//  first i: all insertions
+// second i: most common insertion
+//  first d: all first base deletion  (actually a reference base)
+// second d: most common deletion     (actually a reference base)
+//  third d: non-first base deletion  (the deleted bases)
+static const char plp_bases_clair3[] = "ACGTIIDDDacgtiiddd";
+static const size_t featlenclair3 = 18;   // len of the above
+static const size_t c3_fwd_ins_all = 4;     
+static const size_t c3_fwd_ins_best = 5;
+static const size_t c3_fwd_del_all = 6;   // (preceding ref position)
+static const size_t c3_fwd_del_best = 7;  // (preceding ref position)
+static const size_t c3_fwd_del = 8;       // (actually deleted base)
+static const size_t c3_rev_ins_all = 13;     
+static const size_t c3_rev_ins_best = 14;
+static const size_t c3_rev_del_all = 15;  // (preceding ref position)
+static const size_t c3_rev_del_best = 16; // (preceding ref position)
+static const size_t c3_rev_del = 17;      // (actually deleted base)
+static const size_t reverse_pos_start = 9;  // offset of the first reverse strand feature
+static const size_t mpileup_expand_reference_region = 1000;  // reference bases fetched beyond each end of the region
+static const size_t pileup_flanking_base_num = 16;  // contiguous preceding positions required for a candidate
+
+// convert 16bit IUPAC (+16 for reverse strand) to plp_bases_clair3 index,
+// A/C/G/T -> 0..3 on the forward strand and 9..12 on the reverse strand;
+// every other code maps to -1 and has no column
+static const int num2countbaseclair3[32] = {
+ -1,  0,  1, -1,  2, -1, -1, -1,
+  3, -1, -1, -1, -1, -1, -1, -1,
+ -1,  9, 10, -1, 11, -1, -1, -1,
+ 12, -1, -1, -1, -1, -1, -1, -1,
+};
+
+
+/** Constructs a pileup data structure.
+ *
+ *  @param n_cols number of pileup columns in use.
+ *  @param buffer_cols number of pileup columns to allocate (must be >= n_cols).
+ *  @param feature_length length of feature vector.
+ *  @param num_dtypes number of datatypes in pileup.
+ *  @param num_homop maximum homopolymer length to consider.
+ *  @param fixed_size if not zero data matrix is allocated as fixed_size * n_cols, ignoring other arguments
+ *  @see destroy_plp_data
+ *  @returns a plp_data pointer.
+ *
+ *  The return value can be freed with destroy_plp_data.
+ *
+ */
+plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_length, size_t num_dtypes, size_t num_homop, size_t fixed_size);
+
+
+/** Destroys a pileup data structure.
+ *
+ *  @param data the object to cleanup.
+ *  @returns void.
+ *
+ */
+void destroy_plp_data(plp_data data);
+
+/** C implementation of clair3-style pileup feature data and alternative information in a given region of a bam.
+ *
+ *  @param region  1-based region string
+ *  @param bam_set  bam handler of input bam
+ *  @param fasta_path  input reference file
+ *  @param min_depth  minimum coverage required to call a variant
+ *  @param min_snp_af  minimum snp allele frequency for a site to be considered as a candidate site
+ *  @param min_indel_af  minimum indel allele frequency for a site to be considered as a candidate site
+ *  @param min_mq  minimum mapping quality for read to use for calling
+ *  @param max_indel_length  maximum indel length to format into alternative string stream
+ *  @param call_snp_only  when true, only the snp allele frequency threshold is used to select candidate sites
+ *  @param max_depth  maximum pileup depth, passed to bam_mplp_set_maxcnt
+ *  @returns a pileup data pointer, including the data matrix and all candidates alternative information
+ *
+ *  The return value can be freed with destroy_plp_data
+ *
+ */
+plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth);
+
+#endif
diff --git a/src/khash.h b/src/khash.h
new file mode 100644
index 0000000..f75f347
--- /dev/null
+++ b/src/khash.h
@@ -0,0 +1,627 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function (multiply-by-31 rolling hash)
+  @param  s     Pointer to a null terminated string
+  @return       The hash value; the empty string hashes to 0
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s; /* seed with the first character */
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; /* (h << 5) - h == h * 31 */
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key) /* Wang's integer hash: mixes the bits of key with shifts, adds and xors; reached through kh_int_hash_func2 */
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/src/kvec.h b/src/kvec.h
new file mode 100644
index 0000000..676be8b
--- /dev/null
+++ b/src/kvec.h
@@ -0,0 +1,90 @@
+/* The MIT License
+
+   Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "kvec.h"
+int main() {
+	kvec_t(int) array;
+	kv_init(array);
+	kv_push(int, array, 10); // append
+	kv_a(int, array, 20) = 5; // dynamic
+	kv_A(array, 20) = 4; // static
+	kv_destroy(array);
+	return 0;
+}
+*/
+
+/*
+  2008-09-22 (0.1.0):
+
+	* The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, v, s)  ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
+
+#define kv_copy(type, v1, v0) do {							\
+		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n);	\
+		(v1).n = (v0).n;									\
+		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n);		\
+	} while (0)												\
+
+#define kv_push(type, v, x) do {									\
+		if ((v).n == (v).m) {										\
+			(v).m = (v).m? (v).m<<1 : 2;							\
+			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m);	\
+		}															\
+		(v).a[(v).n++] = (x);										\
+	} while (0)
+
+#define kv_pushp(type, v) (((v).n == (v).m)?							\
+						   ((v).m = ((v).m? (v).m<<1 : 2),				\
+							(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0)	\
+						   : 0), ((v).a + ((v).n++))
+
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
+						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+						  : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+						  : 0), (v).a[(i)])
+
+#endif
diff --git a/src/levenshtein.c b/src/levenshtein.c
new file mode 100644
index 0000000..76490df
--- /dev/null
+++ b/src/levenshtein.c
@@ -0,0 +1,72 @@
+// `levenshtein.c` - levenshtein
+// MIT licensed.
+// Copyright (c) 2015 Titus Wormer <tituswormer@gmail.com>
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "levenshtein.h"
+
+// Returns the Levenshtein (edit) distance between the first `length` bytes of `a` and the first `bLength` bytes of `b`.
+// See <https://en.wikipedia.org/wiki/Levenshtein_distance> for more information.
+size_t
+levenshtein_n(const char *a, const size_t length, const char *b, const size_t bLength) {
+  // Shortcut optimizations / degenerate cases.
+  if (a == b) {  // same pointer: reported as distance 0 without comparing the two lengths
+    return 0;
+  }
+
+  if (length == 0) {  // `a` is empty: the distance is the length of `b`
+    return bLength;
+  }
+
+  if (bLength == 0) {  // `b` is empty: the distance is the length of `a`
+    return length;
+  }
+
+  size_t *cache = calloc(length, sizeof(size_t));  // NOTE(review): the result is used below without a NULL check
+  size_t index = 0;
+  size_t bIndex = 0;
+  size_t distance;
+  size_t bDistance;
+  size_t result;  // always assigned in the outer loop, which runs at least once because bLength > 0 here
+  char code;
+
+  // initialize the vector: cache[i] = i + 1.
+  while (index < length) {
+    cache[index] = index + 1;
+    index++;
+  }
+
+  // Loop over the bytes of `b`, updating the single cache row in place.
+  while (bIndex < bLength) {
+    code = b[bIndex];
+    result = distance = bIndex++;
+    index = SIZE_MAX;  // unsigned wrap-around: the pre-increment below starts the scan at 0
+
+    while (++index < length) {
+      bDistance = code == a[index] ? distance : distance + 1;  // cost via the diagonal: free when the bytes match
+      distance = cache[index];
+
+      cache[index] = result = distance > result  // overall: min(result + 1, distance + 1, bDistance)
+        ? bDistance > result
+          ? result + 1
+          : bDistance
+        : bDistance > distance
+          ? distance + 1
+          : bDistance;
+    }
+  }
+
+  free(cache);
+
+  return result;
+}
+
+size_t
+levenshtein(const char *a, const char *b) {
+  // Both inputs are NUL-terminated: measure them with strlen(), then
+  // defer to the length-aware implementation.
+  const size_t aSize = strlen(a);
+  return levenshtein_n(a, aSize, b, strlen(b));
+}
diff --git a/src/levenshtein.h b/src/levenshtein.h
new file mode 100644
index 0000000..111a5a5
--- /dev/null
+++ b/src/levenshtein.h
@@ -0,0 +1,17 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+// `levenshtein.h` - levenshtein
+// MIT licensed.
+// Copyright (c) 2015 Titus Wormer <tituswormer@gmail.com>
+
+// Returns a size_t, depicting the difference between `a` and `b`.
+// See <https://en.wikipedia.org/wiki/Levenshtein_distance> for more information.
+// `a` and `b` are NUL-terminated; their lengths are taken with strlen().
+size_t
+levenshtein(const char *a, const char *b);
+// Same measure with explicit lengths: `length` bytes of `a`, `bLength` bytes of `b`.
+size_t
+levenshtein_n (const char *a, const size_t length, const char *b, const size_t bLength);
+
+#endif // LEVENSHTEIN_H
diff --git a/src/medaka_bamiter.c b/src/medaka_bamiter.c
new file mode 100644
index 0000000..a625e14
--- /dev/null
+++ b/src/medaka_bamiter.c
@@ -0,0 +1,72 @@
+#include <errno.h>
+#include <string.h>
+
+#include "medaka_bamiter.h"
+#include "medaka_common.h"
+
+// Iterator for reading bam: loads into b the next alignment that passes the filters held in data (an mplp_data).
+// Returns the status of the underlying htslib read; a negative value ends the search and is returned as-is.
+int read_bam(void *data, bam1_t *b) {
+    mplp_data *aux = (mplp_data*) data;
+    // tag_name holds two bytes with no terminator, so test its first byte instead of calling strcmp()
+    bool check_tag = (aux->tag_name[0] != '\0');
+    bool have_rg = (aux->read_group != NULL);
+    int ret;
+    while (1) {
+        ret = aux->iter ? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b);
+        if (ret<0) break;
+        // only take primary alignments
+        if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue;
+        // filter by mapping quality
+        if ((int)b->core.qual < aux->min_mapQ) continue;
+        // filter by tag
+        if (check_tag) {
+            uint8_t *tag = bam_aux_get((const bam1_t*) b, aux->tag_name);
+            if (tag == NULL){ // tag isn't present or is corrupt
+                if (aux->keep_missing) break;  // accept the read as-is (the RG filter below is not applied)
+                continue;
+            }
+            // errno keeps its last value until something resets it: clear it first, otherwise one
+            // EINVAL from an earlier read (or an unrelated call) would reject every later read
+            errno = 0;
+            int tag_value = bam_aux2i(tag);
+            if (errno == EINVAL) continue; // tag was not integer
+            if (tag_value != aux->tag_value) continue;
+        }
+        // filter by RG (read group):
+        if (have_rg) {
+            uint8_t *rg = bam_aux_get((const bam1_t*) b, "RG");
+            if (rg == NULL) continue;  // missing
+            // test the returned pointer rather than a possibly stale errno, and never hand NULL to strcmp()
+            char *rg_val = bam_aux2Z(rg);
+            if (rg_val == NULL) continue;  // bad parse
+            if (strcmp(aux->read_group, rg_val) != 0) continue;  // not wanted
+        }
+        break;
+    }
+    return ret;
+}
+
+
+// Initialise BAM file, index and header structures; prints a message and exits the process on failure
+bam_fset* create_bam_fset(const char* fname) {
+    bam_fset* fset = xalloc(1, sizeof(bam_fset), "bam fileset");  // zero-filled, so idx and hdr start out NULL
+    fset->fp = hts_open(fname, "rb");
+    // Only load the index and header once the file is open: handing htslib a NULL handle can crash before the error is reported
+    if (fset->fp != NULL) fset->idx = sam_index_load(fset->fp, fname);
+    if (fset->fp != NULL) fset->hdr = sam_hdr_read(fset->fp);
+    if (fset->fp == NULL || fset->idx == NULL || fset->hdr == NULL) {
+        fprintf(stderr, "Failed to read .bam file '%s'.\n", fname);
+        exit(1);  // nothing is released explicitly; the process is terminating
+    }
+    return fset;
+}
+
+// Destroy BAM file, index and header structures; accepts NULL and sets whose file never opened
+void destroy_bam_fset(bam_fset* fset) {
+    if (fset == NULL) return;  // nothing to release
+    if (fset->fp != NULL) hts_close(fset->fp);  // do not rely on hts_close() tolerating a NULL handle
+    hts_idx_destroy(fset->idx);
+    sam_hdr_destroy(fset->hdr);
+    free(fset);
+}
diff --git a/src/medaka_bamiter.h b/src/medaka_bamiter.h
new file mode 100644
index 0000000..100c632
--- /dev/null
+++ b/src/medaka_bamiter.h
@@ -0,0 +1,37 @@
+#ifndef _MEDAKA_BAMITER_H
+#define _MEDAKA_BAMITER_H
+
+#include <stdbool.h>
+#include "htslib/sam.h"
+
+// parameters for bam iteration (consumed by read_bam)
+typedef struct {
+    htsFile *fp;             // file handle the alignments are read from
+    sam_hdr_t *hdr;          // header; passed to sam_read1 when iter is NULL
+    hts_itr_t *iter;         // iterator for sam_itr_next; when NULL the file is read sequentially with sam_read1
+    int min_mapQ;            // reads with a mapping quality below this are skipped
+    char tag_name[2];        // two-character aux tag to filter on; an empty first byte disables the tag filter
+    int tag_value;           // integer value the tag must have for a read to be kept
+    bool keep_missing;       // when true, reads without the tag are kept instead of skipped
+    const char *read_group;  // when non-NULL, only reads whose RG tag equals this string are kept
+} mplp_data;
+
+
+typedef struct {  // handles that create_bam_fset opens for one BAM file
+    htsFile *fp;      // file handle from hts_open
+    hts_idx_t *idx;   // index from sam_index_load
+    sam_hdr_t *hdr;   // header from sam_hdr_read
+} bam_fset;
+
+
+// Initialise BAM file, index and header structures; prints a message and exits the process if any of them cannot be loaded
+bam_fset* create_bam_fset(const char* fname);
+
+// Destroy BAM file, index and header structures, then free the set itself
+void destroy_bam_fset(bam_fset* fset);
+
+
+// iterator for reading bam: fills b with the next alignment passing the filters in data (an mplp_data); returns the htslib read status
+int read_bam(void *data, bam1_t *b);
+
+#endif
diff --git a/src/medaka_common.c b/src/medaka_common.c
new file mode 100644
index 0000000..ba06b03
--- /dev/null
+++ b/src/medaka_common.c
@@ -0,0 +1,99 @@
+#include <string.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "medaka_common.h"
+
+
+/** Allocates zero-initialised memory with a message on failure.
+ *
+ *  @param num number of elements to allocate.
+ *  @param size size of each element.
+ *  @param msg message to describe allocation on failure.
+ *  @returns pointer to allocated memory
+ *
+ */
+void *xalloc(size_t num, size_t size, char* msg){
+    void *mem = calloc(num, size);
+    if (mem != NULL){
+        return mem;
+    }
+    fprintf(stderr, "Failed to allocate mem for %s\n", msg);  // msg names the allocation
+    exit(1);
+}
+
+
+/** Reallocates memory with a message on failure.
+ *
+ *  @param ptr pointer to realloc.
+ *  @param size new total size of the allocation in bytes.
+ *  @param msg message to describe allocation on failure.
+ *  @returns pointer to allocated memory
+ *
+ */
+void *xrealloc(void *ptr, size_t size, char* msg){
+    void *grown = realloc(ptr, size);
+    if (grown != NULL){
+        return grown;
+    }
+    fprintf(stderr, "Failed to reallocate mem for %s\n", msg);  // msg names the allocation
+    exit(1);
+}
+
+
+/** Retrieves a substring.
+ *
+ *  @param string input string; must hold at least position + length characters.
+ *  @param position start position of substring.
+ *  @param length length of substring required; negative values are treated as 0.
+ *  @returns newly allocated, NUL-terminated copy that the caller must free.
+ *
+ */
+char *substring(char *string, int position, int length) {
+   // a negative length would wrap around when converted to size_t below
+   size_t n = length > 0 ? (size_t) length : 0;
+
+   char *ptr = malloc(n + 1);
+   if (ptr == NULL) {
+      fprintf(stderr, "Failed to allocate mem for %s\n", "substring");
+      exit(1);
+   }
+
+   memcpy(ptr, string + position, n);
+   ptr[n] = '\0';
+   return ptr;
+}
+
+
+/** Format a uint8_t to a decimal string
+ *
+ * @param value to format.
+ * @param dst destination buffer; needs room for up to 3 digits plus the NUL.
+ * @returns length of string (1 to 3, not counting the NUL terminator).
+ *
+ */
+size_t uint8_to_str(uint8_t value, char *dst) {
+    static char* digits[] = {
+        "0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20",
+        "21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40",
+        "41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60",
+        "61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80",
+        "81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100",
+        "101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120",
+        "121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140",
+        "141","142","143","144","145","146","147","148","149","150","151","152","153","154","155","156","157","158","159","160",
+        "161","162","163","164","165","166","167","168","169","170","171","172","173","174","175","176","177","178","179","180",
+        "181","182","183","184","185","186","187","188","189","190","191","192","193","194","195","196","197","198","199","200",
+        "201","202","203","204","205","206","207","208","209","210","211","212","213","214","215","216","217","218","219","220",
+        "221","222","223","224","225","226","227","228","229","230","231","232","233","234","235","236","237","238","239","240",
+        "241","242","243","244","245","246","247","248","249","250","251","252","253","254","255"};
+    static const uint8_t TEN = 10;
+    static const uint8_t HUNDRED = 100;
+    strcpy(dst, digits[value]);
+    if (value < TEN) return 1;
+    if (value < HUNDRED) return 2;
+    else return 3;
+}
+
diff --git a/src/medaka_common.h b/src/medaka_common.h
new file mode 100644
index 0000000..2f06bf6
--- /dev/null
+++ b/src/medaka_common.h
@@ -0,0 +1,60 @@
+#ifndef _MEDAKA_COMMON_H
+#define _MEDAKA_COMMON_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/** Simple integer min/max
+ * @param a
+ * @param b
+ *
+ * @returns the min/max of a and b
+ *
+ */
+static inline int max ( int a, int b ) { if (a > b) return a; return b; }
+static inline int min ( int a, int b ) { if (a < b) return a; return b; }
+
+
+/** Allocates zero-initialised memory with a message on failure.
+ *
+ *  @param num number of elements to allocate.
+ *  @param size size of each element.
+ *  @param msg message to describe allocation on failure.
+ *  @returns pointer to allocated memory
+ *
+ */
+void *xalloc(size_t num, size_t size, char* msg);
+
+
+/** Reallocates memory with a message on failure.
+ *
+ *  @param ptr pointer to realloc.
+ *  @param size new total size of the allocation in bytes.
+ *  @param msg message to describe allocation on failure.
+ *  @returns pointer to allocated memory
+ *
+ */
+void *xrealloc(void *ptr, size_t size, char* msg);
+
+
+/** Retrieves a substring.
+ *
+ *  @param string input string.
+ *  @param position start position of substring.
+ *  @param length length of substring required.
+ *  @returns string pointer.
+ *
+ */
+char *substring(char *string, int position, int length);
+
+
+/** Format a uint8_t to a string
+ *
+ * @param value to format.
+ * @param dst destination char.
+ * @returns length of string.
+ *
+ */
+size_t uint8_to_str(uint8_t value, char *dst);
+
+#endif
diff --git a/src/medaka_khcounter.c b/src/medaka_khcounter.c
new file mode 100644
index 0000000..a8e9577
--- /dev/null
+++ b/src/medaka_khcounter.c
@@ -0,0 +1,135 @@
+// Wrap khash to make it more concise to use
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include "medaka_khcounter.h"
+#include "medaka_common.h"
+
+/* Implementation of a counter of strings (increasing only)
+ *
+ * khash_t(KH_COUNTER) *h = kh_init(KH_COUNTER);
+ * kh_counter_increment(h, "one");
+ * kh_counter_increment(h, "two");
+ * kh_counter_increment(h, "two");
+ * kh_counter_add(h, "three", 2);
+ * kh_counter_increment(h, "three");
+ * kh_counter_print(h);
+ * kh_counter_destroy(h);
+ *
+ */
+
+int kh_counter_val(khash_t(KH_COUNTER) *hash, char *key) {
+    khiter_t slot = kh_get(KH_COUNTER, hash, key);
+    if (slot == kh_end(hash)) return 0;  // keys that were never stored count as zero
+    return kh_val(hash, slot);
+}
+
+size_t kh_counter_add(khash_t(KH_COUNTER) *hash, char *key, int val) {
+    // note: key is copied so no need for caller to hold on to it
+    int ret;
+    khiter_t k = kh_put(KH_COUNTER, hash, key, &ret);
+    if (ret < 0) {  // khash reports -1 when the insertion itself failed
+        fprintf(stderr, "Failed to allocate mem for %s\n", "counter entry");
+        exit(1);
+    }
+    if (ret == 0) {  // exists: accumulate onto the stored count
+        kh_value(hash, k) += val;
+    } else {  // new key: 1 = never-used bucket, 2 = previously deleted bucket
+        kh_key(hash, k) = strdup(key);
+        kh_value(hash, k) = val;
+    }
+    return ret;
+}
+
+/** Decrement a counter by a given amount (a missing key starts from zero).
+ *
+ *  Expressed as an addition of -val so that incrementing and decrementing
+ *  share a single code path for inserting and updating entries.
+ *
+ *  @param hash counter to modify.
+ *  @param key string key; it is copied on first insertion, so the caller
+ *      does not need to keep it alive.
+ *  @param val amount to subtract from the stored count.
+ *  @returns the kh_put status passed through by kh_counter_add
+ *      (0 when the key already existed).
+ *
+ */
+size_t kh_counter_sub(khash_t(KH_COUNTER) *hash, char *key, int val) {
+    return kh_counter_add(hash, key, -val);
+}
+
+
+size_t kh_counter_increment(khash_t(KH_COUNTER) *hash, char *key) {
+    return kh_counter_add(hash, key, 1);  // passes through kh_counter_add's return value
+}
+
+kh_counter_stats_t kh_counter_stats(khash_t(KH_COUNTER) *hash) {
+    kh_counter_stats_t totals = { .sum=0, .max=0};
+    for (khiter_t it = kh_begin(hash); it != kh_end(hash); it++) {
+        if (!kh_exist(hash, it)) continue;  // bucket holds no entry
+        int count = kh_val(hash, it);
+        totals.sum += count;
+        // .max only ever holds non-negative int values, so comparing as int is exact
+        if (count > (int) totals.max) totals.max = count;
+    }
+    return totals;
+}
+
+void kh_counter_destroy(khash_t(KH_COUNTER) *hash) {
+    // keys were strdup'ed on insertion, so release them before the table itself
+    for (khiter_t it = kh_begin(hash); it != kh_end(hash); ++it){
+        if (!kh_exist(hash, it)) continue;
+        free((char*) kh_key(hash, it));
+    }
+    kh_destroy(KH_COUNTER, hash);
+}
+
+void kh_counter_print(khash_t(KH_COUNTER) *hash) {
+    // one "key -> count" line per stored entry, in table (not insertion) order
+    for (khiter_t k = kh_begin(hash); k != kh_end(hash); k++) {
+        if (kh_exist(hash, k)) {
+            const char *key = kh_key(hash, k);
+            int val = kh_val(hash, k);
+            printf("%s -> %i\n", key, val);
+        }
+    }
+    // no summary line is printed; max/sum are available from kh_counter_stats
+}
+
+
+int kh_int_counter_val(khash_t(KH_INT_COUNTER) *hash, int key) {
+    khiter_t slot = kh_get(KH_INT_COUNTER, hash, key);
+    if (slot == kh_end(hash)) return -1;  // -1 marks a key that was never stored
+    return kh_val(hash, slot);
+}
+
+
+// Add val to the count stored for an integer key (a missing key starts from zero)
+size_t kh_int_counter_add(khash_t(KH_INT_COUNTER) *hash, int key, int val) {
+    int ret;
+    khiter_t k = kh_put(KH_INT_COUNTER, hash, key, &ret);
+    if (ret < 0) {  // khash reports -1 when the insertion itself failed
+        fprintf(stderr, "Failed to allocate mem for %s\n", "int counter entry");
+        exit(1);
+    }
+    // ret == 0: key already present; 1 or 2: freshly claimed bucket without a value yet
+    kh_value(hash, k) = (ret == 0 ? kh_val(hash, k) : 0) + val;
+    return ret;
+}
+
+void kh_int_counter_destroy(khash_t(KH_INT_COUNTER) *hash) {
+    kh_destroy(KH_INT_COUNTER, hash);  // unlike kh_counter_destroy, no per-key free is done
+}
+
+//int (int argc, char *argv[]) {
+//  khash_t(KH_COUNTER) *h = kh_init(KH_COUNTER);
+//  kh_counter_increment(h, "one");
+//  kh_counter_increment(h, "two");
+//  kh_counter_increment(h, "two");
+//  kh_counter_add(h, "three", 2);
+//  kh_counter_increment(h, "three");
+//  kh_counter_print(h);
+//  kh_counter_destroy(h);
+//  printf("-------\n\n");
+//}
diff --git a/src/medaka_khcounter.h b/src/medaka_khcounter.h
new file mode 100644
index 0000000..1ad46c8
--- /dev/null
+++ b/src/medaka_khcounter.h
@@ -0,0 +1,53 @@
+#ifndef _MEDAKA_KHCOUNTER_H
+#define _MEDAKA_KHCOUNTER_H
+
+#include "khash.h"
+
+typedef struct kh_counter_stats_t {
+    size_t sum;  // total of all stored counts
+    size_t max;  // largest stored count (stays 0 if no count is positive)
+} kh_counter_stats_t;
+
+KHASH_MAP_INIT_STR(KH_COUNTER, int)
+KHASH_MAP_INIT_INT(KH_INT_COUNTER, int)
+
+// create a counter
+static inline khash_t(KH_COUNTER) *kh_counter_init() {
+    // thin wrapper so callers need not spell out the khash instance name
+    return kh_init(KH_COUNTER);
+}
+
+static inline khash_t(KH_INT_COUNTER) *kh_int_counter_init() {
+    // integer-keyed counterpart of kh_counter_init
+    return kh_init(KH_INT_COUNTER);
+}
+
+// Get a value from a counter 
+int kh_counter_val(khash_t(KH_COUNTER) *hash, char *key);
+
+// Clean up a counter
+void kh_counter_destroy(khash_t(KH_COUNTER) *hash);
+
+// Increment a counter by one
+size_t kh_counter_increment(khash_t(KH_COUNTER) *hash, char *key);
+
+size_t kh_counter_sub(khash_t(KH_COUNTER) *hash, char *key, int val);
+
+// Increment a counter by a given amount
+size_t kh_counter_add(khash_t(KH_COUNTER) *hash, char *key, int val);
+
+// Retrieve statistics on counter
+kh_counter_stats_t kh_counter_stats(khash_t(KH_COUNTER) *hash);
+
+// Print contents of a counter
+void kh_counter_print(khash_t(KH_COUNTER) *hash);
+
+// similar to the kh_counter, except that the key is integer
+int kh_int_counter_val(khash_t(KH_INT_COUNTER) *hash, int key);
+
+size_t kh_int_counter_add(khash_t(KH_INT_COUNTER) *hash, int key, int val);
+
+void kh_int_counter_destroy(khash_t(KH_INT_COUNTER) *hash);
+
+
+#endif

From c8fc4b336a51ac930417f7bbc885495ce40e299a Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 12:54:26 +0800
Subject: [PATCH 02/43] full-alignment create tensor with cffi

---
 .../CreateTensorFullAlignmentFromCffi.py      | 278 ++++++++++++++++++
 1 file changed, 278 insertions(+)
 create mode 100644 preprocess/CreateTensorFullAlignmentFromCffi.py

diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py
new file mode 100644
index 0000000..3a1ba8c
--- /dev/null
+++ b/preprocess/CreateTensorFullAlignmentFromCffi.py
@@ -0,0 +1,278 @@
+import os
+import shlex
+import logging
+import numpy as np
+from argparse import ArgumentParser, SUPPRESS
+from collections import defaultdict
+
+import libclair3
+import shared.param_f as param
+from shared.utils import subprocess_popen, file_path_from, IUPAC_base_to_num_dict as BASE2NUM, str2bool, vcf_candidates_from
+from shared.interval_tree import bed_tree_from
+
+logging.basicConfig(format='%(message)s', level=logging.INFO)
+no_of_positions = param.no_of_positions  # positions axis of each candidate's tensor
+flanking_base_num = param.flankingBaseNum
+channel_size = param.channel_size  # channels axis of each candidate's tensor
+
+
+def CreateTensorFullAlignment(args):
+    """Build full-alignment tensors for one contig's candidates through libclair3.
+
+    :param args: parsed command-line options (see main()).
+    :returns: (int8 array shaped (candidates, depth, positions, channels),
+        'ctg:pos:ref_base' strings, 'depth-alt' strings), one entry per candidate.
+    """
+    ctg_start = args.ctgStart
+    ctg_end = args.ctgEnd
+    full_aln_regions = args.full_aln_regions
+    fasta_file_path = args.ref_fn
+    ctg_name = args.ctgName
+    bam_file_path = args.bam_fn
+    extend_bp = param.extend_bp
+    platform = args.platform
+    phased_vcf_fn = args.phased_vcf_fn
+
+    vcf_fn = file_path_from(args.vcf_fn)
+    is_known_vcf_file_provided = vcf_fn is not None
+    extend_bed = file_path_from(args.extend_bed)
+    is_extend_bed_file_given = extend_bed is not None
+    confident_bed_fn = file_path_from(args.bed_fn)
+    is_confident_bed_file_given = confident_bed_fn is not None
+
+    # we wouldn't haplotag reads if --no_phasing_for_fa is enabled (a missing attribute means "not set")
+    need_haplotagging = getattr(args, 'no_phasing_for_fa', False) is not True
+    candidates_set = set()
+
+    if full_aln_regions:
+        """
+        If given full alignment bed regions, all candidate positions will be directly selected from each row, define as 
+        'ctg start end', where 0-based center position is the candidate for full alignment calling.
+        if 'need_haplotagging' option enables, full alignment bed regions will also include nearby heterozygous snp candidates for reads
+        haplotag, which is faster than whatshap haplotag with more memory occupation.
+        """
+        candidate_file_path_process = subprocess_popen(shlex.split("gzip -fdc %s" % (full_aln_regions)))
+        candidate_file_path_output = candidate_file_path_process.stdout
+        ctg_start, ctg_end = float('inf'), 0
+        for row in candidate_file_path_output:
+            row = row.rstrip().split('\t')
+            if row[0] != ctg_name: continue
+            position = int(row[1]) + 1
+            end = int(row[2]) + 1
+            ctg_start = min(position, ctg_start)
+            ctg_end = max(end, ctg_end)
+
+            if platform == "ilmn":
+                continue
+            if len(row) > 3:  # hete snp positions
+                center_pos = position + extend_bp + 1
+                ref_base, alt_base, genotype, phase_set = row[3].split('-')
+            else:
+                center = position + (end - position) // 2 - 1
+                candidates_set.add(center)
+
+        candidate_file_path_output.close()
+        candidate_file_path_process.wait()
+
+    if is_known_vcf_file_provided:
+        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
+        candidates_set = set(known_variants_list)
+
+    variant_list = []
+    if need_haplotagging and phased_vcf_fn and os.path.exists(phased_vcf_fn):
+        # if need_haplotagging option enables, scan the phased vcf file and store the heterozygous SNP candidates from each phase set
+        unzip_process = subprocess_popen(shlex.split("gzip -fdc %s" % (phased_vcf_fn)))
+        for row in unzip_process.stdout:
+            row = row.rstrip()
+            if not row or row[0] == '#':  # skip blank lines as well as header lines
+                continue
+            columns = row.strip().split('\t')
+            contig_name = columns[0]
+            if ctg_name and contig_name != ctg_name:
+                continue
+            pos = int(columns[1])
+            ref_base = columns[3]
+            alt_base = columns[4]
+            genotype_info = columns[9].split(':')
+            genotype, phase_set = genotype_info[0], genotype_info[-1]
+            if '|' not in genotype:  # unphasable
+                continue
+            genotype = ('1' if genotype == '0|1' else '2')
+
+            # use a C Variant struct to store all phased infos
+            variant_list.append(libclair3.ffi.new("struct Variant *", [pos-1, ref_base.encode(), alt_base.encode(), int(genotype), int(phase_set)]))
+        unzip_process.stdout.close()
+        unzip_process.wait()
+
+    variant_num = len(variant_list)  # 0 when no phased variants were collected above
+    Variants = libclair3.ffi.new("struct Variant *[]", variant_list)  # always defined for the C call
+
+    # 1-index to 0-index
+    candidates_list = sorted(list(set([item-1 for item in candidates_set if item >= ctg_start and item <= ctg_end])))
+
+    region_str = '{}:{}-{}'.format(ctg_name, ctg_start, ctg_end).encode()
+    candidate_num = len(candidates_list)
+
+    candidates = libclair3.ffi.new("size_t [{}]".format(candidate_num), candidates_list)
+
+    fa_data = libclair3.lib.calculate_clair3_full_alignment(region_str, bam_file_path.encode(), fasta_file_path.encode(),
+                                                      Variants, variant_num, candidates, candidate_num)
+
+    # use np buffer to get the matrix
+    matrix_depth = param.matrix_depth_dict[platform]
+    ffi = libclair3.ffi
+    _dtype = np.int8
+    size_sizet = np.dtype(_dtype).itemsize
+    np_fa_data = np.frombuffer(ffi.buffer(
+        fa_data.matrix, size_sizet * matrix_depth * no_of_positions * channel_size * candidate_num),
+        dtype=_dtype
+    ).reshape(candidate_num, matrix_depth, no_of_positions, channel_size).copy()
+
+    all_position_info, all_alt_info = [], []
+    for idx in range(candidate_num):
+        # decode the C char* to python string
+        alt_info_string = ffi.string(fa_data.all_alt_info[idx]).decode('utf8', 'ignore')
+        alt_info = alt_info_string.rstrip().split('-')
+        pos, depth, center_ref_base, alt = alt_info[:4]
+        all_position_info.append(ctg_name + ':' + pos + ':' + center_ref_base)
+        all_alt_info.append(depth + '-' + alt)
+
+    libclair3.lib.destroy_fa_data(fa_data)
+
+    return np_fa_data, all_position_info, all_alt_info
+
+
+def main():
+    """Parse the command-line options and create the full-alignment tensors."""
+    parser = ArgumentParser(description="Generate variant candidate tensors using phased full-alignment")
+
+    parser.add_argument('--platform', type=str, default='ont',
+                        help="Sequencing platform of the input. Options: 'ont,hifi,ilmn', default: %(default)s")
+
+    parser.add_argument('--bam_fn', type=str, default="input.bam", required=True,
+                        help="Sorted BAM file input, required")
+
+    parser.add_argument('--ref_fn', type=str, default="ref.fa", required=True,
+                        help="Reference fasta file input, required")
+
+    parser.add_argument('--tensor_can_fn', type=str, default="PIPE",
+                        help="Tensor output, stdout by default, default: %(default)s")
+
+    parser.add_argument('--vcf_fn', type=str, default=None,
+                        help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file,  default: %(default)s")
+
+    parser.add_argument('--min_af', type=float, default=0.08,
+                        help="Minimum allele frequency for both SNP and Indel for a site to be considered as a condidate site, default: %(default)f")
+
+    parser.add_argument('--snp_min_af', type=float, default=0.08,
+                        help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--indel_min_af', type=float, default=0.15,
+                        help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--ctgName', type=str, default=None,
+                        help="The name of sequence to be processed, required if --bed_fn is not defined")
+
+    parser.add_argument('--ctgStart', type=int, default=None,
+                        help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--ctgEnd', type=int, default=None,
+                        help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--bed_fn', type=str, default=None,
+                        help="Call variant only in the provided regions. Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set")
+
+    parser.add_argument('--gvcf', type=str2bool, default=False,
+                        help="Enable GVCF output, default: disabled")
+
+    parser.add_argument('--sampleName', type=str, default="SAMPLE",
+                        help="Define the sample name to be shown in the GVCF file")
+
+    parser.add_argument('--samtools', type=str, default="samtools",
+                        help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s")
+
+    # options for advanced users
+    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+                        help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
+
+    parser.add_argument('--minMQ', type=int, default=param.min_mq,
+                        help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d")
+
+    parser.add_argument('--minBQ', type=int, default=param.min_bq,
+                        help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d")
+
+    parser.add_argument('--max_depth', type=int, default=param.max_depth,
+                        help="EXPERIMENTAL: Maximum full alignment depth to be processed. default: %(default)s")
+
+    # options for debug purpose
+    parser.add_argument('--phasing_info_in_bam', action='store_true',
+                        help="DEBUG: Skip phasing and use the phasing info provided in the input BAM (HP tag), default: False")
+
+    parser.add_argument('--phasing_window_size', type=int, default=param.phasing_window_size,
+                        help="DEBUG: The window size for read phasing")
+
+    parser.add_argument('--extend_bed', nargs='?', action="store", type=str, default=None,
+                        help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp")
+
+    parser.add_argument('--indel_fn', type=str, default=None,
+                        help="DEBUG: Output all alternative indel cigar for debug purpose")
+
+    parser.add_argument('--base_err', default=0.001, type=float,
+                        help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f')
+
+    parser.add_argument('--gq_bin_size', default=5, type=int,
+                        help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d')
+
+    parser.add_argument('--bp_resolution', action='store_true',
+                        help="DEBUG: Enable bp resolution for GVCF, default: disabled")
+
+    # options for internal process control
+    ## Path to the 'zstd' compression
+    parser.add_argument('--zstd', type=str, default=param.zstd,
+                        help=SUPPRESS)
+
+    ## Test in specific candidate position. Only for testing
+    parser.add_argument('--test_pos', type=int, default=0,
+                        help=SUPPRESS)
+
+    ## The number of chunks to be divided into for parallel processing
+    parser.add_argument('--chunk_num', type=int, default=None,
+                        help=SUPPRESS)
+
+    ## The chunk ID to work on
+    parser.add_argument('--chunk_id', type=int, default=None,
+                        help=SUPPRESS)
+
+    ## Use heterozygous SNP variants in phased vcf file for haplotagging
+    parser.add_argument('--phased_vcf_fn', type=str, default=None,
+                        help=SUPPRESS)
+    ## Apply no phased data in training. Only works in data training, default: False
+    parser.add_argument('--add_no_phasing_data_training', action='store_true',
+                        help=SUPPRESS)
+    ## Output representation unification infos, which refines training labels
+    parser.add_argument('--unify_repre', action='store_true',
+                        help=SUPPRESS)
+    ## Path of representation unification output
+    parser.add_argument('--unify_repre_fn', type=str, default=None,
+                        help=SUPPRESS)
+    ## Provide the regions to be included in full-alignment based calling
+    parser.add_argument('--full_aln_regions', type=str, default=None,
+                        help=SUPPRESS)
+
+    ## Use Clair3's own phasing module for read level phasing when creating tensor, compared to using Whatshap, speed is faster but has higher memory footprint, default: False
+    parser.add_argument('--need_haplotagging', action='store_true',
+                        help=SUPPRESS)
+    ## Skip read haplotagging (read by CreateTensorFullAlignment); accepts a bare flag or an explicit boolean, default: False
+    parser.add_argument('--no_phasing_for_fa', type=str2bool, nargs='?', const=True, default=False,
+                        help=SUPPRESS)
+
+    ## Apply read realignment for illumina platform. Greatly boost indel performance in trade of running time
+    parser.add_argument('--need_realignment', action='store_true',
+                        help=SUPPRESS)
+
+    args = parser.parse_args()
+    CreateTensorFullAlignment(args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From f6368c576d3f7f77bc3ae038db74062b8576e0c3 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 12:55:10 +0800
Subject: [PATCH 03/43] reuse region function from medaka

---
 preprocess/medaka_utils.py | 95 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 preprocess/medaka_utils.py

diff --git a/preprocess/medaka_utils.py b/preprocess/medaka_utils.py
new file mode 100644
index 0000000..702c103
--- /dev/null
+++ b/preprocess/medaka_utils.py
@@ -0,0 +1,95 @@
+import os
+import collections
+
+
+_Region = collections.namedtuple('Region', 'ref_name start end')
+
+class Region(_Region):
+    """Represents a genomic region; `start`/`end` may be None (open-ended)."""
+
+    @property
+    def name(self):
+        """Samtools-style region string, zero-base end exclusive."""
+        return self.__str__()
+
+    def __str__(self):
+        """Return string representation of region."""
+        # This will be zero-based, end exclusive
+        start = 0 if self.start is None else self.start
+        end = '' if self.end is None else self.end
+        return '{}:{}-{}'.format(self.ref_name, start, end)
+
+    @property
+    def size(self):
+        """Return size of region (`end - start`; both must be set)."""
+        return self.end - self.start
+
+    @classmethod
+    def from_string(cls, region):
+        """Parse region string into `Region` objects.
+
+        :param region: region str
+
+        >>> Region.from_string('Ecoli') == Region(
+        ...     ref_name='Ecoli', start=None, end=None)
+        True
+        >>> Region.from_string('Ecoli:1000-2000') == Region(
+        ...     ref_name='Ecoli', start=1000, end=2000)
+        True
+        >>> Region.from_string('Ecoli:1000') == Region(
+        ...     ref_name='Ecoli', start=1000, end=None)
+        True
+        >>> Region.from_string('Ecoli:-1000') == Region(
+        ...     ref_name='Ecoli', start=0, end=1000)
+        True
+        >>> Region.from_string('Ecoli:500-') == Region(
+        ...     ref_name='Ecoli', start=500, end=None)
+        True
+        >>> Region.from_string('A:B:c:500-') == Region(
+        ...     ref_name='A:B:c', start=500, end=None)
+        True
+        """
+        if ':' not in region:
+            ref_name, start, end = region, None, None
+        else:
+            start, end = None, None
+            ref_name, bounds = region.rsplit(':', 1)
+            if bounds[0] == '-':
+                start = 0
+                end = int(bounds.replace('-', ''))
+            elif '-' not in bounds:
+                start = int(bounds)
+                end = None
+            elif bounds[-1] == '-':
+                start = int(bounds[:-1])
+                end = None
+            else:
+                start, end = [int(b) for b in bounds.split('-')]
+        return cls(ref_name, start, end)
+
+    def split(region, size, overlap=0, fixed_size=True):
+        """Split region into sub-regions of a given length.
+
+        :param size: size of sub-regions (sub-region starts advance by `size - overlap`).
+        :param overlap: overlap between ends of sub-regions.
+        :param fixed_size: ensure all sub-regions are equal in size. If `False`
+            then the final chunk will be created as the smallest size to
+            conform with `overlap`.
+
+        :returns: a list of sub-regions.
+
+        """
+        regions = list()
+        if size >= region.size:
+            return [region]
+        for start in range(region.start, region.end, size - overlap):
+            end = min(start + size, region.end)
+            regions.append(Region(region.ref_name, start, end))
+        if len(regions) > 1:
+            if fixed_size and regions[-1].size < size:
+                del regions[-1]
+                end = region.end
+                start = end - size
+                if start > regions[-1].start:
+                    regions.append(Region(region.ref_name, start, end))
+        return regions
\ No newline at end of file

From 7932cbeadbf028199015d2c625e2e3704f6ae4ff Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 12:57:54 +0800
Subject: [PATCH 04/43] pileup cffi function, threading and chunking are
 disabled currently

---
 preprocess/CreateTensorPileupFromCffi.py | 465 +++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 preprocess/CreateTensorPileupFromCffi.py

diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py
new file mode 100644
index 0000000..512f1cb
--- /dev/null
+++ b/preprocess/CreateTensorPileupFromCffi.py
@@ -0,0 +1,465 @@
+import sys
+import logging
+import queue
+import concurrent.futures
+import numpy as np
+
+from argparse import ArgumentParser, SUPPRESS
+from contextlib import contextmanager
+
+import libclair3
+import shared.param_p as param
+from shared.interval_tree import bed_tree_from, is_region_in
+from shared.utils import file_path_from, IUPAC_base_to_num_dict as BASE2NUM, str2bool, vcf_candidates_from
+from preprocess.medaka_utils import Region
+
+logging.getLogger().setLevel(logging.INFO)  # root logger threshold: INFO
+
+flanking_base_num = param.flankingBaseNum  # positions taken on each side of a candidate
+no_of_positions = 2 * flanking_base_num + 1  # window width: both flanks plus the centre
+channel = param.channel
+channel_size = len(channel)  # expected second dimension of every candidate tensor
+
+
+def pileup_counts_clair3(
+        region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, gvcf,
+        max_depth, region_split=100000, workers=1):
+    """Create pileup count features and candidate information for a region.
+
+    :param region: `Region` object to process (0-based coordinates).
+    :param bam: path of the .bam file with alignments.
+    :param fasta: path of the reference fasta, handed to the C library.
+    :param min_depth: minimum depth, handed to the C library.
+    :param min_snp_af: minimum SNP allele frequency, handed to the C library.
+    :param min_indel_af: minimum indel allele frequency, handed to the C library.
+    :param min_mq: minimum mapping quality, handed to the C library.
+    :param call_snp_only: SNP-only switch, handed to the C library.
+    :param max_indel_length: maximum indel length, handed to the C library.
+    :param gvcf: accepted for interface compatibility; not used here.
+    :param max_depth: maximum pileup depth, handed to the C library.
+    :param region_split: ignored; it is overwritten with the region length.
+    :param workers: number of executor threads running the pileup.
+
+    :returns: tuple `(chunk_results, all_alt_info_list)`: a list of
+        (counts array, positions) chunks split at position discontinuities,
+        and one `(pos, 'ctg:pos:ref_base', 'depth-alt')` tuple per decoded
+        candidate string with at least four '-' separated fields.
+    """
+    lib = libclair3.lib
+    featlenclair3 = lib.featlenclair3
+    bam_handle = BAMHandler(bam)
+
+    def _process_region(reg):
+        # ctg start is 1-based, the Region object is 0-based
+        region_str = '{}:{}-{}'.format(reg.ref_name, max(0, reg.start-1), reg.end)
+        with bam_handle.borrow() as fh:
+            counts = lib.calculate_clair3_pileup(
+                region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af,
+                min_mq, max_indel_length, call_snp_only, max_depth)
+        try:
+            # copy everything out of the C structure before it is released
+            np_counts, positions, alt_info_string_list = _plp_data_to_numpy(
+                counts, featlenclair3)
+        finally:
+            # release the C buffer even when the conversion above raises
+            lib.destroy_plp_data(counts)
+
+        alt_info_list = []
+        for alt_info in alt_info_string_list:
+            alt_info = alt_info.split('-')
+            # skip mainly because candidate length is larger than maximum indel length
+            if len(alt_info) < 4:
+                continue
+            pos, depth, center_ref_base, alt = alt_info[:4]
+            alt_info_list.append((int(pos), reg.ref_name + ':' + pos + ':' + center_ref_base, depth + '-' + alt))
+        return np_counts, positions, alt_info_list
+
+    # we found that split into small chunk would lead to some missing truths,
+    # the candidates cross two neighbouring small chunks
+    region_split = region.end - region.start
+    regions = region.split(region_split, fixed_size=False)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
+        results = executor.map(_process_region, regions)
+        chunk_results, all_alt_info_list = __enforce_pileup_chunk_contiguity(results)
+    return chunk_results, all_alt_info_list
+
+
+class BAMHandler(object):
+    """Fixed-size pool of BAM file handle and index sets from libclair3."""
+
+    def __init__(self, bam, size=16):
+        """Open `size` handle sets for the BAM path `bam` and pool them.
+
+        Every set is created by `create_bam_fset` and wrapped with `ffi.gc`
+        so that `destroy_bam_fset` runs when the wrapper is collected.
+        """
+        self.bam = bam
+        self._pool = queue.Queue(size)
+        for _ in range(size):
+            handle_set = libclair3.lib.create_bam_fset(self.bam.encode())
+            self._pool.put(libclair3.ffi.gc(handle_set, self._destroy_fset))
+
+    @contextmanager
+    def borrow(self):
+        """Yield one pooled handle set and put it back on exit.
+
+        Waits while all handle sets are borrowed.
+        """
+        borrowed = self._pool.get()
+        try:
+            yield borrowed
+        finally:
+            self._pool.put(borrowed)
+
+    def encode(self):
+        """Return the bare BAM path encoded to bytes.
+
+        Kept for legacy compatibility only.
+        """
+        return self.bam.encode()
+
+    def _destroy_fset(self, fset):
+        """Finaliser given to `ffi.gc`: release one handle set."""
+        libclair3.lib.destroy_bam_fset(fset)
+
+
+def _plp_data_to_numpy(plp_data, n_rows):
+    """Create numpy representation of feature data.
+
+    Copy the feature matrix, column positions and candidate strings out of a
+    `plp_data` structure returned from C library function calls.
+
+    :param plp_data: a cffi proxy to a `plp_data*` pointer
+    :param n_rows: the number of rows in the plp_data.matrix (the number
+        of elements in the feature per pileup column).
+
+    :returns: pileup counts numpy array, reference positions, list of the
+        decoded alternative-information strings (one per candidate)
+    """
+    ffi = libclair3.ffi
+    # builtin `int` is what the removed `np.int` alias (NumPy >= 1.24) stood for
+    _dtype = int
+    size_sizet = np.dtype(_dtype).itemsize
+    np_counts = np.frombuffer(ffi.buffer(
+        plp_data.matrix, size_sizet * plp_data.n_cols * n_rows),
+        dtype=_dtype
+    ).reshape(plp_data.n_cols, n_rows).copy()
+
+    alt_info_string_list = []
+    # decode all alternative information, position-depth-reference_base-alt_info
+    for i in range(plp_data.candidates_num):
+        alt_info_string = ffi.string(plp_data.all_alt_info[i]).decode('utf8', 'ignore').rstrip()
+        alt_info_string_list.append(alt_info_string)
+
+    positions = np.empty(plp_data.n_cols, dtype=[
+        ('major', int), ('minor', int)])
+    np.copyto(
+        positions['major'], np.frombuffer(
+            ffi.buffer(plp_data.major, size_sizet * plp_data.n_cols),
+            dtype=_dtype))
+    np.copyto(
+        positions['minor'],
+        np.frombuffer(ffi.buffer(
+            plp_data.minor, size_sizet * plp_data.n_cols), dtype=_dtype))
+    return np_counts, positions, alt_info_string_list
+
+
+def __enforce_pileup_chunk_contiguity(pileups):
+    """Cut pileup chunks at position gaps and merge chunks that abut.
+
+    Works in two passes: inputs are first cut at internal gaps, then
+    neighbouring pieces that are contiguous are concatenated.
+
+    :param pileups: iterable of (counts, positions, alt_info_list) triples
+        in coordinate order; `positions` is a structured array with a
+        'major' field.
+
+    :returns: tuple of (list of (counts, positions) chunks inside which the
+        major coordinate never advances by more than one, concatenation of
+        all `alt_info_list` inputs in iteration order).
+
+    """
+    pieces = []
+    merged_alt_info = []
+
+    # Pass 1: break every input wherever the major coordinate advances by
+    # more than one between neighbouring columns
+    for chunk_counts, chunk_positions, chunk_alt_info in pileups:
+        merged_alt_info += chunk_alt_info
+        steps = np.ediff1d(chunk_positions['major'])
+        lower = 0
+        for upper in np.where(steps > 1)[0] + 1:
+            pieces.append((chunk_counts[lower:upper], chunk_positions[lower:upper]))
+            lower = upper
+        # trailing piece; it is the whole input when no gap was found
+        pieces.append((chunk_counts[lower:], chunk_positions[lower:]))
+
+    # Pass 2: glue a piece onto the current run when its first major
+    # coordinate directly follows the run's last one; otherwise the run is
+    # closed and the piece starts a new run
+    def _join(run_pieces):
+        joined_counts = np.concatenate([c for c, _ in run_pieces])
+        joined_positions = np.concatenate([p for _, p in run_pieces])
+        return joined_counts, joined_positions
+
+    # a run stays open until a discontinuity or the end of the input
+    chunk_results = []
+    run = []
+    previous_last = None
+    for piece in pieces:
+        piece_positions = piece[1]
+        if len(piece_positions) == 0:
+            continue
+        majors = piece_positions['major']
+        if run and majors[0] - previous_last != 1:
+            # discontinuity
+            chunk_results.append(_join(run))
+            run = []
+        run.append(piece)
+        previous_last = majors[-1]
+    # flush the run that is still open
+    if run:
+        chunk_results.append(_join(run))
+    return chunk_results, merged_alt_info
+
+
+def CreateTensorPileup(args):
+    """
+    Create pileup tensors for pileup model training or calling.
+
+    The candidate region comes from --ctgStart/--ctgEnd, or is derived from
+    --chunk_id/--chunk_num over the contig, the BED span or the known-variant
+    list. The region is scanned once by the libclair3 pileup; for every
+    reported candidate passing the BED/VCF filters, the window of
+    `no_of_positions` columns centred on it is kept unless a column is all zeros.
+
+    :param args: parsed command line options (see `main`).
+    :returns: `(tensors, position_info, alt_info)`: an int32 array of the kept
+        windows and two parallel lists; three empty lists when the selected
+        chunk of a known-variant VCF holds no variant.
+    :raises ValueError: if no contig name plus start/end range is available.
+    """
+    ctg_start = args.ctgStart
+    ctg_end = args.ctgEnd
+    fasta_file_path = args.ref_fn
+    ctg_name = args.ctgName
+    bam_file_path = args.bam_fn
+    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
+    chunk_num = args.chunk_num
+    minimum_snp_af_for_candidate = args.snp_min_af
+    minimum_indel_af_for_candidate = args.indel_min_af
+    min_coverage = args.minCoverage
+    min_mapping_quality = args.minMQ
+    platform = args.platform
+
+    vcf_fn = file_path_from(args.vcf_fn)
+    is_known_vcf_file_provided = vcf_fn is not None
+    confident_bed_fn = file_path_from(args.extend_bed)
+    is_confident_bed_file_given = confident_bed_fn is not None
+    extend_bed = file_path_from(args.extend_bed)
+    fast_mode = args.fast_mode
+    call_snp_only = args.call_snp_only
+    # 1-based regions [start, end] (start and end inclusive)
+    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
+                                             contig_name=ctg_name,
+                                             return_bed_region=True)
+
+    fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')
+
+    fast_mode = platform == 'ont' and fast_mode
+    minimum_snp_af_for_candidate = max(minimum_snp_af_for_candidate, param.min_af_dict[platform]) if fast_mode else minimum_snp_af_for_candidate
+    min_coverage = max(min_coverage, 4) if fast_mode else min_coverage
+    max_indel_length = param.maximum_variant_length_that_need_infer
+
+    if not is_confident_bed_file_given and chunk_id is not None:
+        contig_length = 0
+        with open(fai_fn, 'r') as fai_fp:
+            for row in fai_fp:
+                columns = row.strip().split("\t")
+
+                contig_name = columns[0]
+                if contig_name != ctg_name:
+                    continue
+                contig_length = int(columns[1])
+        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
+        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
+        ctg_end = ctg_start + chunk_size
+
+    if is_confident_bed_file_given and chunk_id is not None:
+        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (bed_end - bed_start) % chunk_num else (bed_end - bed_start) // chunk_num
+        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
+        ctg_end = ctg_start + chunk_size
+
+    if is_known_vcf_file_provided:  # the candidate filter below needs the set even without chunking
+        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
+        known_variants_set = set(known_variants_list)
+    if is_known_vcf_file_provided and chunk_id is not None:
+        total_variants_size = len(known_variants_list)
+        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 else total_variants_size // chunk_num + 1
+        chunk_start_pos = chunk_id * chunk_variants_size
+        known_variants_set = set(known_variants_list[chunk_start_pos: chunk_start_pos + chunk_variants_size])
+        if len(known_variants_set) == 0:
+            return [], [], []
+        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)
+
+    is_ctg_name_given = ctg_name is not None
+    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
+    if not is_ctg_range_given:
+        raise ValueError("a contig range is required: give --ctgName with --ctgStart/--ctgEnd or with --chunk_id/--chunk_num")
+    extend_start = max(1, ctg_start - no_of_positions)
+    extend_end = ctg_end + no_of_positions
+
+    region_str = "{}:{}-{}".format(ctg_name, extend_start, extend_end)
+    region = Region.from_string(region_str)
+
+    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn, contig_name=ctg_name, bed_ctg_start=extend_start,
+                                       bed_ctg_end=extend_end)
+
+    chunk_result, all_alt_info_list = pileup_counts_clair3(
+        region, bam=bam_file_path, fasta=fasta_file_path, min_depth=min_coverage,
+        min_snp_af=minimum_snp_af_for_candidate, min_indel_af=minimum_indel_af_for_candidate,
+        min_mq=min_mapping_quality, max_indel_length=max_indel_length,
+        call_snp_only=call_snp_only, max_depth=param.max_depth, gvcf=args.gvcf)
+
+    # slice all candidates tensor according to the alternative information
+    np_pileup_data, all_position_info, all_alt_info = [], [], []
+    for pos, pos_info, alt_info in all_alt_info_list:
+        pos = int(pos)
+        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
+            tree=confident_bed_tree, contig_name=ctg_name, region_start=pos - 1, region_end=pos + 1)
+        pass_vcf_region = not is_known_vcf_file_provided or pos in known_variants_set
+        if not pass_confident_bed or not pass_vcf_region:
+            continue
+        start, end = pos - flanking_base_num, pos + flanking_base_num + 1
+        for result in chunk_result:
+            if start - 1 >= result[1][0][0] and end <= result[1][-1][0]:
+                offset = start - result[1][0][0] - 1
+                tensor = result[0][offset: offset+no_of_positions]
+                # mainly because no coverage in flanking windows
+                if tensor.shape != (no_of_positions, channel_size):
+                    continue
+                # check any empty columns in flanking position, those columns with all zeros
+                if np.sum(np.sum(tensor == 0, axis=1) == channel_size) > 0:
+                    continue
+                np_pileup_data.append(tensor)
+                all_position_info.append(pos_info)
+                all_alt_info.append(alt_info)
+    np_pileup_data = np.array(np_pileup_data, dtype=np.int32)
+
+    return np_pileup_data, all_position_info, all_alt_info
+
+
+def main():
+    """Parse the command line and run `CreateTensorPileup` on the options."""
+    parser = ArgumentParser(description="Generate variant candidate tensors using pileup")
+
+    parser.add_argument('--platform', type=str, default='ont',
+                        help="Sequencing platform of the input. Options: 'ont,hifi,ilmn', default: %(default)s")
+
+    parser.add_argument('--bam_fn', type=str, default="input.bam", required=True,
+                        help="Sorted BAM file input, required")
+
+    parser.add_argument('--ref_fn', type=str, default="ref.fa", required=True,
+                        help="Reference fasta file input, required")
+
+    parser.add_argument('--tensor_can_fn', type=str, default="PIPE",
+                        help="Tensor output, stdout by default, default: %(default)s")
+
+    parser.add_argument('--vcf_fn', type=str, default=None,
+                        help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file,  default: %(default)s")
+
+    parser.add_argument('--min_af', type=float, default=0.08,
+                        help="Minimum allele frequency for both SNP and Indel for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--snp_min_af', type=float, default=0.08,
+                        help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--indel_min_af', type=float, default=0.15,
+                        help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--ctgName', type=str, default=None,
+                        help="The name of sequence to be processed, required if --bed_fn is not defined")
+
+    parser.add_argument('--ctgStart', type=int, default=None,
+                        help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--ctgEnd', type=int, default=None,
+                        help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--bed_fn', type=str, default=None,
+                        help="Call variant only in the provided regions. Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set")
+
+    parser.add_argument('--gvcf', type=str2bool, default=False,
+                        help="Enable GVCF output, default: disabled")
+
+    parser.add_argument('--sampleName', type=str, default="SAMPLE",
+                        help="Define the sample name to be shown in the VCF file, default: %(default)s")
+
+    parser.add_argument('--samtools', type=str, default="samtools",
+                        help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s")
+
+    # options for advanced users
+    parser.add_argument('--fast_mode', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
+
+    parser.add_argument('--minCoverage', type=float, default=2,
+                        help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
+
+    parser.add_argument('--minMQ', type=int, default=param.min_mq,
+                        help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d")
+
+    parser.add_argument('--minBQ', type=int, default=param.min_bq,
+                        help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d")
+
+    parser.add_argument('--max_depth', type=int, default=param.max_depth,
+                        help="EXPERIMENTAL: Maximum pileup depth to be processed. default: %(default)s")
+
+    parser.add_argument('--call_snp_only', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Call candidates pass snp minimum AF only, ignore Indel candidates")
+
+    # options for debug purpose
+    parser.add_argument('--extend_bed', type=str, default=None,
+                        help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp")
+
+    parser.add_argument('--temp_file_dir', type=str, default="./",
+                        help="EXPERIMENTAL: The cache directory for storing temporary non-variant information if --gvcf is enabled, default: %(default)s")
+
+    parser.add_argument('--indel_fn', type=str, default=None,
+                        help="DEBUG: Output all alternative indel cigar for debug purpose")
+
+    parser.add_argument('--base_err', default=param.base_err, type=float,
+                        help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f')
+
+    parser.add_argument('--gq_bin_size', default=param.gq_bin_size, type=int,
+                        help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d')
+
+    parser.add_argument('--bp_resolution', action='store_true',
+                        help="DEBUG: Enable bp resolution for GVCF, default: disabled")
+
+    # options for internal process control
+    ## Path to the 'zstd' compression
+    parser.add_argument('--zstd', type=str, default=param.zstd,
+                        help=SUPPRESS)
+
+    ## Test in specific candidate position. Only for testing
+    parser.add_argument('--test_pos', type=int, default=0,
+                        help=SUPPRESS)
+
+    ## The number of chucks to be divided into for parallel processing
+    parser.add_argument('--chunk_num', type=int, default=None,
+                        help=SUPPRESS)
+
+    ## The chuck ID to work on
+    parser.add_argument('--chunk_id', type=int, default=None,
+                        help=SUPPRESS)
+
+    # bare invocation: print help before parse_args() rejects the missing required options
+    if len(sys.argv[1:]) == 0:
+        parser.print_help()
+        sys.exit(1)
+
+    CreateTensorPileup(parser.parse_args())
+
+
+if __name__ == "__main__":  # run the command line entry point only when executed as a script
+    main()

From a6f7c88df8f5245f7681c89d2110063736be669a Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:04:26 +0800
Subject: [PATCH 05/43] integrate the triton inference server from nvidia, and
 directly use the numpy buffer to get the input matrix from create tensor
 function, reuse all CallVariants function

---
 clair3/CallVariantsFromCffi.py | 335 +++++++++++++++++++++++++++++++++
 1 file changed, 335 insertions(+)
 create mode 100644 clair3/CallVariantsFromCffi.py

diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py
new file mode 100644
index 0000000..8df9d5b
--- /dev/null
+++ b/clair3/CallVariantsFromCffi.py
@@ -0,0 +1,335 @@
+import sys
+import os
+import tensorflow as tf
+import logging
+from time import time
+from argparse import ArgumentParser, SUPPRESS
+
+import tritonclient.grpc as tritongrpcclient
+
+from shared.utils import str2bool, log_error
+from clair3.CallVariants import OutputConfig, output_utilties_from, batch_output
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # NOTE(review): assigned after `import tensorflow` above - confirm it still takes effect
+logging.basicConfig(format='%(message)s', level=logging.INFO)  # message-only log lines at INFO and above
+
+
+def Run(args):
+    """Set up threading, parameters and output helpers, then call variants.
+
+    Sets the OMP/OpenBLAS/MKL/NumExpr thread-count environment variables and
+    TensorFlow's intra/inter-op parallelism to 1, binds the module-level
+    `param` to `shared.param_p` or `shared.param_f` depending on
+    `args.pileup`, and passes the output configuration and output utilities
+    to `call_variants_from_cffi`.
+    """
+    for thread_var in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
+        os.environ[thread_var] = "1"
+    tf.config.threading.set_intra_op_parallelism_threads(1)
+    tf.config.threading.set_inter_op_parallelism_threads(1)
+
+    global test_pos, param
+    test_pos = None
+    if args.pileup:
+        import shared.param_p as param
+    else:
+        import shared.param_f as param
+
+    # pick the variant length limit that matches the long-indel switch
+    length_limit = (param.maximum_variant_length_that_need_infer_include_long_indel
+                    if args.enable_long_indel else param.maximum_variant_length_that_need_infer)
+    call_variants_from_cffi(
+        args=args,
+        output_config=OutputConfig(
+            is_show_reference=args.showRef,
+            is_debug=args.debug,
+            is_haploid_precise_mode_enabled=args.haploid_precise,
+            is_haploid_sensitive_mode_enabled=args.haploid_sensitive,
+            is_output_for_ensemble=args.output_for_ensemble,
+            quality_score_for_pass=args.qual,
+            tensor_fn=args.tensor_fn,
+            input_probabilities=args.input_probabilities,
+            add_indel_length=args.add_indel_length,
+            gvcf=args.gvcf,
+            pileup=args.pileup,
+            enable_long_indel=args.enable_long_indel,
+            maximum_variant_length_that_need_infer=length_limit),
+        output_utilities=output_utilties_from(
+            sample_name=args.sampleName,
+            is_debug=args.debug,
+            is_output_for_ensemble=args.output_for_ensemble,
+            reference_file_path=args.ref_fn,
+            output_file_path=args.call_fn,
+            output_probabilities=args.output_probabilities))
+
+
+def call_variants_from_cffi(args, output_config, output_utilities):
+    """Create tensors through the C library, run the model and write calls.
+
+    With `args.use_gpu` the batches are sent to a Triton inference server at
+    localhost:8001 (model 'pileup' or 'alignment'); otherwise the Clair3
+    model is built locally and weights are loaded from `args.chkpnt_fn`.
+    The output file is removed again when it holds no non-header row.
+
+    :param args: parsed command line options.
+    :param output_config, output_utilities: forwarded to `batch_output`.
+    """
+    use_gpu = args.use_gpu
+    if use_gpu:
+        server_url = 'localhost:8001'
+        try:
+            triton_client = tritongrpcclient.InferenceServerClient(url=server_url, verbose=False)
+        except Exception as e:
+            print("channel creation failed: " + str(e))
+            sys.exit(1)  # non-zero status: a bare sys.exit() would report success
+    else:
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+    global param
+    if args.pileup:
+        import shared.param_p as param
+        if use_gpu:
+            model_name = 'pileup'
+            input_dtype = 'INT32'
+        else:
+            from clair3.model import Clair3_P
+            m = Clair3_P(add_indel_length=args.add_indel_length, predict=True)
+    else:
+        import shared.param_f as param
+        if use_gpu:
+            model_name = 'alignment'
+            input_dtype = 'INT8'
+        else:
+            from clair3.model import Clair3_F
+            m = Clair3_F(add_indel_length=args.add_indel_length, predict=True)
+
+    if not use_gpu:
+        m.load_weights(args.chkpnt_fn)
+    output_utilities.gen_output_file()
+    output_utilities.output_header()
+    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
+    chunk_num = args.chunk_num
+    full_alignment_mode = not args.pileup
+
+    logging.info("Calling variants ...")
+    variant_call_start_time = time()
+    batch_output_method = batch_output
+    total = 0
+
+    if args.pileup:
+        from preprocess.CreateTensorPileupFromCffi import CreateTensorPileup as CT
+    else:
+        from preprocess.CreateTensorFullAlignmentFromCffi import CreateTensorFullAlignment as CT
+
+    tensor, all_position, all_alt_info = CT(args)
+
+    def tensor_generator_from(tensor, all_position, all_alt_info):
+        total_data = len(tensor)
+        assert total_data == len(all_alt_info)
+        assert total_data == len(all_position)
+        batch_size = param.predictBatchSize
+        total_chunk = total_data // batch_size if total_data % batch_size == 0 else total_data // batch_size + 1
+        for chunk_id in range(total_chunk):
+            chunk_start = chunk_id * batch_size
+            chunk_end = (chunk_id + 1) * batch_size if chunk_id < total_chunk - 1 else total_data
+            yield (tensor[chunk_start:chunk_end], all_position[chunk_start:chunk_end], all_alt_info[chunk_start:chunk_end])
+
+    tensor_generator = tensor_generator_from(tensor, all_position, all_alt_info)
+    for (X, position, alt_info_list) in tensor_generator:
+        total += len(X)
+        if use_gpu:
+            inputs = [tritongrpcclient.InferInput('input_1', X.shape, input_dtype)]
+            outputs = [tritongrpcclient.InferRequestedOutput('output_1')]
+            inputs[0].set_data_from_numpy(X)
+            results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
+            Y = results.as_numpy('output_1')
+        else:
+            Y = m.predict_on_batch(X)
+        batch_output_method(position, alt_info_list, Y, output_config, output_utilities)
+
+    if chunk_id is not None:
+        logging.info("Total processed positions in {} (chunk {}/{}) : {}".format(args.ctgName, chunk_id+1, chunk_num, total))
+    elif full_alignment_mode:
+        try:
+            chunk_infos = args.call_fn.split('.')[-2]
+            c_id, c_num = chunk_infos.split('_')
+            c_id = int(c_id) + 1 # 0-index to 1-index
+            logging.info("Total processed positions in {} (chunk {}/{}) : {}".format(args.ctgName, c_id, c_num, total))
+        except Exception:
+            logging.info("Total processed positions in {} : {}".format(args.ctgName, total))
+    else:
+        logging.info("Total processed positions in {} : {}".format(args.ctgName, total))
+
+    if full_alignment_mode and total == 0:
+        logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn)))
+
+    logging.info("Total time elapsed: %.2f s" % (time() - variant_call_start_time))
+    output_utilities.close_opened_files()
+    # remove file if no variant in output
+    if os.path.exists(args.call_fn):
+        with open(args.call_fn, 'r') as call_fp:
+            for row in call_fp:
+                if row[0] != '#':
+                    return
+        logging.info("[INFO] No vcf output for file {}, remove empty file".format(args.call_fn))
+        os.remove(args.call_fn)
+
+
+def main():
+    parser = ArgumentParser(description="Call variants using a trained model and tensors of candidate variants")
+
+    parser.add_argument('--platform', type=str, default="ont",
+                        help="Sequencing platform of the input. Options: 'ont,hifi,ilmn', default: %(default)s")
+
+    parser.add_argument('--tensor_fn', type=str, default="PIPE",
+                        help="Tensor input filename, or stdin if not set")
+
+    parser.add_argument('--chkpnt_fn', type=str, default=None,
+                        help="Input a trained model for variant calling, required")
+
+    parser.add_argument('--call_fn', type=str, default="clair3",
+                        help="VCF output filename, or stdout if not set")
+
+    parser.add_argument('--gvcf', type=str2bool, default=False,
+                        help="Enable GVCF output, default: disabled")
+
+    parser.add_argument('--ref_fn', type=str, default=None,
+                        help="Reference fasta file input, required if --gvcf is enabled")
+
+    parser.add_argument('--ctgName', type=str, default=None,
+                        help="The name of the sequence to be processed")
+
+    parser.add_argument('--ctgStart', type=int, default=None,
+                        help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--ctgEnd', type=int, default=None,
+                        help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set")
+
+    parser.add_argument('--sampleName', type=str, default="SAMPLE",
+                        help="Define the sample name to be shown in the VCF file, optional")
+
+    parser.add_argument('--qual', type=int, default=2,
+                        help="If set, variants with >=$qual will be marked 'PASS', or 'LowQual' otherwise, optional")
+
+    parser.add_argument('--samtools', type=str, default="samtools",
+                        help="Path to the 'samtools', samtools version >= 1.10 is required, default: %(default)s")
+
+    # options for advanced users
+    parser.add_argument('--temp_file_dir', type=str, default='./',
+                        help="EXPERIMENTAL: The cache directory for storing temporary non-variant information if --gvcf is enabled, default: %(default)s")
+
+    parser.add_argument('--haploid_precise', action='store_true',
+                        help="EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant")
+
+    parser.add_argument('--haploid_sensitive', action='store_true',
+                        help="EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant")
+
+    parser.add_argument('--enable_long_indel', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Enable long Indel variants(>50 bp) calling")
+
+    # options for debug purpose
+    parser.add_argument('--use_gpu', type=str2bool, default=False,
+                        help="DEBUG: Use GPU for calling. Speed up is mostly insignficiant. Only use this for building your own pipeline")
+
+    parser.add_argument('--predict_fn', type=str, default=None,
+                        help="DEBUG: Output network output probabilities for further analysis")
+
+    parser.add_argument('--input_probabilities', action='store_true',
+                        help="DEBUG: Use network probability outputs as input and generate variants from them")
+
+    parser.add_argument('--output_probabilities', action='store_true',
+                        help="DEBUG: Output the network probabilities of gt21, genotype, indel_length_1 and indel_length_2")
+
+    # options for internal process control
+    ## In pileup mode or not (full alignment mode), default: False
+    parser.add_argument('--pileup', action='store_true',
+                        help=SUPPRESS)
+
+    ## Include indel length in training and calling, false for pileup and true for raw alignment
+    parser.add_argument('--add_indel_length', action='store_true',
+                        help=SUPPRESS)
+
+    ## The number of chucks to be divided into for parallel processing
+    parser.add_argument('--chunk_num', type=int, default=None,
+                        help=SUPPRESS)
+
+    ## The chuck ID to work on
+    parser.add_argument('--chunk_id', type=int, default=None,
+                        help=SUPPRESS)
+
+    ## Enable debug mode, default: False, optional
+    parser.add_argument('--debug', action='store_true',
+                        help=SUPPRESS)
+
+    ## Generating outputs for ensemble model calling
+    parser.add_argument('--output_for_ensemble', action='store_true',
+                        help=SUPPRESS)
+
+    ## Use bin file from pytables to speed up calling.
+    parser.add_argument('--is_from_tables', action='store_true',
+                        help=SUPPRESS)
+
+    ## Output reference calls
+    parser.add_argument('--showRef', type=str2bool, default=True,
+                        help=SUPPRESS)
+
+    # Pileup create tensor options for pileup calling
+    parser.add_argument('--bam_fn', type=str, default="input.bam", required=True,
+                        help="Sorted BAM file input, required")
+
+    parser.add_argument('--bed_fn', type=str, nargs='?', action="store", default=None,
+                        help="Call variant only in the provided regions. Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set")
+
+    parser.add_argument('--snp_min_af', type=float, default=0.08,
+                        help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--indel_min_af', type=float, default=0.15,
+                        help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f")
+
+    parser.add_argument('--extend_bed', nargs='?', action="store", type=str, default=None,
+                        help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp")
+
+
+    parser.add_argument('--vcf_fn', type=str, default=None,
+                        help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file,  default: %(default)s")
+
+    parser.add_argument('--minCoverage', type=float, default=2,
+                        help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
+
+    parser.add_argument('--minMQ', type=int, default=5,
+                        help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d")
+
+    parser.add_argument('--minBQ', type=int, default=0,
+                        help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d")
+
+    parser.add_argument('--max_depth', type=int, default=144,
+                        help="EXPERIMENTAL: Maximum full alignment depth to be processed. default: %(default)s")
+
+    parser.add_argument('--fast_mode', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
+
+    parser.add_argument('--call_snp_only', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Call candidates pass snp minimum AF only, ignore Indel candidates")
+
+    # Full-alignment create tensor options for full-alignment calling
+    parser.add_argument('--phased_vcf_fn', type=str, default=None,
+                        help="Use heterozygous SNP variants in phased vcf file for haplotaging")
+
+    parser.add_argument('--no_phasing_for_fa', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Call variants without whatshap or longphase phasing in full alignment calling")
+
+    ## Provide the regions to be included in full-alignment based calling
+    parser.add_argument('--full_aln_regions', type=str, default=None,
+                        help=SUPPRESS)
+
+    args = parser.parse_args()
+
+    if len(sys.argv[1:]) == 0:
+        parser.print_help()
+        sys.exit(1)
+
+    Run(args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From d99c654b61cffc018dd683c40ec96a1d0a6a950b Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:05:26 +0800
Subject: [PATCH 06/43] only store the candidate reference base in c implement
 for efficiency

---
 clair3/CallVariants.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clair3/CallVariants.py b/clair3/CallVariants.py
index 56e4f1b..47a7042 100644
--- a/clair3/CallVariants.py
+++ b/clair3/CallVariants.py
@@ -17,6 +17,7 @@
     HETERO_SNP_GT21, HETERO_SNP_LABELS, GT21_LABELS, partial_label_from, mix_two_partial_labels
 )
 import clair3.utils as utils
+import shared.param_p as param
 from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task
 from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning
 from clair3.task.variant_length import VariantLength
@@ -1114,7 +1115,8 @@ def output_with(
     chromosome, position, reference_sequence = chr_pos_seq.rstrip().split(':')
     position = int(position)
 
-    tensor_position_center = param.flankingBaseNum
+    # The C implementation stores only the centered reference base for efficiency, so index 0 is used in that case
+    tensor_position_center = param.flankingBaseNum if len(reference_sequence) > 1 else 0
     information_string = "P" if output_config.pileup else 'F'
 
     if type(alt_info) == np.memmap:

From 603c94b411f7339f72f58220a9ad95b4dbeaabe9 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:06:47 +0800
Subject: [PATCH 07/43] add three new submodules

---
 clair3.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/clair3.py b/clair3.py
index b8a2a4c..8b46e9e 100644
--- a/clair3.py
+++ b/clair3.py
@@ -10,6 +10,7 @@
     "CallVarBam",
     "CallVariants",
     "Train",
+    "CallVariantsFromCffi"
 ]
 
 data_preprocess_folder = [
@@ -27,7 +28,9 @@
     'UnifyRepresentation',
     'CheckEnvs',
     'SortVcf',
-    'SelectQual'
+    'SelectQual',
+    "CreateTensorPileupFromCffi",
+    "CreateTensorFullAlignmentFromCffi",
 ]
 
 post_process_scripts_folder = [

From 9fb0ddf27883c6200365f39c8ebea21babc3ad0b Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:07:40 +0800
Subject: [PATCH 08/43] add longphase, use_gpu and enable_c_impl options

---
 run_clair3.sh | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)
 mode change 100755 => 100644 run_clair3.sh

diff --git a/run_clair3.sh b/run_clair3.sh
old mode 100755
new mode 100644
index 3acad08..b00fb47
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -51,6 +51,10 @@ print_help_messages()
     echo $'      --no_phasing_for_fa       EXPERIMENTAL: Call variants without whatshap phasing in full alignment calling, default: disable.'
     echo $'      --call_snp_only           EXPERIMENTAL: Call candidates pass SNP minimum AF only, ignore Indel candidates, default: disable.'
     echo $'      --enable_long_indel       EXPERIMENTAL: Call long Indel variants(>50 bp), default: disable.'
+    echo $'      --use_gpu                 Use GPU for calling, default: disable.'
+    echo $'      --longphase_for_phasing   Use longphase for phasing, default: disable.'
+    echo $'      --longphase               Path of longphase, longphase >= 1.0 is required.'
+    echo $'      --enable_c_impl           Use C implement with cffi for pileup and full-alignment create tensor, default: disable.'
     echo $''
 }
 
@@ -66,9 +70,9 @@ NC="\\033[0m"
 
 ARGS=`getopt -o b:f:t:m:p:o:hv \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
-bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,\
+bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\
 snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\
-remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,help,version -n 'run_clair3.sh' -- "$@"`
+remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,enable_c_impl,help,version -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
 eval set -- "${ARGS}"
@@ -83,6 +87,7 @@ PYPY="pypy3"
 PYTHON='python3'
 PARALLEL='parallel'
 WHATSHAP='whatshap'
+LONGPHASE='longphase'
 CHUNK_NUM=0
 CHUNK_SIZE=5000000
 QUAL=2
@@ -93,8 +98,8 @@ GVCF=False
 PILEUP_ONLY=False
 FAST_MODE=False
 SHOW_REF=False
-SNP_AF="0"
-INDEL_AF="0"
+SNP_AF="0.08"
+INDEL_AF="0.15"
 HAP_PRE=False
 HAP_SEN=False
 SNP_ONLY=False
@@ -103,6 +108,9 @@ NO_PHASING=False
 RM_TMP_DIR=False
 ENABLE_PHASING=False
 ENABLE_LONG_INDEL=False
+USE_GPU=False
+USE_LONGPHASE=False
+ENABLE_C_IMPL=False
 PILEUP_PREFIX="pileup"
 FA_PREFIX="full_alignment"
 
@@ -126,6 +134,7 @@ while true; do
     --pypy ) PYPY="$2"; shift 2 ;;
     --parallel ) PARALLEL="$2"; shift 2 ;;
     --whatshap ) WHATSHAP="$2"; shift 2 ;;
+    --longphase ) LONGPHASE="$2"; shift 2 ;;
     --var_pct_full ) PRO="$2"; shift 2 ;;
     --ref_pct_full ) REF_PRO="$2"; shift 2 ;;
     --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;;
@@ -145,6 +154,9 @@ while true; do
     --remove_intermediate_dir ) RM_TMP_DIR=True; shift 1 ;;
     --enable_phasing ) ENABLE_PHASING=True; shift 1 ;;
     --enable_long_indel ) ENABLE_LONG_INDEL=True; shift 1 ;;
+    --use_gpu ) USE_GPU=True; shift 1 ;;
+    --longphase_for_phasing ) USE_LONGPHASE=True; shift 1 ;;
+    --enable_c_impl ) ENABLE_C_IMPL=True; shift 1 ;;
 
     -- ) shift; break; ;;
     -h|--help ) print_help_messages; exit 0 ;;
@@ -195,7 +207,7 @@ if [ "${PLATFORM}" != "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.3; fi
 
 # show default high quality hete variant proportion for whatshap phasing, 0.8 for ont guppy5 and 0.7 for others
 if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi
-BASE_MODEL=$(basename ${MODEL_PATH})C
+BASE_MODEL=$(basename ${MODEL_PATH})
 if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi
 
 # remove the last '/' character in directory input
@@ -220,6 +232,7 @@ echo "[INFO] PYTHON PATH: ${PYTHON}"
 echo "[INFO] PYPY PATH: ${PYPY}"
 echo "[INFO] PARALLEL PATH: ${PARALLEL}"
 echo "[INFO] WHATSHAP PATH: ${WHATSHAP}"
+echo "[INFO] LONGPHASE PATH: ${LONGPHASE}"
 echo "[INFO] CHUNK SIZE: ${CHUNK_SIZE}"
 if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi
 echo "[INFO] FULL ALIGN PROPORTION: ${PRO}"
@@ -239,6 +252,9 @@ echo "[INFO] ENABLE NO PHASING FOR FULL ALIGNMENT: ${NO_PHASING}"
 echo "[INFO] ENABLE REMOVING INTERMEDIATE FILES: ${RM_TMP_DIR}"
 echo "[INFO] ENABLE PHASING VCF OUTPUT: ${ENABLE_PHASING}"
 echo "[INFO] ENABLE LONG INDEL CALLING: ${ENABLE_LONG_INDEL}"
+echo "[INFO] ENABLE GPU CALLING: ${USE_GPU}"
+echo "[INFO] ENABLE LONGPHASE_FOR_PHASING: ${USE_LONGPHASE}"
+echo "[INFO] ENABLE C_IMPLEMENT: ${ENABLE_C_IMPL}"
 echo $''
 
 # file check
@@ -290,9 +306,11 @@ if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' inste
 if [ ! -f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi
 if [ ! -f ${MODEL_PATH}/${FA_PREFIX}.index ]; then echo -e "${ERROR} No full-alignment model found in provided model path and model prefix ${MODEL_PATH}/${FA_PREFIX} ${NC}"; exit 1; fi
 
+CLAIR3_SCRIPT="clair3.sh"
+if [ "${ENABLE_C_IMPL}" == True ] && [ ! ${PLATFORM} = "ilmn" ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi
 
 set -x
-${SCRIPT_PATH}/scripts/clair3.sh \
+${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \
     --bam_fn ${BAM_FILE_PATH} \
     --ref_fn ${REFERENCE_FILE_PATH} \
     --threads ${THREADS} \
@@ -329,7 +347,10 @@ ${SCRIPT_PATH}/scripts/clair3.sh \
     --fa_model_prefix=${FA_PREFIX} \
     --remove_intermediate_dir=${RM_TMP_DIR} \
     --enable_phasing=${ENABLE_PHASING} \
-    --enable_long_indel=${ENABLE_LONG_INDEL}
+    --enable_long_indel=${ENABLE_LONG_INDEL} \
+    --use_gpu=${USE_GPU} \
+    --longphase_for_phasing=${USE_LONGPHASE} \
+    --longphase=${LONGPHASE}
 
 
 )) |& tee ${OUTPUT_FOLDER}/run_clair3.log
\ No newline at end of file

From 367d873981e5b58f379270489dec8048452af309 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:09:03 +0800
Subject: [PATCH 09/43] clair3 c implement script, directly use
 CallVariantsFromCffi submodule for pileup and full-alignment calling

---
 scripts/clair3_c_impl.sh | 334 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 scripts/clair3_c_impl.sh

diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
new file mode 100644
index 0000000..0f0798f
--- /dev/null
+++ b/scripts/clair3_c_impl.sh
@@ -0,0 +1,334 @@
+#!/bin/bash
+SCRIPT_NAME=$(basename "$0")
+Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]"
+# INFO: whole calling workflow of clair3
+
+set -e
+ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
+-l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
+bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
+snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
+no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
+
+if [ $? != 0 ] ; then echo "No input. Terminating...">&2 ; exit 1 ; fi
+eval set -- "${ARGS}"
+
+while true; do
+   case "$1" in
+    -b|--bam_fn ) BAM_FILE_PATH="$2"; shift 2 ;;
+    -f|--ref_fn ) REFERENCE_FILE_PATH="$2"; shift 2 ;;
+    -t|--threads ) THREADS="$2"; shift 2 ;;
+    -m|--model_path ) MODEL_PATH="$2"; shift 2 ;;
+    -p|--platform ) PLATFORM="$2"; shift 2 ;;
+    -o|--output ) OUTPUT_FOLDER="$2"; shift 2 ;;
+    --bed_fn ) BED_FILE_PATH="$2"; shift 2 ;;
+    --vcf_fn ) VCF_FILE_PATH="$2"; shift 2 ;;
+    --ctg_name ) CONTIGS="$2"; shift 2 ;;
+    --sample_name ) SAMPLE="$2"; shift 2 ;;
+    --chunk_num ) CHUNK_NUM="$2"; shift 2 ;;
+    --chunk_size ) CHUNK_SIZE="$2"; shift 2 ;;
+    --qual ) QUAL="$2"; shift 2 ;;
+    --samtools ) SAMTOOLS="$2"; shift 2 ;;
+    --python ) PYTHON="$2"; shift 2 ;;
+    --pypy ) PYPY="$2"; shift 2 ;;
+    --parallel ) PARALLEL="$2"; shift 2 ;;
+    --whatshap ) WHATSHAP="$2"; shift 2 ;;
+    --longphase ) LONGPHASE="$2"; shift 2 ;;
+    --var_pct_full ) PRO="$2"; shift 2 ;;
+    --ref_pct_full ) REF_PRO="$2"; shift 2 ;;
+    --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;;
+    --pileup_only ) PILEUP_ONLY="$2"; shift 2 ;;
+    --fast_mode ) FAST_MODE="$2"; shift 2 ;;
+    --call_snp_only ) SNP_ONLY="$2"; shift 2 ;;
+    --print_ref_calls ) SHOW_REF="$2"; shift 2 ;;
+    --gvcf ) GVCF="$2"; shift 2 ;;
+    --snp_min_af ) SNP_AF="$2"; shift 2 ;;
+    --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
+    --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
+    --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
+    --haploid_precise ) HAP_PRE="$2"; shift 2 ;;
+    --haploid_sensitive ) HAP_SEN="$2"; shift 2 ;;
+    --include_all_ctgs ) INCLUDE_ALL_CTGS="$2"; shift 2 ;;
+    --no_phasing_for_fa ) NO_PHASING="$2"; shift 2 ;;
+    --remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;;
+    --enable_phasing ) ENABLE_PHASING="$2"; shift 2 ;;
+    --enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;;
+    --use_gpu ) USE_GPU="$2"; shift 2 ;;
+    --longphase_for_phasing ) USE_LONGPHASE="$2"; shift 2 ;;
+
+    -- ) shift; break; ;;
+    -h|--help ) echo "${Usage}"; exit 0 ;;
+    * ) echo "${Usage}" >&2; exit 1 ;;
+   esac
+done
+
+
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+CLAIR3="${SHELL_FOLDER}/../clair3.py"
+
+if [ "${BED_FILE_PATH}" = "EMPTY" ] ; then BED_FILE_PATH= ; fi
+RETRIES=4
+
+PILEUP_CHECKPOINT_PATH="${MODEL_PATH}/${PILEUP_PREFIX}"
+FULL_ALIGNMENT_CHECKPOINT_PATH="${MODEL_PATH}/${FA_PREFIX}"
+LOG_PATH="${OUTPUT_FOLDER}/log"
+TMP_FILE_PATH="${OUTPUT_FOLDER}/tmp"
+SPLIT_BED_PATH="${TMP_FILE_PATH}/split_beds"
+PILEUP_VCF_PATH="${TMP_FILE_PATH}/pileup_output"
+GVCF_TMP_PATH="${TMP_FILE_PATH}/gvcf_tmp_output"
+PHASE_OUTPUT_PATH="${TMP_FILE_PATH}/phase_output"
+FULL_ALIGNMENT_OUTPUT_PATH="${TMP_FILE_PATH}/full_alignment_output"
+PHASE_VCF_PATH="${PHASE_OUTPUT_PATH}/phase_vcf"
+PHASE_BAM_PATH="${PHASE_OUTPUT_PATH}/phase_bam"
+CANDIDATE_BED_PATH="${FULL_ALIGNMENT_OUTPUT_PATH}/candidate_bed"
+export OPENBLAS_NUM_THREADS=1
+export GOTO_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+echo "[INFO] Check environment variables"
+${PYTHON} ${CLAIR3} CheckEnvs \
+    --bam_fn ${BAM_FILE_PATH} \
+    --bed_fn ${BED_FILE_PATH} \
+    --output_fn_prefix ${OUTPUT_FOLDER} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --vcf_fn ${VCF_FILE_PATH} \
+    --ctg_name ${CONTIGS} \
+    --chunk_num ${CHUNK_NUM} \
+    --chunk_size ${CHUNK_SIZE} \
+    --include_all_ctgs ${INCLUDE_ALL_CTGS} \
+    --threads ${THREADS} \
+    --python ${PYTHON} \
+    --pypy ${PYPY} \
+    --samtools ${SAMTOOLS} \
+    --whatshap ${WHATSHAP} \
+    --parallel ${PARALLEL} \
+    --qual ${QUAL} \
+    --sampleName ${SAMPLE} \
+    --var_pct_full ${PRO} \
+    --ref_pct_full ${REF_PRO} \
+    --snp_min_af ${SNP_AF} \
+    --indel_min_af ${INDEL_AF}
+readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi
+THREADS_LOW=$((${THREADS}*3/4))
+LONGPHASE_THREADS=$((${THREADS}*1/2))
+if [[ ${THREADS_LOW} -lt 1 ]]; then THREADS_LOW=1; fi
+if [[ ${LONGPHASE_THREADS} -lt 1 ]]; then LONGPHASE_THREADS=1; fi
+
+cd ${OUTPUT_FOLDER}
+# Pileup calling
+#-----------------------------------------------------------------------------------------------------------------------
+echo "[INFO] 1/7 Call variants using pileup model"
+time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_call_var_bam_pileup.log -j ${THREADS_LOW} \
+"${PYTHON} ${CLAIR3} CallVariantsFromCffi \
+    --chkpnt_fn ${PILEUP_CHECKPOINT_PATH} \
+    --bam_fn ${BAM_FILE_PATH} \
+    --call_fn ${PILEUP_VCF_PATH}/pileup_{1}_{2}.vcf \
+    --sampleName ${SAMPLE} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --extend_bed ${SPLIT_BED_PATH}/{1} \
+    --bed_fn ${BED_FILE_PATH} \
+    --vcf_fn ${VCF_FILE_PATH} \
+    --ctgName {1} \
+    --chunk_id {2} \
+    --chunk_num {3} \
+    --platform ${PLATFORM} \
+    --fast_mode ${FAST_MODE} \
+    --snp_min_af ${SNP_AF} \
+    --indel_min_af ${INDEL_AF} \
+    --call_snp_only ${SNP_ONLY} \
+    --gvcf ${GVCF} \
+    --enable_long_indel ${ENABLE_LONG_INDEL} \
+    --samtools ${SAMTOOLS} \
+    --temp_file_dir ${GVCF_TMP_PATH} \
+    --pileup \
+    --use_gpu ${USE_GPU}" :::: ${OUTPUT_FOLDER}/tmp/CHUNK_LIST |& tee ${LOG_PATH}/1_call_var_bam_pileup.log
+
+${PYPY} ${CLAIR3} SortVcf \
+    --input_dir ${PILEUP_VCF_PATH} \
+    --vcf_fn_prefix "pileup" \
+    --output_fn ${OUTPUT_FOLDER}/pileup.vcf \
+    --sampleName ${SAMPLE} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+
+if [ "$( gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in pileup variant calling"; exit 0; fi
+if [ ${PILEUP_ONLY} == True ]; then
+    if [ ${RM_TMP_DIR} == True ]; then echo "[INFO] Removing intermediate files in ${OUTPUT_FOLDER}/tmp"; rm -rf ${OUTPUT_FOLDER}/tmp; fi
+    echo "[INFO] Only call pileup output with --pileup_only, output file: ${OUTPUT_FOLDER}/pileup.vcf.gz"
+    echo "[INFO] Finish calling!"
+    exit 0;
+fi
+
+# WhatsHap / LongPhase phasing and haplotagging
+#-----------------------------------------------------------------------------------------------------------------------
+if [ ${NO_PHASING} == True ]
+then
+    echo "[INFO] 2/7 No phasing for full alignment calling"
+    ${PARALLEL} -j${THREADS} ln -sf ${BAM_FILE_PATH} ${PHASE_BAM_PATH}/{1}.bam ::: ${CHR[@]}
+    if [ -f ${BAM_FILE_PATH}.bai ]; then ${PARALLEL} --retries ${RETRIES} -j${THREADS} ln -sf ${BAM_FILE_PATH}.bai ${PHASE_BAM_PATH}/{1}.bam.bai ::: ${CHR[@]}; fi
+    if [ -f ${BAM_FILE_PATH%.*}.bai ]; then ${PARALLEL} --retries ${RETRIES} -j${THREADS} ln -sf ${BAM_FILE_PATH%.*}.bai ${PHASE_BAM_PATH}/{1}.bam.bai ::: ${CHR[@]}; fi
+else
+    echo $''
+    echo "[INFO] 2/7 Select heterozygous SNP variants for Whatshap phasing and haplotagging"
+    gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | ${PYPY} ${CLAIR3} SelectQual --phase --output_fn ${PHASE_VCF_PATH} --var_pct_phasing ${PHASING_PCT}
+    time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_2_select_hetero_snp.log -j${THREADS} \
+    "${PYPY} ${CLAIR3} SelectHetSnp \
+        --vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \
+        --split_folder ${PHASE_VCF_PATH} \
+        --ctgName {1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} |& tee ${LOG_PATH}/2_select_hetero_snp.log
+
+    echo $''
+    if [ ${USE_LONGPHASE} == True ]
+    then
+        echo "[INFO] 3/7 Phase VCF file using LongPhase"
+        time ${PARALLEL}  --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \
+        "${LONGPHASE} phase\
+            -s  ${PHASE_VCF_PATH}/{1}.vcf \
+            -b ${BAM_FILE_PATH} \
+            -r ${REFERENCE_FILE_PATH} \
+            -t ${LONGPHASE_THREADS} \
+            -o ${PHASE_VCF_PATH}/phased_{1} \
+            --ont" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+        ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]}
+    else
+        echo "[INFO] 3/7 Phase VCF file using Whatshap"
+        time ${PARALLEL}  --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \
+        "${WHATSHAP} phase \
+            --output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
+            --reference ${REFERENCE_FILE_PATH} \
+            --chromosome {1} \
+            --distrust-genotypes \
+            --ignore-read-groups \
+            ${PHASE_VCF_PATH}/{1}.vcf \
+            ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+    fi
+    ${PARALLEL} -j${THREADS} tabix -f -p vcf ${PHASE_VCF_PATH}/phased_{}.vcf.gz ::: ${CHR[@]}
+
+fi
+
+# Full alignment calling
+#-----------------------------------------------------------------------------------------------------------------------
+echo $''
+echo "[INFO] 5/7 Select candidates for full-alignment calling"
+gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | ${PYPY} ${CLAIR3} SelectQual --output_fn ${CANDIDATE_BED_PATH} \
+--var_pct_full ${PRO} --ref_pct_full ${REF_PRO} --platform ${PLATFORM} --vcf_fn ${VCF_FILE_PATH}
+time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_5_select_candidate.log -j${THREADS} \
+"${PYPY} ${CLAIR3} SelectCandidates \
+    --pileup_vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \
+    --split_folder ${CANDIDATE_BED_PATH} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --var_pct_full ${PRO} \
+    --ref_pct_full ${REF_PRO} \
+    --platform ${PLATFORM} \
+    --ctgName {1}" ::: ${CHR[@]}  |& tee ${LOG_PATH}/5_select_candidate.log
+
+echo $''
+echo "[INFO] 6/7 Call low-quality variants using full-alignment model"
+cat ${CANDIDATE_BED_PATH}/FULL_ALN_FILE_* > ${CANDIDATE_BED_PATH}/FULL_ALN_FILES
+time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_bam_full_alignment.log -j ${THREADS_LOW} \
+"${PYTHON} ${CLAIR3} CallVariantsFromCffi \
+    --chkpnt_fn ${FULL_ALIGNMENT_CHECKPOINT_PATH} \
+    --bam_fn ${BAM_FILE_PATH} \
+    --call_fn ${FULL_ALIGNMENT_OUTPUT_PATH}/full_alignment_{1/}.vcf \
+    --sampleName ${SAMPLE} \
+    --vcf_fn ${VCF_FILE_PATH} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --full_aln_regions {1} \
+    --ctgName {1/.} \
+    --add_indel_length \
+    --no_phasing_for_fa ${NO_PHASING} \
+    --phased_vcf_fn ${PHASE_VCF_PATH}/phased_{/.}.vcf.gz \
+    --gvcf ${GVCF} \
+    --enable_long_indel ${ENABLE_LONG_INDEL} \
+    --samtools ${SAMTOOLS} \
+    --use_gpu ${USE_GPU} \
+    --platform ${PLATFORM}" :::: ${CANDIDATE_BED_PATH}/FULL_ALN_FILES |& tee ${LOG_PATH}/6_call_var_bam_full_alignment.log
+
+${PYPY} ${CLAIR3} SortVcf \
+    --input_dir ${FULL_ALIGNMENT_OUTPUT_PATH} \
+    --vcf_fn_prefix "full_alignment" \
+    --output_fn ${OUTPUT_FOLDER}/full_alignment.vcf \
+    --sampleName ${SAMPLE} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+
+if [ "$( gzip -fdc ${OUTPUT_FOLDER}/full_alignment.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in full-alignment variant calling"; exit 0; fi
+# When GVCF output is enabled, run SortVcf over the temporary *.tmp.gvcf files to produce non_var.gvcf
+if [ ${GVCF} == True ]
+then
+    ${PYPY} ${CLAIR3} SortVcf \
+        --input_dir ${GVCF_TMP_PATH} \
+        --vcf_fn_suffix ".tmp.gvcf" \
+        --output_fn ${GVCF_TMP_PATH}/non_var.gvcf \
+        --ref_fn ${REFERENCE_FILE_PATH} \
+        --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+fi
+
+##Merge pileup and full alignment vcf
+##-----------------------------------------------------------------------------------------------------------------------
+echo $''
+echo "[INFO] 7/7 Merge pileup VCF and full-alignment VCF"
+time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_7_merge_vcf.log -j${THREADS} \
+"${PYPY} ${CLAIR3} MergeVcf \
+    --pileup_vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \
+    --bed_fn_prefix ${CANDIDATE_BED_PATH} \
+    --full_alignment_vcf_fn ${OUTPUT_FOLDER}/full_alignment.vcf.gz \
+    --output_fn ${TMP_FILE_PATH}/merge_output/merge_{1}.vcf \
+    --platform ${PLATFORM} \
+    --print_ref_calls ${SHOW_REF} \
+    --gvcf ${GVCF} \
+    --haploid_precise ${HAP_PRE} \
+    --haploid_sensitive ${HAP_SEN} \
+    --gvcf_fn ${TMP_FILE_PATH}/merge_output/merge_{1}.gvcf \
+    --non_var_gvcf_fn ${GVCF_TMP_PATH}/non_var.gvcf \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --ctgName {1}" ::: ${CHR[@]} |& tee ${LOG_PATH}/7_merge_vcf.log
+
+${PYPY} ${CLAIR3} SortVcf \
+    --input_dir ${TMP_FILE_PATH}/merge_output \
+    --vcf_fn_prefix "merge" \
+    --output_fn ${OUTPUT_FOLDER}/merge_output.vcf \
+    --sampleName ${SAMPLE} \
+    --ref_fn ${REFERENCE_FILE_PATH} \
+    --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+
+if [ "$( gzip -fdc ${OUTPUT_FOLDER}/merge_output.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in variant merging"; exit 0; fi
+if [ ${GVCF} == True ]
+then
+    ${PYPY} ${CLAIR3} SortVcf \
+        --input_dir ${TMP_FILE_PATH}/merge_output \
+        --vcf_fn_prefix "merge" \
+        --vcf_fn_suffix ".gvcf" \
+        --output_fn ${OUTPUT_FOLDER}/merge_output.gvcf \
+        --sampleName ${SAMPLE} \
+        --ref_fn ${REFERENCE_FILE_PATH} \
+        --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+fi
+
+if [ ${ENABLE_PHASING} == True ]
+then
+    echo "[INFO] 7/7 Phasing VCF output in parallel using WhatsHap"
+    time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_8_phase_vcf_output.log -j${THREADS} \
+    "${WHATSHAP} phase \
+        --output ${TMP_FILE_PATH}/merge_output/phased_merge_{1}.vcf \
+        --reference ${REFERENCE_FILE_PATH} \
+        --ignore-read-groups \
+        ${TMP_FILE_PATH}/merge_output/merge_{1}.vcf \
+        ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/8_phase_vcf_output.log
+
+    ${PYPY} ${CLAIR3} SortVcf \
+        --input_dir ${TMP_FILE_PATH}/merge_output \
+        --vcf_fn_prefix "phased_merge" \
+        --output_fn ${OUTPUT_FOLDER}/phased_merge_output.vcf \
+        --sampleName ${SAMPLE} \
+        --ref_fn ${REFERENCE_FILE_PATH} \
+        --contigs_fn ${TMP_FILE_PATH}/CONTIGS
+fi
+
+if [ ${RM_TMP_DIR} == True ]; then echo "[INFO] Removing intermediate files in ${OUTPUT_FOLDER}/tmp"; rm -rf ${OUTPUT_FOLDER}/tmp; fi
+
+echo $''
+echo "[INFO] Finish calling, output file: ${OUTPUT_FOLDER}/merge_output.vcf.gz"
+
+if [ ${ENABLE_PHASING} == True ]; then echo "[INFO] Finish calling, phased output file: ${OUTPUT_FOLDER}/phased_merge_output.vcf.gz"; fi
\ No newline at end of file

From 6872b0abba579e0f7ddd21a774354e8c061f4c6a Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Tue, 15 Mar 2022 13:09:43 +0800
Subject: [PATCH 10/43] update clair3 options to be consistent with main entry

---
 scripts/clair3.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 35dc2a1..57ac44f 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -8,7 +8,7 @@ ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
 snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
-no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel:: -n 'run_clair3.sh' -- "$@"`
+no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
 eval set -- "${ARGS}"
@@ -33,6 +33,7 @@ while true; do
     --pypy ) PYPY="$2"; shift 2 ;;
     --parallel ) PARALLEL="$2"; shift 2 ;;
     --whatshap ) WHATSHAP="$2"; shift 2 ;;
+    --longphase ) LONGPHASE="$2"; shift 2 ;;
     --var_pct_full ) PRO="$2"; shift 2 ;;
     --ref_pct_full ) REF_PRO="$2"; shift 2 ;;
     --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;;
@@ -52,6 +53,8 @@ while true; do
     --remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;;
     --enable_phasing ) ENABLE_PHASING="$2"; shift 2 ;;
     --enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;;
+    --use_gpu ) USE_GPU="$2"; shift 2 ;;
+    --longphase_for_phasing ) USE_LONGPHASE="$2"; shift 2 ;;
 
     -- ) shift; break; ;;
     -h|--help ) print_help_messages; break ;;

From 8f5b8e19b09d09fc76857984dd2df912667c5f25 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 11:10:29 +0800
Subject: [PATCH 11/43] fix the vcf_fn in full-alignment calling

---
 preprocess/CreateTensorFullAlignmentFromCffi.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py
index 3a1ba8c..0643bc1 100644
--- a/preprocess/CreateTensorFullAlignmentFromCffi.py
+++ b/preprocess/CreateTensorFullAlignmentFromCffi.py
@@ -28,10 +28,6 @@ def CreateTensorFullAlignment(args):
     platform = args.platform
     phased_vcf_fn = args.phased_vcf_fn
 
-    vcf_fn = file_path_from(args.vcf_fn)
-    is_known_vcf_file_provided = vcf_fn is not None
-    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
-    chunk_num = args.chunk_num
     extend_bed = file_path_from(args.extend_bed)
     is_extend_bed_file_given = extend_bed is not None
     confident_bed_fn = file_path_from(args.bed_fn)
@@ -74,10 +70,6 @@ def CreateTensorFullAlignment(args):
         candidate_file_path_output.close()
         candidate_file_path_process.wait()
 
-    if is_known_vcf_file_provided:
-        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
-        candidates_set = set(known_variants_list)
-
     variant_list = []
     if need_haplotagging and phased_vcf_fn and os.path.exists(phased_vcf_fn):
         # if need_haplotagging option enables, scan the phased vcf file and store the heterozygous SNP candidates from each phase set

From ec4e98a6091995255883e1e1f434a3908c917809 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 11:11:27 +0800
Subject: [PATCH 12/43] integrate need_haplotagging option to c implement

---
 preprocess/CreateTensorFullAlignmentFromCffi.py | 6 ++++--
 src/clair3_full_alignment.c                     | 3 +--
 src/clair3_full_alignment.h                     | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py
index 0643bc1..043afcf 100644
--- a/preprocess/CreateTensorFullAlignmentFromCffi.py
+++ b/preprocess/CreateTensorFullAlignmentFromCffi.py
@@ -96,7 +96,9 @@ def CreateTensorFullAlignment(args):
 
         variant_num = len(variant_list)
         Variants = libclair3.ffi.new("struct Variant *[]", variant_list)
-
+    else:
+        Variants = libclair3.ffi.new("struct Variant *[]", 1)
+        variant_num = 0
 
     # 1-index to 0-index
     candidates_list = sorted(list(set([item-1 for item in candidates_set if item >= ctg_start and item <= ctg_end])))
@@ -107,7 +109,7 @@ def CreateTensorFullAlignment(args):
     candidates = libclair3.ffi.new("size_t [{}]".format(candidate_num), candidates_list)
 
     fa_data = libclair3.lib.calculate_clair3_full_alignment(region_str, bam_file_path.encode(), fasta_file_path.encode(),
-                                                      Variants, variant_num, candidates, candidate_num)
+                                                      Variants, variant_num, candidates, candidate_num, need_haplotagging)
 
     # use np buffer to get the matrix
     matrix_depth = param.matrix_depth_dict[platform]
diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c
index 06f6d07..eafae03 100644
--- a/src/clair3_full_alignment.c
+++ b/src/clair3_full_alignment.c
@@ -375,10 +375,9 @@ size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t cand
     return overlap_num;
 }
 
-fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num)
+fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num, bool need_haplotagging)
 {
 
-    bool need_haplotagging = true;
     int start, end;
     char *chr = xalloc(strlen(region) + 1, sizeof(char), "chr");
     strcpy(chr, region);
diff --git a/src/clair3_full_alignment.h b/src/clair3_full_alignment.h
index e7485fb..16bec37 100644
--- a/src/clair3_full_alignment.h
+++ b/src/clair3_full_alignment.h
@@ -252,6 +252,6 @@ int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_
  *  The return value can be freed with destroy_fa_data
  *
  */
-fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num);
+fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num, bool need_haplotagging);
 
 #endif

From 8c6c16f08d1b9b878ddfd0f4d25e4cf9d7017c6e Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 11:12:30 +0800
Subject: [PATCH 13/43] add samtools and longphase compile step and cffi
 builder in installation

---
 build.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 build.py

diff --git a/build.py b/build.py
new file mode 100644
index 0000000..022b1ae
--- /dev/null
+++ b/build.py
@@ -0,0 +1,97 @@
+import itertools
+import os
+import platform
+from subprocess import run
+from cffi import FFI
+
+
+samver = "1.10"
+longphase_version = "1.0"
+file_directory = os.path.dirname(os.path.realpath(__file__))
+def compile_samtools_package():
+    """Download and build samtools/htslib unless libhts.a already exists beside this script; a failing step raises."""
+    if not os.path.exists(os.path.join(file_directory, 'libhts.a')):
+        samtools_source = "samtools-{}.tar.bz2 https://github.com/samtools/samtools/releases/download/{}/samtools-{}.tar.bz2".format(samver, samver, samver)
+        run("curl -L -o {}".format(samtools_source), shell=True, check=True)
+        run("tar -xjf samtools-{}.tar.bz2".format(samver), shell=True, check=True)
+        run("rm samtools-{}.tar.bz2".format(samver), shell=True, check=True)
+        run("cd samtools-{} && autoheader && autoconf -Wno-syntax && CFLAGS='-fpic -O3' ./configure && make".format(samver), shell=True, check=True)
+        run("cp samtools-{}/htslib-{}/libhts.a {}".format(samver, samver, file_directory), shell=True, check=True)
+
+
+def compile_longphase_package():  # builds the longphase binary once; check=True makes any failing step raise
+    if not os.path.exists(os.path.join(file_directory, 'longphase')):
+        longphase_source = "https://github.com/twolinin/longphase/archive/refs/tags/v{}.tar.gz".format(longphase_version)
+        run("wget {}".format(longphase_source), shell=True, check=True)
+        run("tar -zxvf v{}.tar.gz".format(longphase_version), shell=True, check=True)
+        run("rm v{}.tar.gz".format(longphase_version), shell=True, check=True)
+        run("cd longphase-{} && autoreconf -i && ./configure && make -j4".format(longphase_version), shell=True, check=True)
+        run("mv longphase-{}/longphase {}".format(longphase_version, file_directory), shell=True, check=True)
+        run("rm -r longphase-{}".format(longphase_version), shell=True, check=True)
+
+def clean_samtools_package():
+    # once the cffi module is built and libhts.a is in place, remove the unpacked samtools/htslib source tree
+    if os.path.exists(os.path.join(file_directory, 'libhts.a')):
+        run("rm -rf samtools-{}".format(samver), shell=True, check=True)
+
+htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))
+
+libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
+library_dirs=[htslib_dir]
+src_dir=os.path.join(file_directory, 'src')
+
+extra_compile_args = ['-std=c99', '-O3']
+if platform.machine() in {"aarch64", "arm64"}:
+    if platform.system() == "Darwin":
+        pass
+    else:
+        extra_compile_args.append("-march=armv8-a+simd")
+else:
+    extra_compile_args.append("-mtune=haswell")
+
+ffibuilder = FFI()
+ffibuilder.set_source("libclair3",
+    r"""
+    #include "kvec.h"
+    #include "khash.h"
+    #include "levenshtein.h"
+    #include "medaka_bamiter.h"
+    #include "medaka_common.h"
+    #include "medaka_khcounter.h"
+    #include "clair3_pileup.h"
+    #include "clair3_full_alignment.h"
+    """,
+    libraries=libraries,
+    library_dirs=library_dirs,
+    include_dirs=[src_dir, htslib_dir],
+    sources=[
+        os.path.join(src_dir, x) for x in (
+            'levenshtein.c',
+            'medaka_bamiter.c',
+            'medaka_common.c',
+            'medaka_khcounter.c',
+            'clair3_pileup.c',
+            'clair3_full_alignment.c')],
+    extra_compile_args=extra_compile_args,
+    extra_objects=['libhts.a']
+)
+
+cdef = [
+    "typedef struct { ...; } bam_fset;",
+    "bam_fset* create_bam_fset(char* fname);",
+    "void destroy_bam_fset(bam_fset* fset);",
+]
+for header in ('clair3_pileup.h', 'clair3_full_alignment.h'):
+    with open(os.path.join(src_dir, header), 'r') as fh:
+        # drop preprocessor directive lines (those starting with '#') before the header text is passed to cdef()
+        lines = ''.join(x for x in fh.readlines() if not x.startswith('#'))
+        cdef.append(lines)
+
+ffibuilder.cdef('\n\n'.join(cdef))
+
+
+if __name__ == "__main__":
+    compile_samtools_package()
+    compile_longphase_package()
+    ffibuilder.compile(verbose=True)
+    clean_samtools_package()

From eeb50676736407a87f6784a8aa8f6f7d49153034 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 13:07:16 +0800
Subject: [PATCH 14/43] add gvcf in c implement

---
 clair3/CallVariantsFromCffi.py           |  9 +++
 preprocess/CreateTensorPileupFromCffi.py | 86 ++++++++++++++++++++----
 src/clair3_pileup.c                      | 26 ++++++-
 src/clair3_pileup.h                      |  6 +-
 4 files changed, 109 insertions(+), 18 deletions(-)

diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py
index 8df9d5b..1f621a0 100644
--- a/clair3/CallVariantsFromCffi.py
+++ b/clair3/CallVariantsFromCffi.py
@@ -311,6 +311,15 @@ def main():
     parser.add_argument('--call_snp_only', type=str2bool, default=False,
                         help="EXPERIMENTAL: Call candidates pass snp minimum AF only, ignore Indel candidates")
 
+    parser.add_argument('--base_err', default=0.001, type=float,
+                        help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f')
+
+    parser.add_argument('--gq_bin_size', default=5, type=int,
+                        help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d')
+
+    parser.add_argument('--bp_resolution', action='store_true',
+                        help="DEBUG: Enable bp resolution for GVCF, default: disabled")
+
     # Full-alignment create tensor options for full-alignment calling
     parser.add_argument('--phased_vcf_fn', type=str, default=None,
                         help="Use heterozygous SNP variants in phased vcf file for haplotaging")
diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py
index 512f1cb..5ea4f07 100644
--- a/preprocess/CreateTensorPileupFromCffi.py
+++ b/preprocess/CreateTensorPileupFromCffi.py
@@ -22,8 +22,8 @@
 
 
 def pileup_counts_clair3(
-        region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, gvcf, \
-        max_depth, region_split=100000, workers=1):
+        region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, \
+        max_depth, gvcf=False, region_split=100000, workers=1):
     """Create pileup counts feature array for region.
 
     :param region: `medaka.common.Region` object
@@ -58,9 +58,10 @@ def _process_region(reg):
             bam_handle = BAMHandler(bam)
         with bam_handle.borrow() as fh:
             counts = lib.calculate_clair3_pileup(
-                region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af, min_mq, max_indel_length, call_snp_only, max_depth)
-        np_counts, positions, alt_info_string_list = _plp_data_to_numpy(
-            counts, featlenclair3)
+                region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af, min_mq, max_indel_length, call_snp_only, max_depth, gvcf)
+
+        np_counts, positions, alt_info_string_list, gvcf_output = _plp_data_to_numpy(
+            counts, featlenclair3, gvcf=gvcf)
 
         alt_info_list = []
         for alt_info in alt_info_string_list:
@@ -71,8 +72,8 @@ def _process_region(reg):
             pos, depth, center_ref_base, alt = alt_info[:4]
             alt_info_list.append((int(pos), reg.ref_name + ':' + pos + ':' + center_ref_base, depth + '-' + alt))
 
-        lib.destroy_plp_data(counts)
-        return np_counts, positions, alt_info_list
+        lib.destroy_plp_data(counts, gvcf)
+        return np_counts, positions, alt_info_list, gvcf_output
 
     # we found that split into small chunk would lead to some missing truths,
     # the candidates cross two negbouring small chunks
@@ -80,8 +81,8 @@ def _process_region(reg):
     regions = region.split(region_split, fixed_size=False)
     with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
         results = executor.map(_process_region, regions)
-        chunk_results, all_alt_info_list = __enforce_pileup_chunk_contiguity(results)
-    return chunk_results, all_alt_info_list
+        chunk_results, all_alt_info_list, gvcf_output = __enforce_pileup_chunk_contiguity(results)
+    return chunk_results, all_alt_info_list, gvcf_output
 
 
 class BAMHandler(object):
@@ -123,7 +124,7 @@ def _destroy_fset(self, fset):
         libclair3.lib.destroy_bam_fset(fset)
 
 
-def _plp_data_to_numpy(plp_data, n_rows):
+def _plp_data_to_numpy(plp_data, n_rows, gvcf=False):
     """Create numpy representation of feature data.
 
     Copy the feature matrix and alignment column names from a
@@ -145,12 +146,24 @@ def _plp_data_to_numpy(plp_data, n_rows):
     ).reshape(plp_data.n_cols, n_rows).copy()
 
     alt_info_string_list = []
+    gvcf_output = []
     candidates_num = plp_data.candidates_num
     # decode all alternative information, position-depth-reference_base-alt_info
     for i in range(candidates_num):
         alt_info_string = ffi.string(plp_data.all_alt_info[i]).decode('utf8', 'ignore').rstrip()
         alt_info_string_list.append(alt_info_string)
 
+    if gvcf:
+        gvcf_pos_ref_count = np.frombuffer(ffi.buffer(
+            plp_data.pos_ref_count, size_sizet * plp_data.buffer_cols),
+            dtype=_dtype
+        ).reshape(plp_data.buffer_cols).copy()
+        gvcf_pos_total_count = np.frombuffer(ffi.buffer(
+            plp_data.pos_total_count, size_sizet * plp_data.buffer_cols),
+            dtype=_dtype
+        ).reshape(plp_data.buffer_cols).copy()
+        gvcf_output = [gvcf_pos_ref_count, gvcf_pos_total_count]
+
     positions = np.empty(plp_data.n_cols, dtype=[
         ('major', int), ('minor', int)])
     np.copyto(
@@ -161,7 +174,7 @@ def _plp_data_to_numpy(plp_data, n_rows):
         positions['minor'],
         np.frombuffer(ffi.buffer(
             plp_data.minor, size_sizet * plp_data.n_cols), dtype=_dtype))
-    return np_counts, positions, alt_info_string_list
+    return np_counts, positions, alt_info_string_list, gvcf_output
 
 
 def __enforce_pileup_chunk_contiguity(pileups):
@@ -178,7 +191,7 @@ def __enforce_pileup_chunk_contiguity(pileups):
     all_alt_info_list = list()
     # First pass: need to check for discontinuities within chunks,
     # these show up as >1 changes in the major coordinate
-    for counts, positions, alt_info_list in pileups:
+    for counts, positions, alt_info_list, gvcf_output in pileups:
         move = np.ediff1d(positions['major'])
         gaps = np.where(move > 1)[0] + 1
         all_alt_info_list += alt_info_list
@@ -220,7 +233,7 @@ def _finalize_chunk(c_buf, p_buf):
             last = positions['major'][-1]
     if len(counts_buffer) != 0:
         chunk_results.append(_finalize_chunk(counts_buffer, positions_buffer))
-    return chunk_results, all_alt_info_list
+    return chunk_results, all_alt_info_list, gvcf_output
 
 
 def CreateTensorPileup(args):
@@ -305,7 +318,14 @@ def CreateTensorPileup(args):
     confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn, contig_name=ctg_name, bed_ctg_start=extend_start,
                                        bed_ctg_end=extend_end)
 
-    chunk_result, all_alt_info_list = pileup_counts_clair3(region,
+    if args.gvcf:
+        from preprocess.utils import variantInfoCalculator
+        nonVariantCaller = variantInfoCalculator(gvcfWritePath=args.temp_file_dir, ref_path=args.ref_fn,
+                                                 bp_resolution=args.bp_resolution, ctgName=ctg_name,sample_name='.'.join(
+                [args.sampleName, ctg_name, str(ctg_start), str(ctg_end)]), p_err=args.base_err,
+                                                 gq_bin_size=args.gq_bin_size)
+
+    chunk_result, all_alt_info_list, gvcf_output = pileup_counts_clair3(region,
                                                            bam=bam_file_path,
                                                            fasta=fasta_file_path,
                                                            min_depth=min_coverage,
@@ -346,6 +366,44 @@ def CreateTensorPileup(args):
                 all_alt_info.append(alt_info)
     np_pileup_data = np.array(np_pileup_data, dtype=np.int32)
 
+
+
+    if args.gvcf:
+
+        from shared.utils import reference_sequence_from, region_from
+        samtools_execute_command = args.samtools
+        ref_regions = []
+        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
+        reference_start = 1 if reference_start < 1 else reference_start
+        ref_regions.append(region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
+        reference_sequence = reference_sequence_from(
+            samtools_execute_command=samtools_execute_command,
+            fasta_file_path=fasta_file_path,
+            regions=ref_regions
+        )
+
+        empty_pileup_flag = True
+        for pos in range(ctg_start, ctg_end):
+            ref_count = gvcf_output[0][pos - extend_start + 1]
+            total_count = gvcf_output[1][pos - extend_start + 1]
+            reference_base = reference_sequence[pos-reference_start]
+            if (ref_count == 0 and total_count == 0):
+                cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base, 'n_total': 0, 'n_ref': 0}
+                nonVariantCaller.make_gvcf_online(cur_site_info)
+                continue
+
+            empty_pileup_flag = False
+            cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base, 'n_total': total_count,
+                             'n_ref': ref_count}
+            nonVariantCaller.make_gvcf_online(cur_site_info)
+        if len(nonVariantCaller.current_block) != 0:
+            nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block, nonVariantCaller.cur_min_DP,
+                                                 nonVariantCaller.cur_raw_gq)
+
+        if empty_pileup_flag:
+            nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
+        nonVariantCaller.close_vcf_writer()
+
     return np_pileup_data, all_position_info, all_alt_info
 
 
diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c
index e3de48c..3695a09 100644
--- a/src/clair3_pileup.c
+++ b/src/clair3_pileup.c
@@ -63,6 +63,8 @@ plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_lengt
     data->major = xalloc(buffer_cols, sizeof(size_t), "major");
     data->minor = xalloc(buffer_cols, sizeof(size_t), "minor");
     data->all_alt_info = NULL;
+    data->pos_ref_count = NULL;
+    data->pos_total_count = NULL;
     return data;
 }
 
@@ -95,13 +97,18 @@ void enlarge_plp_data(plp_data pileup, size_t buffer_cols, size_t feature_length
  *  @returns void.
  *
  */
-void destroy_plp_data(plp_data data) {
+void destroy_plp_data(plp_data data, bool gvcf) {
     free(data->matrix);
     free(data->major);
     free(data->minor);
     for (size_t i = 0; i < data->candidates_num; i++) {
        free(data->all_alt_info[i]);
     }
+    if (gvcf == true) {
+        free(data->pos_ref_count);
+        free(data->pos_total_count);
+    }
+
     free(data->all_alt_info);
     free(data);
 }
@@ -143,7 +150,7 @@ void destroy_plp_data(plp_data data) {
  * quality if the “R” counts and discrepancy between positions increase.
  *
  */
-plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth) {
+plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth, bool gvcf) {
     // extract `chr`:`start`-`end` from `region`
     //   (start is one-based and end-inclusive),
     //   hts_parse_reg below sets return value to point
@@ -201,6 +208,14 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
 
     size_t pre_pos = 0;
     size_t contiguous_flanking_num = 0;
+
+    if (gvcf == true) {
+        pileup->pos_ref_count = xalloc(buffer_cols, sizeof(size_t), "pos_ref_count");
+        pileup->pos_total_count = xalloc(buffer_cols, sizeof(size_t), "pos_total_count");
+        memset(pileup->pos_ref_count, 0, buffer_cols * sizeof(size_t));
+        memset(pileup->pos_total_count, 0, buffer_cols * sizeof(size_t));
+    }
+
     while ((ret=bam_mplp_auto(mplp, &tid, &pos, &n_plp, plp) > 0)) {
 
         size_t depth = 0;
@@ -349,6 +364,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
         char major_alt_base = '\0';
         size_t forward_sum = 0;
         size_t reverse_sum = 0;
+        size_t all_alt_count = 0;
         for (size_t i = 0; i < 4; i++) {
             forward_sum += pileup->matrix[major_col + i];
             reverse_sum += pileup->matrix[major_col + i + reverse_pos_start];
@@ -359,6 +375,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
                 if (current_count > alt_count) {
                     alt_count = current_count;
                     major_alt_base = plp_bases_clair3[i];
+                    all_alt_count += alt_count;
                 }
             }
         }
@@ -435,6 +452,11 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
             alt_info_p[candidates_num++] = alt_info_str;
         }
 
+        if (gvcf == true) {
+            pileup->pos_ref_count[pos-start] = ref_count;
+            pileup->pos_total_count[pos-start] = ref_count + all_alt_count + del_count + ins_count;
+        }
+
         free(dels_f);
         free(dels_r);
         kh_counter_destroy(ins_counts_all);
diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h
index 5cf9283..c4dd8e2 100644
--- a/src/clair3_pileup.h
+++ b/src/clair3_pileup.h
@@ -12,6 +12,8 @@ typedef struct _plp_data {
     size_t *minor;
     char **all_alt_info;
     size_t candidates_num;
+    size_t* pos_ref_count;
+    size_t* pos_total_count;
 } _plp_data;
 typedef _plp_data *plp_data;
 
@@ -83,7 +85,7 @@ plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_lengt
  *  @returns void.
  *
  */
-void destroy_plp_data(plp_data data);
+void destroy_plp_data(plp_data data, bool gvcf);
 
 /** C implement of clair3-style pileup feature data and alternative information in a given region of a bam.
  *
@@ -100,6 +102,6 @@ void destroy_plp_data(plp_data data);
  *  The return value can be freed with destroy_plp_data
  *
  */
-plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth);
+plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth, bool gvcf);
 
 #endif

From c622611cc05df4a426461239f675ff6ce93fa384 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 13:08:10 +0800
Subject: [PATCH 15/43] import tritonclient only when the --use_gpu option is enabled

---
 clair3/CallVariantsFromCffi.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py
index 1f621a0..30fb78d 100644
--- a/clair3/CallVariantsFromCffi.py
+++ b/clair3/CallVariantsFromCffi.py
@@ -5,8 +5,6 @@
 from time import time
 from argparse import ArgumentParser, SUPPRESS
 
-import tritonclient.grpc as tritongrpcclient
-
 from shared.utils import str2bool, log_error
 from clair3.CallVariants import OutputConfig, output_utilties_from, batch_output
 
@@ -66,6 +64,7 @@ def Run(args):
 def call_variants_from_cffi(args, output_config, output_utilities):
     use_gpu = args.use_gpu
     if use_gpu:
+        import tritonclient.grpc as tritongrpcclient
         server_url = 'localhost:8001'
         try:
             triton_client = tritongrpcclient.InferenceServerClient(

From 3536da80f52e3d66fa224e5efc75056be060d35f Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 13:08:29 +0800
Subject: [PATCH 16/43] use default path of longphase

---
 run_clair3.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_clair3.sh b/run_clair3.sh
index b00fb47..e7b780d 100644
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -87,7 +87,7 @@ PYPY="pypy3"
 PYTHON='python3'
 PARALLEL='parallel'
 WHATSHAP='whatshap'
-longphase='longphase'
+longphase='EMPTY'
 CHUNK_NUM=0
 CHUNK_SIZE=5000000
 QUAL=2
@@ -210,6 +210,9 @@ if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi
 BASE_MODEL=$(basename ${MODEL_PATH})
 if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi
 
+# use the default longphase binary path
+if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi
+
 # remove the last '/' character in directory input
 OUTPUT_FOLDER=$(echo ${OUTPUT_FOLDER%*/})
 MODEL_PATH=$(echo ${MODEL_PATH%*/})

From 3446d5bcbe7f37abade53234cdae8560ec529c16 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 13:13:52 +0800
Subject: [PATCH 17/43] add installation for c implement

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index e27da3b..5d23501 100644
--- a/README.md
+++ b/README.md
@@ -276,6 +276,10 @@ conda install -c conda-forge -c bioconda whatshap=1.0 -y
 git clone https://github.com/HKU-BAL/Clair3.git
 cd Clair3
 
+# compile samtools, longphase and cffi library for c implement
+# after building, longphase binary is in `Clair3` folder
+python3 build.py
+
 # download pre-trained models
 mkdir models
 wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz 
@@ -292,6 +296,8 @@ MODEL_NAME="[YOUR_MODEL_NAME]"         # e.g. r941_prom_hac_g360+g422
   --output=${OUTPUT_DIR}               ## output path prefix
 ```
 
+
+
 ### Option 5. Docker Dockerfile
 
 This is the same as option 1 except that you are building a docker image yourself. Please refer to option 1 for usage. 

From d7142b4e71373af415e69e947567a072a3888d57 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 13:42:01 +0800
Subject: [PATCH 18/43] add longphase path check

---
 run_clair3.sh            | 1 +
 scripts/clair3_c_impl.sh | 1 +
 2 files changed, 2 insertions(+)
 mode change 100644 => 100755 run_clair3.sh
 mode change 100644 => 100755 scripts/clair3_c_impl.sh

diff --git a/run_clair3.sh b/run_clair3.sh
old mode 100644
new mode 100755
index e7b780d..0940285
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -212,6 +212,7 @@ if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom
 
 # use the default longphase binary path
 if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi
+if [ "${USE_LONGPHASE}" == True ] && [ ! -f ${LONGPHASE} ]; then echo -e "${ERROR} Cannot find LongPhase path in ${LONGPHASE}, exit!${NC}"; exit 1; fi
 
 # remove the last '/' character in directory input
 OUTPUT_FOLDER=$(echo ${OUTPUT_FOLDER%*/})
diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
old mode 100644
new mode 100755
index 0f0798f..75eb24c
--- a/scripts/clair3_c_impl.sh
+++ b/scripts/clair3_c_impl.sh
@@ -110,6 +110,7 @@ ${PYTHON} ${CLAIR3} CheckEnvs \
     --indel_min_af ${INDEL_AF}
 readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
 if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi
+# use all threads here when gpu is enabled?
 THREADS_LOW=$((${THREADS}*3/4))
 LONGPHASE_THREADS=$((${THREADS}*1/2))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi

From 4e5acdab0d5b53eaca95925f73eeeae3b0dbd9ac Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 14:32:51 +0800
Subject: [PATCH 19/43] rename longphase variable to upper-case LONGPHASE

---
 run_clair3.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_clair3.sh b/run_clair3.sh
index 0940285..ebe4e3a 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -87,7 +87,7 @@ PYPY="pypy3"
 PYTHON='python3'
 PARALLEL='parallel'
 WHATSHAP='whatshap'
-longphase='EMPTY'
+LONGPHASE='EMPTY'
 CHUNK_NUM=0
 CHUNK_SIZE=5000000
 QUAL=2

From 3e07f4cb135990e96f389bad4901a0a449fe92f9 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Wed, 16 Mar 2022 22:11:21 +0800
Subject: [PATCH 20/43] replace base2_index function with a more efficient array lookup

---
 src/clair3_pileup.c | 14 +-------------
 src/clair3_pileup.h |  8 ++++++++
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c
index 3695a09..fb8f88e 100644
--- a/src/clair3_pileup.c
+++ b/src/clair3_pileup.c
@@ -21,18 +21,6 @@
 #define bam_nt16_table seq_nt16_table
 
 
-size_t base2_index(char c) {
-    if (c == 'A') return 0;
-    else if (c == 'C') return 1;
-    else if (c == 'G') return 2;
-    else if (c == 'T') return 3;
-    else if (c == 'a') return 9;
-    else if (c == 'c') return 10;
-    else if (c == 'g') return 11;
-    else if (c == 't') return 12;
-    else return 0;
-}
-
 /** Constructs a pileup data structure.
  *
  *  @param n_cols number of pileup columns.
@@ -359,7 +347,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
         kh_counter_destroy(ins_counts_r);
         int offset = pos - ref_start;
         char ref_base = toupper(ref_seq[offset]);
-        int ref_offset_forward = base2_index(ref_base);
+        int ref_offset_forward = (ref_base >= 'A' && ref_base <= 'Z') ? base2index[ref_base - 'A'] : 0;  // non-letters fall back to 0, as base2_index() did
         int ref_offset_reverse = ref_offset_forward + reverse_pos_start;
         char major_alt_base = '\0';
         size_t forward_sum = 0;
diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h
index c4dd8e2..7f7277c 100644
--- a/src/clair3_pileup.h
+++ b/src/clair3_pileup.h
@@ -33,6 +33,14 @@ static const int num2countbase[32] = {
 };
 
 
+static const int base2index[32] = { // indexed by (uppercase base - 'A'): A=0, C=1, G=2, T=3, all others 0
+    0, 0, 1, 0, 0, 0, 2, 0, // abcdefgh
+    0, 0, 0, 0, 0, 0, 0, 0, // ijklmnop
+    0, 0, 0, 3, 0, 0, 0, 0, // qrstuvwx
+    0, 0, 0, 0, 0, 0, 0, 0, // yz, followed by six unused slots
+};
+
+
 // convert 16bit IUPAC (+16 for strand) to plp_bases clair3 index
 //  first i: all insertions
 // second i: most common insertion

From 19aa91064692ef31e577847f7bd28b9e70a56b17 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 02:00:49 +0800
Subject: [PATCH 21/43] add Makefile

---
 Makefile | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0f13b7d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,50 @@
+OS := $(shell uname)
+ARCH := $(shell arch)
+
+PYTHON ?= python3
+.PHONY: all clean
+all : libhts.a longphase libclair3.so
+clean : clean_htslib clean_longphase clean_libclair3
+
+SAMVER=1.10
+LPVER=1.0
+
+samtools-$(SAMVER)/Makefile:
+		curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \
+		tar -xjf samtools-${SAMVER}.tar.bz2; \
+		rm samtools-${SAMVER}.tar.bz2
+
+libhts.a: samtools-$(SAMVER)/Makefile
+	# this is required only to add in -fpic so we can build python module
+	@printf '\033[1;33mMaking %s\033[0m\n' "$(@F)"
+	cd samtools-${SAMVER}/htslib-${SAMVER}/ && CFLAGS="-fpic -std=c99 -O3" ./configure && $(MAKE)
+	cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@
+
+# Download the longphase release tarball, build it, and copy the binary next to this Makefile.
+longphase-$(LPVER)/Makefile:
+	curl -L -o longphase-${LPVER}.tar.gz https://github.com/twolinin/longphase/archive/refs/tags/v${LPVER}.tar.gz; \
+	tar -zxvf longphase-${LPVER}.tar.gz; \
+	rm longphase-${LPVER}.tar.gz
+
+longphase: longphase-$(LPVER)/Makefile
+	@printf '\033[1;33mMaking %s\033[0m\n' "$(@F)"
+	cd longphase-${LPVER} && autoreconf -i && ./configure && $(MAKE) -j4
+	cp longphase-${LPVER}/$@ $@
+
+# build.py compiles against the htslib tree that the libhts.a rule downloads and builds.
+libclair3.so: libhts.a
+	${PYTHON} build.py
+
+# Clean targets never fail: missing directories are ignored and rm uses -f.
+.PHONY: clean_htslib
+clean_htslib:
+	cd samtools-${SAMVER} && $(MAKE) clean || exit 0
+	cd samtools-${SAMVER}/htslib-${SAMVER} && $(MAKE) clean || exit 0
+
+.PHONY: clean_longphase
+clean_longphase:
+	cd longphase-${LPVER} && $(MAKE) clean || exit 0
+
+.PHONY: clean_libclair3
+clean_libclair3:
+	rm -f libclair3.*

From 5a4a58c361e90b91ae46fa45876e4e3622c82ce0 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 02:01:27 +0800
Subject: [PATCH 22/43] move samtools and longphase compile to Makefile

---
 build.py | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/build.py b/build.py
index 022b1ae..75fee91 100644
--- a/build.py
+++ b/build.py
@@ -4,36 +4,8 @@
 from subprocess import run
 from cffi import FFI
 
-
 samver = "1.10"
-longphase_version = "1.0"
 file_directory = os.path.dirname(os.path.realpath(__file__))
-def compile_samtools_package():
-    # just a simple way to compile samtools htslib
-    if not os.path.exists(os.path.join(file_directory, 'libhts.a')):
-        samtools_source = "samtools-{}.tar.bz2 https://github.com/samtools/samtools/releases/download/{}/samtools-{}.tar.bz2".format(samver, samver, samver)
-        run("curl -L -o {}".format(samtools_source), shell=True)
-        run("tar -xjf samtools-{}.tar.bz2".format(samver), shell=True)
-        run("rm samtools-{}.tar.bz2".format(samver), shell=True)
-        run("cd samtools-{} && autoheader && autoconf -Wno-syntax && CFLAGS='-fpic -O3' ./configure && make".format(samver), shell=True)
-        run("cp samtools-{}/htslib-{}/libhts.a {}".format(samver, samver, file_directory), shell=True)
-
-
-def compile_longphase_package():
-    if not os.path.exists(os.path.join(file_directory, 'longphase')):
-        longphase_source = "https://github.com/twolinin/longphase/archive/refs/tags/v{}.tar.gz".format(longphase_version)
-        run("wget {}".format(longphase_source), shell=True)
-        run("tar -zxvf v{}.tar.gz".format(longphase_version), shell=True)
-        run("rm v{}.tar.gz".format(longphase_version), shell=True)
-        run("cd longphase-{} && autoreconf -i && ./configure && make -j4".format(longphase_version), shell=True)
-        run("mv longphase-{}/longphase {}".format(longphase_version, file_directory), shell=True)
-        run("rm -r longphase-{}".format(longphase_version), shell=True)
-
-def clean_samtools_package():
-    # after ffi building, clean the samtools htslib source
-    if os.path.exists(os.path.join(file_directory, 'libhts.a')):
-        run("rm -r samtools-{}".format(samver), shell=True)
-
 htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))
 
 libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
@@ -91,7 +63,6 @@ def clean_samtools_package():
 
 
 if __name__ == "__main__":
-    compile_samtools_package()
-    compile_longphase_package()
     ffibuilder.compile(verbose=True)
-    clean_samtools_package()
+    run("cp {}/libclair3*.so {}/libclair3.so".format(file_directory, file_directory), shell=True)
+

From ed78d1762c520ca1ca453659fef9ed9882c3c411 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 02:27:43 +0800
Subject: [PATCH 23/43] toupper not found in arm64 stdlib

---
 src/clair3_full_alignment.c | 4 ++--
 src/clair3_pileup.c         | 2 +-
 src/medaka_bamiter.c        | 4 ++--
 src/medaka_common.c         | 7 +++++++
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c
index eafae03..ea70418 100644
--- a/src/clair3_full_alignment.c
+++ b/src/clair3_full_alignment.c
@@ -722,7 +722,7 @@ fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path
                     continue;
 
                 int8_t alt_v = 0;
-                char ref_base = toupper(ref_seq[cp - ref_start]);
+                char ref_base = upper_base(ref_seq[cp - ref_start]);
                 int8_t ref_v = num2countbase_fa[ref_base - 'A'];
                 int8_t bq_v = read.pos_info[offset].bq;
 
@@ -821,7 +821,7 @@ fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path
         // store the alternative information into string
         size_t max_alt_length = 64;
         char *alt_info_str = calloc(max_alt_length, sizeof(char));
-        char center_ref_base = toupper(ref_seq[candidate - ref_start]);
+        char center_ref_base = upper_base(ref_seq[candidate - ref_start]);
 
         sprintf(alt_info_str, "%i-%i-%c-", candidate + 1, candidate_depth, center_ref_base);
         for (size_t j = 0; j < 4; j++)
diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c
index fb8f88e..be334d0 100644
--- a/src/clair3_pileup.c
+++ b/src/clair3_pileup.c
@@ -346,7 +346,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co
 
         kh_counter_destroy(ins_counts_r);
         int offset = pos - ref_start;
-        char ref_base = toupper(ref_seq[offset]);
+        char ref_base = upper_base(ref_seq[offset]);
         int ref_offset_forward = base2index[ref_base - 'A'];
         int ref_offset_reverse = ref_offset_forward + reverse_pos_start;
         char major_alt_base = '\0';
diff --git a/src/medaka_bamiter.c b/src/medaka_bamiter.c
index a625e14..c01e874 100644
--- a/src/medaka_bamiter.c
+++ b/src/medaka_bamiter.c
@@ -3,7 +3,7 @@
 
 #include "medaka_bamiter.h"
 #include "medaka_common.h"
-
+#include <stdlib.h>
 // iterator for reading bam
 int read_bam(void *data, bam1_t *b) {
     mplp_data *aux = (mplp_data*) data;
@@ -57,7 +57,7 @@ bam_fset* create_bam_fset(const char* fname) {
     if (fset->hdr == 0 || fset->idx == 0 || fset->fp == 0) {
         destroy_bam_fset(fset);
         fprintf(stderr, "Failed to read .bam file '%s'.", fname);
-        exit(1);
+        return NULL;  // fset was just passed to destroy_bam_fset(); never hand it back to the caller
     }
     return fset;
 }
diff --git a/src/medaka_common.c b/src/medaka_common.c
index ba06b03..588006b 100644
--- a/src/medaka_common.c
+++ b/src/medaka_common.c
@@ -7,6 +7,13 @@
 #include "medaka_common.h"
 
 
+char upper_base(char c) {
+    // ASCII-only uppercasing: 'a'..'z' map to 'A'..'Z', any other byte is returned as-is.
+    const int is_lower = (c >= 'a' && c <= 'z');
+
+    return is_lower ? (char)(c - ('a' - 'A')) : c;
+}
+
 /** Allocates zero-initialised memory with a message on failure.
  *
  *  @param num number of elements to allocate.

From b04ebcf3be5539a5197727327be63185bdf0ec82 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:26:30 +0800
Subject: [PATCH 24/43] put function into header

---
 src/medaka_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/medaka_common.h b/src/medaka_common.h
index 2f06bf6..2aecaf1 100644
--- a/src/medaka_common.h
+++ b/src/medaka_common.h
@@ -14,7 +14,7 @@
 static inline int max ( int a, int b ) { return a > b ? a : b; }
 static inline int min ( int a, int b ) { return a < b ? a : b; }
 
-
+char upper_base(char c);
 /** Allocates zero-initialised memory with a message on failure.
  *
  *  @param num number of elements to allocate.

From a0ae449627def925a2e1289192f15a3194bb6f76 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:28:06 +0800
Subject: [PATCH 25/43] change minCoverage type to int

---
 clair3/CallVarBam.py                            | 2 +-
 clair3/CallVariantsFromCffi.py                  | 2 +-
 preprocess/CreateTensorFullAlignment.py         | 2 +-
 preprocess/CreateTensorFullAlignmentFromCffi.py | 2 +-
 preprocess/CreateTensorPileup.py                | 2 +-
 preprocess/CreateTensorPileupFromCffi.py        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py
index 35b79a1..590a582 100644
--- a/clair3/CallVarBam.py
+++ b/clair3/CallVarBam.py
@@ -347,7 +347,7 @@ def main():
     parser.add_argument('--fast_mode', type=str2bool, default=False,
                         help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+    parser.add_argument('--minCoverage', type=int, default=param.min_coverage,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py
index 30fb78d..019dda1 100644
--- a/clair3/CallVariantsFromCffi.py
+++ b/clair3/CallVariantsFromCffi.py
@@ -292,7 +292,7 @@ def main():
     parser.add_argument('--vcf_fn', type=str, default=None,
                         help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file,  default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=2,
+    parser.add_argument('--minCoverage', type=int, default=2,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=5,
diff --git a/preprocess/CreateTensorFullAlignment.py b/preprocess/CreateTensorFullAlignment.py
index f6d8c28..10d563d 100644
--- a/preprocess/CreateTensorFullAlignment.py
+++ b/preprocess/CreateTensorFullAlignment.py
@@ -909,7 +909,7 @@ def main():
                         help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s")
 
     # options for advanced users
-    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+    parser.add_argument('--minCoverage', type=int, default=param.min_coverage,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py
index 043afcf..36ea5fe 100644
--- a/preprocess/CreateTensorFullAlignmentFromCffi.py
+++ b/preprocess/CreateTensorFullAlignmentFromCffi.py
@@ -185,7 +185,7 @@ def main():
                         help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s")
 
     # options for advanced users
-    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+    parser.add_argument('--minCoverage', type=int, default=param.min_coverage,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
diff --git a/preprocess/CreateTensorPileup.py b/preprocess/CreateTensorPileup.py
index 63e3095..bf837ae 100644
--- a/preprocess/CreateTensorPileup.py
+++ b/preprocess/CreateTensorPileup.py
@@ -494,7 +494,7 @@ def main():
     parser.add_argument('--fast_mode', type=str2bool, default=False,
                         help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=2,
+    parser.add_argument('--minCoverage', type=int, default=2,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py
index 5ea4f07..803c540 100644
--- a/preprocess/CreateTensorPileupFromCffi.py
+++ b/preprocess/CreateTensorPileupFromCffi.py
@@ -459,7 +459,7 @@ def main():
     parser.add_argument('--fast_mode', type=str2bool, default=False,
                         help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=2,
+    parser.add_argument('--minCoverage', type=int, default=2,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,

From 65661dbafd125abda37f73155a4b35593fb1aecb Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:29:17 +0800
Subject: [PATCH 26/43] fix gvcf 0-index ctg_start issue

---
 preprocess/CreateTensorPileupFromCffi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py
index 803c540..59ad726 100644
--- a/preprocess/CreateTensorPileupFromCffi.py
+++ b/preprocess/CreateTensorPileupFromCffi.py
@@ -309,6 +309,7 @@ def CreateTensorPileup(args):
     is_ctg_name_given = ctg_name is not None
     is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
     if is_ctg_range_given:
+        ctg_start = max(1, ctg_start)
         extend_start = max(1, ctg_start - no_of_positions)
         extend_end = ctg_end + no_of_positions
 
@@ -367,7 +368,6 @@ def CreateTensorPileup(args):
     np_pileup_data = np.array(np_pileup_data, dtype=np.int32)
 
 
-
     if args.gvcf:
 
         from shared.utils import reference_sequence_from, region_from
@@ -384,8 +384,8 @@ def CreateTensorPileup(args):
 
         empty_pileup_flag = True
         for pos in range(ctg_start, ctg_end):
-            ref_count = gvcf_output[0][pos - extend_start + 1]
-            total_count = gvcf_output[1][pos - extend_start + 1]
+            ref_count = gvcf_output[0][pos - extend_start]
+            total_count = gvcf_output[1][pos - extend_start]
             reference_base = reference_sequence[pos-reference_start]
             if (ref_count == 0 and total_count == 0):
                 cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base, 'n_total': 0, 'n_ref': 0}

From 16c90f5dc6e2f2726247638878dae11d4e242638 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:30:47 +0800
Subject: [PATCH 27/43] pytables is not necessary in calling, move to training
 part

---
 clair3/CallVariants.py | 3 ++-
 clair3/utils.py        | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/clair3/CallVariants.py b/clair3/CallVariants.py
index 47a7042..a16f8aa 100644
--- a/clair3/CallVariants.py
+++ b/clair3/CallVariants.py
@@ -1,7 +1,6 @@
 import sys
 import os
 import math
-import tables
 import tensorflow as tf
 import numpy as np
 import logging
@@ -1529,6 +1528,7 @@ def load_mini_batch():
         if full_alignment_mode and total == 0:
             logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn)))
     else:
+        import tables
         dataset = tables.open_file(args.tensor_fn, 'r').root
         batch_size = param.predictBatchSize
         dataset_size = len(dataset.label)
@@ -1712,6 +1712,7 @@ def load_mini_batch():
         logging.info("Total process positions: {}".format(total))
 
     else:
+        import tables
         if not os.path.exists(args.tensor_fn):
             logging.info("skip {}, not existing chunk_id".format(args.tensor_fn))
             return
diff --git a/clair3/utils.py b/clair3/utils.py
index acfbc7e..0c2ce24 100644
--- a/clair3/utils.py
+++ b/clair3/utils.py
@@ -3,7 +3,6 @@
 import copy
 import shlex
 import os
-import tables
 import numpy as np
 from functools import partial
 
@@ -11,7 +10,6 @@
 from shared.interval_tree import bed_tree_from, is_region_in
 from shared.utils import subprocess_popen, IUPAC_base_to_ACGT_base_dict as BASE2BASE, IUPAC_base_to_num_dict as BASE2NUM
 
-FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5)
 shuffle_bin_size = 50000
 PREFIX_CHAR_STR = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
@@ -389,6 +387,8 @@ def get_training_array(tensor_fn, var_fn, bed_fn, bin_fn, shuffle=True, is_allow
         import shared.param_f as param
         float_type = 'int8'
 
+    import tables
+    FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5)
     tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape
 
     subprocess_list = []

From 0eb5d4b83aef4e01e7b1707b718cf7aaec50abf5 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:33:16 +0800
Subject: [PATCH 28/43] add environment check for mac arm64 system

---
 clair3/CallVarBam.py     | 26 ++++++++++++++-----------
 preprocess/CheckEnvs.py  | 16 ++++------------
 run_clair3.sh            | 41 ++++++++++++++++++++++++++++------------
 scripts/clair3.sh        | 10 +++++++++-
 scripts/clair3_c_impl.sh | 12 ++++++++++--
 5 files changed, 67 insertions(+), 38 deletions(-)

diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py
index 590a582..1f6c073 100644
--- a/clair3/CallVarBam.py
+++ b/clair3/CallVarBam.py
@@ -9,6 +9,7 @@
 from time import sleep
 from argparse import ArgumentParser, SUPPRESS
 import logging
+import platform
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -130,20 +131,23 @@ def Run(args):
         chunk_id = CommandOption('chunk_id', args.chunk_id)
         chunk_num = CommandOption('chunk_num', args.chunk_num)
 
-    sched_getaffinity_list = list(os.sched_getaffinity(0))
-    maxCpus = len(sched_getaffinity_list)
-    if args.tensorflow_threads is None:
-        numCpus = maxCpus
+    if platform.machine() in {"aarch64", "arm64"} or platform.system() == "Darwin":
+        taskSet = ""
     else:
-        numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
+        sched_getaffinity_list = list(os.sched_getaffinity(0))
+        maxCpus = len(sched_getaffinity_list)
+        if args.tensorflow_threads is None:
+            numCpus = maxCpus
+        else:
+            numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
 
-    _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
+        _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
 
-    taskSet = "taskset -c %s" % (_cpuSet)
-    try:
-        subprocess.check_output("which %s" % ("taskset"), shell=True)
-    except:
-        taskSet = ""
+        taskSet = "taskset -c %s" % (_cpuSet)
+        try:
+            subprocess.check_output("which %s" % ("taskset"), shell=True)
+        except:
+            taskSet = ""
 
     if need_realignment:
         realign_reads_command_options = [
diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py
index 5e03c07..628f700 100644
--- a/preprocess/CheckEnvs.py
+++ b/preprocess/CheckEnvs.py
@@ -3,6 +3,7 @@
 import argparse
 import shlex
 import subprocess
+import platform
 
 from collections import defaultdict
 from argparse import SUPPRESS
@@ -54,6 +55,9 @@ def check_python_path():
 def check_tools_version(tool_version, required_tool_version):
     for tool, version in tool_version.items():
         required_version = required_tool_version[tool]
+        # whatshap cannot be installed in Mac arm64 system
+        if platform.system() == "Darwin" and tool == 'whatshap':
+            continue
         if version is None:
             print(log_error("[ERROR] {} not found, please check you are in clair3 virtual environment".format(tool)))
             check_python_path()
@@ -296,18 +300,6 @@ def CheckEnvs(args):
     contig_length_list = []
     contig_chunk_num = {}
 
-    threads = args.threads
-    sched_getaffinity_list = list(os.sched_getaffinity(0))
-    numCpus = len(sched_getaffinity_list)
-
-    if threads > numCpus:
-        print(log_warning(
-            '[WARNING] Current maximum threads {} is larger than support cpu count {}, You may set a smaller parallel threads by setting --threads=$ for better parallelism.'.format(
-                threads, numCpus)))
-
-    ## for better parallelism for create tensor and call variants, we over commit the overall threads/4 for 3 times, which is 0.75 * overall threads.
-    threads_over_commit = max(4, int(threads * 0.75))
-
     with open(fai_fn, 'r') as fai_fp:
         for row in fai_fp:
             columns = row.strip().split("\t")
diff --git a/run_clair3.sh b/run_clair3.sh
index ebe4e3a..5cb5707 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 SCRIPT_NAME=$(basename "$0")
 SCRIPT_PATH=`dirname "$0"`
-VERSION='v0.1-r10'
+VERSION='v0.1-r11'
 Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]"
 
 set -e
@@ -31,12 +31,15 @@ print_help_messages()
     echo $'      --pypy=STR                Path of pypy3, pypy3 >= 3.6 is required.'
     echo $'      --parallel=STR            Path of parallel, parallel >= 20191122 is required.'
     echo $'      --whatshap=STR            Path of whatshap, whatshap >= 1.0 is required.'
+    echo $'      --longphase=STR           Path of longphase, longphase >= 1.0 is required.'
     echo $'      --chunk_size=INT          The size of each chuck for parallel processing, default: 5000000.'
     echo $'      --pileup_only             Use the pileup model only when calling, default: disable.'
     echo $'      --print_ref_calls         Show reference calls (0/0) in VCF file, default: disable.'
     echo $'      --include_all_ctgs        Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.'
     echo $'      --gvcf                    Enable GVCF output, default: disable.'
     echo $'      --enable_phasing          Output phased variants using whatshap, default: disable.'
+    echo $'      --longphase_for_phasing   Use longphase for phasing, default: enable.'
+    echo $'      --disable_c_impl          Disable C implement with cffi for pileup and full-alignment create tensor, default: enable.'
     echo $'      --remove_intermediate_dir Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results. default: disable.'
     echo $'      --snp_min_af=FLOAT        Minimum SNP AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.'
     echo $'      --indel_min_af=FLOAT      Minimum Indel AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.'
@@ -45,16 +48,14 @@ print_help_messages()
     echo $'      --var_pct_phasing=FLOAT   EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.'
     echo $'      --pileup_model_prefix=STR EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index. default: pileup.'
     echo $'      --fa_model_prefix=STR     EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.'
+    echo $'      --min_mq=INT              EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.'
+    echo $'      --min_coverage=INT        EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.'
     echo $'      --fast_mode               EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.'
     echo $'      --haploid_precise         EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.'
     echo $'      --haploid_sensitive       EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.'
     echo $'      --no_phasing_for_fa       EXPERIMENTAL: Call variants without whatshap phasing in full alignment calling, default: disable.'
     echo $'      --call_snp_only           EXPERIMENTAL: Call candidates pass SNP minimum AF only, ignore Indel candidates, default: disable.'
     echo $'      --enable_long_indel       EXPERIMENTAL: Call long Indel variants(>50 bp), default: disable.'
-    echo $'      --use_gpu                 Use GPU for calling, default: disable.'
-    echo $'      --longphase_for_phasing   Use longphase for phasing, default: disable.'
-    echo $'      --longphase               Path of longphase, longphase >= 1.0 is required.'
-    echo $'      --enable_c_impl           Use C implement with cffi for pileup and full-alignment create tensor, default: disable.'
     echo $''
 }
 
@@ -71,8 +72,8 @@ NC="\\033[0m"
 ARGS=`getopt -o b:f:t:m:p:o:hv \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\
-snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\
-remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,enable_c_impl,help,version -n 'run_clair3.sh' -- "$@"`
+min_mq::,min_coverage::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\
+remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,disable_c_impl,help,version -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
 eval set -- "${ARGS}"
@@ -91,6 +92,8 @@ LONGPHASE='EMPTY'
 CHUNK_NUM=0
 CHUNK_SIZE=5000000
 QUAL=2
+MIN_MQ=5
+MIN_COV=2
 PHASING_PCT="0"
 PRO="0"
 REF_PRO="0"
@@ -110,7 +113,7 @@ ENABLE_PHASING=False
 ENABLE_LONG_INDEL=False
 USE_GPU=False
 USE_LONGPHASE=False
-ENABLE_C_IMPL=False
+ENABLE_C_IMPL=True
 PILEUP_PREFIX="pileup"
 FA_PREFIX="full_alignment"
 
@@ -140,6 +143,8 @@ while true; do
     --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;;
     --snp_min_af ) SNP_AF="$2"; shift 2 ;;
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
+    --min_mq ) MIN_MQ="$2"; shift 2 ;;
+    --min_coverage ) MIN_COV="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --gvcf ) GVCF=True; shift 1 ;;
@@ -156,7 +161,7 @@ while true; do
     --enable_long_indel ) ENABLE_LONG_INDEL=True; shift 1 ;;
     --use_gpu ) USE_GPU=True; shift 1 ;;
     --longphase_for_phasing ) USE_LONGPHASE=True; shift 1 ;;
-    --enable_c_impl ) ENABLE_C_IMPL=True; shift 1 ;;
+    --disable_c_impl ) ENABLE_C_IMPL=False; shift 1 ;;
 
     -- ) shift; break; ;;
     -h|--help ) print_help_messages; exit 0 ;;
@@ -211,6 +216,7 @@ BASE_MODEL=$(basename ${MODEL_PATH})
 if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi
 
 # use the default longphase binary path
+if [ "$(uname)" = "Darwin" ] && [ "${NO_PHASING}" == False ];  then echo -e "${WARNING} Mac arm64 system only support longphase for phasing, will enable it! ${NC}"; USE_LONGPHASE=True; fi
 if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi
 if [ "${USE_LONGPHASE}" == True ] && [ ! -f ${LONGPHASE} ]; then echo -e "${ERROR} Cannot find LongPhase path in ${LONGPHASE}, exit!${NC}"; exit 1; fi
 
@@ -242,6 +248,8 @@ if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi
 echo "[INFO] FULL ALIGN PROPORTION: ${PRO}"
 echo "[INFO] FULL ALIGN REFERENCE PROPORTION: ${REF_PRO}"
 echo "[INFO] PHASING PROPORTION: ${PHASING_PCT}"
+echo "[INFO] MINIMUM MQ: ${MIN_MQ}"
+echo "[INFO] MINIMUM COVERAGE: ${MIN_COV}"
 if [ "${SNP_AF}" != "0" ]; then echo "[INFO] USER DEFINED SNP THRESHOLD: ${SNP_AF}"; fi
 if [ "${INDEL_AF}" != "0" ]; then echo "[INFO] USER DEFINED INDEL THRESHOLD: ${INDEL_AF}"; fi
 echo "[INFO] ENABLE FILEUP ONLY CALLING: ${PILEUP_ONLY}"
@@ -256,9 +264,8 @@ echo "[INFO] ENABLE NO PHASING FOR FULL ALIGNMENT: ${NO_PHASING}"
 echo "[INFO] ENABLE REMOVING INTERMEDIATE FILES: ${RM_TMP_DIR}"
 echo "[INFO] ENABLE PHASING VCF OUTPUT: ${ENABLE_PHASING}"
 echo "[INFO] ENABLE LONG INDEL CALLING: ${ENABLE_LONG_INDEL}"
-echo "[INFO] ENABLE GPU CALLING: ${USE_GPU}"
 echo "[INFO] ENABLE LONGPHASE_FOR_PHASING: ${USE_LONGPHASE}"
-echo "[INFO] ENABLE C_IMPLEMENT: ${USE_LONGPHASE}"
+echo "[INFO] ENABLE C_IMPLEMENT: ${ENABLE_C_IMPL}"
 echo $''
 
 # file check
@@ -273,7 +280,7 @@ if [ ! -d ${MODEL_PATH} ] && [ -z ${CONDA_PREFIX} ]; then echo -e "${ERROR} Cond
 if [ ! -d ${MODEL_PATH} ]; then echo -e "${ERROR} Model path not found${NC}"; exit 1; fi
 
 # max threads detection
-MAX_THREADS=$(nproc)
+if [ "$(uname)" = "Darwin" ]; then MAX_THREADS=$(sysctl -n hw.logicalcpu); else MAX_THREADS=$(nproc); fi
 if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi
 if [[ ${THREADS} -gt ${MAX_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum available threads ${MAX_THREADS}, set threads=${MAX_THREADS}${NC}"; THREADS=${MAX_THREADS}; fi
 
@@ -283,6 +290,11 @@ if [ ! -z ${MAX_ULIMIT_THREADS} ]; then PER_ULIMIT_THREADS=$((${MAX_ULIMIT_THREA
 if [[ ${PER_ULIMIT_THREADS} < 1 ]]; then PER_ULIMIT_THREADS=1; fi
 if [ "${MAX_ULIMIT_THREADS}" != "unlimited" ] && [[ ${THREADS} -gt ${PER_ULIMIT_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum ulimit threads ${THREADS} * 30 > ${MAX_ULIMIT_THREADS} (ulimit -u), set threads=${PER_ULIMIT_THREADS}${NC}"; THREADS=${PER_ULIMIT_THREADS}; fi
 
+# min mapping quality and min coverage detection
+if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi
+if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi
+if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi
+
 # platform check
 if [ ! ${PLATFORM} = "ont" ] && [ ! ${PLATFORM} = "hifi" ] && [ ! ${PLATFORM} = "ilmn" ]; then echo -e "${ERROR} Invalid platform input, optional: {ont, hifi, ilmn}${NC}"; exit 1; fi
 
@@ -305,6 +317,9 @@ if [ -z ${REF_PRO} ]; then echo -e "${ERROR} Use '--ref_pct_full=FLOAT' instead
 if [ -z ${PHASING_PCT} ]; then echo -e "${ERROR} Use '--var_pct_phasing=FLOAT' instead of '--var_pct_phasing FLOAT' for optional parameters${NC}"; exit 1 ; fi
 if [ -z ${PILEUP_PREFIX} ]; then echo -e "${ERROR} Use '--pileup_model_prefix=STR' instead of '--pileup_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi
 if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' instead of '--fa_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${MIN_MQ} ]; then echo -e "${ERROR} Use '--min_mq=INT' instead of '--min_mq INT' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${MIN_COV} ]; then echo -e "${ERROR} Use '--min_coverage=INT' instead of '--min_coverage INT' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${LONGPHASE} ]; then echo -e "${ERROR} Use '--longphase=STR' instead of '--longphase STR' for optional parameters${NC}"; exit 1 ; fi
 
 # model prefix detection
 if [ ! -f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi
@@ -338,6 +353,8 @@ ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \
     --var_pct_phasing=${PHASING_PCT} \
     --snp_min_af=${SNP_AF} \
     --indel_min_af=${INDEL_AF} \
+    --min_mq=${MIN_MQ} \
+    --min_coverage=${MIN_COV} \
     --pileup_only=${PILEUP_ONLY} \
     --gvcf=${GVCF} \
     --fast_mode=${FAST_MODE} \
diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 57ac44f..449575d 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -85,6 +85,7 @@ export OPENBLAS_NUM_THREADS=1
 export GOTO_NUM_THREADS=1
 export OMP_NUM_THREADS=1
 
+echo $''
 echo "[INFO] Check environment variables"
 ${PYTHON} ${CLAIR3} CheckEnvs \
     --bam_fn ${BAM_FILE_PATH} \
@@ -108,7 +109,14 @@ ${PYTHON} ${CLAIR3} CheckEnvs \
     --ref_pct_full ${REF_PRO} \
     --snp_min_af ${SNP_AF} \
     --indel_min_af ${INDEL_AF}
-readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+
+if [ "$(uname)" = "Darwin" ];
+then
+    mapfile -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+else
+    readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+fi
+
 if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi
 THREADS_LOW=$((${THREADS}*3/4))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi
diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
index 75eb24c..811a10e 100755
--- a/scripts/clair3_c_impl.sh
+++ b/scripts/clair3_c_impl.sh
@@ -85,6 +85,7 @@ export OPENBLAS_NUM_THREADS=1
 export GOTO_NUM_THREADS=1
 export OMP_NUM_THREADS=1
 
+echo $''
 echo "[INFO] Check environment variables"
 ${PYTHON} ${CLAIR3} CheckEnvs \
     --bam_fn ${BAM_FILE_PATH} \
@@ -108,9 +109,16 @@ ${PYTHON} ${CLAIR3} CheckEnvs \
     --ref_pct_full ${REF_PRO} \
     --snp_min_af ${SNP_AF} \
     --indel_min_af ${INDEL_AF}
-readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+
+if [ "$(uname)" = "Darwin" ];
+then
+    mapfile -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+else
+    readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS"
+fi
+
 if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi
-# use all threads here when gpu is enabled?
+
 THREADS_LOW=$((${THREADS}*3/4))
 LONGPHASE_THREADS=$((${THREADS}*1/2))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi

From 40f91216bf4f02620af8ac594039634deaed8405 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 14:34:03 +0800
Subject: [PATCH 29/43] add min_coverage and min_mq option in workflow

---
 clair3/CallVarBam.py     | 2 ++
 scripts/clair3.sh        | 8 +++++++-
 scripts/clair3_c_impl.sh | 8 +++++++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py
index 1f6c073..7759622 100644
--- a/clair3/CallVarBam.py
+++ b/clair3/CallVarBam.py
@@ -180,6 +180,8 @@ def Run(args):
         CommandOption('bed_fn', bed_fn),
         CommandOption('extend_bed', extend_bed),
         CommandOption('sampleName', args.sampleName),
+        CommandOption('minCoverage', args.minCoverage),
+        CommandOption('minMQ', args.minMQ),
         ctgStart,
         ctgEnd,
         chunk_id,
diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 449575d..0ace576 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -7,7 +7,7 @@ set -e
 ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
-snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
+min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
 no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
@@ -44,6 +44,8 @@ while true; do
     --gvcf ) GVCF="$2"; shift 2 ;;
     --snp_min_af ) SNP_AF="$2"; shift 2 ;;
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
+    --min_mq ) MIN_MQ="$2"; shift 2 ;;
+    --min_coverage ) MIN_COV="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --haploid_precise ) HAP_PRE="$2"; shift 2 ;;
@@ -143,6 +145,8 @@ time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_cal
     --fast_mode ${FAST_MODE} \
     --snp_min_af ${SNP_AF} \
     --indel_min_af ${INDEL_AF} \
+    --minMQ ${MIN_MQ} \
+    --minCoverage ${MIN_COV} \
     --call_snp_only ${SNP_ONLY} \
     --gvcf ${GVCF} \
     --enable_long_indel ${ENABLE_LONG_INDEL} \
@@ -244,6 +248,8 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_b
     --add_indel_length \
     --phasing_info_in_bam \
     --gvcf ${GVCF} \
+    --minMQ ${MIN_MQ} \
+    --minCoverage ${MIN_COV} \
     --enable_long_indel ${ENABLE_LONG_INDEL} \
     --python ${PYTHON} \
     --pypy ${PYPY} \
diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
index 811a10e..eaaebca 100755
--- a/scripts/clair3_c_impl.sh
+++ b/scripts/clair3_c_impl.sh
@@ -7,7 +7,7 @@ set -e
 ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
-snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
+min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
 no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
@@ -44,6 +44,8 @@ while true; do
     --gvcf ) GVCF="$2"; shift 2 ;;
     --snp_min_af ) SNP_AF="$2"; shift 2 ;;
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
+    --min_mq ) MIN_MQ="$2"; shift 2 ;;
+    --min_coverage ) MIN_COV="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --haploid_precise ) HAP_PRE="$2"; shift 2 ;;
@@ -145,6 +147,8 @@ time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_cal
     --fast_mode ${FAST_MODE} \
     --snp_min_af ${SNP_AF} \
     --indel_min_af ${INDEL_AF} \
+    --minMQ ${MIN_MQ} \
+    --minCoverage ${MIN_COV} \
     --call_snp_only ${SNP_ONLY} \
     --gvcf ${GVCF} \
     --enable_long_indel ${ENABLE_LONG_INDEL} \
@@ -247,6 +251,8 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_b
     --ctgName {1/.} \
     --add_indel_length \
     --no_phasing_for_fa ${NO_PHASING} \
+    --minMQ ${MIN_MQ} \
+    --minCoverage ${MIN_COV} \
     --phased_vcf_fn ${PHASE_VCF_PATH}/phased_{/.}.vcf.gz \
     --gvcf ${GVCF} \
     --enable_long_indel ${ENABLE_LONG_INDEL} \

From ff50ec99d2bb10c7f688237825b051a270d34c3a Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 15:06:41 +0800
Subject: [PATCH 30/43] allow csi indexing for input BAM

---
 preprocess/CheckEnvs.py | 5 ++++-
 run_clair3.sh           | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py
index 628f700..12f1c0a 100644
--- a/preprocess/CheckEnvs.py
+++ b/preprocess/CheckEnvs.py
@@ -206,7 +206,10 @@ def CheckEnvs(args):
     bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
     ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
     fai_fn = file_path_from(args.ref_fn, suffix=".fai", exit_on_not_found=True, sep='.')
-    bai_fn = file_path_from(args.bam_fn, suffix=".bai", exit_on_not_found=True, sep='.')
+    bai_fn = file_path_from(args.bam_fn, suffix=".bai", sep='.')
+    csi_fn = file_path_from(args.bam_fn, suffix=".csi", sep='.')
+    if bai_fn is None and csi_fn is None:
+        sys.exit(log_error("[ERROR] Neither BAM index file {} nor {} found".format(args.bam_fn + '.bai', args.bam_fn + '.csi')))
     bed_fn = file_path_from(args.bed_fn)
     vcf_fn = file_path_from(args.vcf_fn)
     tree = bed_tree_from(bed_file_path=bed_fn)
diff --git a/run_clair3.sh b/run_clair3.sh
index 5cb5707..132ca9b 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -270,7 +270,7 @@ echo $''
 
 # file check
 if [ ! -f ${BAM_FILE_PATH} ]; then echo -e "${ERROR} BAM file ${BAM_FILE_PATH} not found${NC}"; exit 1; fi
-if [ ! -f ${BAM_FILE_PATH}.bai ] && [ ! -f ${BAM_FILE_PATH%.*}.bai ]; then echo -e "${ERROR} BAM index bai file not found, please use 'samtools index \$BAM' first${NC}"; exit 1; fi
+if [ ! -f ${BAM_FILE_PATH}.bai ] && [ ! -f ${BAM_FILE_PATH%.*}.bai ] && [ ! -f ${BAM_FILE_PATH}.csi ] && [ ! -f ${BAM_FILE_PATH%.*}.csi ]; then echo -e "${ERROR} BAM index bai or csi file not found, please use 'samtools index \$BAM' first${NC}"; exit 1; fi
 if [ ! -f ${REFERENCE_FILE_PATH} ]; then echo -e "${ERROR} Reference file ${REFERENCE_FILE_PATH} not found${NC}"; exit 1; fi
 if [ ! -f ${REFERENCE_FILE_PATH}.fai ] && [ ! -f ${REFERENCE_FILE_PATH%.*}.fai ]; then echo -e "${ERROR} Reference index fai file not found, please use 'samtools faidx \$REF' first${NC}"; exit 1; fi
 

From 9c2736aa18909ff3a5bf395ddd60171abad700e3 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Thu, 31 Mar 2022 15:49:09 +0800
Subject: [PATCH 31/43] platform package conflict with platform option

---
 clair3/CallVarBam.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py
index 7759622..b461ce5 100644
--- a/clair3/CallVarBam.py
+++ b/clair3/CallVarBam.py
@@ -9,7 +9,7 @@
 from time import sleep
 from argparse import ArgumentParser, SUPPRESS
 import logging
-import platform
+from platform import machine, system
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -131,7 +131,7 @@ def Run(args):
         chunk_id = CommandOption('chunk_id', args.chunk_id)
         chunk_num = CommandOption('chunk_num', args.chunk_num)
 
-    if platform.machine() in {"aarch64", "arm64"} or platform.system() == "Darwin":
+    if machine() in {"aarch64", "arm64"} or system() == "Darwin":
         taskSet = ""
     else:
         sched_getaffinity_list = list(os.sched_getaffinity(0))

From 6e18c230339ce0a84a97aa8859378001f47cb2a9 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Sun, 3 Apr 2022 14:03:38 +0800
Subject: [PATCH 32/43] allow longphase phasing when c implement is disabled

---
 scripts/clair3.sh | 47 ++++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 0ace576..275dd22 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -120,7 +120,9 @@ else
 fi
 
 if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi
+
 THREADS_LOW=$((${THREADS}*3/4))
+LONGPHASE_THREADS=$((${THREADS}*1/2))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi
 
 cd ${OUTPUT_FOLDER}
@@ -191,29 +193,32 @@ else
         --ctgName {1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} |& tee ${LOG_PATH}/2_select_hetero_snp.log
 
     echo $''
-    echo "[INFO] 3/7 Phase VCF file using Whatshap"
-    time ${PARALLEL}  --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \
-    "${WHATSHAP} phase \
-        --output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
-        --reference ${REFERENCE_FILE_PATH} \
-        --chromosome {1} \
-        --distrust-genotypes \
-        --ignore-read-groups \
-        ${PHASE_VCF_PATH}/{1}.vcf \
-        ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+    if [ ${USE_LONGPHASE} == True ]
+    then
+        echo "[INFO] 3/7 Phase VCF file using LongPhase"
+        time ${PARALLEL}  --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \
+        "${LONGPHASE} phase\
+            -s  ${PHASE_VCF_PATH}/{1}.vcf \
+            -b ${BAM_FILE_PATH} \
+            -r ${REFERENCE_FILE_PATH} \
+            -t ${LONGPHASE_THREADS} \
+            -o ${PHASE_VCF_PATH}/phased_{1} \
+            --${LP_PLATFORM}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+        ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]}
+    else
+        echo "[INFO] 3/7 Phase VCF file using Whatshap"
+        time ${PARALLEL}  --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \
+        "${WHATSHAP} phase \
+            --output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
+            --reference ${REFERENCE_FILE_PATH} \
+            --chromosome {1} \
+            --distrust-genotypes \
+            --ignore-read-groups \
+            ${PHASE_VCF_PATH}/{1}.vcf \
+            ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+    fi
     ${PARALLEL} -j${THREADS} tabix -f -p vcf ${PHASE_VCF_PATH}/phased_{}.vcf.gz ::: ${CHR[@]}
 
-    echo $''
-    echo "[INFO] 4/7 Haplotag input BAM file using Whatshap"
-    time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_4_haplotag.log -j${THREADS} \
-    "${WHATSHAP} haplotag \
-        --output ${PHASE_BAM_PATH}/{1}.bam \
-        --reference ${REFERENCE_FILE_PATH} \
-        --ignore-read-groups \
-        --regions {1} \
-        ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
-        ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/4_haplotag.log
-    ${PARALLEL} -j${THREADS} ${SAMTOOLS} index -@12 ${PHASE_BAM_PATH}/{1}.bam ::: ${CHR[@]}
 fi
 
 # Full alignment calling

From 3f857c9af9d5a35f93b97b34910aaf969139e0ff Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Sun, 3 Apr 2022 14:20:26 +0800
Subject: [PATCH 33/43] add min_contig_size option

---
 preprocess/CheckEnvs.py  | 10 ++++++++++
 run_clair3.sh            | 32 ++++++++++++++++++++++----------
 scripts/clair3.sh        |  6 ++++--
 scripts/clair3_c_impl.sh |  6 ++++--
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py
index 12f1c0a..7051519 100644
--- a/preprocess/CheckEnvs.py
+++ b/preprocess/CheckEnvs.py
@@ -242,6 +242,7 @@ def CheckEnvs(args):
     ref_pct_full = args.ref_pct_full
     snp_min_af = args.snp_min_af
     indel_min_af = args.indel_min_af
+    min_contig_size = args.min_contig_size
     sample_name = args.sampleName
     contig_name_list = os.path.join(tmp_file_path, 'CONTIGS')
     chunk_list = os.path.join(tmp_file_path, 'CHUNK_LIST')
@@ -319,6 +320,12 @@ def CheckEnvs(args):
             if is_known_vcf_file_provided and contig_name not in contig_set:
                 continue
 
+            if min_contig_size > 0 and contig_length < min_contig_size:
+                print(log_warning(
+                    "[WARNING] {} contig length {} is smaller than minimum contig size {}, will skip it!".format(contig_name, contig_length, min_contig_size)))
+                if contig_name in contig_set:
+                    contig_set.remove(contig_name)
+                continue
             contig_set.add(contig_name)
             contig_length_list.append(contig_length)
             chunk_num = int(
@@ -462,6 +469,9 @@ def main():
     parser.add_argument('--indel_min_af', type=float, default=0.08,
                         help="Minimum Indel allele frequency for a site to be considered as a candidate site, default: %(default)f")
 
+    parser.add_argument('--min_contig_size', type=int, default=0,
+                        help="Minimum contig size; when > 0, contigs shorter than this are skipped, default: %(default)d")
+
     # options for internal process control
     ## The number of chucks to be divided into for parallel processing
     parser.add_argument('--chunk_num', type=int, default=0,
diff --git a/run_clair3.sh b/run_clair3.sh
index 132ca9b..9865bdc 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -50,6 +50,7 @@ print_help_messages()
     echo $'      --fa_model_prefix=STR     EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.'
     echo $'      --min_mq=INT              EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.'
     echo $'      --min_coverage=INT        EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.'
+    echo $'      --min_contig_size=INT     EXPERIMENTAL: If set, contigs with contig size<$min_contig_size are filtered, default: 0.'
     echo $'      --fast_mode               EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.'
     echo $'      --haploid_precise         EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.'
     echo $'      --haploid_sensitive       EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.'
@@ -72,7 +73,7 @@ NC="\\033[0m"
 ARGS=`getopt -o b:f:t:m:p:o:hv \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\
-min_mq::,min_coverage::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\
+min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\
 remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,disable_c_impl,help,version -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
@@ -94,6 +95,7 @@ CHUNK_SIZE=5000000
 QUAL=2
 MIN_MQ=5
 MIN_COV=2
+MIN_CONTIG_SIZE=0
 PHASING_PCT="0"
 PRO="0"
 REF_PRO="0"
@@ -101,8 +103,8 @@ GVCF=False
 PILEUP_ONLY=False
 FAST_MODE=False
 SHOW_REF=False
-SNP_AF="0.08"
-INDEL_AF="0.15"
+SNP_AF="0"
+INDEL_AF="0"
 HAP_PRE=False
 HAP_SEN=False
 SNP_ONLY=False
@@ -210,6 +212,11 @@ if [ "${PLATFORM}" != "ont" ] && [ "${REF_PRO}" = "0" ]; then REF_PRO=0.3; fi
 if [ "${PLATFORM}" = "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.7; fi
 if [ "${PLATFORM}" != "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.3; fi
 
+# set default af for ilmn and hifi and ont
+if [ "${SNP_AF}" = "0" ]; then SNP_AF=0.08; fi
+if [ "${PLATFORM}" = "ont" ] && [ "${INDEL_AF}" = "0" ]; then INDEL_AF=0.15; fi
+if [ "${PLATFORM}" != "ont" ] && [ "${INDEL_AF}" = "0" ]; then INDEL_AF=0.08; fi
+
 # show default high quality hete variant proportion for whatshap phasing, 0.8 for ont guppy5 and 0.7 for others
 if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi
 BASE_MODEL=$(basename ${MODEL_PATH})
@@ -245,13 +252,14 @@ echo "[INFO] WHATSHAP PATH: ${WHATSHAP}"
 echo "[INFO] LONGPHASE PATH: ${LONGPHASE}"
 echo "[INFO] CHUNK SIZE: ${CHUNK_SIZE}"
 if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi
+if [ ${MIN_CONTIG_SIZE} -gt 0 ]; then echo "[INFO] MIN CONTIG SIZE: ${MIN_CONTIG_SIZE}"; fi
 echo "[INFO] FULL ALIGN PROPORTION: ${PRO}"
 echo "[INFO] FULL ALIGN REFERENCE PROPORTION: ${REF_PRO}"
 echo "[INFO] PHASING PROPORTION: ${PHASING_PCT}"
 echo "[INFO] MINIMUM MQ: ${MIN_MQ}"
 echo "[INFO] MINIMUM COVERAGE: ${MIN_COV}"
-if [ "${SNP_AF}" != "0" ]; then echo "[INFO] USER DEFINED SNP THRESHOLD: ${SNP_AF}"; fi
-if [ "${INDEL_AF}" != "0" ]; then echo "[INFO] USER DEFINED INDEL THRESHOLD: ${INDEL_AF}"; fi
+echo "[INFO] SNP AF THRESHOLD: ${SNP_AF}"
+echo "[INFO] INDEL AF THRESHOLD: ${INDEL_AF}"
 echo "[INFO] ENABLE FILEUP ONLY CALLING: ${PILEUP_ONLY}"
 echo "[INFO] ENABLE FAST MODE CALLING: ${FAST_MODE}"
 echo "[INFO] ENABLE CALLING SNP CANDIDATES ONLY: ${SNP_ONLY}"
@@ -290,10 +298,6 @@ if [ ! -z ${MAX_ULIMIT_THREADS} ]; then PER_ULIMIT_THREADS=$((${MAX_ULIMIT_THREA
 if [[ ${PER_ULIMIT_THREADS} < 1 ]]; then PER_ULIMIT_THREADS=1; fi
 if [ "${MAX_ULIMIT_THREADS}" != "unlimited" ] && [[ ${THREADS} -gt ${PER_ULIMIT_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum ulimit threads ${THREADS} * 30 > ${MAX_ULIMIT_THREADS} (ulimit -u), set threads=${PER_ULIMIT_THREADS}${NC}"; THREADS=${PER_ULIMIT_THREADS}; fi
 
-# min mapping quality and min coverage detection
-if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi
-if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi
-if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi
 
 # platform check
 if [ ! ${PLATFORM} = "ont" ] && [ ! ${PLATFORM} = "hifi" ] && [ ! ${PLATFORM} = "ilmn" ]; then echo -e "${ERROR} Invalid platform input, optional: {ont, hifi, ilmn}${NC}"; exit 1; fi
@@ -319,14 +323,21 @@ if [ -z ${PILEUP_PREFIX} ]; then echo -e "${ERROR} Use '--pileup_model_prefix=ST
 if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' instead of '--fa_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi
 if [ -z ${MIN_MQ} ]; then echo -e "${ERROR} Use '--min_mq=INT' instead of '--min_mq INT' for optional parameters${NC}"; exit 1 ; fi
 if [ -z ${MIN_COV} ]; then echo -e "${ERROR} Use '--min_coverage=INT' instead of '--min_coverage INT' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${MIN_CONTIG_SIZE} ]; then echo -e "${ERROR} Use '--min_contig_size=INT' instead of '--min_contig_size INT' for optional parameters${NC}"; exit 1 ; fi
 if [ -z ${LONGPHASE} ]; then echo -e "${ERROR} Use '--longphase=STR' instead of '--longphase STR' for optional parameters${NC}"; exit 1 ; fi
 
+# min mapping quality, min coverage and min contig size detection
+if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi
+if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi
+if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi
+if [[ ! ${MIN_CONTIG_SIZE} =~ ^[\-0-9]+$ ]] || (( ${MIN_CONTIG_SIZE} < 0)); then echo -e "${WARNING} Invalid minimum contig size --min_contig_size>=0 ${NC}"; MIN_CONTIG_SIZE=0; fi
+
 # model prefix detection
 if [ ! -f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi
 if [ ! -f ${MODEL_PATH}/${FA_PREFIX}.index ]; then echo -e "${ERROR} No full-alignment model found in provided model path and model prefix ${MODEL_PATH}/${FA_PREFIX} ${NC}"; exit 1; fi
 
 CLAIR3_SCRIPT="clair3.sh"
-if [ "${ENABLE_C_IMPL}" == True ] && [ ! ${PLATFORM} = "ilmn" ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi
+if [ "${ENABLE_C_IMPL}" == True ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi
 
 set -x
 ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \
@@ -355,6 +366,7 @@ ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \
     --indel_min_af=${INDEL_AF} \
     --min_mq=${MIN_MQ} \
     --min_coverage=${MIN_COV} \
+    --min_contig_size=${MIN_CONTIG_SIZE} \
     --pileup_only=${PILEUP_ONLY} \
     --gvcf=${GVCF} \
     --fast_mode=${FAST_MODE} \
diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 275dd22..8259fd2 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -7,7 +7,7 @@ set -e
 ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
-min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
+min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
 no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
@@ -46,6 +46,7 @@ while true; do
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
     --min_mq ) MIN_MQ="$2"; shift 2 ;;
     --min_coverage ) MIN_COV="$2"; shift 2 ;;
+    --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --haploid_precise ) HAP_PRE="$2"; shift 2 ;;
@@ -110,7 +111,8 @@ ${PYTHON} ${CLAIR3} CheckEnvs \
     --var_pct_full ${PRO} \
     --ref_pct_full ${REF_PRO} \
     --snp_min_af ${SNP_AF} \
-    --indel_min_af ${INDEL_AF}
+    --indel_min_af ${INDEL_AF} \
+    --min_contig_size ${MIN_CONTIG_SIZE}
 
 if [ "$(uname)" = "Darwin" ];
 then
diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
index eaaebca..a4b8f79 100755
--- a/scripts/clair3_c_impl.sh
+++ b/scripts/clair3_c_impl.sh
@@ -7,7 +7,7 @@ set -e
 ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
 -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
 bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
-min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
+min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
 no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"`
 
 if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi
@@ -46,6 +46,7 @@ while true; do
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
     --min_mq ) MIN_MQ="$2"; shift 2 ;;
     --min_coverage ) MIN_COV="$2"; shift 2 ;;
+    --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --haploid_precise ) HAP_PRE="$2"; shift 2 ;;
@@ -110,7 +111,8 @@ ${PYTHON} ${CLAIR3} CheckEnvs \
     --var_pct_full ${PRO} \
     --ref_pct_full ${REF_PRO} \
     --snp_min_af ${SNP_AF} \
-    --indel_min_af ${INDEL_AF}
+    --indel_min_af ${INDEL_AF} \
+    --min_contig_size ${MIN_CONTIG_SIZE}
 
 if [ "$(uname)" = "Darwin" ];
 then

From effe8c346de525ec25c624703eb01d8029560e09 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Sun, 3 Apr 2022 14:21:01 +0800
Subject: [PATCH 34/43] add min_contig_size in main entry

---
 run_clair3.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run_clair3.sh b/run_clair3.sh
index 9865bdc..026c80b 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -147,6 +147,7 @@ while true; do
     --indel_min_af ) INDEL_AF="$2"; shift 2 ;;
     --min_mq ) MIN_MQ="$2"; shift 2 ;;
     --min_coverage ) MIN_COV="$2"; shift 2 ;;
+    --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;;
     --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
     --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
     --gvcf ) GVCF=True; shift 1 ;;

From 5d4949756885e401a21a0f17f9d7e7630ef249f0 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Sun, 3 Apr 2022 14:21:59 +0800
Subject: [PATCH 35/43] add longphase platform option for pacbio hifi and ont

---
 scripts/clair3.sh        | 2 ++
 scripts/clair3_c_impl.sh | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index 8259fd2..71a7e9a 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -126,6 +126,8 @@ if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0
 THREADS_LOW=$((${THREADS}*3/4))
 LONGPHASE_THREADS=$((${THREADS}*1/2))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi
+if [[ ${LONGPHASE_THREADS} < 1 ]]; then LONGPHASE_THREADS=1; fi
+if [ "${PLATFORM}" = "ont" ]; then LP_PLATFORM="ont"; else LP_PLATFORM="pb"; fi
 
 cd ${OUTPUT_FOLDER}
 # Pileup calling
diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh
index a4b8f79..bee42f7 100755
--- a/scripts/clair3_c_impl.sh
+++ b/scripts/clair3_c_impl.sh
@@ -127,6 +127,7 @@ THREADS_LOW=$((${THREADS}*3/4))
 LONGPHASE_THREADS=$((${THREADS}*1/2))
 if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi
 if [[ ${LONGPHASE_THREADS} < 1 ]]; then LONGPHASE_THREADS=1; fi
+if [ "${PLATFORM}" = "ont" ]; then LP_PLATFORM="ont"; else LP_PLATFORM="pb"; fi
 
 cd ${OUTPUT_FOLDER}
 # Pileup calling
@@ -204,7 +205,7 @@ else
             -r ${REFERENCE_FILE_PATH} \
             -t ${LONGPHASE_THREADS} \
             -o ${PHASE_VCF_PATH}/phased_{1} \
-            --ont" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
+            --${LP_PLATFORM}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log
         ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]}
     else
         echo "[INFO] 3/7 Phase VCF file using Whatshap"

From 8d9af7bc1875c8205903fdc9203e42a768aaffa1 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Sun, 3 Apr 2022 17:28:31 +0800
Subject: [PATCH 36/43] zlib is not used in full-alignment

---
 src/clair3_full_alignment.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c
index ea70418..1e1fd2e 100644
--- a/src/clair3_full_alignment.c
+++ b/src/clair3_full_alignment.c
@@ -15,7 +15,6 @@
 #include "medaka_common.h"
 #include "medaka_khcounter.h"
 #include "clair3_full_alignment.h"
-#include "zlib.h"
 #include "levenshtein.h"
 
 typedef struct Pos_alt_info

From c497c8a5a59186b8d3e10d9872ad46d3c5f352a7 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 16:22:48 +0800
Subject: [PATCH 37/43] update dockerfile with c implement

---
 Dockerfile | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c3e2447..55e8e64 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,11 +34,15 @@ RUN /bin/bash -c "source activate clair3" && \
     pip install tensorflow-cpu==2.2.0 && \
     pip install tensorflow-addons==0.11.2 tables==3.6.1 && \
     conda install -c anaconda pigz==2.4 -y && \
+    conda install -c anaconda cffi==1.14.4 -y && \
     conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y && \
     conda install -c conda-forge -c bioconda samtools=1.10 -y && \
     conda install -c conda-forge -c bioconda whatshap=1.0 -y && \
+    conda install -c conda-forge xz zlib bzip2 -y && \
+    conda install -c conda-forge automake curl -y && \
     rm -rf /opt/conda/pkgs/* && \
-    rm -rf /root/.cache/pip
+    rm -rf /root/.cache/pip && \
+    echo "source activate clair3" > ~/.bashrc
 
 COPY . .
 
@@ -48,4 +52,6 @@ RUN cd /opt/bin/preprocess/realign && \
     wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz -P /opt/models && \
     tar -zxvf /opt/models/clair3_models.tar.gz -C /opt/models && \
     rm /opt/models/clair3_models.tar.gz && \
-    echo "source activate clair3" > ~/.bashrc
\ No newline at end of file
+    cd /opt/bin && \
+    make PREFIX=/opt/conda/envs/clair3 PYTHON=/opt/conda/envs/clair3/bin/python && \
+    rm -rf /opt/bin/samtools-* /opt/bin/longphase-*
\ No newline at end of file

From 34b49a8b8d5fd017559289dd16a84b784be3481b Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 16:23:23 +0800
Subject: [PATCH 38/43] use absolute path for script path

---
 run_clair3.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/run_clair3.sh b/run_clair3.sh
index 026c80b..3da33bd 100755
--- a/run_clair3.sh
+++ b/run_clair3.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 SCRIPT_NAME=$(basename "$0")
-SCRIPT_PATH=`dirname "$0"`
+SCRIPT_PATH=$(dirname $(readlink -f "$0"))
 VERSION='v0.1-r11'
-Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]"
+Usage="Usage: ${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]"
 
 set -e
 #./run_clair3.sh -b tmp.bam -f ref.fasta -t 32 -o tmp -p ont -m model_path
@@ -50,7 +50,7 @@ print_help_messages()
     echo $'      --fa_model_prefix=STR     EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.'
     echo $'      --min_mq=INT              EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.'
     echo $'      --min_coverage=INT        EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.'
-    echo $'      --min_contig_size=INT     EXPERIMENTAL: If set, contigs with contig size<=$min_contig_size are filtered, default: 0.'
+    echo $'      --min_contig_size=INT     EXPERIMENTAL: If set, contigs with contig size<$min_contig_size are filtered, default: 0.'
     echo $'      --fast_mode               EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.'
     echo $'      --haploid_precise         EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.'
     echo $'      --haploid_sensitive       EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.'

From 72fa01c4e94ff6e99d94dedf33bc775e7c75ae9c Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 16:27:26 +0800
Subject: [PATCH 39/43] add deflate and extra_link_args to link dynamic
 libraries

---
 build.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/build.py b/build.py
index 75fee91..52c0cf8 100644
--- a/build.py
+++ b/build.py
@@ -6,11 +6,19 @@
 
 samver = "1.10"
 file_directory = os.path.dirname(os.path.realpath(__file__))
-htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))
+htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))
 
-libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
-library_dirs=[htslib_dir]
-src_dir=os.path.join(file_directory, 'src')
+libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto', 'deflate']
+
+try:
+    conda_path = os.environ['CONDA_PREFIX']
+    extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)]
+except:
+    print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!")
+    extra_link_args = []
+
+library_dirs = [htslib_dir]
+src_dir = os.path.join(file_directory, 'src')
 
 extra_compile_args = ['-std=c99', '-O3']
 if platform.machine() in {"aarch64", "arm64"}:
@@ -45,6 +53,7 @@
             'clair3_pileup.c',
             'clair3_full_alignment.c')],
     extra_compile_args=extra_compile_args,
+    extra_link_args=extra_link_args,
     extra_objects=['libhts.a']
 )
 

From 1a8a0884a456b83c7fa8a03e313634ed68a44dbd Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 17:34:41 +0800
Subject: [PATCH 40/43] update Makefile

---
 Makefile | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 0f13b7d..ab1c33a 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,15 @@ PYTHON ?= python3
 all : libhts.a longphase libclair3.so
 clean : clean_htslib clean_longphase clean_libclair3
 
-SAMVER=1.10
-LPVER=1.0
+SAMVER	=	1.10
+LPVER	=	1.0
+GCC	?=	gcc
+GXX	?=	g++
+PREFIX	?=	${CONDA_PREFIX}
+LDFLAGS	=	-L ${PREFIX}/lib
+CFLAGS	= -fpic -std=c99 -O3 -I ${PREFIX}/include -L ${PREFIX}/lib
+CPPFLAGS	=	-std=c++11 -Wall -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib
+LP_CPPFLAGS	 =	-std=c++11 -Wall -g -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib
 
 samtools-$(SAMVER)/Makefile:
 		curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \
@@ -17,7 +24,7 @@ samtools-$(SAMVER)/Makefile:
 libhts.a: samtools-$(SAMVER)/Makefile
 	# this is required only to add in -fpic so we can build python module
 	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
-	cd samtools-${SAMVER}/htslib-${SAMVER}/ && CFLAGS="-fpic -std=c99 -O3" ./configure && make
+	cd samtools-${SAMVER}/htslib-${SAMVER}; CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" ./configure; make CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}"
 	cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@
 
 
@@ -28,7 +35,7 @@ longphase-$(LPVER)/Makefile:
 
 longphase: longphase-$(LPVER)/Makefile
 	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
-	cd longphase-${LPVER} && autoreconf -i && ./configure && make -j4
+	cd longphase-${LPVER}; autoreconf -i; CPPFLAGS="${CPPFLAGS}" ./configure; make CC=${GCC} CXX=${GXX} CPPFLAGS="${CPPFLAGS}"
 	cp longphase-${LPVER}/$@ $@
 
 
@@ -40,10 +47,12 @@ libclair3.so: samtools-${SAMVER}/htslib-${SAMVER}
 clean_htslib:
 	cd samtools-${SAMVER} && make clean || exit 0
 	cd samtools-${SAMVER}/htslib-${SAMVER} && make clean || exit 0
+	rm libhts.a
 
 .PHONY: clean_longphase
 clean_longphase:
 	cd longphase-${LPVER} && make clean || exit 0
+	rm longphase
 
 .PHONY: clean_libclair3
 clean_libclair3:

From 6294e0c85f5d342f9bb4378c6adbeeebe3ea9c2e Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 17:35:37 +0800
Subject: [PATCH 41/43] set deflate as an option for arm64

---
 build.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/build.py b/build.py
index 52c0cf8..eff5464 100644
--- a/build.py
+++ b/build.py
@@ -8,15 +8,8 @@
 file_directory = os.path.dirname(os.path.realpath(__file__))
 htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))
 
-libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto', 'deflate']
-
-try:
-    conda_path = os.environ['CONDA_PREFIX']
-    extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)]
-except:
-    print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!")
-    extra_link_args = []
-
+libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
+extra_link_args = []
 library_dirs = [htslib_dir]
 src_dir = os.path.join(file_directory, 'src')
 
@@ -28,6 +21,12 @@
         extra_compile_args.append("-march=armv8-a+simd")
 else:
     extra_compile_args.append("-mtune=haswell")
+    libraries.append('deflate')
+    try:
+        conda_path = os.environ['CONDA_PREFIX']
+        extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)]
+    except:
+        print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!")
 
 ffibuilder = FFI()
 ffibuilder.set_source("libclair3",

From 0e1f64914649b0c573cef8214a3d3fb8686d8fac Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 17:37:28 +0800
Subject: [PATCH 42/43] update Readme

---
 README.md | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5d23501..d554517 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,8 @@ A short preprint describing Clair3's algorithms and results is at [bioRxiv](http
 
 ## Latest Updates
 
+*v0.1-r11 (Apr 4)* : 1. Variant calling ~2.5x faster than `v0.1-r10` tested with ONT Q20 data, with feature generation in both pileup and full-alignment now implemented in C (co-contributors @[cjw85](https://github.com/cjw85), @[ftostevin-ont](https://github.com/ftostevin-ont), @[EpiSlim](https://github.com/EpiSlim)). 2. Added the lightning-fast [longphase](https://github.com/twolinin/longphase) as an option for phasing. Enable `longphase` with the option `--longphase_for_phasing`. The new option is disabled by default to align with the default behavior of the previous versions, but we recommend enabling it when calling human variants with ≥20x long reads. 3. Added `--min_coverage` and `--min_mq` options ([#83](https://github.com/HKU-BAL/Clair3/issues/83)). 4. Added `--min_contig_size` option to skip calling variants in short contigs when using genome assembly as input. 5. Reads haplotagging after phasing and before full-alignment calling is now integrated into full-alignment calling to avoid generating an intermediate BAM file. 6. Supported `.csi` BAM index for large references ([#90](https://github.com/HKU-BAL/Clair3/issues/90)). For more speedup details, please check [Notes on r11](docs/v0.1_r11_speedup.md).
+
 *v0.1-r10 (Jan 13)* : 1. Added a new ONT Guppy5 model  (`r941_prom_sup_g5014`). Click [here](docs/guppy5_20220113.md) for some benchmarking results. This `sup` model is also applicable to reads called using the `hac` and `fast` mode. The old `r941_prom_sup_g506` model that was fine-tuned from the Guppy3,4 model is obsoleted. 2. Added `--var_pct_phasing` option to control the percentage of top ranked heterozygous pile-up variants used for WhatsHap phasing.
 
 *v0.1-r9 (Dec 1)* : Added the `--enable_long_indel` option to output indel variant calls >50bp ([#64](https://github.com/HKU-BAL/Clair3/issues/64)), Click [here](https://github.com/HKU-BAL/Clair3/blob/main/docs/indel_gt50_performance.md) to see more benchmarking results.
@@ -267,18 +269,19 @@ pypy3 -m pip install mpmath==1.2.1
 # install python packages in environment
 pip3 install tensorflow==2.2.0
 pip3 install tensorflow-addons==0.11.2 tables==3.6.1
-conda install -c anaconda pigz==2.4 -y
+conda install -c anaconda pigz==2.4 cffi==1.14.4 -y
 conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y
 conda install -c conda-forge -c bioconda samtools=1.10 -y
 conda install -c conda-forge -c bioconda whatshap=1.0 -y
-
+conda install -c conda-forge xz zlib bzip2 automake curl -y
+    
 # clone Clair3
 git clone https://github.com/HKU-BAL/Clair3.git
 cd Clair3
 
 # compile samtools, longphase and cffi library for c implement
 # after building, longphase binary is in `Clair3` folder
-python3 build.py
+source activate clair3 && make PREFIX=${CONDA_PREFIX}
 
 # download pre-trained models
 mkdir models
@@ -364,12 +367,15 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
       --pypy=STR                Path of pypy3, pypy3 >= 3.6 is required.
       --parallel=STR            Path of parallel, parallel >= 20191122 is required.
       --whatshap=STR            Path of whatshap, whatshap >= 1.0 is required.
+      --longphase=STR           Path of longphase, longphase >= 1.0 is required.
       --chunk_size=INT          The size of each chuck for parallel processing, default: 5Mbp.
       --pileup_only             Use the pileup model only when calling, default: disable.
       --print_ref_calls         Show reference calls (0/0) in vcf file, default: disable.
       --include_all_ctgs        Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.
       --gvcf                    Enable GVCF output, default: disable.
       --enable_phasing          Output phased variants using whatshap, default: disable.
+      --longphase_for_phasing   Use longphase for phasing, default: disable.
+      --disable_c_impl          Disable the C implementation (via cffi) of pileup and full-alignment tensor creation, default: disable.
       --remove_intermediate_dir Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results. default: disable.
       --snp_min_af=FLOAT        Minimum SNP AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.
       --indel_min_af=FLOAT      Minimum INDEL AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.
@@ -378,6 +384,9 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
       --var_pct_phasing=FLOAT   EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.
       --pileup_model_prefix=STR EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index. default: pileup.
       --fa_model_prefix=STR     EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.
+      --min_mq=INT              EXPERIMENTAL: If set, reads with mapping quality <$min_mq are filtered, default: 5.
+      --min_coverage=INT        EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.
+      --min_contig_size=INT     EXPERIMENTAL: If set, contigs with contig size<$min_contig_size are filtered, default: 0.
       --fast_mode               EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.
       --haploid_precise         EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.
       --haploid_sensitive       EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.

From 2564a1a37a8c175bde120e97df23a00419c77ac5 Mon Sep 17 00:00:00 2001
From: zxzheng <zxzheng@cs.hku.hk>
Date: Mon, 4 Apr 2022 17:38:12 +0800
Subject: [PATCH 43/43] add document for c implement and longphase speedup

---
 docs/v0.1_r11_speedup.md | 41 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 docs/v0.1_r11_speedup.md

diff --git a/docs/v0.1_r11_speedup.md b/docs/v0.1_r11_speedup.md
new file mode 100644
index 0000000..f584aab
--- /dev/null
+++ b/docs/v0.1_r11_speedup.md
@@ -0,0 +1,41 @@
+# Notes on v0.1-r11
+
+We focused on speedup in `v0.1-r11`. We tried a few techniques and listed those that worked as follows.
+
+1.  **C implementation for pileup and full-alignment feature generation.**  Before r11, feature generation (tensor creation) in Clair3 was sped up using pypy on python code. The speedup was ~10x over native python. The practice balanced speed and ease of coding in the developmental stage of Clair3. In r11, we added C implementation, bringing another ~2-3 times speedup over pypy. The C code is integrated with the other python parts using CFFI (C Foreign Function Interface). The variants called with the new C implementation are identical to the previous version. 
+2. **Use longphase for phasing.**  [longphase](https://github.com/twolinin/longphase) by [Lin et al.](https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btac058/6519151) is an ultra-fast chromosome-scale phasing algorithm for small and large variants. In our experiments, longphase took ~3 minutes to phase 69x Q20 ONT WGS with 24 CPU cores and no I/O bound, faster than `whatshap`, which took 52 minutes. To enable using longphase for phasing, please use the `--longphase_for_phasing` option. Our suggestions on when to enable longphase are shown in the section below.
+3. **Haplotagging on the fly.**  Whatshap `haplotag` was used to add an `HP` tag to each read after phasing. This process writes out a new BAM, which is I/O intensive and, in fact, unnecessary. In r11, we implemented haplotagging to feed tagged reads directly to full-alignment calling. We used the exact logic that was implemented in whatshap's haplotag module. This technique, regardless of whether whatshap or longphase is used, saves 10-20 minutes or more on compressing, writing and reading a new BAM.
+
+We benchmarked r11 against r10 with [69x Q20 ONT HG002 data](https://labs.epi2me.io/gm24385_q20_2021.10). 24 CPU cores with minimal I/O speed limit were used. The results are as follows. With C implementation and longphase enabled, the total runtime was reduced from 234 to 101 minutes.
+
+| Implementation     | Sample            | CPU cores | Inference hardware | Total runtime | Pileup runtime | Phasing runtime | Full-alignment runtime |
+| ------------------ | ----------------- | --------- | ------------------ | ------------- | -------------- | --------------- | ---------------------- |
+| c\_impl, longphase | HG002 WGS Q20 69x | 24        | CPU                | 101m          | 38m            | 3m              | 56m                    |
+| v0.1-r10, whatshap | HG002 WGS Q20 69x | 24        | CPU                | 234m          | 57m            | 52m             | 118m                   |
+
+----
+
+## When to use `longphase` (to replace `whatshap`)
+
+`longphase` is **not** enabled by default. We suggest enabling `longphase` through the `--longphase_for_phasing` option when calling variants in human with ≥20x of data. **Use `whatshap` with non-human samples or insufficient depth.**
+
+Benchmarks between using longphase and whatshap on HG003 WGS ONT Guppy5 with five depths from 10x to 50x are as follows.
+
+| Phasing algorithm | Depth | SNP-Precision | SNP-Recall | SNP-F1 | Indel-Precision | Indel-Recall | Indel-F1 |
+| ----------------- | ----- | ------------- | ---------- | ------ | --------------- | ------------ | -------- |
+| longphase         | 10x   | 96.75%        | 93.94%     | 95.32% | 82.86%          | 47.30%       | 60.22%   |
+| whatshap          | 10x   | 95.87%        | 96.64%     | 96.26% | 83.37%          | 47.50%       | 60.52%   |
+| longphase         | 20x   | 99.22%        | 99.27%     | 99.25% | 88.49%          | 62.22%       | 73.07%   |
+| whatshap          | 20x   | 99.21%        | 99.36%     | 99.28% | 88.75%          | 60.47%       | 71.93%   |
+| longphase         | 30x   | 99.50%        | 99.60%     | 99.55% | 90.63%          | 68.39%       | 77.96%   |
+| whatshap          | 30x   | 99.50%        | 99.61%     | 99.56% | 90.61%          | 66.52%       | 76.72%   |
+| longphase         | 40x   | 99.59%        | 99.67%     | 99.63% | 91.69%          | 72.34%       | 80.87%   |
+| whatshap          | 40x   | 99.60%        | 99.70%     | 99.65% | 91.71%          | 72.39%       | 80.91%   |
+| longphase         | 50x   | 99.63%        | 99.70%     | 99.66% | 92.17%          | 75.29%       | 82.88%   |
+| whatshap          | 50x   | 99.62%        | 99.70%     | 99.66% | 91.59%          | 73.66%       | 81.65%   |
+
+---
+
+## Use the old python-based feature generation code (to disable the new C implementation)
+
+The new C implementation generates results identical to the previous version. However, we retained the old python-based feature generation code for benchmarking or backward-compatibility purposes. Users can use it through the `--disable_c_impl` option.