From 2c1dbef55e47157bd03c695cef363557925596a7 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 12:52:58 +0800 Subject: [PATCH 01/43] pileup and full-alignment create tensor c implement, pileup followed medaka pileup count calculation style, full-alignemnt integrated read-haplotagging function --- src/clair3_full_alignment.c | 910 ++++++++++++++++++++++++++++++++++++ src/clair3_full_alignment.h | 257 ++++++++++ src/clair3_pileup.c | 462 ++++++++++++++++++ src/clair3_pileup.h | 105 +++++ src/khash.h | 627 +++++++++++++++++++++++++ src/kvec.h | 90 ++++ src/levenshtein.c | 72 +++ src/levenshtein.h | 17 + src/medaka_bamiter.c | 72 +++ src/medaka_bamiter.h | 37 ++ src/medaka_common.c | 99 ++++ src/medaka_common.h | 60 +++ src/medaka_khcounter.c | 135 ++++++ src/medaka_khcounter.h | 53 +++ 14 files changed, 2996 insertions(+) create mode 100644 src/clair3_full_alignment.c create mode 100644 src/clair3_full_alignment.h create mode 100644 src/clair3_pileup.c create mode 100644 src/clair3_pileup.h create mode 100644 src/khash.h create mode 100644 src/kvec.h create mode 100644 src/levenshtein.c create mode 100644 src/levenshtein.h create mode 100644 src/medaka_bamiter.c create mode 100644 src/medaka_bamiter.h create mode 100644 src/medaka_common.c create mode 100644 src/medaka_common.h create mode 100644 src/medaka_khcounter.c create mode 100644 src/medaka_khcounter.h diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c new file mode 100644 index 0000000..06f6d07 --- /dev/null +++ b/src/clair3_full_alignment.c @@ -0,0 +1,910 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/sam.h" +#include "htslib/faidx.h" +#include "khash.h" +#include "kvec.h" +#include "medaka_bamiter.h" +#include "medaka_common.h" +#include "medaka_khcounter.h" +#include "clair3_full_alignment.h" +#include "zlib.h" +#include "levenshtein.h" + +typedef struct Pos_alt_info +{ + + khash_t(KH_COUNTER) * 
ins_counter; + khash_t(KH_INT_COUNTER) * del_counter; + size_t acgt_count[4]; + size_t depth; + +} Pos_alt_info; + +int com_func(const void *a, const void *b) +{ + + return (*(size_t *)a - *(size_t *)b); +} + +int hap_cmp(const void *x, const void *y) +{ + + HAP a = *(HAP *)x; + HAP b = *(HAP *)y; + if (a.haplotype < b.haplotype) + return -1; + else if (a.haplotype > b.haplotype) + return 1; + else + return (a.read_index - b.read_index); +} + +void destroy_fa_data(fa_data data) +{ + + free(data->matrix); + for (size_t i = 0; i < data->candidates_num; i++) + { + free(data->all_alt_info[i]); + } + free(data->all_alt_info); + free(data); +} + +void sort_read_name_by_haplotype(HAP *read_hap_array, int *matrix_read_index_array, size_t n) +{ + + size_t read_num = min(n, matrix_depth); + if (n > matrix_depth) + { + // shuffle the read index array with the same random seed + for (size_t i = 0; i < n - 1; i++) + { + size_t j = i + rand() / (RAND_MAX / (n - i) + 1); + size_t tmp_read_index = read_hap_array[j].read_index; + size_t tmp_haplotype = read_hap_array[j].haplotype; + read_hap_array[j].read_index = read_hap_array[i].read_index; + read_hap_array[j].haplotype = read_hap_array[i].haplotype; + read_hap_array[i].read_index = tmp_read_index; + read_hap_array[i].haplotype = tmp_haplotype; + } + } + + qsort(read_hap_array, read_num, sizeof(HAP), hap_cmp); + + // if need padding (overlap read num < matrix depth), add -1 to fill the depth + if (n < matrix_depth) + { + size_t padding_depth = matrix_depth - read_num; + size_t prefix_padding_depth = padding_depth >> 1; + size_t suffix_padding_depth = padding_depth - prefix_padding_depth; + for (size_t i = 0; i < prefix_padding_depth; i++) + matrix_read_index_array[i] = -1; + for (size_t i = 0; i < read_num; i++) + matrix_read_index_array[i + prefix_padding_depth] = read_hap_array[i].read_index; + for (size_t i = 0; i < suffix_padding_depth; i++) + matrix_read_index_array[read_num + prefix_padding_depth + i] = -1; + } + else + { + 
for (size_t i = 0; i < matrix_depth; i++) + matrix_read_index_array[i] = read_hap_array[i].read_index; + } +} + +void cigar_prefix_length(uint32_t *cigartuples, size_t reference_bases, size_t *ref_bases, size_t *query_bases, size_t left_cigar_index, size_t right_cigar_index, size_t consumed, bool reverse) +{ + + size_t ref_pos = 0; + size_t query_pos = 0; + for (size_t i = left_cigar_index; i < right_cigar_index; i++) + { + size_t index = reverse ? left_cigar_index + right_cigar_index - i - 1 : i; + size_t cigar_op = bam_cigar_op(cigartuples[index]); + size_t length = bam_cigar_oplen(cigartuples[index]); + + length = i == left_cigar_index ? consumed : length; + if (length == 0) + continue; + + if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF) + { + query_pos += length; + ref_pos += length; + if (ref_pos >= reference_bases) + { + *ref_bases = reference_bases; + *query_bases = query_pos + reference_bases - ref_pos; + return; + } + } + else if (cigar_op == BAM_CDEL) + { + ref_pos += length; + if (ref_pos >= reference_bases) + { + *ref_bases = reference_bases; + *query_bases = query_pos; + return; + } + } + else if (cigar_op == BAM_CINS) + { + query_pos += length; + } + else if (cigar_op == BAM_CREF_SKIP) + { + *ref_bases = reference_bases; + *query_bases = query_pos; + return; + } + } +} + +char *get_ref_seq(char *ref_seq, size_t start, size_t end) +{ + + size_t seq_size = end - start; + char *sub_seq = malloc((seq_size + 1)); + strncpy(sub_seq, ref_seq + start, seq_size); + sub_seq[seq_size] = '\0'; + return sub_seq; +} + +size_t get_read_end(uint32_t *cigartuples, size_t n_cigar, size_t read_start) +{ + + size_t ref_pos = read_start; + for (size_t i = 0; i < n_cigar; i++) + { + size_t cigar_op = bam_cigar_op(cigartuples[i]); + size_t length = bam_cigar_oplen(cigartuples[i]); + if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF || cigar_op == BAM_CDEL || cigar_op == BAM_CREF_SKIP) + { + ref_pos += length; + } + 
} + return ref_pos; +} + +char *get_query_seq(uint8_t *seqi, size_t start, size_t end) +{ + + size_t seq_size = end - start; + char *sub_seq = malloc((seq_size + 1)); + for (size_t i = 0; i < seq_size; i++) + { + sub_seq[i] = seq_nt16_str[bam_seqi(seqi, start + i)]; + } + sub_seq[seq_size] = '\0'; + return sub_seq; +} + +void update_haplotype_cost(int allele, int phase_set, int genotype, khash_t(KH_INT_COUNTER) * haplotype_cost) +{ + + if (allele == 0) + return; + + if (allele == genotype) + { + kh_int_counter_add(haplotype_cost, phase_set, 1); + } + else + { + kh_int_counter_add(haplotype_cost, phase_set, -1); + } +} + +int realign_read(Variant *variant, Read *read, size_t i, size_t consumed, size_t query_pos, char *reference, size_t ref_start) +{ + + uint32_t *cigartuples = read->cigartuples; + uint8_t *seqi = read->seqi; + size_t n_cigar = read->n_cigar; + size_t middle_op = bam_cigar_op(cigartuples[i]); + size_t middle_length = bam_cigar_oplen(cigartuples[i]); + size_t left_consumed = consumed > 0 ? consumed : 0; + size_t right_consumed = consumed < middle_length ? 
middle_length - consumed : 0; + size_t left_ref_bases = 0; + size_t left_query_bases = 0; + size_t right_ref_bases = 0; + size_t right_query_bases = 0; + size_t left_cigar_size = i + 1; + size_t right_cigar_size = i; + + cigar_prefix_length(cigartuples, overhang, &left_ref_bases, &left_query_bases, 0, left_cigar_size, left_consumed, true); + cigar_prefix_length(cigartuples, overhang + 1, &right_ref_bases, &right_query_bases, right_cigar_size, n_cigar, right_consumed, false); + + char *query = get_query_seq(seqi, query_pos - left_query_bases, query_pos + right_query_bases); + char *ref = get_ref_seq(reference, variant->position - left_ref_bases - ref_start, variant->position + right_ref_bases - ref_start); + + size_t alt_length = left_ref_bases + right_ref_bases + 1; + char *alt = malloc(alt_length); + strcpy(alt, ref); + alt[left_ref_bases] = variant->alt_base; + + size_t distance_ref = levenshtein(query, ref); + size_t distance_alt = levenshtein(query, alt); + + int allele = 0; + if (distance_ref < distance_alt) + { + allele = 1; + } + else if (distance_ref > distance_alt) + { + allele = 2; + } + + free(query); + free(ref); + free(alt); + + return allele; +} + +int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_t ref_start) +{ + + size_t n = variants_info->variant_num; + size_t query_pos = 0; + size_t v_position = 0; + Variant **variants = variants_info->variants; + uint8_t *seqi = read->seqi; + uint32_t *cigartuples = read->cigartuples; + size_t n_cigar = read->n_cigar; + size_t j = variants_info->variant_current_pos; + size_t ref_pos = read->read_start; + khash_t(KH_INT_COUNTER) *haplotype_cost = kh_init(KH_INT_COUNTER); + int allele = 0; + + while (j < n && variants[j]->position < ref_pos) + j += 1; + + for (size_t i = 0; i < n_cigar; i++) + { + size_t cigar_op = bam_cigar_op(cigartuples[i]); + size_t length = bam_cigar_oplen(cigartuples[i]); + if (j < n) + v_position = variants[j]->position; + + if (cigar_op == BAM_CMATCH || 
cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF) + { // XM= + while (j < n && v_position < ref_pos + length) + { + allele = realign_read(variants[j], read, i, v_position - ref_pos, query_pos + v_position - ref_pos, ref_seq, ref_start); + update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost); + j++; + if (j < n) + v_position = variants[j]->position; + } + query_pos += length; + ref_pos += length; + } + else if (cigar_op == BAM_CINS) + { // I + if (j < n && v_position == ref_pos) + { + allele = realign_read(variants[j], read, i, 0, query_pos, ref_seq, ref_start); + update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost); + j++; + if (j < n) + v_position = variants[j]->position; + } + query_pos += length; + } + else if (cigar_op == BAM_CDEL) + { + while (j < n && v_position < ref_pos + length) + { + allele = realign_read(variants[j], read, i, v_position - ref_pos, query_pos, ref_seq, ref_start); + update_haplotype_cost(allele, variants[j]->phase_set, variants[j]->genotype, haplotype_cost); + j++; + if (j < n) + v_position = variants[j]->position; + } + ref_pos += length; + } + else if (cigar_op == BAM_CREF_SKIP) + { + while (j < n && v_position < ref_pos + length) + { + j++; + if (j < n) + v_position = variants[j]->position; + } + ref_pos += length; + } + else if (cigar_op == BAM_CSOFT_CLIP) + { + query_pos += length; + } + } + + read->read_end = ref_pos; + + size_t counter_size = 0; + int max_value = 0; + int min_value = 0; + for (khiter_t k = kh_begin(haplotype_cost); k != kh_end(haplotype_cost); ++k) + { + if (kh_exist(haplotype_cost, k)) + { + int val = kh_val(haplotype_cost, k); + max_value = max(max_value, val); + min_value = min(min_value, val); + counter_size++; + } + } + + kh_int_counter_destroy(haplotype_cost); + + if (counter_size == 0 || (max_value == 0 && min_value == 0)) + { + return HAP_UNPHASED; + } + else if (max_value > abs(min_value)) + { + return HAP_1; + } + else + { + 
/*
 * Count the consecutive entries of `flanking_candidates`, starting at
 * `candidate_current_index`, whose position lies in [read_start, read_end).
 * Counting stops at the first entry outside that interval.
 */
size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t candidate_current_index, size_t flanking_candidates_num, size_t *flanking_candidates)
{
    size_t cursor = candidate_current_index;

    while (cursor < flanking_candidates_num)
    {
        const size_t position = flanking_candidates[cursor];

        if (position < read_start || position >= read_end)
            break;
        cursor++;
    }

    return cursor - candidate_current_index;
}
.variant_current_pos = variant_current_pos}; + + // dict to store all candidates index + khash_t(KH_INT_COUNTER) *candidates_p = kh_init(KH_INT_COUNTER); + // dict to store all flanking candidate index + khash_t(KH_INT_COUNTER) *flanking_candidates_p = kh_init(KH_INT_COUNTER); + // dict to count all read name + khash_t(KH_COUNTER) *read_name_set = kh_init(KH_COUNTER); + // allocate a position alternative information struct for each candidate + Pos_alt_info *pos_alt_info = calloc(candidate_num, sizeof(Pos_alt_info)); + // a kvec vector to store all read struct + kvec_t(Read) read_array; + kv_init(read_array); + + for (size_t i = 0; i < candidate_num; i++) + { + size_t candidate = candidates[i]; + // each candidate is a new key + kh_int_counter_add(candidates_p, candidate, i); + pos_alt_info[i].ins_counter = kh_init(KH_COUNTER); + pos_alt_info[i].del_counter = kh_init(KH_INT_COUNTER); + pos_alt_info[i].depth = 0; + for (size_t j = 0; j < 4; j++) + pos_alt_info[i].acgt_count[j] = 0; + + for (size_t j = 0; j < no_of_positions; j++) + { + size_t key = candidate - flanking_base_num + j; + if (kh_int_counter_val(flanking_candidates_p, key) == -1) + { + kh_int_counter_add(flanking_candidates_p, key, flanking_candidates_num++); + } + } + } + + size_t flanking_candidates[flanking_candidates_num]; + for (khiter_t k = kh_begin(flanking_candidates_p); k != kh_end(flanking_candidates_p); ++k) + { + if (kh_exist(flanking_candidates_p, k)) + { + size_t key = kh_key(flanking_candidates_p, k); + int val = kh_val(flanking_candidates_p, k); + flanking_candidates[val] = key; + } + } + + while (sam_itr_next(hts_file, iter, alignment) >= 0) + { + int flag = alignment->core.flag; + + if (flag & SAMTOOLS_VIEW_FILTER_FLAG) + continue; + + if (alignment->core.qual < min_mq) + { + continue; + } + + const char *q_name = bam_get_qname(alignment); + //skip the duplicated read name + int ret = 0; + khiter_t k = kh_put(KH_COUNTER, read_name_set, q_name, &ret); + if (ret == 1) + { + 
kh_key(read_name_set, k) = strdup(q_name); + kh_value(read_name_set, k) = 1; + } + else if (ret == 0) + { + continue; + } + + bool is_fwd_strand = (flag & 16) == 16; + int32_t pos = alignment->core.pos; + uint32_t l_qseq = alignment->core.l_qseq; + uint32_t *cigartuples = bam_get_cigar(alignment); + uint8_t *seqi = bam_get_seq(alignment); + uint8_t *qual = bam_get_qual(alignment); + size_t n_cigar = alignment->core.n_cigar; + + Read read = { + .mq = normalize_mq(alignment->core.qual), + .read_start = pos, + .cigartuples = cigartuples, + .seqi = seqi, + .qual = qual, + .strand = normalize_strand(is_fwd_strand), + .n_cigar = n_cigar, + .l_qseq = l_qseq, + .pos_info = NULL, + .haplotype = HAP_UNPHASED}; + + while (variant_current_pos < variant_num && variants[variant_current_pos]->position < pos) + variant_current_pos++; + variants_info.variant_current_pos = variant_current_pos; + + while (candidate_current_index < flanking_candidates_num && flanking_candidates[candidate_current_index] < pos) + candidate_current_index++; + + read.read_end = get_read_end(cigartuples, n_cigar, read.read_start); + + // get the overlap candidates number and skip the alignment if no flanking candidate overlapped + size_t overlap_candidates_num = get_overlap_candidate_num(pos, read.read_end, candidate_current_index, flanking_candidates_num, &flanking_candidates); + read.overlap_candidates_num = overlap_candidates_num; + if (read.overlap_candidates_num == 0) + { + read_no_overlap_num++; + continue; + } + + // haplotag the read following whatshap haplotagging logic + if (need_haplotagging && alignment->core.qual >= min_haplotag_mq) + { + read.haplotype = haplotag_read(&variants_info, &read, ref_seq, ref_start); + } + + pos_info = calloc(overlap_candidates_num, sizeof(Pos_info)); + for (size_t i = 0; i < overlap_candidates_num; i++) + { + pos_info[i].ins_bases = NULL; + pos_info[i].ins_length = 0; + pos_info[i].alt_base = 0; + pos_info[i].del_length = 0; + pos_info[i].bq = 0; + } + + // index 
of current first overlapped flanking candidate + size_t flanking_start = kh_int_counter_val(flanking_candidates_p, flanking_candidates[candidate_current_index]); + read.flanking_start = flanking_start; + + // store all overlapped flanking candidates information and put all centered candidate information + // into pos_alt_info struct + size_t ref_pos = read.read_start; + size_t query_pos = 0; + for (size_t i = 0; i < n_cigar; i++) + { + size_t cigar_op = bam_cigar_op(cigartuples[i]); + size_t length = bam_cigar_oplen(cigartuples[i]); + if (cigar_op == BAM_CMATCH || cigar_op == BAM_CEQUAL || cigar_op == BAM_CDIFF) + { + for (size_t p = ref_pos; p < ref_pos + length; p++) + { + int flanking_index = kh_int_counter_val(flanking_candidates_p, p); + if (flanking_index != -1) + { + size_t offset = flanking_index - flanking_start; + pos_info[offset].alt_base = bam_seqi(seqi, query_pos); + pos_info[offset].bq = normalize_bq(qual[query_pos]); + + int center_pos_index = kh_int_counter_val(candidates_p, p); + if (center_pos_index != -1) + { + char alt_base = seq_nt16_str[pos_info[offset].alt_base]; + pos_alt_info[center_pos_index].acgt_count[acgt2num[alt_base - 'A']]++; + pos_alt_info[center_pos_index].depth++; + } + } + query_pos++; + } + ref_pos += length; + } + else if (cigar_op == BAM_CDEL) + { + + int flanking_index = kh_int_counter_val(flanking_candidates_p, ref_pos - 1); + if (flanking_index != -1) + { + size_t offset = flanking_index - flanking_start; + pos_info[offset].del_length = length; + int center_pos_index = kh_int_counter_val(candidates_p, ref_pos - 1); + if (center_pos_index != -1) + { + kh_int_counter_add(pos_alt_info[center_pos_index].del_counter, length, 1); + } + } + for (size_t p = ref_pos; p < ref_pos + length; p++) + { + int flanking_index = kh_int_counter_val(flanking_candidates_p, p); + if (flanking_index != -1) + { + size_t offset = flanking_index - flanking_start; + pos_info[offset].alt_base = -1; + int center_pos_index = 
kh_int_counter_val(candidates_p, p); + if (center_pos_index != -1) + { + pos_alt_info[center_pos_index].depth++; + } + } + } + ref_pos += length; + } + else if (cigar_op == BAM_CINS) + { + int flanking_index = kh_int_counter_val(flanking_candidates_p, ref_pos - 1); + if (flanking_index != -1) + { + size_t offset = flanking_index - flanking_start; + pos_info[offset].ins_bases = calloc(length + 1, sizeof(char)); + for (size_t ins_idx = 0; ins_idx < length; ins_idx++) + { + pos_info[offset].ins_bases[ins_idx] = seq_nt16_str[bam_seqi(read.seqi, query_pos + ins_idx)]; + } + pos_info[offset].ins_bases[length] = '\0'; + pos_info[offset].ins_length = length; + + int center_pos_index = kh_int_counter_val(candidates_p, ref_pos - 1); + if (center_pos_index != -1) + { + kh_counter_add(pos_alt_info[center_pos_index].ins_counter, pos_info[offset].ins_bases, 1); + } + } + query_pos += length; + } + else if (cigar_op == BAM_CREF_SKIP) + { + ref_pos += length; + } + else if (cigar_op == BAM_CSOFT_CLIP) + { + query_pos += length; + } + } + + //update the read array + read.pos_info = pos_info; + reads_num++; + kv_push(Read, read_array, read); + } + + // allocate memory of the input matrix of all candidates + int8_t *matrix = calloc(candidate_num * matrix_depth * no_of_positions * channel_size, sizeof(int8_t)); + + HAP read_hap_array[reads_num]; + int matrix_read_index_array[matrix_depth]; + Alt_info *alt_info = malloc(matrix_depth * sizeof(Alt_info)); + + char **alt_info_p = calloc(candidate_num, sizeof(char*)); + fa_data data = calloc(1, sizeof(_fa_data)); + + // loop each candiate and generate full-alignment input matrix + for (size_t i = 0; i < candidate_num; i++) + { + size_t candidate = candidates[i]; + size_t start_pos = candidate - flanking_base_num; + size_t end_pos = candidate + flanking_base_num + 1; + size_t candidate_depth = 0; + for (size_t j = 0; j < matrix_depth; j++) + { + alt_info[j].ins_bases = NULL; + alt_info[j].alt_base = '\0'; + alt_info[j].del_length = 0; + 
alt_info[j].has_alt_info = false; + } + + for (size_t j = 0; j < matrix_depth; j++) + matrix_read_index_array[j] = -1; + + size_t overlap_read_num = 0; + for (size_t j = 0; j < reads_num; j++) + { + Read read = kv_A(read_array, j); + if (read.read_start >= end_pos) + break; + if (read.read_end <= start_pos) + continue; + read_hap_array[overlap_read_num].read_index = j; + read_hap_array[overlap_read_num++].haplotype = read.haplotype; + } + + sort_read_name_by_haplotype(&read_hap_array, &matrix_read_index_array, overlap_read_num); + + // loop each overlapped read of a candidate + for (size_t d = 0; d < matrix_depth; d++) + { + int read_index = matrix_read_index_array[d]; + if (read_index == -1) + continue; + Read read = kv_A(read_array, read_index); + int8_t hap_v = normalize_hap(read.haplotype); + int8_t strand_v = read.strand; + int8_t mq_v = read.mq; + + // loop all flanking position of a read + for (size_t p = 0; p < no_of_positions; p++) + { + size_t cp = p + start_pos; + size_t flanking_index = kh_int_counter_val(flanking_candidates_p, cp); + int32_t offset = flanking_index - read.flanking_start; + bool is_center_pos = p == flanking_base_num; + + if (read.pos_info[offset].alt_base < 0) + { + if (is_center_pos) + candidate_depth++; + continue; + } + + if (offset < 0 || offset >= read.overlap_candidates_num) + continue; + + int8_t alt_v = 0; + char ref_base = toupper(ref_seq[cp - ref_start]); + int8_t ref_v = num2countbase_fa[ref_base - 'A']; + int8_t bq_v = read.pos_info[offset].bq; + + if (is_center_pos) + candidate_depth++; + size_t alt_int = read.pos_info[offset].alt_base; + char alt_base = seq_nt16_str[read.pos_info[offset].alt_base]; + if (read.pos_info[offset].ins_length > 0) + { + size_t ins_length = read.pos_info[offset].ins_length; + char *ins_bases = read.pos_info[offset].ins_bases; + int8_t ins_v = 0; + size_t max_ins_length = ins_length < no_of_positions - p ? 
ins_length : no_of_positions - p; + + for (size_t ins_idx = 0; ins_idx < ins_length; ins_idx++) + { + char ins_alt_base = ins_bases[ins_idx]; + if (ins_idx < max_ins_length && p < no_of_positions - 1) + { + ins_v = num2countbase_fa[ins_alt_base - 'A']; + matrix[i * offset_can + d * offset_row + (ins_idx + p) * offset_col + 6] = ins_v; + } + } + if (is_center_pos) + { + alt_info[d].alt_base = alt_base; + alt_info[d].ins_bases = ins_bases; + alt_info[d].has_alt_info = true; + } + alt_v = num2countbase_fa['I' - 'A']; + } + else if (read.pos_info[offset].del_length > 0) + { + if (is_center_pos) + { + alt_info[d].del_length = read.pos_info[offset].del_length; + alt_info[d].has_alt_info = true; + } + alt_v = num2countbase_fa['D' - 'A']; + } + else if (ref_base - alt_base != 0) + { + if (is_center_pos) + { + alt_info[d].alt_base = alt_base; + alt_info[d].has_alt_info = true; + } + alt_v = num2countbase_fa[alt_base - 'A']; + } + + // update the matrix + matrix[i * offset_can + d * offset_row + p * offset_col + 0] = ref_v; + matrix[i * offset_can + d * offset_row + p * offset_col + 1] = alt_v; + matrix[i * offset_can + d * offset_row + p * offset_col + 2] = strand_v; + matrix[i * offset_can + d * offset_row + p * offset_col + 3] = mq_v; + matrix[i * offset_can + d * offset_row + p * offset_col + 4] = bq_v; + matrix[i * offset_can + d * offset_row + p * offset_col + 7] = hap_v; + } + } + + // finish the candidate proportion channel; + candidate_depth = pos_alt_info[i].depth; + for (size_t j = 0; j < matrix_depth; j++) + { + int8_t af_v = 0; + if (alt_info[j].has_alt_info == false) + continue; + if (alt_info[j].ins_bases != NULL) + { + size_t count = kh_counter_val(pos_alt_info[i].ins_counter, alt_info[j].ins_bases); + if (count > 0) + af_v = normalize_af(count / (float)candidate_depth); + } + else if (alt_info[j].del_length > 0) + { + size_t count = kh_int_counter_val(pos_alt_info[i].del_counter, alt_info[j].del_length); + if (count > 0) + af_v = normalize_af(count / 
(float)candidate_depth); + } + else if (alt_info[j].alt_base != '\0') + { + size_t offset = alt_info[j].alt_base - 'A'; + af_v = normalize_af(pos_alt_info[i].acgt_count[acgt2num[offset]] / (float)candidate_depth); + } + + if (af_v > 0) + { + for (size_t p = 0; p < no_of_positions; p++) + { + if (matrix[i * offset_can + j * offset_row + p * offset_col + 0] != 0) + matrix[i * offset_can + j * offset_row + p * offset_col + 5] = af_v; + } + } + } + + // store the alternative information into string + size_t max_alt_length = 64; + char *alt_info_str = calloc(max_alt_length, sizeof(char)); + char center_ref_base = toupper(ref_seq[candidate - ref_start]); + + sprintf(alt_info_str, "%i-%i-%c-", candidate + 1, candidate_depth, center_ref_base); + for (size_t j = 0; j < 4; j++) + { + if (j != acgt2num[center_ref_base - 'A'] && pos_alt_info[i].acgt_count[j] > 0) + sprintf(alt_info_str + strlen(alt_info_str), "X%c %i ", ACGT[j], pos_alt_info[i].acgt_count[j]); + } + for (khiter_t k = kh_begin(pos_alt_info[i].ins_counter); k != kh_end(pos_alt_info[i].ins_counter); k++) + { + if (kh_exist(pos_alt_info[i].ins_counter, k)) + { + char *key = kh_key(pos_alt_info[i].ins_counter, k); + int val = kh_val(pos_alt_info[i].ins_counter, k); + if (strlen(key) <= MAX_INDEL_LENGTH) + { + if (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length) + { + while (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length) + max_alt_length = max_alt_length << 1; + alt_info_str = realloc(alt_info_str, max_alt_length*sizeof(char)); + } + sprintf(alt_info_str + strlen(alt_info_str), "I%c%s %i ", center_ref_base, key, val); + } + } + } + + for (khiter_t k = kh_begin(pos_alt_info[i].del_counter); k != kh_end(pos_alt_info[i].del_counter); k++) + { + if (kh_exist(pos_alt_info[i].del_counter, k)) + { + int key = kh_key(pos_alt_info[i].del_counter, k); + int val = kh_val(pos_alt_info[i].del_counter, k); + if (key <= MAX_INDEL_LENGTH) + { + if (strlen(alt_info_str) + key + 32 >= max_alt_length) + { + while 
(strlen(alt_info_str) + key + 32 >= max_alt_length) + max_alt_length = max_alt_length << 1; + alt_info_str = realloc(alt_info_str, max_alt_length*sizeof(char)); + } + sprintf(alt_info_str + strlen(alt_info_str), "D%.*s %i ", key, ref_seq + candidate - ref_start + 1, val); + } + } + } + + alt_info_p[i] = alt_info_str; + + } // end of candidate loop + + + data->matrix = matrix; + data->all_alt_info = alt_info_p; + data->candidates_num = candidate_num; + + // free all allocated memory + for (size_t j = 0; j < reads_num; j++) + { + Read read = kv_A(read_array, j); + for (size_t p = 0; p < read.overlap_candidates_num; p++) + { + if (read.pos_info[p].ins_bases != NULL) + free(read.pos_info[p].ins_bases); + } + free(read.pos_info); + } + + for (size_t j = 0; j < candidate_num; j++) + { + kh_counter_destroy(pos_alt_info[j].ins_counter); + kh_int_counter_destroy(pos_alt_info[j].del_counter); + } + + free(chr); + free(pos_alt_info); + free(alt_info); + kh_counter_destroy(read_name_set); + kh_int_counter_destroy(candidates_p); + kh_int_counter_destroy(flanking_candidates_p); + kv_destroy(read_array); + bam_destroy1(alignment); + hts_itr_destroy(iter); + fai_destroy(fai); + + return data; +} diff --git a/src/clair3_full_alignment.h b/src/clair3_full_alignment.h new file mode 100644 index 0000000..e7485fb --- /dev/null +++ b/src/clair3_full_alignment.h @@ -0,0 +1,257 @@ +#ifndef _CLAIR3_FULL_ALIGNMENT_H +#define _CLAIR3_FULL_ALIGNMENT_H + +#define HAP_UNPHASED 0 +#define HAP_1 1 +#define HAP_2 2 + +#define normalize_mq(x) ((int)(x < 60 ? 100 * x / 60.0 : 100)) +#define normalize_bq(x) ((int)(x < 40 ? 100 * x / 40.0 : 100)) +#define normalize_af(x) ((int)(x < 1.0 ? 100 * x : 100)) +#define normalize_strand(x) (x == true ? 
// haplotype channel value, indexed by HAP_UNPHASED / HAP_1 / HAP_2
static const int8_t HAP_TYPE[3] = {60, 30, 90};
#define normalize_hap(x) (HAP_TYPE[x])

// reference bases taken on each side of a variant when a read is realigned
static const size_t overhang = 10;
static const char *RN = "\0";
// reads below this mapping quality are not haplotagged
static const size_t min_haplotag_mq = 20;
// bases of reference fetched on each side of the requested region
static const size_t expand_reference_region = 2000000;
// flanking bases on each side of a candidate; no_of_positions = 2 * 16 + 1
static const size_t flanking_base_num = 16;
static const size_t no_of_positions = 33;
// feature channels per matrix cell
static const size_t channel_size = 8;
// rows (reads) per candidate matrix
static const size_t matrix_depth = 89;
static const size_t min_coverage = 2;
static const size_t min_bq = 0;
// alignments below this mapping quality are skipped
static const size_t min_mq = 5;
// alignments with any of these flag bits set are skipped
// (2316 = 4 + 8 + 256 + 2048)
static const size_t SAMTOOLS_VIEW_FILTER_FLAG = 2316;
static const size_t MAX_READ_COUNT = 1000;
// longer insertions and deletions are left out of the alt-info string
static const size_t MAX_INDEL_LENGTH = 50;
static const char ACGT[] = "ACGT";

// convert an upper-case letter (index = letter - 'A') to its channel value
// A: 100, C: 25, G: 75, T: 50, N: 100
// D (deletion): -100, I (insertion): -50
// every other letter: 0
static const int8_t num2countbase_fa[32] = {
    100, 0, 25, -100, 0, 0, 75, 0, // abcdefgh
    -50, 0, 0, 0, 0, 100, 0, 0,    // ijklmnop
    0, 0, 0, 50, 0, 0, 0, 0,       // qrstuvwx
    0, 0, 0, 0, 0, 0, 0, 0,        // yz
};

// convert A-Z character to 0-index offset
// ACGT: 0123
// non-ACGT: 0
static const int8_t acgt2num[32] = {
    0, 0, 1, 0, 0, 0, 2, 0, // abcdefgh
    0, 0, 0, 0, 0, 0, 0, 0, // ijklmnop
    0, 0, 0, 3, 0, 0, 0, 0, // qrstuvwx
    0, 0, 0, 0, 0, 0, 0, 0, // yz
};

/*! @typedef
 @abstract Structure for full-alignment data
 @field matrix          int array of (total candidate number * matrix depth * no of flanking position * feature channel)
 @field all_alt_info    one alternative information string per candidate, including all SNPs, insertions and deletions
 @field candidates_num  number of candidates, i.e. number of strings in all_alt_info
 */
typedef struct _fa_data
{

    int8_t *matrix;
    char **all_alt_info;
    size_t candidates_num;
} _fa_data;

typedef _fa_data *fa_data;

/*! @typedef
 @abstract Structure for matrix level alternative information
 @field ins_bases     the char string storing all insertion bases in current position of an alignment
 @field alt_base      alternative base other than reference base in query sequence
 @field del_length    deletion length in current position of an alignment
 @field has_alt_info  true if any of alternative information exists, false for reference base and deletion bases(#*)

 @ by default we only allocate a maximum `matrix depth` struct array and reset all fields in each candidate iteration, we
 need to calculate each read candidate proportion in given candidate overlapped region
 */
typedef struct Alt_info
{
    char *ins_bases;
    char alt_base;
    size_t del_length;
    bool has_alt_info;
} Alt_info;

/*! @typedef
 @abstract Structure pairing a read with its haplotype, used to order the rows of a candidate matrix
 @field read_index  index of the read in the read array
 @field haplotype   haplotype information of read, 0: unphased or not phasable 1|2: haplotype1|2
 */
typedef struct HAP
{
    size_t read_index;
    size_t haplotype;
} HAP;

/*! @typedef
 @abstract Structure of a phased heterozygous pileup SNP variant
 @field position   variant start position 0-index
 @field ref_base   reference base tag in VCF
 @field alt_base   alternative base tag in VCF
 @field genotype   phased heterozygous genotype, 0|1 : 1, 1|0: 2
 @field phase_set  phase set tag in VCF, which is acquired from whatshap or longphase

 @ in this release, we only store heterozygous SNP info
 */
typedef struct Variant
{
    int position;
    char ref_base;
    char alt_base;
    int genotype;
    int phase_set;
} Variant;

/*! @typedef
 @abstract Variant list handed to the haplotagging code
 @field variants             array of variant pointers
 @field variant_num          number of entries in variants
 @field variant_current_pos  index of the first variant to consider for the current read
 */
typedef struct Variants_info
{
    Variant **variants;
    size_t variant_num;
    size_t variant_current_pos;
} Variants_info;
@typedef + @abstract Structure for matrix level alternative information + @field ins_bases the char string storing all insertion bases in current position of an alignment + @field ins_length length the stored insertion bases + @field alt_base alternative base in htslib int format + @field del_length deletion length in current position of an alignment + @field bq phred quality score of given bases + + @ we use the htslib format int alt_base than char as we need to mark the '#*' into -1, for bq field, we only store + reference base and alternative base quality and skip the insertion quality as there are only one base quality channel + */ +typedef struct Pos_info +{ + char *ins_bases; + size_t ins_length; + int alt_base; + size_t del_length; + int8_t bq; +} Pos_info; + +/*! @typedef + @abstract Structure for the alignment information + @field read_start read start position of alignment, 0-index + @field q_name read name + @field read_end alignment read end compared with the reference sequence, CIGAR length sum of X=MDN + @field cigartuples alignment CIGAR int pointer from htslib bam_get_cigar function + @field qual base quality int pointer from htslib core alignment + @field mq normalized mapping quality value (0-100) + @field n_cigar number of CIGAR operations + @field l_qseq length of the read query sequence + @field haplotype haplotype information of read, 0: unphased or not phasable 1|2: haplotype1|2 + @field strand normalized strand value forward: 50 reverse: 100 + @field pos_info structure array of overlapped flanking candidates information + @field overlap_candidates_num number of overlapped flanking candidates between read start and read end, including flanking bases + @field flanking_start the first overlapped candidate index (0 index is the the first candidate - 16 by default) + + @note that the seqi and qual pointer information will be released after each htslib sam_itr_next iterator + */ +typedef struct Read +{ + size_t read_start; + char *q_name; + size_t 
read_end; + uint32_t *cigartuples; + uint8_t *seqi; + uint8_t *qual; + int8_t mq; + size_t n_cigar; + uint32_t l_qseq; + size_t haplotype; + int8_t strand; + Pos_info *pos_info; + size_t overlap_candidates_num; + size_t flanking_start; +} Read; + +/** Destroys a full-alignment data structure. + * + * @param data the full-alignment data object to cleanup. + * @returns void. + * + */ +void destroy_fa_data(fa_data data); + +/** Sort overlapped reads of a candidate based on hapltoype information and read start + * + * @param read_hap_array struct array of all overlap reads + * @param matrix_read_index_array the return reference of the read index array, -1 for padding + * @param n number of overlapped reads + * @returns void. + * + */ +void sort_read_name_by_haplotype(HAP *read_hap_array, int *matrix_read_index_array, size_t n); + +/** get all overlapped flanking candidates number and start position based on read start and read end + * + * @param read_start read start, 0-index + * @param read_end read end, 0-index + * @param candidate_current_index the first flanking candidate index >= read start + * @param flanking_candidates int array of all flanking candidates, sorted by start position + * @returns number of the total overlapped flanking candidates within read start and read end. 
+ * + */ +size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t candidate_current_index, size_t flanking_candidates_num, size_t *flanking_candidates); + +/** get the substring of a reference sequence based on start and end + * + * @param ref_seq a string store all reference sequence from ref_start(0-index) + * @param start sequence query start, 0-index + * @param end sequence query end, 0-index + * @returns string of the queried region of reference sequence + * + */ +char *get_ref_seq(char *ref_seq, size_t start, size_t end); + +/** get the substring of a query sequence based on start and end + * + * @param seqi a htslib format pointer stores all query sequence(0-index) + * @param start query start, 0-index + * @param end query end, 0-index + * @returns string of the queried sequence + * + */ +char *get_query_seq(uint8_t *seqi, size_t start, size_t end); + +/** C implement of whatshap hapltagging + * + */ +void cigar_prefix_length(uint32_t *cigartuples, size_t reference_bases, size_t *ref_bases, size_t *query_bases, size_t left_cigar_index, size_t right_cigar_index, size_t consumed, bool reverse); + +int realign_read(Variant *variant, Read *read, size_t i, size_t consumed, size_t query_pos, char *reference, size_t ref_start); + +int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_t ref_start); + +/** C implement of clair3-style full-alignment feature data and alternative information in a given region of a bam. 
+ * + * @param region 1-based region string + * @param bam_path input alignment file + * @param fasta_path input reference file + * @param variants C structure pointer of all phased heterozygous pileup SNP variants + * @param variant_num total variants number + * @param candidates int array of all low-quality pileup candidates need to process (0-index) + * @param candidate_num total candidates number + * @returns a full-alignment data pointer, including the data matrix and all candidates alternative information + * + * The return value can be freed with destroy_fa_data + * + */ +fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num); + +#endif diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c new file mode 100644 index 0000000..e3de48c --- /dev/null +++ b/src/clair3_pileup.c @@ -0,0 +1,462 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "htslib/sam.h" +#include "htslib/faidx.h" +#include "kvec.h" +#include "medaka_bamiter.h" +#include "medaka_common.h" +#include "clair3_pileup.h" +#include "medaka_khcounter.h" + +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) +#define bam1_seqi(s, i) (bam_seqi((s), (i))) +#define bam_nt16_rev_table seq_nt16_str +#define bam_nt16_table seq_nt16_table + + +size_t base2_index(char c) { + if (c == 'A') return 0; + else if (c == 'C') return 1; + else if (c == 'G') return 2; + else if (c == 'T') return 3; + else if (c == 'a') return 9; + else if (c == 'c') return 10; + else if (c == 'g') return 11; + else if (c == 't') return 12; + else return 0; +} + +/** Constructs a pileup data structure. + * + * @param n_cols number of pileup columns. + * @param buffer_cols number of pileup columns. + * @param feature_length length of feature vector. + * @param num_dtypes number of datatypes in pileup. 
+ * @param num_homop maximum homopolymer length to consider. + * @param fixed_size if not zero data matrix is allocated as fixed_size * n_cols, ignoring other arguments + * @see destroy_plp_data + * @returns a plp_data pointer. + * + * The return value can be freed with destroy_plp_data. + * + */ +plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_length, size_t num_dtypes, size_t num_homop, size_t fixed_size) { + assert(buffer_cols >= n_cols); + plp_data data = xalloc(1, sizeof(_plp_data), "plp_data"); + data->buffer_cols = buffer_cols; + data->num_dtypes = num_dtypes; + data->num_homop = num_homop; + data->n_cols = n_cols; + if (fixed_size != 0) { + assert(buffer_cols == n_cols); + data->matrix = xalloc(fixed_size * n_cols, sizeof(int), "matrix"); + } else { + data->matrix = xalloc(feature_length * num_dtypes * buffer_cols * num_homop, sizeof(size_t), "matrix"); + } + data->major = xalloc(buffer_cols, sizeof(size_t), "major"); + data->minor = xalloc(buffer_cols, sizeof(size_t), "minor"); + data->all_alt_info = NULL; + return data; +} + + +/** Enlarge the internal buffers of a pileup data structure. + * + * @param pileup a plp_data pointer. 
+ * @param buffer_cols number of pileup columns for which to allocate memory + * + */ +void enlarge_plp_data(plp_data pileup, size_t buffer_cols, size_t feature_length) { + assert(buffer_cols > pileup->buffer_cols); + size_t old_size = feature_length * pileup->num_dtypes * pileup->num_homop * pileup->buffer_cols; + size_t new_size = feature_length * pileup->num_dtypes * pileup->num_homop * buffer_cols; + + pileup->matrix = xrealloc(pileup->matrix, new_size * sizeof(size_t), "matrix"); + pileup->major = xrealloc(pileup->major, buffer_cols * sizeof(size_t), "major"); + pileup->minor = xrealloc(pileup->minor, buffer_cols * sizeof(size_t), "minor"); + // zero out new part of matrix + for (size_t i = old_size; i < new_size; ++i) { + pileup->matrix[i] = 0; + } + pileup->buffer_cols = buffer_cols; +} + + +/** Destroys a pileup data structure. + * + * @param data the object to cleanup. + * @returns void. + * + */ +void destroy_plp_data(plp_data data) { + free(data->matrix); + free(data->major); + free(data->minor); + for (size_t i = 0; i < data->candidates_num; i++) { + free(data->all_alt_info[i]); + } + free(data->all_alt_info); + free(data); +} + +/** Generates clair3-style pileup feature data in a region of a bam. + * + * @param region 1-based region string. + * @param bam_file input aligment file. + * @param tag_value by which to filter data. + * @param keep_missing alignments which do not have tag. + * @param weibull_summation use predefined bam tags to perform homopolymer partial counts. + * @returns a pileup data pointer. + * + * The return value can be freed with destroy_plp_data. + * + * If num_dtypes is 1, dtypes should be NULL; all reads in the bam will be + * treated equally. If num_dtypes is not 1, dtypes should be an array of + * strings, these strings being prefixes of query names of reads within the + * bam file. Any read not matching the prefixes will cause exit(1). + * + * If tag_name is not NULL alignments are filtered by the (integer) tag value. 
+ * When tag_name is given the behaviour for alignments without the tag is + * determined by keep_missing. + * + */ + +/** + * The pileup input is 594 integers – 33 genome positions wide with 18 features at each position – + * + * A+, C+, G+, T+, I_S+, I^1 S+, D_S+, D^1_S+, D_R+, A-, C-, G-, T-, I_S-, I^1_S-, D_S-, D^1_S-, and D_R- + * + * A, C, G, T, I, D, +, - means the count of read support of the four nucleotides: insertion, + * deletion, positive strand, and negative strand. Superscript “1” means only the indel with the + * highest read support is counted (i.e., all indels are counted if without “1“). Subscript “S”/“R” means + * the starting/non-starting position of an indel. For example, a 3bp deletion with the most reads support + * will have the first deleted base counted in either D1_S+ or D1_S-, and the second and third deleted bases + * counted in either D_R+ or D_R-. The design was determined experimentally, but the rationale is that for + * 1bp indels that are easy to call, look into the differences between the “S” counts, but reduce the + * quality if the “R” counts and discrepancy between positions increase. + * + */ +plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth) { + // extract `chr`:`start`-`end` from `region` + // (start is one-based and end-inclusive), + // hts_parse_reg below sets return value to point + // at ":", copy the input then set ":" to null terminator + // to get `chr`. + int start, end; + char *chr = xalloc(strlen(region) + 1, sizeof(char), "chr"); + strcpy(chr, region); + char *reg_chr = (char *) hts_parse_reg(chr, &start, &end); + // start and end now zero-based end exclusive + if (reg_chr) { + *reg_chr = '\0'; + } else { + fprintf(stderr, "Failed to parse region: '%s'.\n", region); + } + + // open bam etc. 
+ // this is all now deferred to the caller + htsFile *fp = bam_set->fp; + hts_idx_t *idx = bam_set->idx; + sam_hdr_t *hdr = bam_set->hdr; + // setup bam interator + + mplp_data *data = xalloc(1, sizeof(mplp_data), "pileup init data"); + data->fp = fp; data->hdr = hdr; data->iter = bam_itr_querys(idx, hdr, region); + data->min_mapQ = min_mq; + + bam_mplp_t mplp = bam_mplp_init(1, read_bam, (void **)& data); + bam_mplp_set_maxcnt(mplp, max_depth); + + const bam_pileup1_t **plp = xalloc(1, sizeof(bam_pileup1_t *), "pileup"); + int ret, pos, tid, n_plp; + + int n_cols = 0; + size_t buffer_cols = end - start; + plp_data pileup = create_plp_data(n_cols, buffer_cols, featlenclair3, 1, 1, 0); + + // get counts + size_t major_col = 0; // index into `pileup` corresponding to pos + n_cols = 0; // number of processed columns (including insertions, which clair3 doesn't have ;)) + + faidx_t* fai = fai_load(fasta_path); + int len = 0; + char *ref_seq = NULL; +// printf("pos: %s %i %i\n", chr, start, end); + size_t ref_start = max(0, start - mpileup_expand_reference_region); + size_t ref_end = max(0, end + mpileup_expand_reference_region); + ref_seq = faidx_fetch_seq(fai, chr, ref_start, ref_end, &len); + + size_t candidates_num = 0; + size_t alt_info_p_size = 512; + char ** alt_info_p = xalloc(alt_info_p_size, sizeof(char*), "alt_info_p"); + for (size_t i = 0; i < alt_info_p_size; i++) + alt_info_p[i] = NULL; + + size_t pre_pos = 0; + size_t contiguous_flanking_num = 0; + while ((ret=bam_mplp_auto(mplp, &tid, &pos, &n_plp, plp) > 0)) { + + size_t depth = 0; + size_t alt_count = 0; + size_t ref_count = 0; + size_t del_count = 0; + size_t ins_count = 0; + + bool pass_af = false; + bool pass_snp_af = false; + bool pass_indel_af = false; + + const char *c_name = data->hdr->target_name[tid]; + if (strcmp(c_name, chr) != 0) continue; + if (pos < start) continue; + if (pos >= end) break; + n_cols++; + + + if (pre_pos + 1 != pos || pre_pos == 0) + contiguous_flanking_num = 0; + else + 
contiguous_flanking_num++; + pre_pos = pos; + + //update the deletion buffer in each interation + size_t del_buf_size = 32; + size_t* dels_f = xalloc(del_buf_size, sizeof(size_t), "dels_f"); + size_t* dels_r = xalloc(del_buf_size, sizeof(size_t), "dels_r"); + + memset(dels_f, 0, del_buf_size * sizeof(size_t)); + memset(dels_r, 0, del_buf_size * sizeof(size_t)); + + // we still need this as positions might not be contiguous + pileup->major[major_col / featlenclair3] = pos; + pileup->minor[major_col / featlenclair3] = 0; + + // counters for insertion strings + khash_t(KH_COUNTER) *ins_counts_f = kh_init(KH_COUNTER); + khash_t(KH_COUNTER) *ins_counts_r = kh_init(KH_COUNTER); + khash_t(KH_COUNTER) *ins_counts_all = kh_init(KH_COUNTER); + // loop through all reads at this position + for (int i = 0; i < n_plp; ++i) { + const bam_pileup1_t *p = plp[0] + i; + if (p->is_refskip) continue; + + if (p->indel < 0) { + // there's a deletion starting on next genomic position, + // record the length here and finalise after the read loop + // - actually deleted bases get recorded in next block + size_t d = (size_t) -1 * p->indel; + + if (d >= del_buf_size) { + size_t new_size = max(d, 2 * del_buf_size); + dels_f = xrealloc(dels_f, new_size*sizeof(size_t), "dels_f"); + memset(dels_f+del_buf_size, 0, (new_size-del_buf_size) * sizeof(size_t)); + dels_r = xrealloc(dels_r, new_size*sizeof(size_t), "dels_r"); + memset(dels_r+del_buf_size, 0, (new_size-del_buf_size) * sizeof(size_t)); + del_buf_size = new_size; + } + if (bam_is_rev(p->b)) { + dels_r[d - 1] += 1; + } else { + dels_f[d - 1] += 1; + } + } + + // handle ref_base/sub/del + int base_i; + if (p->is_del) { + // we've been deleted, +1 to DR + base_i = bam_is_rev(p->b) ? 
c3_rev_del : c3_fwd_del; + depth++; + } else { + // just a base + int base_j = bam1_seqi(bam1_seq(p->b), p->qpos); + if bam_is_rev(p->b) { base_j += 16; } + base_i = num2countbaseclair3[base_j]; + depth++; + } + pileup->matrix[major_col + base_i] += 1; + + // handle insertion + // - build insert string then hash + if (p->indel > 0) { + size_t first = p->is_del ? 0 : 1; + char* indel = (char*) xalloc(p->indel + 1, sizeof(char), "indel"); + for (size_t i = 0, j = first; j < p->indel + first; ++i, ++j) { + indel[i] = seq_nt16_str[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + } + indel[p->indel] = '\0'; + if (bam_is_rev(p->b)) { + kh_counter_increment(ins_counts_r, indel); + } else { + kh_counter_increment(ins_counts_f, indel); + } + kh_counter_increment(ins_counts_all, indel); + free(indel); + } + } + + // finalise deletions: DS (all) and D1S (best) + // + // forward + size_t best_count = 0; + size_t all_count = 0; + for (size_t i = 0; i < del_buf_size; ++i) { + size_t d = dels_f[i]; + all_count += d; + best_count = max(best_count, d); + } + pileup->matrix[major_col + c3_fwd_del_all] = all_count; + pileup->matrix[major_col + c3_fwd_del_best] = best_count; + del_count += all_count; + // reverse + best_count = 0; + all_count = 0; + for (size_t i = 0; i < del_buf_size; ++i) { + size_t d = dels_r[i]; + all_count += d; + best_count = max(best_count, d); + } + pileup->matrix[major_col + c3_rev_del_all] = all_count; + pileup->matrix[major_col + c3_rev_del_best] = best_count; + del_count += all_count; + + // finalise IS and I1S + // forward + kh_counter_stats_t stats = kh_counter_stats(ins_counts_f); + pileup->matrix[major_col + c3_fwd_ins_all] = stats.sum; + pileup->matrix[major_col + c3_fwd_ins_best] = stats.max; + ins_count += stats.sum; + + kh_counter_destroy(ins_counts_f); + // reverse + stats = kh_counter_stats(ins_counts_r); + pileup->matrix[major_col + c3_rev_ins_all] = stats.sum; + pileup->matrix[major_col + c3_rev_ins_best] = stats.max; + ins_count += stats.sum; + + 
kh_counter_destroy(ins_counts_r); + int offset = pos - ref_start; + char ref_base = toupper(ref_seq[offset]); + int ref_offset_forward = base2_index(ref_base); + int ref_offset_reverse = ref_offset_forward + reverse_pos_start; + char major_alt_base = '\0'; + size_t forward_sum = 0; + size_t reverse_sum = 0; + for (size_t i = 0; i < 4; i++) { + forward_sum += pileup->matrix[major_col + i]; + reverse_sum += pileup->matrix[major_col + i + reverse_pos_start]; + if (i == ref_offset_forward) { + ref_count = pileup->matrix[major_col + i] + pileup->matrix[major_col + i + reverse_pos_start]; + } else { + size_t current_count = pileup->matrix[major_col + i] + pileup->matrix[major_col + i + reverse_pos_start]; + if (current_count > alt_count) { + alt_count = current_count; + major_alt_base = plp_bases_clair3[i]; + } + } + } + + pileup->matrix[major_col + ref_offset_forward] = -1 * forward_sum; + pileup->matrix[major_col + ref_offset_reverse] = -1 * reverse_sum; + + // calculate candidate allele frequency and apply filtering + depth = max(1, depth); + bool pass_min_depth = depth >= min_depth; + bool pass_ref_base_in_acgt = ref_base == 'A' || ref_base == 'C' || ref_base == 'G' || ref_base == 'T'; + bool non_ref_base_majority = ref_count < alt_count || ref_count < ins_count || ref_count < del_count; + bool ref_alt_equal_majority = (ref_count > 0 && ref_count == alt_count && ref_base - major_alt_base < 0); + if (call_snp_only == true) { + pass_af = alt_count / (float)depth >= min_snp_af; + } else { + pass_af = non_ref_base_majority || ref_alt_equal_majority || (alt_count / (float)depth >= min_snp_af); + pass_af = pass_af || (del_count / (float)depth >= min_indel_af) || (ins_count / (float)depth >= min_indel_af); + } + + pass_af = pass_af && pass_min_depth && pass_ref_base_in_acgt; + pass_af = pass_af && (contiguous_flanking_num >= pileup_flanking_base_num); + // move to next position + if (pass_af) { + + if (candidates_num + 1 >= alt_info_p_size) { + alt_info_p_size = 
alt_info_p_size << 1; + alt_info_p = xrealloc(alt_info_p, alt_info_p_size * sizeof(char*), "alt_info_p"); + } + + size_t max_alt_length = 64; + char *alt_info_str = xalloc(max_alt_length, sizeof(char), "alt_info_str"); + + sprintf(alt_info_str, "%i-%i-%c-", pos+1, depth, ref_base); + //snp + for (size_t i = 0; i < 4; i++) { + forward_sum += pileup->matrix[major_col + i]; + reverse_sum += pileup->matrix[major_col + i + reverse_pos_start]; + size_t alt_sum = pileup->matrix[major_col + i] + pileup->matrix[major_col + i + reverse_pos_start]; + + if (alt_sum > 0 && i != ref_offset_forward) + sprintf(alt_info_str + strlen(alt_info_str), "X%c %i ", plp_bases_clair3[i], alt_sum); + } + //del + for (size_t i = 0; i < del_buf_size; i++) { + size_t d = dels_f[i] + dels_r[i]; + if (d > 0 && i+1 <= max_indel_length) { + // 32 bytes is a safe number for integer to string + if (strlen(alt_info_str) + i + 32 >= max_alt_length) { + while (strlen(alt_info_str) + i + 32 >= max_alt_length) + max_alt_length = max_alt_length << 1; + alt_info_str = xrealloc(alt_info_str, max_alt_length*sizeof(char), "alt_info_str"); + } + sprintf(alt_info_str + strlen(alt_info_str), "D%.*s %i ", i+1,ref_seq+offset+1, d); + } + + } +// //ins + for (khiter_t k = kh_begin(ins_counts_all); k != kh_end(ins_counts_all); ++k) { + if (kh_exist(ins_counts_all, k)) { + const char *key = kh_key(ins_counts_all, k); + size_t val = kh_val(ins_counts_all, k); + if (strlen(key) <= max_indel_length) { + if (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length) { + while (strlen(alt_info_str) + strlen(key) + 32 >= max_alt_length) + max_alt_length = max_alt_length << 1; + alt_info_str = xrealloc(alt_info_str, max_alt_length *sizeof(char), "alt_info_str"); + } + sprintf(alt_info_str + strlen(alt_info_str), "I%c%s %i ", ref_base, key, val); + } + } + } + // update the alternative information for current candidates here + alt_info_p[candidates_num++] = alt_info_str; + } + + free(dels_f); + free(dels_r); + 
kh_counter_destroy(ins_counts_all); + major_col += featlenclair3; + } + + + pileup->all_alt_info = alt_info_p; + pileup->candidates_num = candidates_num; + pileup->n_cols = n_cols; + + bam_itr_destroy(data->iter); + bam_mplp_destroy(mplp); + fai_destroy(fai); + free(data); + free(plp); + free(chr); + + return pileup; +} + +int main() +{ + return 0; +} diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h new file mode 100644 index 0000000..5cf9283 --- /dev/null +++ b/src/clair3_pileup.h @@ -0,0 +1,105 @@ +#ifndef _CLAIR3_PILEUP_H +#define _CLAIR3_PILEUP_H + +// medaka-style feature data +typedef struct _plp_data { + size_t buffer_cols; + size_t num_dtypes; + size_t num_homop; + size_t n_cols; + size_t *matrix; + size_t *major; + size_t *minor; + char **all_alt_info; + size_t candidates_num; +} _plp_data; +typedef _plp_data *plp_data; + + +// convert 16bit IUPAC (+16 for strand) to plp_bases index +// { +// , A, C, , G, , , , +// T, , , , , , , , +// , a, c, , g, , , , +// t, , , , , , , , +// } +static const int num2countbase[32] = { + -1, 4, 5, -1, 6, -1, -1, -1, + 7, -1, -1, -1, -1, -1, -1, -1, + -1, 0, 1, -1, 2, -1, -1, -1, + 3, -1, -1, -1, -1, -1, -1, -1, +}; + + +// convert 16bit IUPAC (+16 for strand) to plp_bases clair3 index +// first i: all insertions +// second i: most common insertion +// first d: all first base deletion (actually a reference base) +// second d: most common deletion (actually a reference base) +// third d: non-first base deletion (the deleted bases) +static const char plp_bases_clair3[] = "ACGTIIDDDacgtiiddd"; +static const size_t featlenclair3 = 18; // len of the above +static const size_t c3_fwd_ins_all = 4; +static const size_t c3_fwd_ins_best = 5; +static const size_t c3_fwd_del_all = 6; // (preceding ref position) +static const size_t c3_fwd_del_best = 7; // (preceding ref position) +static const size_t c3_fwd_del = 8; // (actually deleted base) +static const size_t c3_rev_ins_all = 13; +static const size_t c3_rev_ins_best = 14; 
static const size_t c3_rev_del_all = 15; // (preceding ref position)
static const size_t c3_rev_del_best = 16; // (preceding ref position)
static const size_t c3_rev_del = 17; // (actually deleted base)
static const size_t reverse_pos_start = 9; // position of reverse position start
// number of reference bases fetched on each side of the region in calculate_clair3_pileup
static const size_t mpileup_expand_reference_region = 1000;
// a column only becomes a candidate once at least this many directly preceding columns are gap free
static const size_t pileup_flanking_base_num = 16;

// same layout as num2countbase above (htslib 4-bit base code, +16 for reverse
// strand reads) but yielding the clair3 feature column: A,C,G,T -> 0-3 and
// a,c,g,t -> 9-12; every other code is -1 and has no column
static const int num2countbaseclair3[32] = {
    -1, 0, 1, -1, 2, -1, -1, -1,
    3, -1, -1, -1, -1, -1, -1, -1,
    -1, 9, 10, -1, 11, -1, -1, -1,
    12, -1, -1, -1, -1, -1, -1, -1,
};


/** Constructs a pileup data structure.
 *
 * @param n_cols number of pileup columns.
 * @param buffer_cols number of pileup columns to allocate, must not be smaller than n_cols.
 * @param feature_length length of the feature vector of one column.
 * @param num_dtypes number of datatypes in pileup.
 * @param num_homop maximum homopolymer length to consider.
 * @param fixed_size if not zero data matrix is allocated as fixed_size * n_cols, ignoring other arguments
 * @see destroy_plp_data
 * @returns a plp_data pointer.
 *
 * The return value can be freed with destroy_plp_data.
 *
 */
plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_length, size_t num_dtypes, size_t num_homop, size_t fixed_size);


/** Destroys a pileup data structure.
 *
 * Frees the matrix, the major / minor arrays, every alternative information
 * string and the structure itself.
 *
 * @param data the object to cleanup.
 * @returns void.
 *
 */
void destroy_plp_data(plp_data data);

/** C implement of clair3-style pileup feature data and alternative information in a given region of a bam.
 *
 * @param region 1-based region string
 * @param bam_set bam handler of input bam
 * @param fasta_path input reference file
 * @param min_depth minimum coverage required to call a variant
 * @param min_snp_af minimum snp allele frequency for a site to be considered as a candidate site
 * @param min_indel_af minimum indel allele frequency for a site to be considered as a candidate site
 * @param min_mq minimum mapping quality for read to use for calling
 * @param max_indel_length maximum indel length to format into alternative string stream
 * @param call_snp_only if true, only the snp allele frequency test selects candidate sites
 * @param max_depth maximum read depth per position, handed to the htslib pileup engine (bam_mplp_set_maxcnt)
 * @returns a pileup data pointer, including the data matrix and all candidates alternative information
 *
 * The return value can be freed with destroy_plp_data
 *
 */
plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth);

#endif
diff --git a/src/khash.h b/src/khash.h
new file mode 100644
index 0000000..f75f347
--- /dev/null
+++ b/src/khash.h
@@ -0,0 +1,627 @@
/* The MIT License

   Copyright (c) 2008, 2009, 2011 by Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 
+ + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || 
!__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the 
existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x 
= i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! 
@function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! 
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! 
@function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*! 
@function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/src/kvec.h b/src/kvec.h new file mode 100644 index 0000000..676be8b --- /dev/null +++ b/src/kvec.h @@ -0,0 +1,90 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. 
+ +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) (((v).n == (v).m)? \ + ((v).m = ((v).m? (v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), ((v).a + ((v).n++)) + +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ + : 0), (v).a[(i)]) + +#endif diff --git a/src/levenshtein.c b/src/levenshtein.c new file mode 100644 index 0000000..76490df --- /dev/null +++ b/src/levenshtein.c @@ -0,0 +1,72 @@ +// `levenshtein.c` - levenshtein +// MIT licensed. +// Copyright (c) 2015 Titus Wormer + +#include +#include +#include +#include "levenshtein.h" + +// Returns a size_t, depicting the difference between `a` and `b`. +// See for more information. +size_t +levenshtein_n(const char *a, const size_t length, const char *b, const size_t bLength) { + // Shortcut optimizations / degenerate cases. 
+ if (a == b) { + return 0; + } + + if (length == 0) { + return bLength; + } + + if (bLength == 0) { + return length; + } + + size_t *cache = calloc(length, sizeof(size_t)); + size_t index = 0; + size_t bIndex = 0; + size_t distance; + size_t bDistance; + size_t result; + char code; + + // initialize the vector. + while (index < length) { + cache[index] = index + 1; + index++; + } + + // Loop. + while (bIndex < bLength) { + code = b[bIndex]; + result = distance = bIndex++; + index = SIZE_MAX; + + while (++index < length) { + bDistance = code == a[index] ? distance : distance + 1; + distance = cache[index]; + + cache[index] = result = distance > result + ? bDistance > result + ? result + 1 + : bDistance + : bDistance > distance + ? distance + 1 + : bDistance; + } + } + + free(cache); + + return result; +} + +size_t +levenshtein(const char *a, const char *b) { + const size_t length = strlen(a); + const size_t bLength = strlen(b); + + return levenshtein_n(a, length, b, bLength); +} diff --git a/src/levenshtein.h b/src/levenshtein.h new file mode 100644 index 0000000..111a5a5 --- /dev/null +++ b/src/levenshtein.h @@ -0,0 +1,17 @@ +#ifndef LEVENSHTEIN_H +#define LEVENSHTEIN_H + +// `levenshtein.h` - levenshtein +// MIT licensed. +// Copyright (c) 2015 Titus Wormer + +// Returns a size_t, depicting the difference between `a` and `b`. +// See for more information. 
+ +size_t +levenshtein(const char *a, const char *b); + +size_t +levenshtein_n (const char *a, const size_t length, const char *b, const size_t bLength); + +#endif // LEVENSHTEIN_H diff --git a/src/medaka_bamiter.c b/src/medaka_bamiter.c new file mode 100644 index 0000000..a625e14 --- /dev/null +++ b/src/medaka_bamiter.c @@ -0,0 +1,72 @@ +#include +#include + +#include "medaka_bamiter.h" +#include "medaka_common.h" + +// iterator for reading bam +int read_bam(void *data, bam1_t *b) { + mplp_data *aux = (mplp_data*) data; + uint8_t *tag; + bool check_tag = (strcmp(aux->tag_name, "") != 0); + bool have_rg = (aux->read_group != NULL); + uint8_t *rg; + char *rg_val; + int ret; + while (1) { + ret = aux->iter ? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); + if (ret<0) break; + // only take primary alignments + if (b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY | BAM_FQCFAIL | BAM_FDUP)) continue; + // filter by mapping quality + if ((int)b->core.qual < aux->min_mapQ) continue; + // filter by tag + if (check_tag) { + tag = bam_aux_get((const bam1_t*) b, aux->tag_name); + if (tag == NULL){ // tag isn't present or is currupt + if (aux->keep_missing) { + break; + } else { + continue; + } + } + int tag_value = bam_aux2i(tag); + if (errno == EINVAL) continue; // tag was not integer + if (tag_value != aux->tag_value) continue; + } + // filter by RG (read group): + if (have_rg) { + rg = bam_aux_get((const bam1_t*) b, "RG"); + if (rg == NULL) continue; // missing + rg_val = bam_aux2Z(rg); + if (errno == EINVAL) continue; // bad parse + if (strcmp(aux->read_group, rg_val) != 0) continue; // not wanted + } + break; + } + return ret; +} + + +// Initialise BAM file, index and header structures +bam_fset* create_bam_fset(const char* fname) { + bam_fset* fset = xalloc(1, sizeof(bam_fset), "bam fileset"); + fset->fp = hts_open(fname, "rb"); + fset->idx = sam_index_load(fset->fp, fname); + fset->hdr = sam_hdr_read(fset->fp); + if (fset->hdr == 
0 || fset->idx == 0 || fset->fp == 0) { + destroy_bam_fset(fset); + fprintf(stderr, "Failed to read .bam file '%s'.", fname); + exit(1); + } + return fset; +} + + +// Destory BAM file, index and header structures +void destroy_bam_fset(bam_fset* fset) { + hts_close(fset->fp); + hts_idx_destroy(fset->idx); + sam_hdr_destroy(fset->hdr); + free(fset); +} diff --git a/src/medaka_bamiter.h b/src/medaka_bamiter.h new file mode 100644 index 0000000..100c632 --- /dev/null +++ b/src/medaka_bamiter.h @@ -0,0 +1,37 @@ +#ifndef _MEDAKA_BAMITER_H +#define _MEDAKA_BAMITER_H + +#include +#include "htslib/sam.h" + +// parameters for bam iteration +typedef struct { + htsFile *fp; + sam_hdr_t *hdr; + hts_itr_t *iter; + int min_mapQ; + char tag_name[2]; + int tag_value; + bool keep_missing; + const char *read_group; +} mplp_data; + + +typedef struct { + htsFile *fp; + hts_idx_t *idx; + sam_hdr_t *hdr; +} bam_fset; + + +// Initialise BAM file, index and header structures +bam_fset* create_bam_fset(const char* fname); + +// Destory BAM file, index and header structures +void destroy_bam_fset(bam_fset* fset); + + +// iterator for reading bam +int read_bam(void *data, bam1_t *b); + +#endif diff --git a/src/medaka_common.c b/src/medaka_common.c new file mode 100644 index 0000000..ba06b03 --- /dev/null +++ b/src/medaka_common.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include + +#include "medaka_common.h" + + +/** Allocates zero-initialised memory with a message on failure. + * + * @param num number of elements to allocate. + * @param size size of each element. + * @param msg message to describe allocation on failure. + * @returns pointer to allocated memory + * + */ +void *xalloc(size_t num, size_t size, char* msg){ + void *res = calloc(num, size); + if (res == NULL){ + fprintf(stderr, "Failed to allocate mem for %s\n", msg); + exit(1); + } + return res; +} + + +/** Reallocates memory with a message on failure. + * + * @param ptr pointer to realloc. 
+ * @param size size of each element. + * @param msg message to describe allocation on failure. + * @returns pointer to allocated memory + * + */ +void *xrealloc(void *ptr, size_t size, char* msg){ + void *res = realloc(ptr, size); + if (res == NULL){ + fprintf(stderr, "Failed to reallocate mem for %s\n", msg); + exit(1); + } + return res; +} + + +/** Retrieves a substring. + * + * @param string input string. + * @param postion start position of substring. + * @param length length of substring required. + * @returns string pointer. + * + */ +char *substring(char *string, int position, int length) { + char *ptr; + size_t i; + + ptr = malloc(length + 1); + + for (i = 0 ; i < length ; i++) { + *(ptr + i) = *(string + position); + string++; + } + + *(ptr + i) = '\0'; + return ptr; +} + + +/** Format a uint32_t to a string + * + * @param value to format. + * @param dst destination char. + * @returns length of string. + * + */ +size_t uint8_to_str(uint8_t value, char *dst) { + static char* digits[] = { + "0","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20", + "21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40", + "41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60", + "61","62","63","64","65","66","67","68","69","70","71","72","73","74","75","76","77","78","79","80", + "81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","98","99","100", + "101","102","103","104","105","106","107","108","109","110","111","112","113","114","115","116","117","118","119","120", + "121","122","123","124","125","126","127","128","129","130","131","132","133","134","135","136","137","138","139","140", + "141","142","143","144","145","146","147","148","149","150","151","152","153","154","155","156","157","158","159","160", + 
"161","162","163","164","165","166","167","168","169","170","171","172","173","174","175","176","177","178","179","180", + "181","182","183","184","185","186","187","188","189","190","191","192","193","194","195","196","197","198","199","200", + "201","202","203","204","205","206","207","208","209","210","211","212","213","214","215","216","217","218","219","220", + "221","222","223","224","225","226","227","228","229","230","231","232","233","234","235","236","237","238","239","240", + "241","242","243","244","245","246","247","248","249","250","251","252","253","254","255"}; + static const uint8_t TEN = 10; + static const uint8_t HUNDRED = 100; + strcpy(dst, digits[value]); + if (value < TEN) return 1; + if (value < HUNDRED) return 2; + else return 3; +} + diff --git a/src/medaka_common.h b/src/medaka_common.h new file mode 100644 index 0000000..2f06bf6 --- /dev/null +++ b/src/medaka_common.h @@ -0,0 +1,60 @@ +#ifndef _MEDAKA_COMMON_H +#define _MEDAKA_COMMON_H + +#include + + +/** Simple integer min/max + * @param a + * @param b + * + * @returns the min/max of a and b + * + */ +static inline int max ( int a, int b ) { return a > b ? a : b; } +static inline int min ( int a, int b ) { return a < b ? a : b; } + + +/** Allocates zero-initialised memory with a message on failure. + * + * @param num number of elements to allocate. + * @param size size of each element. + * @param msg message to describe allocation on failure. + * @returns pointer to allocated memory + * + */ +void *xalloc(size_t num, size_t size, char* msg); + + +/** Reallocates memory with a message on failure. + * + * @param ptr pointer to realloc. + * @param size size of each element. + * @param msg message to describe allocation on failure. + * @returns pointer to allocated memory + * + */ +void *xrealloc(void *ptr, size_t size, char* msg); + + +/** Retrieves a substring. + * + * @param string input string. + * @param postion start position of substring. 
+ * @param length length of substring required. + * @returns string pointer. + * + */ +char *substring(char *string, int position, int length); + + +/** Format a uint32_t to a string + * + * @param value to format. + * @param dst destination char. + * @returns length of string. + * + */ +size_t uint8_to_str(uint8_t value, char *dst); + +#endif diff --git a/src/medaka_khcounter.c b/src/medaka_khcounter.c new file mode 100644 index 0000000..a8e9577 --- /dev/null +++ b/src/medaka_khcounter.c @@ -0,0 +1,135 @@ +// Wrap khash to make it more consise to use + +#define _GNU_SOURCE +#include +#include +#include "medaka_khcounter.h" +#include "medaka_common.h" + +/* Implementation of a counter of strings (increasing only) + * + * khash_t(KH_COUNTER) *h = kh_init(KH_COUNTER); + * kh_counter_increment(h, "one"); + * kh_counter_increment(h, "two"); + * kh_counter_increment(h, "two"); + * kh_counter_add(h, "three", 2); + * kh_counter_increment(h, "three"); + * kh_counter_print(h); + * kh_counter_destroy(h); + * + */ + +int kh_counter_val(khash_t(KH_COUNTER) *hash, char *key) { + khiter_t k = kh_get(KH_COUNTER, hash, key); + int val = k != kh_end(hash) ? 
kh_val(hash, k) : 0; + return val; +} + +size_t kh_counter_add(khash_t(KH_COUNTER) *hash, char *key, int val) { + // note: key is copied so no need for caller to hold on to it + int ret; + khiter_t k = kh_put(KH_COUNTER, hash, key, &ret); + if (ret == 1) { // new key + kh_key(hash, k) = strdup(key); + kh_value(hash, k) = val; + } else if (ret == 0) { // exists + // get value and add + int cur = kh_val(hash, k); + kh_value(hash, k) = cur + val; + } else { + // shouldnt get here - previously deleted key + } + return ret; +} + +size_t kh_counter_sub(khash_t(KH_COUNTER) *hash, char *key, int val) { + // note: key is copied so no need for caller to hold on to it + int ret; + khiter_t k = kh_put(KH_COUNTER, hash, key, &ret); + if (ret == 1) { // new key + kh_key(hash, k) = strdup(key); + kh_value(hash, k) = -val; + } else if (ret == 0) { // exists + // get value and add + int cur = kh_val(hash, k); + kh_value(hash, k) = cur - val; + } else { + // shouldnt get here - previously deleted key + } + return ret; +} + + +size_t kh_counter_increment(khash_t(KH_COUNTER) *hash, char *key) { + return kh_counter_add(hash, key, 1); +} + +kh_counter_stats_t kh_counter_stats(khash_t(KH_COUNTER) *hash) { + kh_counter_stats_t stats = { .sum=0, .max=0}; + for (khiter_t k = kh_begin(hash); k != kh_end(hash); k++) { + if (kh_exist(hash, k)) { + int val = kh_val(hash, k); + stats.sum += val; + stats.max = max(stats.max, val); + } + } + return stats; +} + +void kh_counter_destroy(khash_t(KH_COUNTER) *hash) { + for (khiter_t k = 0; k < kh_end(hash); k++){ + if (kh_exist(hash, k)) { + free((char*) kh_key(hash, k)); + } + } + kh_destroy(KH_COUNTER, hash); +} + +void kh_counter_print(khash_t(KH_COUNTER) *hash) { + for (khiter_t k = kh_begin(hash); k != kh_end(hash); k++) { + if (kh_exist(hash, k)) { + const char *key = kh_key(hash, k); + int val = kh_val(hash, k); + printf("%s -> %i\n", key, val); + } + } + kh_counter_stats_t stats = kh_counter_stats(hash); +// printf("max: %i, sum: %i\n", 
stats.max, stats.sum); +} + + +int kh_int_counter_val(khash_t(KH_INT_COUNTER) *hash, int key) { + khiter_t k = kh_get(KH_INT_COUNTER, hash, key); + int val = k != kh_end(hash) ? kh_val(hash, k) : -1; + return val; +} + + +size_t kh_int_counter_add(khash_t(KH_INT_COUNTER) *hash, int key, int val) { + + int ret; + khiter_t k = kh_put(KH_INT_COUNTER, hash, key, &ret); + if (ret == 1) { // new key + kh_value(hash, k) = val; + } else if (ret == 0) { + int cur = kh_val(hash, k); + kh_value(hash, k) = cur + val; + } + return ret; +} + +void kh_int_counter_destroy(khash_t(KH_INT_COUNTER) *hash) { + kh_destroy(KH_INT_COUNTER, hash); +} + +//int (int argc, char *argv[]) { +// khash_t(KH_COUNTER) *h = kh_init(KH_COUNTER); +// kh_counter_increment(h, "one"); +// kh_counter_increment(h, "two"); +// kh_counter_increment(h, "two"); +// kh_counter_add(h, "three", 2); +// kh_counter_increment(h, "three"); +// kh_counter_print(h); +// kh_counter_destroy(h); +// printf("-------\n\n"); +//} diff --git a/src/medaka_khcounter.h b/src/medaka_khcounter.h new file mode 100644 index 0000000..1ad46c8 --- /dev/null +++ b/src/medaka_khcounter.h @@ -0,0 +1,53 @@ +#ifndef _MEDAKA_KHCOUNTER_H +#define _MEDAKA_KHCOUNTER_H + +#include "khash.h" + +typedef struct kh_counter_stats_t { + size_t sum; + size_t max; +} kh_counter_stats_t; + +KHASH_MAP_INIT_STR(KH_COUNTER, int) +KHASH_MAP_INIT_INT(KH_INT_COUNTER, int) + +// create a counter +static inline khash_t(KH_COUNTER) *kh_counter_init() { + khash_t(KH_COUNTER) *h = kh_init(KH_COUNTER); + return h; +} + +static inline khash_t(KH_INT_COUNTER) *kh_int_counter_init() { + khash_t(KH_INT_COUNTER) *h = kh_init(KH_INT_COUNTER); + return h; +} + +// Get a value from a counter +int kh_counter_val(khash_t(KH_COUNTER) *hash, char *key); + +// Clean up a counter +void kh_counter_destroy(khash_t(KH_COUNTER) *hash); + +// Increment a counter by one +size_t kh_counter_increment(khash_t(KH_COUNTER) *hash, char *key); + +size_t kh_counter_sub(khash_t(KH_COUNTER) 
*hash, char *key, int val); + +// Increment a counter by a given amount +size_t kh_counter_add(khash_t(KH_COUNTER) *hash, char *key, int val); + +// Retrieve statistics on counter +kh_counter_stats_t kh_counter_stats(khash_t(KH_COUNTER) *hash); + +// Print contents of a counter +void kh_counter_print(khash_t(KH_COUNTER) *hash); + +// similar to the kh_counter, except that the key is integer +int kh_int_counter_val(khash_t(KH_INT_COUNTER) *hash, int key); + +size_t kh_int_counter_add(khash_t(KH_INT_COUNTER) *hash, int key, int val); + +void kh_int_counter_destroy(khash_t(KH_INT_COUNTER) *hash); + + +#endif From c8fc4b336a51ac930417f7bbc885495ce40e299a Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 12:54:26 +0800 Subject: [PATCH 02/43] full-alignment create tensor with cffi --- .../CreateTensorFullAlignmentFromCffi.py | 278 ++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 preprocess/CreateTensorFullAlignmentFromCffi.py diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py new file mode 100644 index 0000000..3a1ba8c --- /dev/null +++ b/preprocess/CreateTensorFullAlignmentFromCffi.py @@ -0,0 +1,278 @@ +import os +import shlex +import logging +import numpy as np +from argparse import ArgumentParser, SUPPRESS +from collections import defaultdict + +import libclair3 +import shared.param_f as param +from shared.utils import subprocess_popen, file_path_from, IUPAC_base_to_num_dict as BASE2NUM, str2bool, vcf_candidates_from +from shared.interval_tree import bed_tree_from + +logging.basicConfig(format='%(message)s', level=logging.INFO) +no_of_positions = param.no_of_positions +flanking_base_num = param.flankingBaseNum +channel_size = param.channel_size + + +def CreateTensorFullAlignment(args): + + ctg_start = args.ctgStart + ctg_end = args.ctgEnd + full_aln_regions = args.full_aln_regions + fasta_file_path = args.ref_fn + ctg_name = args.ctgName + bam_file_path = args.bam_fn + 
extend_bp = param.extend_bp + platform = args.platform + phased_vcf_fn = args.phased_vcf_fn + + vcf_fn = file_path_from(args.vcf_fn) + is_known_vcf_file_provided = vcf_fn is not None + chunk_id = args.chunk_id - 1 if args.chunk_id else None # 1-base to 0-base + chunk_num = args.chunk_num + extend_bed = file_path_from(args.extend_bed) + is_extend_bed_file_given = extend_bed is not None + confident_bed_fn = file_path_from(args.bed_fn) + is_confident_bed_file_given = confident_bed_fn is not None + + # we would't haplotag reads if --no_phasing_for_fa option is enabled + need_haplotagging = args.no_phasing_for_fa is not True + candidates_set = set() + + if full_aln_regions: + + """ + If given full alignment bed regions, all candidate positions will be directly selected from each row, define as + 'ctg start end', where 0-based center position is the candidate for full alignment calling. + if 'need_haplotagging' option enables, full alignment bed regions will also include nearby heterozygous snp candidates for reads + haplotag, which is faster than whatshap haplotag with more memory occupation. 
+ """ + + candidate_file_path_process = subprocess_popen(shlex.split("gzip -fdc %s" % (full_aln_regions))) + candidate_file_path_output = candidate_file_path_process.stdout + + ctg_start, ctg_end = float('inf'), 0 + for row in candidate_file_path_output: + row = row.rstrip().split('\t') + if row[0] != ctg_name: continue + position = int(row[1]) + 1 + end = int(row[2]) + 1 + ctg_start = min(position, ctg_start) + ctg_end = max(end, ctg_end) + + if platform == "ilmn": + continue + if len(row) > 3: # hete snp positions + center_pos = position + extend_bp + 1 + ref_base, alt_base, genotype, phase_set = row[3].split('-') + else: + center = position + (end - position) // 2 - 1 + candidates_set.add(center) + + candidate_file_path_output.close() + candidate_file_path_process.wait() + + if is_known_vcf_file_provided: + known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name) + candidates_set = set(known_variants_list) + + variant_list = [] + if need_haplotagging and phased_vcf_fn and os.path.exists(phased_vcf_fn): + # if need_haplotagging option enables, scan the phased vcf file and store the heterozygous SNP candidates from each phase set + unzip_process = subprocess_popen(shlex.split("gzip -fdc %s" % (phased_vcf_fn))) + for row in unzip_process.stdout: + row = row.rstrip() + if row[0] == '#': + continue + columns = row.strip().split('\t') + contig_name = columns[0] + if ctg_name and contig_name != ctg_name: + continue + pos = int(columns[1]) + ref_base = columns[3] + alt_base = columns[4] + genotype_info = columns[9].split(':') + genotype, phase_set = genotype_info[0], genotype_info[-1] + if '|' not in genotype: # unphasable + continue + genotype = ('1' if genotype == '0|1' else '2') + + # use a C Variant struct to store all phased infos + variant_list.append(libclair3.ffi.new("struct Variant *", [pos-1, ref_base.encode(), alt_base.encode(), int(genotype), int(phase_set)])) + + variant_num = len(variant_list) + Variants = libclair3.ffi.new("struct 
Variant *[]", variant_list) + + + # 1-index to 0-index + candidates_list = sorted(list(set([item-1 for item in candidates_set if item >= ctg_start and item <= ctg_end]))) + + region_str = '{}:{}-{}'.format(ctg_name, ctg_start, ctg_end).encode() + candidate_num = len(candidates_list) + + candidates = libclair3.ffi.new("size_t [{}]".format(candidate_num), candidates_list) + + fa_data = libclair3.lib.calculate_clair3_full_alignment(region_str, bam_file_path.encode(), fasta_file_path.encode(), + Variants, variant_num, candidates, candidate_num) + + # use np buffer to get the matrix + matrix_depth = param.matrix_depth_dict[platform] + ffi = libclair3.ffi + _dtype = np.int8 + size_sizet = np.dtype(_dtype).itemsize + np_fa_data = np.frombuffer(ffi.buffer( + fa_data.matrix, size_sizet * matrix_depth * no_of_positions * channel_size * candidate_num), + dtype=_dtype + ).reshape(candidate_num, matrix_depth, no_of_positions, channel_size).copy() + + + all_position_info, all_alt_info = [], [] + for idx in range(candidate_num): + # decode the C char* to python string + alt_info_string = ffi.string(fa_data.all_alt_info[idx]).decode('utf8', 'ignore') + alt_info = alt_info_string.rstrip().split('-') + pos, depth, center_ref_base, alt = alt_info[:4] + all_position_info.append(ctg_name + ':' + pos + ':' + center_ref_base) + all_alt_info.append(depth + '-' + alt) + + libclair3.lib.destroy_fa_data(fa_data) + + return np_fa_data, all_position_info, all_alt_info + + +def main(): + parser = ArgumentParser(description="Generate variant candidate tensors using phased full-alignment") + + parser.add_argument('--platform', type=str, default='ont', + help="Sequencing platform of the input. 
Options: 'ont,hifi,ilmn', default: %(default)s") + + parser.add_argument('--bam_fn', type=str, default="input.bam", required=True, + help="Sorted BAM file input, required") + + parser.add_argument('--ref_fn', type=str, default="ref.fa", required=True, + help="Reference fasta file input, required") + + parser.add_argument('--tensor_can_fn', type=str, default="PIPE", + help="Tensor output, stdout by default, default: %(default)s") + + parser.add_argument('--vcf_fn', type=str, default=None, + help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file, default: %(default)s") + + parser.add_argument('--min_af', type=float, default=0.08, + help="Minimum allele frequency for both SNP and Indel for a site to be considered as a condidate site, default: %(default)f") + + parser.add_argument('--snp_min_af', type=float, default=0.08, + help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--indel_min_af', type=float, default=0.15, + help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--ctgName', type=str, default=None, + help="The name of sequence to be processed, required if --bed_fn is not defined") + + parser.add_argument('--ctgStart', type=int, default=None, + help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--ctgEnd', type=int, default=None, + help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--bed_fn', type=str, default=None, + help="Call variant only in the provided regions. 
Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set") + + parser.add_argument('--gvcf', type=str2bool, default=False, + help="Enable GVCF output, default: disabled") + + parser.add_argument('--sampleName', type=str, default="SAMPLE", + help="Define the sample name to be shown in the GVCF file") + + parser.add_argument('--samtools', type=str, default="samtools", + help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s") + + # options for advanced users + parser.add_argument('--minCoverage', type=float, default=param.min_coverage, + help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") + + parser.add_argument('--minMQ', type=int, default=param.min_mq, + help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d") + + parser.add_argument('--minBQ', type=int, default=param.min_bq, + help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d") + + parser.add_argument('--max_depth', type=int, default=param.max_depth, + help="EXPERIMENTAL: Maximum full alignment depth to be processed. 
default: %(default)s") + + # options for debug purpose + parser.add_argument('--phasing_info_in_bam', action='store_true', + help="DEBUG: Skip phasing and use the phasing info provided in the input BAM (HP tag), default: False") + + parser.add_argument('--phasing_window_size', type=int, default=param.phasing_window_size, + help="DEBUG: The window size for read phasing") + + parser.add_argument('--extend_bed', nargs='?', action="store", type=str, default=None, + help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp") + + parser.add_argument('--indel_fn', type=str, default=None, + help="DEBUG: Output all alternative indel cigar for debug purpose") + + parser.add_argument('--base_err', default=0.001, type=float, + help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f') + + parser.add_argument('--gq_bin_size', default=5, type=int, + help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d') + + parser.add_argument('--bp_resolution', action='store_true', + help="DEBUG: Enable bp resolution for GVCF, default: disabled") + + # options for internal process control + ## Path to the 'zstd' compression + parser.add_argument('--zstd', type=str, default=param.zstd, + help=SUPPRESS) + + ## Test in specific candidate position. Only for testing + parser.add_argument('--test_pos', type=int, default=0, + help=SUPPRESS) + + ## The number of chunks to be divided into for parallel processing + parser.add_argument('--chunk_num', type=int, default=None, + help=SUPPRESS) + + ## The chunk ID to work on + parser.add_argument('--chunk_id', type=int, default=None, + help=SUPPRESS) + + ## Use heterozygous SNP variants in phased vcf file for haplotagging + parser.add_argument('--phased_vcf_fn', type=str, default=None, + help=SUPPRESS) + ## Apply no phased data in training. 
Only works in data training, default: False + parser.add_argument('--add_no_phasing_data_training', action='store_true', + help=SUPPRESS) + + ## Output representation unification infos, which refines training labels + parser.add_argument('--unify_repre', action='store_true', + help=SUPPRESS) + + ## Path of representation unification output + parser.add_argument('--unify_repre_fn', type=str, default=None, + help=SUPPRESS) + + ## Provide the regions to be included in full-alignment based calling + parser.add_argument('--full_aln_regions', type=str, default=None, + help=SUPPRESS) + + ## Use Clair3's own phasing module for read level phasing when creating tensor, compared to using Whatshap, speed is faster but has higher memory footprint, default: False + parser.add_argument('--need_haplotagging', action='store_true', + help=SUPPRESS) + + ## Apply read realignment for illumina platform. Greatly boost indel performance in trade of running time + parser.add_argument('--need_realignment', action='store_true', + help=SUPPRESS) + + args = parser.parse_args() + + CreateTensorFullAlignment(args) + + +if __name__ == "__main__": + main() \ No newline at end of file From f6368c576d3f7f77bc3ae038db74062b8576e0c3 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 12:55:10 +0800 Subject: [PATCH 03/43] reuse region function from medaka --- preprocess/medaka_utils.py | 95 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 preprocess/medaka_utils.py diff --git a/preprocess/medaka_utils.py b/preprocess/medaka_utils.py new file mode 100644 index 0000000..702c103 --- /dev/null +++ b/preprocess/medaka_utils.py @@ -0,0 +1,95 @@ +import os +import collections + + +_Region = collections.namedtuple('Region', 'ref_name start end') + +class Region(_Region): + """Represents a genomic region.""" + + @property + def name(self): + """Samtools-style region string, zero-base end exclusive.""" + return self.__str__() + + def __str__(self): + """Return 
string representation of region.""" + # This will be zero-based, end exclusive + start = 0 if self.start is None else self.start + end = '' if self.end is None else self.end + return '{}:{}-{}'.format(self.ref_name, start, end) + + @property + def size(self): + """Return size of region.""" + return self.end - self.start + + @classmethod + def from_string(cls, region): + """Parse region string into `Region` objects. + + :param region: region str + + >>> Region.from_string('Ecoli') == Region( + ... ref_name='Ecoli', start=None, end=None) + True + >>> Region.from_string('Ecoli:1000-2000') == Region( + ... ref_name='Ecoli', start=1000, end=2000) + True + >>> Region.from_string('Ecoli:1000') == Region( + ... ref_name='Ecoli', start=1000, end=None) + True + >>> Region.from_string('Ecoli:-1000') == Region( + ... ref_name='Ecoli', start=0, end=1000) + True + >>> Region.from_string('Ecoli:500-') == Region( + ... ref_name='Ecoli', start=500, end=None) + True + >>> Region.from_string('A:B:c:500-') == Region( + ... ref_name='A:B:c', start=500, end=None) + True + """ + if ':' not in region: + ref_name, start, end = region, None, None + else: + start, end = None, None + ref_name, bounds = region.rsplit(':', 1) + if bounds[0] == '-': + start = 0 + end = int(bounds.replace('-', '')) + elif '-' not in bounds: + start = int(bounds) + end = None + elif bounds[-1] == '-': + start = int(bounds[:-1]) + end = None + else: + start, end = [int(b) for b in bounds.split('-')] + return cls(ref_name, start, end) + + def split(region, size, overlap=0, fixed_size=True): + """Split region into sub-regions of a given length. + + :param size: size of sub-regions. + :param overlap: overlap between ends of sub-regions. + :param fixed_size: ensure all sub-regions are equal in size. If `False` + then the final chunk will be created as the smallest size to + conform with `overlap`. + + :returns: a list of sub-regions. 
+ + """ + regions = list() + if size >= region.size: + return [region] + for start in range(region.start, region.end, size - overlap): + end = min(start + size, region.end) + regions.append(Region(region.ref_name, start, end)) + if len(regions) > 1: + if fixed_size and regions[-1].size < size: + del regions[-1] + end = region.end + start = end - size + if start > regions[-1].start: + regions.append(Region(region.ref_name, start, end)) + return regions \ No newline at end of file From 7932cbeadbf028199015d2c625e2e3704f6ae4ff Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 12:57:54 +0800 Subject: [PATCH 04/43] pileup cffi function, threading and chunking are disabled currently --- preprocess/CreateTensorPileupFromCffi.py | 465 +++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 preprocess/CreateTensorPileupFromCffi.py diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py new file mode 100644 index 0000000..512f1cb --- /dev/null +++ b/preprocess/CreateTensorPileupFromCffi.py @@ -0,0 +1,465 @@ +import sys +import logging +import queue +import concurrent.futures +import numpy as np + +from argparse import ArgumentParser, SUPPRESS +from contextlib import contextmanager + +import libclair3 +import shared.param_p as param +from shared.interval_tree import bed_tree_from, is_region_in +from shared.utils import file_path_from, IUPAC_base_to_num_dict as BASE2NUM, str2bool, vcf_candidates_from +from preprocess.medaka_utils import Region + +logging.getLogger().setLevel(logging.INFO) + +flanking_base_num = param.flankingBaseNum +no_of_positions = 2 * flanking_base_num + 1 +channel = param.channel +channel_size = len(channel) + + +def pileup_counts_clair3( + region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, gvcf, \ + max_depth, region_split=100000, workers=1): + """Create pileup counts feature array for region. 
+ + :param region: `medaka.common.Region` object + :param bam: .bam file with alignments. + :param dtype_prefixes: prefixes for query names which to separate counts. + If `None` (or of length 1), counts are not split. + :param region_split: largest region to process in single thread. + Regions are processed in parallel and stitched before being returned. + :param workers: worker threads for calculating pileup. + :param tag_name: two letter tag name by which to filter reads. + :param tag_value: integer value of tag for reads to keep. + :param keep_missing: whether to keep reads when tag is missing. + :param num_qstrat: number of layers for qscore stratification. + :param weibull_summation: use a Weibull partial-counts approach, + requires 'WL' and 'WK' float-array tags. + + :returns: iterator of tuples + (pileup counts array, reference positions, insertion positions) + Multiple chunks are returned if there are discontinuities in + positions caused e.g. by gaps in coverage. + """ + lib = libclair3.lib + featlenclair3 = lib.featlenclair3 + bam = BAMHandler(bam) + + def _process_region(reg): + # ctg start is 1-based, medaka.common.Region object is 0-based + region_str = '{}:{}-{}'.format(reg.ref_name, max(0, reg.start-1), reg.end) + if isinstance(bam, BAMHandler): + bam_handle = bam + else: + bam_handle = BAMHandler(bam) + with bam_handle.borrow() as fh: + counts = lib.calculate_clair3_pileup( + region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af, min_mq, max_indel_length, call_snp_only, max_depth) + np_counts, positions, alt_info_string_list = _plp_data_to_numpy( + counts, featlenclair3) + + alt_info_list = [] + for alt_info in alt_info_string_list: + alt_info = alt_info.split('-') + # skip mainly because candidate length is larger than maximum indel length + if len(alt_info) < 4: + continue + pos, depth, center_ref_base, alt = alt_info[:4] + alt_info_list.append((int(pos), reg.ref_name + ':' + pos + ':' + center_ref_base, depth + '-' + alt)) 
+ + lib.destroy_plp_data(counts) + return np_counts, positions, alt_info_list + + # we found that splitting into small chunks would lead to some missing truths, + # i.e. the candidates that cross two neighbouring small chunks + region_split = region.end - region.start + regions = region.split(region_split, fixed_size=False) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + results = executor.map(_process_region, regions) + chunk_results, all_alt_info_list = __enforce_pileup_chunk_contiguity(results) + return chunk_results, all_alt_info_list + + +class BAMHandler(object): + """Opening of BAM file handles and indices.""" + + def __init__(self, bam, size=16): + """Initialise a pool of HTSlib filehandles.""" + # note: the default size here is set to match the default + # `bam_workers` of prediction.DataLoader and `workers` + # of features.pileup_counts, such that this class + # should never block computations + self.bam = bam + self._pool = queue.Queue(size) + + lib, ffi = libclair3.lib, libclair3.ffi + for _ in range(size): + fset = ffi.gc( + lib.create_bam_fset(self.bam.encode()), + self._destroy_fset) + self._pool.put(fset) + + @contextmanager + def borrow(self): + """Borrow a BAM file handle and index set.""" + fset = self._pool.get() + try: + yield fset + finally: + self._pool.put(fset) + + def encode(self): + """Return bare path encoded to bytes. + + For legacy compatibility only. + """ + return self.bam.encode() + + def _destroy_fset(self, fset): + libclair3.lib.destroy_bam_fset(fset) + + +def _plp_data_to_numpy(plp_data, n_rows): + """Create numpy representation of feature data. + + Copy the feature matrix and alignment column names from a + `plp_data` structure returned from C library function calls. + + :param plp_data: a cffi proxy to a `plp_data*` pointer + :param nrows: the number of rows in the plp_data.matrix (the number + of elements in the feature per pileup column). 
+ + :returns: pileup counts numpy array, reference positions + + """ + ffi = libclair3.ffi + size_sizet = np.dtype(np.int).itemsize + _dtype = np.int + np_counts = np.frombuffer(ffi.buffer( + plp_data.matrix, size_sizet * plp_data.n_cols * n_rows), + dtype=_dtype + ).reshape(plp_data.n_cols, n_rows).copy() + + alt_info_string_list = [] + candidates_num = plp_data.candidates_num + # decode all alternative information, position-depth-reference_base-alt_info + for i in range(candidates_num): + alt_info_string = ffi.string(plp_data.all_alt_info[i]).decode('utf8', 'ignore').rstrip() + alt_info_string_list.append(alt_info_string) + + positions = np.empty(plp_data.n_cols, dtype=[ + ('major', int), ('minor', int)]) + np.copyto( + positions['major'], np.frombuffer( + ffi.buffer(plp_data.major, size_sizet * plp_data.n_cols), + dtype=_dtype)) + np.copyto( + positions['minor'], + np.frombuffer(ffi.buffer( + plp_data.minor, size_sizet * plp_data.n_cols), dtype=_dtype)) + return np_counts, positions, alt_info_string_list + + +def __enforce_pileup_chunk_contiguity(pileups): + """Split and join ordered pileup chunks to ensure contiguity. + + :param pileups: iterable of (counts, pileups) as constructed by + `_plp_data_to_numpy`. + + :returns: a list of reconstituted (counts, pileups) where discontinuities + in the inputs cause breaks and abutting inputs are joined. 
+ + """ + split_results = list() + all_alt_info_list = list() + # First pass: need to check for discontinuities within chunks, + # these show up as >1 changes in the major coordinate + for counts, positions, alt_info_list in pileups: + move = np.ediff1d(positions['major']) + gaps = np.where(move > 1)[0] + 1 + all_alt_info_list += alt_info_list + if len(gaps) == 0: + split_results.append((counts, positions)) + else: + start = 0 + for i in gaps: + split_results.append((counts[start:i], positions[start:i])) + start = i + split_results.append((counts[start:], positions[start:])) + + # Second pass: stitch abutting chunks together, anything not neighbouring + # is kept separate whether it came from the same chunk originally or not + def _finalize_chunk(c_buf, p_buf): + chunk_counts = np.concatenate(c_buf) + chunk_positions = np.concatenate(p_buf) + return chunk_counts, chunk_positions + + counts_buffer, positions_buffer = list(), list() + chunk_results = list() + last = None + for counts, positions in split_results: + if len(positions) == 0: + continue + first = positions['major'][0] + # should be last -first == 1? + if len(counts_buffer) == 0 or first - last == 1: + # new or contiguous + counts_buffer.append(counts) + positions_buffer.append(positions) + last = positions['major'][-1] + else: + # discontinuity + chunk_results.append(_finalize_chunk( + counts_buffer, positions_buffer)) + counts_buffer = [counts] + positions_buffer = [positions] + last = positions['major'][-1] + if len(counts_buffer) != 0: + chunk_results.append(_finalize_chunk(counts_buffer, positions_buffer)) + return chunk_results, all_alt_info_list + + +def CreateTensorPileup(args): + """ + Create pileup tensor for pileup model training or calling. + Use slide window to scan the whole candidate regions, keep all candidates over specific minimum allelic frequency + and minimum depth, use samtools mpileup to store pileup info for pileup tensor generation. 
Only scan candidate + regions once, we could directly get all variant candidates directly. + """ + ctg_start = args.ctgStart + ctg_end = args.ctgEnd + fasta_file_path = args.ref_fn + ctg_name = args.ctgName + bam_file_path = args.bam_fn + chunk_id = args.chunk_id - 1 if args.chunk_id else None # 1-base to 0-base + chunk_num = args.chunk_num + minimum_snp_af_for_candidate = args.snp_min_af + minimum_indel_af_for_candidate = args.indel_min_af + min_coverage = args.minCoverage + min_mapping_quality = args.minMQ + platform = args.platform + + vcf_fn = file_path_from(args.vcf_fn) + is_known_vcf_file_provided = vcf_fn is not None + confident_bed_fn = file_path_from(args.extend_bed) + is_confident_bed_file_given = confident_bed_fn is not None + extend_bed = file_path_from(args.extend_bed) + is_extend_bed_file_given = extend_bed is not None + fast_mode = args.fast_mode + call_snp_only = args.call_snp_only + # enable_long_indel = args.enable_long_indel + # 1-based regions [start, end] (start and end inclusive) + tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed, + contig_name=ctg_name, + return_bed_region=True) + + fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.') + + fast_mode = platform == 'ont' and fast_mode + minimum_snp_af_for_candidate = max(minimum_snp_af_for_candidate, param.min_af_dict[platform]) if fast_mode else minimum_snp_af_for_candidate + min_coverage = max(min_coverage, 4) if fast_mode else min_coverage + max_indel_length = param.maximum_variant_length_that_need_infer #if not enable_long_indel else param.maximum_variant_length_that_need_infer_include_long_indel + + if not is_confident_bed_file_given and chunk_id is not None: + contig_length = 0 + with open(fai_fn, 'r') as fai_fp: + for row in fai_fp: + columns = row.strip().split("\t") + + contig_name = columns[0] + if contig_name != ctg_name: + continue + contig_length = int(columns[1]) + chunk_size = contig_length // chunk_num + 1 if contig_length % 
chunk_num else contig_length // chunk_num + ctg_start = chunk_size * chunk_id # 0-base to 1-base + ctg_end = ctg_start + chunk_size + + if is_confident_bed_file_given and chunk_id is not None: + chunk_size = (bed_end - bed_start) // chunk_num + 1 if (bed_end - bed_start) % chunk_num else (bed_end - bed_start) // chunk_num + ctg_start = bed_start + 1 + chunk_size * chunk_id # 0-base to 1-base + ctg_end = ctg_start + chunk_size + + if is_known_vcf_file_provided and chunk_id is not None: + known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name) + total_variants_size = len(known_variants_list) + chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 else total_variants_size // chunk_num + 1 + chunk_start_pos = chunk_id * chunk_variants_size + known_variants_set = set(known_variants_list[chunk_start_pos: chunk_start_pos + chunk_variants_size]) + if len(known_variants_set) == 0: + return [], [], [] + ctg_start, ctg_end = min(known_variants_set), max(known_variants_set) + + is_ctg_name_given = ctg_name is not None + is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None + if is_ctg_range_given: + extend_start = max(1, ctg_start - no_of_positions) + extend_end = ctg_end + no_of_positions + + region_str = "{}:{}-{}".format(ctg_name, extend_start, extend_end) + region = Region.from_string(region_str) + + confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn, contig_name=ctg_name, bed_ctg_start=extend_start, + bed_ctg_end=extend_end) + + chunk_result, all_alt_info_list = pileup_counts_clair3(region, + bam=bam_file_path, + fasta=fasta_file_path, + min_depth=min_coverage, + min_snp_af=minimum_snp_af_for_candidate, + min_indel_af=minimum_indel_af_for_candidate, + min_mq=min_mapping_quality, + max_indel_length=max_indel_length, + call_snp_only=call_snp_only, + max_depth=param.max_depth, + gvcf=args.gvcf) + + # slice all candidates tensor according to the alternative 
information + np_pileup_data, all_position_info, all_alt_info = [], [], [] + for idx, (pos, pos_info, alt_info) in enumerate(all_alt_info_list): + pos = int(pos) + pass_confident_bed = not is_confident_bed_file_given or is_region_in(tree=confident_bed_tree, + contig_name=ctg_name, + region_start=pos - 1, + region_end=pos + 1) + + pass_vcf_region = not is_known_vcf_file_provided or (is_known_vcf_file_provided and pos in known_variants_set) + + if not pass_confident_bed or not pass_vcf_region: + continue + start, end = pos - flanking_base_num, pos + flanking_base_num + 1 + for result in chunk_result: + if start - 1 >= result[1][0][0] and end <= result[1][-1][0]: + offset = start - result[1][0][0] - 1 + tensor = result[0][offset: offset+no_of_positions] + # mainly because no coverage in flanking windows + if tensor.shape != (no_of_positions, channel_size): + continue + # check any empty columns in flanking position, those columns with all zeros + if np.sum(np.sum(tensor == 0, axis=1) == channel_size) > 0: + continue + np_pileup_data.append(tensor) + all_position_info.append(pos_info) + all_alt_info.append(alt_info) + np_pileup_data = np.array(np_pileup_data, dtype=np.int32) + + return np_pileup_data, all_position_info, all_alt_info + + +def main(): + parser = ArgumentParser(description="Generate variant candidate tensors using pileup") + + parser.add_argument('--platform', type=str, default='ont', + help="Sequencing platform of the input. 
Options: 'ont,hifi,ilmn', default: %(default)s") + + parser.add_argument('--bam_fn', type=str, default="input.bam", required=True, + help="Sorted BAM file input, required") + + parser.add_argument('--ref_fn', type=str, default="ref.fa", required=True, + help="Reference fasta file input, required") + + parser.add_argument('--tensor_can_fn', type=str, default="PIPE", + help="Tensor output, stdout by default, default: %(default)s") + + parser.add_argument('--vcf_fn', type=str, default=None, + help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file, default: %(default)s") + + parser.add_argument('--min_af', type=float, default=0.08, + help="Minimum allele frequency for both SNP and Indel for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--snp_min_af', type=float, default=0.08, + help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--indel_min_af', type=float, default=0.15, + help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--ctgName', type=str, default=None, + help="The name of sequence to be processed, required if --bed_fn is not defined") + + parser.add_argument('--ctgStart', type=int, default=None, + help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--ctgEnd', type=int, default=None, + help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--bed_fn', type=str, default=None, + help="Call variant only in the provided regions. 
Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set") + + parser.add_argument('--gvcf', type=str2bool, default=False, + help="Enable GVCF output, default: disabled") + + parser.add_argument('--sampleName', type=str, default="SAMPLE", + help="Define the sample name to be shown in the VCF file, default: %(default)s") + + parser.add_argument('--samtools', type=str, default="samtools", + help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s") + + # options for advanced users + parser.add_argument('--fast_mode', type=str2bool, default=False, + help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s") + + parser.add_argument('--minCoverage', type=float, default=2, + help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") + + parser.add_argument('--minMQ', type=int, default=param.min_mq, + help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d") + + parser.add_argument('--minBQ', type=int, default=param.min_bq, + help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d") + + parser.add_argument('--max_depth', type=int, default=param.max_depth, + help="EXPERIMENTAL: Maximum pileup depth to be processed. 
default: %(default)s") + + parser.add_argument('--call_snp_only', type=str2bool, default=False, + help="EXPERIMENTAL: Call candidates pass snp minimum AF only, ignore Indel candidates") + + # options for debug purpose + parser.add_argument('--extend_bed', type=str, default=None, + help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp") + + parser.add_argument('--temp_file_dir', type=str, default="./", + help="EXPERIMENTAL: The cache directory for storing temporary non-variant information if --gvcf is enabled, default: %(default)s") + + parser.add_argument('--indel_fn', type=str, default=None, + help="DEBUG: Output all alternative indel cigar for debug purpose") + + parser.add_argument('--base_err', default=param.base_err, type=float, + help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f') + + parser.add_argument('--gq_bin_size', default=param.gq_bin_size, type=int, + help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d') + + parser.add_argument('--bp_resolution', action='store_true', + help="DEBUG: Enable bp resolution for GVCF, default: disabled") + + # options for internal process control + ## Path to the 'zstd' compression + parser.add_argument('--zstd', type=str, default=param.zstd, + help=SUPPRESS) + + ## Test in specific candidate position. 
Only for testing + parser.add_argument('--test_pos', type=int, default=0, + help=SUPPRESS) + + ## The number of chucks to be divided into for parallel processing + parser.add_argument('--chunk_num', type=int, default=None, + help=SUPPRESS) + + ## The chuck ID to work on + parser.add_argument('--chunk_id', type=int, default=None, + help=SUPPRESS) + + args = parser.parse_args() + + if len(sys.argv[1:]) == 0: + parser.print_help() + sys.exit(1) + + CreateTensorPileup(args) + + +if __name__ == "__main__": + main() From a6f7c88df8f5245f7681c89d2110063736be669a Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:04:26 +0800 Subject: [PATCH 05/43] integrate the triton inference server from nvidia, and directly use the numpy buffer to get the input matrix from create tensor function, reuse all CallVariants function --- clair3/CallVariantsFromCffi.py | 335 +++++++++++++++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 clair3/CallVariantsFromCffi.py diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py new file mode 100644 index 0000000..8df9d5b --- /dev/null +++ b/clair3/CallVariantsFromCffi.py @@ -0,0 +1,335 @@ +import sys +import os +import tensorflow as tf +import logging +from time import time +from argparse import ArgumentParser, SUPPRESS + +import tritonclient.grpc as tritongrpcclient + +from shared.utils import str2bool, log_error +from clair3.CallVariants import OutputConfig, output_utilties_from, batch_output + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +logging.basicConfig(format='%(message)s', level=logging.INFO) + + +def Run(args): + os.environ["OMP_NUM_THREADS"] = "1" + os.environ["OPENBLAS_NUM_THREADS"] = "1" + os.environ["MKL_NUM_THREADS"] = "1" + os.environ["NUMEXPR_NUM_THREADS"] = "1" + + tf.config.threading.set_intra_op_parallelism_threads(1) + tf.config.threading.set_inter_op_parallelism_threads(1) + + global test_pos + test_pos = None + global param + if args.pileup: + import shared.param_p as 
param + else: + import shared.param_f as param + + if args.enable_long_indel: + maximum_variant_length_that_need_infer = param.maximum_variant_length_that_need_infer_include_long_indel + else: + maximum_variant_length_that_need_infer = param.maximum_variant_length_that_need_infer + + output_config = OutputConfig( + is_show_reference=args.showRef, + is_debug=args.debug, + is_haploid_precise_mode_enabled=args.haploid_precise, + is_haploid_sensitive_mode_enabled=args.haploid_sensitive, + is_output_for_ensemble=args.output_for_ensemble, + quality_score_for_pass=args.qual, + tensor_fn=args.tensor_fn, + input_probabilities=args.input_probabilities, + add_indel_length=args.add_indel_length, + gvcf=args.gvcf, + pileup=args.pileup, + enable_long_indel=args.enable_long_indel, + maximum_variant_length_that_need_infer=maximum_variant_length_that_need_infer + ) + output_utilities = output_utilties_from( + sample_name=args.sampleName, + is_debug=args.debug, + is_output_for_ensemble=args.output_for_ensemble, + reference_file_path=args.ref_fn, + output_file_path=args.call_fn, + output_probabilities=args.output_probabilities + ) + + call_variants_from_cffi(args=args, output_config=output_config, output_utilities=output_utilities) + + +def call_variants_from_cffi(args, output_config, output_utilities): + use_gpu = args.use_gpu + if use_gpu: + server_url = 'localhost:8001' + try: + triton_client = tritongrpcclient.InferenceServerClient( + url=server_url, + verbose=False + ) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + else: + os.environ["CUDA_VISIBLE_DEVICES"] = "" + + global param + if args.pileup: + import shared.param_p as param + if use_gpu: + model_name = 'pileup' + input_dtype = 'INT32' + else: + from clair3.model import Clair3_P + m = Clair3_P(add_indel_length=args.add_indel_length, predict=True) + else: + import shared.param_f as param + if use_gpu: + model_name = 'alignment' + input_dtype = 'INT8' + else: + from clair3.model import
Clair3_F + m = Clair3_F(add_indel_length=args.add_indel_length, predict=True) + + if not use_gpu: + m.load_weights(args.chkpnt_fn) + output_utilities.gen_output_file() + output_utilities.output_header() + chunk_id = args.chunk_id - 1 if args.chunk_id else None # 1-base to 0-base + chunk_num = args.chunk_num + full_alignment_mode = not args.pileup + + logging.info("Calling variants ...") + variant_call_start_time = time() + + batch_output_method = batch_output + total = 0 + + if args.pileup: + from preprocess.CreateTensorPileupFromCffi import CreateTensorPileup as CT + else: + from preprocess.CreateTensorFullAlignmentFromCffi import CreateTensorFullAlignment as CT + + tensor, all_position, all_alt_info = CT(args) + + def tensor_generator_from(tensor, all_position, all_alt_info): + total_data = len(tensor) + assert total_data == len(all_alt_info) + assert total_data == len(all_position) + batch_size = param.predictBatchSize + total_chunk = total_data // batch_size if total_data % batch_size == 0 else total_data // batch_size + 1 + for chunk_id in range(total_chunk): + chunk_start = chunk_id * batch_size + chunk_end = (chunk_id + 1) * batch_size if chunk_id < total_chunk - 1 else total_data + yield (tensor[chunk_start:chunk_end], all_position[chunk_start:chunk_end], all_alt_info[chunk_start:chunk_end]) + + tensor_generator = tensor_generator_from(tensor, all_position, all_alt_info) + + for (X, position, alt_info_list) in tensor_generator: + total += len(X) + + if use_gpu: + inputs = []; outputs = [] + + inputs.append(tritongrpcclient.InferInput('input_1', X.shape, input_dtype)) + outputs.append(tritongrpcclient.InferRequestedOutput('output_1')) + + inputs[0].set_data_from_numpy(X) + results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + Y = results.as_numpy('output_1') + else: + Y = m.predict_on_batch(X) + + batch_output_method(position, alt_info_list, Y, output_config, output_utilities) + + if chunk_id is not None: + 
logging.info("Total processed positions in {} (chunk {}/{}) : {}".format(args.ctgName, chunk_id+1, chunk_num, total)) + elif full_alignment_mode: + try: + chunk_infos = args.call_fn.split('.')[-2] + c_id, c_num = chunk_infos.split('_') + c_id = int(c_id) + 1 # 0-index to 1-index + logging.info("Total processed positions in {} (chunk {}/{}) : {}".format(args.ctgName, c_id, c_num, total)) + except (IndexError, ValueError): + logging.info("Total processed positions in {} : {}".format(args.ctgName, total)) + else: + logging.info("Total processed positions in {} : {}".format(args.ctgName, total)) + + if full_alignment_mode and total == 0: + logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn))) + + logging.info("Total time elapsed: %.2f s" % (time() - variant_call_start_time)) + + output_utilities.close_opened_files() + # remove the output file if it contains no variant record + if os.path.exists(args.call_fn): + for row in open(args.call_fn, 'r'): + if row[0] != '#': + return + logging.info("[INFO] No vcf output for file {}, remove empty file".format(args.call_fn)) + os.remove(args.call_fn) + + +def main(): + parser = ArgumentParser(description="Call variants using a trained model and tensors of candidate variants") + + parser.add_argument('--platform', type=str, default="ont", + help="Sequencing platform of the input.
Options: 'ont,hifi,ilmn', default: %(default)s") + + parser.add_argument('--tensor_fn', type=str, default="PIPE", + help="Tensor input filename, or stdin if not set") + + parser.add_argument('--chkpnt_fn', type=str, default=None, + help="Input a trained model for variant calling, required") + + parser.add_argument('--call_fn', type=str, default="clair3", + help="VCF output filename, or stdout if not set") + + parser.add_argument('--gvcf', type=str2bool, default=False, + help="Enable GVCF output, default: disabled") + + parser.add_argument('--ref_fn', type=str, default=None, + help="Reference fasta file input, required if --gvcf is enabled") + + parser.add_argument('--ctgName', type=str, default=None, + help="The name of the sequence to be processed") + + parser.add_argument('--ctgStart', type=int, default=None, + help="The 1-based starting position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--ctgEnd', type=int, default=None, + help="The 1-based inclusive ending position of the sequence to be processed, optional, will process the whole --ctgName if not set") + + parser.add_argument('--sampleName', type=str, default="SAMPLE", + help="Define the sample name to be shown in the VCF file, optional") + + parser.add_argument('--qual', type=int, default=2, + help="If set, variants with >=$qual will be marked 'PASS', or 'LowQual' otherwise, optional") + + parser.add_argument('--samtools', type=str, default="samtools", + help="Path to the 'samtools', samtools version >= 1.10 is required, default: %(default)s") + + # options for advanced users + parser.add_argument('--temp_file_dir', type=str, default='./', + help="EXPERIMENTAL: The cache directory for storing temporary non-variant information if --gvcf is enabled, default: %(default)s") + + parser.add_argument('--haploid_precise', action='store_true', + help="EXPERIMENTAL: Enable haploid calling mode. 
Only 1/1 is considered as a variant") + + parser.add_argument('--haploid_sensitive', action='store_true', + help="EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant") + + parser.add_argument('--enable_long_indel', type=str2bool, default=False, + help="EXPERIMENTAL: Enable long Indel variants(>50 bp) calling") + + # options for debug purpose + parser.add_argument('--use_gpu', type=str2bool, default=False, + help="DEBUG: Use GPU for calling. Speed up is mostly insignficiant. Only use this for building your own pipeline") + + parser.add_argument('--predict_fn', type=str, default=None, + help="DEBUG: Output network output probabilities for further analysis") + + parser.add_argument('--input_probabilities', action='store_true', + help="DEBUG: Use network probability outputs as input and generate variants from them") + + parser.add_argument('--output_probabilities', action='store_true', + help="DEBUG: Output the network probabilities of gt21, genotype, indel_length_1 and indel_length_2") + + # options for internal process control + ## In pileup mode or not (full alignment mode), default: False + parser.add_argument('--pileup', action='store_true', + help=SUPPRESS) + + ## Include indel length in training and calling, false for pileup and true for raw alignment + parser.add_argument('--add_indel_length', action='store_true', + help=SUPPRESS) + + ## The number of chucks to be divided into for parallel processing + parser.add_argument('--chunk_num', type=int, default=None, + help=SUPPRESS) + + ## The chuck ID to work on + parser.add_argument('--chunk_id', type=int, default=None, + help=SUPPRESS) + + ## Enable debug mode, default: False, optional + parser.add_argument('--debug', action='store_true', + help=SUPPRESS) + + ## Generating outputs for ensemble model calling + parser.add_argument('--output_for_ensemble', action='store_true', + help=SUPPRESS) + + ## Use bin file from pytables to speed up calling. 
+ parser.add_argument('--is_from_tables', action='store_true', + help=SUPPRESS) + + ## Output reference calls + parser.add_argument('--showRef', type=str2bool, default=True, + help=SUPPRESS) + + # Pileup create tensor options for pileup calling + parser.add_argument('--bam_fn', type=str, default="input.bam", required=True, + help="Sorted BAM file input, required") + + parser.add_argument('--bed_fn', type=str, nargs='?', action="store", default=None, + help="Call variant only in the provided regions. Will take an intersection if --ctgName and/or (--ctgStart, --ctgEnd) are set") + + parser.add_argument('--snp_min_af', type=float, default=0.08, + help="Minimum snp allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--indel_min_af', type=float, default=0.15, + help="Minimum indel allele frequency for a site to be considered as a candidate site, default: %(default)f") + + parser.add_argument('--extend_bed', nargs='?', action="store", type=str, default=None, + help="DEBUG: Extend the regions in the --bed_fn by a few bp for tensor creation, default extend 16bp") + + + parser.add_argument('--vcf_fn', type=str, default=None, + help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file, default: %(default)s") + + parser.add_argument('--minCoverage', type=float, default=2, + help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") + + parser.add_argument('--minMQ', type=int, default=5, + help="EXPERIMENTAL: If set, reads with mapping quality with <$minMQ are filtered, default: %(default)d") + + parser.add_argument('--minBQ', type=int, default=0, + help="EXPERIMENTAL: If set, bases with base quality with <$minBQ are filtered, default: %(default)d") + + parser.add_argument('--max_depth', type=int, default=144, + help="EXPERIMENTAL: Maximum full alignment depth to be processed. 
default: %(default)s") + + parser.add_argument('--fast_mode', type=str2bool, default=False, + help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s") + + parser.add_argument('--call_snp_only', type=str2bool, default=False, + help="EXPERIMENTAL: Call candidates pass snp minimum AF only, ignore Indel candidates") + + # Full-alignment create tensor options for full-alignment calling + parser.add_argument('--phased_vcf_fn', type=str, default=None, + help="Use heterozygous SNP variants in phased vcf file for haplotaging") + + parser.add_argument('--no_phasing_for_fa', type=str2bool, default=False, + help="EXPERIMENTAL: Call variants without whatshap or longphase phasing in full alignment calling") + + ## Provide the regions to be included in full-alignment based calling + parser.add_argument('--full_aln_regions', type=str, default=None, + help=SUPPRESS) + + args = parser.parse_args() + + if len(sys.argv[1:]) == 0: + parser.print_help() + sys.exit(1) + + Run(args) + + +if __name__ == "__main__": + main() \ No newline at end of file From d99c654b61cffc018dd683c40ec96a1d0a6a950b Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:05:26 +0800 Subject: [PATCH 06/43] only store the candidate reference base in c implement for efficiency --- clair3/CallVariants.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clair3/CallVariants.py b/clair3/CallVariants.py index 56e4f1b..47a7042 100644 --- a/clair3/CallVariants.py +++ b/clair3/CallVariants.py @@ -17,6 +17,7 @@ HETERO_SNP_GT21, HETERO_SNP_LABELS, GT21_LABELS, partial_label_from, mix_two_partial_labels ) import clair3.utils as utils +import shared.param_p as param from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning from clair3.task.variant_length import VariantLength @@ -1114,7 
+1115,8 @@ def output_with( chromosome, position, reference_sequence = chr_pos_seq.rstrip().split(':') position = int(position) - tensor_position_center = param.flankingBaseNum + # only store the centered reference base for C implementation for efficiency + tensor_position_center = param.flankingBaseNum if len(reference_sequence) > 1 else 0 information_string = "P" if output_config.pileup else 'F' if type(alt_info) == np.memmap: From 603c94b411f7339f72f58220a9ad95b4dbeaabe9 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:06:47 +0800 Subject: [PATCH 07/43] add three new submodules --- clair3.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clair3.py b/clair3.py index b8a2a4c..8b46e9e 100644 --- a/clair3.py +++ b/clair3.py @@ -10,6 +10,7 @@ "CallVarBam", "CallVariants", "Train", + "CallVariantsFromCffi" ] data_preprocess_folder = [ @@ -27,7 +28,9 @@ 'UnifyRepresentation', 'CheckEnvs', 'SortVcf', - 'SelectQual' + 'SelectQual', + "CreateTensorPileupFromCffi", + "CreateTensorFullAlignmentFromCffi", ] post_process_scripts_folder = [ From 9fb0ddf27883c6200365f39c8ebea21babc3ad0b Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:07:40 +0800 Subject: [PATCH 08/43] add longphase, use_gpu and enable_c_impl options --- run_clair3.sh | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) mode change 100755 => 100644 run_clair3.sh diff --git a/run_clair3.sh b/run_clair3.sh old mode 100755 new mode 100644 index 3acad08..b00fb47 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -51,6 +51,10 @@ print_help_messages() echo $' --no_phasing_for_fa EXPERIMENTAL: Call variants without whatshap phasing in full alignment calling, default: disable.' echo $' --call_snp_only EXPERIMENTAL: Call candidates pass SNP minimum AF only, ignore Indel candidates, default: disable.' echo $' --enable_long_indel EXPERIMENTAL: Call long Indel variants(>50 bp), default: disable.'
+ echo $' --use_gpu Use GPU for calling, default: disable.' + echo $' --longphase_for_phasing Use longphase for phasing, default: disable.' + echo $' --longphase Path of longphase, longphase >= 1.0 is required.' + echo $' --enable_c_impl Use C implement with cffi for pileup and full-alignment create tensor, default: disable.' echo $'' } @@ -66,9 +70,9 @@ NC="\\033[0m" ARGS=`getopt -o b:f:t:m:p:o:hv \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ -bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,\ +bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\ snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\ -remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,help,version -n 'run_clair3.sh' -- "$@"` +remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,enable_c_impl,help,version -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi eval set -- "${ARGS}" @@ -83,6 +87,7 @@ PYPY="pypy3" PYTHON='python3' PARALLEL='parallel' WHATSHAP='whatshap' +LONGPHASE='longphase' CHUNK_NUM=0 CHUNK_SIZE=5000000 QUAL=2 @@ -93,8 +98,8 @@ GVCF=False PILEUP_ONLY=False FAST_MODE=False SHOW_REF=False -SNP_AF="0" -INDEL_AF="0" +SNP_AF="0.08" +INDEL_AF="0.15" HAP_PRE=False HAP_SEN=False SNP_ONLY=False @@ -103,6 +108,9 @@ NO_PHASING=False RM_TMP_DIR=False ENABLE_PHASING=False ENABLE_LONG_INDEL=False +USE_GPU=False +USE_LONGPHASE=False +ENABLE_C_IMPL=False PILEUP_PREFIX="pileup" FA_PREFIX="full_alignment" @@ -126,6 +134,7 @@ while true; do --pypy ) PYPY="$2"; shift 2 ;; --parallel ) PARALLEL="$2"; shift 2 ;; --whatshap ) WHATSHAP="$2"; shift 2 ;; + --longphase ) LONGPHASE="$2"; shift 2 ;; --var_pct_full ) PRO="$2"; shift 2 ;; --ref_pct_full ) REF_PRO="$2"; shift 2 ;; --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;; @@ -145,6 +154,9 @@ while true; do --remove_intermediate_dir ) RM_TMP_DIR=True; shift 1 ;; --enable_phasing ) ENABLE_PHASING=True; shift 1 ;; --enable_long_indel ) ENABLE_LONG_INDEL=True; shift 1 ;; + --use_gpu ) USE_GPU=True; shift 1 ;; + --longphase_for_phasing ) USE_LONGPHASE=True; shift 1 ;; + --enable_c_impl ) ENABLE_C_IMPL=True; shift 1 ;; -- ) shift; break; ;; -h|--help ) print_help_messages; exit 0 ;; @@ -195,7 +207,7 @@ if [ "${PLATFORM}" != "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.3; fi # show default high quality hete variant proportion for whatshap phasing, 0.8 for ont guppy5 and 0.7 for others if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi -BASE_MODEL=$(basename ${MODEL_PATH})C +BASE_MODEL=$(basename ${MODEL_PATH}) if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi # remove the last '/' character in directory input @@ -220,6 +232,7 @@ echo "[INFO] PYTHON PATH: ${PYTHON}" echo "[INFO] PYPY PATH: ${PYPY}" echo "[INFO] PARALLEL PATH: ${PARALLEL}" echo
"[INFO] WHATSHAP PATH: ${WHATSHAP}" +echo "[INFO] LONGPHASE PATH: ${LONGPHASE}" echo "[INFO] CHUNK SIZE: ${CHUNK_SIZE}" if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi echo "[INFO] FULL ALIGN PROPORTION: ${PRO}" @@ -239,6 +252,9 @@ echo "[INFO] ENABLE NO PHASING FOR FULL ALIGNMENT: ${NO_PHASING}" echo "[INFO] ENABLE REMOVING INTERMEDIATE FILES: ${RM_TMP_DIR}" echo "[INFO] ENABLE PHASING VCF OUTPUT: ${ENABLE_PHASING}" echo "[INFO] ENABLE LONG INDEL CALLING: ${ENABLE_LONG_INDEL}" +echo "[INFO] ENABLE GPU CALLING: ${USE_GPU}" +echo "[INFO] ENABLE LONGPHASE_FOR_PHASING: ${USE_LONGPHASE}" +echo "[INFO] ENABLE C_IMPLEMENT: ${ENABLE_C_IMPL}" echo $'' # file check @@ -290,9 +306,11 @@ if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' inste if [ ! -f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi if [ ! -f ${MODEL_PATH}/${FA_PREFIX}.index ]; then echo -e "${ERROR} No full-alignment model found in provided model path and model prefix ${MODEL_PATH}/${FA_PREFIX} ${NC}"; exit 1; fi +CLAIR3_SCRIPT="clair3.sh" +if [ "${ENABLE_C_IMPL}" == True ] && [ !
${PLATFORM} = "ilmn" ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi set -x -${SCRIPT_PATH}/scripts/clair3.sh \ +${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \ --bam_fn ${BAM_FILE_PATH} \ --ref_fn ${REFERENCE_FILE_PATH} \ --threads ${THREADS} \ @@ -329,7 +347,10 @@ ${SCRIPT_PATH}/scripts/clair3.sh \ --fa_model_prefix=${FA_PREFIX} \ --remove_intermediate_dir=${RM_TMP_DIR} \ --enable_phasing=${ENABLE_PHASING} \ - --enable_long_indel=${ENABLE_LONG_INDEL} + --enable_long_indel=${ENABLE_LONG_INDEL} \ + --use_gpu=${USE_GPU} \ + --longphase_for_phasing=${USE_LONGPHASE} \ + --longphase=${LONGPHASE} )) |& tee ${OUTPUT_FOLDER}/run_clair3.log \ No newline at end of file From 367d873981e5b58f379270489dec8048452af309 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:09:03 +0800 Subject: [PATCH 09/43] clair3 c implement script, directly use CallVariantsFromCffi submodule for pileup and full-alignment calling --- scripts/clair3_c_impl.sh | 334 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 scripts/clair3_c_impl.sh diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh new file mode 100644 index 0000000..0f0798f --- /dev/null +++ b/scripts/clair3_c_impl.sh @@ -0,0 +1,334 @@ +#!/bin/bash +SCRIPT_NAME=$(basename "$0") +Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]" +# INFO: whole calling workflow of clair3 + +set -e +ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ +-l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ +bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ +snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ 
+no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"` + +if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi +eval set -- "${ARGS}" + +while true; do + case "$1" in + -b|--bam_fn ) BAM_FILE_PATH="$2"; shift 2 ;; + -f|--ref_fn ) REFERENCE_FILE_PATH="$2"; shift 2 ;; + -t|--threads ) THREADS="$2"; shift 2 ;; + -m|--model_path ) MODEL_PATH="$2"; shift 2 ;; + -p|--platform ) PLATFORM="$2"; shift 2 ;; + -o|--output ) OUTPUT_FOLDER="$2"; shift 2 ;; + --bed_fn ) BED_FILE_PATH="$2"; shift 2 ;; + --vcf_fn ) VCF_FILE_PATH="$2"; shift 2 ;; + --ctg_name ) CONTIGS="$2"; shift 2 ;; + --sample_name ) SAMPLE="$2"; shift 2 ;; + --chunk_num ) CHUNK_NUM="$2"; shift 2 ;; + --chunk_size ) CHUNK_SIZE="$2"; shift 2 ;; + --qual ) QUAL="$2"; shift 2 ;; + --samtools ) SAMTOOLS="$2"; shift 2 ;; + --python ) PYTHON="$2"; shift 2 ;; + --pypy ) PYPY="$2"; shift 2 ;; + --parallel ) PARALLEL="$2"; shift 2 ;; + --whatshap ) WHATSHAP="$2"; shift 2 ;; + --longphase ) LONGPHASE="$2"; shift 2 ;; + --var_pct_full ) PRO="$2"; shift 2 ;; + --ref_pct_full ) REF_PRO="$2"; shift 2 ;; + --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;; + --pileup_only ) PILEUP_ONLY="$2"; shift 2 ;; + --fast_mode ) FAST_MODE="$2"; shift 2 ;; + --call_snp_only ) SNP_ONLY="$2"; shift 2 ;; + --print_ref_calls ) SHOW_REF="$2"; shift 2 ;; + --gvcf ) GVCF="$2"; shift 2 ;; + --snp_min_af ) SNP_AF="$2"; shift 2 ;; + --indel_min_af ) INDEL_AF="$2"; shift 2 ;; + --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; + --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; + --haploid_precise ) HAP_PRE="$2"; shift 2 ;; + --haploid_sensitive ) HAP_SEN="$2"; shift 2 ;; + --include_all_ctgs ) INCLUDE_ALL_CTGS="$2"; shift 2 ;; + --no_phasing_for_fa ) NO_PHASING="$2"; shift 2 ;; + --remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;; + --enable_phasing ) ENABLE_PHASING="$2"; 
shift 2 ;; + --enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;; + --use_gpu ) USE_GPU="$2"; shift 2 ;; + --longphase_for_phasing ) USE_LONGPHASE="$2"; shift 2 ;; + + -- ) shift; break; ;; + -h|--help ) print_help_messages; break ;; + * ) print_help_messages; exit 0 ;; + esac +done + + +SHELL_FOLDER=$(cd "$(dirname "$0")";pwd) +CLAIR3="${SHELL_FOLDER}/../clair3.py" + +if [ ${BED_FILE_PATH} = "EMPTY" ] ; then BED_FILE_PATH= ; fi +RETRIES=4 + +PILEUP_CHECKPOINT_PATH="${MODEL_PATH}/${PILEUP_PREFIX}" +FULL_ALIGNMENT_CHECKPOINT_PATH="${MODEL_PATH}/${FA_PREFIX}" +LOG_PATH="${OUTPUT_FOLDER}/log" +TMP_FILE_PATH="${OUTPUT_FOLDER}/tmp" +SPLIT_BED_PATH="${TMP_FILE_PATH}/split_beds" +PILEUP_VCF_PATH="${TMP_FILE_PATH}/pileup_output" +GVCF_TMP_PATH="${TMP_FILE_PATH}/gvcf_tmp_output" +PHASE_OUTPUT_PATH="${TMP_FILE_PATH}/phase_output" +FULL_ALIGNMENT_OUTPUT_PATH="${TMP_FILE_PATH}/full_alignment_output" +PHASE_VCF_PATH="${PHASE_OUTPUT_PATH}/phase_vcf" +PHASE_BAM_PATH="${PHASE_OUTPUT_PATH}/phase_bam" +CANDIDATE_BED_PATH="${FULL_ALIGNMENT_OUTPUT_PATH}/candidate_bed" +export OPENBLAS_NUM_THREADS=1 +export GOTO_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +echo "[INFO] Check environment variables" +${PYTHON} ${CLAIR3} CheckEnvs \ + --bam_fn ${BAM_FILE_PATH} \ + --bed_fn ${BED_FILE_PATH} \ + --output_fn_prefix ${OUTPUT_FOLDER} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --vcf_fn ${VCF_FILE_PATH} \ + --ctg_name ${CONTIGS} \ + --chunk_num ${CHUNK_NUM} \ + --chunk_size ${CHUNK_SIZE} \ + --include_all_ctgs ${INCLUDE_ALL_CTGS} \ + --threads ${THREADS} \ + --python ${PYTHON} \ + --pypy ${PYPY} \ + --samtools ${SAMTOOLS} \ + --whatshap ${WHATSHAP} \ + --parallel ${PARALLEL} \ + --qual ${QUAL} \ + --sampleName ${SAMPLE} \ + --var_pct_full ${PRO} \ + --ref_pct_full ${REF_PRO} \ + --snp_min_af ${SNP_AF} \ + --indel_min_af ${INDEL_AF} +readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" +if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi 
+THREADS_LOW=$((${THREADS}*3/4)) +LONGPHASE_THREADS=$((${THREADS}*1/2)) +if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi +if [[ ${LONGPHASE_THREADS} < 1 ]]; then LONGPHASE_THREADS=1; fi + +cd ${OUTPUT_FOLDER} +# Pileup calling +#----------------------------------------------------------------------------------------------------------------------- +echo "[INFO] 1/7 Call variants using pileup model" +time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_call_var_bam_pileup.log -j ${THREADS_LOW} \ +"${PYTHON} ${CLAIR3} CallVariantsFromCffi \ + --chkpnt_fn ${PILEUP_CHECKPOINT_PATH} \ + --bam_fn ${BAM_FILE_PATH} \ + --call_fn ${PILEUP_VCF_PATH}/pileup_{1}_{2}.vcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --extend_bed ${SPLIT_BED_PATH}/{1} \ + --bed_fn ${BED_FILE_PATH} \ + --vcf_fn ${VCF_FILE_PATH} \ + --ctgName {1} \ + --chunk_id {2} \ + --chunk_num {3} \ + --platform ${PLATFORM} \ + --fast_mode ${FAST_MODE} \ + --snp_min_af ${SNP_AF} \ + --indel_min_af ${INDEL_AF} \ + --call_snp_only ${SNP_ONLY} \ + --gvcf ${GVCF} \ + --enable_long_indel ${ENABLE_LONG_INDEL} \ + --samtools ${SAMTOOLS} \ + --temp_file_dir ${GVCF_TMP_PATH} \ + --pileup \ + --use_gpu ${USE_GPU}" :::: ${OUTPUT_FOLDER}/tmp/CHUNK_LIST |& tee ${LOG_PATH}/1_call_var_bam_pileup.log + +${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${PILEUP_VCF_PATH} \ + --vcf_fn_prefix "pileup" \ + --output_fn ${OUTPUT_FOLDER}/pileup.vcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS + +if [ "$( gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in pileup variant calling"; exit 0; fi +if [ ${PILEUP_ONLY} == True ]; then + if [ ${RM_TMP_DIR} == True ]; then echo "[INFO] Removing intermediate files in ${OUTPUT_FOLDER}/tmp"; rm -rf ${OUTPUT_FOLDER}/tmp; fi + echo "[INFO] Only call pileup output with --pileup_only, output file: ${OUTPUT_FOLDER}/pileup.vcf.gz" + echo "[INFO] Finish 
calling!" + exit 0; +fi + +# Whatshap phasing and haplotaging +#----------------------------------------------------------------------------------------------------------------------- +if [ ${NO_PHASING} == True ] +then + echo "[INFO] 2/7 No phasing for full alignment calling" + ${PARALLEL} -j${THREADS} ln -sf ${BAM_FILE_PATH} ${PHASE_BAM_PATH}/{1}.bam ::: ${CHR[@]} + if [ -f ${BAM_FILE_PATH}.bai ]; then ${PARALLEL} --retries ${RETRIES} -j${THREADS} ln -sf ${BAM_FILE_PATH}.bai ${PHASE_BAM_PATH}/{1}.bam.bai ::: ${CHR[@]}; fi + if [ -f ${BAM_FILE_PATH%.*}.bai ]; then ${PARALLEL} --retries ${RETRIES} -j${THREADS} ln -sf ${BAM_FILE_PATH%.*}.bai ${PHASE_BAM_PATH}/{1}.bam.bai ::: ${CHR[@]}; fi +else + echo $'' + echo "[INFO] 2/7 Select heterozygous SNP variants for Whatshap phasing and haplotagging" + gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | ${PYPY} ${CLAIR3} SelectQual --phase --output_fn ${PHASE_VCF_PATH} --var_pct_phasing ${PHASING_PCT} + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_2_select_hetero_snp.log -j${THREADS} \ + "${PYPY} ${CLAIR3} SelectHetSnp \ + --vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \ + --split_folder ${PHASE_VCF_PATH} \ + --ctgName {1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} |& tee ${LOG_PATH}/2_select_hetero_snp.log + + echo $'' + if [ ${USE_LONGPHASE} == True ] + then + echo "[INFO] 3/7 Phase VCF file using LongPhase" + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \ + "${LONGPHASE} phase\ + -s ${PHASE_VCF_PATH}/{1}.vcf \ + -b ${BAM_FILE_PATH} \ + -r ${REFERENCE_FILE_PATH} \ + -t ${LONGPHASE_THREADS} \ + -o ${PHASE_VCF_PATH}/phased_{1} \ + --ont" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]} + else + echo "[INFO] 3/7 Phase VCF file using Whatshap" + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \ + "${WHATSHAP} phase \ + --output 
${PHASE_VCF_PATH}/phased_{1}.vcf.gz \ + --reference ${REFERENCE_FILE_PATH} \ + --chromosome {1} \ + --distrust-genotypes \ + --ignore-read-groups \ + ${PHASE_VCF_PATH}/{1}.vcf \ + ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + fi + ${PARALLEL} -j${THREADS} tabix -f -p vcf ${PHASE_VCF_PATH}/phased_{}.vcf.gz ::: ${CHR[@]} + +fi + +# Full alignment calling +#----------------------------------------------------------------------------------------------------------------------- +echo $'' +echo "[INFO] 5/7 Select candidates for full-alignment calling" +gzip -fdc ${OUTPUT_FOLDER}/pileup.vcf.gz | ${PYPY} ${CLAIR3} SelectQual --output_fn ${CANDIDATE_BED_PATH} \ +--var_pct_full ${PRO} --ref_pct_full ${REF_PRO} --platform ${PLATFORM} --vcf_fn ${VCF_FILE_PATH} +time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_5_select_candidate.log -j${THREADS} \ +"${PYPY} ${CLAIR3} SelectCandidates \ + --pileup_vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \ + --split_folder ${CANDIDATE_BED_PATH} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --var_pct_full ${PRO} \ + --ref_pct_full ${REF_PRO} \ + --platform ${PLATFORM} \ + --ctgName {1}" ::: ${CHR[@]} |& tee ${LOG_PATH}/5_select_candidate.log + +echo $'' +echo "[INFO] 6/7 Call low-quality variants using full-alignment model" +cat ${CANDIDATE_BED_PATH}/FULL_ALN_FILE_* > ${CANDIDATE_BED_PATH}/FULL_ALN_FILES +time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_bam_full_alignment.log -j ${THREADS_LOW} \ +"${PYTHON} ${CLAIR3} CallVariantsFromCffi \ + --chkpnt_fn ${FULL_ALIGNMENT_CHECKPOINT_PATH} \ + --bam_fn ${BAM_FILE_PATH} \ + --call_fn ${FULL_ALIGNMENT_OUTPUT_PATH}/full_alignment_{1/}.vcf \ + --sampleName ${SAMPLE} \ + --vcf_fn ${VCF_FILE_PATH} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --full_aln_regions {1} \ + --ctgName {1/.} \ + --add_indel_length \ + --no_phasing_for_fa ${NO_PHASING} \ + --phased_vcf_fn ${PHASE_VCF_PATH}/phased_{/.}.vcf.gz \ + --gvcf ${GVCF} \ + --enable_long_indel 
${ENABLE_LONG_INDEL} \ + --samtools ${SAMTOOLS} \ + --use_gpu ${USE_GPU} \ + --platform ${PLATFORM}" :::: ${CANDIDATE_BED_PATH}/FULL_ALN_FILES |& tee ${LOG_PATH}/6_call_var_bam_full_alignment.log + +${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${FULL_ALIGNMENT_OUTPUT_PATH} \ + --vcf_fn_prefix "full_alignment" \ + --output_fn ${OUTPUT_FOLDER}/full_alignment.vcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS + +if [ "$( gzip -fdc ${OUTPUT_FOLDER}/full_alignment.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in full-alignment variant calling"; exit 0; fi +# Compress GVCF output using lz4 +if [ ${GVCF} == True ] +then + ${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${GVCF_TMP_PATH} \ + --vcf_fn_suffix ".tmp.gvcf" \ + --output_fn ${GVCF_TMP_PATH}/non_var.gvcf \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS +fi + +##Merge pileup and full alignment vcf +##----------------------------------------------------------------------------------------------------------------------- +echo $'' +echo "[INFO] 7/7 Merge pileup VCF and full-alignment VCF" +time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_7_merge_vcf.log -j${THREADS} \ +"${PYPY} ${CLAIR3} MergeVcf \ + --pileup_vcf_fn ${OUTPUT_FOLDER}/pileup.vcf.gz \ + --bed_fn_prefix ${CANDIDATE_BED_PATH} \ + --full_alignment_vcf_fn ${OUTPUT_FOLDER}/full_alignment.vcf.gz \ + --output_fn ${TMP_FILE_PATH}/merge_output/merge_{1}.vcf \ + --platform ${PLATFORM} \ + --print_ref_calls ${SHOW_REF} \ + --gvcf ${GVCF} \ + --haploid_precise ${HAP_PRE} \ + --haploid_sensitive ${HAP_SEN} \ + --gvcf_fn ${TMP_FILE_PATH}/merge_output/merge_{1}.gvcf \ + --non_var_gvcf_fn ${GVCF_TMP_PATH}/non_var.gvcf \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --ctgName {1}" ::: ${CHR[@]} |& tee ${LOG_PATH}/7_merge_vcf.log + +${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${TMP_FILE_PATH}/merge_output \ + --vcf_fn_prefix "merge" \ + --output_fn 
${OUTPUT_FOLDER}/merge_output.vcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS + +if [ "$( gzip -fdc ${OUTPUT_FOLDER}/merge_output.vcf.gz | grep -v '#' | wc -l )" -eq 0 ]; then echo "[INFO] Exit in variant merging"; exit 0; fi +if [ ${GVCF} == True ] +then + ${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${TMP_FILE_PATH}/merge_output \ + --vcf_fn_prefix "merge" \ + --vcf_fn_suffix ".gvcf" \ + --output_fn ${OUTPUT_FOLDER}/merge_output.gvcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS +fi + +if [ ${ENABLE_PHASING} == True ] +then + echo "[INFO] 7/7 Phasing VCF output in parallel using WhatsHap" + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_8_phase_vcf_output.log -j${THREADS} \ + "${WHATSHAP} phase \ + --output ${TMP_FILE_PATH}/merge_output/phased_merge_{1}.vcf \ + --reference ${REFERENCE_FILE_PATH} \ + --ignore-read-groups \ + ${TMP_FILE_PATH}/merge_output/merge_{1}.vcf \ + ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/8_phase_vcf_output.log + + ${PYPY} ${CLAIR3} SortVcf \ + --input_dir ${TMP_FILE_PATH}/merge_output \ + --vcf_fn_prefix "phased_merge" \ + --output_fn ${OUTPUT_FOLDER}/phased_merge_output.vcf \ + --sampleName ${SAMPLE} \ + --ref_fn ${REFERENCE_FILE_PATH} \ + --contigs_fn ${TMP_FILE_PATH}/CONTIGS +fi + +if [ ${RM_TMP_DIR} == True ]; then echo "[INFO] Removing intermediate files in ${OUTPUT_FOLDER}/tmp"; rm -rf ${OUTPUT_FOLDER}/tmp; fi + +echo $'' +echo "[INFO] Finish calling, output file: ${OUTPUT_FOLDER}/merge_output.vcf.gz" + +if [ ${ENABLE_PHASING} == True ]; then echo "[INFO] Finish calling, phased output file: ${OUTPUT_FOLDER}/phased_merge_output.vcf.gz"; fi \ No newline at end of file From 6872b0abba579e0f7ddd21a774354e8c061f4c6a Mon Sep 17 00:00:00 2001 From: zxzheng Date: Tue, 15 Mar 2022 13:09:43 +0800 Subject: [PATCH 10/43] update clair3 options to be consistent with main entry --- scripts/clair3.sh | 5 
++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 35dc2a1..57ac44f 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -8,7 +8,7 @@ ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ -no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel:: -n 'run_clair3.sh' -- "$@"` +no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi eval set -- "${ARGS}" @@ -33,6 +33,7 @@ while true; do --pypy ) PYPY="$2"; shift 2 ;; --parallel ) PARALLEL="$2"; shift 2 ;; --whatshap ) WHATSHAP="$2"; shift 2 ;; + --longphase ) LONGPHASE="$2"; shift 2 ;; --var_pct_full ) PRO="$2"; shift 2 ;; --ref_pct_full ) REF_PRO="$2"; shift 2 ;; --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;; @@ -52,6 +53,8 @@ while true; do --remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;; --enable_phasing ) ENABLE_PHASING="$2"; shift 2 ;; --enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;; + --use_gpu ) USE_GPU="$2"; shift 2 ;; + --longphase_for_phasing ) USE_LONGPHASE="$2"; shift 2 ;; -- ) shift; break; ;; -h|--help ) print_help_messages; break ;; From 8f5b8e19b09d09fc76857984dd2df912667c5f25 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 11:10:29 +0800 Subject: [PATCH 11/43] fix the vcf_fn in full-alignment calling --- preprocess/CreateTensorFullAlignmentFromCffi.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py index 3a1ba8c..0643bc1 100644 --- a/preprocess/CreateTensorFullAlignmentFromCffi.py +++ b/preprocess/CreateTensorFullAlignmentFromCffi.py @@ -28,10 +28,6 @@ def CreateTensorFullAlignment(args): platform = args.platform phased_vcf_fn = args.phased_vcf_fn - vcf_fn = file_path_from(args.vcf_fn) - is_known_vcf_file_provided = vcf_fn is not None - chunk_id = args.chunk_id - 1 if args.chunk_id else None # 1-base to 0-base - chunk_num = args.chunk_num extend_bed = file_path_from(args.extend_bed) is_extend_bed_file_given = extend_bed is not None confident_bed_fn = file_path_from(args.bed_fn) @@ -74,10 +70,6 @@ def CreateTensorFullAlignment(args): candidate_file_path_output.close() candidate_file_path_process.wait() - if is_known_vcf_file_provided: - known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name) - candidates_set = set(known_variants_list) - 
variant_list = [] if need_haplotagging and phased_vcf_fn and os.path.exists(phased_vcf_fn): # if need_haplotagging option enables, scan the phased vcf file and store the heterozygous SNP candidates from each phase set From ec4e98a6091995255883e1e1f434a3908c917809 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 11:11:27 +0800 Subject: [PATCH 12/43] integrate need_haplotagging option to c implement --- preprocess/CreateTensorFullAlignmentFromCffi.py | 6 ++++-- src/clair3_full_alignment.c | 3 +-- src/clair3_full_alignment.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py index 0643bc1..043afcf 100644 --- a/preprocess/CreateTensorFullAlignmentFromCffi.py +++ b/preprocess/CreateTensorFullAlignmentFromCffi.py @@ -96,7 +96,9 @@ def CreateTensorFullAlignment(args): variant_num = len(variant_list) Variants = libclair3.ffi.new("struct Variant *[]", variant_list) - + else: + Variants = libclair3.ffi.new("struct Variant *[]", 1) + variant_num = 0 # 1-index to 0-index candidates_list = sorted(list(set([item-1 for item in candidates_set if item >= ctg_start and item <= ctg_end]))) @@ -107,7 +109,7 @@ def CreateTensorFullAlignment(args): candidates = libclair3.ffi.new("size_t [{}]".format(candidate_num), candidates_list) fa_data = libclair3.lib.calculate_clair3_full_alignment(region_str, bam_file_path.encode(), fasta_file_path.encode(), - Variants, variant_num, candidates, candidate_num) + Variants, variant_num, candidates, candidate_num, need_haplotagging) # use np buffer to get the matrix matrix_depth = param.matrix_depth_dict[platform] diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c index 06f6d07..eafae03 100644 --- a/src/clair3_full_alignment.c +++ b/src/clair3_full_alignment.c @@ -375,10 +375,9 @@ size_t get_overlap_candidate_num(size_t read_start, size_t read_end, size_t cand return overlap_num; } -fa_data 
calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num) +fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num, bool need_haplotagging) { - bool need_haplotagging = true; int start, end; char *chr = xalloc(strlen(region) + 1, sizeof(char), "chr"); strcpy(chr, region); diff --git a/src/clair3_full_alignment.h b/src/clair3_full_alignment.h index e7485fb..16bec37 100644 --- a/src/clair3_full_alignment.h +++ b/src/clair3_full_alignment.h @@ -252,6 +252,6 @@ int haplotag_read(Variants_info *variants_info, Read *read, char *ref_seq, size_ * The return value can be freed with destroy_fa_data * */ -fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num); +fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path, const char *fasta_path, Variant **variants, size_t variant_num, size_t *candidates, size_t candidate_num, bool need_haplotagging); #endif From 8c6c16f08d1b9b878ddfd0f4d25e4cf9d7017c6e Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 11:12:30 +0800 Subject: [PATCH 13/43] add samtools and longphase compile step and cffi builder in installation --- build.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 build.py diff --git a/build.py b/build.py new file mode 100644 index 0000000..022b1ae --- /dev/null +++ b/build.py @@ -0,0 +1,97 @@ +import itertools +import os +import platform +from subprocess import run +from cffi import FFI + + +samver = "1.10" +longphase_version = "1.0" +file_directory = os.path.dirname(os.path.realpath(__file__)) +def compile_samtools_package(): + # just a simple 
way to compile samtools htslib + if not os.path.exists(os.path.join(file_directory, 'libhts.a')): + samtools_source = "samtools-{}.tar.bz2 https://github.com/samtools/samtools/releases/download/{}/samtools-{}.tar.bz2".format(samver, samver, samver) + run("curl -L -o {}".format(samtools_source), shell=True) + run("tar -xjf samtools-{}.tar.bz2".format(samver), shell=True) + run("rm samtools-{}.tar.bz2".format(samver), shell=True) + run("cd samtools-{} && autoheader && autoconf -Wno-syntax && CFLAGS='-fpic -O3' ./configure && make".format(samver), shell=True) + run("cp samtools-{}/htslib-{}/libhts.a {}".format(samver, samver, file_directory), shell=True) + + +def compile_longphase_package(): + if not os.path.exists(os.path.join(file_directory, 'longphase')): + longphase_source = "https://github.com/twolinin/longphase/archive/refs/tags/v{}.tar.gz".format(longphase_version) + run("wget {}".format(longphase_source), shell=True) + run("tar -zxvf v{}.tar.gz".format(longphase_version), shell=True) + run("rm v{}.tar.gz".format(longphase_version), shell=True) + run("cd longphase-{} && autoreconf -i && ./configure && make -j4".format(longphase_version), shell=True) + run("mv longphase-{}/longphase {}".format(longphase_version, file_directory), shell=True) + run("rm -r longphase-{}".format(longphase_version), shell=True) + +def clean_samtools_package(): + # after ffi building, clean the samtools htslib source + if os.path.exists(os.path.join(file_directory, 'libhts.a')): + run("rm -r samtools-{}".format(samver), shell=True) + +htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver)) + +libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] +library_dirs=[htslib_dir] +src_dir=os.path.join(file_directory, 'src') + +extra_compile_args = ['-std=c99', '-O3'] +if platform.machine() in {"aarch64", "arm64"}: + if platform.system() == "Darwin": + pass + else: + extra_compile_args.append("-march=armv8-a+simd") +else: + 
extra_compile_args.append("-mtune=haswell") + +ffibuilder = FFI() +ffibuilder.set_source("libclair3", + r""" + #include "kvec.h" + #include "khash.h" + #include "levenshtein.h" + #include "medaka_bamiter.h" + #include "medaka_common.h" + #include "medaka_khcounter.h" + #include "clair3_pileup.h" + #include "clair3_full_alignment.h" + """, + libraries=libraries, + library_dirs=library_dirs, + include_dirs=[src_dir, htslib_dir], + sources=[ + os.path.join(src_dir, x) for x in ( + 'levenshtein.c', + 'medaka_bamiter.c', + 'medaka_common.c', + 'medaka_khcounter.c', + 'clair3_pileup.c', + 'clair3_full_alignment.c')], + extra_compile_args=extra_compile_args, + extra_objects=['libhts.a'] +) + +cdef = [ + "typedef struct { ...; } bam_fset;" + "bam_fset* create_bam_fset(char* fname);" + "void destroy_bam_fset(bam_fset* fset);" +] +for header in ('clair3_pileup.h', 'clair3_full_alignment.h'): + with open(os.path.join(src_dir, header), 'r') as fh: + # remove directives + lines = ''.join(x for x in fh.readlines() if not x.startswith('#')) + cdef.append(lines) + +ffibuilder.cdef('\n\n'.join(cdef)) + + +if __name__ == "__main__": + compile_samtools_package() + compile_longphase_package() + ffibuilder.compile(verbose=True) + clean_samtools_package() From eeb50676736407a87f6784a8aa8f6f7d49153034 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 13:07:16 +0800 Subject: [PATCH 14/43] add gvcf in c implement --- clair3/CallVariantsFromCffi.py | 9 +++ preprocess/CreateTensorPileupFromCffi.py | 86 ++++++++++++++++++++---- src/clair3_pileup.c | 26 ++++++- src/clair3_pileup.h | 6 +- 4 files changed, 109 insertions(+), 18 deletions(-) diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py index 8df9d5b..1f621a0 100644 --- a/clair3/CallVariantsFromCffi.py +++ b/clair3/CallVariantsFromCffi.py @@ -311,6 +311,15 @@ def main(): parser.add_argument('--call_snp_only', type=str2bool, default=False, help="EXPERIMENTAL: Call candidates pass snp minimum AF only, 
ignore Indel candidates") + parser.add_argument('--base_err', default=0.001, type=float, + help='DEBUG: Estimated base error rate in gvcf option, default: %(default)f') + + parser.add_argument('--gq_bin_size', default=5, type=int, + help='DEBUG: Default gq bin size for merge non-variant block in gvcf option, default: %(default)d') + + parser.add_argument('--bp_resolution', action='store_true', + help="DEBUG: Enable bp resolution for GVCF, default: disabled") + # Full-alignment create tensor options for full-alignment calling parser.add_argument('--phased_vcf_fn', type=str, default=None, help="Use heterozygous SNP variants in phased vcf file for haplotaging") diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py index 512f1cb..5ea4f07 100644 --- a/preprocess/CreateTensorPileupFromCffi.py +++ b/preprocess/CreateTensorPileupFromCffi.py @@ -22,8 +22,8 @@ def pileup_counts_clair3( - region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, gvcf, \ - max_depth, region_split=100000, workers=1): + region, bam, fasta, min_depth, min_snp_af, min_indel_af, min_mq, call_snp_only, max_indel_length, \ + max_depth, gvcf=False, region_split=100000, workers=1): """Create pileup counts feature array for region. 
:param region: `medaka.common.Region` object @@ -58,9 +58,10 @@ def _process_region(reg): bam_handle = BAMHandler(bam) with bam_handle.borrow() as fh: counts = lib.calculate_clair3_pileup( - region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af, min_mq, max_indel_length, call_snp_only, max_depth) - np_counts, positions, alt_info_string_list = _plp_data_to_numpy( - counts, featlenclair3) + region_str.encode(), fh, fasta.encode(), min_depth, min_snp_af, min_indel_af, min_mq, max_indel_length, call_snp_only, max_depth, gvcf) + + np_counts, positions, alt_info_string_list, gvcf_output = _plp_data_to_numpy( + counts, featlenclair3, gvcf=gvcf) alt_info_list = [] for alt_info in alt_info_string_list: @@ -71,8 +72,8 @@ def _process_region(reg): pos, depth, center_ref_base, alt = alt_info[:4] alt_info_list.append((int(pos), reg.ref_name + ':' + pos + ':' + center_ref_base, depth + '-' + alt)) - lib.destroy_plp_data(counts) - return np_counts, positions, alt_info_list + lib.destroy_plp_data(counts, gvcf) + return np_counts, positions, alt_info_list, gvcf_output # we found that split into small chunk would lead to some missing truths, # the candidates cross two negbouring small chunks @@ -80,8 +81,8 @@ def _process_region(reg): regions = region.split(region_split, fixed_size=False) with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: results = executor.map(_process_region, regions) - chunk_results, all_alt_info_list = __enforce_pileup_chunk_contiguity(results) - return chunk_results, all_alt_info_list + chunk_results, all_alt_info_list, gvcf_output = __enforce_pileup_chunk_contiguity(results) + return chunk_results, all_alt_info_list, gvcf_output class BAMHandler(object): @@ -123,7 +124,7 @@ def _destroy_fset(self, fset): libclair3.lib.destroy_bam_fset(fset) -def _plp_data_to_numpy(plp_data, n_rows): +def _plp_data_to_numpy(plp_data, n_rows, gvcf=False): """Create numpy representation of feature data. 
Copy the feature matrix and alignment column names from a @@ -145,12 +146,24 @@ def _plp_data_to_numpy(plp_data, n_rows): ).reshape(plp_data.n_cols, n_rows).copy() alt_info_string_list = [] + gvcf_output = [] candidates_num = plp_data.candidates_num # decode all alternative information, position-depth-reference_base-alt_info for i in range(candidates_num): alt_info_string = ffi.string(plp_data.all_alt_info[i]).decode('utf8', 'ignore').rstrip() alt_info_string_list.append(alt_info_string) + if gvcf: + gvcf_pos_ref_count = np.frombuffer(ffi.buffer( + plp_data.pos_ref_count, size_sizet * plp_data.buffer_cols), + dtype=_dtype + ).reshape(plp_data.buffer_cols).copy() + gvcf_pos_total_count = np.frombuffer(ffi.buffer( + plp_data.pos_total_count, size_sizet * plp_data.buffer_cols), + dtype=_dtype + ).reshape(plp_data.buffer_cols).copy() + gvcf_output = [gvcf_pos_ref_count, gvcf_pos_total_count] + positions = np.empty(plp_data.n_cols, dtype=[ ('major', int), ('minor', int)]) np.copyto( @@ -161,7 +174,7 @@ def _plp_data_to_numpy(plp_data, n_rows): positions['minor'], np.frombuffer(ffi.buffer( plp_data.minor, size_sizet * plp_data.n_cols), dtype=_dtype)) - return np_counts, positions, alt_info_string_list + return np_counts, positions, alt_info_string_list, gvcf_output def __enforce_pileup_chunk_contiguity(pileups): @@ -178,7 +191,7 @@ def __enforce_pileup_chunk_contiguity(pileups): all_alt_info_list = list() # First pass: need to check for discontinuities within chunks, # these show up as >1 changes in the major coordinate - for counts, positions, alt_info_list in pileups: + for counts, positions, alt_info_list, gvcf_output in pileups: move = np.ediff1d(positions['major']) gaps = np.where(move > 1)[0] + 1 all_alt_info_list += alt_info_list @@ -220,7 +233,7 @@ def _finalize_chunk(c_buf, p_buf): last = positions['major'][-1] if len(counts_buffer) != 0: chunk_results.append(_finalize_chunk(counts_buffer, positions_buffer)) - return chunk_results, all_alt_info_list + return 
chunk_results, all_alt_info_list, gvcf_output def CreateTensorPileup(args): @@ -305,7 +318,14 @@ def CreateTensorPileup(args): confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn, contig_name=ctg_name, bed_ctg_start=extend_start, bed_ctg_end=extend_end) - chunk_result, all_alt_info_list = pileup_counts_clair3(region, + if args.gvcf: + from preprocess.utils import variantInfoCalculator + nonVariantCaller = variantInfoCalculator(gvcfWritePath=args.temp_file_dir, ref_path=args.ref_fn, + bp_resolution=args.bp_resolution, ctgName=ctg_name,sample_name='.'.join( + [args.sampleName, ctg_name, str(ctg_start), str(ctg_end)]), p_err=args.base_err, + gq_bin_size=args.gq_bin_size) + + chunk_result, all_alt_info_list, gvcf_output = pileup_counts_clair3(region, bam=bam_file_path, fasta=fasta_file_path, min_depth=min_coverage, @@ -346,6 +366,44 @@ def CreateTensorPileup(args): all_alt_info.append(alt_info) np_pileup_data = np.array(np_pileup_data, dtype=np.int32) + + + if args.gvcf: + + from shared.utils import reference_sequence_from, region_from + samtools_execute_command = args.samtools + ref_regions = [] + reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion + reference_start = 1 if reference_start < 1 else reference_start + ref_regions.append(region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end)) + reference_sequence = reference_sequence_from( + samtools_execute_command=samtools_execute_command, + fasta_file_path=fasta_file_path, + regions=ref_regions + ) + + empty_pileup_flag = True + for pos in range(ctg_start, ctg_end): + ref_count = gvcf_output[0][pos - extend_start + 1] + total_count = gvcf_output[1][pos - extend_start + 1] + reference_base = reference_sequence[pos-reference_start] + if (ref_count == 0 and total_count == 0): + cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base, 'n_total': 0, 'n_ref': 0} + nonVariantCaller.make_gvcf_online(cur_site_info) + 
continue + + empty_pileup_flag = False + cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base, 'n_total': total_count, + 'n_ref': ref_count} + nonVariantCaller.make_gvcf_online(cur_site_info) + if len(nonVariantCaller.current_block) != 0: + nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block, nonVariantCaller.cur_min_DP, + nonVariantCaller.cur_raw_gq) + + if empty_pileup_flag: + nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end) + nonVariantCaller.close_vcf_writer() + return np_pileup_data, all_position_info, all_alt_info diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c index e3de48c..3695a09 100644 --- a/src/clair3_pileup.c +++ b/src/clair3_pileup.c @@ -63,6 +63,8 @@ plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_lengt data->major = xalloc(buffer_cols, sizeof(size_t), "major"); data->minor = xalloc(buffer_cols, sizeof(size_t), "minor"); data->all_alt_info = NULL; + data->pos_ref_count = NULL; + data->pos_total_count = NULL; return data; } @@ -95,13 +97,18 @@ void enlarge_plp_data(plp_data pileup, size_t buffer_cols, size_t feature_length * @returns void. * */ -void destroy_plp_data(plp_data data) { +void destroy_plp_data(plp_data data, bool gvcf) { free(data->matrix); free(data->major); free(data->minor); for (size_t i = 0; i < data->candidates_num; i++) { free(data->all_alt_info[i]); } + if (gvcf == true) { + free(data->pos_ref_count); + free(data->pos_total_count); + } + free(data->all_alt_info); free(data); } @@ -143,7 +150,7 @@ void destroy_plp_data(plp_data data) { * quality if the “R” counts and discrepancy between positions increase. 
* */ -plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth) { +plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth, bool gvcf) { // extract `chr`:`start`-`end` from `region` // (start is one-based and end-inclusive), // hts_parse_reg below sets return value to point @@ -201,6 +208,14 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co size_t pre_pos = 0; size_t contiguous_flanking_num = 0; + + if (gvcf == true) { + pileup->pos_ref_count = xalloc(buffer_cols, sizeof(size_t), "pos_ref_count"); + pileup->pos_total_count = xalloc(buffer_cols, sizeof(size_t), "pos_total_count"); + memset(pileup->pos_ref_count, 0, buffer_cols * sizeof(size_t)); + memset(pileup->pos_total_count, 0, buffer_cols * sizeof(size_t)); + } + while ((ret=bam_mplp_auto(mplp, &tid, &pos, &n_plp, plp) > 0)) { size_t depth = 0; @@ -349,6 +364,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co char major_alt_base = '\0'; size_t forward_sum = 0; size_t reverse_sum = 0; + size_t all_alt_count = 0; for (size_t i = 0; i < 4; i++) { forward_sum += pileup->matrix[major_col + i]; reverse_sum += pileup->matrix[major_col + i + reverse_pos_start]; @@ -359,6 +375,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co if (current_count > alt_count) { alt_count = current_count; major_alt_base = plp_bases_clair3[i]; + all_alt_count += alt_count; } } } @@ -435,6 +452,11 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co alt_info_p[candidates_num++] = alt_info_str; } + if (gvcf == true) { + pileup->pos_ref_count[pos-start] = 
ref_count; + pileup->pos_total_count[pos-start] = ref_count + all_alt_count + del_count + ins_count; + } + free(dels_f); free(dels_r); kh_counter_destroy(ins_counts_all); diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h index 5cf9283..c4dd8e2 100644 --- a/src/clair3_pileup.h +++ b/src/clair3_pileup.h @@ -12,6 +12,8 @@ typedef struct _plp_data { size_t *minor; char **all_alt_info; size_t candidates_num; + size_t* pos_ref_count; + size_t* pos_total_count; } _plp_data; typedef _plp_data *plp_data; @@ -83,7 +85,7 @@ plp_data create_plp_data(size_t n_cols, size_t buffer_cols, size_t feature_lengt * @returns void. * */ -void destroy_plp_data(plp_data data); +void destroy_plp_data(plp_data data, bool gvcf); /** C implement of clair3-style pileup feature data and alternative information in a given region of a bam. * @@ -100,6 +102,6 @@ void destroy_plp_data(plp_data data); * The return value can be freed with destroy_plp_data * */ -plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth); +plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, const char * fasta_path, size_t min_depth, float min_snp_af, float min_indel_af, size_t min_mq, size_t max_indel_length, bool call_snp_only, size_t max_depth, bool gvcf); #endif From c622611cc05df4a426461239f675ff6ce93fa384 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 13:08:10 +0800 Subject: [PATCH 15/43] import tritonclient only when --gpu option is enabled --- clair3/CallVariantsFromCffi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py index 1f621a0..30fb78d 100644 --- a/clair3/CallVariantsFromCffi.py +++ b/clair3/CallVariantsFromCffi.py @@ -5,8 +5,6 @@ from time import time from argparse import ArgumentParser, SUPPRESS 
-import tritonclient.grpc as tritongrpcclient - from shared.utils import str2bool, log_error from clair3.CallVariants import OutputConfig, output_utilties_from, batch_output @@ -66,6 +64,7 @@ def Run(args): def call_variants_from_cffi(args, output_config, output_utilities): use_gpu = args.use_gpu if use_gpu: + import tritonclient.grpc as tritongrpcclient server_url = 'localhost:8001' try: triton_client = tritongrpcclient.InferenceServerClient( From 3536da80f52e3d66fa224e5efc75056be060d35f Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 13:08:29 +0800 Subject: [PATCH 16/43] use default path of longphase --- run_clair3.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/run_clair3.sh b/run_clair3.sh index b00fb47..e7b780d 100644 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -87,7 +87,7 @@ PYPY="pypy3" PYTHON='python3' PARALLEL='parallel' WHATSHAP='whatshap' -longphase='longphase' +longphase='EMPTY' CHUNK_NUM=0 CHUNK_SIZE=5000000 QUAL=2 @@ -210,6 +210,9 @@ if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi BASE_MODEL=$(basename ${MODEL_PATH}) if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi +# use the default longphase binary path +if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi + # remove the last '/' character in directory input OUTPUT_FOLDER=$(echo ${OUTPUT_FOLDER%*/}) MODEL_PATH=$(echo ${MODEL_PATH%*/}) From 3446d5bcbe7f37abade53234cdae8560ec529c16 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 13:13:52 +0800 Subject: [PATCH 17/43] add installation for c implement --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index e27da3b..5d23501 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,10 @@ conda install -c conda-forge -c bioconda whatshap=1.0 -y git clone https://github.com/HKU-BAL/Clair3.git cd 
Clair3 +# compile samtools, longphase and cffi library for c implement +# after building, longphase binary is in `Clair3` folder +python3 build.py + # download pre-trained models mkdir models wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz @@ -292,6 +296,8 @@ MODEL_NAME="[YOUR_MODEL_NAME]" # e.g. r941_prom_hac_g360+g422 --output=${OUTPUT_DIR} ## output path prefix ``` + + ### Option 5. Docker Dockerfile This is the same as option 1 except that you are building a docker image yourself. Please refer to option 1 for usage. From d7142b4e71373af415e69e947567a072a3888d57 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 13:42:01 +0800 Subject: [PATCH 18/43] add longphase path check --- run_clair3.sh | 1 + scripts/clair3_c_impl.sh | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 run_clair3.sh mode change 100644 => 100755 scripts/clair3_c_impl.sh diff --git a/run_clair3.sh b/run_clair3.sh old mode 100644 new mode 100755 index e7b780d..0940285 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -212,6 +212,7 @@ if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom # use the default longphase binary path if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi +if [ "${USE_LONGPHASE}" == True ] && [ ! -f ${LONGPHASE} ]; then echo -e "${ERROR} Cannot find LongPhase path in ${LONGPHASE}, exit!${NC}"; exit 1; fi # remove the last '/' character in directory input OUTPUT_FOLDER=$(echo ${OUTPUT_FOLDER%*/}) diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh old mode 100644 new mode 100755 index 0f0798f..75eb24c --- a/scripts/clair3_c_impl.sh +++ b/scripts/clair3_c_impl.sh @@ -110,6 +110,7 @@ ${PYTHON} ${CLAIR3} CheckEnvs \ --indel_min_af ${INDEL_AF} readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi +# use all threads here when gpu is enabled? 
THREADS_LOW=$((${THREADS}*3/4)) LONGPHASE_THREADS=$((${THREADS}*1/2)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi From 4e5acdab0d5b53eaca95925f73eeeae3b0dbd9ac Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 14:32:51 +0800 Subject: [PATCH 19/43] longphase to upper --- run_clair3.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_clair3.sh b/run_clair3.sh index 0940285..ebe4e3a 100755 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -87,7 +87,7 @@ PYPY="pypy3" PYTHON='python3' PARALLEL='parallel' WHATSHAP='whatshap' -longphase='EMPTY' +LONGPHASE='EMPTY' CHUNK_NUM=0 CHUNK_SIZE=5000000 QUAL=2 From 3e07f4cb135990e96f389bad4901a0a449fe92f9 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Wed, 16 Mar 2022 22:11:21 +0800 Subject: [PATCH 20/43] base2index function to more efficient array lookup --- src/clair3_pileup.c | 14 +------------- src/clair3_pileup.h | 8 ++++++++ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c index 3695a09..fb8f88e 100644 --- a/src/clair3_pileup.c +++ b/src/clair3_pileup.c @@ -21,18 +21,6 @@ #define bam_nt16_table seq_nt16_table -size_t base2_index(char c) { - if (c == 'A') return 0; - else if (c == 'C') return 1; - else if (c == 'G') return 2; - else if (c == 'T') return 3; - else if (c == 'a') return 9; - else if (c == 'c') return 10; - else if (c == 'g') return 11; - else if (c == 't') return 12; - else return 0; -} - /** Constructs a pileup data structure. * * @param n_cols number of pileup columns. 
@@ -359,7 +347,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co kh_counter_destroy(ins_counts_r); int offset = pos - ref_start; char ref_base = toupper(ref_seq[offset]); - int ref_offset_forward = base2_index(ref_base); + int ref_offset_forward = base2index[ref_base - 'A']; int ref_offset_reverse = ref_offset_forward + reverse_pos_start; char major_alt_base = '\0'; size_t forward_sum = 0; diff --git a/src/clair3_pileup.h b/src/clair3_pileup.h index c4dd8e2..7f7277c 100644 --- a/src/clair3_pileup.h +++ b/src/clair3_pileup.h @@ -33,6 +33,14 @@ static const int num2countbase[32] = { }; +static const int base2index[32] = { + 0, 0, 1, 0, 0, 0, 2, 0, // abcdefgh + 0, 0, 0, 0, 0, 0, 0, 0, // ijklmnop + 0, 0, 0, 3, 0, 0, 0, 0, // qrstuvwx + 0, 0, 0, 0, 0, 0, 0, 0, // yz +}; + + // convert 16bit IUPAC (+16 for strand) to plp_bases clair3 index // first i: all insertions // second i: most common insertion From 19aa91064692ef31e577847f7bd28b9e70a56b17 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 02:00:49 +0800 Subject: [PATCH 21/43] add Makefile --- Makefile | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0f13b7d --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +OS := $(shell uname) +ARCH := $(shell arch) + +PYTHON ?= python3 + +all : libhts.a longphase libclair3.so +clean : clean_htslib clean_longphase clean_libclair3 + +SAMVER=1.10 +LPVER=1.0 + +samtools-$(SAMVER)/Makefile: + curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \ + tar -xjf samtools-${SAMVER}.tar.bz2; \ + rm samtools-${SAMVER}.tar.bz2 + +libhts.a: samtools-$(SAMVER)/Makefile + # this is required only to add in -fpic so we can build python module + @echo "\x1b[1;33mMaking $(@F)\x1b[0m" + cd samtools-${SAMVER}/htslib-${SAMVER}/ && CFLAGS="-fpic
-std=c99 -O3" ./configure && make + cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@ + + +longphase-$(LPVER)/Makefile: + curl -L -o longphase-${LPVER}.tar.gz https://github.com/twolinin/longphase/archive/refs/tags/v${LPVER}.tar.gz; \ + tar -zxvf longphase-${LPVER}.tar.gz; \ + rm longphase-${LPVER}.tar.gz + +longphase: longphase-$(LPVER)/Makefile + @echo "\x1b[1;33mMaking $(@F)\x1b[0m" + cd longphase-${LPVER} && autoreconf -i && ./configure && make -j4 + cp longphase-${LPVER}/$@ $@ + + +libclair3.so: samtools-${SAMVER}/htslib-${SAMVER} + ${PYTHON} build.py + + +.PHONY: clean_htslib +clean_htslib: + cd samtools-${SAMVER} && make clean || exit 0 + cd samtools-${SAMVER}/htslib-${SAMVER} && make clean || exit 0 + +.PHONY: clean_longphase +clean_longphase: + cd longphase-${LPVER} && make clean || exit 0 + +.PHONY: clean_libclair3 +clean_libclair3: + rm libclair3.* From 5a4a58c361e90b91ae46fa45876e4e3622c82ce0 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 02:01:27 +0800 Subject: [PATCH 22/43] move samtools and longphase compile to Makefile --- build.py | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/build.py b/build.py index 022b1ae..75fee91 100644 --- a/build.py +++ b/build.py @@ -4,36 +4,8 @@ from subprocess import run from cffi import FFI - samver = "1.10" -longphase_version = "1.0" file_directory = os.path.dirname(os.path.realpath(__file__)) -def compile_samtools_package(): - # just a simple way to compile samtools htslib - if not os.path.exists(os.path.join(file_directory, 'libhts.a')): - samtools_source = "samtools-{}.tar.bz2 https://github.com/samtools/samtools/releases/download/{}/samtools-{}.tar.bz2".format(samver, samver, samver) - run("curl -L -o {}".format(samtools_source), shell=True) - run("tar -xjf samtools-{}.tar.bz2".format(samver), shell=True) - run("rm samtools-{}.tar.bz2".format(samver), shell=True) - run("cd samtools-{} && autoheader && autoconf -Wno-syntax && CFLAGS='-fpic -O3'
./configure && make".format(samver), shell=True) - run("cp samtools-{}/htslib-{}/libhts.a {}".format(samver, samver, file_directory), shell=True) - - -def compile_longphase_package(): - if not os.path.exists(os.path.join(file_directory, 'longphase')): - longphase_source = "https://github.com/twolinin/longphase/archive/refs/tags/v{}.tar.gz".format(longphase_version) - run("wget {}".format(longphase_source), shell=True) - run("tar -zxvf v{}.tar.gz".format(longphase_version), shell=True) - run("rm v{}.tar.gz".format(longphase_version), shell=True) - run("cd longphase-{} && autoreconf -i && ./configure && make -j4".format(longphase_version), shell=True) - run("mv longphase-{}/longphase {}".format(longphase_version, file_directory), shell=True) - run("rm -r longphase-{}".format(longphase_version), shell=True) - -def clean_samtools_package(): - # after ffi building, clean the samtools htslib source - if os.path.exists(os.path.join(file_directory, 'libhts.a')): - run("rm -r samtools-{}".format(samver), shell=True) - htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver)) libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] @@ -91,7 +63,6 @@ def clean_samtools_package(): if __name__ == "__main__": - compile_samtools_package() - compile_longphase_package() ffibuilder.compile(verbose=True) - clean_samtools_package() + run("cp {}/libclair3*.so {}/libclair3.so".format(file_directory, file_directory), shell=True) + From ed78d1762c520ca1ca453659fef9ed9882c3c411 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 02:27:43 +0800 Subject: [PATCH 23/43] toupper not found in arm64 stdlib --- src/clair3_full_alignment.c | 4 ++-- src/clair3_pileup.c | 2 +- src/medaka_bamiter.c | 4 ++-- src/medaka_common.c | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c index eafae03..ea70418 100644 --- a/src/clair3_full_alignment.c +++ 
b/src/clair3_full_alignment.c @@ -722,7 +722,7 @@ fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path continue; int8_t alt_v = 0; - char ref_base = toupper(ref_seq[cp - ref_start]); + char ref_base = upper_base(ref_seq[cp - ref_start]); int8_t ref_v = num2countbase_fa[ref_base - 'A']; int8_t bq_v = read.pos_info[offset].bq; @@ -821,7 +821,7 @@ fa_data calculate_clair3_full_alignment(const char *region, const char *bam_path // store the alternative information into string size_t max_alt_length = 64; char *alt_info_str = calloc(max_alt_length, sizeof(char)); - char center_ref_base = toupper(ref_seq[candidate - ref_start]); + char center_ref_base = upper_base(ref_seq[candidate - ref_start]); sprintf(alt_info_str, "%i-%i-%c-", candidate + 1, candidate_depth, center_ref_base); for (size_t j = 0; j < 4; j++) diff --git a/src/clair3_pileup.c b/src/clair3_pileup.c index fb8f88e..be334d0 100644 --- a/src/clair3_pileup.c +++ b/src/clair3_pileup.c @@ -346,7 +346,7 @@ plp_data calculate_clair3_pileup(const char *region, const bam_fset* bam_set, co kh_counter_destroy(ins_counts_r); int offset = pos - ref_start; - char ref_base = toupper(ref_seq[offset]); + char ref_base = upper_base(ref_seq[offset]); int ref_offset_forward = base2index[ref_base - 'A']; int ref_offset_reverse = ref_offset_forward + reverse_pos_start; char major_alt_base = '\0'; diff --git a/src/medaka_bamiter.c b/src/medaka_bamiter.c index a625e14..c01e874 100644 --- a/src/medaka_bamiter.c +++ b/src/medaka_bamiter.c @@ -3,7 +3,7 @@ #include "medaka_bamiter.h" #include "medaka_common.h" - +#include // iterator for reading bam int read_bam(void *data, bam1_t *b) { mplp_data *aux = (mplp_data*) data; @@ -57,7 +57,7 @@ bam_fset* create_bam_fset(const char* fname) { if (fset->hdr == 0 || fset->idx == 0 || fset->fp == 0) { destroy_bam_fset(fset); fprintf(stderr, "Failed to read .bam file '%s'.", fname); - exit(1); + return fset; } return fset; } diff --git a/src/medaka_common.c 
b/src/medaka_common.c index ba06b03..588006b 100644 --- a/src/medaka_common.c +++ b/src/medaka_common.c @@ -7,6 +7,13 @@ #include "medaka_common.h" +char upper_base(char c) { + if (c >= 'a' && c <= 'z') + return c - 32; + + return c; +} + /** Allocates zero-initialised memory with a message on failure. * * @param num number of elements to allocate. From b04ebcf3be5539a5197727327be63185bdf0ec82 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:26:30 +0800 Subject: [PATCH 24/43] put function into header --- src/medaka_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/medaka_common.h b/src/medaka_common.h index 2f06bf6..2aecaf1 100644 --- a/src/medaka_common.h +++ b/src/medaka_common.h @@ -14,7 +14,7 @@ static inline int max ( int a, int b ) { return a > b ? a : b; } static inline int min ( int a, int b ) { return a < b ? a : b; } - +char upper_base(char c); /** Allocates zero-initialised memory with a message on failure. * * @param num number of elements to allocate. 
From a0ae449627def925a2e1289192f15a3194bb6f76 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:28:06 +0800 Subject: [PATCH 25/43] change minCoverage type to int --- clair3/CallVarBam.py | 2 +- clair3/CallVariantsFromCffi.py | 2 +- preprocess/CreateTensorFullAlignment.py | 2 +- preprocess/CreateTensorFullAlignmentFromCffi.py | 2 +- preprocess/CreateTensorPileup.py | 2 +- preprocess/CreateTensorPileupFromCffi.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py index 35b79a1..590a582 100644 --- a/clair3/CallVarBam.py +++ b/clair3/CallVarBam.py @@ -347,7 +347,7 @@ def main(): parser.add_argument('--fast_mode', type=str2bool, default=False, help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s") - parser.add_argument('--minCoverage', type=float, default=param.min_coverage, + parser.add_argument('--minCoverage', type=int, default=param.min_coverage, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=param.min_mq, diff --git a/clair3/CallVariantsFromCffi.py b/clair3/CallVariantsFromCffi.py index 30fb78d..019dda1 100644 --- a/clair3/CallVariantsFromCffi.py +++ b/clair3/CallVariantsFromCffi.py @@ -292,7 +292,7 @@ def main(): parser.add_argument('--vcf_fn', type=str, default=None, help="Candidate sites VCF file input, if provided, variants will only be called at the sites in the VCF file, default: %(default)s") - parser.add_argument('--minCoverage', type=float, default=2, + parser.add_argument('--minCoverage', type=int, default=2, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=5, diff --git a/preprocess/CreateTensorFullAlignment.py b/preprocess/CreateTensorFullAlignment.py index f6d8c28..10d563d 100644 --- a/preprocess/CreateTensorFullAlignment.py +++ b/preprocess/CreateTensorFullAlignment.py @@ 
-909,7 +909,7 @@ def main(): help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s") # options for advanced users - parser.add_argument('--minCoverage', type=float, default=param.min_coverage, + parser.add_argument('--minCoverage', type=int, default=param.min_coverage, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=param.min_mq, diff --git a/preprocess/CreateTensorFullAlignmentFromCffi.py b/preprocess/CreateTensorFullAlignmentFromCffi.py index 043afcf..36ea5fe 100644 --- a/preprocess/CreateTensorFullAlignmentFromCffi.py +++ b/preprocess/CreateTensorFullAlignmentFromCffi.py @@ -185,7 +185,7 @@ def main(): help="Path to the 'samtools', samtools version >= 1.10 is required. default: %(default)s") # options for advanced users - parser.add_argument('--minCoverage', type=float, default=param.min_coverage, + parser.add_argument('--minCoverage', type=int, default=param.min_coverage, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=param.min_mq, diff --git a/preprocess/CreateTensorPileup.py b/preprocess/CreateTensorPileup.py index 63e3095..bf837ae 100644 --- a/preprocess/CreateTensorPileup.py +++ b/preprocess/CreateTensorPileup.py @@ -494,7 +494,7 @@ def main(): parser.add_argument('--fast_mode', type=str2bool, default=False, help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s") - parser.add_argument('--minCoverage', type=float, default=2, + parser.add_argument('--minCoverage', type=int, default=2, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=param.min_mq, diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py index 5ea4f07..803c540 100644 --- a/preprocess/CreateTensorPileupFromCffi.py +++ 
b/preprocess/CreateTensorPileupFromCffi.py @@ -459,7 +459,7 @@ def main(): parser.add_argument('--fast_mode', type=str2bool, default=False, help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s") - parser.add_argument('--minCoverage', type=float, default=2, + parser.add_argument('--minCoverage', type=int, default=2, help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f") parser.add_argument('--minMQ', type=int, default=param.min_mq, From 65661dbafd125abda37f73155a4b35593fb1aecb Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:29:17 +0800 Subject: [PATCH 26/43] fix gvcf 0-index ctg_start issue --- preprocess/CreateTensorPileupFromCffi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/preprocess/CreateTensorPileupFromCffi.py b/preprocess/CreateTensorPileupFromCffi.py index 803c540..59ad726 100644 --- a/preprocess/CreateTensorPileupFromCffi.py +++ b/preprocess/CreateTensorPileupFromCffi.py @@ -309,6 +309,7 @@ def CreateTensorPileup(args): is_ctg_name_given = ctg_name is not None is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None if is_ctg_range_given: + ctg_start = max(1, ctg_start) extend_start = max(1, ctg_start - no_of_positions) extend_end = ctg_end + no_of_positions @@ -367,7 +368,6 @@ def CreateTensorPileup(args): np_pileup_data = np.array(np_pileup_data, dtype=np.int32) - if args.gvcf: from shared.utils import reference_sequence_from, region_from @@ -384,8 +384,8 @@ def CreateTensorPileup(args): empty_pileup_flag = True for pos in range(ctg_start, ctg_end): - ref_count = gvcf_output[0][pos - extend_start + 1] - total_count = gvcf_output[1][pos - extend_start + 1] + ref_count = gvcf_output[0][pos - extend_start] + total_count = gvcf_output[1][pos - extend_start] reference_base = reference_sequence[pos-reference_start] if (ref_count == 0 and total_count == 0): cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': 
reference_base, 'n_total': 0, 'n_ref': 0} From 16c90f5dc6e2f2726247638878dae11d4e242638 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:30:47 +0800 Subject: [PATCH 27/43] pytables is not necessary in calling, move to training part --- clair3/CallVariants.py | 3 ++- clair3/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clair3/CallVariants.py b/clair3/CallVariants.py index 47a7042..a16f8aa 100644 --- a/clair3/CallVariants.py +++ b/clair3/CallVariants.py @@ -1,7 +1,6 @@ import sys import os import math -import tables import tensorflow as tf import numpy as np import logging @@ -1529,6 +1528,7 @@ def load_mini_batch(): if full_alignment_mode and total == 0: logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn))) else: + import tables dataset = tables.open_file(args.tensor_fn, 'r').root batch_size = param.predictBatchSize dataset_size = len(dataset.label) @@ -1712,6 +1712,7 @@ def load_mini_batch(): logging.info("Total process positions: {}".format(total)) else: + import tables if not os.path.exists(args.tensor_fn): logging.info("skip {}, not existing chunk_id".format(args.tensor_fn)) return diff --git a/clair3/utils.py b/clair3/utils.py index acfbc7e..0c2ce24 100644 --- a/clair3/utils.py +++ b/clair3/utils.py @@ -3,7 +3,6 @@ import copy import shlex import os -import tables import numpy as np from functools import partial @@ -11,7 +10,6 @@ from shared.interval_tree import bed_tree_from, is_region_in from shared.utils import subprocess_popen, IUPAC_base_to_ACGT_base_dict as BASE2BASE, IUPAC_base_to_num_dict as BASE2NUM -FILTERS = tables.Filters(complib='blosc:lz4hc', complevel=5) shuffle_bin_size = 50000 PREFIX_CHAR_STR = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -389,6 +387,8 @@ def get_training_array(tensor_fn, var_fn, bed_fn, bin_fn, shuffle=True, is_allow import shared.param_f as param float_type = 'int8' + import tables + FILTERS =
tables.Filters(complib='blosc:lz4hc', complevel=5) tensor_shape = param.ont_input_shape if platform == 'ont' else param.input_shape subprocess_list = [] From 0eb5d4b83aef4e01e7b1707b718cf7aaec50abf5 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:33:16 +0800 Subject: [PATCH 28/43] add environment check for mac arm64 system --- clair3/CallVarBam.py | 26 ++++++++++++++----------- preprocess/CheckEnvs.py | 16 ++++------------ run_clair3.sh | 41 ++++++++++++++++++++++++++++------------ scripts/clair3.sh | 10 +++++++++- scripts/clair3_c_impl.sh | 12 ++++++++++-- 5 files changed, 67 insertions(+), 38 deletions(-) diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py index 590a582..1f6c073 100644 --- a/clair3/CallVarBam.py +++ b/clair3/CallVarBam.py @@ -9,6 +9,7 @@ from time import sleep from argparse import ArgumentParser, SUPPRESS import logging +import platform logging.getLogger().setLevel(logging.INFO) @@ -130,20 +131,23 @@ def Run(args): chunk_id = CommandOption('chunk_id', args.chunk_id) chunk_num = CommandOption('chunk_num', args.chunk_num) - sched_getaffinity_list = list(os.sched_getaffinity(0)) - maxCpus = len(sched_getaffinity_list) - if args.tensorflow_threads is None: - numCpus = maxCpus + if platform.machine() in {"aarch64", "arm64"} or platform.system() == "Darwin": + taskSet = "" else: - numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus + sched_getaffinity_list = list(os.sched_getaffinity(0)) + maxCpus = len(sched_getaffinity_list) + if args.tensorflow_threads is None: + numCpus = maxCpus + else: + numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus - _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus)) + _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus)) - taskSet = "taskset -c %s" % (_cpuSet) - try: - subprocess.check_output("which %s" % ("taskset"), shell=True) - except: - taskSet = "" + taskSet = "taskset -c 
%s" % (_cpuSet) + try: + subprocess.check_output("which %s" % ("taskset"), shell=True) + except: + taskSet = "" if need_realignment: realign_reads_command_options = [ diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py index 5e03c07..628f700 100644 --- a/preprocess/CheckEnvs.py +++ b/preprocess/CheckEnvs.py @@ -3,6 +3,7 @@ import argparse import shlex import subprocess +import platform from collections import defaultdict from argparse import SUPPRESS @@ -54,6 +55,9 @@ def check_python_path(): def check_tools_version(tool_version, required_tool_version): for tool, version in tool_version.items(): required_version = required_tool_version[tool] + # whatshap cannot be installed in Mac arm64 system + if platform.system() == "Darwin" and tool == 'whatshap': + continue if version is None: print(log_error("[ERROR] {} not found, please check you are in clair3 virtual environment".format(tool))) check_python_path() @@ -296,18 +300,6 @@ def CheckEnvs(args): contig_length_list = [] contig_chunk_num = {} - threads = args.threads - sched_getaffinity_list = list(os.sched_getaffinity(0)) - numCpus = len(sched_getaffinity_list) - - if threads > numCpus: - print(log_warning( - '[WARNING] Current maximum threads {} is larger than support cpu count {}, You may set a smaller parallel threads by setting --threads=$ for better parallelism.'.format( - threads, numCpus))) - - ## for better parallelism for create tensor and call variants, we over commit the overall threads/4 for 3 times, which is 0.75 * overall threads. 
- threads_over_commit = max(4, int(threads * 0.75)) - with open(fai_fn, 'r') as fai_fp: for row in fai_fp: columns = row.strip().split("\t") diff --git a/run_clair3.sh b/run_clair3.sh index ebe4e3a..5cb5707 100755 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -1,7 +1,7 @@ #!/bin/bash SCRIPT_NAME=$(basename "$0") SCRIPT_PATH=`dirname "$0"` -VERSION='v0.1-r10' +VERSION='v0.1-r11' Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]" set -e @@ -31,12 +31,15 @@ print_help_messages() echo $' --pypy=STR Path of pypy3, pypy3 >= 3.6 is required.' echo $' --parallel=STR Path of parallel, parallel >= 20191122 is required.' echo $' --whatshap=STR Path of whatshap, whatshap >= 1.0 is required.' + echo $' --longphase=STR Path of longphase, longphase >= 1.0 is required.' echo $' --chunk_size=INT The size of each chuck for parallel processing, default: 5000000.' echo $' --pileup_only Use the pileup model only when calling, default: disable.' echo $' --print_ref_calls Show reference calls (0/0) in VCF file, default: disable.' echo $' --include_all_ctgs Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.' echo $' --gvcf Enable GVCF output, default: disable.' echo $' --enable_phasing Output phased variants using whatshap, default: disable.' + echo $' --longphase_for_phasing Use longphase for phasing, default: enable.' + echo $' --disable_c_impl Disable C implement with cffi for pileup and full-alignment create tensor, default: enable.' echo $' --remove_intermediate_dir Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results. default: disable.' echo $' --snp_min_af=FLOAT Minimum SNP AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.' 
echo $' --indel_min_af=FLOAT Minimum Indel AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.' @@ -45,16 +48,14 @@ print_help_messages() echo $' --var_pct_phasing=FLOAT EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.' echo $' --pileup_model_prefix=STR EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index. default: pileup.' echo $' --fa_model_prefix=STR EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.' + echo $' --min_mq=INT EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.' + echo $' --min_coverage=INT EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.' echo $' --fast_mode EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.' echo $' --haploid_precise EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.' echo $' --haploid_sensitive EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.' echo $' --no_phasing_for_fa EXPERIMENTAL: Call variants without whatshap phasing in full alignment calling, default: disable.' echo $' --call_snp_only EXPERIMENTAL: Call candidates pass SNP minimum AF only, ignore Indel candidates, default: disable.' echo $' --enable_long_indel EXPERIMENTAL: Call long Indel variants(>50 bp), default: disable.' - echo $' --use_gpu Use GPU for calling, default: disable.' - echo $' --longphase_for_phasing Use longphase for phasing, default: disable.' - echo $' --longphase Path of longphase, longphase >= 1.0 is required.' 
- echo $' --enable_c_impl Use C implement with cffi for pileup and full-alignment create tensor, default: disable.' echo $'' } @@ -71,8 +72,8 @@ NC="\\033[0m" ARGS=`getopt -o b:f:t:m:p:o:hv \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\ -snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\ -remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,enable_c_impl,help,version -n 'run_clair3.sh' -- "$@"` +min_mq::,min_coverage::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\ +remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,disable_c_impl,help,version -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi eval set -- "${ARGS}" @@ -91,6 +92,8 @@ LONGPHASE='EMPTY' CHUNK_NUM=0 CHUNK_SIZE=5000000 QUAL=2 +MIN_MQ=5 +MIN_COV=2 PHASING_PCT="0" PRO="0" REF_PRO="0" @@ -110,7 +113,7 @@ ENABLE_PHASING=False ENABLE_LONG_INDEL=False USE_GPU=False USE_LONGPHASE=False -ENABLE_C_IMPL=False +ENABLE_C_IMPL=True PILEUP_PREFIX="pileup" FA_PREFIX="full_alignment" @@ -140,6 +143,8 @@ while true; do --var_pct_phasing ) PHASING_PCT="$2"; shift 2 ;; --snp_min_af ) SNP_AF="$2"; shift 2 ;; --indel_min_af ) INDEL_AF="$2"; shift 2 ;; + --min_mq ) MIN_MQ="$2"; shift 2 ;; + --min_coverage ) MIN_COV="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --gvcf ) GVCF=True; shift 1 ;; @@ -156,7 +161,7 @@ while true; do --enable_long_indel ) ENABLE_LONG_INDEL=True; shift 1 ;; --use_gpu ) USE_GPU=True; shift 1 ;; --longphase_for_phasing ) USE_LONGPHASE=True; shift 1 ;; - --enable_c_impl ) ENABLE_C_IMPL=True; shift 1 ;; + --disable_c_impl ) ENABLE_C_IMPL=False; shift 1 ;; -- ) shift; break; ;; -h|--help ) print_help_messages; exit 0 ;; @@ -211,6 +216,7 @@ BASE_MODEL=$(basename ${MODEL_PATH}) if [ "${BASE_MODEL}" = "r941_prom_sup_g5014" ] || [ "${BASE_MODEL}" = "r941_prom_hac_g5014" ] || [ "${BASE_MODEL}" = "ont_guppy5" ]; then PHASING_PCT=0.8; fi # use the default longphase binary path +if [ "$(uname)" = "Darwin" ] && [ "${NO_PHASING}" == False ]; then echo -e "${WARNING} Mac arm64 system only support longphase for phasing, will enable it! ${NC}"; USE_LONGPHASE=True; fi if [ "${USE_LONGPHASE}" == True ] && [ "${LONGPHASE}" == "EMPTY" ]; then LONGPHASE="${SCRIPT_PATH}/longphase"; fi if [ "${USE_LONGPHASE}" == True ] && [ ! 
-f ${LONGPHASE} ]; then echo -e "${ERROR} Cannot find LongPhase path in ${LONGPHASE}, exit!${NC}"; exit 1; fi @@ -242,6 +248,8 @@ if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi echo "[INFO] FULL ALIGN PROPORTION: ${PRO}" echo "[INFO] FULL ALIGN REFERENCE PROPORTION: ${REF_PRO}" echo "[INFO] PHASING PROPORTION: ${PHASING_PCT}" +echo "[INFO] MINIMUM MQ: ${MIN_MQ}" +echo "[INFO] MINIMUM COVERAGE: ${MIN_COV}" if [ "${SNP_AF}" != "0" ]; then echo "[INFO] USER DEFINED SNP THRESHOLD: ${SNP_AF}"; fi if [ "${INDEL_AF}" != "0" ]; then echo "[INFO] USER DEFINED INDEL THRESHOLD: ${INDEL_AF}"; fi echo "[INFO] ENABLE FILEUP ONLY CALLING: ${PILEUP_ONLY}" @@ -256,9 +264,8 @@ echo "[INFO] ENABLE NO PHASING FOR FULL ALIGNMENT: ${NO_PHASING}" echo "[INFO] ENABLE REMOVING INTERMEDIATE FILES: ${RM_TMP_DIR}" echo "[INFO] ENABLE PHASING VCF OUTPUT: ${ENABLE_PHASING}" echo "[INFO] ENABLE LONG INDEL CALLING: ${ENABLE_LONG_INDEL}" -echo "[INFO] ENABLE GPU CALLING: ${USE_GPU}" echo "[INFO] ENABLE LONGPHASE_FOR_PHASING: ${USE_LONGPHASE}" -echo "[INFO] ENABLE C_IMPLEMENT: ${USE_LONGPHASE}" +echo "[INFO] ENABLE C_IMPLEMENT: ${ENABLE_C_IMPL}" echo $'' # file check @@ -273,7 +280,7 @@ if [ ! -d ${MODEL_PATH} ] && [ -z ${CONDA_PREFIX} ]; then echo -e "${ERROR} Cond if [ ! -d ${MODEL_PATH} ]; then echo -e "${ERROR} Model path not found${NC}"; exit 1; fi # max threads detection -MAX_THREADS=$(nproc) +if [ "$(uname)" = "Darwin" ]; then MAX_THREADS=$(sysctl -n hw.logicalcpu); else MAX_THREADS=$(nproc); fi if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi if [[ ${THREADS} -gt ${MAX_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum available threads ${MAX_THREADS}, set threads=${MAX_THREADS}${NC}"; THREADS=${MAX_THREADS}; fi @@ -283,6 +290,11 @@ if [ ! 
-z ${MAX_ULIMIT_THREADS} ]; then PER_ULIMIT_THREADS=$((${MAX_ULIMIT_THREA if [[ ${PER_ULIMIT_THREADS} < 1 ]]; then PER_ULIMIT_THREADS=1; fi if [ "${MAX_ULIMIT_THREADS}" != "unlimited" ] && [[ ${THREADS} -gt ${PER_ULIMIT_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum ulimit threads ${THREADS} * 30 > ${MAX_ULIMIT_THREADS} (ulimit -u), set threads=${PER_ULIMIT_THREADS}${NC}"; THREADS=${PER_ULIMIT_THREADS}; fi +# min mapping quality and min coverage detection +if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi +if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi +if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi + # platform check if [ ! ${PLATFORM} = "ont" ] && [ ! ${PLATFORM} = "hifi" ] && [ ! 
${PLATFORM} = "ilmn" ]; then echo -e "${ERROR} Invalid platform input, optional: {ont, hifi, ilmn}${NC}"; exit 1; fi @@ -305,6 +317,9 @@ if [ -z ${REF_PRO} ]; then echo -e "${ERROR} Use '--ref_pct_full=FLOAT' instead if [ -z ${PHASING_PCT} ]; then echo -e "${ERROR} Use '--var_pct_phasing=FLOAT' instead of '--var_pct_phasing FLOAT' for optional parameters${NC}"; exit 1 ; fi if [ -z ${PILEUP_PREFIX} ]; then echo -e "${ERROR} Use '--pileup_model_prefix=STR' instead of '--pileup_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' instead of '--fa_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi +if [ -z ${MIN_MQ} ]; then echo -e "${ERROR} Use '--min_mq=INT' instead of '--min_mq INT' for optional parameters${NC}"; exit 1 ; fi +if [ -z ${MIN_COV} ]; then echo -e "${ERROR} Use '--min_coverage=INT' instead of '--min_coverage INT' for optional parameters${NC}"; exit 1 ; fi +if [ -z ${LONGPHASE} ]; then echo -e "${ERROR} Use '--longphase=STR' instead of '--longphase STR' for optional parameters${NC}"; exit 1 ; fi # model prefix detection if [ ! 
-f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi @@ -338,6 +353,8 @@ ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \ --var_pct_phasing=${PHASING_PCT} \ --snp_min_af=${SNP_AF} \ --indel_min_af=${INDEL_AF} \ + --min_mq=${MIN_MQ} \ + --min_coverage=${MIN_COV} \ --pileup_only=${PILEUP_ONLY} \ --gvcf=${GVCF} \ --fast_mode=${FAST_MODE} \ diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 57ac44f..449575d 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -85,6 +85,7 @@ export OPENBLAS_NUM_THREADS=1 export GOTO_NUM_THREADS=1 export OMP_NUM_THREADS=1 +echo $'' echo "[INFO] Check environment variables" ${PYTHON} ${CLAIR3} CheckEnvs \ --bam_fn ${BAM_FILE_PATH} \ @@ -108,7 +109,14 @@ ${PYTHON} ${CLAIR3} CheckEnvs \ --ref_pct_full ${REF_PRO} \ --snp_min_af ${SNP_AF} \ --indel_min_af ${INDEL_AF} -readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" + +if [ "$(uname)" = "Darwin" ]; +then + mapfile -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" +else + readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" +fi + if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi THREADS_LOW=$((${THREADS}*3/4)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh index 75eb24c..811a10e 100755 --- a/scripts/clair3_c_impl.sh +++ b/scripts/clair3_c_impl.sh @@ -85,6 +85,7 @@ export OPENBLAS_NUM_THREADS=1 export GOTO_NUM_THREADS=1 export OMP_NUM_THREADS=1 +echo $'' echo "[INFO] Check environment variables" ${PYTHON} ${CLAIR3} CheckEnvs \ --bam_fn ${BAM_FILE_PATH} \ @@ -108,9 +109,16 @@ ${PYTHON} ${CLAIR3} CheckEnvs \ --ref_pct_full ${REF_PRO} \ --snp_min_af ${SNP_AF} \ --indel_min_af ${INDEL_AF} -readarray -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" + +if [ "$(uname)" = "Darwin" ]; +then + mapfile -t CHR < "${OUTPUT_FOLDER}/tmp/CONTIGS" +else + readarray -t CHR < 
"${OUTPUT_FOLDER}/tmp/CONTIGS" +fi + if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi -# use all threads here when gpu is enabled? + THREADS_LOW=$((${THREADS}*3/4)) LONGPHASE_THREADS=$((${THREADS}*1/2)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi From 40f91216bf4f02620af8ac594039634deaed8405 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 14:34:03 +0800 Subject: [PATCH 29/43] add min_coverage and min_mq option in workflow --- clair3/CallVarBam.py | 2 ++ scripts/clair3.sh | 8 +++++++- scripts/clair3_c_impl.sh | 8 +++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py index 1f6c073..7759622 100644 --- a/clair3/CallVarBam.py +++ b/clair3/CallVarBam.py @@ -180,6 +180,8 @@ def Run(args): CommandOption('bed_fn', bed_fn), CommandOption('extend_bed', extend_bed), CommandOption('sampleName', args.sampleName), + CommandOption('minCoverage', args.minCoverage), + CommandOption('minMQ', args.minMQ), ctgStart, ctgEnd, chunk_id, diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 449575d..0ace576 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -7,7 +7,7 @@ set -e ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ -snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ +min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 
'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi @@ -44,6 +44,8 @@ while true; do --gvcf ) GVCF="$2"; shift 2 ;; --snp_min_af ) SNP_AF="$2"; shift 2 ;; --indel_min_af ) INDEL_AF="$2"; shift 2 ;; + --min_mq ) MIN_MQ="$2"; shift 2 ;; + --min_coverage ) MIN_COV="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --haploid_precise ) HAP_PRE="$2"; shift 2 ;; @@ -143,6 +145,8 @@ time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_cal --fast_mode ${FAST_MODE} \ --snp_min_af ${SNP_AF} \ --indel_min_af ${INDEL_AF} \ + --minMQ ${MIN_MQ} \ + --minCoverage ${MIN_COV} \ --call_snp_only ${SNP_ONLY} \ --gvcf ${GVCF} \ --enable_long_indel ${ENABLE_LONG_INDEL} \ @@ -244,6 +248,8 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_b --add_indel_length \ --phasing_info_in_bam \ --gvcf ${GVCF} \ + --minMQ ${MIN_MQ} \ + --minCoverage ${MIN_COV} \ --enable_long_indel ${ENABLE_LONG_INDEL} \ --python ${PYTHON} \ --pypy ${PYPY} \ diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh index 811a10e..eaaebca 100755 --- a/scripts/clair3_c_impl.sh +++ b/scripts/clair3_c_impl.sh @@ -7,7 +7,7 @@ set -e ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ -snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ +min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ 
no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. Terminating...">&2 ; exit 1 ; fi @@ -44,6 +44,8 @@ while true; do --gvcf ) GVCF="$2"; shift 2 ;; --snp_min_af ) SNP_AF="$2"; shift 2 ;; --indel_min_af ) INDEL_AF="$2"; shift 2 ;; + --min_mq ) MIN_MQ="$2"; shift 2 ;; + --min_coverage ) MIN_COV="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --haploid_precise ) HAP_PRE="$2"; shift 2 ;; @@ -145,6 +147,8 @@ time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_cal --fast_mode ${FAST_MODE} \ --snp_min_af ${SNP_AF} \ --indel_min_af ${INDEL_AF} \ + --minMQ ${MIN_MQ} \ + --minCoverage ${MIN_COV} \ --call_snp_only ${SNP_ONLY} \ --gvcf ${GVCF} \ --enable_long_indel ${ENABLE_LONG_INDEL} \ @@ -247,6 +251,8 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_b --ctgName {1/.} \ --add_indel_length \ --no_phasing_for_fa ${NO_PHASING} \ + --minMQ ${MIN_MQ} \ + --minCoverage ${MIN_COV} \ --phased_vcf_fn ${PHASE_VCF_PATH}/phased_{/.}.vcf.gz \ --gvcf ${GVCF} \ --enable_long_indel ${ENABLE_LONG_INDEL} \ From ff50ec99d2bb10c7f688237825b051a270d34c3a Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 15:06:41 +0800 Subject: [PATCH 30/43] allow csi indexing for input BAM --- preprocess/CheckEnvs.py | 5 ++++- run_clair3.sh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py index 628f700..12f1c0a 100644 --- a/preprocess/CheckEnvs.py +++ b/preprocess/CheckEnvs.py @@ -206,7 +206,10 @@ def CheckEnvs(args): bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True) ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True) fai_fn = file_path_from(args.ref_fn, suffix=".fai", 
exit_on_not_found=True, sep='.') - bai_fn = file_path_from(args.bam_fn, suffix=".bai", exit_on_not_found=True, sep='.') + bai_fn = file_path_from(args.bam_fn, suffix=".bai", sep='.') + csi_fn = file_path_from(args.bam_fn, suffix=".csi", sep='.') + if bai_fn is None and csi_fn is None: + sys.exit(log_error("[ERROR] Neither Bam index file {} or {} not found".format(file_name + '.bai', file_name + '.csi'))) bed_fn = file_path_from(args.bed_fn) vcf_fn = file_path_from(args.vcf_fn) tree = bed_tree_from(bed_file_path=bed_fn) diff --git a/run_clair3.sh b/run_clair3.sh index 5cb5707..132ca9b 100755 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -270,7 +270,7 @@ echo $'' # file check if [ ! -f ${BAM_FILE_PATH} ]; then echo -e "${ERROR} BAM file ${BAM_FILE_PATH} not found${NC}"; exit 1; fi -if [ ! -f ${BAM_FILE_PATH}.bai ] && [ ! -f ${BAM_FILE_PATH%.*}.bai ]; then echo -e "${ERROR} BAM index bai file not found, please use 'samtools index \$BAM' first${NC}"; exit 1; fi +if [ ! -f ${BAM_FILE_PATH}.bai ] && [ ! -f ${BAM_FILE_PATH%.*}.bai ] && [ ! -f ${BAM_FILE_PATH}.csi ] && [ ! -f ${BAM_FILE_PATH%.*}.csi ]; then echo -e "${ERROR} BAM index bai file not found, please use 'samtools index \$BAM' first${NC}"; exit 1; fi if [ ! -f ${REFERENCE_FILE_PATH} ]; then echo -e "${ERROR} Reference file ${REFERENCE_FILE_PATH} not found${NC}"; exit 1; fi if [ ! -f ${REFERENCE_FILE_PATH}.fai ] && [ ! 
-f ${REFERENCE_FILE_PATH%.*}.fai ]; then echo -e "${ERROR} Reference index fai file not found, please use 'samtools faidx \$REF' first${NC}"; exit 1; fi From 9c2736aa18909ff3a5bf395ddd60171abad700e3 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Thu, 31 Mar 2022 15:49:09 +0800 Subject: [PATCH 31/43] platform package conflict with platform option --- clair3/CallVarBam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py index 7759622..b461ce5 100644 --- a/clair3/CallVarBam.py +++ b/clair3/CallVarBam.py @@ -9,7 +9,7 @@ from time import sleep from argparse import ArgumentParser, SUPPRESS import logging -import platform +from platform import machine, system logging.getLogger().setLevel(logging.INFO) @@ -131,7 +131,7 @@ def Run(args): chunk_id = CommandOption('chunk_id', args.chunk_id) chunk_num = CommandOption('chunk_num', args.chunk_num) - if platform.machine() in {"aarch64", "arm64"} or platform.system() == "Darwin": + if machine() in {"aarch64", "arm64"} or system() == "Darwin": taskSet = "" else: sched_getaffinity_list = list(os.sched_getaffinity(0)) From 6e18c230339ce0a84a97aa8859378001f47cb2a9 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Sun, 3 Apr 2022 14:03:38 +0800 Subject: [PATCH 32/43] allow longphase phasing when c implement is disabled --- scripts/clair3.sh | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 0ace576..275dd22 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -120,7 +120,9 @@ else fi if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0; fi + THREADS_LOW=$((${THREADS}*3/4)) +LONGPHASE_THREADS=$((${THREADS}*1/2)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi cd ${OUTPUT_FOLDER} @@ -191,29 +193,32 @@ else --ctgName {1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} |& tee ${LOG_PATH}/2_select_hetero_snp.log echo $'' - echo "[INFO] 3/7 Phase VCF 
file using Whatshap" - time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \ - "${WHATSHAP} phase \ - --output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \ - --reference ${REFERENCE_FILE_PATH} \ - --chromosome {1} \ - --distrust-genotypes \ - --ignore-read-groups \ - ${PHASE_VCF_PATH}/{1}.vcf \ - ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + if [ ${USE_LONGPHASE} == True ] + then + echo "[INFO] 3/7 Phase VCF file using LongPhase" + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \ + "${LONGPHASE} phase\ + -s ${PHASE_VCF_PATH}/{1}.vcf \ + -b ${BAM_FILE_PATH} \ + -r ${REFERENCE_FILE_PATH} \ + -t ${LONGPHASE_THREADS} \ + -o ${PHASE_VCF_PATH}/phased_{1} \ + --${LP_PLATFORM}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]} + else + echo "[INFO] 3/7 Phase VCF file using Whatshap" + time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_phase.log -j${THREADS} \ + "${WHATSHAP} phase \ + --output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \ + --reference ${REFERENCE_FILE_PATH} \ + --chromosome {1} \ + --distrust-genotypes \ + --ignore-read-groups \ + ${PHASE_VCF_PATH}/{1}.vcf \ + ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + fi ${PARALLEL} -j${THREADS} tabix -f -p vcf ${PHASE_VCF_PATH}/phased_{}.vcf.gz ::: ${CHR[@]} - echo $'' - echo "[INFO] 4/7 Haplotag input BAM file using Whatshap" - time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_4_haplotag.log -j${THREADS} \ - "${WHATSHAP} haplotag \ - --output ${PHASE_BAM_PATH}/{1}.bam \ - --reference ${REFERENCE_FILE_PATH} \ - --ignore-read-groups \ - --regions {1} \ - ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \ - ${BAM_FILE_PATH}" ::: ${CHR[@]} |& tee ${LOG_PATH}/4_haplotag.log - ${PARALLEL} -j${THREADS} ${SAMTOOLS} index -@12 ${PHASE_BAM_PATH}/{1}.bam ::: ${CHR[@]} fi # Full alignment calling From 
3f857c9af9d5a35f93b97b34910aaf969139e0ff Mon Sep 17 00:00:00 2001 From: zxzheng Date: Sun, 3 Apr 2022 14:20:26 +0800 Subject: [PATCH 33/43] add min_contig_size option --- preprocess/CheckEnvs.py | 10 ++++++++++ run_clair3.sh | 32 ++++++++++++++++++++++---------- scripts/clair3.sh | 6 ++++-- scripts/clair3_c_impl.sh | 6 ++++-- 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/preprocess/CheckEnvs.py b/preprocess/CheckEnvs.py index 12f1c0a..7051519 100644 --- a/preprocess/CheckEnvs.py +++ b/preprocess/CheckEnvs.py @@ -242,6 +242,7 @@ def CheckEnvs(args): ref_pct_full = args.ref_pct_full snp_min_af = args.snp_min_af indel_min_af = args.indel_min_af + min_contig_size = args.min_contig_size sample_name = args.sampleName contig_name_list = os.path.join(tmp_file_path, 'CONTIGS') chunk_list = os.path.join(tmp_file_path, 'CHUNK_LIST') @@ -319,6 +320,12 @@ def CheckEnvs(args): if is_known_vcf_file_provided and contig_name not in contig_set: continue + if min_contig_size > 0 and contig_length < min_contig_size: + print(log_warning( + "[WARNING] {} contig length {} is smaller than minimum contig size {}, will skip it!".format(contig_name, contig_length, min_contig_size))) + if contig_name in contig_set: + contig_set.remove(contig_name) + continue contig_set.add(contig_name) contig_length_list.append(contig_length) chunk_num = int( @@ -462,6 +469,9 @@ def main(): parser.add_argument('--indel_min_af', type=float, default=0.08, help="Minimum Indel allele frequency for a site to be considered as a candidate site, default: %(default)f") + parser.add_argument('--min_contig_size', type=int, default=0, + help="Minimum Indel allele frequency for a site to be considered as a candidate site, default: %(default)f") + # options for internal process control ## The number of chucks to be divided into for parallel processing parser.add_argument('--chunk_num', type=int, default=0, diff --git a/run_clair3.sh b/run_clair3.sh index 132ca9b..9865bdc 100755 --- a/run_clair3.sh +++ 
b/run_clair3.sh @@ -50,6 +50,7 @@ print_help_messages() echo $' --fa_model_prefix=STR EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.' echo $' --min_mq=INT EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.' echo $' --min_coverage=INT EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.' + echo $' --min_contig_size=INT EXPERIMENTAL: If set, contigs with contig size<=$min_contig_size are filtered, default: 0.' echo $' --fast_mode EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.' echo $' --haploid_precise EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.' echo $' --haploid_sensitive EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.' @@ -72,7 +73,7 @@ NC="\\033[0m" ARGS=`getopt -o b:f:t:m:p:o:hv \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,longphase::,\ -min_mq::,min_coverage::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\ +min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,pileup_model_prefix::,fa_model_prefix::,fast_mode,gvcf,pileup_only,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,\ remove_intermediate_dir,no_phasing_for_fa,call_snp_only,enable_phasing,enable_long_indel,use_gpu,longphase_for_phasing,disable_c_impl,help,version -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi @@ -94,6 +95,7 @@ CHUNK_SIZE=5000000 QUAL=2 MIN_MQ=5 MIN_COV=2 +MIN_CONTIG_SIZE=0 PHASING_PCT="0" PRO="0" REF_PRO="0" @@ -101,8 +103,8 @@ GVCF=False PILEUP_ONLY=False FAST_MODE=False SHOW_REF=False -SNP_AF="0.08" -INDEL_AF="0.15" +SNP_AF="0" +INDEL_AF="0" HAP_PRE=False HAP_SEN=False SNP_ONLY=False @@ -210,6 +212,11 @@ if [ "${PLATFORM}" != "ont" ] && [ "${REF_PRO}" = "0" ]; then REF_PRO=0.3; fi if [ "${PLATFORM}" = "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.7; fi if [ "${PLATFORM}" != "ont" ] && [ "${PRO}" = "0" ]; then PRO=0.3; fi +# set default af for ilmn and hifi and ont +if [ "${SNP_AF}" = "0" ]; then SNP_AF=0.08; fi +if [ "${PLATFORM}" = "ont" ] && [ "${INDEL_AF}" = "0" ]; then INDEL_AF=0.15; fi +if [ "${PLATFORM}" != "ont" ] && [ "${INDEL_AF}" = "0" ]; then INDEL_AF=0.08; fi + # show default high quality hete variant proportion for whatshap phasing, 0.8 for ont guppy5 and 0.7 for others if [ "${PHASING_PCT}" = "0" ]; then PHASING_PCT=0.7; fi BASE_MODEL=$(basename ${MODEL_PATH}) @@ -245,13 +252,14 @@ echo "[INFO] WHATSHAP PATH: ${WHATSHAP}" echo "[INFO] LONGPHASE PATH: ${LONGPHASE}" echo "[INFO] CHUNK SIZE: ${CHUNK_SIZE}" if [ ${CHUNK_NUM} -gt 0 ]; then echo "[INFO] CHUNK NUM: ${CHUNK_NUM}"; fi +if [ ${MIN_CONTIG_SIZE} -gt 0 ]; then echo "[INFO] MIN CONTIG SIZE: ${CHUNK_NUM}"; fi echo "[INFO] FULL ALIGN PROPORTION: ${PRO}" echo "[INFO] FULL ALIGN REFERENCE PROPORTION: ${REF_PRO}" echo "[INFO] PHASING PROPORTION: ${PHASING_PCT}" echo "[INFO] MINIMUM MQ: ${MIN_MQ}" echo "[INFO] MINIMUM COVERAGE: ${MIN_COV}" -if [ "${SNP_AF}" != "0" ]; then echo "[INFO] USER DEFINED SNP THRESHOLD: ${SNP_AF}"; fi -if [ "${INDEL_AF}" != "0" ]; then echo "[INFO] USER DEFINED INDEL THRESHOLD: ${INDEL_AF}"; fi +echo "[INFO] SNP AF THRESHOLD: ${SNP_AF}" +echo "[INFO] INDEL AF THRESHOLD: ${INDEL_AF}" echo "[INFO] ENABLE FILEUP ONLY CALLING: ${PILEUP_ONLY}" echo "[INFO] ENABLE FAST MODE CALLING: ${FAST_MODE}" echo "[INFO] ENABLE CALLING SNP CANDIDATES 
ONLY: ${SNP_ONLY}" @@ -290,10 +298,6 @@ if [ ! -z ${MAX_ULIMIT_THREADS} ]; then PER_ULIMIT_THREADS=$((${MAX_ULIMIT_THREA if [[ ${PER_ULIMIT_THREADS} < 1 ]]; then PER_ULIMIT_THREADS=1; fi if [ "${MAX_ULIMIT_THREADS}" != "unlimited" ] && [[ ${THREADS} -gt ${PER_ULIMIT_THREADS} ]]; then echo -e "${WARNING} Threads setting exceeds maximum ulimit threads ${THREADS} * 30 > ${MAX_ULIMIT_THREADS} (ulimit -u), set threads=${PER_ULIMIT_THREADS}${NC}"; THREADS=${PER_ULIMIT_THREADS}; fi -# min mapping quality and min coverage detection -if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi -if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi -if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi # platform check if [ ! ${PLATFORM} = "ont" ] && [ ! ${PLATFORM} = "hifi" ] && [ ! 
${PLATFORM} = "ilmn" ]; then echo -e "${ERROR} Invalid platform input, optional: {ont, hifi, ilmn}${NC}"; exit 1; fi @@ -319,14 +323,21 @@ if [ -z ${PILEUP_PREFIX} ]; then echo -e "${ERROR} Use '--pileup_model_prefix=ST if [ -z ${FA_PREFIX} ]; then echo -e "${ERROR} Use '--fa_model_prefix=STR' instead of '--fa_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi if [ -z ${MIN_MQ} ]; then echo -e "${ERROR} Use '--min_mq=INT' instead of '--min_mq INT' for optional parameters${NC}"; exit 1 ; fi if [ -z ${MIN_COV} ]; then echo -e "${ERROR} Use '--min_coverage=INT' instead of '--min_coverage INT' for optional parameters${NC}"; exit 1 ; fi +if [ -z ${MIN_CONTIG_SIZE} ]; then echo -e "${ERROR} Use '--min_contig_size=INT' instead of '--min_contig_size INT' for optional parameters${NC}"; exit 1 ; fi if [ -z ${LONGPHASE} ]; then echo -e "${ERROR} Use '--longphase=STR' instead of '--longphase STR' for optional parameters${NC}"; exit 1 ; fi +# min mapping quality, min coverage and min contig size detection +if [[ ! ${THREADS} =~ ^[\-0-9]+$ ]] || (( ${THREADS} <= 0)); then echo -e "${ERROR} Invalid threads input --threads=INT ${NC}"; exit 1; fi +if [[ ! ${MIN_MQ} =~ ^[\-0-9]+$ ]] || (( ${MIN_MQ} < 5)); then echo -e "${WARNING} Invalid minimum mapping quality input --min_mq>=5 ${NC}"; MIN_MQ=5; fi +if [[ ! ${MIN_COV} =~ ^[\-0-9]+$ ]] || (( ${MIN_COV} < 2)); then echo -e "${WARNING} Invalid minimum coverage input --min_coverage>=2 ${NC}"; MIN_COV=2; fi +if [[ ! ${MIN_CONTIG_SIZE} =~ ^[\-0-9]+$ ]] || (( ${MIN_CONTIG_SIZE} < 0)); then echo -e "${WARNING} Invalid minimum contig size --min_contig_size>=0 ${NC}"; MIN_CONTIG_SIZE=0; fi + # model prefix detection if [ ! -f ${MODEL_PATH}/${PILEUP_PREFIX}.index ]; then echo -e "${ERROR} No pileup model found in provided model path and model prefix ${MODEL_PATH}/${PILEUP_PREFIX} ${NC}"; exit 1; fi if [ ! 
-f ${MODEL_PATH}/${FA_PREFIX}.index ]; then echo -e "${ERROR} No full-alignment model found in provided model path and model prefix ${MODEL_PATH}/${FA_PREFIX} ${NC}"; exit 1; fi CLAIR3_SCRIPT="clair3.sh" -if [ "${ENABLE_C_IMPL}" == True ] && [ ! ${PLATFORM} = "ilmn" ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi +if [ "${ENABLE_C_IMPL}" == True ]; then CLAIR3_SCRIPT="clair3_c_impl.sh"; fi set -x ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \ @@ -355,6 +366,7 @@ ${SCRIPT_PATH}/scripts/${CLAIR3_SCRIPT} \ --indel_min_af=${INDEL_AF} \ --min_mq=${MIN_MQ} \ --min_coverage=${MIN_COV} \ + --min_contig_size=${MIN_CONTIG_SIZE} \ --pileup_only=${PILEUP_ONLY} \ --gvcf=${GVCF} \ --fast_mode=${FAST_MODE} \ diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 275dd22..8259fd2 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -7,7 +7,7 @@ set -e ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ -min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ +min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi @@ -46,6 +46,7 @@ while true; do --indel_min_af ) INDEL_AF="$2"; shift 2 ;; --min_mq ) MIN_MQ="$2"; shift 2 ;; --min_coverage ) MIN_COV="$2"; shift 2 ;; + --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --haploid_precise ) HAP_PRE="$2"; shift 2 ;; @@ -110,7 +111,8 @@ ${PYTHON} ${CLAIR3} CheckEnvs \ --var_pct_full ${PRO} \ --ref_pct_full ${REF_PRO} \ --snp_min_af ${SNP_AF} \ - --indel_min_af ${INDEL_AF} + --indel_min_af ${INDEL_AF} \ + --min_contig_size ${MIN_CONTIG_SIZE} if [ "$(uname)" = "Darwin" ]; then diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh index eaaebca..a4b8f79 100755 --- a/scripts/clair3_c_impl.sh +++ b/scripts/clair3_c_impl.sh @@ -7,7 +7,7 @@ set -e ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \ -l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\ bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\ -min_mq::,min_coverage::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ +min_mq::,min_coverage::,min_contig_size::,snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\ no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel::,use_gpu::,longphase_for_phasing::,longphase:: -n 'run_clair3.sh' -- "$@"` if [ $? != 0 ] ; then echo"No input. 
Terminating...">&2 ; exit 1 ; fi @@ -46,6 +46,7 @@ while true; do --indel_min_af ) INDEL_AF="$2"; shift 2 ;; --min_mq ) MIN_MQ="$2"; shift 2 ;; --min_coverage ) MIN_COV="$2"; shift 2 ;; + --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --haploid_precise ) HAP_PRE="$2"; shift 2 ;; @@ -110,7 +111,8 @@ ${PYTHON} ${CLAIR3} CheckEnvs \ --var_pct_full ${PRO} \ --ref_pct_full ${REF_PRO} \ --snp_min_af ${SNP_AF} \ - --indel_min_af ${INDEL_AF} + --indel_min_af ${INDEL_AF} \ + --min_contig_size ${MIN_CONTIG_SIZE} if [ "$(uname)" = "Darwin" ]; then From effe8c346de525ec25c624703eb01d8029560e09 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Sun, 3 Apr 2022 14:21:01 +0800 Subject: [PATCH 34/43] add min_contig_size in main entry --- run_clair3.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/run_clair3.sh b/run_clair3.sh index 9865bdc..026c80b 100755 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -147,6 +147,7 @@ while true; do --indel_min_af ) INDEL_AF="$2"; shift 2 ;; --min_mq ) MIN_MQ="$2"; shift 2 ;; --min_coverage ) MIN_COV="$2"; shift 2 ;; + --min_contig_size ) MIN_CONTIG_SIZE="$2"; shift 2 ;; --pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;; --fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;; --gvcf ) GVCF=True; shift 1 ;; From 5d4949756885e401a21a0f17f9d7e7630ef249f0 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Sun, 3 Apr 2022 14:21:59 +0800 Subject: [PATCH 35/43] add longphase platform option for pacbio hifi and ont --- scripts/clair3.sh | 2 ++ scripts/clair3_c_impl.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/clair3.sh b/scripts/clair3.sh index 8259fd2..71a7e9a 100755 --- a/scripts/clair3.sh +++ b/scripts/clair3.sh @@ -126,6 +126,8 @@ if [ ${#CHR[@]} -eq 0 ]; then echo "[INFO] Exit in environment checking"; exit 0 THREADS_LOW=$((${THREADS}*3/4)) LONGPHASE_THREADS=$((${THREADS}*1/2)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; 
fi +if [[ ${LONGPHASE_THREADS} < 1 ]]; then LONGPHASE_THREADS=1; fi +if [ "${PLATFORM}" = "ont" ]; then LP_PLATFORM="ont"; else LP_PLATFORM="pb"; fi cd ${OUTPUT_FOLDER} # Pileup calling diff --git a/scripts/clair3_c_impl.sh b/scripts/clair3_c_impl.sh index a4b8f79..bee42f7 100755 --- a/scripts/clair3_c_impl.sh +++ b/scripts/clair3_c_impl.sh @@ -127,6 +127,7 @@ THREADS_LOW=$((${THREADS}*3/4)) LONGPHASE_THREADS=$((${THREADS}*1/2)) if [[ ${THREADS_LOW} < 1 ]]; then THREADS_LOW=1; fi if [[ ${LONGPHASE_THREADS} < 1 ]]; then LONGPHASE_THREADS=1; fi +if [ "${PLATFORM}" = "ont" ]; then LP_PLATFORM="ont"; else LP_PLATFORM="pb"; fi cd ${OUTPUT_FOLDER} # Pileup calling @@ -204,7 +205,7 @@ else -r ${REFERENCE_FILE_PATH} \ -t ${LONGPHASE_THREADS} \ -o ${PHASE_VCF_PATH}/phased_{1} \ - --ont" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log + --${LP_PLATFORM}" ::: ${CHR[@]} |& tee ${LOG_PATH}/3_phase.log ${PARALLEL} -j${THREADS} bgzip -f ${PHASE_VCF_PATH}/phased_{}.vcf ::: ${CHR[@]} else echo "[INFO] 3/7 Phase VCF file using Whatshap" From 8d9af7bc1875c8205903fdc9203e42a768aaffa1 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Sun, 3 Apr 2022 17:28:31 +0800 Subject: [PATCH 36/43] zlib is not used in full-alignment --- src/clair3_full_alignment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/clair3_full_alignment.c b/src/clair3_full_alignment.c index ea70418..1e1fd2e 100644 --- a/src/clair3_full_alignment.c +++ b/src/clair3_full_alignment.c @@ -15,7 +15,6 @@ #include "medaka_common.h" #include "medaka_khcounter.h" #include "clair3_full_alignment.h" -#include "zlib.h" #include "levenshtein.h" typedef struct Pos_alt_info From c497c8a5a59186b8d3e10d9872ad46d3c5f352a7 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 16:22:48 +0800 Subject: [PATCH 37/43] update dockerfile with c implement --- Dockerfile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index c3e2447..55e8e64 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-34,11 +34,15 @@ RUN /bin/bash -c "source activate clair3" && \ pip install tensorflow-cpu==2.2.0 && \ pip install tensorflow-addons==0.11.2 tables==3.6.1 && \ conda install -c anaconda pigz==2.4 -y && \ + conda install -c anaconda cffi==1.14.4 -y && \ conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y && \ conda install -c conda-forge -c bioconda samtools=1.10 -y && \ conda install -c conda-forge -c bioconda whatshap=1.0 -y && \ + conda install -c conda-forge xz zlib bzip2 -y && \ + conda install -c conda-forge automake curl -y && \ rm -rf /opt/conda/pkgs/* && \ - rm -rf /root/.cache/pip + rm -rf /root/.cache/pip && \ + echo "source activate clair3" > ~/.bashrc COPY . . @@ -48,4 +52,6 @@ RUN cd /opt/bin/preprocess/realign && \ wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz -P /opt/models && \ tar -zxvf /opt/models/clair3_models.tar.gz -C /opt/models && \ rm /opt/models/clair3_models.tar.gz && \ - echo "source activate clair3" > ~/.bashrc \ No newline at end of file + cd /opt/bin && \ + make PREFIX=/opt/conda/envs/clair3 PYTHON=/opt/conda/envs/clair3/bin/python && \ + rm -rf /opt/bin/samtools-* /opt/bin/longphase-* \ No newline at end of file From 34b49a8b8d5fd017559289dd16a84b784be3481b Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 16:23:23 +0800 Subject: [PATCH 38/43] use absolute path for script path --- run_clair3.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_clair3.sh b/run_clair3.sh index 026c80b..3da33bd 100755 --- a/run_clair3.sh +++ b/run_clair3.sh @@ -1,8 +1,8 @@ #!/bin/bash SCRIPT_NAME=$(basename "$0") -SCRIPT_PATH=`dirname "$0"` +SCRIPT_PATH=$(dirname $(readlink -f "$0")) VERSION='v0.1-r11' -Usage="Usage: ./${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM --model_path=MODEL_PREFIX [--bed_fn=BED] [options]" +Usage="Usage: ${SCRIPT_NAME} --bam_fn=BAM --ref_fn=REF --output=OUTPUT_DIR --threads=THREADS --platform=PLATFORM 
--model_path=MODEL_PREFIX [--bed_fn=BED] [options]" set -e #./run_clair3.sh -b tmp.bam -f ref.fasta -t 32 -o tmp -p ont -m model_path @@ -50,7 +50,7 @@ print_help_messages() echo $' --fa_model_prefix=STR EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment.' echo $' --min_mq=INT EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5.' echo $' --min_coverage=INT EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.' - echo $' --min_contig_size=INT EXPERIMENTAL: If set, contigs with contig size<=$min_contig_size are filtered, default: 0.' + echo $' --min_contig_size=INT EXPERIMENTAL: If set, contigs with contig size<$min_contig_size are filtered, default: 0.' echo $' --fast_mode EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.' echo $' --haploid_precise EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable.' echo $' --haploid_sensitive EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable.' 
From 72fa01c4e94ff6e99d94dedf33bc775e7c75ae9c Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 16:27:26 +0800 Subject: [PATCH 39/43] add deflate and extra_link_args to link dynamic libraries --- build.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/build.py b/build.py index 75fee91..52c0cf8 100644 --- a/build.py +++ b/build.py @@ -6,11 +6,19 @@ samver = "1.10" file_directory = os.path.dirname(os.path.realpath(__file__)) -htslib_dir=os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver)) +htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver)) -libraries=['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] -library_dirs=[htslib_dir] -src_dir=os.path.join(file_directory, 'src') +libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto', 'deflate'] + +try: + conda_path = os.environ['CONDA_PREFIX'] + extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)] +except: + print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!") + extra_link_args = [] + +library_dirs = [htslib_dir] +src_dir = os.path.join(file_directory, 'src') extra_compile_args = ['-std=c99', '-O3'] if platform.machine() in {"aarch64", "arm64"}: @@ -45,6 +53,7 @@ 'clair3_pileup.c', 'clair3_full_alignment.c')], extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, extra_objects=['libhts.a'] ) From 1a8a0884a456b83c7fa8a03e313634ed68a44dbd Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 17:34:41 +0800 Subject: [PATCH 40/43] update Makefile --- Makefile | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 0f13b7d..ab1c33a 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,15 @@ PYTHON ?= python3 all : libhts.a longphase libclair3.so clean : clean_htslib clean_longphase clean_libclair3 -SAMVER=1.10 -LPVER=1.0 +SAMVER = 1.10 +LPVER = 1.0 +GCC ?= gcc +GXX 
?= g++ +PREFIX ?= ${CONDA_PREFIX} +LDFLAGS = -L ${PREFIX}/lib +CFLAGS = -fpic -std=c99 -O3 -I ${PREFIX}/include -L ${PREFIX}/lib +CPPFLAGS = -std=c++11 -Wall -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib +LP_CPPFLAGS = -std=c++11 -Wall -g -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib samtools-$(SAMVER)/Makefile: curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \ @@ -17,7 +24,7 @@ samtools-$(SAMVER)/Makefile: libhts.a: samtools-$(SAMVER)/Makefile # this is required only to add in -fpic so we can build python module @echo "\x1b[1;33mMaking $(@F)\x1b[0m" - cd samtools-${SAMVER}/htslib-${SAMVER}/ && CFLAGS="-fpic -std=c99 -O3" ./configure && make + cd samtools-${SAMVER}/htslib-${SAMVER}; CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" ./configure; make CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@ @@ -28,7 +35,7 @@ longphase-$(LPVER)/Makefile: longphase: longphase-$(LPVER)/Makefile @echo "\x1b[1;33mMaking $(@F)\x1b[0m" - cd longphase-${LPVER} && autoreconf -i && ./configure && make -j4 + cd longphase-${LPVER}; autoreconf -i; CPPFLAGS="${CPPFLAGS}" ./configure; make CC=${GCC} CXX=${GXX} CPPFLAGS="${CPPFLAGS}" cp longphase-${LPVER}/$@ $@ @@ -40,10 +47,12 @@ libclair3.so: samtools-${SAMVER}/htslib-${SAMVER} clean_htslib: cd samtools-${SAMVER} && make clean || exit 0 cd samtools-${SAMVER}/htslib-${SAMVER} && make clean || exit 0 + rm libhts.a .PHONY: clean_longphase clean_longphase: cd longphase-${LPVER} && make clean || exit 0 + rm longphase .PHONY: clean_libclair3 clean_libclair3: From 6294e0c85f5d342f9bb4378c6adbeeebe3ea9c2e Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 17:35:37 +0800 Subject: [PATCH 41/43] set deflate as an option for arm64 --- build.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/build.py b/build.py index 52c0cf8..eff5464 100644 --- a/build.py 
+++ b/build.py @@ -8,15 +8,8 @@ file_directory = os.path.dirname(os.path.realpath(__file__)) htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver)) -libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto', 'deflate'] - -try: - conda_path = os.environ['CONDA_PREFIX'] - extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)] -except: - print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!") - extra_link_args = [] - +libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto'] +extra_link_args = [] library_dirs = [htslib_dir] src_dir = os.path.join(file_directory, 'src') @@ -28,6 +21,12 @@ extra_compile_args.append("-march=armv8-a+simd") else: extra_compile_args.append("-mtune=haswell") + libraries.append('deflate') + try: + conda_path = os.environ['CONDA_PREFIX'] + extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)] + except: + print("[WARNING] Conda prefix not found, please activate clair3 conda environment first!") ffibuilder = FFI() ffibuilder.set_source("libclair3", From 0e1f64914649b0c573cef8214a3d3fb8686d8fac Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 17:37:28 +0800 Subject: [PATCH 42/43] update Readme --- README.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5d23501..d554517 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,8 @@ A short preprint describing Clair3's algorithms and results is at [bioRxiv](http ## Latest Updates +*v0.1-r11 (Apr 4)* : 1. Variant calling ~2.5x faster than `v0.1-r10` tested with ONT Q20 data, with feature generation in both pileup and full-alignment now implemented in C (co-contributors @[cjw85](https://github.com/cjw85), @[ftostevin-ont](https://github.com/ftostevin-ont), @[EpiSlim](https://github.com/EpiSlim)). 2. Added the lightning-fast [longphase](https://github.com/twolinin/longphase) as an option for phasing. 
Enable using `longphase` with option `--longphase_for_phasing`. The new option is disabled by default to align with the default behavior of the previous versions, but we recommend enabling it when calling human variants with ≥20x long reads. 3. Added `--min_coverage` and `--min_mq` options ([#83](https://github.com/HKU-BAL/Clair3/issues/83)). 4. Added `--min_contig_size` option to skip calling variants in short contigs when using genome assembly as input. 5. Reads haplotagging after phasing before full-alignment calling is now integrated into full-alignment calling to avoid generating an intermediate BAM file. 6. Supported `.csi` BAM index for large references ([#90](https://github.com/HKU-BAL/Clair3/issues/90)). For more speedup details, please check [Notes on r11](docs/v0.1_r11_speedup.md). + *v0.1-r10 (Jan 13)* : 1. Added a new ONT Guppy5 model (`r941_prom_sup_g5014`). Click [here](docs/guppy5_20220113.md) for some benchmarking results. This `sup` model is also applicable to reads called using the `hac` and `fast` mode. The old `r941_prom_sup_g506` model that was fine-tuned from the Guppy3,4 model is obsoleted. 2. Added `--var_pct_phasing` option to control the percentage of top ranked heterozygous pile-up variants used for WhatsHap phasing. *v0.1-r9 (Dec 1)* : Added the `--enable_long_indel` option to output indel variant calls >50bp ([#64](https://github.com/HKU-BAL/Clair3/issues/64)), Click [here](https://github.com/HKU-BAL/Clair3/blob/main/docs/indel_gt50_performance.md) to see more benchmarking results. 
@@ -267,18 +269,19 @@ pypy3 -m pip install mpmath==1.2.1 # install python packages in environment pip3 install tensorflow==2.2.0 pip3 install tensorflow-addons==0.11.2 tables==3.6.1 -conda install -c anaconda pigz==2.4 -y +conda install -c anaconda pigz==2.4 cffi==1.14.4 -y conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y conda install -c conda-forge -c bioconda samtools=1.10 -y conda install -c conda-forge -c bioconda whatshap=1.0 -y - +conda install -c conda-forge xz zlib bzip2 automake curl -y + # clone Clair3 git clone https://github.com/HKU-BAL/Clair3.git cd Clair3 # compile samtools, longphase and cffi library for c implement # after building, longphase binary is in `Clair3` folder -python3 build.py +source activate clair3 && make PREFIX=${CONDA_PREFIX} # download pre-trained models mkdir models @@ -364,12 +367,15 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help --pypy=STR Path of pypy3, pypy3 >= 3.6 is required. --parallel=STR Path of parallel, parallel >= 20191122 is required. --whatshap=STR Path of whatshap, whatshap >= 1.0 is required. + --longphase=STR Path of longphase, longphase >= 1.0 is required. --chunk_size=INT The size of each chuck for parallel processing, default: 5Mbp. --pileup_only Use the pileup model only when calling, default: disable. --print_ref_calls Show reference calls (0/0) in vcf file, default: disable. --include_all_ctgs Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable. --gvcf Enable GVCF output, default: disable. --enable_phasing Output phased variants using whatshap, default: disable. + --longphase_for_phasing Use longphase for phasing, default: disable. + --disable_c_impl Disable C implement with cffi for pileup and full-alignment create tensor, default: enable. --remove_intermediate_dir Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results. default: disable. 
--snp_min_af=FLOAT Minimum SNP AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08. --indel_min_af=FLOAT Minimum INDEL AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08. @@ -378,6 +384,9 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help --var_pct_phasing=FLOAT EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms. --pileup_model_prefix=STR EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index. default: pileup. --fa_model_prefix=STR EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: full_alignment. + --min_mq=INT EXPERIMENTAL: If set, reads with mapping quality with <$min_mq are filtered, default: 5. + --min_coverage=INT EXPERIMENTAL: Minimum coverage required to call a variant, default: 2. + --min_contig_size=INT EXPERIMENTAL: If set, contigs with contig size<$min_contig_size are filtered, default: 0. --fast_mode EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable. --haploid_precise EXPERIMENTAL: Enable haploid calling mode. Only 1/1 is considered as a variant, default: disable. --haploid_sensitive EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant, default: disable. 
From 2564a1a37a8c175bde120e97df23a00419c77ac5 Mon Sep 17 00:00:00 2001 From: zxzheng Date: Mon, 4 Apr 2022 17:38:12 +0800 Subject: [PATCH 43/43] add document for c implement and longphase speedup --- docs/v0.1_r11_speedup.md | 41 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 docs/v0.1_r11_speedup.md diff --git a/docs/v0.1_r11_speedup.md b/docs/v0.1_r11_speedup.md new file mode 100644 index 0000000..f584aab --- /dev/null +++ b/docs/v0.1_r11_speedup.md @@ -0,0 +1,41 @@ +# Notes on v0.1-r11 + +We focused on speedup in `v0.1-r11`. We tried a few techniques and listed those that worked as follows. + +1. **C implementation for pileup and full-alignment feature generation.** Before r11, feature generation (tensor creation) in Clair3 was sped up using pypy on python code. The speedup was ~10x over native python. The practice balanced speed and ease of coding in the developmental stage of Clair3. In r11, we added C implementation, bringing another ~2-3 times speedup over pypy. The C code is integrated with the other python parts using CFFI (C Foreign Function Interface). The variants called with the new C implementation are identical to the previous version. +2. **Use longphase for phasing.** [longphase](https://github.com/twolinin/longphase) by [Lin et al.](https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btac058/6519151) is an ultra-fast chromosome-scale phasing algorithm for small and large variants. In our experiments, longphase took ~3 minutes to phase 69x Q20 ONT WGS with 24 CPU cores and no I/O bound, faster than `whatshap` that took 52 minutes. To enable using longphase for phasing, please use the `--longphase_for_phasing` option. Our suggestions on when to enable longphase are shown in the section below. +3. **Haplotagging on the fly.** Whatshap `haplotag` was used to add an `HP` tag to each read after phasing. 
This process writes out a new BAM, which is I/O intensive and, in fact, unnecessary. In r11, we implemented haplotagging to feed tagged reads directly to full-alignment calling. We used the exact logic that was implemented in whatshap's haplotag module. This technique, regardless of whether whatshap or longphase is used, saves 10-20 minutes or more on compressing, writing and reading a new BAM. + +We benchmarked r11 against r10 with [69x Q20 ONT HG002 data](https://labs.epi2me.io/gm24385_q20_2021.10). 24 CPU cores with minimal I/O speed limit were used. The results are as follows. With the C implementation and longphase enabled, the total runtime was reduced from 234 to 101 minutes. + +| Implementation | Sample | CPU cores | Inference hardware | Total runtime | Pileup runtime | Phasing runtime | Full-alignment runtime | +| ------------------ | ----------------- | --------- | ------------------ | ------------- | -------------- | --------------- | ---------------------- | +| c\_impl, longphase | HG002 WGS Q20 69x | 24 | CPU | 101m | 38m | 3m | 56m | +| v0.1-r10, whatshap | HG002 WGS Q20 69x | 24 | CPU | 234m | 57m | 52m | 118m | + +---- + +## When to use `longphase` (to replace `whatshap`) + +`longphase` is **not** enabled by default. We suggest enabling `longphase` through the `--longphase_for_phasing` option when calling variants in human samples with ≥20x of data. **Use `whatshap` with non-human samples or insufficient depth.** + +Benchmarks between using longphase and whatshap on HG003 WGS ONT Guppy5 with five depths from 10x to 50x are as follows. 
+ +| Phasing algorithm | Depth | SNP-Precision | SNP-Recall | SNP-F1 | Indel-Precision | Indel-Recall | Indel-F1 | +| ----------------- | ----- | ------------- | ---------- | ------ | --------------- | ------------ | -------- | +| longphase | 10x | 96.75% | 93.94% | 95.32% | 82.86% | 47.30% | 60.22% | +| whatshap | 10x | 95.87% | 96.64% | 96.26% | 83.37% | 47.50% | 60.52% | +| longphase | 20x | 99.22% | 99.27% | 99.25% | 88.49% | 62.22% | 73.07% | +| whatshap | 20x | 99.21% | 99.36% | 99.28% | 88.75% | 60.47% | 71.93% | +| longphase | 30x | 99.50% | 99.60% | 99.55% | 90.63% | 68.39% | 77.96% | +| whatshap | 30x | 99.50% | 99.61% | 99.56% | 90.61% | 66.52% | 76.72% | +| longphase | 40x | 99.59% | 99.67% | 99.63% | 91.69% | 72.34% | 80.87% | +| whatshap | 40x | 99.60% | 99.70% | 99.65% | 91.71% | 72.39% | 80.91% | +| longphase | 50x | 99.63% | 99.70% | 99.66% | 92.17% | 75.29% | 82.88% | +| whatshap | 50x | 99.62% | 99.70% | 99.66% | 91.59% | 73.66% | 81.65% | + +--- + +## Use the old python-based feature generation code (to disable the new C implementation) + +The new C implementation generates results identical to the previous version. However, we retained the old python-based feature generation code for benchmarking or back-compatibility purposes. Users can use it through the `--disable_c_impl` option.