Merge pull request #89 from HKU-BAL/r11
update version to v0.1-r11
zhengzhenxian authored Apr 4, 2022
2 parents ef0546f + 2564a1a commit e8c2e50
Showing 33 changed files with 4,974 additions and 73 deletions.
10 changes: 8 additions & 2 deletions Dockerfile
@@ -34,11 +34,15 @@ RUN /bin/bash -c "source activate clair3" && \
     pip install tensorflow-cpu==2.2.0 && \
     pip install tensorflow-addons==0.11.2 tables==3.6.1 && \
     conda install -c anaconda pigz==2.4 -y && \
+    conda install -c anaconda cffi==1.14.4 -y && \
     conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y && \
     conda install -c conda-forge -c bioconda samtools=1.10 -y && \
     conda install -c conda-forge -c bioconda whatshap=1.0 -y && \
+    conda install -c conda-forge xz zlib bzip2 -y && \
+    conda install -c conda-forge automake curl -y && \
     rm -rf /opt/conda/pkgs/* && \
-    rm -rf /root/.cache/pip
+    rm -rf /root/.cache/pip && \
+    echo "source activate clair3" > ~/.bashrc
 
 COPY . .
 
@@ -48,4 +52,6 @@ RUN cd /opt/bin/preprocess/realign && \
     wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz -P /opt/models && \
     tar -zxvf /opt/models/clair3_models.tar.gz -C /opt/models && \
     rm /opt/models/clair3_models.tar.gz && \
-    echo "source activate clair3" > ~/.bashrc
+    cd /opt/bin && \
+    make PREFIX=/opt/conda/envs/clair3 PYTHON=/opt/conda/envs/clair3/bin/python && \
+    rm -rf /opt/bin/samtools-* /opt/bin/longphase-*
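
To sanity-check the revised Dockerfile end-to-end, a minimal build-and-run sketch (the `clair3:v0.1-r11` tag is illustrative, not an official image name):

```bash
# Build from the repository root, then print the caller's usage from inside the image.
docker build -f ./Dockerfile -t clair3:v0.1-r11 .
docker run -it clair3:v0.1-r11 /opt/bin/run_clair3.sh --help
```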
59 changes: 59 additions & 0 deletions Makefile
@@ -0,0 +1,59 @@
OS := $(shell uname)
ARCH := $(shell arch)

PYTHON ?= python3

all : libhts.a longphase libclair3.so
clean : clean_htslib clean_longphase clean_libclair3

SAMVER = 1.10
LPVER = 1.0
GCC ?= gcc
GXX ?= g++
PREFIX ?= ${CONDA_PREFIX}
LDFLAGS = -L ${PREFIX}/lib
CFLAGS = -fpic -std=c99 -O3 -I ${PREFIX}/include -L ${PREFIX}/lib
CPPFLAGS = -std=c++11 -Wall -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib
LP_CPPFLAGS = -std=c++11 -Wall -g -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib

samtools-$(SAMVER)/Makefile:
	curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \
	tar -xjf samtools-${SAMVER}.tar.bz2; \
	rm samtools-${SAMVER}.tar.bz2

libhts.a: samtools-$(SAMVER)/Makefile
	# this is required only to add -fpic so we can build the Python module
	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
	cd samtools-${SAMVER}/htslib-${SAMVER}; CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" ./configure; make CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}"
	cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@


longphase-$(LPVER)/Makefile:
	curl -L -o longphase-${LPVER}.tar.gz https://github.com/twolinin/longphase/archive/refs/tags/v${LPVER}.tar.gz; \
	tar -zxvf longphase-${LPVER}.tar.gz; \
	rm longphase-${LPVER}.tar.gz

longphase: longphase-$(LPVER)/Makefile
	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
	cd longphase-${LPVER}; autoreconf -i; CPPFLAGS="${CPPFLAGS}" ./configure; make CC=${GCC} CXX=${GXX} CPPFLAGS="${CPPFLAGS}"
	cp longphase-${LPVER}/$@ $@


libclair3.so: samtools-${SAMVER}/htslib-${SAMVER}
	${PYTHON} build.py


.PHONY: clean_htslib
clean_htslib:
	cd samtools-${SAMVER} && make clean || exit 0
	cd samtools-${SAMVER}/htslib-${SAMVER} && make clean || exit 0
	rm libhts.a

.PHONY: clean_longphase
clean_longphase:
	cd longphase-${LPVER} && make clean || exit 0
	rm longphase

.PHONY: clean_libclair3
clean_libclair3:
	rm libclair3.*
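
A minimal sketch of driving this Makefile by hand, mirroring the Dockerfile invocation above; it assumes the clair3 conda environment supplies the htslib build dependencies under `${CONDA_PREFIX}`:

```bash
# Build libhts.a, longphase, and libclair3.so against the conda env's headers/libs.
source activate clair3
make PREFIX=${CONDA_PREFIX} PYTHON=${CONDA_PREFIX}/bin/python
# Targets can also be rebuilt individually, e.g. after editing the C sources:
make libclair3.so
```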
19 changes: 17 additions & 2 deletions README.md
@@ -56,6 +56,8 @@ A short preprint describing Clair3's algorithms and results is at [bioRxiv](http
 
 ## Latest Updates
 
+*v0.1-r11 (Apr 4)* : 1. Variant calling is ~2.5x faster than `v0.1-r10`, tested with ONT Q20 data, with feature generation in both pileup and full-alignment calling now implemented in C (co-contributors @[cjw85](https://github.com/cjw85), @[ftostevin-ont](https://github.com/ftostevin-ont), @[EpiSlim](https://github.com/EpiSlim)). 2. Added the lightning-fast [longphase](https://github.com/twolinin/longphase) as an option for phasing; enable it with the `--longphase_for_phasing` option. The new option is disabled by default to match the behavior of previous versions, but we recommend enabling it when calling human variants from ≥20x long reads. 3. Added the `--min_coverage` and `--min_mq` options ([#83](https://github.com/HKU-BAL/Clair3/issues/83)). 4. Added the `--min_contig_size` option to skip calling variants in short contigs when using a genome assembly as input. 5. Read haplotagging after phasing and before full-alignment calling is now integrated into full-alignment calling, avoiding an intermediate BAM file. 6. Added support for the `.csi` BAM index for large references ([#90](https://github.com/HKU-BAL/Clair3/issues/90)). For more speedup details, please check [Notes on r11](docs/v0.1_r11_speedup.md).
+
 *v0.1-r10 (Jan 13)* : 1. Added a new ONT Guppy5 model (`r941_prom_sup_g5014`). Click [here](docs/guppy5_20220113.md) for some benchmarking results. This `sup` model is also applicable to reads called using the `hac` and `fast` modes. The old `r941_prom_sup_g506` model, which was fine-tuned from the Guppy3,4 model, is now obsolete. 2. Added the `--var_pct_phasing` option to control the percentage of top-ranked heterozygous pileup variants used for WhatsHap phasing.
 
 *v0.1-r9 (Dec 1)* : Added the `--enable_long_indel` option to output indel variant calls >50bp ([#64](https://github.com/HKU-BAL/Clair3/issues/64)). Click [here](https://github.com/HKU-BAL/Clair3/blob/main/docs/indel_gt50_performance.md) to see more benchmarking results.
@@ -267,15 +269,20 @@ pypy3 -m pip install mpmath==1.2.1
 # install python packages in environment
 pip3 install tensorflow==2.2.0
 pip3 install tensorflow-addons==0.11.2 tables==3.6.1
-conda install -c anaconda pigz==2.4 -y
+conda install -c anaconda pigz==2.4 cffi==1.14.4 -y
 conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y
 conda install -c conda-forge -c bioconda samtools=1.10 -y
 conda install -c conda-forge -c bioconda whatshap=1.0 -y
-
+conda install -c conda-forge xz zlib bzip2 automake curl -y
+
 # clone Clair3
 git clone https://github.com/HKU-BAL/Clair3.git
 cd Clair3
 
+# compile samtools, longphase, and the cffi library for the C implementation
+# after building, the longphase binary is in the `Clair3` folder
+source activate clair3 && make PREFIX=${CONDA_PREFIX}
+
 # download pre-trained models
 mkdir models
 wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz
@@ -292,6 +299,8 @@ MODEL_NAME="[YOUR_MODEL_NAME]" # e.g. r941_prom_hac_g360+g422
   --output=${OUTPUT_DIR}               ## output path prefix
 ```
 
+
+
 ### Option 5. Docker Dockerfile
 
 This is the same as option 1 except that you are building a docker image yourself. Please refer to option 1 for usage.
@@ -358,12 +367,15 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
   --pypy=STR                    Path of pypy3, pypy3 >= 3.6 is required.
   --parallel=STR                Path of parallel, parallel >= 20191122 is required.
   --whatshap=STR                Path of whatshap, whatshap >= 1.0 is required.
+  --longphase=STR               Path of longphase, longphase >= 1.0 is required.
   --chunk_size=INT              The size of each chunk for parallel processing, default: 5Mbp.
   --pileup_only                 Use the pileup model only when calling, default: disable.
   --print_ref_calls             Show reference calls (0/0) in VCF file, default: disable.
   --include_all_ctgs            Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.
   --gvcf                        Enable GVCF output, default: disable.
   --enable_phasing              Output phased variants using whatshap, default: disable.
+  --longphase_for_phasing       Use longphase for phasing, default: enable.
+  --disable_c_impl              Disable the cffi-based C implementation of pileup and full-alignment tensor creation, default: enable.
   --remove_intermediate_dir     Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results, default: disable.
   --snp_min_af=FLOAT            Minimum SNP AF required for a candidate variant. Lowering the value might increase sensitivity a bit at the cost of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.
   --indel_min_af=FLOAT          Minimum INDEL AF required for a candidate variant. Lowering the value might increase sensitivity a bit at the cost of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.
@@ -372,6 +384,9 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
   --var_pct_phasing=FLOAT       EXPERIMENTAL: Specify an expected percentage of high-quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.
   --pileup_model_prefix=STR     EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 and $prefix.index, default: pileup.
   --fa_model_prefix=STR         EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 and $prefix.index, default: full_alignment.
+  --min_mq=INT                  EXPERIMENTAL: If set, reads with mapping quality <$min_mq are filtered, default: 5.
+  --min_coverage=INT            EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.
+  --min_contig_size=INT         EXPERIMENTAL: If set, contigs shorter than $min_contig_size are filtered, default: 0.
   --fast_mode                   EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.
   --haploid_precise             EXPERIMENTAL: Enable haploid calling mode; only 1/1 is considered as a variant, default: disable.
   --haploid_sensitive           EXPERIMENTAL: Enable haploid calling mode; 0/1 and 1/1 are considered as variants, default: disable.
76 changes: 76 additions & 0 deletions build.py
@@ -0,0 +1,76 @@
import itertools
import os
import platform
from subprocess import run
from cffi import FFI

samver = "1.10"
file_directory = os.path.dirname(os.path.realpath(__file__))
htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))

libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
extra_link_args = []
library_dirs = [htslib_dir]
src_dir = os.path.join(file_directory, 'src')

extra_compile_args = ['-std=c99', '-O3']
if platform.machine() in {"aarch64", "arm64"}:
    if platform.system() == "Darwin":
        pass
    else:
        extra_compile_args.append("-march=armv8-a+simd")
else:
    extra_compile_args.append("-mtune=haswell")
libraries.append('deflate')
try:
    conda_path = os.environ['CONDA_PREFIX']
    extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)]
except KeyError:
    print("[WARNING] Conda prefix not found, please activate the clair3 conda environment first!")

ffibuilder = FFI()
ffibuilder.set_source("libclair3",
    r"""
    #include "kvec.h"
    #include "khash.h"
    #include "levenshtein.h"
    #include "medaka_bamiter.h"
    #include "medaka_common.h"
    #include "medaka_khcounter.h"
    #include "clair3_pileup.h"
    #include "clair3_full_alignment.h"
    """,
    libraries=libraries,
    library_dirs=library_dirs,
    include_dirs=[src_dir, htslib_dir],
    sources=[
        os.path.join(src_dir, x) for x in (
            'levenshtein.c',
            'medaka_bamiter.c',
            'medaka_common.c',
            'medaka_khcounter.c',
            'clair3_pileup.c',
            'clair3_full_alignment.c')],
    extra_compile_args=extra_compile_args,
    extra_link_args=extra_link_args,
    extra_objects=['libhts.a']
)

cdef = [
    "typedef struct { ...; } bam_fset;"
    "bam_fset* create_bam_fset(char* fname);"
    "void destroy_bam_fset(bam_fset* fset);"
]
for header in ('clair3_pileup.h', 'clair3_full_alignment.h'):
    with open(os.path.join(src_dir, header), 'r') as fh:
        # remove preprocessor directives
        lines = ''.join(x for x in fh.readlines() if not x.startswith('#'))
        cdef.append(lines)

ffibuilder.cdef('\n\n'.join(cdef))


if __name__ == "__main__":
    ffibuilder.compile(verbose=True)
    run("cp {}/libclair3*.so {}/libclair3.so".format(file_directory, file_directory), shell=True)
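`build.py` is normally driven by `make libclair3.so`, but it can be run directly once `libhts.a` exists in the working directory; a sketch of building and then smoke-testing the resulting cffi module:

```bash
source activate clair3
python build.py
# The module follows cffi's out-of-line API mode, exposing `ffi` and `lib`:
python -c "from libclair3 import ffi, lib"
```
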
5 changes: 4 additions & 1 deletion clair3.py
@@ -10,6 +10,7 @@
     "CallVarBam",
     "CallVariants",
     "Train",
+    "CallVariantsFromCffi"
 ]
 
 data_preprocess_folder = [
@@ -27,7 +28,9 @@
     'UnifyRepresentation',
     'CheckEnvs',
     'SortVcf',
-    'SelectQual'
+    'SelectQual',
+    "CreateTensorPileupFromCffi",
+    "CreateTensorFullAlignmentFromCffi",
 ]
 
 post_process_scripts_folder = [
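
Each registered name is dispatched by `clair3.py` to the corresponding submodule, so the new entries become subcommands; for example (a sketch):

```bash
python clair3.py CallVariantsFromCffi --help
python clair3.py CreateTensorPileupFromCffi --help
```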
30 changes: 18 additions & 12 deletions clair3/CallVarBam.py
@@ -9,6 +9,7 @@
 from time import sleep
 from argparse import ArgumentParser, SUPPRESS
 import logging
+from platform import machine, system
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -130,20 +131,23 @@ def Run(args):
     chunk_id = CommandOption('chunk_id', args.chunk_id)
     chunk_num = CommandOption('chunk_num', args.chunk_num)
 
-    sched_getaffinity_list = list(os.sched_getaffinity(0))
-    maxCpus = len(sched_getaffinity_list)
-    if args.tensorflow_threads is None:
-        numCpus = maxCpus
+    if machine() in {"aarch64", "arm64"} or system() == "Darwin":
+        taskSet = ""
     else:
-        numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
+        sched_getaffinity_list = list(os.sched_getaffinity(0))
+        maxCpus = len(sched_getaffinity_list)
+        if args.tensorflow_threads is None:
+            numCpus = maxCpus
+        else:
+            numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
 
-    _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
+        _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
 
-    taskSet = "taskset -c %s" % (_cpuSet)
-    try:
-        subprocess.check_output("which %s" % ("taskset"), shell=True)
-    except:
-        taskSet = ""
+        taskSet = "taskset -c %s" % (_cpuSet)
+        try:
+            subprocess.check_output("which %s" % ("taskset"), shell=True)
+        except:
+            taskSet = ""
 
     if need_realignment:
         realign_reads_command_options = [
@@ -176,6 +180,8 @@ def Run(args):
         CommandOption('bed_fn', bed_fn),
         CommandOption('extend_bed', extend_bed),
         CommandOption('sampleName', args.sampleName),
+        CommandOption('minCoverage', args.minCoverage),
+        CommandOption('minMQ', args.minMQ),
         ctgStart,
         ctgEnd,
         chunk_id,
@@ -347,7 +353,7 @@ def main():
     parser.add_argument('--fast_mode', type=str2bool, default=False,
                         help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+    parser.add_argument('--minCoverage', type=int, default=param.min_coverage,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
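
The reworked affinity logic above only pins CPUs where Linux's `os.sched_getaffinity` and the `taskset` utility are available; a quick host check (a sketch):

```bash
# Linux prints a CPU list; on macOS/ARM the new code path skips pinning entirely.
python -c "import os; print(sorted(os.sched_getaffinity(0)))" \
  || echo "no sched_getaffinity on this platform"
which taskset || echo "taskset missing; Clair3 falls back to unpinned execution"
```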
7 changes: 5 additions & 2 deletions clair3/CallVariants.py
@@ -1,7 +1,6 @@
 import sys
 import os
 import math
-import tables
 import tensorflow as tf
 import numpy as np
 import logging
@@ -17,6 +16,7 @@
     HETERO_SNP_GT21, HETERO_SNP_LABELS, GT21_LABELS, partial_label_from, mix_two_partial_labels
 )
 import clair3.utils as utils
+import shared.param_p as param
 from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task
 from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning
 from clair3.task.variant_length import VariantLength
@@ -1114,7 +1114,8 @@ def output_with(
         chromosome, position, reference_sequence = chr_pos_seq.rstrip().split(':')
         position = int(position)
 
-        tensor_position_center = param.flankingBaseNum
+        # only store the centered reference base for the C implementation, for efficiency
+        tensor_position_center = param.flankingBaseNum if len(reference_sequence) > 1 else 0
         information_string = "P" if output_config.pileup else 'F'
 
         if type(alt_info) == np.memmap:
@@ -1527,6 +1528,7 @@ def load_mini_batch():
     if full_alignment_mode and total == 0:
         logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn)))
     else:
+        import tables
         dataset = tables.open_file(args.tensor_fn, 'r').root
         batch_size = param.predictBatchSize
         dataset_size = len(dataset.label)
@@ -1710,6 +1712,7 @@ def load_mini_batch():
         logging.info("Total process positions: {}".format(total))
 
     else:
+        import tables
         if not os.path.exists(args.tensor_fn):
             logging.info("skip {}, not existing chunk_id".format(args.tensor_fn))
             return