Merge pull request #89 from HKU-BAL/r11
update version to v0.1-r11
zhengzhenxian authored Apr 4, 2022
2 parents ef0546f + 2564a1a commit e8c2e50
Showing 33 changed files with 4,974 additions and 73 deletions.
10 changes: 8 additions & 2 deletions Dockerfile
@@ -34,11 +34,15 @@ RUN /bin/bash -c "source activate clair3" && \
     pip install tensorflow-cpu==2.2.0 && \
     pip install tensorflow-addons==0.11.2 tables==3.6.1 && \
     conda install -c anaconda pigz==2.4 -y && \
+    conda install -c anaconda cffi==1.14.4 -y && \
     conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y && \
     conda install -c conda-forge -c bioconda samtools=1.10 -y && \
     conda install -c conda-forge -c bioconda whatshap=1.0 -y && \
+    conda install -c conda-forge xz zlib bzip2 -y && \
+    conda install -c conda-forge automake curl -y && \
     rm -rf /opt/conda/pkgs/* && \
-    rm -rf /root/.cache/pip
+    rm -rf /root/.cache/pip && \
+    echo "source activate clair3" > ~/.bashrc
 
 COPY . .
 
@@ -48,4 +52,6 @@ RUN cd /opt/bin/preprocess/realign && \
     wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz -P /opt/models && \
     tar -zxvf /opt/models/clair3_models.tar.gz -C /opt/models && \
     rm /opt/models/clair3_models.tar.gz && \
-    echo "source activate clair3" > ~/.bashrc
+    cd /opt/bin && \
+    make PREFIX=/opt/conda/envs/clair3 PYTHON=/opt/conda/envs/clair3/bin/python && \
+    rm -rf /opt/bin/samtools-* /opt/bin/longphase-*
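
To sanity-check the revised Dockerfile end-to-end, a minimal build-and-run sketch (the `clair3:v0.1-r11` tag is illustrative, not an official image name):

```bash
# Build from the repository root, then print the caller's usage from inside the image.
docker build -f ./Dockerfile -t clair3:v0.1-r11 .
docker run -it clair3:v0.1-r11 /opt/bin/run_clair3.sh --help
```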
59 changes: 59 additions & 0 deletions Makefile
@@ -0,0 +1,59 @@
OS := $(shell uname)
ARCH := $(shell arch)

PYTHON ?= python3

all : libhts.a longphase libclair3.so
clean : clean_htslib clean_longphase clean_libclair3

SAMVER = 1.10
LPVER = 1.0
GCC ?= gcc
GXX ?= g++
PREFIX ?= ${CONDA_PREFIX}
LDFLAGS = -L ${PREFIX}/lib
CFLAGS = -fpic -std=c99 -O3 -I ${PREFIX}/include -L ${PREFIX}/lib
CPPFLAGS = -std=c++11 -Wall -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib
LP_CPPFLAGS = -std=c++11 -Wall -g -O3 -I ${PREFIX}/include -L ${PREFIX}/lib -Wl,-rpath=${PREFIX}/lib

samtools-$(SAMVER)/Makefile:
	curl -L -o samtools-${SAMVER}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMVER}/samtools-${SAMVER}.tar.bz2; \
	tar -xjf samtools-${SAMVER}.tar.bz2; \
	rm samtools-${SAMVER}.tar.bz2

libhts.a: samtools-$(SAMVER)/Makefile
	# this is required only to add -fpic so we can build the Python module
	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
	cd samtools-${SAMVER}/htslib-${SAMVER}; CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}" ./configure; make CFLAGS="${CFLAGS}" LDFLAGS="${LDFLAGS}"
	cp samtools-${SAMVER}/htslib-${SAMVER}/$@ $@


longphase-$(LPVER)/Makefile:
	curl -L -o longphase-${LPVER}.tar.gz https://github.com/twolinin/longphase/archive/refs/tags/v${LPVER}.tar.gz; \
	tar -zxvf longphase-${LPVER}.tar.gz; \
	rm longphase-${LPVER}.tar.gz

longphase: longphase-$(LPVER)/Makefile
	@echo "\x1b[1;33mMaking $(@F)\x1b[0m"
	cd longphase-${LPVER}; autoreconf -i; CPPFLAGS="${CPPFLAGS}" ./configure; make CC=${GCC} CXX=${GXX} CPPFLAGS="${CPPFLAGS}"
	cp longphase-${LPVER}/$@ $@


libclair3.so: samtools-${SAMVER}/htslib-${SAMVER}
	${PYTHON} build.py


.PHONY: clean_htslib
clean_htslib:
	cd samtools-${SAMVER} && make clean || exit 0
	cd samtools-${SAMVER}/htslib-${SAMVER} && make clean || exit 0
	rm libhts.a

.PHONY: clean_longphase
clean_longphase:
	cd longphase-${LPVER} && make clean || exit 0
	rm longphase

.PHONY: clean_libclair3
clean_libclair3:
	rm libclair3.*
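
A minimal sketch of driving this Makefile by hand, mirroring the Dockerfile invocation above; it assumes the clair3 conda environment supplies the htslib build dependencies under `${CONDA_PREFIX}`:

```bash
# Build libhts.a, longphase, and libclair3.so against the conda env's headers/libs.
source activate clair3
make PREFIX=${CONDA_PREFIX} PYTHON=${CONDA_PREFIX}/bin/python
# Targets can also be rebuilt individually, e.g. after editing the C sources:
make libclair3.so
```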
19 changes: 17 additions & 2 deletions README.md
@@ -56,6 +56,8 @@ A short preprint describing Clair3's algorithms and results is at [bioRxiv](http
 
 ## Latest Updates
 
+*v0.1-r11 (Apr 4)* : 1. Variant calling is ~2.5x faster than `v0.1-r10`, tested with ONT Q20 data, with feature generation in both pileup and full-alignment calling now implemented in C (co-contributors @[cjw85](https://github.com/cjw85), @[ftostevin-ont](https://github.com/ftostevin-ont), @[EpiSlim](https://github.com/EpiSlim)). 2. Added the lightning-fast [longphase](https://github.com/twolinin/longphase) as an option for phasing; enable it with the `--longphase_for_phasing` option. The new option is disabled by default to match the behavior of previous versions, but we recommend enabling it when calling human variants from ≥20x long reads. 3. Added the `--min_coverage` and `--min_mq` options ([#83](https://github.com/HKU-BAL/Clair3/issues/83)). 4. Added the `--min_contig_size` option to skip calling variants in short contigs when using a genome assembly as input. 5. Read haplotagging after phasing and before full-alignment calling is now integrated into full-alignment calling, avoiding an intermediate BAM file. 6. Added support for the `.csi` BAM index for large references ([#90](https://github.com/HKU-BAL/Clair3/issues/90)). For more speedup details, please check [Notes on r11](docs/v0.1_r11_speedup.md).
+
 *v0.1-r10 (Jan 13)* : 1. Added a new ONT Guppy5 model (`r941_prom_sup_g5014`). Click [here](docs/guppy5_20220113.md) for some benchmarking results. This `sup` model is also applicable to reads called using the `hac` and `fast` modes. The old `r941_prom_sup_g506` model, which was fine-tuned from the Guppy3,4 model, is now obsolete. 2. Added the `--var_pct_phasing` option to control the percentage of top-ranked heterozygous pileup variants used for WhatsHap phasing.
 
 *v0.1-r9 (Dec 1)* : Added the `--enable_long_indel` option to output indel variant calls >50bp ([#64](https://github.com/HKU-BAL/Clair3/issues/64)). Click [here](https://github.com/HKU-BAL/Clair3/blob/main/docs/indel_gt50_performance.md) to see more benchmarking results.
@@ -267,15 +269,20 @@ pypy3 -m pip install mpmath==1.2.1
 # install python packages in environment
 pip3 install tensorflow==2.2.0
 pip3 install tensorflow-addons==0.11.2 tables==3.6.1
-conda install -c anaconda pigz==2.4 -y
+conda install -c anaconda pigz==2.4 cffi==1.14.4 -y
 conda install -c conda-forge parallel=20191122 zstd=1.4.4 -y
 conda install -c conda-forge -c bioconda samtools=1.10 -y
 conda install -c conda-forge -c bioconda whatshap=1.0 -y
-
+conda install -c conda-forge xz zlib bzip2 automake curl -y
+
 # clone Clair3
 git clone https://github.com/HKU-BAL/Clair3.git
 cd Clair3
 
+# compile samtools, longphase, and the cffi library for the C implementation
+# after building, the longphase binary is in the `Clair3` folder
+source activate clair3 && make PREFIX=${CONDA_PREFIX}
+
 # download pre-trained models
 mkdir models
 wget http://www.bio8.cs.hku.hk/clair3/clair3_models/clair3_models.tar.gz
@@ -292,6 +299,8 @@ MODEL_NAME="[YOUR_MODEL_NAME]" # e.g. r941_prom_hac_g360+g422
   --output=${OUTPUT_DIR}               ## output path prefix
 ```
 
+
+
 ### Option 5. Docker Dockerfile
 
 This is the same as option 1 except that you are building a docker image yourself. Please refer to option 1 for usage.
@@ -358,12 +367,15 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
   --pypy=STR                    Path of pypy3, pypy3 >= 3.6 is required.
   --parallel=STR                Path of parallel, parallel >= 20191122 is required.
   --whatshap=STR                Path of whatshap, whatshap >= 1.0 is required.
+  --longphase=STR               Path of longphase, longphase >= 1.0 is required.
   --chunk_size=INT              The size of each chunk for parallel processing, default: 5Mbp.
   --pileup_only                 Use the pileup model only when calling, default: disable.
   --print_ref_calls             Show reference calls (0/0) in VCF file, default: disable.
   --include_all_ctgs            Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.
   --gvcf                        Enable GVCF output, default: disable.
   --enable_phasing              Output phased variants using whatshap, default: disable.
+  --longphase_for_phasing       Use longphase for phasing, default: enable.
+  --disable_c_impl              Disable the cffi-based C implementation of pileup and full-alignment tensor creation, default: enable.
   --remove_intermediate_dir     Remove intermediate directory, including intermediate phased BAM, pileup and full-alignment results, default: disable.
   --snp_min_af=FLOAT            Minimum SNP AF required for a candidate variant. Lowering the value might increase sensitivity a bit at the cost of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.
   --indel_min_af=FLOAT          Minimum INDEL AF required for a candidate variant. Lowering the value might increase sensitivity a bit at the cost of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.
@@ -372,6 +384,9 @@ docker run -it hkubal/clair3:latest /opt/bin/run_clair3.sh --help
   --var_pct_phasing=FLOAT       EXPERIMENTAL: Specify an expected percentage of high-quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.
   --pileup_model_prefix=STR     EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 and $prefix.index, default: pileup.
   --fa_model_prefix=STR         EXPERIMENTAL: Model prefix in full-alignment calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 and $prefix.index, default: full_alignment.
+  --min_mq=INT                  EXPERIMENTAL: If set, reads with mapping quality <$min_mq are filtered, default: 5.
+  --min_coverage=INT            EXPERIMENTAL: Minimum coverage required to call a variant, default: 2.
+  --min_contig_size=INT         EXPERIMENTAL: If set, contigs shorter than $min_contig_size are filtered, default: 0.
   --fast_mode                   EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: disable.
   --haploid_precise             EXPERIMENTAL: Enable haploid calling mode; only 1/1 is considered as a variant, default: disable.
   --haploid_sensitive           EXPERIMENTAL: Enable haploid calling mode; 0/1 and 1/1 are considered as variants, default: disable.
76 changes: 76 additions & 0 deletions build.py
@@ -0,0 +1,76 @@
import itertools
import os
import platform
from subprocess import run
from cffi import FFI

samver = "1.10"
file_directory = os.path.dirname(os.path.realpath(__file__))
htslib_dir = os.path.join(file_directory, 'samtools-{}'.format(samver), 'htslib-{}'.format(samver))

libraries = ['m', 'z', 'lzma', 'bz2', 'pthread', 'curl', 'crypto']
extra_link_args = []
library_dirs = [htslib_dir]
src_dir = os.path.join(file_directory, 'src')

extra_compile_args = ['-std=c99', '-O3']
if platform.machine() in {"aarch64", "arm64"}:
    if platform.system() == "Darwin":
        pass
    else:
        extra_compile_args.append("-march=armv8-a+simd")
else:
    extra_compile_args.append("-mtune=haswell")
libraries.append('deflate')
try:
    conda_path = os.environ['CONDA_PREFIX']
    extra_link_args = ['-Wl,-rpath={}/lib'.format(conda_path)]
except KeyError:
    print("[WARNING] Conda prefix not found, please activate the clair3 conda environment first!")

ffibuilder = FFI()
ffibuilder.set_source("libclair3",
    r"""
    #include "kvec.h"
    #include "khash.h"
    #include "levenshtein.h"
    #include "medaka_bamiter.h"
    #include "medaka_common.h"
    #include "medaka_khcounter.h"
    #include "clair3_pileup.h"
    #include "clair3_full_alignment.h"
    """,
    libraries=libraries,
    library_dirs=library_dirs,
    include_dirs=[src_dir, htslib_dir],
    sources=[
        os.path.join(src_dir, x) for x in (
            'levenshtein.c',
            'medaka_bamiter.c',
            'medaka_common.c',
            'medaka_khcounter.c',
            'clair3_pileup.c',
            'clair3_full_alignment.c')],
    extra_compile_args=extra_compile_args,
    extra_link_args=extra_link_args,
    extra_objects=['libhts.a']
)

cdef = [
    "typedef struct { ...; } bam_fset;"
    "bam_fset* create_bam_fset(char* fname);"
    "void destroy_bam_fset(bam_fset* fset);"
]
for header in ('clair3_pileup.h', 'clair3_full_alignment.h'):
    with open(os.path.join(src_dir, header), 'r') as fh:
        # remove preprocessor directives
        lines = ''.join(x for x in fh.readlines() if not x.startswith('#'))
        cdef.append(lines)

ffibuilder.cdef('\n\n'.join(cdef))


if __name__ == "__main__":
    ffibuilder.compile(verbose=True)
    run("cp {}/libclair3*.so {}/libclair3.so".format(file_directory, file_directory), shell=True)
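`build.py` is normally driven by `make libclair3.so`, but it can be run directly once `libhts.a` exists in the working directory; a sketch of building and then smoke-testing the resulting cffi module:

```bash
source activate clair3
python build.py
# The module follows cffi's out-of-line API mode, exposing `ffi` and `lib`:
python -c "from libclair3 import ffi, lib"
```
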
5 changes: 4 additions & 1 deletion clair3.py
@@ -10,6 +10,7 @@
     "CallVarBam",
     "CallVariants",
     "Train",
+    "CallVariantsFromCffi"
 ]
 
 data_preprocess_folder = [
@@ -27,7 +28,9 @@
     'UnifyRepresentation',
     'CheckEnvs',
     'SortVcf',
-    'SelectQual'
+    'SelectQual',
+    "CreateTensorPileupFromCffi",
+    "CreateTensorFullAlignmentFromCffi",
 ]
 
 post_process_scripts_folder = [
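
Each registered name is dispatched by `clair3.py` to the corresponding submodule, so the new entries become subcommands; for example (a sketch):

```bash
python clair3.py CallVariantsFromCffi --help
python clair3.py CreateTensorPileupFromCffi --help
```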
30 changes: 18 additions & 12 deletions clair3/CallVarBam.py
@@ -9,6 +9,7 @@
 from time import sleep
 from argparse import ArgumentParser, SUPPRESS
 import logging
+from platform import machine, system
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -130,20 +131,23 @@ def Run(args):
     chunk_id = CommandOption('chunk_id', args.chunk_id)
     chunk_num = CommandOption('chunk_num', args.chunk_num)
 
-    sched_getaffinity_list = list(os.sched_getaffinity(0))
-    maxCpus = len(sched_getaffinity_list)
-    if args.tensorflow_threads is None:
-        numCpus = maxCpus
+    if machine() in {"aarch64", "arm64"} or system() == "Darwin":
+        taskSet = ""
     else:
-        numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
+        sched_getaffinity_list = list(os.sched_getaffinity(0))
+        maxCpus = len(sched_getaffinity_list)
+        if args.tensorflow_threads is None:
+            numCpus = maxCpus
+        else:
+            numCpus = args.tensorflow_threads if args.tensorflow_threads < maxCpus else maxCpus
 
-    _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
+        _cpuSet = ",".join(str(x) for x in random.sample(sched_getaffinity_list, numCpus))
 
-    taskSet = "taskset -c %s" % (_cpuSet)
-    try:
-        subprocess.check_output("which %s" % ("taskset"), shell=True)
-    except:
-        taskSet = ""
+        taskSet = "taskset -c %s" % (_cpuSet)
+        try:
+            subprocess.check_output("which %s" % ("taskset"), shell=True)
+        except:
+            taskSet = ""
 
     if need_realignment:
         realign_reads_command_options = [
@@ -176,6 +180,8 @@ def Run(args):
         CommandOption('bed_fn', bed_fn),
         CommandOption('extend_bed', extend_bed),
         CommandOption('sampleName', args.sampleName),
+        CommandOption('minCoverage', args.minCoverage),
+        CommandOption('minMQ', args.minMQ),
         ctgStart,
         ctgEnd,
         chunk_id,
@@ -347,7 +353,7 @@ def main():
     parser.add_argument('--fast_mode', type=str2bool, default=False,
                         help="EXPERIMENTAL: Skip variant candidates with AF <= 0.15, default: %(default)s")
 
-    parser.add_argument('--minCoverage', type=float, default=param.min_coverage,
+    parser.add_argument('--minCoverage', type=int, default=param.min_coverage,
                         help="EXPERIMENTAL: Minimum coverage required to call a variant, default: %(default)f")
 
     parser.add_argument('--minMQ', type=int, default=param.min_mq,
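
The reworked affinity logic above only pins CPUs where Linux's `os.sched_getaffinity` and the `taskset` utility are available; a quick host check (a sketch):

```bash
# Linux prints a CPU list; on macOS/ARM the new code path skips pinning entirely.
python -c "import os; print(sorted(os.sched_getaffinity(0)))" \
  || echo "no sched_getaffinity on this platform"
which taskset || echo "taskset missing; Clair3 falls back to unpinned execution"
```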
7 changes: 5 additions & 2 deletions clair3/CallVariants.py
@@ -1,7 +1,6 @@
 import sys
 import os
 import math
-import tables
 import tensorflow as tf
 import numpy as np
 import logging
@@ -17,6 +16,7 @@
     HETERO_SNP_GT21, HETERO_SNP_LABELS, GT21_LABELS, partial_label_from, mix_two_partial_labels
 )
 import clair3.utils as utils
+import shared.param_p as param
 from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task
 from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning
 from clair3.task.variant_length import VariantLength
@@ -1114,7 +1114,8 @@ def output_with(
         chromosome, position, reference_sequence = chr_pos_seq.rstrip().split(':')
         position = int(position)
 
-        tensor_position_center = param.flankingBaseNum
+        # only store the centered reference base for the C implementation, for efficiency
+        tensor_position_center = param.flankingBaseNum if len(reference_sequence) > 1 else 0
         information_string = "P" if output_config.pileup else 'F'
 
         if type(alt_info) == np.memmap:
@@ -1527,6 +1528,7 @@ def load_mini_batch():
     if full_alignment_mode and total == 0:
         logging.info(log_error("[ERROR] No full-alignment output for file {}/{}".format(args.ctgName, args.call_fn)))
     else:
+        import tables
         dataset = tables.open_file(args.tensor_fn, 'r').root
         batch_size = param.predictBatchSize
         dataset_size = len(dataset.label)
@@ -1710,6 +1712,7 @@ def load_mini_batch():
         logging.info("Total process positions: {}".format(total))
 
     else:
+        import tables
         if not os.path.exists(args.tensor_fn):
             logging.info("skip {}, not existing chunk_id".format(args.tensor_fn))
             return