diff --git a/README.md b/README.md index 61726d5..7dee1ff 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # mHi-C: robust leveraging of multi-mapping reads in Hi-C analysis -Zheng, Ye, Ferhat Ay, and Sunduz Keles. "mHi-C: robust leveraging of multi-mapping reads in Hi-C analysis." bioRxiv (2018): 301705. +Ye Zheng, Ferhat Ay, and Sunduz Keles. "mHi-C: robust leveraging of multi-mapping reads in Hi-C analysis." bioRxiv (2018): 301705. The pipeline is developed in Keles Research Group in University of Wisconsin - Madison and please contact Ye Zheng (yezheng@stat.wisc.edu) for any question and suggestion. @@ -13,7 +13,7 @@ mHi-C is short for **m**ulti-mapping strategy for **Hi-C** data in order to make ### Step 0 - Pipeline caller [mhic_step0-6.sh] Caller for all the steps in mHi-C pipeline, starting from alignment to multi-reads alignment probability assignment. This is a demo script to run multiple steps at once. Parameters in the script should be customize it for you own use. -#### 0.1 Usage +#### 0.0 Usage ``` bash mhic_step0-6.sh @@ -56,6 +56,9 @@ cutsite="AAGCTAGCTT" # for HindIII seqLength=25 resolution=40000 +## compile cutsite to trim chimeric reads +g++ -std=c++0x -o $bin/cutsite_trimming_mHiC $bin/cutsite_trimming_mHiC.cpp + bash s1_bwaAlignment.sh "$name" "$ref" "$bwaDir" "$samtoolsDir" "$fastqDir" "$resultsDir/s1" "$bin" 8 "$cutsite" "$seqLength" "$resultsDir/mHiC.summary_w${resolution}_s1" ``` @@ -64,7 +67,7 @@ bash s1_bwaAlignment.sh "$name" "$ref" "$bwaDir" "$samtoolsDir" "$fastqDir" "$re In step 1, two ends (_1.fastq and _2.fastq) are aligned separetely to the reference genome which can be paired by read ID. Thus paired-end reads can be formed and each paired-end read represent one interaction. #### 2.0 Requirements -- python3 (>= 3.6) +- python (>= 3.6) - numpy (>= 1.13.1) - scipy (>= 0.19.1) - pysam (>= 0.12.0) @@ -88,7 +91,7 @@ name="IMR90_rep1" resultsDir="/projects/IMR90" resolution=40000 -python3 s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${name}_2.sam -o ${resultsDir}/s2/${name}.sam -sf $resultsDir/mHiC.summary_w${resolution}_s2 +python s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${name}_2.sam -o ${resultsDir}/s2/${name}.sam -sf $resultsDir/mHiC.summary_w${resolution}_s2 ``` @@ -96,7 +99,7 @@ python3 s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${ This step is to ensure valid read pairs are passed on to downstream analysis while excluding dangling end, self circle, religation, too short-range interactions as well as invalid alignment that are far away from restriction enzyme cutting sites. Read pairs in each category are summarized. #### 3.0 Requirements -- python3 (>= 3.6) +- python (>= 3.6) - numpy (>= 1.13.1) - scipy (>= 0.19.1) - pysam (>= 0.12.0) @@ -152,7 +155,7 @@ chr1 39255 43602 HIC_chr1_11 0 + Remove the PCR duplicates and bin the genome by fixed window size. #### 4.0 Requirements -- python3 (>= 3.6) +- python (>= 3.6) - numpy (>= 1.13.1) #### 4.1 Arguments @@ -180,6 +183,7 @@ validI="${resultsDir}/s4/w${resolution}/${name}.validPairs" mappFile="${bin}/human-hg19.HindIII.w${resolution}" minMap=0.5 #min mappability threshold minCount=1 #min contact counts allowed +maxIter=150 bash s4_bin.sh "$validP" "$validI" "$bin" "$mappFile" "$minMap" "$minCount" "$maxIter" "$resultsDir/mHiC.summary_w${resolution}_s4" ``` @@ -209,7 +213,7 @@ chr1 620000 10 0.0029 0.38705 Build the prior for mHi-C model using uni-reads only. #### 5.0 Requirements -- python3 (>= 3.6) +- python (>= 3.6) - numpy (>= 1.13.1) - scipy (>= 0.19.1) - sklearn (>= 0.19.1) @@ -241,7 +245,7 @@ python s5_prior.py -f $validI.binPair.Marginal -i $validI.binPairCount.uni.after In this step, allocation probabilities are assigned to each multi-mapping reads at each potential alignment position. s6_em_cython.pyx will be called to accelerate computation process. #### 6.0 Requirements -- python3 (>= 3.6) +- python (>= 3.6) - pyximport #### 6.1 Arguments diff --git a/bin/ICE-with-sparseMatrix.py b/bin/ICE-with-sparseMatrix.py index 2947435..654e165 100644 --- a/bin/ICE-with-sparseMatrix.py +++ b/bin/ICE-with-sparseMatrix.py @@ -1,3 +1,5 @@ + + #!/usr/bin/env python ''' Created on Feb 18, 2013 diff --git a/mhic_step0-6.sh b/mhic_step0-6.sh index a986364..84b60f3 100644 --- a/mhic_step0-6.sh +++ b/mhic_step0-6.sh @@ -8,112 +8,119 @@ Script to call each step of mHi-C April 2016 ''' - -## Step 0 - Download raw data - Example shown here. +## ************************************************ +## step 0 - Download raw data - Example shown here. +## ************************************************ echo "Start step 0 - downloading!" id="SRR1658591" -sraDir=$(pwd)/sratoolkit.2.8.2-1-centos_linux64 -path=$(pwd)/raw +sraDir="/projects/sratoolkit.2.8.2-1-centos_linux64" +path="/projects/fastqFiles" mkdir -p $path ## tar -zxvf sratoolkit.tar.gz wget -r ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX764/SRX764954/SRR1658591/SRR1658591.sra -O $path/$id.sra $sraDir/bin/fastq-dump -F --split-files $path/$id.sra -O $path - -## Step 1-3: Can be run in parallel. - -name=$1 #Can be replicate, e.g "rep1" -resultsDir=$(pwd)/$name -ref=$(pwd)/ReferenceGenome/hg19.fasta -fastqDir=$(pwd)/Sequences # Given -cutsite="GATCGATC" # for MboI +## step 1-3: Can be run in parallel. + +## ****************** +## step 1: Alignment +## ****************** +name="IMR90_rep1" +ref="/projects/ReferenceGenome/hg19.fasta" +bwaDir="/projects/Softwares/bwa-0.5.9" +samtoolsDir="/projects/Softwares/samtools-1.3" +fastqDir="/projects/fastqFiles" +resultsDir="/projects/IMR90" +bin="/projects/bin" +cutsite="AAGCTAGCTT" # for HindIII seqLength=25 -resolution=5000 -validP="${resultsDir}/s3/w${resolution}/${name}.validPairs" -validI="${resultsDir}/s4/w${resolution}/${name}.validPairs" -refrag="MboI_resfrag_hg19.bed" #restriction fragment file -minMap=0.5 -minCount=1 -lowerBound=20000 #$((resolution * 2)) -refragL=50 #$((seqLength * 2)) -refragU=500 +resolution=40000 +## compile cutsite to trim chimeric reads g++ -std=c++0x -o $bin/cutsite_trimming_mHiC $bin/cutsite_trimming_mHiC.cpp -echo "Prepare Output Directory and Summary File!" - echo "Start step 1 - alignment!" -bash s1_bwaAlignment.sh $name $ref $bwaDir $samtoolsDir $fastqDir $resultsDir $bin 1 $cutsite $seqLength $resultsDir/mHiC.summary_w${resolution} +bash s1_bwaAlignment.sh "$name" "$ref" "$bwaDir" "$samtoolsDir" "$fastqDir" "$resultsDir/s1" "$bin" 8 "$cutsite" "$seqLength" "$resultsDir/mHiC.summary_w${resolution}_s1" + + +## ************************** +## step 2: Read ends pairing +## ************************** +name="IMR90_rep1" +resultsDir="/projects/IMR90" +resolution=40000 echo "Start step 2 - joining read ends!" -python s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${name}_2.sam -o ${resultsDir}/s2/${name}.sam -sf $resultsDir/mHiC.summary_w${resolution} +python s2_joinEnd.py -r1 ${resultsDir}/s1/${name}_1.sam -r2 ${resultsDir}/s1/${name}_2.sam -o ${resultsDir}/s2/${name}.sam -sf $resultsDir/mHiC.summary_w${resolution}_s2 + + +## ********************************* +## step 3: Valid fragment filtering +## ********************************* +name="IMR90_rep1" +resultsDir="/projects/IMR90" +refrag="HindIII_resfrag_hg19.bed" #restriction fragment file +resolution=40000 +lowerBound=$((resolution * 2)) +refragL=50 #$((seqLength * 2)) +refragU=500 echo "Start step 3 - categorize read pairs!" -python s3_categorizePairs.py -f ${bin}/$refrag -r ${resultsDir}/s2/${name}.sam -o ${resultsDir}/s3/w$resolution -l $refragL -u $refragU -d $lowerBound -m "window" -b $resolution -sf $resultsDir/mHiC.summary_w${resolution} +python s3_categorizePairs.py -f ${bin}/${refrag} -r ${resultsDir}/s2/${name}.sam -o ${resultsDir}/s3 -l $refragL -u $refragU -d $lowerBound -m "window" -b $resolution -sf $resultsDir/mHiC.summary_w${resolution}_s3 ## In case, chrM is not needed in downstream analysis # awk -v OFS="\t" '$2 != "chrM" && $7!="chrM" {print $0}' $validP >$validP.noChrM # rm -rf $validP # mv $validP.noChrM $validP -## Step 4 - Remove duplicates and binning. Also ICE normalization on uni-reads are process in preparation for step 5 prior building. - -echo "Start step 4 - duplicates removal and binning!" -maxIter=150 -bin=$(pwd)/bin - -cellLine="GM12878" -resolution=5000 -validP="${resultsDir}/s4/w${resolution}/${name}.validPairs" +## *************************************** +## step 4 - Remove duplicates and binning. +## *************************************** +name="IMR90_rep1" +resultsDir="/projects/IMR90" +resolution=40000 +bin="/projects/bin" +validP="${resultsDir}/s3/w${resolution}/${name}.validPairs" validI="${resultsDir}/s4/w${resolution}/${name}.validPairs" -refrag="MboI_resfrag_hg19.bed" #restriction fragment +mappFile="${bin}/human-hg19.HindIII.w${resolution}" minMap=0.5 #min mappability threshold minCount=1 #min contact counts allowed -splineBin=200 # fit-hi-c spline bin number -refragL=50 -refragU=500 - -mappability="null" - - -if [ ! -d "$resultsDir/s4/w${resolution}" ]; then - mkdir -p "$resultsDir/s4/w${resolution}" -fi - +maxIter=150 -awk -v OFS="\t" '{print $1, $2, $6, $7, $11}' $validP.UNI >$validP.UNI.binPair +echo "Start step 4 - duplicates removal and binning!" +bash s4_bin.sh "$validP" "$validI" "$bin" "$mappFile" "$minMap" "$minCount" "$maxIter" "$resultsDir/mHiC.summary_w${resolution}_s4" -echo "processing!" -bash s4_bin.sh "$validP" "$validI" "$bin" "$mappability" "$minMap" "$minCount" "$maxIter" "$resultsDir/mHiC.summary_s4_w${resolution}" +## ********************** +## step 5 - Build prior. +## ********************** +name="IMR90_rep1" +resultsDir="/projects/IMR90" +resolution=40000 +validI="${resultsDir}/s4/w${resolution}/${name}.validPairs" +splineBin=200 +priorName="uniPrior" -## step 5 - build prior echo "Starts step 5 - prior construction based on uni-reads only!" -if [ ! -d "${resultsDir}/s5" ]; then - mkdir -p "${resultsDir}/s5" -fi - - -python s5_prior.py -l uniPrior -f $validI.binPair.Marginal -i $validI.binPairCount.uni.afterICE -o ${resultsDir}/s5 -b $splineBin - +python s5_prior.py -f $validI.binPair.Marginal -i $validI.binPairCount.uni.afterICE -o ${resultsDir}/s5 -b $splineBin -l $priorName +## ************************************************************************************ ## step 6 - Generative model to assign probability to multi-reads potential alignments. - -echo "Starts step 6 - assign probability to multi-reads potential alignment positions !" - - -priorPath=$resultsDir/s5/s5_w${resolution}_splineResults -multiFilePath=$resultsDir/s4/${name}.validPairs.MULTI.binPair.multi -multiKeysPath=$resultsDir/s4/${name}.validPairs.MULTI.binPair.multiKeys -uniFilePath=$resultsDir/s4/${name}.validPairs.binPairCount.uni -filename=${name}.validPairs.binPair.multi +## ************************************************************************************ +name="IMR90_rep1" +resultsDir="/projects/IMR90" +resolution=40000 +prior="${resultsDir}/s5/s5_w${resolution}_splineResults" +multi="${resultsDir}/s4/${name}.validPairs.MULTI.binPair.multi" +multiKeys="$resultsDir/s4/${name}.validPairs.MULTI.binPair.multiKeys" +uni="$resultsDir/s4/${name}.validPairs.binPairCount.uni" +filename="${name}.validPairs.binPair.multi" threshold=0.5 -awk -v OFS="_" '{print $2, $3, $4, $5}' $multiFilePath | sort -u >$multiKeysPath - -python s6_em.py -p $priorPath -u $uniFilePath -m $multiFilePath -mk $multiKeysPath -t $threshold -o ${resultsDir}/s6 -f $filename - +echo "Starts step 6 - assign probability to multi-reads potential alignment positions !" +awk -v OFS="_" '{print $2, $3, $4, $5}' $multi | sort -u >$multiKeys +python s6_em.py -p $prior -u $uni -m $multi -mk $multiKeys -t $threshold -o "${resultsDir}/s6" -f $filename diff --git a/s4_bin.sh b/s4_bin.sh index afc3ff8..e586145 100644 --- a/s4_bin.sh +++ b/s4_bin.sh @@ -21,14 +21,17 @@ validI=$2 bin=$3 dir=${validI%/*} mappability=$4 -minMap=${5:-"0.5"} -minCount=${6:-"1"} -summaryFile=${7:-""} +minMap=${5:-0.5} +minCount=${6:-1} +maxIter=${7:-100} +summaryFile=${8:-""} if [ ! -d $dir/sorttmp ]; then mkdir -p $dir/sorttmp fi +awk -v OFS="\t" '{print $1, $2, $6, $7, $11}' $validP.UNI >$validP.UNI.binPair + # Remove PCR duplicates based on alignment chrom + position sort -k2,2V -k3,3n -k7,7V -k8,8n -T $dir/sorttmp $validP | awk -v OFS="\t" 'BEGIN{c1=0;c2=0;s1=0;s2=0}(c1!=$2 || c2!=$7 || s1!=$3 || s2!=$8){print;c1=$2;c2=$7;s1=$3;s2=$8}' | sort -k1,1V -k2,2V -k6,6n -k7,7V -k11,11n -T $dir/sorttmp > $validI.nodup @@ -84,10 +87,10 @@ if [ "$minCount" -gt "1" ];then awk -v minC=$minCount -v OFS="\t" '$5>=minC {print $0}' $validI.binPairCount.uni | sort -k1,1V -k2,2n -k3,3V -k4,4n >$validI.binPairCount.uni.minCount$minCount # ICE normalization with filtering low mappability regions - python $bin/ICE-with-sparseMatrix.py $validI.binPairCount.uni.minCount$minCount $bin/$mappFile l1 $validI.binPairCount.uni.afterICE $minMap + python $bin/ICE-with-sparseMatrix.py $validI.binPairCount.uni.minCount$minCount $bin/$mappFile l1 $validI.binPairCount.uni.afterICE $minMap $maxIter else # ICE normalization with filtering low mappability regions - python $bin/ICE-with-sparseMatrix.py $validI.binPairCount.uni $bin/$mappFile l1 $validI.binPairCount.uni.afterICE $minMap + python $bin/ICE-with-sparseMatrix.py $validI.binPairCount.uni $bin/$mappFile l1 $validI.binPairCount.uni.afterICE $minMap $maxIter fi # Uni bin marginal pair