Skip to content

Commit

Permalink
Merge pull request #1 from nadegeguiglielmoni/master
Browse files Browse the repository at this point in the history
+ docopt, less required parameters
  • Loading branch information
AntoineHo authored Nov 12, 2020
2 parents d80f2e2 + 33ab33e commit 660b770
Show file tree
Hide file tree
Showing 13 changed files with 850 additions and 510 deletions.
672 changes: 162 additions & 510 deletions Hap.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ Easy haploidy and size completeness estimation.
## 1. General
This tool helps assess the haploidy and proximity to size completeness.

HapPy computes two values:
- Area Under Curve (AUC) ratio: the ratio of the AUC of the diploid peak and the AUC of the haploid peak
- Total Size Score (TSS): the estimated haploid size completeness

For more information, see:
**Overcoming uncollapsed haplotypes in long-read assemblies of non-model organisms**,
Nadège Guiglielmoni, Antoine Houtain,Alessandro Derzelle, Karine van Doninck, Jean-François Flot,
bioRxiv 2020, doi: https://doi.org/10.1101/2020.03.16.993428

### Requirements:

- sambamba
- scipy
- pandas
- numpy

```
$ python HapPy/Hap.py -h
usage: Hap.py [-h] {depth,estimate} ...
Expand Down Expand Up @@ -59,3 +75,14 @@ optional arguments:
-mp MIN_PEAK, --min-peak MIN_PEAK <INT> Minimum peak height. Default: [150000]
-p, --plot Output plots. Default: False
```

## 4. Example

Here is an example on how to use HapPy. HapPy requires a sorted BAM file as input. Here the PacBio long reads are mapped to the assembly with minimap2, and the output is sorted with samtools. The sorted BAM file is also indexed with samtools. The module depth computes the coverage histogram from the BAM file, and then the module estimate computes the AUC ratio and TSS. Here the max x value for the contaminant peak is set to 35, the max x value for the diploid peak is set to 120, and the min y for a peak is set to 150000. The expected genome size is set to 102M.

```
minimap2 -ax map-pb assembly.fasta.gz pacbio_reads.fasta.gz --secondary=no | samtools sort -o mapping_LR.map-pb.bam -T tmp.ali
samtools index mapping_LR.map-pb.bam
Hap.py depth mapping_LR.map-pb.bam happy_output
Hap.py estimate --max-contaminant 35 --max-diploid 120 --min-peak 150000 happy_output/mapping_LR.map-pb.bam.hist . 102M
```
Binary file added __pycache__/commands.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/coverage.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/estimate.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/plot.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/utils.cpython-36.pyc
Binary file not shown.
88 changes: 88 additions & 0 deletions commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Based on Rémy Greinhofer (rgreinho) tutorial on subcommands in docopt
# https://github.com/rgreinho/docopt-subcommands-example
# Also based on Cyril Mathey-Doret (cmdoret) commands.py in hicstuff
# https://github.com/koszullab/hicstuff

from docopt import docopt
import sys, os, shutil
import tempfile
from os.path import join, dirname
import coverage as happyc
import estimate as happye


class AbstractCommand:
"""Base class for the commands"""

def __init__(self, command_args, global_args):
"""Initialize the commands."""
self.args = docopt(self.__doc__, argv=command_args)
self.global_args = global_args

def execute(self):
"""Execute the commands"""
raise NotImplementedError

def check_output_path(self, path, force=False):
"""Throws error if the output file exists. Create required file tree otherwise."""
# Get complete output filename and prevent overwriting unless force is enabled
if not force and os.path.exists(path):
raise IOError("Output file already exists. Use --force to overwrite")
if dirname(path):
os.makedirs(dirname(path), exist_ok=True)


class Coverage(AbstractCommand):
"""Coverage histogram command
Compute coverage histogram for mapping file.
usage:
coverage [--threads=1] --outdir=DIR <mapping.bam>
arguments:
mapping.bam Sorted BAM file after mapping reads to the assembly.
options:
-t, --threads=INT Number of parallel threads allocated for
sambamba [default: 1].
-d, --outdir=DIR Path where the .cov and .hist files are written.
"""

def execute(self):
print("Running coverage module.")
happyc.get_cov_hist(
self.args["<mapping.bam>"], self.args["--threads"], self.args["--outdir"]
)


class Estimate(AbstractCommand):
"""Estimate command
Compute AUC ratio and TSS from coverage histogram.
usage:
estimate [--max-contaminant=INT] [--max-diploid=INT] [--min-peak=INT] --size=INT --outstats=FILE [--plot] <coverage.hist>
arguments:
coverage.hist Coverage histogram.
options:
-C, --max-contaminant=INT Maximum coverage of contaminants.
-D, --max-diploid=INT Maximum coverage of the diploid peak.
-M, --min-peak=INT Minimum peak height.
-S, --size=INT Estimated haploid genome size.
-O, --outstats=FILE Path where the AUC ratio and TSS values are written.
-p, --plot Generate histogram plot.
"""

def execute(self):

happye.estimate_haploidy(
self.args["<coverage.hist>"],
self.args["--max-contaminant"],
self.args["--max-diploid"],
self.args["--min-peak"],
self.args["--size"],
self.args["--outstats"],
)
82 changes: 82 additions & 0 deletions coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, sys
import pandas as pd
import numpy as np
from utils import *


def get_cov_hist(infile, threads: int, outdir):
"""Finds peaks and modality, then computes scores of haploidy"""

print("# Hap.py coverage")
print(
"Input alignment file:\t{0}\nOutput directory:\t{1}\nNumber of threads:\t{2}".format(
infile, outdir, threads
)
)
print(
"===============================================================================\n"
)

if os.path.isdir(outdir):
print("WARNING: Output directory already exists! {}".format(outdir))
else:
os.makedirs(outdir) # Create directory following path

# Read coverage histogram
coverage_output = os.path.join(outdir, os.path.basename(infile) + ".cov")
if not os.path.isfile(coverage_output): # In case no coverage file found
log("Starting sambamba depth...")
coverage_output = os.path.abspath(coverage_output)
dc_sambamba = {"BAM": infile, "threads": threads, "out": coverage_output}
cmd = "sambamba depth base -t {threads} -o {out} --min-coverage=0 --min-base-quality=0 {BAM}"
cmd = cmd.format(**dc_sambamba)
sambamba_returncode = run(cmd)
if sambamba_returncode == 1:
log(
"ERROR: sambamba command returned: {0}, a common problem is a missing index (.bai) file...".format(
sambamba_returncode
)
)
sys.exit(1)
else:
log(
"SKIP: Existing output coverage file found using it instead of running sambamba!"
)

# Read coverage histogram
output = os.path.join(outdir, os.path.basename(infile) + ".hist")
if not os.path.isfile(output): # In case no coverage file found
log("Reading coverage file...")
output = os.path.abspath(output)

# Read coverage
df = pd.read_csv(coverage_output, sep="\t", usecols=["REF", "POS", "COV"])
total_bases = len(df) # ~length of reference assembly
# Number of bins = number of coverage values between 0 and max found
hist, bin_edges = np.histogram(df["COV"], bins=range(0, max(df["COV"])))

total_summarized = 0
max_coverage_to_keep = 0
for freq, cov in zip(hist, bin_edges):
total_summarized += freq
if total_summarized / total_bases > 0.99:
max_coverage_to_keep = cov # Find max coverage to keep
break

# Output histogram
f = open(output, "w")
for freq, cov in zip(hist, bin_edges):
if cov == max_coverage_to_keep:
break
else:
f.write("{}\t{}\n".format(cov, freq))
f.close()

else:
log("SKIP: Existing output histogram file found!")

log("Finished!")
sys.exit(0)
Loading

0 comments on commit 660b770

Please sign in to comment.