-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from nadegeguiglielmoni/master
+ docopt, less required parameters
- Loading branch information
Showing
13 changed files
with
850 additions
and
510 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
# Based on Rémy Greinhofer (rgreinho) tutorial on subcommands in docopt | ||
# https://github.com/rgreinho/docopt-subcommands-example | ||
# Also based on Cyril Mathey-Doret (cmdoret) commands.py in hicstuff | ||
# https://github.com/koszullab/hicstuff | ||
|
||
from docopt import docopt | ||
import sys, os, shutil | ||
import tempfile | ||
from os.path import join, dirname | ||
import coverage as happyc | ||
import estimate as happye | ||
|
||
|
||
class AbstractCommand: | ||
"""Base class for the commands""" | ||
|
||
def __init__(self, command_args, global_args): | ||
"""Initialize the commands.""" | ||
self.args = docopt(self.__doc__, argv=command_args) | ||
self.global_args = global_args | ||
|
||
def execute(self): | ||
"""Execute the commands""" | ||
raise NotImplementedError | ||
|
||
def check_output_path(self, path, force=False): | ||
"""Throws error if the output file exists. Create required file tree otherwise.""" | ||
# Get complete output filename and prevent overwriting unless force is enabled | ||
if not force and os.path.exists(path): | ||
raise IOError("Output file already exists. Use --force to overwrite") | ||
if dirname(path): | ||
os.makedirs(dirname(path), exist_ok=True) | ||
|
||
|
||
class Coverage(AbstractCommand): | ||
"""Coverage histogram command | ||
Compute coverage histogram for mapping file. | ||
usage: | ||
coverage [--threads=1] --outdir=DIR <mapping.bam> | ||
arguments: | ||
mapping.bam Sorted BAM file after mapping reads to the assembly. | ||
options: | ||
-t, --threads=INT Number of parallel threads allocated for | ||
sambamba [default: 1]. | ||
-d, --outdir=DIR Path where the .cov and .hist files are written. | ||
""" | ||
|
||
def execute(self): | ||
print("Running coverage module.") | ||
happyc.get_cov_hist( | ||
self.args["<mapping.bam>"], self.args["--threads"], self.args["--outdir"] | ||
) | ||
|
||
|
||
class Estimate(AbstractCommand): | ||
"""Estimate command | ||
Compute AUC ratio and TSS from coverage histogram. | ||
usage: | ||
estimate [--max-contaminant=INT] [--max-diploid=INT] [--min-peak=INT] --size=INT --outstats=FILE [--plot] <coverage.hist> | ||
arguments: | ||
coverage.hist Coverage histogram. | ||
options: | ||
-C, --max-contaminant=INT Maximum coverage of contaminants. | ||
-D, --max-diploid=INT Maximum coverage of the diploid peak. | ||
-M, --min-peak=INT Minimum peak height. | ||
-S, --size=INT Estimated haploid genome size. | ||
-O, --outstats=FILE Path where the AUC ratio and TSS values are written. | ||
-p, --plot Generate histogram plot. | ||
""" | ||
|
||
def execute(self): | ||
|
||
happye.estimate_haploidy( | ||
self.args["<coverage.hist>"], | ||
self.args["--max-contaminant"], | ||
self.args["--max-diploid"], | ||
self.args["--min-peak"], | ||
self.args["--size"], | ||
self.args["--outstats"], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
|
||
import os, sys | ||
import pandas as pd | ||
import numpy as np | ||
from utils import * | ||
|
||
|
||
def get_cov_hist(infile, threads: int, outdir): | ||
"""Finds peaks and modality, then computes scores of haploidy""" | ||
|
||
print("# Hap.py coverage") | ||
print( | ||
"Input alignment file:\t{0}\nOutput directory:\t{1}\nNumber of threads:\t{2}".format( | ||
infile, outdir, threads | ||
) | ||
) | ||
print( | ||
"===============================================================================\n" | ||
) | ||
|
||
if os.path.isdir(outdir): | ||
print("WARNING: Output directory already exists! {}".format(outdir)) | ||
else: | ||
os.makedirs(outdir) # Create directory following path | ||
|
||
# Read coverage histogram | ||
coverage_output = os.path.join(outdir, os.path.basename(infile) + ".cov") | ||
if not os.path.isfile(coverage_output): # In case no coverage file found | ||
log("Starting sambamba depth...") | ||
coverage_output = os.path.abspath(coverage_output) | ||
dc_sambamba = {"BAM": infile, "threads": threads, "out": coverage_output} | ||
cmd = "sambamba depth base -t {threads} -o {out} --min-coverage=0 --min-base-quality=0 {BAM}" | ||
cmd = cmd.format(**dc_sambamba) | ||
sambamba_returncode = run(cmd) | ||
if sambamba_returncode == 1: | ||
log( | ||
"ERROR: sambamba command returned: {0}, a common problem is a missing index (.bai) file...".format( | ||
sambamba_returncode | ||
) | ||
) | ||
sys.exit(1) | ||
else: | ||
log( | ||
"SKIP: Existing output coverage file found using it instead of running sambamba!" | ||
) | ||
|
||
# Read coverage histogram | ||
output = os.path.join(outdir, os.path.basename(infile) + ".hist") | ||
if not os.path.isfile(output): # In case no coverage file found | ||
log("Reading coverage file...") | ||
output = os.path.abspath(output) | ||
|
||
# Read coverage | ||
df = pd.read_csv(coverage_output, sep="\t", usecols=["REF", "POS", "COV"]) | ||
total_bases = len(df) # ~length of reference assembly | ||
# Number of bins = number of coverage values between 0 and max found | ||
hist, bin_edges = np.histogram(df["COV"], bins=range(0, max(df["COV"]))) | ||
|
||
total_summarized = 0 | ||
max_coverage_to_keep = 0 | ||
for freq, cov in zip(hist, bin_edges): | ||
total_summarized += freq | ||
if total_summarized / total_bases > 0.99: | ||
max_coverage_to_keep = cov # Find max coverage to keep | ||
break | ||
|
||
# Output histogram | ||
f = open(output, "w") | ||
for freq, cov in zip(hist, bin_edges): | ||
if cov == max_coverage_to_keep: | ||
break | ||
else: | ||
f.write("{}\t{}\n".format(cov, freq)) | ||
f.close() | ||
|
||
else: | ||
log("SKIP: Existing output histogram file found!") | ||
|
||
log("Finished!") | ||
sys.exit(0) |
Oops, something went wrong.