julia_assignment.jl

#!/usr/bin/env julia

# NOTE: "UMI-signal is changed from '____' to '_'!
# It is basically what 'umi_tools extract' outputs!
# Line 166 and line 727 are changed! 

# Line 405: "Name" changed to "gene_id"

Base.CoreLogging.disable_logging(Base.CoreLogging.Debug)

using BioAlignments
using BioSequences
using GenomicFeatures
using DataStructures

using CodecZlib
using HDF5

using Gadfly
import Cairo
import Compose

using ArgParse

@enum TranscriptChoice longestCDS longestTranscript
@enum AssignmentType FivePrime ThreePrime
const ProteinAttributes = Dict{String, String}("protein_id" => "protein_id",
                                                "Name" => "protein_name",
                                                "orf_classification" => "orf_classification")
const mRNAAttributes = Dict{String, String}("transcript_id" => "transcript_id",
                                            "Name" => "transcript_name")
const GeneAttributes = Dict{String, String}("gene" => "gene")

struct ReadLengthRange
    start::Union{Unsigned, Nothing}
    stop::Union{Unsigned, Nothing}
end
ReadLengthRange() = ReadLengthRange(nothing, nothing)
function ReadLengthRange(x::AbstractString)
    ranges = split(x, ':', keepempty=true)
    if length(ranges) > 2
        throw(ArgumentError("\"$x\" is not a valid read length range."))
    end
    start = length(ranges[1]) > 0 ? parse(UInt, ranges[1]) : Nothing
    stop = length(ranges[2]) > 0 ? parse(UInt, ranges[2]) : Nothing
    ReadLengthRange(start, stop)
end

function assign3prime(read::Union{SAM.Record, BAM.Record}, clip5::Unsigned)
    fmt = parentmodule(typeof(read))
    aln = fmt.alignment(read)
    ispositivestrand = fmt.flag(read) & 0x10 == 0
    seqlen = fmt.seqlength(read)
    offsets = [13, 16, 19]
    if seqlen < maximum(offsets)
        return nothing
    end

    assignments = [UInt(k) for (k, nothing) in (ispositivestrand ? seq2ref.((aln,), seqlen .- offsets .+ 1) : seq2ref.((aln,), offsets))]
    return assignments
end

function assign5prime(read::Union{SAM.Record, BAM.Record}, clip5::Unsigned)
    fmt = parentmodule(typeof(read))
    aln = fmt.alignment(read)
    ispositivestrand = fmt.flag(read) & 0x10 == 0
    seqlen = fmt.seqlength(read)
    offsets = [16, 13, 10]
    if clip5 > 0
        if ispositivestrand
            rng = 1:clip5
        else
            rng = seqlen:seqlen - clip5 + 1
        end
        for i ∈ rng
            if seq2ref(aln, i)[2] == OP_SOFT_CLIP
                offsets .+= 1
            else
                break
            end
        end
    end
    if seqlen < maximum(offsets)
        return nothing
    end

    assignments = [UInt(k) for (k, nothing) in (ispositivestrand ? seq2ref.((aln,), offsets) : seq2ref.((aln,), seqlen .-offsets .+ 1))]
    return assignments
end

@enum RibosomeSite asite psite esite

abstract type AbstractRead end

struct Read <: AbstractRead
    refname::String
    strand::Strand
    readlength::UInt16
    clippedlength::UInt16
    lmap::UInt32
    rmap::UInt32

    asite::UInt32
    psite::UInt32
    esite::UInt32
end

refname(rd::Read) = rd.refname
strand(rd::Read) = rd.strand
rdlength(rd::Read) = rd.readlength
clippedlength(rd::Read) = rd.clippedlength
lmap(rd::Read) = rd.lmap
rmap(rd::Read) = rd.rmap
function site(rd::Read, s::RibosomeSite)
    if s == asite
        rd.asite
    elseif s == psite
        rd.psite
    elseif s == esite
        rd.esite
    end
end

struct UMIRead <: AbstractRead
    umi::DNAKmer
    read::Read
end

umiseq(rd::UMIRead) = rd.umi
refname(rd::UMIRead) = refname(rd.read)
strand(rd::UMIRead) = strand(rd.read)
rdlength(rd::UMIRead) = rdlength(rd.read)
clippedlength(rd::UMIRead) = clippedlength(rd.read)
lmap(rd::UMIRead) = lmap(rd.read)
rmap(rd::UMIRead) = rmap(rd.read)
site(rd::UMIRead, s::RibosomeSite) = site(rd.read, s)

function Read(record::Union{SAM.Record, BAM.Record}, clip5::Unsigned, asstype::AssignmentType)
    fmt = parentmodule(typeof(record))
    aln = fmt.alignment(record)
    seq = fmt.sequence(record)
    seqlen = fmt.seqlength(record)
    clippedlen = seqlen
    for i ∈ 1:clip5
        if seq2ref(aln, i)[2] == OP_SOFT_CLIP
            clippedlen -= 1
        else
            break
        end
    end
    ispositivestrand = fmt.flag(record) & 0x10 == 0
    ass = asstype == FivePrime ? assign5prime(record, clip5) : assign3prime(record, clip5)
    if ass === nothing
        return nothing
    else
        return Read(fmt.refname(record),
                    ispositivestrand ? STRAND_POS : STRAND_NEG,
                    seqlen,
                    clippedlen,
                    seq2ref(aln, 1)[1],
                    seq2ref(aln, seqlen)[1],
                    ass...)
    end
end

function UMIRead(record::Union{SAM.Record, BAM.Record}, clip5::Unsigned, asstype::AssignmentType)
    fmt = parentmodule(typeof(record))
    umistr = fmt.tempname(record)
    umiseq = DNASequence(umistr[(findlast("_", umistr).stop + 1):end])

    aln = fmt.alignment(record)
    seq = fmt.sequence(record)
    seqlen = fmt.seqlength(record)
    ispositivestrand = fmt.flag(record) & 0x10 == 0
    for i ∈ 1:clip5
        if seq2ref(aln, i)[2] == OP_SOFT_CLIP
            push!(umiseq, seq[i])
        else
            break
        end
    end
    if findfirst(DNA_N, umiseq) != nothing
        return nothing
    end
    rd = Read(record, clip5, asstype)
    if rd === nothing
        return nothing
    else
        return UMIRead(DNAKmer(umiseq), rd)
    end
end

function Base.hash(x::UMIRead, h::UInt)
    hash(umiseq(x),
         hash(refname(x),
              hash(strand(x),
                   hash(rdlength(x),
                        hash(lmap(x),
                             hash(rmap(x), h))))))
end

function Base.:(==)(x::UMIRead, y::UMIRead)
    umiseq(x) == umiseq(y) &&
    refname(x) == refname(y) &&
    strand(x) == strand(y) &&
    rdlength(x) == rdlength(y) &&
    lmap(x) == lmap(y) &&
    rmap(x) == rmap(y)
end

const UMICounts = DefaultDict{String, DefaultDict{UInt32, UInt32}}
UMICounts() = UMICounts(() -> DefaultDict{UInt32, UInt32}(UInt32(0)))

function featurelength(ic::IntervalCollection{T}) where T
    len = UInt64(0)
    for i in ic
        len += rightposition(i) - leftposition(i) + 1
    end
    len
end

struct mRNA
    parent::String
    strand::GenomicFeatures.Strand
    ID::String
    chromosome::String
    transcript::Interval{GFF3.Record}
    attributes::Dict{String, Union{String, Number}}
end

mRNA(parent::String, strand::GenomicFeatures.Strand, ID::String, chromosome::String, transcript::Interval{GFF3.Record}) = mRNA(parent, strand, ID, chromosome, transcript, Dict{String, Union{String, Number}}())
mRNA(parent::String, strand::GenomicFeatures.Strand, ID::String, chromosome::String) = mRNA(parent, strand, ID, chromosome, Interval{GFF3.Record}())
Base.length(p::mRNA) = rightposition(p.transcript) - leftposition(p.transcript) + 1

struct Protein
    parent::String
    strand::GenomicFeatures.Strand
    ID::String
    chromosome::String
    CDS::IntervalCollection{GFF3.Record}
    attributes::Dict{String, Union{String, Number}}
end

Protein(transcript::String, strand::GenomicFeatures.Strand, chromosome::String, ID::String) = Protein(transcript, strand, chromosome, ID, IntervalCollection{GFF3.Record}(), Dict{String, Union{String, Number}}())
Base.length(p::Protein) = featurelength(p.CDS)
Base.push!(p::Protein, i::Interval{T}) where T = push!(p.CDS, i)

function Base.iterate(p::Protein)
    if length(p.CDS) == 0
        nothing
    else
        iterate(p, (iterate(p.CDS), (0, p.strand == STRAND_POS ? UInt64(1) : length(p))))
    end
end
function Base.iterate(p::Protein, state)
    (int, intState), (pos, cdspos) = state
    genomicPos = leftposition(int) + pos
    if genomicPos > rightposition(int)
        pos = 0
        nstate = iterate(p.CDS, intState)
        if nstate === nothing
            return nothing
        else
            int, intState = nstate
            genomicPos = leftposition(int) + pos
        end
    end
    ((genomicPos, cdspos), ((int, intState), (pos + 1, p.strand == STRAND_POS ? cdspos + 1 : cdspos - 1)))
end

struct Gene
    name::String
    ID::String
    chromosome::String
    mRNAs::Dict{String, mRNA}
    proteins::Dict{String, Protein}
    attributes::Dict{String, Union{String, Number}}
end

Gene(name::String, ID::String, chromosome::String) = Gene(name, ID, chromosome, Dict{String, mRNA}(), Dict{String, Protein}(), Dict{String, Union{String, Number}}())

const StrandPair = NamedTuple{(:fw, :rev), Tuple{DefaultDict{UInt32, UInt32, UInt32}, DefaultDict{UInt32, UInt32, UInt32}}}
StrandPair() = StrandPair((DefaultDict{UInt32, UInt32, UInt32}(UInt32(0)), DefaultDict{UInt32, UInt32, UInt32}(UInt32(0))))
strandcounts(x::StrandPair, s::Strand) = s == STRAND_POS ? x[:fw] : x[:rev]

@enum SiteCountsType all umi
abstract type AbstractSiteCounts end

struct UMISiteCounts <: AbstractSiteCounts
    all::StrandPair
    umi::StrandPair
end
UMISiteCounts() = UMISiteCounts(StrandPair(), StrandPair())
types(::Type{UMISiteCounts}) = [all, umi]
types(x::UMISiteCounts) = types(UMISiteCounts)

struct SiteCounts <: AbstractSiteCounts
    all::StrandPair
end
SiteCounts() = SiteCounts(StrandPair())
types(::Type{SiteCounts}) = [all]
types(x::SiteCounts) = types(SiteCounts)

counts(sitecounts::AbstractSiteCounts, type::SiteCountsType)::StrandPair = getfield(sitecounts, Symbol(type))

const GenomicPositions{T} = DefaultDict{String, T} where T <: AbstractSiteCounts
GenomicPositions(umis::Bool) = umis ? GenomicPositions{UMISiteCounts}(() -> UMISiteCounts()) : GenomicPositions{SiteCounts}(() -> SiteCounts())
types(p::GenomicPositions) = types(valtype(p))

struct GenomicSitePositions{T}
    asite::GenomicPositions{T}
    psite::GenomicPositions{T}
    esite::GenomicPositions{T}
end
GenomicSitePositions(umis::Bool) = GenomicSitePositions(GenomicPositions(umis), GenomicPositions(umis), GenomicPositions(umis))
function site(p::GenomicSitePositions, s::RibosomeSite)
    if s == asite
        p.asite
    elseif s == psite
        p.psite
    elseif s == esite
        p.esite
    end
end
function types(p::GenomicSitePositions)
    tps = types(p.asite)
    @assert types(p.psite) == types(p.esite) == tps
    tps
end

abstract type AbstractFootprintLengthCounts end

struct UMIFootprintLengthCounts <: AbstractFootprintLengthCounts
    all::DefaultDict{UInt8, UInt32}
    accepted::DefaultDict{UInt8, UInt32}
    acceptedumi::DefaultDict{UInt8, UInt32}
end
UMIFootprintLengthCounts() = UMIFootprintLengthCounts(DefaultDict{UInt8, UInt32}(UInt32(0)), DefaultDict{UInt8, UInt32}(UInt32(0)), DefaultDict{UInt8, UInt32}(UInt32(0)))

struct FootprintLengthCounts <: AbstractFootprintLengthCounts
    all::DefaultDict{UInt8, UInt32}
    accepted::DefaultDict{UInt8, UInt32}
end
FootprintLengthCounts() = FootprintLengthCounts(DefaultDict{UInt8, UInt32}(UInt32(0)), DefaultDict{UInt8, UInt32}(UInt32(0)))

function testGZIP(in::IOStream)
    pos = position(in)
    if eof(in)
        seekstart(in)
    end

    magicnr = read(in, 2)
    if length(magicnr) < 2
        return false
    end
    seek(in, pos)
    return magicnr[1] == 0x1f && magicnr[2] == 0x8b
end

function xread(infile::AbstractString)
    in = open(infile, "r")
    return testGZIP(in) ? GzipDecompressorStream(in) : in
end

function parseGFF3(infile::AbstractString)
    @info "reading genome annotation"
    proteins = Dict{String, Protein}()
    mRNAs = Dict{String, mRNA}()
    genes = Dict{String, Gene}()
    reader = GFF3.Reader(xread(infile))
    for record in reader
        if GFF3.featuretype(record) == "CDS"
            att = Dict(GFF3.attributes(record))
            id = haskey(att, "ID") ? att["ID"][1] : att["Name"][1]
            chrom = GFF3.seqid(record)
            if haskey(proteins, id)
                prot = proteins[id]
            else
                prot = Protein(att["Parent"][1], GFF3.strand(record), id, chrom)
                for k ∈ keys(ProteinAttributes)
                    if haskey(att, k)
                        prot.attributes[k] = att[k][1]
                    end
                end
                proteins[id] = prot
            end
            push!(prot, convert(Interval, record))
        elseif GFF3.featuretype(record) == "mRNA"
            att = Dict(GFF3.attributes(record))
            id = haskey(att, "ID") ? att["ID"][1] : att["Name"][1]
            chrom = GFF3.seqid(record)
            if haskey(mRNAs, id)
                mrna = mRNAs[id]
            else
                mrna = mRNA(att["Parent"][1], GFF3.strand(record), id, chrom, convert(Interval, record))
                for k ∈ keys(mRNAAttributes)
                    if haskey(att, k)
                        mrna.attributes[k] = att[k][1]
                    end
                end
                mRNAs[id] = mrna
            end
        elseif GFF3.featuretype(record) == "gene"
            att = Dict(GFF3.attributes(record))
            id = haskey(att, "ID") ? att["ID"][1] : att["Name"][1]
            chrom = GFF3.seqid(record)
            gene = Gene(att["gene_id"][1], id, chrom)
            for k ∈ keys(GeneAttributes)
                    if haskey(att, k)
                        gene.attributes[k] = att[k][1]
                    end
                end
            genes[id] = gene
        end
    end
    close(reader)
    for (id, mrna) ∈ mRNAs
        if haskey(genes, mrna.parent)
            genes[mrna.parent].mRNAs[id] = mrna
        end
    end
    for (id, prot) ∈ proteins
        if haskey(mRNAs, prot.parent)
            genes[mRNAs[prot.parent].parent].proteins[id] = prot
        elseif haskey(genes, prot.parent)
            genes[prot.parent].proteins[id] = prot
        end
    end
    for k ∈ [id for (id, gene) ∈ genes if isempty(gene.mRNAs) && isempty(gene.proteins)]
        delete!(genes, k)
    end

    return genes
end

function processUMIGroup(calignments::Accumulator{UMIRead, <:Unsigned}, positions::GenomicSitePositions, ucounts::UMICounts, lcounts::UMIFootprintLengthCounts)
    for (umird, count) ∈ calignments
        ucounts[refname(umird)][count] += 1
        lcounts.acceptedumi[clippedlength(umird)] += 1
        lcounts.accepted[clippedlength(umird)] += count
        for rsite ∈ instances(RibosomeSite)
            for (type, cnt) ∈ zip((all, umi), (count, 1))
                strandcounts(counts(site(positions, rsite)[refname(umird)], type), strand(umird))[site(umird, rsite)] += cnt
            end
        end
    end
    empty!(calignments.map)
    nothing
end

function readUMIAlignments(reader::Union{BAM.Reader, SAM.Reader}, clip5::Unsigned=0, lengthrange::ReadLengthRange=ReadLengthRange(), asstype::AssignmentType=FivePrime)
    hd = findall(header(reader), "HD")[1]
    hdvals = Dict(SAM.keyvalues(hd))
    if !haskey(hdvals, "SO") || hdvals["SO"] != "coordinate"
        @error "alignment file must be coordinate-sorted"
    end

    positions = GenomicSitePositions(true)
    counts = UMICounts()
    lengthcounts = UMIFootprintLengthCounts()
    calignments = Accumulator{UMIRead, UInt32}()
    cpos = UInt32(0)
    cref = ""
    record = parentmodule(typeof(reader)).Record()
    while !eof(reader)
        read!(reader, record)
        rd = UMIRead(record, clip5, asstype)
        if rd != nothing
            lengthcounts.all[clippedlength(rd)] += 1
            if lengthrange.start !== nothing && rdlength(rd) < lengthrange.start || lengthrange.stop !== nothing && rdlength(rd) > lengthrange.stop
                continue
            elseif lmap(rd) == cpos && refname(rd) == cref
                push!(calignments, rd)
            else
                processUMIGroup(calignments, positions, counts, lengthcounts)
                cref = refname(rd)
                cpos = lmap(rd)
                push!(calignments, rd)
            end
        end
    end
    processUMIGroup(calignments, positions, counts, lengthcounts)
    return positions, lengthcounts, counts
end

function readAlignments(reader::Union{BAM.Reader, SAM.Reader}, clip5::Unsigned=0, lengthrange::ReadLengthRange=ReadLengthRange(), asstype::AssignmentType=FivePrime)
    positions = GenomicSitePositions(false)
    lengthcounts = FootprintLengthCounts()
    record = parentmodule(typeof(reader)).Record()
    while !eof(reader)
        read!(reader, record)
        rd = Read(record, clip5, asstype)
        if rd != nothing
            lengthcounts.all[clippedlength(rd)] += 1
            if lengthrange.start !== nothing && rdlength(rd) < lengthrange.start || lengthrange.stop !== nothing && rdlength(rd) > lengthrange.stop
                continue
            end
            lengthcounts.accepted[rdlength(rd)] += 1
            for rsite ∈ instances(RibosomeSite)
                strandcounts(counts(site(positions, rsite)[refname(rd)], all), strand(rd))[site(rd, rsite)] += 1
            end
        end
    end
    return positions, lengthcounts, nothing
end

function readAlignments(infile::AbstractString, clip5::Unsigned, lengthrange::ReadLengthRange=ReadLengthRange(), use_umis::Bool=false, asstype::AssignmentType=FivePrime)
    @info "reading alignments"
    in = open(infile, "r")
    fmt = testGZIP(in) ? BAM : SAM
    reader = fmt.Reader(in)::Union{BAM.Reader, SAM.Reader}

    if use_umis
        positions, lengthcounts, counts = readUMIAlignments(reader, clip5, lengthrange, asstype)
    else
        positions, lengthcounts, counts = readAlignments(reader, clip5, lengthrange, asstype)
    end
    close(reader)
    return positions, lengthcounts, counts
end

function writeGenomeCoordinates(positions::GenomicSitePositions, outdir::AbstractString)
    out_counts = Dict((i, h5open(joinpath(outdir, "counts_" * string(i) * ".h5"), "w")) for i ∈ types(positions))
    for rsite ∈ instances(RibosomeSite)
        position = site(positions, rsite)
        g = Dict((i, g_create(f, string(rsite))) for (i, f) ∈ out_counts)
        for (chrom, scounts) ∈ position
            for (type, h5g) ∈ g
                counts_fw = strandcounts(counts(scounts, type), STRAND_POS)
                counts_rev = strandcounts(counts(scounts, type), STRAND_NEG)

                occupiedpos = sort(collect(union(keys(counts_fw), keys(counts_rev))))
                pos, fstrand, rstrand = Vector{UInt32}(),
                                        Vector{UInt32}(),
                                        Vector{UInt32}()
                for p ∈ occupiedpos
                    push!(pos, p)
                    push!(fstrand, p ∈ keys(counts_fw) ? counts_fw[p] : 0) # DefaultDict inserts the default value into the dict already upon retrieval,
                    push!(rstrand, p ∈ keys(counts_rev) ? counts_rev[p] : 0) # this prevents filling the Dicts up with zeroes
                end
                h5g[chrom, "compress", 9] = hcat(pos, fstrand, rstrand)
            end
        end
    end
    for f ∈ values(out_counts)
        close(f)
    end
end

function chooseLongestCDS(CDSs, genetrx)
    lengths = [(length(prot), haskey(genetrx, prot.parent) ? length(genetrx[prot.parent]) : 0, id) for (id, prot) ∈ CDSs]
    sort!(lengths, rev=true)
    [CDSs[id] for (nothing, nothing, id) ∈ lengths]
end

function chooseLongestTranscript(CDSs, genetrx)
    lengths = [(haskey(genetrx, prot.parent) ? length(genetrx[prot.parent]) : 0, length(prot), id) for (id, prot) ∈ CDSs]
    sort!(lengths, rev=true)
    [CDSs[id] for (nothing, nothing, id) ∈ lengths]
end

function aggregateAlignments(positions::GenomicSitePositions, genes::AbstractDict{<:AbstractString, Gene}, outdir::AbstractString; trxChoice = longestCDS)
    @info "assigning counts to genes"
    for rsite ∈ instances(RibosomeSite)
        position = site(positions, rsite)
        ptypes = types(position)
        out = Dict((type, h5open(joinpath(outdir, "cds_" * string(type) * "_" * string(rsite) * ".h5"), "w")) for type ∈ ptypes)
        pos = Vector{UInt32}()
        cds_counts = Dict((type, Vector{UInt32}()) for type ∈ ptypes)
        for gene ∈ values(genes)
            if trxChoice == longestCDS
                proteins = chooseLongestCDS(gene.proteins, gene.mRNAs)
            elseif trxChoice == longestTranscript
                proteins = chooseLongestTranscript(gene.proteins, gene.mRNAs)
            end

            for protein ∈ proteins
                if haskey(position, protein.chromosome)
                    gcounts = position[protein.chromosome]
                    for (genomicCoord, cdsCoord) ∈ protein
                        have_counts = false
                        for type ∈ ptypes
                            scounts = strandcounts(counts(gcounts, type), protein.strand)
                            if genomicCoord ∈ keys(scounts)
                                push!(cds_counts[type], scounts[genomicCoord])
                                have_counts = true
                            end
                        end
                        if have_counts
                            push!(pos, cdsCoord)
                        end
                    end
                    if length(pos) > 0
                        for (type, h5out) ∈ out
                            arr = hcat(pos, cds_counts[type])
                            dset = d_create(h5out, gene.name, arr)[1]
                            dset[:,:] = arr
                            attrs(dset)["chromosome"] = protein.chromosome
                            attrs(dset)["cds_length"] = length(protein)
                            attrs(dset)["strand"] = string(protein.strand)

                            for (k, a) ∈ protein.attributes
                                attrs(dset)[ProteinAttributes[k]] = a
                            end
                            if  haskey(gene.mRNAs, protein.parent)
                                for (k, a) ∈ gene.mRNAs[protein.parent].attributes
                                    attrs(dset)[mRNAAttributes[k]] = a
                                end
                            end
                            for (k, a) ∈ gene.attributes
                                attrs(dset)[GeneAttributes[k]] = a
                            end
                        end
                    end
                    empty!(pos)
                    for cnt ∈ values(cds_counts)
                        empty!(cnt)
                    end
                    break
                end
            end
        end
        for f ∈ values(out)
            close(f)
        end
    end
    nothing
end

function aggregateAlignments(positions::GenomicSitePositions, gfffile::AbstractString, outdir::AbstractString; trxChoice = longestCDS)
    genes = parseGFF3(gfffile)
    aggregateAlignments(positions, genes, outdir, trxChoice=trxChoice)
end

function plotLengthCounts(counts::AbstractDict{<:Unsigned, <:Unsigned}, out::Cairo.CairoSurface, title::Union{Nothing, AbstractString})
    c = Cairo.CairoContext(out)
    s = Compose.CAIROSURFACE(out, c)
    xrange = extrema(keys(counts))
    Compose.draw(s, plot(x=collect(keys(counts)),
                         y=collect(values(counts)),
                         Geom.bar,
                         Guide.title(title),
                         Guide.xlabel("read length / nucleotides"),
                         Guide.ylabel("count"),
                         Scale.x_continuous(minticks=4),
                         Coord.cartesian(xmin=xrange[1], xmax=xrange[2])))
    Cairo.show_page(c)
end

function plotLengthCounts(counts::UMIFootprintLengthCounts, outdir::AbstractString)
    out =  Cairo.CairoPDFSurface(joinpath(outdir, "footprint_length_distribution.pdf"), 5*72, 5*72)
    plotLengthCounts(counts.all, out, "all reads")
    plotLengthCounts(counts.accepted, out, "accepted reads")
    plotLengthCounts(counts.acceptedumi, out, "accepted, UMI-collapsed reads")
    Cairo.finish(out)
    nothing
end

function plotLengthCounts(counts::FootprintLengthCounts, outdir::AbstractString)
    out =  Cairo.CairoPDFSurface(joinpath(outdir, "footprint_length_distribution.pdf"), 5*72, 5*72)
    plotLengthCounts(counts.all, out, "all reads")
    plotLengthCounts(counts.accepted, out, "accepted reads")
    Cairo.finish(out)
    nothing
end

function plotUMICounts(counts::UMICounts, outdir::AbstractString)
    @info "plotting reads/UMI distributions"

    out =  Cairo.CairoPDFSurface(joinpath(outdir, "reads_per_umi.pdf"), 5*72, 5*72)
    for (chromosome, cnt) ∈ counts
        c = Cairo.CairoContext(out)
        s = Compose.CAIROSURFACE(out, c)
        Compose.draw(s, plot(x=collect(keys(cnt)),
                             y=collect(values(cnt)),
                             Geom.bar,
                             Scale.y_log10,
                             Guide.title(chromosome),
                             Guide.xlabel("reads / UMI"),
                             Guide.ylabel("count")))
        Cairo.show_page(c)
    end
    Cairo.finish(out)
    nothing
end

function Base.show(io::IO, rng::ReadLengthRange)
    print(io, rng.start === nothing ? "" : Int(rng.start))
    print(io, ':')
    print(io, rng.stop === nothing ? "" : Int(rng.stop))
end

function ArgParse.parse_item(::Type{TranscriptChoice}, x::AbstractString)
    validOptions = instances(TranscriptChoice)
    opt = findfirst(string.(validOptions) .== x)
    if opt === nothing
        throw(ArgumentError("invalid transcript choice \"" * x * "\""))
    else
        validOptions[opt]
    end
end

function ArgParse.parse_item(::Type{AssignmentType}, x::AbstractString)
    if x == "5"
        FivePrime
    elseif x == "3"
        ThreePrime
    else
        throw(ArgumentError("invalid assignment type \"" * x * "\""))
    end
end

function ArgParse.parse_item(::Type{ReadLengthRange}, x::AbstractString)
    ReadLengthRange(x)
end

s = ArgParseSettings()
@add_arg_table s begin
    "--gff", "-g"
        help = "Path to genome annotation in GFF3 format (may be GZIP-compressed)."
        arg_type = String
        required = true
    "--clip5", "-c"
        help = "Number of bases at the 5' end that may be generated by untemplated addition."
        arg_type = UInt8
        default = UInt8(0)
        required = false
    "--use_umis", "-u"
        help = "Use UMI information to discard PCR duplicates. This option assumes that the UMI is stored as the last part of the read name, separated by '_' from the actual read ID. INFILE must be coordinate-sorted."
        action = :store_true
    "--length_range", "-l"
        help = "Valid read lengths. Must be a range expression, e.g. \"20:50\" will only include reads that are between 20 and 50 bases long (after 5' clipping), whereas \"20:\" will include all reads that are at least 20 bases long."
        arg_type = ReadLengthRange
        default = ReadLengthRange()
        required = false
    "--transcript_choice", "-t"
        help = "How to choose the reported transcript for CDS assignments. Valid choices are " * join(string.(instances(TranscriptChoice)), " ,", ", or ")
        arg_type = TranscriptChoice
        default = longestCDS
        required = false
    "--assignment_type", "-a"
        help = "Ribosome site assignment type. Valid choices are 5 for 5'-assignnment or 3 for 3'-assignment."
        arg_type = AssignmentType
        default = FivePrime
        required = false
    "--outdir", "-o"
        help = "output directory"
        arg_type = String
        required = true
    "INFILE"
        help = "path to alignment file in SAM or BAM format"
        arg_type = String
        required = true
end
args = parse_args(s, as_symbols=true)

mkpath(args[:outdir])

positions, lengthCounts, umiCounts = readAlignments(args[:INFILE], args[:clip5], args[:length_range], args[:use_umis], args[:assignment_type])

plotLengthCounts(lengthCounts, args[:outdir])

if args[:use_umis]
    plotUMICounts(umiCounts, args[:outdir])
end

writeGenomeCoordinates(positions, args[:outdir])
aggregateAlignments(positions, args[:gff], args[:outdir], trxChoice=args[:transcript_choice])