diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..97ea3cd --- /dev/null +++ b/Changelog.md @@ -0,0 +1,3 @@ +## Rehaul + +* [TODO: ADD STUFF] \ No newline at end of file diff --git a/Project.toml b/Project.toml index beb4170..82a028a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,10 +5,10 @@ version = "0.2.6" [deps] Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" -BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -16,12 +16,10 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" [compat] Automa = "0.7, 0.8" -BGZFStreams = "0.3" BioAlignments = "2" BioGenerics = "0.1" BioSequences = "2.0.4" GenomicFeatures = "2" -Indexes = "0.1" TranscodingStreams = "0.6, 0.7, 0.8, 0.9" julia = "1" diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..f44b824 --- /dev/null +++ b/TODO.md @@ -0,0 +1,5 @@ +## SAM + * quality_iter + * no-copy auxdata + * Display similar to BAM + * convert SAM/BAM and vice versa \ No newline at end of file diff --git a/src/XAM.jl b/src/XAM.jl index 8609747..5eb09f0 100644 --- a/src/XAM.jl +++ b/src/XAM.jl @@ -4,6 +4,7 @@ export SAM, BAM +include("common.jl") include("sam/sam.jl") include("bam/bam.jl") diff --git a/src/bam/auxdata.jl b/src/bam/auxdata.jl index 3ce0050..e73d7b2 100644 --- a/src/bam/auxdata.jl +++ b/src/bam/auxdata.jl @@ -1,8 +1,8 @@ # BAM Auxiliary Data # ================== -struct AuxData <: AbstractDict{String,Any} - data::Vector{UInt8} +struct AuxData{T <: AbstractVector{UInt8}} <: AbstractDict{String,Any} + data::T end function Base.getindex(aux::AuxData, tag::AbstractString) @@ -48,7 +48,7 @@ function 
checkauxtag(tag::AbstractString) end end -function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) +function getauxvalue(data::AbstractVector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) pos = findauxtag(data, start, stop, t1, t2) if pos == 0 throw(KeyError(String([t1, t2]))) @@ -58,7 +58,7 @@ function getauxvalue(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2:: return val end -function loadauxtype(data::Vector{UInt8}, p::Int) +function loadauxtype(data::AbstractVector{UInt8}, p::Int) function auxtype(b) return ( b == UInt8('A') ? Char : @@ -83,15 +83,15 @@ function loadauxtype(data::Vector{UInt8}, p::Int) end -function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{T}) where T +function loadauxvalue(data::AbstractVector{UInt8}, p::Int, ::Type{T}) where T return p + sizeof(T), unsafe_load(Ptr{T}(pointer(data, p))) end -function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Char}) +function loadauxvalue(data::AbstractVector{UInt8}, p::Int, ::Type{Char}) return p + 1, Char(unsafe_load(pointer(data, p))) end -function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T +function loadauxvalue(data::AbstractVector{UInt8}, p::Int, ::Type{Vector{T}}) where T n = unsafe_load(Ptr{Int32}(pointer(data, p))) p += 4 xs = Vector{T}(undef, n) @@ -99,14 +99,14 @@ function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{Vector{T}}) where T return p + n * sizeof(T), xs end -function loadauxvalue(data::Vector{UInt8}, p::Int, ::Type{String}) +function loadauxvalue(data::AbstractVector{UInt8}, p::Int, ::Type{String}) dataptr = pointer(data, p) endptr = ccall(:memchr, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), dataptr, '\0', length(data) - p + 1) q::Int = p + (endptr - dataptr) - 1 return q + 2, String(data[p:q]) end -function findauxtag(data::Vector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) +function findauxtag(data::AbstractVector{UInt8}, start::Int, stop::Int, t1::UInt8, t2::UInt8) pos = start while pos 
≤ stop && !(data[pos] == t1 && data[pos+1] == t2) @@ -123,7 +123,7 @@ end # Find the starting position of a next tag in `data` after `p`. # `(data[p], data[p+1])` is supposed to be a current tag. -function next_tag_position(data::Vector{UInt8}, p::Int) +function next_tag_position(data::AbstractVector{UInt8}, p::Int) typ = Char(data[p+2]) p += 3 diff --git a/src/bam/bam.jl b/src/bam/bam.jl index 6fd1174..9916ca1 100644 --- a/src/bam/bam.jl +++ b/src/bam/bam.jl @@ -6,8 +6,8 @@ module BAM using BioGenerics using GenomicFeatures using XAM.SAM +using CodecBGZF -import BGZFStreams import BioAlignments import Indexes import BioSequences @@ -15,7 +15,7 @@ import BioGenerics: isfilled, header import GenomicFeatures: eachoverlap - +include("../common.jl") include("bai.jl") include("auxdata.jl") include("reader.jl") diff --git a/src/bam/overlap.jl b/src/bam/overlap.jl index c6cf8ac..3f315b2 100644 --- a/src/bam/overlap.jl +++ b/src/bam/overlap.jl @@ -62,7 +62,7 @@ end function Base.iterate(iter::OverlapIterator, state) while state.chunkid ≤ lastindex(state.chunks) chunk = state.chunks[state.chunkid] - while BGZFStreams.virtualoffset(iter.reader.stream) < chunk.stop + while VirtualOffset(iter.reader.stream) < chunk.stop read!(iter.reader, state.record) c = compare_intervals(state.record, (state.refindex, iter.interval)) if c == 0 diff --git a/src/bam/reader.jl b/src/bam/reader.jl index 43e228e..bb1fac9 100644 --- a/src/bam/reader.jl +++ b/src/bam/reader.jl @@ -11,9 +11,9 @@ Create a data reader of the BAM file format. 
* `index=nothing`: filepath to a random access index (currently *bai* is supported) """ mutable struct Reader{T} <: BioGenerics.IO.AbstractReader - stream::BGZFStreams.BGZFStream{T} + stream::BGZFDecompressorStream{T} header::SAM.Header - start_offset::BGZFStreams.VirtualOffset + start_offset::VirtualOffset refseqnames::Vector{String} refseqlens::Vector{Int} index::Union{Nothing, BAI} @@ -64,7 +64,7 @@ function header(reader::Reader; fillSQ::Bool=false)::SAM.Header return header end -function Base.seek(reader::Reader, voffset::BGZFStreams.VirtualOffset) +function Base.seek(reader::Reader, voffset::CodecBGZF.VirtualOffset) seek(reader.stream, voffset) end @@ -80,7 +80,7 @@ function Base.iterate(reader::Reader, nextone = Record()) end # Initialize a BAM reader by reading the header section. -function init_bam_reader(input::BGZFStreams.BGZFStream) +function init_bam_reader(input::BGZFDecompressorStream) # magic bytes B = read(input, UInt8) A = read(input, UInt8) @@ -108,9 +108,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream) refseqlens[i] = seqlen end - voffset = isa(input.io, Base.AbstractPipe) ? - BGZFStreams.VirtualOffset(0, 0) : - BGZFStreams.virtualoffset(input) + voffset = VirtualOffset(input) return Reader( input, @@ -122,7 +120,7 @@ function init_bam_reader(input::BGZFStreams.BGZFStream) end function init_bam_reader(input::IO) - return init_bam_reader(BGZFStreams.BGZFStream(input)) + return init_bam_reader(BGZFDecompressorStream(input)) end function _read!(reader::Reader, record) diff --git a/src/bam/record.jl b/src/bam/record.jl index 61a5c65..7fc8bdd 100644 --- a/src/bam/record.jl +++ b/src/bam/record.jl @@ -25,7 +25,11 @@ mutable struct Record reader::Union{Reader, Nothing} function Record() - return new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, UInt8[]) + # An empty record is a legitimate BAM record. 
+ block_size = FIXED_FIELDS_BYTES - sizeof(Int32) + 1 + # Only the null terminator of the query name sequence + data = UInt8[0x00] + return new(block_size, -1, -1, 0x01, 0xff, 0, 0, 0x0004, 0, -1, -1, 0, data, nothing) end end @@ -77,49 +81,89 @@ function Base.copy(record::Record) end function Base.empty!(record::Record) - record.block_size = 0 - record.refid = 0 - record.pos = 0 - record.l_read_name = 0 - record.mapq = 0 - record.bin = 0 - record.flag = 0 - record.n_cigar_op = 0 - record.l_seq = 0 - record.next_refid = 0 - record.next_pos = 0 - record.tlen = 0 + block_size = FIXED_FIELDS_BYTES - sizeof(Int32) + 1 + record.block_size = block_size + record.refid = -1 + record.pos = -1 + record.l_read_name = 0x01 + record.mapq = 0xff + record.bin = 0 + record.n_cigar_op = 0 + record.flag = 0x0004 + record.l_seq = 0 + record.next_refid = -1 + record.next_pos = -1 + record.tlen = 0 + record.data[1] = 0x00 #Note: data will be overwritten and indexed using data_size. - return record end +_show(io, val) = val === nothing ? 
print(io, "") : show(io, val) + function Base.show(io::IO, record::Record) - print(io, summary(record), ':') - if isfilled(record) - println(io) - println(io, " template name: ", tempname(record)) - println(io, " flag: ", flag(record)) - println(io, " reference ID: ", refid(record)) - println(io, " position: ", position(record)) - println(io, " mapping quality: ", mappingquality(record)) - println(io, " CIGAR: ", cigar(record)) - println(io, " next reference ID: ", nextrefid(record)) - println(io, " next position: ", nextposition(record)) - println(io, " template length: ", templength(record)) - println(io, " sequence: ", sequence(record)) - # TODO: pretty print base quality - println(io, " base quality: ", quality(record)) - print(io, " auxiliary data:") + println(io, summary(record), ':') + print(io, " template name: "); _show(io, tempname(record)) + print(io, "\n flag: "); show(io, flag(record)) + print(io, "\n reference ID: "); _show(io, refid(record)) + print(io, "\n position: "); _show(io, position(record)) + print(io, "\n mapping quality: "); _show(io, mappingquality(record)) + print(io, "\n CIGAR: "); _show(io, cigar(record)) + print(io, "\n next reference ID: "); _show(io, nextrefid(record)) + print(io, "\n next position: "); _show(io, nextposition(record)) + print(io, "\n template length: "); _show(io, templength(record)) + # Sequence and base quality + seq = sequence(record) + width = displaysize(io)[2] - 21 # 21 chars used leftwards + if seq === nothing + print(io, "\n sequence: "); print(io, "") + print(io, "\n base quality: "); print(io, "") + elseif length(seq) <= width + print(io, "\n sequence: "); show(io, seq) + print(io, "\n base quality: "); print(io, quality_string(something(quality(record), fill(0xff, length(seq))))) + else + half = div(width, 2) - 1 + print(io, "\n sequence: ", seq[1:half], '…', seq[end-half:end]) + qual = something(quality(record), fill(0xff, length(seq))) + print(io, "\n base quality: ", quality_string(qual[1:half]), '…', + quality_string(qual[end-half:end])) + end + # Auxiliary fields - don't 
show if too long + print(io, "\n auxiliary data:") + n_aux_bytes = data_size(record) - auxdata_position(record) + 1 + if n_aux_bytes > 500 + print(io, "<", n_aux_bytes, " bytes auxiliary data>") + else for field in keys(auxdata(record)) - print(io, ' ', field, '=', record[field]) + print(io, ' ', field, '='); show(io, record[field]) end - else - print(io, " ") end end +#= +function Base.show(io::IO, record::Record) + print(io, summary(record), ':') + println(io) + println(io, " template name: ", tempname(record)) + println(io, " flag: ", flag(record)) + println(io, " reference ID: ", refid(record)) + println(io, " position: ", position(record)) + println(io, " mapping quality: ", mappingquality(record)) + println(io, " CIGAR: ", cigar(record)) + println(io, " next reference ID: ", nextrefid(record)) + println(io, " next position: ", nextposition(record)) + println(io, " template length: ", templength(record)) + println(io, " sequence: ", sequence(record)) + # TODO: pretty print base quality + println(io, " base quality: ", quality(record)) + print(io, " auxiliary data:") + for field in keys(auxdata(record)) + print(io, ' ', field, '=', record[field]) + end +end +=# + function Base.read!(reader::Reader, record::Record) return _read!(reader, record) end @@ -134,14 +178,9 @@ end Get the bitwise flag of `record`. """ function flag(record::Record)::UInt16 - checkfilled(record) return record.flag end -function hasflag(record::Record) - return isfilled(record) -end - """ ismapped(record::Record)::Bool @@ -182,24 +221,13 @@ The ID is 1-based (i.e. 
the first sequence is 1) and is 0 for a record without a See also: `BAM.rname` """ -function refid(record::Record)::Int - checkfilled(record) - return record.refid + 1 +function refid(record::Record) + hasrefid(record) || return nothing + return Int(record.refid + 1) end function hasrefid(record::Record) - return isfilled(record) -end - -function checked_refid(record::Record) - id = refid(record) - if id == 0 - throw(ArgumentError("record is not mapped")) - end - if !isdefined(record, :reader) - throw(ArgumentError("reader is not defined")) - end - return id + record.refid > -1 end """ @@ -209,10 +237,9 @@ Get the reference sequence name of `record`. See also: `BAM.refid` """ -function refname(record::Record)::String - checkfilled(record) - id = checked_refid(record) - return record.reader.refseqnames[id] +function refname(record::Record) + hasrefname(record) || return nothing + return record.reader.refseqnames[refid(record)] end """ @@ -220,14 +247,16 @@ end Get the length of the reference sequence this record applies to. """ -function reflen(record::Record)::Int - checkfilled(record) - id = checked_refid(record) +function reflen(record::Record) + id = refid(record) + if (id === nothing) | (record.reader === nothing) + return nothing + end return record.reader.refseqlens[id] end function hasrefname(record::Record) - return hasrefid(record) + return hasrefid(record) & (record.reader !== nothing) end """ @@ -235,13 +264,12 @@ end Get the 1-based leftmost mapping position of `record`. """ -function position(record::Record)::Int - checkfilled(record) - return record.pos + 1 +function position(record::Record) + return hasposition(record) ? Int(record.pos + 1) : nothing end function hasposition(record::Record) - return isfilled(record) + return record.pos > -1 end """ @@ -249,13 +277,13 @@ end Get the 1-based rightmost mapping position of `record`. 
""" -function rightposition(record::Record)::Int - checkfilled(record) - return Int32(position(record) + alignlength(record) - 1) +function rightposition(record::Record) + pos = position(record) + return pos === nothing ? nothing : pos + alignlength(record) - 1 end function hasrightposition(record::Record) - return isfilled(record) && ismapped(record) + return record.pos > -1 end """ @@ -263,8 +291,8 @@ end Test if the mate/next read of `record` is mapped. """ -function isnextmapped(record::Record)::Bool - return isfilled(record) && (flag(record) & SAM.FLAG_MUNMAP == 0) +function isnextmapped(record::Record) + return flag(record) & SAM.FLAG_MUNMAP == 0 end """ @@ -272,13 +300,16 @@ end Get the next/mate reference sequence ID of `record`. """ -function nextrefid(record::Record)::Int - checkfilled(record) - return record.next_refid + 1 +function nextrefid(record::Record) + ispaired = flag(record) & SAM.FLAG_PAIRED == SAM.FLAG_PAIRED + if !ispaired || record.next_refid == -1 + return nothing + end + return Int(record.next_refid + 1) end -function hasnextrefid(record::Record) - return isfilled(record) +function hasnextrefid(record::Record)::Bool + return record.next_refid > -1 end """ @@ -286,19 +317,14 @@ end Get the reference name of the mate/next read of `record`. 
""" -function nextrefname(record::Record)::String - checkfilled(record) - id = nextrefid(record) - if id == 0 - throw(ArgumentError("next record is not mapped")) - elseif !isdefined(record, :reader) - throw(ArgumentError("reader is not defined")) - end +function nextrefname(record::Record)::Union{Nothing, String} + hasnextrefname(record) || return nothing + id = record.next_refid + 1 return record.reader.refseqnames[id] end -function hasnextrefname(record::Record) - return isfilled(record) && isnextmapped(record) +function hasnextrefname(record::Record)::Bool + return (record.next_refid > -1) & (record.reader !== nothing) end """ @@ -306,13 +332,13 @@ end Get the 1-based leftmost mapping position of the next/mate read of `record`. """ -function nextposition(record::Record)::Int - checkfilled(record) +function nextposition(record::Record)::Union{Int, Nothing} + hasnextposition(record) || return nothing return record.next_pos + 1 end -function hasnextposition(record::Record) - return isfilled(record) +function hasnextposition(record::Record)::Bool + return record.next_pos > -1 end """ @@ -320,12 +346,17 @@ end Get the mapping quality of `record`. """ -function mappingquality(record::Record)::UInt8 +function mappingquality(record::Record) + hasmappingquality(record) || return nothing return record.mapq end function hasmappingquality(record::Record) - return isfilled(record) + return record.mapq < 0xff +end + +function hascigar(record::Record) + return record.n_cigar_op > 0 end """ @@ -358,7 +389,8 @@ If you have a record that stores the true cigar in a `CG:B,I` tag, but you still See also `BAM.cigar_rle`. """ -function cigar(record::Record, checkCG::Bool = true)::String +function cigar(record::Record, checkCG::Bool = true) + hascigar(record) || return nothing buf = IOBuffer() for (op, len) in zip(cigar_rle(record, checkCG)...) 
print(buf, len, convert(Char, op)) @@ -381,26 +413,27 @@ If you have a record that stores the true cigar in a `CG:B,I` tag, but you still See also `BAM.cigar`. """ function cigar_rle(record::Record, checkCG::Bool = true)::Tuple{Vector{BioAlignments.Operation},Vector{Int}} - checkfilled(record) idx, nops = cigar_position(record, checkCG) ops, lens = extract_cigar_rle(record.data, idx, nops) return ops, lens end -function extract_cigar_rle(data::Vector{UInt8}, offset, n) - ops = Vector{BioAlignments.Operation}() - lens = Vector{Int}() - for i in offset:4:offset + (n - 1) * 4 - x = unsafe_load(Ptr{UInt32}(pointer(data, i))) - op = BioAlignments.Operation(x & 0x0F) - push!(ops, op) - push!(lens, x >> 4) +function extract_cigar_rle(data::Vector{UInt8}, index, n) + ops = Vector{BioAlignments.Operation}(undef, n) + lens = Vector{Int}(undef, n) + GC.@preserve data @inbounds for i in 1:n + x = unsafe_load(Ptr{UInt32}(pointer(data, index))) + ops[i] = BioAlignments.Operation(x & 0x0F) + lens[i] = x >>> 4 + index += 4 end return ops, lens end function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} - cigaridx, nops = seqname_length(record) + 1, record.n_cigar_op + cigaridx, nops = seqname_length(record) + 2, record.n_cigar_op + # If a read has more than 65k ops, it's encoded in an AUX field, and the ops + # is set to kSmN. So first return if it's not that if !checkCG return cigaridx, nops end @@ -411,6 +444,7 @@ function cigar_position(record::Record, checkCG::Bool = true)::Tuple{Int, Int} if x != UInt32(seqlength(record) << 4 | 4) return cigaridx, nops end + # Else we go fetch it from the AUX fields. start = auxdata_position(record) stop = data_size(record) tagidx = findauxtag(record.data, start, stop, UInt8('C'), UInt8('G')) @@ -435,7 +469,6 @@ end Get the alignment of `record`. 
""" function alignment(record::Record)::BioAlignments.Alignment - checkfilled(record) if !ismapped(record) return BioAlignments.Alignment(BioAlignments.AlignmentAnchor[]) end @@ -467,13 +500,14 @@ end Get the alignment length of `record`. """ -function alignlength(record::Record)::Int - offset = seqname_length(record) - length::Int = 0 - for i in offset + 1:4:offset + n_cigar_op(record, false) * 4 - x = unsafe_load(Ptr{UInt32}(pointer(record.data, i))) +function alignlength(record::Record) + idx, nops = cigar_position(record, false) + length = 0 + data = record.data + GC.@preserve data for i in 1:nops + x = unsafe_load(Ptr{UInt32}(pointer(record.data, idx)), i) op = BioAlignments.Operation(x & 0x0F) - if BioAlignments.ismatchop(op) || BioAlignments.isdeleteop(op) + if BioAlignments.ismatchop(op) | BioAlignments.isdeleteop(op) length += x >> 4 end end @@ -485,28 +519,27 @@ end Get the query template name of `record`. """ -function tempname(record::Record)::String - checkfilled(record) - # drop the last NUL character - return unsafe_string(pointer(record.data), max(seqname_length(record) - 1, 0)) +function tempname(record::Record) + hastempname(record) || return nothing + return unsafe_string(pointer(record.data), seqname_length(record)) end function hastempname(record::Record) - return isfilled(record) + return seqname_length(record) > 0 end """ templength(record::Record)::Int -Get the template length of `record`. +Get the template length of `record`. Always returns a positive number. """ -function templength(record::Record)::Int - checkfilled(record) - return record.tlen +function templength(record::Record) + hastemplength(record) || return nothing + return abs(record.tlen) end function hastemplength(record::Record) - return isfilled(record) + return record.tlen != 0 end """ @@ -515,13 +548,14 @@ end Get the segment sequence of `record`. 
""" function sequence(record::Record) - checkfilled(record) + hassequence(record) || return nothing seqlen = seqlength(record) if seqlen == 0 return nothing end data = Vector{UInt64}(undef, cld(seqlen, 16)) - src::Ptr{UInt64} = pointer(record.data, seqname_length(record) + n_cigar_op(record, false) * 4 + 1) + index = seqname_length(record) + 1 + n_cigar_op(record, false) * 4 + 1 + src::Ptr{UInt64} = pointer(record.data, index) for i in 1:lastindex(data) # copy data flipping high and low nybble x = unsafe_load(src, i) @@ -531,7 +565,7 @@ function sequence(record::Record) end function hassequence(record::Record) - return isfilled(record) + return hasseqlength(record) end """ @@ -539,43 +573,57 @@ end Get the sequence length of `record`. """ -function seqlength(record::Record)::Int - checkfilled(record) +function seqlength(record::Record) + hasseqlength(record) || return nothing return record.l_seq % Int end function hasseqlength(record::Record) - return isfilled(record) + return !iszero(record.l_seq) end """ - quality(record::Record) + quality(record::Record)::Union{nothing, Vector{UInt8}} Get the base quality of `record`. """ function quality(record::Record) - checkfilled(record) + it = quality_iter(record) + return it === nothing ? it : collect(it) +end + +""" + quality_iter(record::Record) + +If the quality is not available, return `nothing`. Else, return an iterator +over the quality values. 
+""" +function quality_iter(record::Record) + hasseqlength(record) || return nothing seqlen = seqlength(record) - offset = seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) - return record.data[(1+offset):(seqlen+offset)] + offset = seqname_length(record) + 1 + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + data = record.data + range = offset+1:offset+seqlen + all(data[i] == 0xff for i in range) && return nothing + return (data[i] for i in range) end function hasquality(record::Record) - return isfilled(record) + return quality_iter(record) !== nothing end """ auxdata(record::Record)::BAM.AuxData -Get the auxiliary data of `record`. +Get the auxiliary data of `record`. Note that this object will be corrupted if +the record is overwritten. """ function auxdata(record::Record) - checkfilled(record) - return AuxData(record.data[auxdata_position(record):data_size(record)]) + return AuxData(@view record.data[auxdata_position(record):data_size(record)]) end function hasauxdata(record::Record) - return isfilled(record) + return auxdata_position(record) < data_size(record) end function Base.getindex(record::Record, tag::AbstractString) @@ -613,7 +661,7 @@ function BioGenerics.seqname(record::Record) end function BioGenerics.hasseqname(record::Record) - return hastempname(record) + return hastempname(record) end function BioGenerics.sequence(record::Record) @@ -646,26 +694,17 @@ end # Return the size of the `.data` field. 
function data_size(record::Record) - if isfilled(record) - return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size) - end - - return 0 - -end - -function checkfilled(record::Record) - if !isfilled(record) - throw(ArgumentError("unfilled BAM record")) - end + return record.block_size - FIXED_FIELDS_BYTES + sizeof(record.block_size) end function auxdata_position(record::Record) seqlen = seqlength(record) - return seqname_length(record) + n_cigar_op(record, false) * 4 + cld(seqlen, 2) + seqlen + 1 + seqlen = something(seqlen, 0) + # name null CIGAR DNA quals + return seqname_length(record) + 1 + 4 * n_cigar_op(record, false) + cld(seqlen, 2) + seqlen + 1 end # Return the length of the read name. function seqname_length(record::Record) - return record.l_read_name + return (record.l_read_name % Int) - 1 end diff --git a/src/bam/writer.jl b/src/bam/writer.jl index 2460b3f..eb4b68c 100644 --- a/src/bam/writer.jl +++ b/src/bam/writer.jl @@ -2,7 +2,7 @@ # ========== """ - BAM.Writer(output::BGZFStream, header::SAM.Header) + BAM.Writer(output::BGZFCompressorStream, header::SAM.Header) Create a data writer of the BAM file format. @@ -11,10 +11,10 @@ Create a data writer of the BAM file format. 
* `header`: SAM header object """ mutable struct Writer <: BioGenerics.IO.AbstractWriter - stream::BGZFStreams.BGZFStream + stream::BGZFCompressorStream end -function Writer(stream::BGZFStreams.BGZFStream, header::SAM.Header) +function Writer(stream::BGZFCompressorStream, header::SAM.Header) refseqnames = String[] refseqlens = Int[] for metainfo in findall(header, "SQ") diff --git a/src/common.jl b/src/common.jl new file mode 100644 index 0000000..e7443fc --- /dev/null +++ b/src/common.jl @@ -0,0 +1,36 @@ +function quality_string(quals::Vector{UInt8}) + characters = Vector{Char}(undef, length(quals)) + for i in eachindex(quals) + @inbounds qual = quals[i] + if qual < 10 + char = ' ' + elseif qual < 15 + char = '▁' + elseif qual < 20 + char = '▂' + elseif qual < 25 + char = '▃' + elseif qual < 30 + char = '▄' + elseif qual < 35 + char = '▆' + elseif qual < 40 + char = '▇' + elseif qual < 255 + char = '█' + else + char = '?' + end + @inbounds characters[i] = char + end + return join(characters) +end + +function compact_string(sequence, width) + if length(sequence) <= width + return string(sequence) + else + half = div(width - 1, 2) + return string(sequence[1:half]) * '…' * string(sequence[end-half:end]) + end +end diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..fbc3db0 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,9 @@ +[deps] +BioAlignments = "00701ae9-d1dc-5365-b64a-a3a3ebf5695e" +BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6" +FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" +GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +XAM = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" diff --git a/test/runtests.jl b/test/runtests.jl index beede29..b796a35 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,9 +4,9 @@ using BioGenerics using FormatSpecimens 
using GenomicFeatures using XAM +using CodecBGZF import BioAlignments: Alignment, AlignmentAnchor, OP_START, OP_MATCH, OP_DELETE -import BGZFStreams: BGZFStream import BioGenerics.Exceptions: MissingFieldException import BioSequences: @dna_str, @aa_str @@ -25,4 +25,6 @@ end include("test_sam.jl") include("test_bam.jl") -include("test_crosscheck.jl") + +# TODO: Properties are internal - rely on functions. But do make integration tests +#include("test_crosscheck.jl") diff --git a/test/test_bam.jl b/test/test_bam.jl index beb6d03..d81c216 100644 --- a/test/test_bam.jl +++ b/test/test_bam.jl @@ -35,16 +35,29 @@ @testset "Record" begin record = BAM.Record() - @test !isfilled(record) - @test repr(record) == "XAM.BAM.Record: " - @test_throws ArgumentError BAM.flag(record) + @test record == empty!(record) + @test BAM.flag(record) == 0x0004 + @test !BAM.hasrefid(record) + @test !BAM.hasrefname(record) + @test !BAM.hasposition(record) + @test !BAM.hasrightposition(record) + @test !BAM.hasnextrefid(record) + @test !BAM.hasnextrefname(record) + @test !BAM.hasnextposition(record) + @test !BAM.hasmappingquality(record) + @test !BAM.hasalignment(record) + @test !BAM.hastempname(record) + @test !BAM.hasseqname(record) + @test !BAM.hassequence(record) + @test !BAM.hasquality(record) + @test !BAM.hasauxdata(record) + @test !BAM.hasnextrefid(record) end @testset "Reader" begin reader = open(BAM.Reader, joinpath(bamdir, "ce#1.bam")) @test isa(reader, BAM.Reader) @test eltype(reader) === BAM.Record - @test startswith(repr(reader), "XAM.BAM.Reader{IOStream}:") # header h = header(reader) @@ -58,12 +71,12 @@ @test ! 
BAM.ispositivestrand(record) @test BAM.refname(record) == "CHROMOSOME_I" @test BAM.refid(record) === 1 - @test BAM.hasnextrefid(record) - @test BAM.nextrefid(record) === 0 + @test !BAM.hasnextrefid(record) + @test BAM.nextrefid(record) === nothing @test BAM.hasposition(record) === hasleftposition(record) === true @test BAM.position(record) === leftposition(record) === 2 - @test BAM.hasnextposition(record) - @test BAM.nextposition(record) === 0 + @test !BAM.hasnextposition(record) + @test BAM.nextposition(record) === nothing @test rightposition(record) == 102 @test BAM.hastempname(record) === hasseqname(record) === true @test BAM.tempname(record) == seqname(record) == "SRR065390.14978392" @@ -199,7 +212,8 @@ header_original = header(reader) - writer = BAM.Writer(BGZFStream(path, "w"), BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ")))) + hdr = BAM.header(reader, fillSQ=isempty(findall(header(reader), "SQ"))) + writer = BAM.Writer(BGZFCompressorStream(open(path, "w")), hdr) records = BAM.Record[] for record in reader