diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index 9d5c078..55ffa4a 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: julia-version: - - '1.0' # LTS + - '1.3' # The Indexes package uses the Artifacts framework, which is first available in Julia 1.3. - '1' julia-arch: [x64, x86] os: [ubuntu-latest, windows-latest, macOS-latest] diff --git a/Project.toml b/Project.toml index f07b622..ef0623d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,23 +1,24 @@ name = "Indexes" uuid = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O’Mara "] -version = "0.1.3" +version = "0.2.0" [deps] -BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" +CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" [compat] -BGZFStreams = "0.3" BioGenerics = "0.1" +CodecBGZF = "0.1" GenomicFeatures = "2" TranscodingStreams = "0.9.5" -julia = "1" +julia = "1.3" [extras] +FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["FormatSpecimens", "Test"] diff --git a/src/Indexes.jl b/src/Indexes.jl index f0d9747..a88d028 100644 --- a/src/Indexes.jl +++ b/src/Indexes.jl @@ -10,22 +10,10 @@ module Indexes using TranscodingStreams -import BGZFStreams +import CodecBGZF import BioGenerics import GenomicFeatures: Interval -function Base.bytesavailable(stream::BGZFStreams.BGZFStream{IOStream}) - - block_index = BGZFStreams.ensure_buffered_data(stream) - if block_index == 0 - return 0 - end - block = stream.blocks[block_index] - - return length(block.position:block.size) - -end - include("chunk.jl") include("bgzfindex.jl") include("tabix.jl") diff --git a/src/bgzfindex.jl b/src/bgzfindex.jl index 
83081e6..2877244 100644 --- a/src/bgzfindex.jl +++ b/src/bgzfindex.jl @@ -1,7 +1,7 @@ # BGZF Index # ========== # -# An index type for BGZFStream. +# An index type for BGZF streams (CodecBGZF.BGZFDecompressorStream). # # The details of the internal is specified in # https://samtools.github.io/hts-specs/SAMv1.pdf. @@ -13,7 +13,7 @@ const BinIndex = Dict{UInt32,Vector{Chunk}} # linear index -const LinearIndex = Vector{BGZFStreams.VirtualOffset} +const LinearIndex = Vector{CodecBGZF.VirtualOffset} # Metadata providing a summary of the number of mappend/unmapped reads. struct PseudoBin @@ -27,7 +27,7 @@ struct PseudoBin n_unmapped::Int64 end -# Index for BGZFStream; used in BAI and Tabix index. +# Index for BGZF streams (CodecBGZF.BGZFDecompressorStream); used in BAI and Tabix index. struct BGZFIndex # indexes of contigs (chromosomes) data::Vector{Tuple{BinIndex,LinearIndex,Union{PseudoBin, Nothing}}} diff --git a/src/chunk.jl b/src/chunk.jl index 5c3e878..d46ebf4 100644 --- a/src/chunk.jl +++ b/src/chunk.jl @@ -8,11 +8,11 @@ # BGZF file chunk [.start, .stop). struct Chunk - start::BGZFStreams.VirtualOffset - stop::BGZFStreams.VirtualOffset + start::CodecBGZF.VirtualOffset + stop::CodecBGZF.VirtualOffset end -function Base.in(voffset::BGZFStreams.VirtualOffset, chunk::Chunk) +function Base.in(voffset::CodecBGZF.VirtualOffset, chunk::Chunk) return chunk.start ≤ voffset < chunk.stop end @@ -33,6 +33,6 @@ function Base.isless(chunk1::Chunk, chunk2::Chunk) return false end -function Base.seek(stream::BGZFStreams.BGZFStream, chunk::Chunk) +function Base.seek(stream::CodecBGZF.BGZFDecompressorStream, chunk::Chunk) return seek(stream, chunk.start) end diff --git a/src/overlap.jl b/src/overlap.jl index 9a69b12..a353156 100644 --- a/src/overlap.jl +++ b/src/overlap.jl @@ -47,7 +47,7 @@ function done(iter::TabixOverlapIterator, state) The `virtualoffset(source)` is not synchronized with the current reading position because data are buffered in `buffer` for parsing text. 
So we need to check not only `virtualoffset` but also `nb_available`, which returns the current buffered data size. =# - while bytesavailable(buffer) > 0 || BGZFStreams.virtualoffset(source) < chunk.stop + while bytesavailable(buffer) > 0 || CodecBGZF.VirtualOffset(source) < chunk.stop read!(iter.reader, state.record) c = icmp(state.record, iter.interval) if c == 0 # overlapping diff --git a/src/tabix.jl b/src/tabix.jl index 484514f..6854b62 100644 --- a/src/tabix.jl +++ b/src/tabix.jl @@ -108,7 +108,7 @@ end # Read a Tabix object from `input_`. function read_tabix(input_::IO) - input = BGZFStreams.BGZFStream(input_) + input = CodecBGZF.BGZFDecompressorStream(input_) # check magic bytes T = read(input, UInt8) diff --git a/test/runtests.jl b/test/runtests.jl index 8e5ed8a..3c5554a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,75 @@ using Test using Indexes +using FormatSpecimens + @testset "Indexes" begin - # TODO - # @test GenomicFeatures.Indexes.Tabix === GenomicFeatures.Indexes.Tabix + +@testset "Tabix" begin + + # BAI + path = joinpath(path_of_format("BAM"), "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam.bai") + path = joinpath(path_of_format("BAM"), "R_12h_D06.uniq.q40.bam.bai") + path = joinpath(path_of_format("BAM"), "cigar-64k.bam.bai") + + # TBI + path = joinpath(path_of_format("BED"), "ws245Genes.WBGene.bed.bgz.tbi") + path = joinpath(path_of_format("GFF3"), "TAIR10.part.gff.bgz.tbi") + + @info path + + open(path) do io + + input = Indexes.CodecBGZF.BGZFDecompressorStream(io) + @info input + + seekstart(input) + + # check magic bytes + T = read(input, UInt8) + @info T + + B = read(input, UInt8) + @info B + + I = read(input, UInt8) + @info I + + x = read(input, UInt8) + @info x + + end + + index = Indexes.Tabix(path) + +end + +@testset "BGZF" begin + +# # +# Indexes.reg2bin(-1, 0) +# +# # The BAI index format for BAM files +# +# goodfiles = filter(entry-> hastag(entry, "bai"), list_valid_specimens("BAM")) +# +# entry = goodfiles[1] 
+# +# # Get the full path of a file in the entry: +# path_bam = joinpath(path_of_format("BAM"), filename(entry)) +# path_bai = path_bam * ".bai" +# +# stream = open(path_bai) +# +# # Read magic bytes +# str = read(stream, 4) +# +# # read contents +# n_refs = read(stream, Int32) +# +# +# indexes = Indexes.read_bgzfindex(stream, n_refs) + +end + end