From 2403731c3381fc38ec0d4dc3a868d5e46f0165bb Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Sun, 28 Feb 2021 12:50:59 +0100 Subject: [PATCH 1/6] Switch to CodecBGZF --- Project.toml | 9 ++++----- src/Indexes.jl | 6 ++---- src/bgzfindex.jl | 6 +++--- src/chunk.jl | 8 ++++---- src/overlap.jl | 2 +- src/tabix.jl | 2 +- 6 files changed, 15 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index f07b622..75e34e4 100644 --- a/Project.toml +++ b/Project.toml @@ -4,17 +4,16 @@ authors = ["Kenta Sato ", "Ben J. Ward 0 || BGZFStreams.virtualoffset(source) < chunk.stop + while bytesavailable(buffer) > 0 || CodecBGZF.virtualoffset(source) < chunk.stop read!(iter.reader, state.record) c = icmp(state.record, iter.interval) if c == 0 # overlapping diff --git a/src/tabix.jl b/src/tabix.jl index 484514f..6854b62 100644 --- a/src/tabix.jl +++ b/src/tabix.jl @@ -108,7 +108,7 @@ end # Read a Tabix object from `input_`. function read_tabix(input_::IO) - input = BGZFStreams.BGZFStream(input_) + input = CodecBGZF.BGZFDecompressorStream(input_) # check magic bytes T = read(input, UInt8) From 4883728c0c3d5aac432bbc8f2df12cdcfc7ee458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= Date: Wed, 31 Mar 2021 08:56:13 +1100 Subject: [PATCH 2/6] Amendments Base.bytesavailable is provided by CodecBGZF. Add compat entry for CodecBGZF. Keep using TranscodingStreams. --- Project.toml | 5 +++-- src/Indexes.jl | 16 +++------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/Project.toml b/Project.toml index 75e34e4..50bf298 100644 --- a/Project.toml +++ b/Project.toml @@ -4,13 +4,14 @@ authors = ["Kenta Sato ", "Ben J. Ward Date: Wed, 31 Mar 2021 09:07:47 +1100 Subject: [PATCH 3/6] Increment version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 50bf298..7df0e86 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Indexes" uuid = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O’Mara "] -version = "0.1.3" +version = "0.2.0" [deps] BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea" From a1d9eaa70d82c399cc90840643735d1ccb24425e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= Date: Wed, 31 Mar 2021 09:20:29 +1100 Subject: [PATCH 4/6] Adjust test matrix The Indexes package uses the Artifacts framework, which is first available in Julia v1.3. --- .github/workflows/UnitTests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index 9d5c078..55ffa4a 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -12,7 +12,7 @@ jobs: fail-fast: false matrix: julia-version: - - '1.0' # LTS + - '1.3' # The Indexes package uses the Artifacts framework, which is first available in Julia 1.3. - '1' julia-arch: [x64, x86] os: [ubuntu-latest, windows-latest, macOS-latest] From e417c6851314c410d5afb15ff44d36752587d70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= Date: Fri, 11 Jun 2021 23:53:21 +1000 Subject: [PATCH 5/6] Develop tests --- Project.toml | 3 +- test/runtests.jl | 72 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 7df0e86..ef0623d 100644 --- a/Project.toml +++ b/Project.toml @@ -17,7 +17,8 @@ TranscodingStreams = "0.9.5" julia = "1.3" [extras] +FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["FormatSpecimens", "Test"] diff --git a/test/runtests.jl b/test/runtests.jl index 8e5ed8a..3c5554a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,75 @@ using Test using Indexes +using FormatSpecimens + @testset "Indexes" begin - # TODO - # @test GenomicFeatures.Indexes.Tabix === GenomicFeatures.Indexes.Tabix + +@testset "Tabix" begin + + # BAI + path = joinpath(path_of_format("BAM"), "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam.bai") + path = joinpath(path_of_format("BAM"), "R_12h_D06.uniq.q40.bam.bai") + path = joinpath(path_of_format("BAM"), "cigar-64k.bam.bai") + + # TBI + path = joinpath(path_of_format("BED"), "ws245Genes.WBGene.bed.bgz.tbi") + path = joinpath(path_of_format("GFF3"), "TAIR10.part.gff.bgz.tbi") + + @info path + + open(path) do io + + input = Indexes.CodecBGZF.BGZFDecompressorStream(io) + @info input + + seekstart(input) + + # check magic bytes + T = read(input, UInt8) + @info T + + B = read(input, UInt8) + @info B + + I = read(input, UInt8) + @info I + + x = read(input, UInt8) + @info x + + end + + index = Indexes.Tabix(path) + +end + +@testset "BGZF" begin + +# # +# Indexes.reg2bin(-1, 0) +# +# # The BAI index format for BAM files +# +# goodfiles = filter(entry-> hastag(entry, "bai"), list_valid_specimens("BAM")) +# +# entry = goodfiles[1] +# +# # Get the full path of a file in the entry: +# path_bam = joinpath(path_of_format("BAM"), filename(entry)) +# path_bai = path_bam * ".bai" +# +# stream = open(path_bai) +# +# # Read magic bytes +# str = read(stream, 4) +# +# # read contents +# n_refs = read(stream, Int32) +# +# +# indexes = Indexes.read_bgzfindex(stream, n_refs) + +end + end From 0461282b667699b9be69db536021b09b4996dde2 Mon Sep 17 00:00:00 2001 From: Jialin Ma Date: Mon, 14 Jun 2021 05:54:54 +0000 Subject: [PATCH 6/6] Fix: CodecBGZF.virtualoffset -> CodecBGZF.VirtualOffset --- src/overlap.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/overlap.jl b/src/overlap.jl index 44cb9ba..a353156 100644 --- a/src/overlap.jl +++ b/src/overlap.jl @@ -47,7 +47,7 @@ function done(iter::TabixOverlapIterator, state) The `virtualoffset(source)` is not synchronized with the current reading position because data are buffered in `buffer` for parsing text. So we need to check not only `virtualoffset` but also `nb_available`, which returns the current buffered data size. =# - while bytesavailable(buffer) > 0 || CodecBGZF.virtualoffset(source) < chunk.stop + while bytesavailable(buffer) > 0 || CodecBGZF.VirtualOffset(source) < chunk.stop read!(iter.reader, state.record) c = icmp(state.record, iter.interval) if c == 0 # overlapping