From 2403731c3381fc38ec0d4dc3a868d5e46f0165bb Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen <jakobnybonissen@gmail.com>
Date: Sun, 28 Feb 2021 12:50:59 +0100
Subject: [PATCH 1/6] Switch to CodecBGZF

---
 Project.toml     | 9 ++++-----
 src/Indexes.jl   | 6 ++----
 src/bgzfindex.jl | 6 +++---
 src/chunk.jl     | 8 ++++----
 src/overlap.jl   | 2 +-
 src/tabix.jl     | 2 +-
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/Project.toml b/Project.toml
index f07b622..75e34e4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,17 +4,16 @@ authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <benjward@protonma
 version = "0.1.3"
 
 [deps]
-BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
-BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
+BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
+CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6"
 GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
 TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 
 [compat]
-BGZFStreams = "0.3"
-BioGenerics = "0.1"
+BioCore = "2"
 GenomicFeatures = "2"
 TranscodingStreams = "0.9.5"
-julia = "1"
+julia = "1.3"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/Indexes.jl b/src/Indexes.jl
index f0d9747..50939eb 100644
--- a/src/Indexes.jl
+++ b/src/Indexes.jl
@@ -8,10 +8,8 @@
 
 module Indexes
 
-using TranscodingStreams
-
-import BGZFStreams
-import BioGenerics
+import CodecBGZF
+import BioCore
 import GenomicFeatures: Interval
 
 function Base.bytesavailable(stream::BGZFStreams.BGZFStream{IOStream})
diff --git a/src/bgzfindex.jl b/src/bgzfindex.jl
index 83081e6..2877244 100644
--- a/src/bgzfindex.jl
+++ b/src/bgzfindex.jl
@@ -1,7 +1,7 @@
 # BGZF Index
 # ==========
 #
-# An index type for BGZFStream.
+# An index type for CodecBGZF.
 #
 # The details of the internal is specified in
 # https://samtools.github.io/hts-specs/SAMv1.pdf.
@@ -13,7 +13,7 @@
 const BinIndex = Dict{UInt32,Vector{Chunk}}
 
 # linear index
-const LinearIndex = Vector{BGZFStreams.VirtualOffset}
+const LinearIndex = Vector{CodecBGZF.VirtualOffset}
 
 # Metadata providing a summary of the number of mappend/unmapped reads.
 struct PseudoBin
@@ -27,7 +27,7 @@ struct PseudoBin
     n_unmapped::Int64
 end
 
-# Index for BGZFStream; used in BAI and Tabix index.
+# Index for CodecBGZF; used in BAI and Tabix index.
 struct BGZFIndex
     # indexes of contigs (chromosomes)
     data::Vector{Tuple{BinIndex,LinearIndex,Union{PseudoBin, Nothing}}}
diff --git a/src/chunk.jl b/src/chunk.jl
index 5c3e878..d46ebf4 100644
--- a/src/chunk.jl
+++ b/src/chunk.jl
@@ -8,11 +8,11 @@
 
 # BGZF file chunk [.start, .stop).
 struct Chunk
-    start::BGZFStreams.VirtualOffset
-    stop::BGZFStreams.VirtualOffset
+    start::CodecBGZF.VirtualOffset
+    stop::CodecBGZF.VirtualOffset
 end
 
-function Base.in(voffset::BGZFStreams.VirtualOffset, chunk::Chunk)
+function Base.in(voffset::CodecBGZF.VirtualOffset, chunk::Chunk)
     return chunk.start ≤ voffset < chunk.stop
 end
 
@@ -33,6 +33,6 @@ function Base.isless(chunk1::Chunk, chunk2::Chunk)
     return false
 end
 
-function Base.seek(stream::BGZFStreams.BGZFStream, chunk::Chunk)
+function Base.seek(stream::CodecBGZF.BGZFDecompressorStream, chunk::Chunk)
     return seek(stream, chunk.start)
 end
diff --git a/src/overlap.jl b/src/overlap.jl
index 9a69b12..44cb9ba 100644
--- a/src/overlap.jl
+++ b/src/overlap.jl
@@ -47,7 +47,7 @@ function done(iter::TabixOverlapIterator, state)
         The `virtualoffset(source)` is not synchronized with the current reading position because data are buffered in `buffer` for parsing text.
         So we need to check not only `virtualoffset` but also `nb_available`, which returns the current buffered data size.
         =#
-        while bytesavailable(buffer) > 0 || BGZFStreams.virtualoffset(source) < chunk.stop
+        while bytesavailable(buffer) > 0 || CodecBGZF.virtualoffset(source) < chunk.stop
             read!(iter.reader, state.record)
             c = icmp(state.record, iter.interval)
             if c == 0  # overlapping
diff --git a/src/tabix.jl b/src/tabix.jl
index 484514f..6854b62 100644
--- a/src/tabix.jl
+++ b/src/tabix.jl
@@ -108,7 +108,7 @@ end
 
 # Read a Tabix object from `input_`.
 function read_tabix(input_::IO)
-    input = BGZFStreams.BGZFStream(input_)
+    input = CodecBGZF.BGZFDecompressorStream(input_)
 
     # check magic bytes
     T = read(input, UInt8)

From 4883728c0c3d5aac432bbc8f2df12cdcfc7ee458 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= <Ciaran.OMara@utas.edu.au>
Date: Wed, 31 Mar 2021 08:56:13 +1100
Subject: [PATCH 2/6] Amendments

Base.bytesavailable is provided by CodecBGZF, so remove the local method.
Add compat entry for CodecBGZF.
Keep using TranscodingStreams.
Switch the dependency back from BioCore to BioGenerics.
---
 Project.toml   |  5 +++--
 src/Indexes.jl | 16 +++-------------
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/Project.toml b/Project.toml
index 75e34e4..50bf298 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,13 +4,14 @@ authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <benjward@protonma
 version = "0.1.3"
 
 [deps]
-BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
+BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
 CodecBGZF = "d9d91ef6-315d-495b-8131-db2ca24339d6"
 GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446"
 TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 
 [compat]
-BioCore = "2"
+BioGenerics = "0.1"
+CodecBGZF = "0.1"
 GenomicFeatures = "2"
 TranscodingStreams = "0.9.5"
 julia = "1.3"
diff --git a/src/Indexes.jl b/src/Indexes.jl
index 50939eb..a88d028 100644
--- a/src/Indexes.jl
+++ b/src/Indexes.jl
@@ -8,22 +8,12 @@
 
 module Indexes
 
+using TranscodingStreams
+
 import CodecBGZF
-import BioCore
+import BioGenerics
 import GenomicFeatures: Interval
 
-function Base.bytesavailable(stream::BGZFStreams.BGZFStream{IOStream})
-
-    block_index = BGZFStreams.ensure_buffered_data(stream)
-    if block_index == 0
-        return 0
-    end
-    block = stream.blocks[block_index]
-
-    return length(block.position:block.size)
-
-end
-
 include("chunk.jl")
 include("bgzfindex.jl")
 include("tabix.jl")

From f9347a1cafc0cdaaf559a817207816d0fbdbed7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= <Ciaran.OMara@utas.edu.au>
Date: Wed, 31 Mar 2021 09:07:47 +1100
Subject: [PATCH 3/6] Increment version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 50bf298..7df0e86 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Indexes"
 uuid = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
 authors = ["Kenta Sato <bicycle1885@gmail.com>", "Ben J. Ward <benjward@protonmail.com>", "Ciarán O’Mara <CiaranOMara@utas.edu.au>"]
-version = "0.1.3"
+version = "0.2.0"
 
 [deps]
 BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"

From a1d9eaa70d82c399cc90840643735d1ccb24425e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= <Ciaran.OMara@utas.edu.au>
Date: Wed, 31 Mar 2021 09:20:29 +1100
Subject: [PATCH 4/6] Adjust test matrix

The Indexes package uses the Artifacts framework, which is first available in Julia v1.3.
---
 .github/workflows/UnitTests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
index 9d5c078..55ffa4a 100644
--- a/.github/workflows/UnitTests.yml
+++ b/.github/workflows/UnitTests.yml
@@ -12,7 +12,7 @@ jobs:
       fail-fast: false
       matrix:
         julia-version:
-          - '1.0' # LTS
+          - '1.3' # The Indexes package uses the Artifacts framework, which is first available in Julia 1.3.
           - '1'
         julia-arch: [x64, x86]
         os: [ubuntu-latest, windows-latest, macOS-latest]

From e417c6851314c410d5afb15ff44d36752587d70f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ciara=CC=81n=20O=E2=80=99Mara?= <Ciaran.OMara@utas.edu.au>
Date: Fri, 11 Jun 2021 23:53:21 +1000
Subject: [PATCH 5/6] Develop tests

---
 Project.toml     |  3 +-
 test/runtests.jl | 72 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7df0e86..ef0623d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,7 +17,8 @@ TranscodingStreams = "0.9.5"
 julia = "1.3"
 
 [extras]
+FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["FormatSpecimens", "Test"]
diff --git a/test/runtests.jl b/test/runtests.jl
index 8e5ed8a..3c5554a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,7 +1,75 @@
 using Test
 using Indexes
+using FormatSpecimens
+
 
 @testset "Indexes" begin
-    # TODO
-    # @test GenomicFeatures.Indexes.Tabix === GenomicFeatures.Indexes.Tabix
+
+@testset "Tabix" begin
+
+    # BAI
+    path = joinpath(path_of_format("BAM"), "GSE25840_GSM424320_GM06985_gencode_spliced.head.bam.bai")
+    path = joinpath(path_of_format("BAM"), "R_12h_D06.uniq.q40.bam.bai")
+    path = joinpath(path_of_format("BAM"), "cigar-64k.bam.bai")
+
+    # TBI
+    path = joinpath(path_of_format("BED"), "ws245Genes.WBGene.bed.bgz.tbi")
+    path = joinpath(path_of_format("GFF3"), "TAIR10.part.gff.bgz.tbi")
+
+    @info path
+
+    open(path) do io
+
+        input = Indexes.CodecBGZF.BGZFDecompressorStream(io)
+        @info input
+
+        seekstart(input)
+
+        # check magic bytes
+        T = read(input, UInt8)
+        @info T
+
+        B = read(input, UInt8)
+        @info B
+
+        I = read(input, UInt8)
+        @info I
+
+        x = read(input, UInt8)
+        @info x
+
+    end
+
+    index = Indexes.Tabix(path)
+
+end
+
+@testset "BGZF" begin
+
+# #
+# Indexes.reg2bin(-1, 0)
+#
+# # The BAI index format for BAM files
+#
+# goodfiles = filter(entry-> hastag(entry, "bai"), list_valid_specimens("BAM"))
+#
+# entry = goodfiles[1]
+#
+# # Get the full path of a file in the entry:
+# path_bam = joinpath(path_of_format("BAM"), filename(entry))
+# path_bai = path_bam * ".bai"
+#
+# stream = open(path_bai)
+#
+# # Read magic bytes
+# str = read(stream, 4)
+#
+# # read contents
+# n_refs = read(stream, Int32)
+#
+#
+# indexes = Indexes.read_bgzfindex(stream, n_refs)
+
+end
+
 end

From 0461282b667699b9be69db536021b09b4996dde2 Mon Sep 17 00:00:00 2001
From: Jialin Ma <jma@broadinstitute.org>
Date: Mon, 14 Jun 2021 05:54:54 +0000
Subject: [PATCH 6/6] Fix: CodecBGZF.virtualoffset -> CodecBGZF.VirtualOffset

---
 src/overlap.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/overlap.jl b/src/overlap.jl
index 44cb9ba..a353156 100644
--- a/src/overlap.jl
+++ b/src/overlap.jl
@@ -47,7 +47,7 @@ function done(iter::TabixOverlapIterator, state)
         The `virtualoffset(source)` is not synchronized with the current reading position because data are buffered in `buffer` for parsing text.
         So we need to check not only `virtualoffset` but also `nb_available`, which returns the current buffered data size.
         =#
-        while bytesavailable(buffer) > 0 || CodecBGZF.virtualoffset(source) < chunk.stop
+        while bytesavailable(buffer) > 0 || CodecBGZF.VirtualOffset(source) < chunk.stop
             read!(iter.reader, state.record)
             c = icmp(state.record, iter.interval)
             if c == 0  # overlapping