Migrate to Automa v1

BioJulia · Jul 19, 2023 · 45854a4 · 45854a4 · jakobnissen · Aug 10, 2023
1 parent a98d262
commit 45854a4
Show file tree

Hide file tree

Showing 8 changed files with 53 additions and 101 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FASTX"
 uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12"
 authors = ["Sabrina J. Ward <[email protected]>", "Jakob N. Nissen <[email protected]>"]
-version = "2.1.2"
+version = "2.1.3"
 
 [weakdeps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
@@ -10,7 +10,6 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
 BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
-ScanByte = "7b38b023-a4d7-4c5e-8d43-3f3097f304eb"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
@@ -19,10 +18,9 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 BioSequencesExt = "BioSequences"
 
 [compat]
-Automa = "0.8"
+Automa = "1"
 BioGenerics = "0.1.2"
 BioSequences = "3"
-ScanByte = "0.4"
 PrecompileTools = "1"
 StringViews = "1"
 TranscodingStreams = "0.9.5"

diff --git a/src/FASTX.jl b/src/FASTX.jl
@@ -87,8 +87,6 @@ julia> seqsize(parse(FASTA.Record, ">hdr\\nαβγδϵ"))
 """
 function seqsize end
 
-const UTF8 = Union{AbstractVector{UInt8}, String, SubString{String}}
-
 # line is nothing if the reader does not have line information after random IO access.
 @noinline function throw_parser_error(
     data::Vector{UInt8},
@@ -147,7 +145,7 @@ end
 
 CONTEXT = Automa.CodeGenContext(
     generator=:goto,
-    vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte)
+    vars=Automa.Variables(;p=:p, p_end=:p_end, cs=:cs, data=:data, mem=:mem, byte=:byte)
 )
 
 include("fasta/fasta.jl")

diff --git a/src/fasta/fasta.jl b/src/fasta/fasta.jl
@@ -8,17 +8,13 @@ Module under FASTX with code related to FASTA files.
 """
 module FASTA
 
-import Automa
-import Automa.RegExp: @re_str
-import Automa.Stream: @mark, @markpos, @relpos, @abspos
+using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!, onall!
 import BioGenerics: BioGenerics
 import StringViews: StringView
 import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
+import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT, throw_parser_error
 
-# Trivial use, I only use it here because it's a dep of Automa anyway.
-# Can be removed with no big problems
-using ScanByte: memchr, ByteSet
-import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
+const Re = Automa.RegExp
 
 include("record.jl")
 include("readrecord.jl")

diff --git a/src/fasta/index.jl b/src/fasta/index.jl
@@ -100,24 +100,17 @@ function Base.show(io::IO, index::Index)
 end
 
 index_machine = let
-    re = Automa.RegExp
     newline = let
-        lf = re"\n"
-        lf.actions[:enter] = [:countline]
-        re.opt('\r') * lf
+        lf = onenter!(re"\n", :countline)
+        Re.opt('\r') * lf
     end
 
     # The specs refer to the SAM specs, which contain this regex
-    name = re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"
-    name.actions[:enter] = [:mark]
-    name.actions[:exit] = [:name]
-
-    number = re"[0-9]+"
-    number.actions[:all] = [:digit]
-    number.actions[:exit] = [:number]
+    name = onexit!(onenter!(re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*", :mark), :name)
+    number = onexit!(onall!(re"[0-9]+", :digit), :number)
 
     line = name * re"\t" * number * re"\t" * number * re"\t" * number * re"\t" * number
-    fai = re.opt(line * re.rep(newline * line)) * re.rep(newline)
+    fai = Re.opt(line * Re.rep(newline * line)) * Re.rep(newline)
     Automa.compile(fai)
 end
 
@@ -173,7 +166,6 @@ index_actions = Dict{Symbol, Expr}(
     error("Error when parsing FAI file: Unexpected byte at index $p (line $linenum col $col)")
 end
 
-ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte))
 @eval function read_faidx(data::Vector{UInt8})
     start = 0
     linenum = 1
@@ -183,15 +175,9 @@ ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te,
     linebases = 0
     linebases_num = 0
     vectors = (Int[], Int[], UInt[])
-    $(Automa.generate_init_code(ctx, index_machine))
-    p_eof = p_end = length(data)
 
-    GC.@preserve data begin
-        $(Automa.generate_exec_code(ctx, index_machine, index_actions))
-    end
+    $(Automa.generate_code(CONTEXT, index_machine, index_actions))
 
-    # TODO: Rely on Automa's new error code
-    iszero(cs) || throw_index_error(data, linenum, p)
     return Index(names, vectors...)
 end
 
@@ -303,10 +289,9 @@ returncode = quote
     return Index(names, lengths, offsets, encoded_linebases)
 end
 
-Automa.Stream.generate_reader(
+Automa.generate_reader(
     :faidx_,
     machine,
-    arguments = (),
     actions = index_fasta_actions,
     context = CONTEXT,
     initcode = initcode,

diff --git a/src/fasta/reader.jl b/src/fasta/reader.jl
@@ -155,8 +155,6 @@ function seekrecord(reader::Reader, i::Integer)
     nothing
 end
 
-const UNALLOWED_BYTESET = Val(ByteSet((UInt8('\n'), UInt8('\r'), UInt8('>'))))
-
 """
     extract(reader::Reader, name::AbstractString, range::Union{Nothing, UnitRange})
 
@@ -218,11 +216,16 @@ function extract(
     resize!(buffer, total_bases)
 
     # Now check that there are no bad bytes in our buffer
-    # Note: This ByteSet must correspond to the allowed bytes in
+    # Note: The disallowed bytes must be the complement of the bytes allowed by
     # the FASTA machine to ensure we can seek the same FASTA files we can read
-    badpos = memchr(buffer, UNALLOWED_BYTESET)
-    if badpos !== nothing
-        error("Invalid byte in FASTA sequence line: $(buffer[badpos])")
+    bad_byte = false
+    for byte in buffer
+        bad_byte |= (
+            (byte === UInt8('\r')) |
+            (byte === UInt8('\n')) |
+            (byte === UInt8('>'))
+        )
+        bad_byte && error("Invalid byte in FASTA sequence line: '>', '\\r' or '\\n'")
     end
 
     # Return the Reader to a usable state after having messed with its

diff --git a/src/fasta/readrecord.jl b/src/fasta/readrecord.jl
@@ -10,55 +10,41 @@
 #   This implies all whitespace except newlines, including trailing whitespace, is part
 #   of the sequence.
 machine = let
-    re = Automa.RegExp
-
     hspace = re"[ \t\v]"
     newline = let
-        lf = re"\n"
-        lf.actions[:enter] = [:countline]
-        re.opt('\r') * lf
+        lf = onenter!(re"\n", :countline)
+        Re.opt('\r') * lf
     end
     space = hspace | newline
 
     # Identifier: Leading non-space
-    identifier = re.rep(re.any() \ re.space())
-    identifier.actions[:enter] = [:mark]
-    # Action: Store length of identifier
-    identifier.actions[:exit]  = [:identifier]
+    identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :identifier)
 
     # Description here includes trailing whitespace.
     # This is needed for the FSM, since the description can contain arbitrary
     # whitespace, the only way to know the description ends is to encounter a newline.
     # NB: Make sure to also change the Index machine to match this if you change it.
-    description = identifier * re.opt(hspace * re"[^\r\n]*")
-
-    # Action: Store length of description and append description to record.data
-    description.actions[:exit]  = [:description]
+    description = onexit!(identifier * Re.opt(hspace * re"[^\r\n]*"), :description)
 
     # Header: '>' then description
     header = re">" * description
 
     # Sequence line: Anything except \r, \n and >
-    # Note: Must be consistent with the ByteSet in reader.jl used for seeking
-    sequence_line = re"[^\n\r>]+"
-    sequence_line.actions[:enter] = [:mark]
-    # Action: Append letters to sequence_line
-    sequence_line.actions[:exit]  = [:seqline]
+    # Note: Must be consistent with the disallowed bytes in reader.jl used for seeking
+    sequence_line = onexit!(onenter!(re"[^\n\r>]+", :mark), :seqline)
 
     # Sequence: This is intentionally very liberal with whitespace.
     # Any trailing whitespace is simply considered part of the sequence.
     # Is this bad? Maybe.
-    sequence = re.rep1(re.opt(sequence_line) * re.rep1(newline))
+    sequence = Re.rep1(Re.opt(sequence_line) * Re.rep1(newline))
 
     # We have sequence_eof to allow the final sequence to not end in whitespace
-    sequence_eof = re.opt(sequence_line) * re.rep(re.rep1(newline) * re.opt(sequence_line))
+    sequence_eof = Re.opt(sequence_line) * Re.rep(Re.rep1(newline) * Re.opt(sequence_line))
 
-    record = header * newline * sequence
-    record.actions[:exit] = [:record]
-    record_eof = header * newline * sequence_eof
-    record_eof.actions[:exit] = [:record]
+    record = onexit!(header * newline * sequence, :record)
+    record_eof = onexit!(header * newline * sequence_eof, :record)
 
-    fasta = re.rep(space) * re.rep(record) * re.opt(record_eof)
+    fasta = Re.rep(space) * Re.rep(record) * Re.opt(record_eof)
 
     Automa.compile(fasta)
 end
@@ -106,7 +92,7 @@ end
 
 returncode = :(return cs, linenum, found)
 
-Automa.Stream.generate_reader(
+Automa.generate_reader(
     :readrecord!,
     machine,
     arguments = (:(record::Record), :(state::Tuple{Int,Int})),
@@ -120,7 +106,7 @@ Automa.Stream.generate_reader(
 validator_actions = Dict(k => quote nothing end for k in keys(actions))
 validator_actions[:countline] = :(linenum += 1)
 
-Automa.Stream.generate_reader(
+Automa.generate_reader(
     :validate_fasta,
     machine,
     arguments = (),

diff --git a/src/fastq/fastq.jl b/src/fastq/fastq.jl
@@ -5,13 +5,13 @@ Module under FASTX with code related to FASTA files.
 """
 module FASTQ
 
-import Automa
-import Automa.RegExp: @re_str
-import Automa.Stream: @mark, @markpos, @relpos, @abspos
+using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!
 import BioGenerics: BioGenerics
 import StringViews: StringView
 import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
-import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
+import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT
+
+const Re = Automa.RegExp
 
 include("quality.jl")
 include("record.jl")

diff --git a/src/fastq/readrecord.jl b/src/fastq/readrecord.jl
@@ -1,48 +1,34 @@
 machine = let
-    re = Automa.RegExp
-
     hspace = re"[ \t\v]"
 
     header1 = let
-        identifier = re.rep(re.any() \ re.space())
-        identifier.actions[:enter] = [:mark]
-        identifier.actions[:exit]  = [:header1_identifier]
+        identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :header1_identifier)
 
         # Description here means "after whitespace", not whole line
-        description = re.cat(re.any() \ re.space(), re"[^\r\n]*")
-        re.cat('@', identifier, re.opt(re.cat(re.rep1(hspace), re.opt(description))))
+        description = (Re.any() \ Re.space()) * re"[^\r\n]*"
+        '@' * identifier * Re.opt(Re.rep1(hspace) * Re.opt(description))
     end
-    header1.actions[:exit] = [:header1_description]
+    onexit!(header1, :header1_description)
 
-    sequence = re"[A-z]*"
-    sequence.actions[:enter] = [:mark]
-    sequence.actions[:exit]  = [:sequence]
+    sequence = onexit!(onenter!(re"[A-z]*", :mark), :sequence)
 
     # The pattern recognized by header2 should be identical to header1
     # with the only difference being that h1 is split into identifier
     # and description
     header2 = let
-        description2 = re"[^\r\n]+"
-        description2.actions[:enter] = [:mark]
-        description2.actions[:exit]  = [:header2_description]
-        re.cat('+', re.opt(description2))
+        description2 = onexit!(onenter!(re"[^\r\n]+", :mark), :header2_description)
+        '+' * Re.opt(description2)
     end
 
-    quality = re"[!-~]*"
-    quality.actions[:enter] = [:mark]
-    quality.actions[:exit]  = [:quality]
+    quality = onexit!(onenter!(re"[!-~]*", :mark), :quality)
 
     newline = let
-        lf = re"\n"
-        lf.actions[:enter] = [:countline]
-        re.cat(re.opt('\r'), lf)
+        lf = onenter!(re"\n", :countline)
+        Re.opt('\r') * lf
     end
 
-    record = re.cat(header1, newline, sequence, newline, header2, newline, quality)
-    record.actions[:enter] = [:mark]
-    record.actions[:exit] = [:record]
-
-    fastq = re.opt(record) * re.rep(newline * record) * re.opt(newline)
+    record = onexit!(onenter!(header1 * newline * sequence * newline * header2 * newline * quality, :mark), :record)
+    fastq = Re.opt(record) * Re.rep(newline * record) * Re.opt(newline)
 
     Automa.compile(fastq)
 end
@@ -121,7 +107,7 @@ end
 
 returncode = :(return cs, linenum, found)
 
-Automa.Stream.generate_reader(
+Automa.generate_reader(
     :readrecord!,
     machine,
     arguments = (:(record::Record), :(state::Tuple{Int,Int})),
@@ -174,7 +160,7 @@ initcode = quote
     headerbuffer = Vector{UInt8}(undef, 1024)
 end
 
-Automa.Stream.generate_reader(
+Automa.generate_reader(
     :validate_fastq,
     machine,
     arguments = (),