Skip to content

Commit

Permalink
Migrate to Automa v1
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobnissen committed Jul 19, 2023
1 parent a98d262 commit 45854a4
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 101 deletions.
6 changes: 2 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "FASTX"
uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12"
authors = ["Sabrina J. Ward <[email protected]>", "Jakob N. Nissen <[email protected]>"]
version = "2.1.2"
version = "2.1.3"

[weakdeps]
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Expand All @@ -10,7 +10,6 @@ BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
ScanByte = "7b38b023-a4d7-4c5e-8d43-3f3097f304eb"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
Expand All @@ -19,10 +18,9 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
BioSequencesExt = "BioSequences"

[compat]
Automa = "0.8"
Automa = "1"
BioGenerics = "0.1.2"
BioSequences = "3"
ScanByte = "0.4"
PrecompileTools = "1"
StringViews = "1"
TranscodingStreams = "0.9.5"
Expand Down
4 changes: 1 addition & 3 deletions src/FASTX.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,6 @@ julia> seqsize(parse(FASTA.Record, ">hdr\\nαβγδϵ"))
"""
function seqsize end

const UTF8 = Union{AbstractVector{UInt8}, String, SubString{String}}

# line is nothing if the reader does not have line information after random IO access.
@noinline function throw_parser_error(
data::Vector{UInt8},
Expand Down Expand Up @@ -147,7 +145,7 @@ end

CONTEXT = Automa.CodeGenContext(
generator=:goto,
vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte)
vars=Automa.Variables(;p=:p, p_end=:p_end, cs=:cs, data=:data, mem=:mem, byte=:byte)
)

include("fasta/fasta.jl")
Expand Down
10 changes: 3 additions & 7 deletions src/fasta/fasta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,13 @@ Module under FASTX with code related to FASTA files.
"""
module FASTA

import Automa
import Automa.RegExp: @re_str
import Automa.Stream: @mark, @markpos, @relpos, @abspos
using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!, onall!
import BioGenerics: BioGenerics
import StringViews: StringView
import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT, throw_parser_error

# Trivial use, I only use it here because it's a dep of Automa anyway.
# Can be removed with no big problems
using ScanByte: memchr, ByteSet
import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
const Re = Automa.RegExp

include("record.jl")
include("readrecord.jl")
Expand Down
29 changes: 7 additions & 22 deletions src/fasta/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,24 +100,17 @@ function Base.show(io::IO, index::Index)
end

index_machine = let
re = Automa.RegExp
newline = let
lf = re"\n"
lf.actions[:enter] = [:countline]
re.opt('\r') * lf
lf = onenter!(re"\n", :countline)
Re.opt('\r') * lf
end

# The specs refer to the SAM specs, which contain this regex
name = re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"
name.actions[:enter] = [:mark]
name.actions[:exit] = [:name]

number = re"[0-9]+"
number.actions[:all] = [:digit]
number.actions[:exit] = [:number]
name = onexit!(onenter!(re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*", :mark), :name)
number = onexit!(onall!(re"[0-9]+", :digit), :number)

line = name * re"\t" * number * re"\t" * number * re"\t" * number * re"\t" * number
fai = re.opt(line * re.rep(newline * line)) * re.rep(newline)
fai = Re.opt(line * Re.rep(newline * line)) * Re.rep(newline)
Automa.compile(fai)
end

Expand Down Expand Up @@ -173,7 +166,6 @@ index_actions = Dict{Symbol, Expr}(
error("Error when parsing FAI file: Unexpected byte at index $p (line $linenum col $col)")
end

ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te, :cs, :data, :mem, :byte))
@eval function read_faidx(data::Vector{UInt8})
start = 0
linenum = 1
Expand All @@ -183,15 +175,9 @@ ctx = Automa.CodeGenContext(vars=Automa.Variables(:p, :p_end, :p_eof, :ts, :te,
linebases = 0
linebases_num = 0
vectors = (Int[], Int[], UInt[])
$(Automa.generate_init_code(ctx, index_machine))
p_eof = p_end = length(data)

GC.@preserve data begin
$(Automa.generate_exec_code(ctx, index_machine, index_actions))
end
$(Automa.generate_code(CONTEXT, index_machine, index_actions))

# TODO: Rely on Automa's new error code
iszero(cs) || throw_index_error(data, linenum, p)
return Index(names, vectors...)
end

Expand Down Expand Up @@ -303,10 +289,9 @@ returncode = quote
return Index(names, lengths, offsets, encoded_linebases)
end

Automa.Stream.generate_reader(
Automa.generate_reader(
:faidx_,
machine,
arguments = (),
actions = index_fasta_actions,
context = CONTEXT,
initcode = initcode,
Expand Down
15 changes: 9 additions & 6 deletions src/fasta/reader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,6 @@ function seekrecord(reader::Reader, i::Integer)
nothing
end

const UNALLOWED_BYTESET = Val(ByteSet((UInt8('\n'), UInt8('\r'), UInt8('>'))))

"""
extract(reader::Reader, name::AbstractString, range::Union{Nothing, UnitRange})
Expand Down Expand Up @@ -218,11 +216,16 @@ function extract(
resize!(buffer, total_bases)

# Now check that there are no bad bytes in our buffer
# Note: This ByteSet must correspond to the allowed bytes in
# Note: The disallowed bytes must correspond to the allowed bytes in
# the FASTA machine to ensure we can seek the same FASTA files we can read
badpos = memchr(buffer, UNALLOWED_BYTESET)
if badpos !== nothing
error("Invalid byte in FASTA sequence line: $(buffer[badpos])")
bad_byte = false
for byte in buffer
bad_byte |= (
(byte === UInt8('\r')) |
(byte === UInt8('\n')) |
(byte === UInt8('>'))
)
bad_byte && error("Invalid byte in FASTA sequence line: '>', '\\r' or '\\n'")
end

# Return the Reader to a usable state after having messed with its
Expand Down
40 changes: 13 additions & 27 deletions src/fasta/readrecord.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,55 +10,41 @@
# This implies all whitespace except newlines, including trailing whitespace, is part
# of the sequence.
machine = let
re = Automa.RegExp

hspace = re"[ \t\v]"
newline = let
lf = re"\n"
lf.actions[:enter] = [:countline]
re.opt('\r') * lf
lf = onenter!(re"\n", :countline)
Re.opt('\r') * lf
end
space = hspace | newline

# Identifier: Leading non-space
identifier = re.rep(re.any() \ re.space())
identifier.actions[:enter] = [:mark]
# Action: Store length of identifier
identifier.actions[:exit] = [:identifier]
identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :identifier)

# Description here includes trailing whitespace.
# This is needed for the FSM: since the description can contain arbitrary
# whitespace, the only way to know the description ends is to encounter a newline.
# NB: Make sure to also change the Index machine to match this if you change it.
description = identifier * re.opt(hspace * re"[^\r\n]*")

# Action: Store length of description and append description to record.data
description.actions[:exit] = [:description]
description = onexit!(identifier * Re.opt(hspace * re"[^\r\n]*"), :description)

# Header: '>' then description
header = re">" * description

# Sequence line: Anything except \r, \n and >
# Note: Must be consistent with the ByteSet in reader.jl used for seeking
sequence_line = re"[^\n\r>]+"
sequence_line.actions[:enter] = [:mark]
# Action: Append letters to sequence_line
sequence_line.actions[:exit] = [:seqline]
# Note: Must be consistent with the disallowed bytes in reader.jl used for seeking
sequence_line = onexit!(onenter!(re"[^\n\r>]+", :mark), :seqline)

# Sequence: This is intentionally very liberal with whitespace.
# Any trailing whitespace is simply considered part of the sequence.
# Is this bad? Maybe.
sequence = re.rep1(re.opt(sequence_line) * re.rep1(newline))
sequence = Re.rep1(Re.opt(sequence_line) * Re.rep1(newline))

# We have sequence_eof to allow the final sequence to not end in whitespace
sequence_eof = re.opt(sequence_line) * re.rep(re.rep1(newline) * re.opt(sequence_line))
sequence_eof = Re.opt(sequence_line) * Re.rep(Re.rep1(newline) * Re.opt(sequence_line))

record = header * newline * sequence
record.actions[:exit] = [:record]
record_eof = header * newline * sequence_eof
record_eof.actions[:exit] = [:record]
record = onexit!(header * newline * sequence, :record)
record_eof = onexit!(header * newline * sequence_eof, :record)

fasta = re.rep(space) * re.rep(record) * re.opt(record_eof)
fasta = Re.rep(space) * Re.rep(record) * Re.opt(record_eof)

Automa.compile(fasta)
end
Expand Down Expand Up @@ -106,7 +92,7 @@ end

returncode = :(return cs, linenum, found)

Automa.Stream.generate_reader(
Automa.generate_reader(
:readrecord!,
machine,
arguments = (:(record::Record), :(state::Tuple{Int,Int})),
Expand All @@ -120,7 +106,7 @@ Automa.Stream.generate_reader(
validator_actions = Dict(k => quote nothing end for k in keys(actions))
validator_actions[:countline] = :(linenum += 1)

Automa.Stream.generate_reader(
Automa.generate_reader(
:validate_fasta,
machine,
arguments = (),
Expand Down
8 changes: 4 additions & 4 deletions src/fastq/fastq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ Module under FASTX with code related to FASTQ files.
"""
module FASTQ

import Automa
import Automa.RegExp: @re_str
import Automa.Stream: @mark, @markpos, @relpos, @abspos
using Automa: Automa, @re_str, @mark, @markpos, @relpos, @abspos, onenter!, onexit!
import BioGenerics: BioGenerics
import StringViews: StringView
import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
import ..FASTX: identifier, description, sequence, UTF8, seqsize, throw_parser_error, truncate, memcmp, appendfrom!, CONTEXT
import ..FASTX: identifier, description, sequence, seqsize, truncate, memcmp, appendfrom!, CONTEXT

const Re = Automa.RegExp

include("quality.jl")
include("record.jl")
Expand Down
42 changes: 14 additions & 28 deletions src/fastq/readrecord.jl
Original file line number Diff line number Diff line change
@@ -1,48 +1,34 @@
machine = let
re = Automa.RegExp

hspace = re"[ \t\v]"

header1 = let
identifier = re.rep(re.any() \ re.space())
identifier.actions[:enter] = [:mark]
identifier.actions[:exit] = [:header1_identifier]
identifier = onexit!(onenter!(Re.rep(Re.any() \ Re.space()), :mark), :header1_identifier)

# Description here means "after whitespace", not whole line
description = re.cat(re.any() \ re.space(), re"[^\r\n]*")
re.cat('@', identifier, re.opt(re.cat(re.rep1(hspace), re.opt(description))))
description = (Re.any() \ Re.space()) * re"[^\r\n]*"
'@' * identifier * Re.opt(Re.rep1(hspace) * Re.opt(description))
end
header1.actions[:exit] = [:header1_description]
onexit!(header1, :header1_description)

sequence = re"[A-z]*"
sequence.actions[:enter] = [:mark]
sequence.actions[:exit] = [:sequence]
sequence = onexit!(onenter!(re"[A-z]*", :mark), :sequence)

# The pattern recognized by header2 should be identical to header1
# with the only difference being that h1 is split into identifier
# and description
header2 = let
description2 = re"[^\r\n]+"
description2.actions[:enter] = [:mark]
description2.actions[:exit] = [:header2_description]
re.cat('+', re.opt(description2))
description2 = onexit!(onenter!(re"[^\r\n]+", :mark), :header2_description)
'+' * Re.opt(description2)
end

quality = re"[!-~]*"
quality.actions[:enter] = [:mark]
quality.actions[:exit] = [:quality]
quality = onexit!(onenter!(re"[!-~]*", :mark), :quality)

newline = let
lf = re"\n"
lf.actions[:enter] = [:countline]
re.cat(re.opt('\r'), lf)
lf = onenter!(re"\n", :countline)
Re.opt('\r') * lf
end

record = re.cat(header1, newline, sequence, newline, header2, newline, quality)
record.actions[:enter] = [:mark]
record.actions[:exit] = [:record]

fastq = re.opt(record) * re.rep(newline * record) * re.opt(newline)
record = onexit!(onenter!(header1 * newline * sequence * newline * header2 * newline * quality, :mark), :record)
fastq = Re.opt(record) * Re.rep(newline * record) * Re.opt(newline)

Automa.compile(fastq)
end
Expand Down Expand Up @@ -121,7 +107,7 @@ end

returncode = :(return cs, linenum, found)

Automa.Stream.generate_reader(
Automa.generate_reader(
:readrecord!,
machine,
arguments = (:(record::Record), :(state::Tuple{Int,Int})),
Expand Down Expand Up @@ -174,7 +160,7 @@ initcode = quote
headerbuffer = Vector{UInt8}(undef, 1024)
end

Automa.Stream.generate_reader(
Automa.generate_reader(
:validate_fastq,
machine,
arguments = (),
Expand Down

2 comments on commit 45854a4

@jakobnissen
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/89378

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v2.1.3 -m "<description of version>" 45854a40c6f6e8bcadd3b16ac5ed080c5fd809c9
git push origin v2.1.3

Please sign in to comment.