Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor char/string and byte search #54667

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
3 changes: 0 additions & 3 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,9 +222,6 @@ isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)

# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
Expand Down
243 changes: 152 additions & 91 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ const DenseUInt8 = Union{

const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}

last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)

function last_utf8_byte(c::Char)
u = reinterpret(UInt32, c)
shift = ((4 - ncodeunits(c)) * 8) & 31
Expand All @@ -44,144 +41,209 @@ end
# This holds even in the presence of invalid UTF8
function is_standalone_byte(x::UInt8)
    # True for every byte outside the closed range 0x80:0xf7.
    return !(0x80 <= x <= 0xf7)
end

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
if i < 1 || i > sizeof(s)
i == sizeof(s) + 1 && return nothing
throw(BoundsError(s, i))
# Index of the last byte: the codeunit count for strings, `lastindex` for
# dense byte arrays.
function last_byteindex(x::Union{String, SubString{String}})
    return ncodeunits(x)
end

function last_byteindex(x::DenseUInt8OrInt8)
    return lastindex(x)
end

# Internal type - lazy iterator over positions of char in string.
# Iterating yields, in increasing order, each codeunit index at which `char`
# starts in `string`.
struct FwCharPosIter{S}
string::S # S is assumed to be either String or SubString{String}
# The char being searched for (the constructor converts its argument to `Char`).
char::Char
# Char searchers search for the last UTF-8 byte, because this byte tends to
# have the most variety in real texts, so any individual value is rarer.
# This allows more work to be done in the fast path using memchr.
last_char_byte::UInt8
end

# Build a forward searcher for `c` in `s`. `c` is converted to `Char` and the
# last UTF-8 byte of that `Char` is cached alongside it.
function FwCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
    needle = Char(c)::Char
    return FwCharPosIter{typeof(s)}(s, needle, last_utf8_byte(needle))
end

# i is the index in the string to search from.
# We assume it's never < firstindex(s.string)
# Returns `(index, state)`, where `index` is the codeunit index at which a match
# starts and `state` is the index just past the matched char's last byte.
# Returns `nothing` when there is no further match.
function Base.iterate(s::FwCharPosIter, i::Int=1)
    scu = ncodeunits(s.string)

    # By definition, if the last byte is a standalone byte, then the char
    # is a single-byte char whose byte can never be part of another char.
    # Hence, we can simply search for the occurrence of the byte itself.
    if is_standalone_byte(s.last_char_byte)
        i > scu && return nothing
        i = _search(s.string, s.last_char_byte, i)
        return i === nothing ? nothing : (i, i + 1)
    else
        ncu = ncodeunits(s.char)
        while true
            i > scu && return nothing
            i = _search(s.string, s.last_char_byte, i)
            i === nothing && return nothing
            # Increment i before the continue to avoid infinite loop.
            # Since we search for the last byte in the char, the index has an offset.
            i += 1
            index = i - ncu
            # The byte may be part of a different char, in which case index
            # may be invalid.
            isvalid(s.string, index) || continue
            # Here, we use iterate instead of indexing, because indexing needlessly
            # re-validates the index which we have already done here.
            # This relies on the implementation detail that the iterator state for
            # iterating strings is the same as the byte index.
            char = first(something(iterate(s.string, index)))
            char == s.char && return (index, i)
        end
    end
end

# Internal type - lazy iterator over positions of char in string, in reverse order
struct RvCharPosIter{S}
string::S # S is assumed to be either String or SubString{String}
# The char being searched for (the constructor converts its argument to `Char`).
char::Char
# Last UTF-8 byte of `char`, cached by the constructor; this is the byte the
# backwards byte search looks for.
last_char_byte::UInt8
end

# The number of matches is not known without searching, and every element
# produced is an `Int` index.
function IteratorSize(::Type{<:Union{FwCharPosIter, RvCharPosIter}})
    return SizeUnknown()
end

function eltype(::Type{<:Union{FwCharPosIter, RvCharPosIter}})
    return Int
end

# Build a reverse searcher for `c` in `s`. `c` is converted to `Char` and the
# last UTF-8 byte of that `Char` is cached alongside it.
function RvCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
    needle = Char(c)::Char
    return RvCharPosIter{typeof(s)}(s, needle, last_utf8_byte(needle))
end

# i is the index in the string to search from, i.e. the largest index at which
# a match may start. We assume it's never > ncodeunits(s.string).
# This is the same implementation as FwCharPosIter, except for two differences:
# 1. i must be decremented, not incremented, because we are searching backwards
# 2. Because we search for the last byte, the starting value of i needs to be
#    incremented in the beginning, as that byte may be found at
#    i + ncodeunits(char) - 1.
# Returns `(index, state)`, where `index` is the codeunit index at which a match
# starts and `state` is `index - 1`. Returns `nothing` when there is no further
# match.
function Base.iterate(s::RvCharPosIter, i::Int=ncodeunits(s.string))
    ncu = ncodeunits(s.char)
    if is_standalone_byte(s.last_char_byte)
        i < ncu && return nothing
        i = _rsearch(s.string, s.last_char_byte, i)
        return i === nothing ? nothing : (i, i - 1)
    else
        # Move from "latest possible start of the char" to "latest possible
        # position of its last byte", clamped to the end of the string.
        i = min(ncodeunits(s.string), i + ncu - 1)
        while true
            i < ncu && return nothing
            i = _rsearch(s.string, s.last_char_byte, i)
            i === nothing && return nothing
            index = i - ncu + 1
            # Decrement i before the continue to avoid an infinite loop.
            i -= 1
            # The byte may be part of a different char, in which case index
            # may be invalid.
            isvalid(s.string, index) || continue
            # iterate is used instead of indexing because the index has already
            # been validated above.
            char = first(something(iterate(s.string, index)))
            # The state handed back must be a start index, because the next call
            # adds ncu - 1 to it again. Returning the last-byte position instead
            # would make the next call find this very same match again.
            char == s.char && return (index, index - 1)
        end
    end
end

function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
findnext(pred, a, firstindex(a))
# Return the first component of `iterate(x, state)`, or `nothing` if `iterate`
# returns `nothing`.
function try_next(x, state)
    step = iterate(x, state)
    if step === nothing
        return nothing
    end
    return first(step)
end

# Find the first occurrence of the char `pred.x` in `s` starting at or after
# index `i`. Returns the index of the match, or `nothing` if there is none.
# `i == ncodeunits(s) + 1` returns `nothing`; other out-of-range values throw a
# BoundsError, and an in-range `i` that is not a valid index raises a string
# index error (both checks are subject to `@boundscheck`).
function findnext(
    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
    s::Union{String, SubString{String}},
    i::Integer,
)
    # TODO: Redesign these strange rules for errors, see #54584
    nbytes = ncodeunits(s)
    if i == nbytes + 1
        return nothing
    end
    @boundscheck begin
        (1 <= i <= nbytes + 1) || throw(BoundsError(s, i))
        isvalid(s, i) || string_index_err(s, i)
    end
    searcher = FwCharPosIter(s, pred.x)
    start = Int(i)::Int
    return try_next(searcher, start)
end

# Find the first byte equal to `pred.x` in `a` at or after index `i`.
# An `i` below `firstindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` past `lastindex(a)` returns `nothing`.
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
    @boundscheck if i < firstindex(a)
        throw(BoundsError(a, i))
    end
    if i > lastindex(a)
        return nothing
    end
    return _search(a, pred.x, i)
end

# Find the first byte equal to `pred.x` in `a` at or after index `i`.
# An `i` below `firstindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` past `lastindex(a)` returns `nothing`.
function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
    @boundscheck if i < firstindex(a)
        throw(BoundsError(a, i))
    end
    if i > lastindex(a)
        return nothing
    end
    return _search(a, pred.x, i)
end

# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
# Find the first zero byte in `a` at or after index `i`, searching for the
# `UInt8` zero regardless of whether `a` holds `Int8` or `UInt8`.
# An `i` below `firstindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` past `lastindex(a)` returns `nothing`.
function findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
    @boundscheck if i < firstindex(a)
        throw(BoundsError(a, i))
    end
    if i > lastindex(a)
        return nothing
    end
    return _search(a, zero(UInt8), i)
end

# This is essentially just a wrapper around memchr: it returns the index of the
# first byte equal to `b` at or after position `i` in `a`, or `nothing` if there
# is none.
# `i` is expected to be in bounds. `i == last_byteindex(a) + 1` returns
# `nothing`; any other out-of-range `i` throws a BoundsError.
function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
    fst = firstindex(a)
    lst = last_byteindex(a)
    if i < fst
        throw(BoundsError(a, i))
    end
    if i > lst
        return i == lst+1 ? nothing : throw(BoundsError(a, i))
    end
    # Number of bytes from position i through the last byte, inclusive.
    n_bytes = lst - i + 1
    GC.@preserve a begin
        p = pointer(a)
        q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
    end
    # Convert the returned pointer back to an index relative to firstindex(a).
    return q == C_NULL ? nothing : (q-p+fst) % Int
end

function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
if isascii(b)
_search(a,UInt8(b),i)
else
_search(a,codeunits(string(b)),i).start
function findprev(
pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}},
i::Integer,
)
# TODO: Redesign these strange rules for errors, see #54584
if i == ncodeunits(s) + 1 || i == 0
return nothing
end
end

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
c = pred.x
c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
b = first_utf8_byte(c)
while true
i = _rsearch(s, b, i)
i == nothing && return nothing
isvalid(s, i) && pred(s[i]) && return i
i = prevind(s, i)
@boundscheck if i < 1 || i > ncodeunits(s) + 1
throw(BoundsError(s, i))
end
end

function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
findprev(pred, a, lastindex(a))
try_next(RvCharPosIter(s, pred.x), Int(i)::Int)
end

# Find the last byte equal to `pred.x` in `a` at or before index `i`.
# An `i` above `lastindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` below `firstindex(a)` returns `nothing`.
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
    @boundscheck if i > lastindex(a)
        throw(BoundsError(a, i))
    end
    if i < firstindex(a)
        return nothing
    end
    return _rsearch(a, pred.x, i)
end

# Find the last byte equal to `pred.x` in `a` at or before index `i`.
# An `i` above `lastindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` below `firstindex(a)` returns `nothing`.
function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
    @boundscheck if i > lastindex(a)
        throw(BoundsError(a, i))
    end
    if i < firstindex(a)
        return nothing
    end
    return _rsearch(a, pred.x, i)
end

# See comments above for findfirst(::typeof(iszero)) methods
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
# Find the last zero byte in `a` at or before index `i`, searching for the
# `UInt8` zero regardless of whether `a` holds `Int8` or `UInt8`.
# An `i` above `lastindex(a)` throws a BoundsError (under `@boundscheck`);
# an `i` below `firstindex(a)` returns `nothing`.
function findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
    @boundscheck if i > lastindex(a)
        throw(BoundsError(a, i))
    end
    if i < firstindex(a)
        return nothing
    end
    return _rsearch(a, zero(UInt8), i)
end

# Wrapper around memrchr: returns the index of the last byte equal to `b` at or
# before position `i` in `a`, or `nothing` if there is none.
# `i` is expected to be in bounds. The positions immediately before the first
# byte and immediately after the last byte return `nothing`; anything further
# out throws a BoundsError.
function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
    lo = firstindex(a)
    hi = last_byteindex(a)
    if i < lo
        i == lo - 1 || throw(BoundsError(a, i))
        return nothing
    elseif i > hi
        i == hi + 1 || throw(BoundsError(a, i))
        return nothing
    end
    GC.@preserve a begin
        base = pointer(a)
        # Scan the first i - lo + 1 bytes, i.e. positions lo through i.
        hit = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), base, b, i-lo+1)
    end
    if hit == C_NULL
        return nothing
    end
    return (hit-base+lo) % Int
end

# Reverse search for the char `b` in the byte array `a`, starting from `i`.
# ASCII chars are searched as a single byte; any other char is searched as the
# codeunit sequence of its string form, and the start of the found range is
# returned.
function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
    return isascii(b) ?
        _rsearch(a, UInt8(b), i) :
        _rsearch(a, codeunits(string(b)), i).start
end

# Return a vector with every codeunit index at which the char `pred.x` starts
# in `s`.
function findall(
    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
    s::Union{String, SubString{String}},
)
    iter = FwCharPosIter(s, pred.x)
    return if is_standalone_byte(iter.last_char_byte)
        # A standalone byte cannot be part of another char, so every occurrence
        # of the byte is an occurrence of the char: search the raw codeunits.
        findall(==(iter.last_char_byte), codeunits(s))
    else
        # It is slightly wasteful that every iteration will check is_standalone_byte
        # again, but this should only be minor overhead in the non-fast path.
        collect(iter)
    end
end

Expand Down Expand Up @@ -255,7 +317,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer)
return nothing
end


# A char is contained in a string exactly when searching for it finds a position.
function in(c::AbstractChar, s::AbstractString)
    return !isnothing(findfirst(isequal(c), s))
end

function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},
Expand Down
2 changes: 1 addition & 1 deletion base/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
# CRC32c of a tuple of bytes: the tuple is wrapped in a `Ref` and its length is
# passed as the byte count.
function _crc32c(a::NTuple{<:Any, UInt8}, crc::UInt32=0x00000000)
    boxed = Ref(a)
    nbytes = length(a) % Csize_t
    return unsafe_crc32c(boxed, nbytes, crc)
end

function _crc32c(a::DenseBytes, crc::UInt32=0x00000000)
function _crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
unsafe_crc32c(a, length(a) % Csize_t, crc)
end

Expand Down
5 changes: 3 additions & 2 deletions stdlib/CRC32c/src/CRC32c.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ See [`CRC32c.crc32c`](@ref) for more information.
"""
module CRC32c

import Base: DenseBytes
import Base.FastContiguousSubArray
import Base: DenseUInt8OrInt8

export crc32c

Expand Down Expand Up @@ -50,7 +51,7 @@ function crc32c(a::AbstractVector{UInt8}, crc::UInt32=0x00000000)
return crc
end

function crc32c(a::DenseBytes, crc::UInt32=0x00000000)
function crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
Base._crc32c(a, crc)
end

Expand Down
Loading
Loading