JuliaLang · jakobnissen · May 25, 2024 · Jun 3, 2024 · Jun 3, 2024 · Jun 3, 2024
diff --git a/base/char.jl b/base/char.jl
@@ -222,9 +222,6 @@ isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
 hash(x::Char, h::UInt) =
     hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
 
-first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
-first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)
-
 # fallbacks:
 isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
 ==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)

diff --git a/base/strings/search.jl b/base/strings/search.jl
@@ -31,9 +31,6 @@ const DenseUInt8 = Union{
 
 const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}
 
-last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
-last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
-
 function last_utf8_byte(c::Char)
     u = reinterpret(UInt32, c)
     shift = ((4 - ncodeunits(c)) * 8) & 31
@@ -44,144 +41,209 @@ end
 # This holds even in the presence of invalid UTF8
 is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7)
 
-function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-                  s::Union{String, SubString{String}}, i::Integer)
-    if i < 1 || i > sizeof(s)
-        i == sizeof(s) + 1 && return nothing
-        throw(BoundsError(s, i))
+last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
+last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
+
+# Internal type - lazy iterator over positions of char in string
+struct FwCharPosIter{S}
+    string::S # S is assumed to be either String or SubString{String}
+    char::Char
+    # Char searchers search for the last UTF8 byte, because this byte tends to
+    # have the most variety in real texts, so any individual value is rarer.
+    # This allows more work to be done in the fast path using memchr.
+    last_char_byte::UInt8
+end
+
+function FwCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
+    char = Char(c)::Char
+    byte = last_utf8_byte(char)
+    FwCharPosIter{typeof(s)}(s, char, byte)
+end
+
+# i is the index in the string to search from.
+# We assume it's never < firstindex(s.string)
+function Base.iterate(s::FwCharPosIter, i::Int=1)
+    scu = ncodeunits(s.string)
+
+    # By definition, if the last byte is a standalone byte, then the char
+    # is a single-byte char where the byte can never be a subset of another char.
+    # Hence, we can simply search for the occurrence of the byte itself.
+    if is_standalone_byte(s.last_char_byte)
+        i > scu && return nothing
+        i = _search(s.string, s.last_char_byte, i)
+        i === nothing ? nothing : (i, i + 1)
+    else
+        ncu = ncodeunits(s.char)
+        while true
+            i > scu && return nothing
+            i = _search(s.string, s.last_char_byte, i)
+            i === nothing && return nothing
+            # Increment i before the continue to avoid infinite loop.
+            # Since we search for the last byte in the char, the index has an offset.
+            i += 1
+            index = i - ncu
+            # The byte may be part of a different char, in which case index
+            # may be invalid.
+            isvalid(s.string, index) || continue
+            # Here, we use iterate instead of indexing, because indexing needlessly
+            # re-validates the index which we have already done here.
+            # This relies on the implementation detail that the iterator state for
+            # iterating strings is the same as the byte index.
+            char = first(something(iterate(s.string, index)))
+            char == s.char && return (index, i)
+        end
     end
-    @inbounds isvalid(s, i) || string_index_err(s, i)
-    c = pred.x
-    c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i)
-    while true
-        i = _search(s, first_utf8_byte(c), i)
-        i === nothing && return nothing
-        isvalid(s, i) && pred(s[i]) && return i
-        i = nextind(s, i)
+end
+
+# Internal type - lazy iterator over positions of char in string, in reverse order
+struct RvCharPosIter{S}
+    string::S # S is assumed to be either String or SubString{String}
+    char::Char
+    last_char_byte::UInt8
+end
+
+IteratorSize(s::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = SizeUnknown()
+eltype(::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = Int
+
+function RvCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar)
+    char = Char(c)::Char
+    byte = last_utf8_byte(char)
+    RvCharPosIter{typeof(s)}(s, char, byte)
+end
+
+# i is the index in the string to search from
+# We assume it's never > ncodeunits(s.string)
+# This is the same implementation as FwCharPosIter, except for two differences:
+# 1. i must be decremented, not incremented because we are searching backwards
+# 2. Because we search for the last byte, the starting value of i need to be
+#    incremented in the beginning, as that byte may be found at i + ncodeunits(char) - 1.
+function Base.iterate(s::RvCharPosIter, i::Int=ncodeunits(s.string))
+    ncu = ncodeunits(s.char)
+    if is_standalone_byte(s.last_char_byte)
+        i < ncu && return nothing
+        i = _rsearch(s.string, s.last_char_byte, i)
+        i === nothing ? nothing : (i, i - 1)
+    else
+        i = min(ncodeunits(s.string), i + ncu - 1)
+        while true
+            i < ncu && return nothing
+            i = _rsearch(s.string, s.last_char_byte, i)
+            i === nothing && return nothing
+            index = i - ncu + 1
+            i -= 1
+            isvalid(s.string, index) || continue
+            char = first(something(iterate(s.string, index)))
+            char == s.char && return (index, i)
+        end
     end
 end
 
-function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
-    findnext(pred, a, firstindex(a))
+function try_next(x, state)
+    y = iterate(x, state)
+    y === nothing ? nothing : first(y)
+end
+
+function findnext(
+    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
+    s::Union{String, SubString{String}},
+    i::Integer,
+)
+    # TODO: Redesign these strange rules for errors, see #54584
+    scu = ncodeunits(s)
+    i == scu + 1 && return nothing
+    @boundscheck if i < 1 || i > scu + 1
+        throw(BoundsError(s, i))
+    elseif !isvalid(s, i)
+        string_index_err(s, i)
+    end
+    try_next(FwCharPosIter(s, pred.x), Int(i)::Int)
 end
 
 function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
     _search(a, pred.x, i)
 end
 
 function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
     _search(a, pred.x, i)
 end
 
 # iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
 # so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
-findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
-findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
+function findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
+    @boundscheck i < firstindex(a) && throw(BoundsError(a, i))
+    i > lastindex(a) && return nothing
+    _search(a, zero(UInt8), i)
+end
 
+# This is essentially just a wrapper around memchr. i must be inbounds.
 function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
     fst = firstindex(a)
-    lst = last_byteindex(a)
-    if i < fst
-        throw(BoundsError(a, i))
-    end
-    n_bytes = lst - i + 1
-    if i > lst
-        return i == lst+1 ? nothing : throw(BoundsError(a, i))
-    end
     GC.@preserve a begin
         p = pointer(a)
-        q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
+        q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, last_byteindex(a) - i + 1)
     end
     return q == C_NULL ? nothing : (q-p+fst) % Int
 end
 
-function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
-    if isascii(b)
-        _search(a,UInt8(b),i)
-    else
-        _search(a,codeunits(string(b)),i).start
+function findprev(
+    pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
+    s::Union{String, SubString{String}},
+    i::Integer,
+)
+    # TODO: Redesign these strange rules for errors, see #54584
+    if i == ncodeunits(s) + 1 || i == 0
+        return nothing
     end
-end
-
-function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-                  s::Union{String, SubString{String}}, i::Integer)
-    c = pred.x
-    c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
-    b = first_utf8_byte(c)
-    while true
-        i = _rsearch(s, b, i)
-        i == nothing && return nothing
-        isvalid(s, i) && pred(s[i]) && return i
-        i = prevind(s, i)
+    @boundscheck if i < 1 || i > ncodeunits(s) + 1
+        throw(BoundsError(s, i))
     end
-end
-
-function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
-    findprev(pred, a, lastindex(a))
+    try_next(RvCharPosIter(s, pred.x), Int(i)::Int)
 end
 
 function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
     _rsearch(a, pred.x, i)
 end
 
 function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
     _rsearch(a, pred.x, i)
 end
 
 # See comments above for findfirst(::typeof(iszero)) methods
-findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
-findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)
+function findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer)
+    @boundscheck i > lastindex(a) && throw(BoundsError(a, i))
+    i < firstindex(a) && return nothing
+    _rsearch(a, zero(UInt8), i)
+end
 
+# This is essentially just a wrapper around memrchr. i must be inbounds.
 function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
     fst = firstindex(a)
-    lst = last_byteindex(a)
-    if i < fst
-        return i == fst - 1 ? nothing : throw(BoundsError(a, i))
-    end
-    if i > lst
-        return i == lst+1 ? nothing : throw(BoundsError(a, i))
-    end
     GC.@preserve a begin
         p = pointer(a)
         q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
     end
     return q == C_NULL ? nothing : (q-p+fst) % Int
 end
 
-function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
-    if isascii(b)
-        _rsearch(a,UInt8(b),i)
-    else
-        _rsearch(a,codeunits(string(b)),i).start
-    end
-end
-
 function findall(
     pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
-    s::Union{String, SubString{String}}
+    s::Union{String, SubString{String}},
 )
-    c = Char(pred.x)::Char
-    byte = last_utf8_byte(c)
-    ncu = ncodeunits(c)
-
-    # If only one byte, and can't be part of another Char: Forward to memchr.
-    is_standalone_byte(byte) && return findall(==(byte), codeunits(s))
-    result = Int[]
-    i = firstindex(s)
-    while true
-        i = _search(s, byte, i)
-        isnothing(i) && return result
-        i += 1
-        index = i - ncu
-        # If the char is invalid, it's possible that its first byte is
-        # inside another char. If so, indexing into the string will throw an
-        # error, so we need to check for valid indices.
-        isvalid(s, index) || continue
-        # We use iterate here instead of indexing, because indexing wastefully
-        # checks for valid index. It would be better if there was something like
-        # try_getindex(::String, ::Int) we could use.
-        char = first(something(iterate(s, index)))
-        pred(char) && push!(result, index)
+    iter = FwCharPosIter(s, pred.x)
+    return if is_standalone_byte(iter.last_char_byte)
+        findall(==(iter.last_char_byte), codeunits(s))
+    else
+        # It is slightly wasteful that every iteration will check is_standalone_byte
+        # again, but this should only be minor overhead in the non-fast path.
+        collect(iter)
     end
 end
 
@@ -255,7 +317,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer)
     return nothing
 end
 
-
 in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)
 
 function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},

diff --git a/base/util.jl b/base/util.jl
@@ -508,7 +508,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_
 _crc32c(a::NTuple{<:Any, UInt8}, crc::UInt32=0x00000000) =
     unsafe_crc32c(Ref(a), length(a) % Csize_t, crc)
 
-function _crc32c(a::DenseBytes, crc::UInt32=0x00000000)
+function _crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
     unsafe_crc32c(a, length(a) % Csize_t, crc)
 end
 

diff --git a/stdlib/CRC32c/src/CRC32c.jl b/stdlib/CRC32c/src/CRC32c.jl
@@ -7,7 +7,8 @@ See [`CRC32c.crc32c`](@ref) for more information.
 """
 module CRC32c
 
-import Base: DenseBytes
+import Base.FastContiguousSubArray
+import Base: DenseUInt8OrInt8
 
 export crc32c
 
@@ -50,7 +51,7 @@ function crc32c(a::AbstractVector{UInt8}, crc::UInt32=0x00000000)
     return crc
 end
 
-function crc32c(a::DenseBytes, crc::UInt32=0x00000000)
+function crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000)
     Base._crc32c(a, crc)
 end