diff --git a/base/char.jl b/base/char.jl index 2e8410f6903e2..43758f4594c24 100644 --- a/base/char.jl +++ b/base/char.jl @@ -222,9 +222,6 @@ isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y) hash(x::Char, h::UInt) = hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) -first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8 -first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char) - # fallbacks: isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y)) ==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y) diff --git a/base/strings/search.jl b/base/strings/search.jl index 5f658e24526ba..2a9149cd36f16 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -31,9 +31,6 @@ const DenseUInt8 = Union{ const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8} -last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x) -last_byteindex(x::DenseUInt8OrInt8) = lastindex(x) - function last_utf8_byte(c::Char) u = reinterpret(UInt32, c) shift = ((4 - ncodeunits(c)) * 8) & 31 @@ -44,103 +41,191 @@ end # This holds even in the presence of invalid UTF8 is_standalone_byte(x::UInt8) = (x < 0x80) | (x > 0xf7) -function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::Union{String, SubString{String}}, i::Integer) - if i < 1 || i > sizeof(s) - i == sizeof(s) + 1 && return nothing - throw(BoundsError(s, i)) +last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x) +last_byteindex(x::DenseUInt8OrInt8) = lastindex(x) + +# Internal type - lazy iterator over positions of char in string +struct FwCharPosIter{S} + string::S # S is assumed to be either String or SubString{String} + char::Char + # Char searchers search for the last UTF8 byte, because this byte tends to + # have the most variety in real texts, so any individual value is rarer. + # This allows more work to be done in the fast path using memchr. + last_char_byte::UInt8 +end + +function FwCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar) + char = Char(c)::Char + byte = last_utf8_byte(char) + FwCharPosIter{typeof(s)}(s, char, byte) +end + +# i is the index in the string to search from. +# We assume it's never < firstindex(s.string) +function Base.iterate(s::FwCharPosIter, i::Int=1) + scu = ncodeunits(s.string) + + # By definition, if the last byte is a standalone byte, then the char + # is a single-byte char where the byte can never be a subset of another char. + # Hence, we can simply search for the occurrence of the byte itself. + if is_standalone_byte(s.last_char_byte) + i > scu && return nothing + i = _search(s.string, s.last_char_byte, i) + i === nothing ? nothing : (i, i + 1) + else + ncu = ncodeunits(s.char) + while true + i > scu && return nothing + i = _search(s.string, s.last_char_byte, i) + i === nothing && return nothing + # Increment i before the continue to avoid infinite loop. + # Since we search for the last byte in the char, the index has an offset. + i += 1 + index = i - ncu + # The byte may be part of a different char, in which case index + # may be invalid. + isvalid(s.string, index) || continue + # Here, we use iterate instead of indexing, because indexing needlessly + # re-validates the index which we have already done here. + # This relies on the implementation detail that the iterator state for + # iterating strings is the same as the byte index. + char = first(something(iterate(s.string, index))) + char == s.char && return (index, i) + end end - @inbounds isvalid(s, i) || string_index_err(s, i) - c = pred.x - c ≤ '\x7f' && return _search(s, first_utf8_byte(c), i) - while true - i = _search(s, first_utf8_byte(c), i) - i === nothing && return nothing - isvalid(s, i) && pred(s[i]) && return i - i = nextind(s, i) +end + +# Internal type - lazy iterator over positions of char in string, in reverse order +struct RvCharPosIter{S} + string::S # S is assumed to be either String or SubString{String} + char::Char + last_char_byte::UInt8 +end + +IteratorSize(s::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = SizeUnknown() +eltype(::Type{<:Union{FwCharPosIter, RvCharPosIter}}) = Int + +function RvCharPosIter(s::Union{String, SubString{String}}, c::AbstractChar) + char = Char(c)::Char + byte = last_utf8_byte(char) + RvCharPosIter{typeof(s)}(s, char, byte) +end + +# i is the index in the string to search from +# We assume it's never > ncodeunits(s.string) +# This is the same implementation as FwCharPosIter, except for two differences: +# 1. i must be decremented, not incremented because we are searching backwards +# 2. Because we search for the last byte, the starting value of i need to be +# incremented in the beginning, as that byte may be found at i + ncodeunits(char) - 1. +function Base.iterate(s::RvCharPosIter, i::Int=ncodeunits(s.string)) + ncu = ncodeunits(s.char) + if is_standalone_byte(s.last_char_byte) + i < ncu && return nothing + i = _rsearch(s.string, s.last_char_byte, i) + i === nothing ? nothing : (i, i - 1) + else + i = min(ncodeunits(s.string), i + ncu - 1) + while true + i < ncu && return nothing + i = _rsearch(s.string, s.last_char_byte, i) + i === nothing && return nothing + index = i - ncu + 1 + i -= 1 + isvalid(s.string, index) || continue + char = first(something(iterate(s.string, index))) + char == s.char && return (index, i) + end end end -function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8}) - findnext(pred, a, firstindex(a)) +function try_next(x, state) + y = iterate(x, state) + y === nothing ? nothing : first(y) +end + +function findnext( + pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, + s::Union{String, SubString{String}}, + i::Integer, +) + # TODO: Redesign these strange rules for errors, see #54584 + scu = ncodeunits(s) + i == scu + 1 && return nothing + @boundscheck if i < 1 || i > scu + 1 + throw(BoundsError(s, i)) + elseif !isvalid(s, i) + string_index_err(s, i) + end + try_next(FwCharPosIter(s, pred.x), Int(i)::Int) end function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer) + @boundscheck i < firstindex(a) && throw(BoundsError(a, i)) + i > lastindex(a) && return nothing _search(a, pred.x, i) end function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer) + @boundscheck i < firstindex(a) && throw(BoundsError(a, i)) + i > lastindex(a) && return nothing _search(a, pred.x, i) end # iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same, # so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa -findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8)) -findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i) +function findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) + @boundscheck i < firstindex(a) && throw(BoundsError(a, i)) + i > lastindex(a) && return nothing + _search(a, zero(UInt8), i) +end +# This is essentially just a wrapper around memchr. i must be inbounds. function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a)) fst = firstindex(a) - lst = last_byteindex(a) - if i < fst - throw(BoundsError(a, i)) - end - n_bytes = lst - i + 1 - if i > lst - return i == lst+1 ? nothing : throw(BoundsError(a, i)) - end GC.@preserve a begin p = pointer(a) - q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, last_byteindex(a) - i + 1) end return q == C_NULL ? nothing : (q-p+fst) % Int end -function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a)) - if isascii(b) - _search(a,UInt8(b),i) - else - _search(a,codeunits(string(b)),i).start +function findprev( + pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, + s::Union{String, SubString{String}}, + i::Integer, +) + # TODO: Redesign these strange rules for errors, see #54584 + if i == ncodeunits(s) + 1 || i == 0 + return nothing end -end - -function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::Union{String, SubString{String}}, i::Integer) - c = pred.x - c ≤ '\x7f' && return _rsearch(s, first_utf8_byte(c), i) - b = first_utf8_byte(c) - while true - i = _rsearch(s, b, i) - i == nothing && return nothing - isvalid(s, i) && pred(s[i]) && return i - i = prevind(s, i) + @boundscheck if i < 1 || i > ncodeunits(s) + 1 + throw(BoundsError(s, i)) end -end - -function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8) - findprev(pred, a, lastindex(a)) + try_next(RvCharPosIter(s, pred.x), Int(i)::Int) end function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer) + @boundscheck i > lastindex(a) && throw(BoundsError(a, i)) + i < firstindex(a) && return nothing _rsearch(a, pred.x, i) end function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer) + @boundscheck i > lastindex(a) && throw(BoundsError(a, i)) + i < firstindex(a) && return nothing _rsearch(a, pred.x, i) end # See comments above for findfirst(::typeof(iszero)) methods -findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8)) -findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i) +function findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) + @boundscheck i > lastindex(a) && throw(BoundsError(a, i)) + i < firstindex(a) && return nothing + _rsearch(a, zero(UInt8), i) +end +# This is essentially just a wrapper around memrchr. i must be inbounds. function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a)) fst = firstindex(a) - lst = last_byteindex(a) - if i < fst - return i == fst - 1 ? nothing : throw(BoundsError(a, i)) - end - if i > lst - return i == lst+1 ? nothing : throw(BoundsError(a, i)) - end GC.@preserve a begin p = pointer(a) q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1) @@ -148,40 +233,17 @@ function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{ return q == C_NULL ? nothing : (q-p+fst) % Int end -function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a)) - if isascii(b) - _rsearch(a,UInt8(b),i) - else - _rsearch(a,codeunits(string(b)),i).start - end -end - function findall( pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}, - s::Union{String, SubString{String}} + s::Union{String, SubString{String}}, ) - c = Char(pred.x)::Char - byte = last_utf8_byte(c) - ncu = ncodeunits(c) - - # If only one byte, and can't be part of another Char: Forward to memchr. - is_standalone_byte(byte) && return findall(==(byte), codeunits(s)) - result = Int[] - i = firstindex(s) - while true - i = _search(s, byte, i) - isnothing(i) && return result - i += 1 - index = i - ncu - # If the char is invalid, it's possible that its first byte is - # inside another char. If so, indexing into the string will throw an - # error, so we need to check for valid indices. - isvalid(s, index) || continue - # We use iterate here instead of indexing, because indexing wastefully - # checks for valid index. It would be better if there was something like - # try_getindex(::String, ::Int) we could use. - char = first(something(iterate(s, index))) - pred(char) && push!(result, index) + iter = FwCharPosIter(s, pred.x) + return if is_standalone_byte(iter.last_char_byte) + findall(==(iter.last_char_byte), codeunits(s)) + else + # It is slightly wasteful that every iteration will check is_standalone_byte + # again, but this should only be minor overhead in the non-fast path. + collect(iter) end end @@ -255,7 +317,6 @@ function findnext(testf::Function, s::AbstractString, i::Integer) return nothing end - in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing) function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8}, diff --git a/base/util.jl b/base/util.jl index c01ff697e64e3..2b43123c5d76b 100644 --- a/base/util.jl +++ b/base/util.jl @@ -508,7 +508,7 @@ unsafe_crc32c(a, n, crc) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_ _crc32c(a::NTuple{<:Any, UInt8}, crc::UInt32=0x00000000) = unsafe_crc32c(Ref(a), length(a) % Csize_t, crc) -function _crc32c(a::DenseBytes, crc::UInt32=0x00000000) +function _crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000) unsafe_crc32c(a, length(a) % Csize_t, crc) end diff --git a/stdlib/CRC32c/src/CRC32c.jl b/stdlib/CRC32c/src/CRC32c.jl index 923f7333b4c17..c3a2fc1394cce 100644 --- a/stdlib/CRC32c/src/CRC32c.jl +++ b/stdlib/CRC32c/src/CRC32c.jl @@ -7,7 +7,8 @@ See [`CRC32c.crc32c`](@ref) for more information. """ module CRC32c -import Base: DenseBytes +import Base.FastContiguousSubArray +import Base: DenseUInt8OrInt8 export crc32c @@ -50,7 +51,7 @@ function crc32c(a::AbstractVector{UInt8}, crc::UInt32=0x00000000) return crc end -function crc32c(a::DenseBytes, crc::UInt32=0x00000000) +function crc32c(a::DenseUInt8OrInt8, crc::UInt32=0x00000000) Base._crc32c(a, crc) end diff --git a/test/strings/search.jl b/test/strings/search.jl index c43327fe2971b..9d0ea7844ec7f 100644 --- a/test/strings/search.jl +++ b/test/strings/search.jl @@ -298,6 +298,16 @@ end @test findnext("az", "foo,bar,baz", 12) === nothing end +# See the comments in #54579 +@testset "Search for invalid chars" begin + @test findfirst(==('\xff'), "abc\xffde") == 4 + @test findprev(isequal('\xa6'), "abc\xa69", 5) == 4 + @test isnothing(findfirst(==('\xff'), "abcdeæd")) + + @test isnothing(findnext(==('\xa6'), "æ", 1)) + @test isnothing(findprev(==('\xa6'), "æa", 2)) +end + @testset "issue #9365" begin # string forward search with a two-char UTF-8 (2 byte) string literal @test findfirst("éé", "ééé") == 1:3 @@ -506,6 +516,35 @@ end @test findnext(==(0x02), A, -97) === nothing end +# NOTE: The strange edge cases are tested for here, but that does not mean +# they are intentional. Ideally, the behaviour should be changed. See issue 54584 +@testset "Edge behaviours of findnext/last" begin + # Empty haystack causes no errors + @test isempty(findall(==('\x00'), "")) + + # Findnext errors when i is not a valid index, findprev does not + @test_throws StringIndexError findnext(==('a'), "æøå", 2) + @test findprev(==('æ'), "æøå", 4) == 1 + + # Findnext errors when i < 1 or i > ncodeunits(s) + 1 + @test_throws BoundsError findnext(==('a'), "abc", 0) + @test_throws BoundsError findnext(==('a'), "abc", -1) + @test_throws BoundsError findnext(==('a'), "abc", 5) + @test_throws BoundsError findnext(==('a'), "æøå", 8) + @test findnext(==('a'), "æøå", 7) === nothing + + # Findprev errors when i > ncodeunits(s) + 1 or i < 0 + @test findprev(==('a'), "abc", 0) === nothing + @test findprev(==('æ'), "æøå", 5) == 1 + @test_throws BoundsError findprev(==('a'), "abc", -1) + @test_throws BoundsError findprev(==('æ'), "æøå", 8) + + # Findprev returns nothing when i == ncodeunits(s) + 1 + @test findprev(==('æ'), "æøå", 7) === nothing + @test findprev(==('a'), "abc", 4) === nothing + @test findprev(==('a'), "abc", 3) == 1 +end + # issue 32568 for T = (UInt, BigInt) for x = (4, 5)