Skip to content

Commit 829fa07

Browse files
committed
Merge pull request #11533 from ScottPJones/spj/utfrsearch
Fix #9365 rsearch/rsearchindex with UTF-8
2 parents 62d2503 + 1fb9759 commit 829fa07

File tree

2 files changed

+127
-21
lines changed

2 files changed

+127
-21
lines changed

base/string.jl

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ string(s::AbstractString) = s
3535
string(xs...) = print_to_string(xs...)
3636

3737
bytestring() = ""
38-
bytestring(s::Array{UInt8,1}) = bytestring(pointer(s),length(s))
38+
bytestring(s::Vector{UInt8}) = bytestring(pointer(s),length(s))
3939
bytestring(s::AbstractString...) = print_to_string(s...)
4040

4141
function bytestring(p::Union(Ptr{UInt8},Ptr{Int8}))
@@ -49,10 +49,10 @@ function bytestring(p::Union(Ptr{UInt8},Ptr{Int8}),len::Integer)
4949
ccall(:jl_pchar_to_string, ByteString, (Ptr{UInt8},Int), p, len)
5050
end
5151

52-
convert(::Type{Array{UInt8,1}}, s::AbstractString) = bytestring(s).data
52+
convert(::Type{Vector{UInt8}}, s::AbstractString) = bytestring(s).data
5353
convert(::Type{Array{UInt8}}, s::AbstractString) = bytestring(s).data
5454
convert(::Type{ByteString}, s::AbstractString) = bytestring(s)
55-
convert(::Type{Array{Char,1}}, s::AbstractString) = collect(s)
55+
convert(::Type{Vector{Char}}, s::AbstractString) = collect(s)
5656
convert(::Type{Symbol}, s::AbstractString) = symbol(s)
5757

5858
## generic supplied functions ##
@@ -301,21 +301,25 @@ function _searchindex(s::Array, t::Array, i)
301301
0
302302
end
303303

304-
searchindex(s::Union(Array{UInt8,1},Array{Int8,1}),t::Union(Array{UInt8,1},Array{Int8,1}),i) = _searchindex(s,t,i)
304+
typealias ByteArray Union(Vector{UInt8},Vector{Int8})
305+
306+
searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i)
305307
searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i)
306308
searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s))
307309
searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i)
308310
searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s))
309311

310312
function searchindex(s::ByteString, t::ByteString, i::Integer=1)
311-
if length(t) == 1
313+
# Check for fast case of a single byte
314+
# (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead)
315+
if endof(t) == 1
312316
search(s, t[1], i)
313317
else
314318
searchindex(s.data, t.data, i)
315319
end
316320
end
317321

318-
function search(s::Union(Array{UInt8,1},Array{Int8,1}),t::Union(Array{UInt8,1},Array{Int8,1}),i)
322+
function search(s::ByteArray, t::ByteArray, i)
319323
idx = searchindex(s,t,i)
320324
if isempty(t)
321325
idx:idx-1
@@ -333,7 +337,13 @@ function search(s::AbstractString, t::AbstractString, i::Integer=start(s))
333337
end
334338
end
335339

336-
function rsearch(s::AbstractString, c::Chars, i::Integer=endof(s))
340+
function rsearch(s::AbstractString, c::Chars)
341+
j = search(RevString(s), c)
342+
j == 0 && return 0
343+
endof(s)-j+1
344+
end
345+
346+
function rsearch(s::AbstractString, c::Chars, i::Integer)
337347
e = endof(s)
338348
j = search(RevString(s), c, e-i+1)
339349
j == 0 && return 0
@@ -435,27 +445,37 @@ function _rsearchindex(s::Array, t::Array, k)
435445
0
436446
end
437447

438-
rsearchindex(s::Union(Array{UInt8,1},Array{Int8,1}),t::Union(Array{UInt8,1},Array{Int8,1}),i) = _rsearchindex(s,t,i)
448+
rsearchindex(s::ByteArray,t::ByteArray,i) = _rsearchindex(s,t,i)
439449
rsearchindex(s::AbstractString, t::AbstractString, i::Integer) = _rsearchindex(s,t,i)
440450
rsearchindex(s::AbstractString, t::AbstractString) = (isempty(s) && isempty(t)) ? 1 : rsearchindex(s,t,endof(s))
441451

442452
function rsearchindex(s::ByteString, t::ByteString)
443-
if length(t) == 1
453+
# Check for fast case of a single byte
454+
# (for multi-byte UTF-8 sequences, use rsearchindex on byte arrays instead)
455+
if endof(t) == 1
444456
rsearch(s, t[1])
445457
else
446-
rsearchindex(s.data, t.data, length(s.data))
458+
_rsearchindex(s.data, t.data, length(s.data))
447459
end
448460
end
449461

450462
function rsearchindex(s::ByteString, t::ByteString, i::Integer)
451-
if length(t) == 1
463+
# Check for fast case of a single byte
464+
# (for multi-byte UTF-8 sequences, use rsearchindex on byte arrays instead)
465+
if endof(t) == 1
452466
rsearch(s, t[1], i)
467+
elseif endof(t) != 0
468+
_rsearchindex(s.data, t.data, nextind(s, i)-1)
469+
elseif i > sizeof(s)
470+
return 0
471+
elseif i == 0
472+
return 1
453473
else
454-
rsearchindex(s.data, t.data, i)
474+
return i
455475
end
456476
end
457477

458-
function rsearch(s::Union(Array{UInt8,1},Array{Int8,1}),t::Union(Array{UInt8,1},Array{Int8,1}),i)
478+
function rsearch(s::ByteArray, t::ByteArray, i::Integer)
459479
idx = rsearchindex(s,t,i)
460480
if isempty(t)
461481
idx:idx-1
@@ -536,7 +556,7 @@ cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a
536556
isless(a::Symbol, b::Symbol) = cmp(a,b) < 0
537557

538558
startswith(a::ByteString, b::ByteString) = startswith(a.data, b.data)
539-
startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =
559+
startswith(a::Vector{UInt8}, b::Vector{UInt8}) =
540560
(length(a) >= length(b) && ccall(:strncmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0)
541561

542562
# TODO: fast endswith
@@ -957,15 +977,15 @@ unescape_string(s::AbstractString) = sprint(endof(s), print_unescaped, s)
957977

958978
## checking UTF-8 & ASCII validity ##
959979

960-
byte_string_classify(data::Array{UInt8,1}) =
980+
byte_string_classify(data::Vector{UInt8}) =
961981
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data))
962982
byte_string_classify(s::ByteString) = byte_string_classify(s.data)
963983
# 0: neither valid ASCII nor UTF-8
964984
# 1: valid ASCII
965985
# 2: valid UTF-8
966986

967-
isvalid(::Type{ASCIIString}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) == 1
968-
isvalid(::Type{UTF8String}, s::Union(Array{UInt8,1},ByteString)) = byte_string_classify(s) != 0
987+
isvalid(::Type{ASCIIString}, s::Union(Vector{UInt8},ByteString)) = byte_string_classify(s) == 1
988+
isvalid(::Type{UTF8String}, s::Union(Vector{UInt8},ByteString)) = byte_string_classify(s) != 0
969989

970990
## multiline strings ##
971991

@@ -1631,8 +1651,6 @@ float{S<:AbstractString}(a::AbstractArray{S}) = map!(float, similar(a,typeof(flo
16311651

16321652
# find the index of the first occurrence of a value in a byte array
16331653

1634-
typealias ByteArray Union(Array{UInt8,1},Array{Int8,1})
1635-
16361654
function search(a::ByteArray, b::Union(Int8,UInt8), i::Integer)
16371655
if i < 1
16381656
throw(BoundsError(a, i))
@@ -1645,6 +1663,13 @@ function search(a::ByteArray, b::Union(Int8,UInt8), i::Integer)
16451663
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
16461664
q == C_NULL ? 0 : Int(q-p+1)
16471665
end
1666+
function search(a::Vector{UInt8}, b::Char, i::Integer)
1667+
if isascii(b)
1668+
search(a,UInt8(b),i)
1669+
else
1670+
search(a,string(b).data,i).start
1671+
end
1672+
end
16481673
function search(a::ByteArray, b::Char, i::Integer)
16491674
if isascii(b)
16501675
search(a,UInt8(b),i)
@@ -1654,7 +1679,7 @@ function search(a::ByteArray, b::Char, i::Integer)
16541679
end
16551680
search(a::ByteArray, b::Union(Int8,UInt8,Char)) = search(a,b,1)
16561681

1657-
function rsearch(a::Union(Array{UInt8,1},Array{Int8,1}), b::Union(Int8,UInt8), i::Integer)
1682+
function rsearch(a::ByteArray, b::Union(Int8,UInt8), i::Integer)
16581683
if i < 1
16591684
return i == 0 ? 0 : throw(BoundsError(a, i))
16601685
end
@@ -1697,7 +1722,7 @@ function hex2bytes(s::ASCIIString)
16971722
return arr
16981723
end
16991724

1700-
bytes2hex{T<:UInt8}(arr::Array{T,1}) = join([hex(i,2) for i in arr])
1725+
bytes2hex{T<:UInt8}(arr::Vector{T}) = join([hex(i,2) for i in arr])
17011726

17021727
function repr(x)
17031728
s = IOBuffer()

test/strings.jl

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,47 @@ end
469469
@test search("foo,bar,baz", "az") == 10:11
470470
@test search("foo,bar,baz", "az", 12) == 0:-1
471471

472+
# issue #9365
473+
# string search with a two-char UTF-8 (2 byte) string literal
474+
@test search("ééé", "éé") == 1:3
475+
@test search("ééé", "éé", 1) == 1:3
476+
# string search with a two-char UTF-8 (3 byte) string literal
477+
@test search("€€€", "€€") == 1:4
478+
@test search("€€€", "€€", 1) == 1:4
479+
# string search with a two-char UTF-8 (4 byte) string literal
480+
@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1:5
481+
@test search("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5
482+
483+
# string search with a two-char UTF-8 (2 byte) string literal
484+
@test search("éé", "éé") == 1:3
485+
@test search("éé", "éé", 1) == 1:3
486+
# string search with a two-char UTF-8 (3 byte) string literal
487+
@test search("€€", "€€") == 1:4
488+
@test search("€€", "€€", 1) == 1:4
489+
# string search with a two-char UTF-8 (4 byte) string literal
490+
@test search("\U1f596\U1f596", "\U1f596\U1f596") == 1:5
491+
@test search("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1:5
492+
493+
# string rsearch with a two-char UTF-8 (2 byte) string literal
494+
@test rsearch("ééé", "éé") == 3:5
495+
@test rsearch("ééé", "éé", endof("ééé")) == 3:5
496+
# string rsearch with a two-char UTF-8 (3 byte) string literal
497+
@test rsearch("€€€", "€€") == 4:7
498+
@test rsearch("€€€", "€€", endof("€€€")) == 4:7
499+
# string rsearch with a two-char UTF-8 (4 byte) string literal
500+
@test rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 5:9
501+
@test rsearch("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5:9
502+
503+
# string rsearch with a two-char UTF-8 (2 byte) string literal
504+
@test rsearch("éé", "éé") == 1:3 # should really be 1:4!
505+
@test rsearch("éé", "éé", endof("ééé")) == 1:3
506+
# string rsearch with a two-char UTF-8 (3 byte) string literal
507+
@test rsearch("€€", "€€") == 1:4 # should really be 1:6!
508+
@test rsearch("€€", "€€", endof("€€€")) == 1:4
509+
# string rsearch with a two-char UTF-8 (4 byte) string literal
510+
@test rsearch("\U1f596\U1f596", "\U1f596\U1f596") == 1:5 # should really be 1:8!
511+
@test rsearch("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1:5
512+
472513
# string rsearch with a two-char string literal
473514
@test rsearch("foo,bar,baz", "xx") == 0:-1
474515
@test rsearch("foo,bar,baz", "fo") == 1:2
@@ -501,6 +542,46 @@ end
501542
@test search("foo,bar,baz", r"az") == 10:11
502543
@test search("foo,bar,baz", r"az", 12) == 0:-1
503544

545+
# string searchindex with a two-char UTF-8 (2 byte) string literal
546+
@test searchindex("ééé", "éé") == 1
547+
@test searchindex("ééé", "éé", 1) == 1
548+
# string searchindex with a two-char UTF-8 (3 byte) string literal
549+
@test searchindex("€€€", "€€") == 1
550+
@test searchindex("€€€", "€€", 1) == 1
551+
# string searchindex with a two-char UTF-8 (4 byte) string literal
552+
@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 1
553+
@test searchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", 1) == 1
554+
555+
# string searchindex with a two-char UTF-8 (2 byte) string literal
556+
@test searchindex("éé", "éé") == 1
557+
@test searchindex("éé", "éé", 1) == 1
558+
# string searchindex with a two-char UTF-8 (3 byte) string literal
559+
@test searchindex("€€", "€€") == 1
560+
@test searchindex("€€", "€€", 1) == 1
561+
# string searchindex with a two-char UTF-8 (4 byte) string literal
562+
@test searchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1
563+
@test searchindex("\U1f596\U1f596", "\U1f596\U1f596", 1) == 1
564+
565+
# string rsearchindex with a two-char UTF-8 (2 byte) string literal
566+
@test rsearchindex("ééé", "éé") == 3
567+
@test rsearchindex("ééé", "éé", endof("ééé")) == 3
568+
# string rsearchindex with a two-char UTF-8 (3 byte) string literal
569+
@test rsearchindex("€€€", "€€") == 4
570+
@test rsearchindex("€€€", "€€", endof("€€€")) == 4
571+
# string rsearchindex with a two-char UTF-8 (4 byte) string literal
572+
@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596") == 5
573+
@test rsearchindex("\U1f596\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 5
574+
575+
# string rsearchindex with a two-char UTF-8 (2 byte) string literal
576+
@test rsearchindex("éé", "éé") == 1
577+
@test rsearchindex("éé", "éé", endof("ééé")) == 1
578+
# string rsearchindex with a two-char UTF-8 (3 byte) string literal
579+
@test rsearchindex("€€", "€€") == 1
580+
@test rsearchindex("€€", "€€", endof("€€€")) == 1
581+
# string rsearchindex with a two-char UTF-8 (4 byte) string literal
582+
@test rsearchindex("\U1f596\U1f596", "\U1f596\U1f596") == 1
583+
@test rsearchindex("\U1f596\U1f596", "\U1f596\U1f596", endof("\U1f596\U1f596\U1f596")) == 1
584+
504585
# split
505586
@test isequal(split("foo,bar,baz", 'x'), ["foo,bar,baz"])
506587
@test isequal(split("foo,bar,baz", ','), ["foo","bar","baz"])

0 commit comments

Comments
 (0)