Skip to content

Commit 0b6619a

Browse files
committed
Fix #10959 bugs with UTF-16/UTF-32 conversions
1 parent c85f1be commit 0b6619a

File tree

8 files changed

+984
-109
lines changed

8 files changed

+984
-109
lines changed

base/sysimg.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,15 @@ include("iterator.jl")
8484
include("osutils.jl")
8585

8686
# strings & printing
87+
include("utferror.jl")
88+
include("utftype.jl")
89+
include("utfcheck.jl")
8790
include("char.jl")
8891
include("ascii.jl")
8992
include("utf8.jl")
9093
include("utf16.jl")
9194
include("utf32.jl")
95+
include("utfconvert.jl")
9296
include("iobuffer.jl")
9397
include("string.jl")
9498
include("utf8proc.jl")

base/utf16.jl

Lines changed: 29 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,12 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
# UTF-16 string type wrapping a vector of UInt16 code units.
# Every line of this definition carries a `-` gutter marker, i.e. the commit
# removes it from utf16.jl (presumably relocated to the newly included
# utftype.jl -- TODO confirm, that file is not shown here).
# The inner constructor rejects empty data and data whose last element is not 0.
immutable UTF16String <: AbstractString
4-
data::Array{UInt16,1} # includes 16-bit NULL termination after string chars
5-
function UTF16String(data::Vector{UInt16})
6-
if length(data) < 1 || data[end] != 0
7-
throw(ArgumentError("UTF16String data must be NULL-terminated"))
8-
end
9-
new(data)
10-
end
11-
end
12-
13-
# Surrogate classification helpers removed by this commit (all `-` lines); the
# added code calls is_surrogate_lead / is_surrogate_trail / is_surrogate_codeunit
# instead, which are defined outside this view.
# Mask 0xfc00 keeps the top 6 bits: 0xd800 selects 0xd800-0xdbff (lead),
# 0xdc00 selects 0xdc00-0xdfff (trail). Mask 0xf800 keeps the top 5 bits, so
# 0xd800 selects the whole 0xd800-0xdfff surrogate range.
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
14-
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
15-
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
16-
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
17-
183
# Number of characters in `s`. Walks code units 1:len, where len excludes the
# final (terminator) slot of s.data, and counts every unit that is NOT a trail
# surrogate -- so a lead/trail pair contributes exactly one.
# The commit only swaps the predicate name (utf16_is_trail -> is_surrogate_trail).
function length(s::UTF16String)
194
d = s.data
205
len = length(d) - 1
216
len == 0 && return 0
227
cnum = 0
238
for i = 1:len
24-
@inbounds cnum += !utf16_is_trail(d[i])
9+
@inbounds cnum += !is_surrogate_trail(d[i])
2510
end
2611
cnum
2712
end
@@ -30,92 +15,69 @@ function endof(s::UTF16String)
# endof (function header appears only in the hunk marker above): index of the
# last character. `i` is the last code unit before the final slot of s.data;
# an empty string returns 0. If d[i] is a surrogate code unit the result is
# i-1, otherwise i.
# The added line makes the `return` explicit and uses is_surrogate_codeunit in
# place of utf16_is_surrogate.
3015
d = s.data
3116
i = length(d) - 1
3217
i == 0 && return i
33-
utf16_is_surrogate(d[i]) ? i-1 : i
18+
return is_surrogate_codeunit(d[i]) ? i-1 : i
3419
end
3520

21+
# Combine a lead/trail surrogate pair into a supplementary code point, returned
# as a UInt32 (the removed utf16_get_supplementary wrapped the result in Char;
# here the caller does that).
# Parses as (UInt32(lead-0xd7f7) << 10) + trail, since `<<` binds tighter than
# `+` in Julia. The constant 0xd7f7 folds together the 0xd800 lead base, the
# 0xdc00 trail base and the 0x10000 plane offset:
# e.g. lead=0xd800, trail=0xdc00 -> (9 << 10) + 0xdc00 == 0x10000.
# No validation is done here: arguments are only constrained to Unsigned.
get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
22+
3623
# Iteration step: return (character starting at code-unit index i, next index).
# Old body (`-` lines): one if/elseif, falling through to a single
#   ArgumentError("invalid UTF-16 character index") for any malformed case.
# New body (`+` lines): guard clauses that distinguish three failures through
#   utf_errfunc (defined outside this view; presumably throws -- TODO confirm,
#   otherwise execution would fall through to the following statement):
#     UTF_ERR_MISSING_SURROGATE - surrogate at/after the last real code unit
#     UTF_ERR_NOT_LEAD          - surrogate at i is not a lead
#     UTF_ERR_NOT_TRAIL         - unit at i+1 is not a trail
# NOTE(review): the NOT_TRAIL call passes `i` and `ch` (the lead), not `i+1`
#   and `ct` (the offending unit) -- confirm this is the intended report.
# NOTE(review): only the MISSING_SURROGATE call widens with UInt32(ch); the
#   other two pass the UInt16 as-is -- confirm utf_errfunc accepts both.
function next(s::UTF16String, i::Int)
37-
if !utf16_is_surrogate(s.data[i])
38-
return Char(s.data[i]), i+1
39-
elseif length(s.data)-1 > i && utf16_is_lead(s.data[i]) && utf16_is_trail(s.data[i+1])
40-
return utf16_get_supplementary(s.data[i], s.data[i+1]), i+2
41-
end
42-
throw(ArgumentError("invalid UTF-16 character index"))
24+
ch = s.data[i]
25+
!is_surrogate_codeunit(ch) && return (Char(ch), i+1)
26+
# check length, account for terminating \0
27+
i >= (length(s.data)-1) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))
28+
!is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, i, ch)
29+
ct = s.data[i+1]
30+
!is_surrogate_trail(ct) && utf_errfunc(UTF_ERR_NOT_TRAIL, i, ch)
31+
Char(get_supplementary(ch, ct)), i+2
4332
end
4433

4534
# Map index i to j = length(s.data) - i (s.data includes the terminator slot);
# if the code unit at j is a trail surrogate, step back one so the result
# points at the start of the pair.
# The commit drops the `Base.` qualification and renames the predicate.
function reverseind(s::UTF16String, i::Integer)
4635
j = length(s.data) - i
47-
return Base.utf16_is_trail(s.data[j]) ? j-1 : j
36+
return is_surrogate_trail(s.data[j]) ? j-1 : j
4837
end
4938

5039
lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
5140

5241
# reverse(s::UTF16String): build a new string with the code units in reverse
# order, keeping each surrogate pair in lead-then-trail order.
#
# How to read this hunk: the diff interleaves three things --
#   * the OLD reverse loop (`-` lines 57-64),
#   * the whole of encode16 (`-` lines 66-81), which this commit deletes from
#     utf16.jl, and
#   * the NEW reverse loop (`+` lines 46-54).
# The shared `else` / `end` context lines belonged to encode16 before the
# commit and to reverse after it.
#
# New loop: ch = d[n-i] walks the source backwards (n-i skips the terminator
# slot d[n]). When ch is a lead surrogate, its trail was already written to
# out[i-1] on the previous step, so the two are swapped in one assignment;
# otherwise ch is stored directly. The old loop stored first, then swapped.
#
# NOTE(review): the new loop is under @inbounds. If the last real code unit of
# `s` is a lead surrogate, the i == 1 step evaluates out[i-1] == out[0] without
# a bounds check (the old loop had no @inbounds). This presumably cannot happen
# for validated strings -- confirm that invariant holds for every constructor.
#
# Removed encode16: encoded each Char as one UInt16 below 0x10000, as a
# surrogate pair up to 0x10ffff, and threw ArgumentError above that.
function reverse(s::UTF16String)
53-
d =s.data
42+
d = s.data
5443
out = similar(d)
5544
out[end] = 0 # NULL termination
5645
n = length(d)
57-
for i = 1:n-1
58-
out[i] = d[n-i]
59-
if Base.utf16_is_lead(out[i])
60-
out[i],out[i-1] = out[i-1],out[i]
61-
end
62-
end
63-
return UTF16String(out)
64-
end
65-
66-
# TODO: optimize this
67-
function encode16(s::AbstractString)
68-
buf = UInt16[]
69-
for ch in s
70-
c = reinterpret(UInt32, ch)
71-
if c < 0x10000
72-
push!(buf, UInt16(c))
73-
elseif c <= 0x10ffff
74-
push!(buf, UInt16(0xd7c0 + (c>>10)))
75-
push!(buf, UInt16(0xdc00 + (c & 0x3ff)))
46+
@inbounds for i = 1:n-1
47+
ch = d[n-i]
48+
if is_surrogate_lead(ch)
49+
out[i],out[i-1] = out[i-1],ch
7650
else
77-
throw(ArgumentError("invalid Unicode character (0x$(hex(c)) > 0x10ffff)"))
51+
out[i] = ch
7852
end
7953
end
80-
push!(buf, 0) # NULL termination
81-
UTF16String(buf)
54+
UTF16String(out)
8255
end
8356

84-
# Conversion one-liners deleted from utf16.jl by this commit (all `-` lines).
# `utf16(x)` is re-added further down in the same file (new line 111); where
# the other methods end up is not visible here (presumably the newly included
# utfconvert.jl -- TODO confirm).
# The two Array conversions return s.data itself, terminator slot included.
# The UTF8String conversion writes each character of `s` through sprint, using
# length(s.data)-1 as the size hint.
utf16(x) = convert(UTF16String, x)
85-
convert(::Type{UTF16String}, s::UTF16String) = s
86-
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
87-
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
88-
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
89-
90-
# TODO: optimize this
91-
convert(::Type{UTF8String}, s::UTF16String) =
92-
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
93-
9457
# Size in bytes of the string's code units: the byte size of s.data minus one
# UInt16, i.e. not counting the terminator slot.
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
95-
# unsafe_convert is removed at this position (`-` lines); an identical
# definition is added back after isvalid, so it only moves within the file.
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
96-
convert(Ptr{T}, pointer(s))
9758

9859
# Return true when `data` contains no unpaired surrogate code units.
# Loop (i < n, so data[i+1] is always in range): a lead followed by a trail is
# consumed as a pair (i += 2); any other surrogate is unpaired -> false; any
# other unit advances by one.
# After the loop, i > n means the final unit was consumed as part of a pair;
# otherwise i == n and that last unit must itself not be a surrogate.
# For empty data n == 0, so `i > n` short-circuits before data[i] is read.
# The commit renames the predicates and wraps the loop (only) in @inbounds.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
9960
i = 1
10061
n = length(data) # this may include NULL termination; that's okay
101-
while i < n # check for unpaired surrogates
102-
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
62+
@inbounds while i < n # check for unpaired surrogates
63+
if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
10364
i += 2
104-
elseif utf16_is_surrogate(data[i])
65+
elseif is_surrogate_codeunit(data[i])
10566
return false
10667
else
10768
i += 1
10869
end
10970
end
110-
return i > n || !utf16_is_surrogate(data[i])
71+
return i > n || !is_surrogate_codeunit(data[i])
11172
end
11273

74+
# Pointer to the string's code units, typed as Ptr{Int16} or Ptr{UInt16}:
# converts pointer(s) to the requested Ptr{T}. Added here with the same text
# as the definition removed earlier in this file's diff.
unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String) =
75+
convert(Ptr{T}, pointer(s))
76+
11377
# Build a UTF16String from a vector of code units that does NOT yet carry a
# terminator: validates with isvalid (ArgumentError on failure), then copies
# the len units into a fresh buffer of len+1 and stores 0 in the last slot.
# The commit collapses the three removed statements into one expression; the
# chaining relies on copy! and setindex! each returning the destination array.
# The input is always copied, never aliased.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
11478
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
11579
len = length(data)
116-
d = Array(UInt16, len + 1)
117-
d[end] = 0 # NULL terminate
118-
UTF16String(copy!(d,1, data,1, len))
80+
@inbounds return UTF16String(setindex!(copy!(Vector{UInt16}(len+1),1,data,1,len),0,len+1))
11981
end
12082

12183
convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
@@ -146,6 +108,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
146108
UTF16String(d)
147109
end
148110

111+
# utf16(x): generic entry point, delegates to convert(UTF16String, x).
# (Added at this position; the same definition is removed higher up in the file.)
# The pointer forms wrap `len` elements at `p` via pointer_to_array and pass
# the result on to utf16; the Int16 form first converts to Ptr{UInt16}.
utf16(x) = convert(UTF16String, x)
149112
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
150113
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
151114
function utf16(p::Union(Ptr{UInt16}, Ptr{Int16}))

base/utf32.jl

Lines changed: 13 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,17 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
## UTF-32 in the native byte order, i.e. plain old character arrays ##
4-
5-
# UTF-32 string type wrapping a vector of Char. Every line carries a `-`
# gutter marker, i.e. the commit removes the definition from utf32.jl
# (presumably relocated to the newly included utftype.jl -- TODO confirm).
# The inner constructor rejects empty data and data not ending in Char(0); the
# outer UInt32 constructor reinterprets the vector as Char and delegates.
immutable UTF32String <: DirectIndexString
6-
data::Vector{Char} # includes 32-bit NULL termination after string chars
7-
8-
function UTF32String(a::Vector{Char})
9-
if length(a) < 1 || a[end] != Char(0)
10-
throw(ArgumentError("UTF32String data must be NULL-terminated"))
11-
end
12-
new(a)
13-
end
14-
end
15-
UTF32String(data::Vector{UInt32}) = UTF32String(reinterpret(Char, data))
16-
3+
# UTF-32 basic functions
174
# One array element per character, so indexing is direct: next returns
# (s.data[i], i+1), and both endof and length are the element count minus the
# final terminator slot.
next(s::UTF32String, i::Int) = (s.data[i], i+1)
185
endof(s::UTF32String) = length(s.data) - 1
196
length(s::UTF32String) = length(s.data) - 1
207

21-
# The three `-` lines are removed here; `utf32(x)` is re-added further down in
# this file, the two convert methods are not re-added in the visible hunks.
# The `+` line places reverse here (same text as the definition removed lower
# in the file): it reverses positions 1:length(s) of a COPY of s.data, leaving
# the terminator in the last slot.
utf32(x) = convert(UTF32String, x)
22-
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
23-
convert(::Type{UTF32String}, s::UTF32String) = s
8+
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
249

25-
# Removed by this commit (all `-` lines): generic AbstractString -> UTF32String
# conversion that sized the buffer from length(s), copied characters one by
# one, and wrote Char(0) into the last slot. No replacement appears in the
# visible hunks (presumably provided by utfconvert.jl -- TODO confirm).
function convert(::Type{UTF32String}, s::AbstractString)
26-
a = Array(Char, length(s) + 1)
27-
i = 0
28-
for c in s
29-
a[i += 1] = c
30-
end
31-
a[end] = Char(0) # NULL terminate
32-
UTF32String(a)
33-
end
10+
# Added here (moved up from lower in the file): byte size of the characters,
# i.e. the byte size of s.data minus one Char for the terminator slot.
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
3411

3512
# Build a UTF32String from a Char vector that does NOT yet carry a terminator:
# copies the len elements into a fresh buffer of len+1 and stores 0 in the
# last slot. No validity check is performed in this method.
# The added one-liner relies on copy! and setindex! each returning the
# destination array, and on the integer 0 being converted to Char on
# assignment (the removed code wrote Char(0) explicitly).
function convert(::Type{UTF32String}, data::AbstractVector{Char})
3613
len = length(data)
37-
d = Array(Char, len + 1)
38-
d[end] = Char(0) # NULL terminate
39-
UTF32String(copy!(d,1, data,1, len))
14+
@inbounds return UTF32String(setindex!(copy!(Vector{Char}(len+1),1,data,1,len),0,len+1))
4015
end
4116

4217
convert{T<:Union(Int32,UInt32)}(::Type{UTF32String}, data::AbstractVector{T}) =
@@ -54,12 +29,9 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
5429
convert(T, takebuf_string(s))
5530
end
5631

57-
# Conversions to a Char array return the string's own data vector (no copy),
# terminator slot included. The commit rewrites Array{Char,1} as Vector{Char}
# and renames the argument s -> str.
# The `-` lines for reverse and sizeof are the old positions of definitions
# that the commit re-adds near the top of this file.
convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
58-
convert(::Type{Array{Char}}, s::UTF32String) = s.data
59-
60-
reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
32+
convert(::Type{Vector{Char}}, str::UTF32String) = str.data
33+
convert(::Type{Array{Char}}, str::UTF32String) = str.data
6134

62-
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)
6335
# Pointer to the string's data typed as Ptr{Int32}, Ptr{UInt32} or Ptr{Char}:
# converts pointer(s) to the requested Ptr{T}. Unchanged context.
unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
6436
convert(Ptr{T}, pointer(s))
6537

@@ -73,26 +45,24 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
7345
copy!(d,1, data, 2, length(data)-1)
7446
elseif data[1] == Char(0xfffe0000) # byte-swapped
7547
d = Array(Char, length(data))
76-
for i = 2:length(data)
77-
d[i-1] = bswap(data[i])
78-
end
48+
@inbounds for i = 2:length(data) ; d[i-1] = bswap(data[i]) ; end
7949
else
8050
d = Array(Char, length(data) + 1)
8151
copy!(d, 1, data, 1, length(data)) # assume native byte order
8252
end
83-
d[end] = Char(0) # NULL terminate
53+
d[end] = 0 # NULL terminate
8454
UTF32String(d)
8555
end
8656

8757
# Return true when every element of `str` passes isvalid(Char, <UInt32 value>);
# stops at the first failure. An empty vector yields true.
# The commit replaces reinterpret(UInt32, str[i]) with the conversion
# UInt32(str[i]); both forms are applied to Char and UInt32 elements alike.
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
8858
for i=1:length(str)
89-
@inbounds if !isvalid(Char, reinterpret(UInt32, str[i])) ; return false ; end
59+
@inbounds if !isvalid(Char, UInt32(str[i])) ; return false ; end
9060
end
9161
return true
9262
end
9363
# Convenience form: a bare Char vector is validated as UTF-32 data.
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
94-
# The two generic string-type forms below are removed from utf32.jl by this
# commit; both simply forwarded to isvalid(T, str.data). Whether they are
# re-defined elsewhere is not visible here -- TODO confirm (likely utfcheck.jl).
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(str::T) = isvalid(T, str.data)
95-
isvalid{T<:Union(ASCIIString,UTF8String,UTF16String,UTF32String)}(::Type{T}, str::T) = isvalid(T, str.data)
64+
65+
# utf32(x): generic entry point, delegates to convert(UTF32String, x).
# (Added at this position; the same definition is removed higher up in the file.)
utf32(x) = convert(UTF32String, x)
9666

9767
# Pointer forms: wrap `len` elements at `p` via pointer_to_array and pass the
# result on to utf32; UInt32/Int32 pointers are first converted to Ptr{Char}.
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
9868
utf32(p::Union(Ptr{UInt32}, Ptr{Int32}), len::Integer) = utf32(convert(Ptr{Char}, p), len)
@@ -105,9 +75,9 @@ end
10575
function map(f, s::UTF32String)
10676
d = s.data
10777
out = similar(d)
108-
out[end] = Char(0)
78+
out[end] = 0
10979

110-
for i = 1:(length(d)-1)
80+
@inbounds for i = 1:(length(d)-1)
11181
c2 = f(d[i])
11282
if !isa(c2, Char)
11383
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))

0 commit comments

Comments
 (0)