Skip to content

Commit 9071f14

Browse files
committed
Merge pull request #11551 from ScottPJones/spj/fixutf
Fix #10959 bugs with UTF-16 conversions
2 parents 1c4dae4 + 2ab9334 commit 9071f14

File tree

4 files changed

+350
-67
lines changed

4 files changed

+350
-67
lines changed

base/utf16.jl

Lines changed: 206 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,42 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3-
utf16_is_lead(c::UInt16) = (c & 0xfc00) == 0xd800
4-
utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
5-
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
6-
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)
3+
# Copy code units from `dat` into a freshly allocated vector and NUL-terminate it
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
                  ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    # The buffer always has room for `len` units plus the terminator.
    buf = Vector{T}(len+1)
    # With `flag` set only `len` units are taken from `dat`; otherwise `len+1` are copied.
    # Either way slot `len+1` is overwritten with 0 afterwards.
    ncopy = flag ? len : len+1
    copy!(buf, 1, dat, 1, ncopy)
    S(setindex!(buf, 0, len+1))
end
8+
9+
# Assemble the code point of a 3-byte UTF-8 sequence.
# `ch` is the lead byte value, `pos` indexes the LAST byte of the sequence in `dat`.
@inline function get_utf8_3byte(dat, pos, ch)
    @inbounds begin
        second = dat[pos-1]
        third = dat[pos]
    end
    hi = (ch & 0xf) << 12               # 4 payload bits from the lead byte
    mid = UInt32(second & 0x3f) << 6    # 6 payload bits from each continuation byte
    return hi | mid | (third & 0x3f)
end
13+
# Assemble the code point of a 4-byte UTF-8 sequence.
# `ch` is the lead byte value, `pos` indexes the LAST byte of the sequence in `dat`.
@inline function get_utf8_4byte(dat, pos, ch)
    @inbounds begin
        second = dat[pos-2]
        third = dat[pos-1]
        fourth = dat[pos]
    end
    cp = (ch & 0x7) << 18               # 3 payload bits from the lead byte
    cp |= UInt32(second & 0x3f) << 12   # 6 payload bits from each continuation byte
    cp |= UInt32(third & 0x3f) << 6
    cp |= fourth & 0x3f
    return cp
end
20+
21+
# Write code point `ch` as a 4-byte UTF-8 sequence into `buf[out+1 : out+4]`.
# `out` is the number of bytes already written; the caller advances it afterwards.
@inline function output_utf8_4byte!(buf, out, ch)
    lead = 0xf0 | (ch >>> 18)
    cont1 = 0x80 | ((ch >>> 12) & 0x3f)
    cont2 = 0x80 | ((ch >>> 6) & 0x3f)
    cont3 = 0x80 | (ch & 0x3f)
    @inbounds begin
        buf[out + 1] = lead
        buf[out + 2] = cont1
        buf[out + 3] = cont2
        buf[out + 4] = cont3
    end
end
30+
31+
# Shared zero-length UTF16String: its data holds only the terminating 0 code unit
const empty_utf16 = UTF16String(UInt16[0])
732

833
# Number of characters in `s`: every code unit that is not a surrogate trail starts one
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1      # s.data includes the NULL terminator
    nunits == 0 && return 0
    nchars = 0
    for idx = 1:nunits
        @inbounds unit = units[idx]
        is_surrogate_trail(unit) || (nchars += 1)
    end
    nchars
end
@@ -20,100 +45,220 @@ function endof(s::UTF16String)
2045
d = s.data
2146
i = length(d) - 1
2247
i == 0 && return i
23-
utf16_is_surrogate(d[i]) ? i-1 : i
48+
return is_surrogate_codeunit(d[i]) ? i-1 : i
2449
end
2550

51+
# Combine a UTF-16 surrogate pair into the code point it encodes.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    # 0xd7f7 folds the 0xd800 lead bias, the 0xdc00 trail bias and the 0x10000 plane
    # offset into one constant: (0xd800 << 10) + 0xdc00 - 0x10000 == 0xd7f7 << 10
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end
52+
2653
# Iteration protocol: return `(character, next index)` for the character starting at
# code unit `i`.
#
# Throws `UnicodeError` when `i` points at a surrogate that does not begin a valid pair:
# * UTF_ERR_MISSING_SURROGATE - a surrogate is the last code unit before the terminator
# * UTF_ERR_NOT_LEAD          - the code unit at `i` is a surrogate but not a lead
# * UTF_ERR_NOT_TRAIL         - the lead at `i` is not followed by a trail surrogate
function next(s::UTF16String, i::Int)
    ch = s.data[i]
    # Fast path: a non-surrogate code unit is a complete character
    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
    # check length, account for terminating \0
    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
    ct = s.data[i+1]
    # Must throw a UnicodeError like the other failure paths (not a bare tuple), so
    # callers that catch UnicodeError see this case too
    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
    Char(get_supplementary(ch, ct)), i+2
end
3463

3564
# Map index `i` of `reverse(s)` back to the corresponding index in `s`
function reverseind(s::UTF16String, i::Integer)
    units = s.data
    mirrored = length(units) - i
    # A trail surrogate is the second unit of a pair: step back onto its lead
    if is_surrogate_trail(units[mirrored])
        return mirrored - 1
    end
    return mirrored
end
3968

4069
# Index of the last code unit of `s`
function lastidx(s::UTF16String)
    return length(s.data) - 1 # s.data includes NULL terminator
end
4170

4271
# Return a new UTF16String with the characters of `s` in reverse order
function reverse(s::UTF16String)
    src = s.data
    total = length(src)
    dst = similar(src)
    dst[end] = 0 # NULL termination
    @inbounds for i = 1:total-1
        unit = src[total-i]
        if !is_surrogate_lead(unit)
            dst[i] = unit
        else
            # Walking backwards, the unit written just before a lead is (for well-formed
            # data) that pair's trail: move it forward one slot and put the lead in front
            # so the pair keeps its lead-then-trail order
            dst[i] = dst[i-1]
            dst[i-1] = unit
        end
    end
    UTF16String(dst)
end
5586

56-
# TODO: optimize this
57-
function encode16(s::AbstractString)
58-
buf = UInt16[]
59-
for ch in s
60-
c = reinterpret(UInt32, ch)
87+
# Size of `s` in bytes, not counting the terminating code unit
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
88+
89+
# Check that `data` contains no unpaired surrogates
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    pos = 1
    @inbounds while pos < n
        unit = data[pos]
        if is_surrogate_lead(unit) && is_surrogate_trail(data[pos+1])
            pos += 2        # well-formed pair, skip both units
        elseif is_surrogate_codeunit(unit)
            return false    # lone lead or lone trail
        else
            pos += 1
        end
    end
    # Either everything was consumed, or exactly one unit is left and must not be a surrogate
    pos > n && return true
    return !is_surrogate_codeunit(data[pos])
end
103+
104+
"
105+
Converts an `AbstractString` to a `UTF16String`
106+
107+
### Returns:
108+
* `UTF16String`
109+
110+
### Throws:
111+
* `UnicodeError`
112+
"
113+
# Convert any AbstractString to a UTF16String.
# `unsafe_checkstring` supplies the character count and the number of characters that
# need a surrogate pair, so the buffer is allocated once:
# one unit per character, one extra per pair, plus the terminator.
function convert(::Type{UTF16String}, str::AbstractString)
    len, flags, num4byte = unsafe_checkstring(str)
    buf = Vector{UInt16}(len+num4byte+1)
    out = 0
    @inbounds for ch in str
        c = UInt32(ch)
        if c < 0x10000
            buf[out += 1] = UInt16(c)
        else
            # output surrogate pair
            # (arithmetic is done on the UInt32 code point `c`, not on the Char `ch`)
            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
        end
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF16String(buf)
end
73130

74-
utf16(x) = convert(UTF16String, x)
75-
convert(::Type{UTF16String}, s::UTF16String) = s
76-
convert(::Type{UTF16String}, s::AbstractString) = encode16(s)
77-
convert(::Type{Array{UInt16,1}}, s::UTF16String) = s.data
78-
convert(::Type{Array{UInt16}}, s::UTF16String) = s.data
131+
"
132+
Converts a `UTF8String` to a `UTF16String`
79133
80-
# TODO: optimize this
81-
convert(::Type{UTF8String}, s::UTF16String) =
82-
sprint(length(s.data)-1, io->for c in s; write(io,c::Char); end)
134+
### Returns:
135+
* `UTF16String`
83136
84-
sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
85-
unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
86-
convert(Ptr{T}, pointer(s))
137+
### Throws:
138+
* `UnicodeError`
139+
"
140+
function convert(::Type{UTF16String}, str::UTF8String)
    dat = str.data
    # handle zero length string quickly
    sizeof(dat) == 0 && return empty_utf16
    # Check that is correct UTF-8 encoding and get number of words needed
    nchars, flags, num4byte = unsafe_checkstring(dat)
    nunits = nchars + num4byte
    buf = Vector{UInt16}(nunits+1)
    @inbounds buf[nunits+1] = 0
    # Optimize case where no characters > 0x7f
    if flags == 0
        @inbounds return UTF16String(copy!(buf, dat))
    end
    widx = 0    # code units written so far
    ridx = 0    # bytes consumed so far
    @inbounds while widx < nunits
        ridx += 1
        lead::UInt32 = dat[ridx]
        widx += 1
        if lead <= 0x7f
            # Handle ASCII characters
            buf[widx] = lead
        elseif lead < 0xe0
            # Handle range 0x80-0x7ff
            ridx += 1
            buf[widx] = ((lead & 0x1f) << 6) | (dat[ridx] & 0x3f)
        elseif lead < 0xf0
            # Handle range 0x800-0xffff
            ridx += 2
            buf[widx] = get_utf8_3byte(dat, ridx, lead)
        else
            # Handle range 0x10000-0x10ffff
            ridx += 3
            cp = get_utf8_4byte(dat, ridx, lead)
            # output surrogate pair
            buf[widx] = UInt16(0xd7c0 + (cp >>> 10))
            widx += 1
            buf[widx] = UInt16(0xdc00 + (cp & 0x3ff))
        end
    end
    UTF16String(buf)
end
87176

88-
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
89-
i = 1
90-
n = length(data) # this may include NULL termination; that's okay
91-
while i < n # check for unpaired surrogates
92-
if utf16_is_lead(data[i]) && utf16_is_trail(data[i+1])
93-
i += 2
94-
elseif utf16_is_surrogate(data[i])
95-
return false
177+
"
178+
Converts a `UTF16String` to a `UTF8String`
179+
180+
### Returns:
181+
* `UTF8String`
182+
183+
### Throws:
184+
* `UnicodeError`
185+
"
186+
function convert(::Type{UTF8String}, str::UTF16String)
    dat = str.data
    nunits = sizeof(dat) >>> 1      # code units, terminator included
    # handle zero length string quickly
    if nunits <= 1
        return empty_utf8
    end
    # get number of bytes to allocate
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, nunits-1)
    if flags == 0
        # Nothing beyond the plain case was flagged: copy each unit straight into a byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
    end
    # One byte per character, plus the extra bytes of each multi-byte sequence
    nbytes = len + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt16, dat, nbytes)
end
196+
197+
"
198+
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`
199+
200+
### Input Arguments:
201+
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
202+
* `len` length of output in bytes
203+
204+
### Returns:
205+
* `UTF8String`
206+
"
207+
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    buf = Vector{UInt8}(len)
    nout = 0    # bytes written so far
    ridx = 0    # code units consumed so far
    @inbounds while nout < len
        ridx += 1
        ch::UInt32 = dat[ridx]
        if ch <= 0x7f
            # Handle ASCII characters
            nout += 1
            buf[nout] = ch
        elseif ch < 0x800
            # Handle 0x80-0x7ff
            buf[nout + 1] = 0xc0 | (ch >>> 6)
            buf[nout + 2] = 0x80 | (ch & 0x3f)
            nout += 2
        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
            # Handle 0x10000-0x10ffff (if input is UInt32)
            output_utf8_4byte!(buf, nout, ch)
            nout += 4
        elseif is_surrogate_codeunit(ch)
            # Handle surrogate pairs: consume the following unit as the trail
            ridx += 1
            trail = dat[ridx]
            output_utf8_4byte!(buf, nout, get_supplementary(ch, trail))
            nout += 4
        else
            # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
            buf[nout + 1] = 0xe0 | ((ch >>> 12) & 0x3f)
            buf[nout + 2] = 0x80 | ((ch >>> 6) & 0x3f)
            buf[nout + 3] = 0x80 | (ch & 0x3f)
            nout += 3
        end
    end
    UTF8String(buf)
end
102237

103-
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
104-
!isvalid(UTF16String, data) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
105-
len = length(data)
106-
d = Array(UInt16, len + 1)
107-
d[end] = 0 # NULL terminate
108-
UTF16String(copy!(d,1, data,1, len))
238+
# ASCII data needs no validation: widen each byte to a code unit and NUL-terminate
function convert(::Type{UTF16String}, str::ASCIIString)
    bytes = str.data
    nbytes = length(bytes)
    @inbounds return fast_utf_copy(UTF16String, UInt16, nbytes, bytes, true)
end
110242

243+
# Expose the underlying code-unit vector (the same array, not a copy)
function convert(::Type{Vector{UInt16}}, str::UTF16String)
    return str.data
end
function convert(::Type{Array{UInt16}}, str::UTF16String)
    return str.data
end

# Converting a UTF16String to UTF16String is the identity
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end
247+
248+
# Pointer to the string's code units, typed as requested (Int16 or UInt16)
function unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end
250+
111251
# Any-dimensional UInt16 array: flatten to a vector and convert that
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    return convert(T, reshape(data, length(data)))
end

# Int16 data: reinterpret the same bits as UInt16 and convert that
function convert(T::Type{UTF16String}, data::AbstractArray{Int16})
    return convert(T, reinterpret(UInt16, data))
end
116256

257+
# Check a vector of code units with `unsafe_checkstring`, then copy it and NUL-terminate
function convert(::Type{UTF16String}, dat::AbstractVector{UInt16})
    len, flags, num4byte = unsafe_checkstring(dat)
    nunits = len + num4byte
    @inbounds return fast_utf_copy(UTF16String, UInt16, nunits, dat, true)
end
261+
117262
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
118263
isempty(bytes) && return UTF16String(UInt16[0])
119264
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
@@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
136281
UTF16String(d)
137282
end
138283

284+
# NOTE: the identity method `convert(::Type{UTF16String}, str::UTF16String)` is already
# defined earlier in this file, so the duplicate definition is not repeated here.

# Generic entry point: convert `x` to a UTF16String
utf16(x) = convert(UTF16String, x)
# Build a UTF16String from `len` code units starting at pointer `p`
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
# Int16 pointers are handled by converting the pointer to Ptr{UInt16}
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
141289
function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
@@ -154,7 +302,7 @@ function map(fun, str::UTF16String)
154302
end
155303
uc = reinterpret(UInt32, c2)
156304
if uc < 0x10000
157-
if utf16_is_surrogate(UInt16(uc))
305+
if is_surrogate_codeunit(UInt16(uc))
158306
throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
159307
end
160308
push!(buf, UInt16(uc))

base/utf32.jl

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3+
# UTF-32 basic functions
# Every element of `s.data` is one character; the final element is the terminator.
function next(s::UTF32String, i::Int)
    return (s.data[i], i+1)
end
function endof(s::UTF32String)
    return length(s.data) - 1
end
function length(s::UTF32String)
    return length(s.data) - 1
end
@@ -40,8 +41,8 @@ function convert{T<:ByteString}(::Type{T}, data::AbstractVector{Char})
4041
convert(T, takebuf_string(s))
4142
end
4243

43-
convert(::Type{Array{Char,1}}, s::UTF32String) = s.data
44-
convert(::Type{Array{Char}}, s::UTF32String) = s.data
44+
# Expose the underlying character vector (the same array, not a copy)
function convert(::Type{Vector{Char}}, str::UTF32String)
    return str.data
end
function convert(::Type{Array{Char}}, str::UTF32String)
    return str.data
end
4546

4647
# Reverse the characters of `s`; only positions 1:length(s) are reversed, so the
# terminator stays in the last slot of the copied data
function reverse(s::UTF32String)
    chars = copy(s.data)
    return UTF32String(reverse!(chars, 1, length(s)))
end
4748

@@ -60,19 +61,19 @@ function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
6061
elseif data[1] == Char(0xfffe0000) # byte-swapped
6162
d = Array(Char, length(data))
6263
for i = 2:length(data)
63-
d[i-1] = bswap(data[i])
64+
@inbounds d[i-1] = bswap(data[i])
6465
end
6566
else
6667
d = Array(Char, length(data) + 1)
6768
copy!(d, 1, data, 1, length(data)) # assume native byte order
6869
end
69-
d[end] = Char(0) # NULL terminate
70+
d[end] = 0 # NULL terminate
7071
UTF32String(d)
7172
end
7273

7374
# True when every element of `str` is a valid character according to `isvalid(Char, …)`
function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
    for idx = 1:length(str)
        @inbounds unit = str[idx]
        isvalid(Char, UInt32(unit)) || return false
    end
    return true
end
@@ -89,9 +90,9 @@ end
8990
function map(f, s::UTF32String)
9091
d = s.data
9192
out = similar(d)
92-
out[end] = Char(0)
93+
out[end] = 0
9394

94-
for i = 1:(length(d)-1)
95+
@inbounds for i = 1:(length(d)-1)
9596
c2 = f(d[i])
9697
if !isa(c2, Char)
9798
throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))

0 commit comments

Comments
 (0)