Skip to content

Commit f2b83a2

Browse files
committed
Fix JuliaLang#10959 UTF-32 conversion errors
Added new `convert` methods that use the `checkstring` function to validate input. Added tests for many sorts of valid/invalid data. Depends on PR JuliaLang#11551 and JuliaLang#11575.
1 parent ffb4ec4 commit f2b83a2

File tree

4 files changed

+873
-12
lines changed

4 files changed

+873
-12
lines changed

base/unicode/checkstring.jl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,11 @@ end
194194
"
195195
Validates and calculates the number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string
196196
197+
This function checks the bounds of the start and end positions
198202
Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked
199203
200204
### Input Arguments:
@@ -221,9 +225,15 @@ function checkstring end
221225
# Whole-input form: the positions come from `dat` itself, so no bounds check is done here
checkstring(dat; kwargs...) = unsafe_checkstring(dat, start(dat), endof(dat); kwargs...)

# Make sure that beginning and end positions are bounds checked.
# `startpos` deliberately has no default value: a default would generate a
# one-argument method that replaces the whole-input form defined just above.
function checkstring(dat, startpos, endpos = endof(dat); kwargs...)
    checkbounds(dat, startpos)
    checkbounds(dat, endpos)
    endpos < startpos &&
        throw(ArgumentError("End position ($endpos) is less than start position ($startpos)"))
    unsafe_checkstring(dat, startpos, endpos; kwargs...)
end

base/unicode/utf32.jl

Lines changed: 265 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,277 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
55
# The backing `data` vector ends with a NULL terminator, hence the `- 1`
endof(s::UTF32String) = length(s.data) - 1
length(s::UTF32String) = length(s.data) - 1

# Size in bytes, not counting the trailing NULL terminator
sizeof(s::UTF32String) = sizeof(s.data) - sizeof(Char)

# Shared instance returned by conversions of empty input (holds only the terminator)
const empty_utf32 = UTF32String(UInt32[0])

utf32(x) = convert(UTF32String, x)
convert(::Type{UTF32String}, c::Char) = UTF32String(Char[c, Char(0)])
convert(::Type{UTF32String}, s::UTF32String) = s
1117

"
Converts an `AbstractString` to a `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::AbstractString`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::AbstractString)
    # `check_string` is expected to validate `str`; its first result sizes the buffer
    len, flags = check_string(str)
    # One extra slot for the NULL terminator
    buf = Vector{Char}(len+1)
    out = 0
    @inbounds for ch in str ; buf[out += 1] = ch ; end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF32String(buf)
end
39+
"
Converts a UTF-32 encoded vector of `UInt32` to a `UTF8String`

### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, dat::Vector{UInt32})
    nbytes = sizeof(dat)
    # An empty vector maps onto the shared empty string
    if nbytes == 0
        return empty_utf8
    end
    # Check the data (four bytes per element) and collect the per-width counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(dat, nbytes>>>2)
    if flags == 0
        # No flags raised: each element is copied into a single output byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, dat, 1, nchars))
    end
    # One byte per character plus the extra bytes of the multi-byte sequences
    return encode_to_utf8(UInt32, dat, nchars + num2byte + num3byte*2 + num4byte*3)
end
62+
"
Converts a `UTF32String` to a `UTF8String`

### Input Arguments:
* `::Type{UTF8String}`
* `str::UTF32String`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nwords = sizeof(dat) >>> 2
    # Only the terminator (or nothing at all) is stored: result is the empty string
    if nwords <= 1
        return empty_utf8
    end
    # Check everything but the last word and collect the per-width counts
    nchars, flags, num4byte, num3byte, num2byte = check_string(dat, nwords-1)
    if flags == 0
        # No flags raised: each word is copied into a single output byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(nchars), 1, dat, 1, nchars))
    end
    # One byte per character plus the extra bytes of the multi-byte sequences
    return encode_to_utf8(UInt32, dat, nchars + num2byte + num3byte*2 + num4byte*3)
end
86+
"
Converts a `UTF8String` to a `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF8String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::UTF8String)
    dat = str.data
    # An empty input maps onto the shared empty string
    if sizeof(dat) == 0
        return empty_utf32
    end
    # Validate UTF-8 encoding, and get number of characters to create
    len, flags = check_string(dat)
    # No characters > 0x7f: a plain widening copy is enough
    if flags == 0
        @inbounds return fast_utf_copy(UTF32String, Char, len, dat, true)
    end
    # Multi-byte sequences are present: decode exactly one character per iteration
    local ch::UInt32, surr::UInt32
    buf = Vector{Char}(len+1)
    pos = 0
    @inbounds for out = 1:len
        ch = dat[pos += 1]
        if ch <= 0x7f
            # ASCII
            buf[out] = ch
        elseif ch < 0xe0
            # Two-byte sequence, range 0x80-0x7ff
            buf[out] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
        elseif ch < 0xf0
            # Three-byte sequence, range 0x800-0xffff
            pos += 2
            ch = get_utf8_3byte(dat, pos, ch)
            # A lead surrogate here (should have been encoded in 4 bytes) is
            # combined with the trailing surrogate held in the next 3 bytes
            if is_surrogate_lead(ch)
                pos += 3
                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
                        | (UInt32(dat[pos-1] & 0x3f) << 6)
                        | (dat[pos] & 0x3f))
                ch = get_supplementary(ch, surr)
            end
            buf[out] = ch
        else
            # Four-byte sequence, range 0x10000-0x10ffff
            pos += 3
            buf[out] = get_utf8_4byte(dat, pos, ch)
        end
    end
    @inbounds buf[len+1] = 0 # NULL termination
    UTF32String(buf)
end
144+
"
Converts a `UTF16String` to `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF16String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::UTF16String)
    dat = str.data
    nbytes = sizeof(dat)
    # Two bytes or fewer means only the trailing \0 is stored
    if nbytes <= 2
        return empty_utf32
    end
    # Check the 16-bit words and get the number of 32-bit words to create
    len, flags, num4byte = check_string(dat, nbytes>>>1)
    # No surrogate pairs: every word widens to exactly one character
    if (flags & UTF_UNICODE4) == 0
        @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
    end
    local ch::UInt32
    buf = Vector{Char}(len)
    pos = 0
    @inbounds for out = 1:len
        ch = dat[pos += 1]
        # A lead surrogate consumes the following word as well
        if is_surrogate_lead(ch)
            ch = get_supplementary(ch, dat[pos += 1])
        end
        buf[out] = ch
    end
    UTF32String(buf)
end
179+
"
Converts a UTF-32 encoded vector of `UInt32` to a `UTF16String`

The input vector is not expected to carry a NULL terminator; one is added to the output.

### Input Arguments:
* `::Type{UTF16String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, dat::Vector{UInt32})
    len = sizeof(dat)
    # handle zero length string quickly
    # (a one-element vector holds a real character, so only a truly empty
    #  vector may take this shortcut)
    len == 0 && return empty_utf16
    # get number of words to allocate
    len, flags, num4byte = check_string(dat, len>>>2)
    # one extra word per surrogate pair, plus one for the NULL terminator
    len += num4byte + 1
    # optimized path, no surrogates
    num4byte == 0 && @inbounds return fast_utf_copy(UTF16String, UInt16, len, dat)
    return encode_to_utf16(dat, len)
end
204+
"
Converts a `UTF32String` to `UTF16String`

### Input Arguments:
* `::Type{UTF16String}`
* `str::UTF32String`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nbytes = sizeof(dat)
    # Four bytes or fewer means at most one word is stored: result is the empty string
    if nbytes <= 4
        return empty_utf16
    end
    # Check the 32-bit words and get the number of 16-bit words to allocate
    len, flags, num4byte = check_string(dat, nbytes>>>2)
    if num4byte == 0
        # No surrogate pairs needed: every word narrows to exactly one 16-bit word
        @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
    end
    # Each 4-byte character needs one additional 16-bit word
    return encode_to_utf16(dat, len + num4byte)
end
229+
"
Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`

### Input Arguments:
* `dat::Vector{UInt32}` UTF-32 encoded data
* `len` length of output in 16-bit words, including the trailing NULL terminator

### Returns:
* `::UTF16String`
"
function encode_to_utf16(dat, len)
    buf = Vector{UInt16}(len)
    @inbounds buf[len] = 0 # NULL termination
    out = 0
    pos = 0
    # The terminator slot is already filled in, so stop one word short of `len`.
    # Running up to `len` would read one element past the end of `dat` whenever
    # `dat` does not itself end with a NULL word.
    @inbounds while out < len - 1
        ch = UInt32(dat[pos += 1])
        if ch > 0xffff
            # Output surrogate pair for 0x10000-0x10ffff
            # (0xd7c0 is 0xd800 - (0x10000 >>> 10), folding in the offset subtraction)
            buf[out += 1] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        buf[out += 1] = ch
    end
    UTF16String(buf)
end
256+
257+
# Reinterpret `Char` vectors as `UInt32` and reuse the `Vector{UInt32}` methods
convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat))

convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))
261+
262+
# Converts an `ASCIIString` to a `UTF32String` by handing its bytes to `fast_utf_copy`
function convert(::Type{UTF32String}, str::ASCIIString)
    bytes = str.data
    nbytes = length(bytes)
    @inbounds return fast_utf_copy(UTF32String, Char, nbytes, bytes, true)
end
266+
267+
# Converts a vector of `Char` to a `UTF32String`.
# The characters are copied into a fresh buffer that has one extra slot for
# the NULL terminator, so `data` itself is left untouched.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    len = length(data)
    buf = Vector{Char}(len+1)
    copy!(buf, 1, data, 1, len)
    @inbounds buf[len+1] = 0 # NULL termination
    UTF32String(buf)
end
28280

29281
convert{T<:Union{Int32,UInt32}}(::Type{UTF32String}, data::AbstractVector{T}) =
@@ -46,12 +298,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46298

47299
# Reverses the first `length(s)` elements of a copy of the data; the last slot is left in place
function reverse(s::UTF32String)
    chars = copy(s.data)
    reverse!(chars, 1, length(s))
    UTF32String(chars)
end

# Pointer to the string data, converted to the requested 32-bit element type
function unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String)
    convert(Ptr{T}, pointer(s))
end
52303

53304
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
54-
isempty(bytes) && return UTF32String(Char[0])
305+
isempty(bytes) && return empty_utf32
55306
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
56307
data = reinterpret(Char, bytes)
57308
# check for byte-order mark (BOM):
@@ -79,6 +330,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79330
end
80331
# A bare `Char` vector is validated as UTF-32 data
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)

# Build a UTF32String from `len` elements starting at pointer `p`
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len)
84337
function utf32(p::Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}})

0 commit comments

Comments
 (0)