Skip to content

Commit 4592c7b

Browse files
committed
Changes to error handling and check_string
1 parent 83bb673 commit 4592c7b

File tree

5 files changed

+39
-60
lines changed

5 files changed

+39
-60
lines changed

base/utf16.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
8282

8383
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
8484
isempty(bytes) && return UTF16String(UInt16[0])
85-
isodd(length(bytes)) && throw(ArgumentError("odd number of bytes"))
85+
isodd(length(bytes)) && utf_errfunc(UTF_ERR_ODD_BYTES_16,length(bytes),0)
8686
data = reinterpret(UInt16, bytes)
8787
# check for byte-order mark (BOM):
8888
if data[1] == 0xfeff # native byte order
@@ -98,7 +98,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
9898
copy!(d,1, data,1, length(data)) # assume native byte order
9999
end
100100
d[end] = 0 # NULL terminate
101-
!isvalid(UTF16String, d) && throw(ArgumentError("invalid UTF16 data"))
101+
!isvalid(UTF16String, d) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
102102
UTF16String(d)
103103
end
104104

base/utf32.jl

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ unsafe_convert{T<:Union(Int32,UInt32,Char)}(::Type{Ptr{T}}, s::UTF32String) =
3232

3333
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
3434
isempty(bytes) && return UTF32String(Char[0])
35-
length(bytes) & 3 != 0 && throw(ArgumentError("need multiple of 4 bytes"))
35+
length(bytes) & 3 != 0 && utf_errfunc(UTF_ERR_ODD_BYTES_32, length(bytes))
3636
data = reinterpret(Char, bytes)
3737
# check for byte-order mark (BOM):
3838
if data[1] == Char(0x0000feff) # native byte order
@@ -76,9 +76,7 @@ function map(f, s::UTF32String)
7676

7777
@inbounds for i = 1:(length(d)-1)
7878
c2 = f(d[i])
79-
if !isa(c2, Char)
80-
throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
81-
end
79+
!isa(c2, Char) && utf_errfunc(UTF_ERR_MAP_CHAR,0,0)
8280
out[i] = (c2::Char)
8381
end
8482
UTF32String(out)

base/utfcheck.jl

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,10 @@ end
3838
@throws ArgumentError
3939
""" ->
4040
=#
41-
function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
41+
function check_string(dat::Vector{UInt8}, len::Int = sizeof(dat), pos::Int = 0 ; options::Integer=0)
4242
local byt::UInt8, ch::UInt32, surr::UInt32
4343
flags::UInt = 0
4444
totalchar = num2byte = num3byte = num4byte = 0
45-
pos = 0
46-
len = sizeof(dat)
4745
@inbounds while pos < len
4846
ch = dat[pos += 1]
4947
totalchar += 1
@@ -133,11 +131,10 @@ end
133131
@throws ArgumentError
134132
""" ->
135133
=#
136-
function check_string_utf16(dat::Vector{UInt16}, len::Int)
134+
function check_string(dat::Vector{UInt16}, len::Int = sizeof(dat)>>>1, pos::Int = 0 ; options::Integer = 0)
137135
local ch::UInt32
138136
flags::UInt = 0
139137
totalchar = num2byte = num3byte = num4byte = 0
140-
pos = 0
141138
@inbounds while pos < len
142139
ch = dat[pos += 1]
143140
totalchar += 1
@@ -177,11 +174,10 @@ end
177174
@throws ArgumentError
178175
""" ->
179176
=#
180-
function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
177+
function check_string(dat::Vector{UInt32}, len::Int = sizeof(dat)>>>2, pos::Int = 0; options::Integer=0)
181178
local ch::UInt32
182179
flags::UInt = 0
183180
totalchar = num2byte = num3byte = num4byte = 0
184-
pos = 0
185181
@inbounds while pos < len
186182
ch = dat[pos += 1]
187183
totalchar += 1
@@ -215,12 +211,10 @@ function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
215211
return totalchar, flags, num4byte, num3byte, num2byte
216212
end
217213

218-
function check_string_abs(str::AbstractString, options::Integer=0)
214+
function check_string(str::AbstractString, len::Int = endof(str), pos::Int = start(str) ; options::Integer=0)
219215
local ch::UInt32
220216
flags::UInt = 0
221217
totalchar = num2byte = num3byte = num4byte = 0
222-
pos = start(str)
223-
len = endof(str)
224218
@inbounds while pos < len
225219
ch, pos = next(str, pos)
226220
totalchar += 1

base/utfconvert.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ end
4242
"""
4343
=#
4444
function convert(::Type{UTF16String}, str::AbstractString)
45-
len, flags, num4byte = check_string_abs(str)
45+
len, flags, num4byte = check_string(str)
4646
buf = Vector{UInt16}(len+num4byte+1)
4747
out = 0
4848
@inbounds for ch in str
@@ -71,7 +71,7 @@ end
7171
"""
7272
=#
7373
function convert(::Type{UTF32String}, str::AbstractString)
74-
len, flags = check_string_abs(str)
74+
len, flags = check_string(str)
7575
buf = Vector{Char}(len+1)
7676
out = 0
7777
@inbounds for ch in str ; buf[out += 1] = ch ; end
@@ -95,7 +95,7 @@ function convert(::Type{UTF16String}, str::UTF8String)
9595
# handle zero length string quickly
9696
sizeof(dat) == 0 && return empty_utf16
9797
# Check that is correct UTF-8 encoding and get number of words needed
98-
len, flags, num4byte = check_string_utf8(dat)
98+
len, flags, num4byte = check_string(dat)
9999
len += num4byte
100100
buf = Vector{UInt16}(len+1)
101101
@inbounds buf[len+1] = 0
@@ -143,7 +143,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt16})
143143
# handle zero length string quickly
144144
len == 0 && return UTF8String("")
145145
# get number of bytes to allocate
146-
len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len>>>1)
146+
len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>1)
147147
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), dat))
148148
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
149149
end
@@ -165,7 +165,7 @@ function convert(::Type{UTF8String}, str::UTF16String)
165165
# handle zero length string quickly
166166
len <= 1 && return UTF8String("")
167167
# get number of bytes to allocate
168-
len, flags, num4byte, num3byte, num2byte = check_string_utf16(dat, len-1)
168+
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
169169
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
170170
return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
171171
end
@@ -186,7 +186,7 @@ function convert(::Type{UTF8String}, dat::Vector{UInt32})
186186
# handle zero length string quickly
187187
len == 0 && return UTF8String("")
188188
# get number of bytes to allocate
189-
len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len>>>2)
189+
len, flags, num4byte, num3byte, num2byte = check_string(dat, len>>>2)
190190
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
191191
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
192192
end
@@ -208,7 +208,7 @@ function convert(::Type{UTF8String}, str::UTF32String)
208208
# handle zero length string quickly
209209
len <= 1 && return UTF8String("")
210210
# get number of bytes to allocate
211-
len, flags, num4byte, num3byte, num2byte = check_string_utf32(dat, len-1)
211+
len, flags, num4byte, num3byte, num2byte = check_string(dat, len-1)
212212
flags == 0 && @inbounds return UTF8String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
213213
return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
214214
end
@@ -271,7 +271,7 @@ function convert(::Type{UTF32String}, str::UTF8String)
271271
# handle zero length string quickly
272272
sizeof(dat) == 0 && return empty_utf32
273273
# Validate UTF-8 encoding, and get number of words to create
274-
len, flags = check_string_utf8(dat)
274+
len, flags = check_string(dat)
275275
# Optimize case where no characters > 0x7f
276276
totlen = len+1
277277
flags == 0 && return fast_utf_copy(UTF32String, Char, totlen, dat)
@@ -329,7 +329,7 @@ function convert(::Type{UTF32String}, str::UTF16String)
329329
# handle zero length string quickly (account for trailing \0)
330330
len <= 2 && return empty_utf32
331331
# get number of words to create
332-
len, flags, num4byte = check_string_utf16(dat, len>>>1)
332+
len, flags, num4byte = check_string(dat, len>>>1)
333333
# No surrogate pairs, do optimized copy
334334
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
335335
local ch::UInt32
@@ -361,7 +361,7 @@ function convert(::Type{UTF16String}, dat::Vector{UInt32})
361361
# handle zero length string quickly
362362
len <= 4 && return empty_utf16
363363
# get number of words to allocate
364-
len, flags, num4byte = check_string_utf32(dat, len>>>2)
364+
len, flags, num4byte = check_string(dat, len>>>2)
365365
len += num4byte + 1
366366
# optimized path, no surrogates
367367
num4byte == 0 && return fast_utf_copy(UTF16String, UInt16, len, dat)
@@ -385,7 +385,7 @@ function convert(::Type{UTF16String}, str::UTF32String)
385385
# handle zero length string quickly
386386
len <= 4 && return empty_utf16
387387
# get number of words to allocate
388-
len, flags, num4byte = check_string_utf32(dat, len>>>2)
388+
len, flags, num4byte = check_string(dat, len>>>2)
389389
# optimized path, no surrogates
390390
num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
391391
return encode_to_utf16(dat, len + num4byte)
@@ -429,7 +429,7 @@ function convert(::Type{UTF16String}, str::ASCIIString)
429429
end
430430

431431
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
432-
!isvalid(UTF16String, data) && throw(ArgumentError("invalid UTF16 data"))
432+
!isvalid(UTF16String, data) && utf_errfunc(UTF_ERR_INVALID_16,0,0)
433433
fast_utf_copy(UTF16String, UInt16, length(data), data, true)
434434
end
435435

base/utferror.jl

Lines changed: 19 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,46 +6,33 @@
66
""" ->
77
=#
88

9-
const UTF_ERR_SHORT = 1
10-
const UTF_ERR_CONT = 2
11-
const UTF_ERR_LONG = 3
12-
const UTF_ERR_NOT_LEAD = 4
13-
const UTF_ERR_NOT_TRAIL = 5
14-
const UTF_ERR_NOT_SURROGATE = 6
15-
const UTF_ERR_MISSING_SURROGATE = 7
16-
const UTF_ERR_INVALID = 8
17-
const UTF_ERR_SURROGATE = 9
18-
const UTF_ERR_NULL_16_TERMINATE = 10
19-
const UTF_ERR_NULL_32_TERMINATE = 11
20-
const UTF_ERR_MAX = 11
9+
# Message template for a truncated UTF-8 sequence; utf_errfunc substitutes
# "<<1>>" with the index and "<<2>>" with the offending value in hexadecimal.
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
10+
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
11+
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
12+
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
13+
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
14+
# Message template for a code unit that is not a valid surrogate; utf_errfunc substitutes
# "<<1>>" with the index and "<<2>>" with the offending value in hexadecimal.
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
15+
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
16+
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
17+
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
18+
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
19+
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
20+
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
21+
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
22+
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
23+
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
2124

22-
const errMsgs = [
23-
"invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)",
24-
"invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
25-
"invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
26-
"not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)",
27-
"not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)",
28-
"not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>",
29-
"missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)",
30-
"invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
31-
"surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
32-
"UTF16String data must be NULL-terminated",
33-
"UTF32String data must be NULL-terminated"
34-
]
3525
#=
3626
@doc """
3727
@brief Throws ArgumentError with information about the specific error, location, and character
3828
39-
@param[in] errcode Error code for Unicode error (one of UTF_ERR_*)
29+
@param[in] message One of UTF_ERR_ messages
4030
@param[in] charpos Index of invalid byte or character
4131
@param[in] invchar Invalid byte or character
4232
43-
@throws never returns, always throws ArgumentError
33+
@throws ArgumentError
4434
""" ->
4535
=#
46-
@noinline function utf_errfunc(errcode::Integer, charpos, invchar)
47-
if errcode < 1 || errcode > UTF_ERR_MAX
48-
throw(ArgumentError("Invalid error code for Unicode error: $errcode, Pos = $charpos, Char = $invchar"))
49-
end
50-
throw(ArgumentError(replace(replace(errMsgs[errcode],"<<1>>",string(charpos)),"<<2>>",hex(invchar))))
36+
# Fill in a UTF_ERR_* message template and throw it as an ArgumentError.
#
#   message : template string; every "<<1>>" is replaced by string(charpos)
#             and every "<<2>>" by hex(invchar)
#   charpos : index to report in the message (optional, defaults to 0)
#   invchar : offending byte / code unit, rendered in hexadecimal (optional, defaults to 0)
#
# The two trailing arguments are optional so that two-argument call sites such as
# utf_errfunc(UTF_ERR_ODD_BYTES_32, length(bytes)) reach the intended ArgumentError
# instead of failing with a MethodError, and so that templates without placeholders
# can be raised with the message alone. Three-argument callers are unaffected.
#
# Never returns normally: it always throws ArgumentError.
@noinline function utf_errfunc(message, charpos = 0, invchar = 0)
    text = replace(message, "<<1>>", string(charpos))
    text = replace(text, "<<2>>", hex(invchar))
    throw(ArgumentError(text))
end

0 commit comments

Comments
 (0)