1
1
# This file is a part of Julia. License is MIT: http://julialang.org/license
2
2
3
- utf16_is_lead (c:: UInt16 ) = (c & 0xfc00 ) == 0xd800
4
- utf16_is_trail (c:: UInt16 ) = (c & 0xfc00 ) == 0xdc00
5
- utf16_is_surrogate (c:: UInt16 ) = (c & 0xf800 ) == 0xd800
6
- utf16_get_supplementary (lead:: UInt16 , trail:: UInt16 ) = Char (UInt32 (lead- 0xd7f7 )<< 10 + trail)
3
# Build an `S` string from the code units in `dat`.
#
# A vector of `T` with `len+1` slots is allocated, filled from `dat`, and its
# last slot is then overwritten with 0 so the result is always NUL terminated.
# When `flag` is true only `len` elements are taken from `dat`; otherwise
# `len+1` elements are copied before the last slot is overwritten.
@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, Char}}(
                  ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
    buf = Vector{T}(len+1)
    ncopy = flag ? len : len+1
    copy!(buf, 1, dat, 1, ncopy)
    buf[len+1] = 0 # trailing \0
    return S(buf)
end
8
+
9
# Finish decoding a 3-byte UTF-8 sequence.
#
# `ch` is the lead byte and `pos` indexes the LAST byte of the sequence in
# `dat`, so the two continuation bytes are `dat[pos-1]` and `dat[pos]`.
# Indexing is done under `@inbounds`; the caller must guarantee `pos-1:pos`
# is valid.
@inline function get_utf8_3byte(dat, pos, ch)
    @inbounds begin
        top    = (ch & 0xf) << 12                   # low 4 bits of the lead byte
        middle = UInt32(dat[pos-1] & 0x3f) << 6     # 6 payload bits
        bottom = dat[pos] & 0x3f                    # 6 payload bits
    end
    return top | middle | bottom
end
13
# Finish decoding a 4-byte UTF-8 sequence.
#
# `ch` is the lead byte and `pos` indexes the LAST byte of the sequence in
# `dat`, so the three continuation bytes are `dat[pos-2]`, `dat[pos-1]` and
# `dat[pos]`. Indexing is done under `@inbounds`; the caller must guarantee
# `pos-2:pos` is valid.
@inline function get_utf8_4byte(dat, pos, ch)
    @inbounds begin
        b1 = (ch & 0x7) << 18                    # low 3 bits of the lead byte
        b2 = UInt32(dat[pos-2] & 0x3f) << 12     # 6 payload bits each below
        b3 = UInt32(dat[pos-1] & 0x3f) << 6
        b4 = dat[pos] & 0x3f
    end
    return b1 | b2 | b3 | b4
end
20
+
21
# Write `ch` as a 4-byte UTF-8 sequence into `buf[out+1]` .. `buf[out+4]`.
#
# `out` is the index of the last slot already written; it is not modified
# here, so the caller advances it by 4. Stores are done under `@inbounds`.
@inline function output_utf8_4byte!(buf, out, ch)
    lead  = 0xf0 | (ch >>> 18)
    cont1 = 0x80 | ((ch >>> 12) & 0x3f)
    cont2 = 0x80 | ((ch >>> 6) & 0x3f)
    cont3 = 0x80 | (ch & 0x3f)
    @inbounds begin
        buf[out + 1] = lead
        buf[out + 2] = cont1
        buf[out + 3] = cont2
        buf[out + 4] = cont3
    end
end
30
+
31
# Shared zero-length UTF-16 string: its data is just the single NUL terminator.
const empty_utf16 = UTF16String(zeros(UInt16, 1))
7
32
8
33
# Number of characters in `s`.
#
# Every code unit except the final one (the NUL terminator) is inspected;
# trail surrogates are not counted, so a surrogate pair counts once.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1
    nchars = 0
    @inbounds for idx = 1:nunits
        if !is_surrogate_trail(units[idx])
            nchars += 1
        end
    end
    return nchars
end
@@ -20,100 +45,220 @@ function endof(s::UTF16String)
20
45
d = s. data
21
46
i = length (d) - 1
22
47
i == 0 && return i
23
- utf16_is_surrogate (d[i]) ? i- 1 : i
48
+ return is_surrogate_codeunit (d[i]) ? i- 1 : i
24
49
end
25
50
51
# Combine a lead/trail surrogate pair into the supplementary code point value.
# Subtracting 0xd7f7 from the lead folds the 0x10000 offset and the removal of
# the trail's 0xdc00 base into a single constant.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    shifted = UInt32(lead - 0xd7f7) << 10
    return shifted + trail
end
52
+
26
53
# Decode the character that starts at code-unit index `i` of `s`.
#
# Returns a tuple `(char, nextindex)`: `nextindex` is `i+1` for a single code
# unit and `i+2` for a surrogate pair.
#
# Throws `UnicodeError` when
#   * `i` is the last code unit before the terminator and holds a surrogate
#     (`UTF_ERR_MISSING_SURROGATE`),
#   * the code unit at `i` is a surrogate but not a lead (`UTF_ERR_NOT_LEAD`),
#   * the code unit at `i+1` is not a trail surrogate (`UTF_ERR_NOT_TRAIL`).
function next(s::UTF16String, i::Int)
    ch = s.data[i]
    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
    # check length, account for terminating \0
    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
    ct = s.data[i+1]
    # must raise a UnicodeError like the other failure paths, not a bare tuple
    !is_surrogate_trail(ct) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, i, ch))
    return Char(get_supplementary(ch, ct)), i+2
end
34
63
35
64
# Map index `i` of `reverse(s)` back to a code-unit index of `s`.
# If the mirrored position lands on a trail surrogate, step back one unit so
# the result points at the start of the pair.
function reverseind(s::UTF16String, i::Integer)
    mirrored = length(s.data) - i
    if is_surrogate_trail(s.data[mirrored])
        return mirrored - 1
    else
        return mirrored
    end
end
39
68
40
69
# Index of the last code unit of `s`; `s.data` includes the NULL terminator,
# which is excluded here.
function lastidx(s::UTF16String)
    return length(s.data) - 1
end
41
70
42
71
# Return a new `UTF16String` with the characters of `s` in reverse order.
#
# Code units are copied back-to-front (the terminator is skipped). Reading
# backwards meets the trail surrogate of a pair before its lead, so when a
# lead is seen it is swapped with the unit written just before it, restoring
# lead-then-trail order in the output.
function reverse(s::UTF16String)
    src = s.data
    total = length(src)
    dst = similar(src)
    dst[end] = 0 # NULL termination
    @inbounds for k = 1:total-1
        unit = src[total-k]
        if !is_surrogate_lead(unit)
            dst[k] = unit
        else
            # move the previously written unit forward, put the lead before it
            dst[k] = dst[k-1]
            dst[k-1] = unit
        end
    end
    return UTF16String(dst)
end
55
86
56
- # TODO : optimize this
57
- function encode16 (s:: AbstractString )
58
- buf = UInt16[]
59
- for ch in s
60
- c = reinterpret (UInt32, ch)
87
# Size of `s` in bytes, excluding one `UInt16` (the terminator stored in `s.data`).
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
88
+
89
# Report whether `data` contains no unpaired surrogates.
#
# `data` may or may not include a NULL terminator. A lead surrogate must be
# immediately followed by a trail surrogate; any other surrogate code unit
# makes the data invalid.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    i = 1
    @inbounds while i < n
        unit = data[i]
        if !is_surrogate_codeunit(unit)
            i += 1
        elseif is_surrogate_lead(unit) && is_surrogate_trail(data[i+1])
            i += 2 # well-formed pair
        else
            return false # trail without lead, or lead without trail
        end
    end
    # if exactly one unit is left over, it must not be a surrogate
    return i > n || !is_surrogate_codeunit(data[i])
end
103
+
104
"
Converts an `AbstractString` to a `UTF16String`

Characters below 0x10000 are stored as one code unit; all others are stored
as a lead/trail surrogate pair. The result is NULL terminated.

### Returns:
*   `UTF16String`

### Throws:
*   `UnicodeError`
"
function convert(::Type{UTF16String}, str::AbstractString)
    # unsafe_checkstring is defined elsewhere; its results are used here as the
    # character count and the number of characters needing a surrogate pair
    len, flags, num4byte = unsafe_checkstring(str)
    buf = Vector{UInt16}(len+num4byte+1)
    out = 0
    @inbounds for ch in str
        c = UInt32(ch)
        if c < 0x10000
            buf[out += 1] = UInt16(c)
        else
            # output surrogate pair; do the arithmetic on the numeric value
            # `c`, not on the `Char` itself
            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
        end
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    return UTF16String(buf)
end
73
130
74
- utf16 (x) = convert (UTF16String, x)
75
- convert (:: Type{UTF16String} , s:: UTF16String ) = s
76
- convert (:: Type{UTF16String} , s:: AbstractString ) = encode16 (s)
77
- convert (:: Type{Array{UInt16,1}} , s:: UTF16String ) = s. data
78
- convert (:: Type{Array{UInt16}} , s:: UTF16String ) = s. data
131
"
Converts a `UTF8String` to a `UTF16String`

The UTF-8 bytes are checked first, then decoded sequence by sequence;
4-byte sequences become surrogate pairs.

### Returns:
*   `UTF16String`

### Throws:
*   `UnicodeError`
"
function convert(::Type{UTF16String}, str::UTF8String)
    dat = str.data
    # nothing to decode: share the preallocated empty string
    if sizeof(dat) == 0
        return empty_utf16
    end
    # Validate the encoding and work out how many 16-bit words are needed
    # (each 4-byte sequence needs one extra word for its surrogate pair)
    len, flags, num4byte = unsafe_checkstring(dat)
    len += num4byte
    buf = Vector{UInt16}(len+1)
    @inbounds buf[len+1] = 0
    # flags == 0: plain element-wise copy is enough, no decoding required
    if flags == 0
        @inbounds return UTF16String(copy!(buf, dat))
    end
    out = 0
    pos = 0
    @inbounds while out < len
        pos += 1
        ch::UInt32 = dat[pos]
        if ch <= 0x7f
            # ASCII
            out += 1
            buf[out] = ch
        elseif ch < 0xe0
            # 2-byte sequence, range 0x80-0x7ff
            pos += 1
            out += 1
            buf[out] = ((ch & 0x1f) << 6) | (dat[pos] & 0x3f)
        elseif ch < 0xf0
            # 3-byte sequence, range 0x800-0xffff
            pos += 2
            out += 1
            buf[out] = get_utf8_3byte(dat, pos, ch)
        else
            # 4-byte sequence, range 0x10000-0x10ffff: emit a surrogate pair
            pos += 3
            ch = get_utf8_4byte(dat, pos, ch)
            out += 2
            buf[out-1] = UInt16(0xd7c0 + (ch >>> 10))
            buf[out] = UInt16(0xdc00 + (ch & 0x3ff))
        end
    end
    return UTF16String(buf)
end
87
176
88
- function isvalid (:: Type{UTF16String} , data:: AbstractArray{UInt16} )
89
- i = 1
90
- n = length (data) # this may include NULL termination; that's okay
91
- while i < n # check for unpaired surrogates
92
- if utf16_is_lead (data[i]) && utf16_is_trail (data[i+ 1 ])
93
- i += 2
94
- elseif utf16_is_surrogate (data[i])
95
- return false
177
"
Converts a `UTF16String` to a `UTF8String`

The code units (terminator excluded) are checked first; the output size is
computed from the counts returned by that check.

### Returns:
*   `UTF8String`

### Throws:
*   `UnicodeError`
"
function convert(::Type{UTF8String}, str::UTF16String)
    units = str.data
    nunits = sizeof(units) >>> 1 # two bytes per code unit
    # only the terminator (or nothing at all): share the empty string
    if nunits <= 1
        return empty_utf8
    end
    # check the data, leaving out the final code unit, and get the counts
    # needed to size the output
    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(units, 1, nunits-1)
    if flags == 0
        # plain element-wise copy is enough, no encoding required
        bytes = Vector{UInt8}(len)
        @inbounds copy!(bytes, 1, units, 1, len)
        return UTF8String(bytes)
    end
    nbytes = len + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt16, units, nbytes)
end
196
+
197
"
Converts an already validated vector of `UInt16` or `UInt32` to a `UTF8String`

Encoding stops as soon as `len` bytes have been produced, so code units
beyond that point (such as a terminator) are never read.

### Input Arguments:
*   `dat` Vector of code units (`UInt16` or `UInt32`)
*   `len` length of output in bytes

### Returns:
*   `UTF8String`
"
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
    buf = Vector{UInt8}(len)
    out = 0 # last byte written to buf
    pos = 0 # last code unit read from dat
    @inbounds while out < len
        pos += 1
        ch::UInt32 = dat[pos]
        if ch <= 0x7f
            # ASCII: one byte
            out += 1
            buf[out] = ch
        elseif ch < 0x800
            # 0x80-0x7ff: two bytes
            buf[out + 1] = 0xc0 | (ch >>> 6)
            buf[out + 2] = 0x80 | (ch & 0x3f)
            out += 2
        elseif ch > 0xffff
            # 0x10000-0x10ffff given directly; only reachable for T == UInt32
            output_utf8_4byte!(buf, out, ch)
            out += 4
        elseif is_surrogate_codeunit(ch)
            # surrogate pair: consume the following unit as the trail
            pos += 1
            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos]))
            out += 4
        else
            # 0x800-0xd7ff, 0xe000-0xffff: three bytes
            buf[out + 1] = 0xe0 | ((ch >>> 12) & 0x3f)
            buf[out + 2] = 0x80 | ((ch >>> 6) & 0x3f)
            buf[out + 3] = 0x80 | (ch & 0x3f)
            out += 3
        end
    end
    return UTF8String(buf)
end
102
237
103
- function convert (:: Type{UTF16String} , data:: AbstractVector{UInt16} )
104
- ! isvalid (UTF16String, data) && throw (UnicodeError (UTF_ERR_INVALID_16,0 ,0 ))
105
- len = length (data)
106
- d = Array (UInt16, len + 1 )
107
- d[end ] = 0 # NULL terminate
108
- UTF16String (copy! (d,1 , data,1 , len))
238
# Convert an `ASCIIString` to a `UTF16String` by copying its bytes into
# 16-bit code units and appending the terminator (no validation is performed
# here).
function convert(::Type{UTF16String}, str::ASCIIString)
    bytes = str.data
    nbytes = length(bytes)
    @inbounds result = fast_utf_copy(UTF16String, UInt16, nbytes, bytes, true)
    return result
end
110
242
243
# Expose the underlying code-unit vector of a `UTF16String`. The vector is
# returned as-is (no copy), so it still contains the terminator.
function convert(::Type{Vector{UInt16}}, str::UTF16String)
    return str.data
end
function convert(::Type{Array{UInt16}}, str::UTF16String)
    return str.data
end

# Converting a `UTF16String` to its own type returns the same object.
function convert(::Type{UTF16String}, str::UTF16String)
    return str
end

# Pointer to the string data, typed as `Ptr{Int16}` or `Ptr{UInt16}`.
function unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end
250
+
111
251
# Any `UInt16` array: flatten it to a vector and convert that.
function convert(T::Type{UTF16String}, data::AbstractArray{UInt16})
    flat = reshape(data, length(data))
    return convert(T, flat)
end

# Any `Int16` array: reinterpret the elements as `UInt16` and convert that.
function convert(T::Type{UTF16String}, data::AbstractArray{Int16})
    unsigned_units = reinterpret(UInt16, data)
    return convert(T, unsigned_units)
end
116
256
257
# Convert a vector of `UInt16` code units to a `UTF16String`.
#
# The data is passed through `unsafe_checkstring` first; the number of code
# units to copy is the returned length plus the returned 4-byte count.
# A terminator is appended by `fast_utf_copy`.
function convert(::Type{UTF16String}, dat::AbstractVector{UInt16})
    len, flags, num4byte = unsafe_checkstring(dat)
    nunits = len + num4byte
    @inbounds result = fast_utf_copy(UTF16String, UInt16, nunits, dat, true)
    return result
end
261
+
117
262
function convert (T:: Type{UTF16String} , bytes:: AbstractArray{UInt8} )
118
263
isempty (bytes) && return UTF16String (UInt16[0 ])
119
264
isodd (length (bytes)) && throw (UnicodeError (UTF_ERR_ODD_BYTES_16, length (bytes), 0 ))
@@ -136,6 +281,9 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
136
281
UTF16String (d)
137
282
end
138
283
284
# The identity method `convert(::Type{UTF16String}, ::UTF16String)` is already
# defined earlier in this file, so it is not repeated here.

# Convert `x` to a `UTF16String`.
utf16(x) = convert(UTF16String, x)

# Build a `UTF16String` from `len` code units starting at pointer `p`.
utf16(p::Ptr{UInt16}, len::Integer) = utf16(pointer_to_array(p, len))
# Same, for a signed pointer: the pointer is converted to `Ptr{UInt16}` first.
utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
141
289
function utf16 (p:: Union{Ptr{UInt16}, Ptr{Int16}} )
@@ -154,7 +302,7 @@ function map(fun, str::UTF16String)
154
302
end
155
303
uc = reinterpret (UInt32, c2)
156
304
if uc < 0x10000
157
- if utf16_is_surrogate (UInt16 (uc))
305
+ if is_surrogate_codeunit (UInt16 (uc))
158
306
throw (UnicodeError (UTF_ERR_INVALID_CHAR, 0 , uc))
159
307
end
160
308
push! (buf, UInt16 (uc))
0 commit comments