@@ -5,25 +5,277 @@ next(s::UTF32String, i::Int) = (s.data[i], i+1)
5
5
# `s.data` carries one extra trailing element (the NULL terminator written by the
# converters in this file), so the last valid index is one less than the array length.
function endof(s::UTF32String)
    return length(s.data) - 1
end

# Number of characters, not counting the trailing element of `s.data`.
function length(s::UTF32String)
    return length(s.data) - 1
end

# Reverse the first `length(s)` elements of a copy of the data; the trailing
# element stays in place.
function reverse(s::UTF32String)
    chars = copy(s.data)
    reverse!(chars, 1, length(s))
    return UTF32String(chars)
end

# Size in bytes of the data, minus the size of one `Char` (the trailing element).
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end

# Shared instance whose data is a single zero element.
const empty_utf32 = UTF32String(UInt32[0])

# Convenience wrapper around `convert(UTF32String, x)`.
function utf32(x)
    return convert(UTF32String, x)
end

# A single character becomes a two-element array: the character plus `Char(0)`.
function convert(::Type{UTF32String}, c::Char)
    return UTF32String(Char[c, Char(0)])
end

# Already a `UTF32String`: returned as is, without copying.
function convert(::Type{UTF32String}, s::UTF32String)
    return s
end
11
17
12
- function convert (:: Type{UTF32String} , s:: AbstractString )
13
- a = Array (Char, length (s) + 1 )
14
- i = 0
15
- for c in s
16
- a[i += 1 ] = c
"
Converts an `AbstractString` to a `UTF32String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::AbstractString`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError` (presumably raised by `check_string` when `str` is rejected)
"
function convert(::Type{UTF32String}, str::AbstractString)
    # `check_string` supplies the character count; `flags` is not needed here
    len, flags = check_string(str)
    # one extra slot for the trailing NULL
    buf = Vector{Char}(len+1)
    out = 0
    @inbounds for ch in str
        out += 1
        buf[out] = ch
    end
    @inbounds buf[out + 1] = 0 # NULL termination
    UTF32String(buf)
end
39
+
"
Builds a `UTF8String` from a vector of `UInt32` holding UTF-32 encoded data

### Input Arguments:
* `::Type{UTF8String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, dat::Vector{UInt32})
    nbytes = sizeof(dat)
    # nothing to convert: hand back the shared empty string
    if nbytes == 0
        return empty_utf8
    end
    # check all `nbytes >>> 2` elements and collect the counts used to size the output
    cnt, flags, num4byte, num3byte, num2byte = check_string(dat, nbytes >>> 2)
    if flags == 0
        # no flags set: each element is copied into exactly one output byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(cnt), 1, dat, 1, cnt))
    end
    # one byte per element, plus the extra bytes of the 2-, 3- and 4-byte sequences
    nout = cnt + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, dat, nout)
end
62
+
"
Builds a `UTF8String` from a `UTF32String`

### Input Arguments:
* `::Type{UTF8String}`
* `str::UTF32String`

### Returns:
* `::UTF8String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF8String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nwords = sizeof(dat) >>> 2
    # only the trailing element (or nothing at all) is present: empty string
    if nwords <= 1
        return empty_utf8
    end
    # check everything except the trailing element and collect the counts
    # used to size the output
    cnt, flags, num4byte, num3byte, num2byte = check_string(dat, nwords - 1)
    if flags == 0
        # no flags set: each element is copied into exactly one output byte
        @inbounds return UTF8String(copy!(Vector{UInt8}(cnt), 1, dat, 1, cnt))
    end
    # one byte per element, plus the extra bytes of the 2-, 3- and 4-byte sequences
    nout = cnt + num2byte + num3byte*2 + num4byte*3
    return encode_to_utf8(UInt32, dat, nout)
end
86
+
"
Builds a `UTF32String` from a `UTF8String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF8String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::UTF8String)
    bytes = str.data
    # no bytes at all: hand back the shared empty string
    if sizeof(bytes) == 0
        return empty_utf32
    end
    # check the UTF-8 data and obtain the number of characters to produce
    nchars, flags = check_string(bytes)
    # no flags set: plain element-wise copy
    if flags == 0
        @inbounds return fast_utf_copy(UTF32String, Char, nchars, bytes, true)
    end
    # multi-byte sequences are present: decode one character per iteration
    chars = Vector{Char}(nchars+1)
    @inbounds chars[nchars+1] = 0 # NULL termination
    local ch::UInt32, surr::UInt32
    widx = 0   # last index written in `chars`
    ridx = 0   # last index read from `bytes`
    @inbounds while widx < nchars
        ridx += 1
        ch = bytes[ridx]
        if ch <= 0x7f
            # single byte (ASCII)
            widx += 1
            chars[widx] = ch
        elseif ch < 0xe0
            # two-byte sequence, range 0x80-0x7ff
            ridx += 1
            widx += 1
            chars[widx] = ((ch & 0x1f) << 6) | (bytes[ridx] & 0x3f)
        elseif ch < 0xf0
            # three-byte sequence, range 0x800-0xffff
            ridx += 2
            ch = get_utf8_3byte(bytes, ridx, ch)
            # a lead surrogate here (should have been encoded in 4 bytes) is
            # followed by its trailing surrogate in the next 3 bytes
            if is_surrogate_lead(ch)
                ridx += 3
                hi  = UInt32(bytes[ridx-2] & 0xf) << 12
                mid = UInt32(bytes[ridx-1] & 0x3f) << 6
                lo  = bytes[ridx] & 0x3f
                surr = hi | mid | lo
                ch = get_supplementary(ch, surr)
            end
            widx += 1
            chars[widx] = ch
        else
            # four-byte sequence, range 0x10000-0x10ffff
            ridx += 3
            widx += 1
            chars[widx] = get_utf8_4byte(bytes, ridx, ch)
        end
    end
    UTF32String(chars)
end
144
+
"
Builds a `UTF32String` from a `UTF16String`

### Input Arguments:
* `::Type{UTF32String}`
* `str::UTF16String`

### Returns:
* `::UTF32String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF32String}, str::UTF16String)
    words = str.data
    nbytes = sizeof(words)
    # two bytes or fewer means only the trailing \0 is present: empty string
    if nbytes <= 2
        return empty_utf32
    end
    # check all `nbytes >>> 1` words and obtain the number of elements to create
    nchars, flags, num4byte = check_string(words, nbytes >>> 1)
    # no surrogate pairs flagged: straight element-wise copy
    if (flags & UTF_UNICODE4) == 0
        @inbounds return UTF32String(copy!(Vector{Char}(nchars), words))
    end
    local ch::UInt32
    chars = Vector{Char}(nchars)
    widx = 0   # last index written in `chars`
    ridx = 0   # last index read from `words`
    @inbounds while widx < nchars
        ridx += 1
        ch = words[ridx]
        # a lead surrogate is combined with the word that follows it
        if is_surrogate_lead(ch)
            ridx += 1
            ch = get_supplementary(ch, words[ridx])
        end
        widx += 1
        chars[widx] = ch
    end
    UTF32String(chars)
end
179
+
"
Builds a `UTF16String` from a vector of `UInt32` holding UTF-32 encoded data

### Input Arguments:
* `::Type{UTF16String}`
* `dat::Vector{UInt32}`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, dat::Vector{UInt32})
    nbytes = sizeof(dat)
    # NOTE(review): a one-element vector (4 bytes) is also treated as empty here,
    # whereas the `UTF8String` conversion from `Vector{UInt32}` only special-cases
    # 0 bytes -- confirm whether `dat` is expected to end with a NULL element.
    if nbytes <= 4
        return empty_utf16
    end
    # check all `nbytes >>> 2` elements and obtain the element and 4-byte counts
    nwords, flags, num4byte = check_string(dat, nbytes >>> 2)
    # one extra word per supplementary character, plus one more word
    nwords += num4byte + 1
    # no supplementary characters: no surrogates needed, take the copy path
    if num4byte == 0
        @inbounds return fast_utf_copy(UTF16String, UInt16, nwords, dat)
    end
    return encode_to_utf16(dat, nwords)
end
204
+
"
Builds a `UTF16String` from a `UTF32String`

### Input Arguments:
* `::Type{UTF16String}`
* `str::UTF32String`

### Returns:
* `::UTF16String`

### Throws:
* `UnicodeError`
"
function convert(::Type{UTF16String}, str::UTF32String)
    dat = reinterpret(UInt32, str.data)
    nbytes = sizeof(dat)
    # four bytes or fewer: at most the trailing element is present, so empty string
    if nbytes <= 4
        return empty_utf16
    end
    # check all `nbytes >>> 2` elements and obtain the element and 4-byte counts
    nwords, flags, num4byte = check_string(dat, nbytes >>> 2)
    # no supplementary characters: no surrogates needed, element-wise copy
    if num4byte == 0
        @inbounds return UTF16String(copy!(Vector{UInt16}(nwords), dat))
    end
    # every supplementary character takes one additional 16-bit word
    return encode_to_utf16(dat, nwords + num4byte)
end
229
+
"
Encodes UTF-32 data that has already been validated into a `UTF16String`

### Input Arguments:
* `dat` UTF-32 encoded data (each element is converted with `UInt32`)
* `len` length of output in 16-bit words

### Returns:
* `::UTF16String`
"
function encode_to_utf16(dat, len)
    words = Vector{UInt16}(len)
    @inbounds words[len] = 0 # NULL termination
    widx = 0   # last index written in `words`
    ridx = 0   # last index read from `dat`
    @inbounds while widx < len
        ridx += 1
        ch = UInt32(dat[ridx])
        if ch > 0xffff
            # 0x10000-0x10ffff becomes a surrogate pair; 0xd7c0 is
            # 0xd800 - (0x10000 >>> 10), folding the offset into the lead word
            widx += 1
            words[widx] = 0xd7c0 + (ch >>> 10)
            ch = 0xdc00 + (ch & 0x3ff)
        end
        widx += 1
        words[widx] = ch
    end
    UTF16String(words)
end
256
+
257
# A `Vector{Char}` is reinterpreted as `UInt32` elements and forwarded to the
# corresponding `Vector{UInt32}` method.
convert(::Type{UTF8String}, dat::Vector{Char}) = convert(UTF8String, reinterpret(UInt32, dat))
convert(::Type{UTF16String}, dat::Vector{Char}) = convert(UTF16String, reinterpret(UInt32, dat))

# `convert(::Type{UTF32String}, c::Char)` is not repeated here: the identical
# method is already defined earlier in this file, and a second copy would only
# overwrite it.
261
+
262
# Builds a `UTF32String` from an `ASCIIString` by handing the string's data and
# its element count to `fast_utf_copy`.
function convert(::Type{UTF32String}, str::ASCIIString)
    bytes = str.data
    nchars = length(bytes)
    @inbounds return fast_utf_copy(UTF32String, Char, nchars, bytes, true)
end
266
+
267
# Builds a `UTF32String` from a vector of `Char`.
#
# This signature used to be defined three times in a row. Each later definition
# overwrote the earlier one, so only the last body was ever in effect; that body
# is kept here as the single definition.
function convert(::Type{UTF32String}, data::AbstractVector{Char})
    len = length(data)
    # copy the characters into a buffer one element longer, then write 0 into
    # the extra last slot
    buf = copy!(Vector{Char}(len+1), 1, data, 1, len)
    @inbounds return UTF32String(setindex!(buf, 0, len+1))
end
28
280
29
281
convert {T<:Union{Int32,UInt32}} (:: Type{UTF32String} , data:: AbstractVector{T} ) =
@@ -46,12 +298,11 @@ convert(::Type{Array{Char}}, str::UTF32String) = str.data
46
298
47
299
# `reverse(s::UTF32String)` is not repeated here: the identical method is already
# defined near the top of this file, and a second copy would only overwrite it.

# Converts the pointer returned by `pointer(s)` to a `Ptr{T}` for the 32-bit
# element types listed in the signature.
unsafe_convert{T<:Union{Int32,UInt32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
    convert(Ptr{T}, pointer(s))
52
303
53
304
function convert (T:: Type{UTF32String} , bytes:: AbstractArray{UInt8} )
54
- isempty (bytes) && return UTF32String (Char[ 0 ])
305
+ isempty (bytes) && return empty_utf32
55
306
length (bytes) & 3 != 0 && throw (UnicodeError (UTF_ERR_ODD_BYTES_32,0 ,0 ))
56
307
data = reinterpret (Char, bytes)
57
308
# check for byte-order mark (BOM):
@@ -79,6 +330,8 @@ function isvalid(::Type{UTF32String}, str::Union{Vector{Char}, Vector{UInt32}})
79
330
end
80
331
# For a bare `Vector{Char}`, validity is checked as `UTF32String` data.
isvalid(str::Vector{Char}) = isvalid(UTF32String, str)

# `utf32(x)` is not repeated here: the identical method is already defined
# earlier in this file, and a second copy would only overwrite it.

# Build a string from `len` elements starting at a raw pointer.
utf32(p::Ptr{Char}, len::Integer) = utf32(pointer_to_array(p, len))
# `UInt32`/`Int32` pointers are converted to `Ptr{Char}` and forwarded to the method above.
utf32(p::Union{Ptr{UInt32}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{Char}, p), len)
84
337
function utf32 (p:: Union{Ptr{Char}, Ptr{UInt32}, Ptr{Int32}} )
0 commit comments