1
1
# This file is a part of Julia. License is MIT: http://julialang.org/license
2
2
3
- immutable UTF16String <: AbstractString
4
- data:: Array{UInt16,1} # includes 16-bit NULL termination after string chars
5
- function UTF16String (data:: Vector{UInt16} )
6
- if length (data) < 1 || data[end ] != 0
7
- throw (ArgumentError (" UTF16String data must be NULL-terminated" ))
8
- end
9
- new (data)
10
- end
11
- end
12
-
13
- utf16_is_lead (c:: UInt16 ) = (c & 0xfc00 ) == 0xd800
14
- utf16_is_trail (c:: UInt16 ) = (c & 0xfc00 ) == 0xdc00
15
- utf16_is_surrogate (c:: UInt16 ) = (c & 0xf800 ) == 0xd800
16
- utf16_get_supplementary (lead:: UInt16 , trail:: UInt16 ) = Char (UInt32 (lead- 0xd7f7 )<< 10 + trail)
17
-
18
3
# Number of characters in `s`.
#
# Walks every code unit of `s.data` except the final one (the NULL
# terminator) and counts each unit for which `is_surrogate_trail` is false,
# so a lead/trail surrogate pair contributes one character, not two.
# `is_surrogate_trail` is defined elsewhere.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1      # exclude the NULL terminator
    nchars = 0
    idx = 1
    while idx <= nunits
        @inbounds if !is_surrogate_trail(units[idx])
            nchars += 1
        end
        idx += 1
    end
    return nchars
end
# Index of the code unit at which the last character of `s` starts.
#
# The last real code unit sits at `length(s.data) - 1` because `s.data`
# carries a NULL terminator.  When that unit is a surrogate code unit the
# index one position earlier is returned instead.  An empty string
# (only the terminator present) yields 0.
function endof(s::UTF16String)
    units = s.data
    tail = length(units) - 1
    if tail == 0
        return tail
    end
    if is_surrogate_codeunit(units[tail])
        return tail - 1
    end
    return tail
end
35
20
21
# Combine a lead and a trail surrogate code unit into one numeric value.
#
# The constant 0xd7f7 is 0xd800 - 9, and 9 << 10 == 0x2400 == 0x10000 - 0xdc00,
# so the expression equals
#     ((lead - 0xd800) << 10) + (trail - 0xdc00) + 0x10000
# computed with a single subtraction.  No validation of `lead` or `trail`
# is done here; the result is returned as a number, not as a `Char`.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    return (UInt32(lead - 0xd7f7) << 10) + trail
end
22
+
36
23
# Iteration step: decode the character starting at code-unit index `i`.
#
# Returns a tuple `(char, nextindex)`.  A non-surrogate unit is returned
# directly with `i+1`; otherwise the unit at `i` and the one at `i+1` are
# combined through `get_supplementary` and `i+2` is returned.
# Problems are reported through `utf_errfunc` (defined elsewhere;
# presumably it throws -- confirm):
#   * UTF_ERR_MISSING_SURROGATE when no unit follows before the terminator
#   * UTF_ERR_NOT_LEAD          when the unit at `i` is not a lead surrogate
#   * UTF_ERR_NOT_TRAIL         when the unit at `i+1` is not a trail surrogate
function next(s::UTF16String, i::Int)
    units = s.data
    lead = units[i]
    if !is_surrogate_codeunit(lead)
        return (Char(lead), i+1)
    end
    # check length, account for terminating \0
    if i >= length(units) - 1
        utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(lead))
    end
    if !is_surrogate_lead(lead)
        utf_errfunc(UTF_ERR_NOT_LEAD, i, lead)
    end
    trail = units[i+1]
    if !is_surrogate_trail(trail)
        utf_errfunc(UTF_ERR_NOT_TRAIL, i, lead)
    end
    return (Char(get_supplementary(lead, trail)), i+2)
end
44
33
45
34
# Translate index `i` of the reversed string into an index of `s`.
#
# `s.data` holds one extra NULL unit, so `length(s.data) - i` is the mirrored
# code-unit position.  If that position holds a trail surrogate, the index
# just before it is returned instead.
function reverseind(s::UTF16String, i::Integer)
    units = s.data
    mirrored = length(units) - i
    if is_surrogate_trail(units[mirrored])
        return mirrored - 1
    else
        return mirrored
    end
end
49
38
50
39
# Index of the last code unit of `s`, not counting the terminator.
function lastidx(s::UTF16String)
    return length(s.data) - 1 # s.data includes NULL terminator
end
51
40
52
41
# Return a new UTF16String holding the characters of `s` in reverse order.
#
# The code units are copied back to front.  Because the source is walked
# backwards, the trail unit of a surrogate pair is written first; when its
# lead unit is met one step later the two output slots are swapped, so the
# pair keeps its lead-then-trail order in the result.
#
# Throws ArgumentError("invalid UTF16 data") when the last code unit of `s`
# is a lead surrogate.  Without this guard the swap would touch `out[0]`
# with bounds checking disabled.  No other validation is performed.
function reverse(s::UTF16String)
    d = s.data
    out = similar(d)
    out[end] = 0 # NULL termination
    n = length(d)
    @inbounds for i = 1:n-1
        ch = d[n-i]
        if is_surrogate_lead(ch)
            # i == 1 means the string ends in an unpaired lead surrogate;
            # there is no previously written slot to swap with.
            i == 1 && throw(ArgumentError("invalid UTF16 data"))
            out[i], out[i-1] = out[i-1], ch
        else
            out[i] = ch
        end
    end
    return UTF16String(out)
end
83
56
84
- utf16 (x) = convert (UTF16String, x)
85
- convert (:: Type{UTF16String} , s:: UTF16String ) = s
86
- convert (:: Type{UTF16String} , s:: AbstractString ) = encode16 (s)
87
- convert (:: Type{Array{UInt16,1}} , s:: UTF16String ) = s. data
88
- convert (:: Type{Array{UInt16}} , s:: UTF16String ) = s. data
89
-
90
- # TODO : optimize this
91
- convert (:: Type{UTF8String} , s:: UTF16String ) =
92
- sprint (length (s. data)- 1 , io-> for c in s; write (io,c:: Char ); end )
93
-
94
57
# Size in bytes of the string contents: the byte size of `s.data` minus one
# UInt16, i.e. the NULL terminator is not counted.
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
95
- unsafe_convert {T<:Union(Int16,UInt16)} (:: Type{Ptr{T}} , s:: UTF16String ) =
96
- convert (Ptr{T}, pointer (s))
97
58
98
59
# Report whether `data` contains no unpaired surrogate code units.
#
# Scans left to right: a lead surrogate immediately followed by a trail
# surrogate is skipped as a pair, any other surrogate code unit makes the
# result `false`.  The final unit is checked separately after the loop
# because the loop stops before it.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    n = length(data) # this may include NULL termination; that's okay
    pos = 1
    @inbounds while pos < n # check for unpaired surrogates
        unit = data[pos]
        if is_surrogate_lead(unit) && is_surrogate_trail(data[pos+1])
            pos += 2
            continue
        end
        is_surrogate_codeunit(unit) && return false
        pos += 1
    end
    # pos > n: the scan consumed everything (or data is empty)
    pos > n && return true
    return !is_surrogate_codeunit(data[pos])
end
112
73
74
# Obtain a raw `Ptr{Int16}` or `Ptr{UInt16}` for `s` by converting the result
# of `pointer(s)` to the requested pointer type.
function unsafe_convert{T<:Union(Int16,UInt16)}(::Type{Ptr{T}}, s::UTF16String)
    return convert(Ptr{T}, pointer(s))
end
76
+
113
77
# Build a UTF16String from a vector of UTF-16 code units.
#
# `data` is validated with `isvalid(UTF16String, data)` first; invalid input
# raises ArgumentError("invalid UTF16 data").  The units are then copied into
# a fresh buffer that is one element longer, and the extra last element is
# set to 0 as the NULL terminator.
function convert(::Type{UTF16String}, data::AbstractVector{UInt16})
    isvalid(UTF16String, data) || throw(ArgumentError("invalid UTF16 data"))
    len = length(data)
    buf = Vector{UInt16}(len+1)
    @inbounds begin
        copy!(buf, 1, data, 1, len)
        buf[len+1] = 0 # NULL terminate
    end
    return UTF16String(buf)
end
120
82
121
83
convert (T:: Type{UTF16String} , data:: AbstractArray{UInt16} ) =
@@ -146,6 +108,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
146
108
UTF16String (d)
147
109
end
148
110
111
# Generic entry point: convert `x` to a UTF16String via `convert`.
function utf16(x)
    return convert(UTF16String, x)
end
149
112
# Build a UTF16String from `len` UInt16 code units located at pointer `p`.
# The memory is wrapped with `pointer_to_array` and handed to `utf16`.
function utf16(p::Ptr{UInt16}, len::Integer)
    units = pointer_to_array(p, len)
    return utf16(units)
end
150
113
# Same as the Ptr{UInt16} method, for a pointer typed as Int16: the pointer
# is converted to Ptr{UInt16} and forwarded.
function utf16(p::Ptr{Int16}, len::Integer)
    up = convert(Ptr{UInt16}, p)
    return utf16(up, len)
end
151
114
function utf16 (p:: Union(Ptr{UInt16}, Ptr{Int16}) )
0 commit comments