From 3163363c5df258b3d3ae90309e73e90a3792b293 Mon Sep 17 00:00:00 2001 From: ScottPJones Date: Mon, 15 Oct 2018 22:01:12 -0400 Subject: [PATCH] Update to use tables for case --- src/StrBase.jl | 1 + src/casefold.jl | 130 +++++++++++------- src/charcase.jl | 130 ++++++++++++++++++ src/maketables.jl | 329 ++++++++++++++++++++++++++++++++++++++++++++++ src/utf16case.jl | 59 ++++----- src/utf8.jl | 2 +- src/utf8case.jl | 115 ++++++++-------- test/basic.jl | 29 +++- 8 files changed, 648 insertions(+), 147 deletions(-) create mode 100644 src/charcase.jl create mode 100644 src/maketables.jl diff --git a/src/StrBase.jl b/src/StrBase.jl index b3ad859..a38ca27 100644 --- a/src/StrBase.jl +++ b/src/StrBase.jl @@ -48,6 +48,7 @@ include("types.jl") @static V6_COMPAT && include("compat.jl") @static NEW_ITERATE && include("fixparse.jl") include("chars.jl") +include("charcase.jl") include("access.jl") include("traits.jl") include("utf8proc.jl") diff --git a/src/casefold.jl b/src/casefold.jl index 40c5ee2..6003296 100644 --- a/src/casefold.jl +++ b/src/casefold.jl @@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones Licensed under MIT License, see LICENSE.md =# -_wide_lower_l(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5) - -@inline _wide_lower_ch(ch) = - ch <= 0x7f ? _islower_a(ch) : (ch > 0xff ? _islower_u(ch) : _wide_lower_l(ch)) - -@inline _isupper_ch(ch) = - ch <= 0x7f ? _isupper_a(ch) : (ch > 0xff ? 
_isupper_u(ch) : _isupper_l(ch)) - -_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf)) - -_wide_out_upper(ch) = - ifelse(ch == 0xb5, 0x39c, - ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16))) - - function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}} (len = ncodeunits(str)) == 0 && return str @preserve str begin pnt = pointer(str) ch = get_codeunit(pnt) _islower_a(ch) || return str - out = _allocate(len) + buf, out = _allocate(UInt8, len) unsafe_copyto!(out, pnt, len) set_codeunit!(out, ch - 0x20) - Str(C, out) + Str(C, buf) end end @@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}} pnt = pointer(str) ch = get_codeunit(pnt) _isupper_a(ch) || return str - out = _allocate(len) + buf, out = _allocate(UInt8, len) unsafe_copyto!(out, pnt, len) set_codeunit!(out, ch + 0x20) - Str(C, out) + Str(C, buf) end end @@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}} _can_upper(ch) || return str buf, out = _allocate(UInt8, len) set_codeunit!(out, ch - 0x20) - len > 1 && unsafe_copyto!(out, pnt+1, len-1) + len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1) Str(C, buf) end end @@ -154,10 +139,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}} @preserve str begin pnt = pointer(str) ch = get_codeunit(pnt) - _isupper(ch) || return str + _isupper_al(ch) || return str buf, out = _allocate(UInt8, len) set_codeunit!(out, ch + 0x20) - len > 1 && unsafe_copyto!(out, pnt+1, len-1) + len > 1 && unsafe_copyto!(out+1, pnt+1, len-1) Str(C, buf) end end @@ -176,7 +161,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE} out += off while out < fin ch = get_codeunit(out) - _can_upper(ch) && set_codeunit!(out, ch - 0x20) + _islower(ch) && set_codeunit!(out, ch - 0x20) out += 1 end Str(C, buf) @@ -264,7 +249,7 @@ end # result must have at least one character > 0xff, so if the only 
character(s) # > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr -function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE} +function _lower(::Type{C}, beg, off, len) where {C<:Union{_UCS2CSE}} CU = codeunit(C) buf, out = _allocate(CU, len) unsafe_copyto!(out, beg, len) @@ -277,18 +262,20 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE} _isupper_a(ch) && set_codeunit!(out, ch += 0x20) elseif ch <= 0xff _isupper_l(ch) && set_codeunit!(out, ch += 0x20) - elseif _isupper_u(ch) - ch = _lowercase_u(ch) - flg = ch <= 0xff - set_codeunit!(out, ch) + elseif ch <= 0xffff + if _can_lower_bmp(ch) + ch = _lower_bmp(ch) + flg = ch <= 0xff + set_codeunit!(out, ch) + end end out += sizeof(CU) end if flg && is_latin(buf) out = pointer(buf) - buf = _allocate(len) - _narrow!(pointer(buf), out, out + len) - Str(_LatinCSE, buf) + buf8 = _allocate(len) + _narrow!(pointer(buf8), out, out + len) + Str(_LatinCSE, buf8) else Str(C, buf) end @@ -302,25 +289,74 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}} out += off while out < fin ch = get_codeunit(out) - if ch <= 0x7f - _isupper_a(ch) && set_codeunit!(out, ch += 0x20) - elseif ch <= 0xff - _isupper_l(ch) && set_codeunit!(out, ch += 0x20) - elseif _isupper_u(ch) - set_codeunit!(out, _lowercase_u(ch)) + if ch <= 0xff + _isupper_al(ch) && set_codeunit!(out, ch += 0x20) + elseif ch <= 0xffff + _can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch)) + elseif ch <= 0x1ffff + _can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch)) end out += sizeof(CU) end Str(C, buf) end +function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}} + (len = ncodeunits(str)) == 0 && return str + @preserve str begin + pnt = pointer(str) + ch = get_codeunit(pnt) + (ch <= 0xff ? _isupper_al(ch) : ch <= 0xffff ? 
_can_lower_bmp(ch) : + ch <= 0x1ffff && _can_lower_slp(ch)) || + return str + cl = _lower_ch(ch) + if ch > 0xff && cl <= 0xff && _check_mask_ul(pnt+1, len-1, _latin_mask(UInt16)) + buf8, out8 = _allocate(UInt8, len) + len > 1 && _narrow!(out8 + 1, pnt + 1, pnt + len - 1) + set_codeunit!(out8, cl) + Str(_LatinCSE, buf8) + else + buf, out = _allocate(codeunit(C), len) + len > 1 && unsafe_copyto!(out, pnt, len) + set_codeunit!(out, cl) + Str(C, buf) + end + end +end + +function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}} + (len = ncodeunits(str)) == 0 && return str + @preserve str begin + pnt = pointer(str) + ch = get_codeunit(pnt) + _can_title_ch(ch) || return str + buf, out = _allocate(codeunit(C), len) + len > 1 && unsafe_copyto!(out, pnt, len) + set_codeunit!(out, _title_ch(ch)) + Str(C, buf) + end +end + +function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}} + (len = ncodeunits(str)) == 0 && return str + @preserve str begin + pnt = pointer(str) + ch = get_codeunit(pnt) + _can_lower_ch(ch) || return str + buf, out = _allocate(codeunit(C), len) + len > 1 && unsafe_copyto!(out, pnt, len) + set_codeunit!(out, _lower_ch(ch)) + Str(C, buf) + end +end + function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}} @preserve str begin CU = codeunit(C) pnt = beg = pointer(str) fin = beg + sizeof(str) while pnt < fin - _isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str)) + _can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str)) pnt += sizeof(CU) end end @@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}} ch = get_codeunit(out) if ch <= 0x7f _islower_a(ch) && set_codeunit!(out, ch -= 0x20) - elseif ch > 0xff - _islower_u(ch) && set_codeunit!(out, _uppercase_u(ch)) - elseif _can_upper(ch) - set_codeunit!(out, ch -= 0x20) - elseif ch == 0xb5 - set_codeunit!(out, 0x39c) - 
elseif ch == 0xff - set_codeunit!(out, 0x178) - elseif !V6_COMPAT && ch == 0xdf - set_codeunit!(out, 0x1e9e) + elseif ch <= 0xff + set_codeunit!(out, _uppercase_l(ch)) + elseif ch <= 0xffff + _can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch)) + elseif ch <= 0x1ffff + _can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch)) end out += sizeof(CU) end @@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St pnt = beg = pointer(str) fin = beg + sizeof(str) while pnt < fin - _wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str)) + _can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str)) pnt += sizeof(CU) end str diff --git a/src/charcase.jl b/src/charcase.jl new file mode 100644 index 0000000..da31436 --- /dev/null +++ b/src/charcase.jl @@ -0,0 +1,130 @@ +#= +Case folding for Unicode characters + +Copyright 2018 Gandalf Software, Inc., Scott P. Jones +Licensed under MIT License, see LICENSE.md +=# + +module CaseTables +include("maketables.jl") + +const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables() +end # module CaseTables + +using .CaseTables + +const ct = CaseTables.ct + +using ModuleInterfaceTools +@api extend ChrBase + +_can_upper_lat(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5) + +_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf)) + +_wide_out_upper(ch) = + ifelse(ch == 0xb5, 0x39c, + ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16))) + +_check_tab(off, ch) = + off != 0 && (CaseTables.bitvec[off][((ch >>> 5) & 0xf) + 1] & (UInt32(1) << (ch & 0x1f))) != 0 + +@inline _get_tab(off, ch) = + off == 0 ? ch : (off = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]) == 0 ? ch : + CaseTables.tupvec[off][(ch & 0x1f) + 1] + +@inline _upper_lat(ch) = _get_tab(ct.u_tab[1], ch) + +@inline _upper_bmp(ch) = + (t = (ch >>> 9); ((ct.can_u >>> t) & 1) == 0 ? 
ch : _get_tab(ct.u_tab[(t>>1)+1], ch)) + +@inline _lower_bmp(ch) = + (t = (ch >>> 9); ((ct.can_l >>> t) & 1) == 0 ? ch : _get_tab(ct.l_tab[(t>>1)+1], ch)) + +@inline _title_bmp(ch) = + (t = (ch >>> 9); ((ct.can_u >>> t) & 1) == 0 ? ch : _get_tab(ct.t_tab[(t>>1)+1], ch)) + +@inline _upper_slp(ch) = + (t = (ch >>> 9); ((ct.s_can_u >>> (t & 0x7f)) & 1) == 0 ? ch : _get_tab(ct.u_tab[(t>>1)+1], ch)) + +@inline _lower_slp(ch) = + (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) == 0 ? ch : _get_tab(ct.l_tab[(t>>1)+1], ch)) + +# Handle range 0x0000-0xffff +@inline _can_lower_bmp(ch) = + (t = (ch >>> 9); ((ct.can_l >>> t) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)) + +# Handle range 0x10000-0x1ffff +@inline _can_lower_slp(ch) = + (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)) + +# Handle range 0x0000-0xffff +@inline _can_upper_bmp(ch) = + (t = (ch >>> 9); ((ct.can_u >>> t) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch)) + +# Handle range 0x10000-0x1ffff +@inline _can_upper_slp(ch) = + (t = (ch >>> 9); ((ct.s_can_u >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch)) + +#= +# Handle range 0x0000-0xffff +@inline _can_title_bmp(ch) = + (t = (ch >>> 9); ((ct.can_t >>> t) & 1) != 0 && _check_tab(ct.can_t_tab[t+1], ch)) +=# +const _can_title_bmp = _can_upper_bmp + +# Handle range 0x0000-0xffff +@inline _is_lower_bmp(ch) = + (t = (ch >>> 9); ((ct.can_l >>> t) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)) + +# Handle range 0x10000-0x1ffff +@inline _is_lower_slp(ch) = + (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)) + +# Handle range 0x0000-0xffff +@inline _is_upper_bmp(ch) = + (t = (ch >>> 9); ((ct.can_u >>> t) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch)) + +@inline _is_lower_ch(ch) = + ch <= 0x7f ? _islower_a(ch) : + ch <= 0xff ? _islower_l(ch) : + ch <= 0xffff ? _is_lower_bmp(ch) : + ch <= 0x1ffff ? 
_is_lower_slp(ch) : false + +@inline _is_upper_ch(ch) = + ch <= 0x7f ? _isupper_a(ch) : + ch <= 0xff ? _isupper_l(ch) : + ch <= 0xffff ? _is_upper_bmp(ch) : + ch <= 0x1ffff ? _is_upper_slp(ch) : false + +@inline _can_lower_ch(ch) = + ch <= 0x7f ? _isupper_a(ch) : + ch <= 0xff ? _isupper_l(ch) : + ch <= 0xffff ? _can_lower_bmp(ch) : + ch <= 0x1ffff ? _can_lower_slp(ch) : false + +@inline _can_upper_ch(ch) = + ch <= 0x7f ? _islower_a(ch) : + ch <= 0xff ? _can_upper_lat(ch) : + ch <= 0xffff ? _can_upper_bmp(ch) : + ch <= 0x1ffff ? _can_upper_slp(ch) : false + +const _can_title_ch = _can_upper_ch + +@inline _lower_ch(ch) = + ch <= 0x7f ? (_isupper_a(ch) ? ch + 0x20 : ch) : + ch <= 0xff ? (_isupper_l(ch) ? ch + 0x20 : ch) : + ch <= 0xffff ? _lower_bmp(ch) : + ch <= 0x1ffff ? _lower_slp(ch) : ch + +@inline _upper_ch(ch) = + ch <= 0x7f ? (_islower_a(ch) ? ch - 0x20 : ch) : + ch <= 0xff ? _upper_lat(ch) : + ch <= 0xffff ? _upper_bmp(ch) : + ch <= 0x1ffff ? _upper_slp(ch) : ch + +@inline _title_ch(ch) = + ch <= 0x7f ? (_islower_a(ch) ? ch - 0x20 : ch) : + ch <= 0xff ? _upper_lat(ch) : + ch <= 0xffff ? _title_bmp(ch) : + ch <= 0x1ffff ? _upper_slp(ch) : ch diff --git a/src/maketables.jl b/src/maketables.jl new file mode 100644 index 0000000..ad20d01 --- /dev/null +++ b/src/maketables.jl @@ -0,0 +1,329 @@ +#= +Copyright 2018 Gandalf Software, Inc., Scott P. 
Jones +Licensed under MIT License, see LICENSE.md +=# + +struct CaseTable + l_tab::NTuple{128,UInt8} + u_tab::NTuple{128,UInt8} + t_tab::NTuple{64,UInt8} + + is_l_tab::NTuple{256, UInt8} + is_u_tab::NTuple{256, UInt8} + can_l_tab::NTuple{256, UInt8} + can_u_tab::NTuple{256, UInt8} + + can_l::UInt128 + can_u::UInt128 + can_t::UInt128 + s_can_l::UInt128 + s_can_u::UInt128 + + is_l_flg::UInt128 + is_u_flg::UInt128 + is_sl_flg::UInt128 + is_su_flg::UInt128 + + siz_l::UInt128 + siz_u::UInt128 + max_siz_l::UInt32 + max_siz_u::UInt32 +end + +# Calculate tables (later move these to library built based on Unicode tables with BinaryBuilder, +# loaded by BinaryProvider) + +# For speed, we want to know which characters can be lowercased, uppercased, or titlecased +# We want to also know which ones increase or decrease in size in UTF-8 encoding + +@inline utf8_len(cp::Unsigned) = ifelse(cp <= 0x7f, 1, ifelse(cp <= 0x7ff, 2, 3 + (cp > 0xffff))) +@inline utf8_len(ch::Char) = utf8_len(ch%UInt32) + +const _z4 = 0%UInt32 + +const zerotup = (_z4, _z4, _z4, _z4, _z4, _z4, _z4, _z4, + _z4, _z4, _z4, _z4, _z4, _z4, _z4, _z4) + +function _add_to_bv!(vec, tmp) + tt = tuple(tmp...) 
+ tt == zerotup && return 0x0 + fill!(tmp, 0%UInt32) + for (i, t) in enumerate(vec) + (t == tt) && return UInt8(i) + end + push!(vec, tt) + UInt8(length(vec)) +end + +hex(a) = "0x" * string(a, base=16) + +# case tables should be: +# 128-bit bitmap to say whether a particular range of 512 code points can be folded +# 64-byte ntuple{32,UInt16} +function case_tables() + bitvec = NTuple{16,UInt32}[] # each element represents 512 characters + tupvec = NTuple{32,UInt16}[] # each element represents 32 characters + offvec = NTuple{32,UInt8}[] # each element maps 1024 character range to offsets in tupvec + + sizvecl = Pair{UInt16,UInt64}[] + sizvecu = Pair{UInt16,UInt64}[] + + # Top level tables + l_tab = fill(0x0, 128) + u_tab = fill(0x0, 128) + t_tab = fill(0x0, 64) + + # The elements of these are all offsets into bitvec + is_l_tab = fill(0x0, 256) + is_u_tab = fill(0x0, 256) + can_l_tab = fill(0x0, 256) + can_u_tab = fill(0x0, 256) + is_l_flg = is_u_flg = is_sl_flg = is_su_flg = 0%UInt128 + + # These tables get reused, stored in offvec + l_off = fill(0x0, 32) + u_off = fill(0x0, 32) + t_off = fill(0x0, 32) + + # These tables get reused, stored in tupvec + tmp_l = fill(0x0000, 32) + tmp_u = fill(0x0000, 32) + tmp_t = fill(0x0000, 32) + + # These tables get reused, stored in bitvec + bit_is_l = fill(0%UInt32, 16) + bit_is_u = fill(0%UInt32, 16) + bit_can_l = fill(0%UInt32, 16) + bit_can_u = fill(0%UInt32, 16) + + # Handle BMP + l_mid = u_mid = t_mid = false + can_l = can_u = can_t = 0%UInt128 + siz_l = siz_u = 0%UInt128 + max_siz_l = max_siz_u = 0%UInt32 + tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32 + for rng in (0x0080:0x20:0xd7e0, 0xe000:0x20:0xffe0), base in rng + hipos = (base >>> 10) + 1 + hibit = UInt128(1) << (base >>> 9) + midbits = ((base >>> 5) & 0x1f) + 1 + + # Handle block of 32 characters + l_flg = u_flg = t_flg = false + diff_l = diff_u = 0%UInt64 + for off = 0x00:0x1f + cp = UInt16(base + off) + lowbit = UInt32(1) << off + ch = Char(cp) + cl = 
UInt16(lowercase(ch)) + cu = UInt16(uppercase(ch)) + ct = UInt16(titlecase(ch)) + tmp_l[off+1] = cl + tmp_u[off+1] = cu + tmp_t[off+1] = ct + islowercase(ch) && (tmp_is_l |= lowbit) + isuppercase(ch) && (tmp_is_u |= lowbit) + cp === cl === cu === ct && continue + sizc = utf8_len(ch) + sizu = utf8_len(cu) + sizt = utf8_len(ct) + if cl !== cp + l_flg = true + tmp_can_l |= lowbit + diff = utf8_len(cl) - sizc + -2 <= diff <= 1 || error("Size difference for $cp -> $cl is $diff") + diff == 0 || (diff_l |= (diff & 3)<<(off<<1); max_siz_l = cp) + end + if cu !== cp + u_flg = true + tmp_can_u |= lowbit + diff = sizu - sizc + -2 <= diff <= 1 || error("Size difference for $cp -> $cu is $diff") + diff == 0 || (diff_u |= (diff & 3)<<(off<<1); max_siz_u = cp) + end + if ct !== cu + t_flg = true + sizu == sizt || + error("Titlecase and Uppercase are not same size in UTF-8: " * + "$cp $cu:$sizu $ct:$sizt") + end + end + + bitoff = ((base >>> 5) & 0xf) + 1 + bit_is_l[bitoff] = tmp_is_l + bit_is_u[bitoff] = tmp_is_u + bit_can_l[bitoff] = tmp_can_l + bit_can_u[bitoff] = tmp_can_u + + is_l_flg |= (UInt128(tmp_is_l != 0) << (base >>> 9)) + is_u_flg |= (UInt128(tmp_is_u != 0) << (base >>> 9)) + can_l |= (UInt128(tmp_can_l != 0) << (base >>> 9)) + can_u |= (UInt128(tmp_can_u != 0) << (base >>> 9)) + can_t |= (UInt128(t_flg) << (base >>> 9)) + + tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32 + + if l_flg + #print("Lower:"); for v in tmp_l; print(" ", string(v, base=16)); end; println() + l_mid = true # Have at least one in this set of blocks + push!(tupvec, tuple(tmp_l...)) + l_off[midbits] = UInt8(length(tupvec)) + end + if u_flg + #print("Upper:"); for v in tmp_u; print(" ", string(v, base=16)); end; println() + u_mid = true + push!(tupvec, tuple(tmp_u...)) + u_off[midbits] = UInt8(length(tupvec)) + end + if t_flg + #print("Title:"); for v in tmp_t; print(" ", string(v, base=16)); end; println() + t_mid = true + push!(tupvec, tuple(tmp_t...)) + t_off[midbits] = UInt8(length(tupvec)) + else + t_off[midbits] = 
u_off[midbits] + end + + diff_l == 0 || (siz_l |= hibit; push!(sizvecl, base => diff_l)) + diff_u == 0 || (siz_u |= hibit; push!(sizvecu, base => diff_u)) + + if bitoff == 16 + # Reset bits + pos = (base >>> 9) + 1 + is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l) + is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u) + can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l) + can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u) + println("base:$(hex(base)), pos:$pos, $(hex(is_l_tab[pos])), $(hex(is_u_tab[pos])), ", + "$(hex(can_u_tab[pos])), $(hex(can_u_tab[pos])), ", + bitvec) + end + + # Check for end of chunk + midbits == 32 || continue + + if l_mid + push!(offvec, tuple(l_off...)) + l_tab[hipos] = UInt8(length(offvec)) + fill!(l_off, 0x0) + l_mid = false + end + if u_mid + push!(offvec, tuple(u_off...)) + u_tab[hipos] = UInt8(length(offvec)) + fill!(u_off, 0x0) + u_mid = false + end + if t_mid + push!(offvec, tuple(t_off...)) + t_tab[hipos] = UInt8(length(offvec)) + t_mid = false + else + t_tab[hipos] = u_tab[hipos] + end + + println("base:$(hex(base)), ", + "is_l_flg:$(hex(is_l_flg)), is_u_flg:$(hex(is_u_flg)), ", + "can_l:$(hex(can_l)), can_u:$(hex(can_u))") + end + + # Handle SLP + s_can_l = s_can_u = 0%UInt128 + + for base in 0x0000:0x20:0xffe0 + hipos = (base >>> 10) + 65 + hibit = (1%UInt128) << (base >>> 9) + midbits = ((base >>> 5) & 0x1f) + 1 + + # Handle block of 32 characters + l_flg = u_flg = false + for off = 0x00:0x1f + cp = UInt32(0x10000 + base + off) + ch = Char(cp) + cl = UInt32(lowercase(ch)) + cu = UInt32(uppercase(ch)) + ct = UInt32(titlecase(ch)) + lowbit = UInt32(1) << off + islowercase(ch) && (tmp_is_l |= lowbit) + isuppercase(ch) && (tmp_is_u |= lowbit) + tmp_l[off+1] = cl%UInt16 + tmp_u[off+1] = cu%UInt16 + cp === cl === cu === ct && continue + ct == cu || error("titlecase: $cp -> $ct not same as uppercase $cu") + sizc = 4 + sizl = utf8_len(cl) + sizu = utf8_len(cu) + sizl == sizu == 4 || error("UTF-8 sizes not 4: $cp: $cl=$sizl, $cu=$sizu") + l_flg |= (cl 
!== cp) + u_flg |= (cu !== cp) + end + + if l_flg + l_mid = true + push!(tupvec, tuple(tmp_l...)) + l_off[midbits] = UInt8(length(tupvec)) + end + if u_flg + u_mid = true + push!(tupvec, tuple(tmp_u...)) + u_off[midbits] = UInt8(length(tupvec)) + end + + bitoff = ((base >>> 5) & 0xf) + 1 + bit_is_l[bitoff] = tmp_is_l + bit_is_u[bitoff] = tmp_is_u + bit_can_l[bitoff] = tmp_can_l + bit_can_u[bitoff] = tmp_can_u + + is_sl_flg |= (UInt128(tmp_is_l != 0) << hibit) + is_su_flg |= (UInt128(tmp_is_u != 0) << hibit) + s_can_l |= (UInt128(tmp_can_l != 0) << hibit) + s_can_u |= (UInt128(tmp_can_u != 0) << hibit) + + tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32 + + if bitoff == 16 + # Reset bits + pos = (base >>> 9) + 129 + is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l) + is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u) + can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l) + can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u) + end + + # Check for end of chunk + midbits == 32 || continue + + if l_mid + push!(offvec, tuple(l_off...)) + l_tab[hipos] = UInt8(length(offvec)) + fill!(l_off, 0x0) + l_mid = false + end + if u_mid + push!(offvec, tuple(u_off...)) + u_tab[hipos] = UInt8(length(offvec)) + fill!(u_off, 0x0) + u_mid = false + end + + println("base:$(hex(base)), ", + "is_sl_flg:$(hex(is_sl_flg)), is_su_flg:$(hex(is_su_flg)), ", + "s_can_l:$(hex(s_can_l)), s_can_u:$(hex(s_can_u))") + end + + # Check that there are no upper / lower / title case characters above SLP + for cp in 0x20000:0x10ffff + ch = Char(cp) + ch == lowercase(ch) == uppercase(ch) == titlecase(ch) || + error("$cp has lower/upper/titlecase") + end + + (CaseTable(tuple(l_tab...), tuple(u_tab...), tuple(t_tab...), + tuple(is_l_tab...), tuple(is_u_tab...), + tuple(can_l_tab...), tuple(can_u_tab...), + can_l, can_u, can_t, s_can_l, s_can_u, + is_l_flg, is_u_flg, is_sl_flg, is_su_flg, + siz_l, siz_u, max_siz_l, max_siz_u), + tuple(tupvec...), tuple(offvec...), tuple(bitvec...), + tuple(sizvecl...), 
tuple(sizvecu...)) +end diff --git a/src/utf16case.jl b/src/utf16case.jl index fa76844..9e44e80 100644 --- a/src/utf16case.jl +++ b/src/utf16case.jl @@ -14,20 +14,18 @@ function _lower(::Type{<:Str{UTF16CSE}}, beg, off, len) out += off while out < fin ch = get_codeunit(out) - if ch <= 0x7f - _isupper_a(ch) && set_codeunit!(out, ch += 0x20) - elseif ch <= 0xff - _isupper_l(ch) && set_codeunit!(out, ch += 0x20) + if ch <= 0xff + _isupper_al(ch) && set_codeunit!(out, ch += 0x20) elseif is_surrogate_trail(ch) # pick up previous code unit (lead surrogate) cp = get_supplementary(get_codeunit(out - 2), ch) - if _isupper_u(cp) - w1, w2 = get_utf16(_lowercase_u(cp)) + if cp < 0x1ffff && _can_lower_slp(cp) + w1, w2 = get_utf16(_lower_slp(cp)) set_codeunit!(out - 2, w1) set_codeunit!(out, w2) end - elseif _isupper_u(ch) - set_codeunit!(out, _lowercase_u(ch)) + elseif _can_lower_bmp(ch) + set_codeunit!(out, _lower_bmp(ch)) end out += 2 end @@ -41,12 +39,12 @@ function lowercase(str::Str{UTF16CSE}) while pnt < fin ch = get_codeunit(pnt) prv = pnt - (ch > 0xd7ff # May be surrogate pair - ? _isupper_u(ch > 0xdfff ? ch%UInt32 : get_supplementary(ch, get_codeunit(pnt += 2))) - : (is_ascii(ch) - ? _isupper_a(ch) - : (is_latin(ch) ? _isupper_l(ch) : _isupper_u(ch)))) && - return _lower(UTF16Str, beg, prv-beg, ncodeunits(str)) + (ch <= 0xff ? _isupper_al(ch) : + (is_surrogate_lead(ch) + ? 
(cp = get_supplementary(ch, get_codeunit(pnt += 2)); + cp <= 0x1ffff && _can_lower_slp(cp)) + : _can_lower_bmp(ch))) && + return _lower(UTF16Str, beg, prv-beg, ncodeunits(str)) pnt += 2 end end @@ -60,28 +58,20 @@ function _upper(::Type{<:Str{UTF16CSE}}, beg, off, len) out += off while out < fin ch = get_codeunit(out) - if is_ascii(ch) + if ch <= 0x7f _islower_a(ch) && set_codeunit!(out, ch -= 0x20) - elseif is_latin(ch) - if _can_upper_l(ch) - set_codeunit!(out, ch -= 0x20) - elseif ch == 0xb5 - set_codeunit!(out, 0x39c) - elseif ch == 0xff - set_codeunit!(out, 0x178) - elseif !V6_COMPAT && ch == 0xdf - set_codeunit!(out, 0x1e9e) - end + elseif ch <= 0xff + set_codeunit!(out, _uppercase_l(ch)) elseif is_surrogate_trail(ch) # pick up previous code unit (lead surrogate) cp = get_supplementary(get_codeunit(out - 2), ch) - if _islower_u(cp) - w1, w2 = get_utf16(_uppercase_u(cp)) + if cp <= 0x1ffff && _can_upper_slp(cp) + w1, w2 = get_utf16(_upper_slp(cp)) set_codeunit!(out - 2, w1) set_codeunit!(out, w2) end - elseif _islower_u(ch) - set_codeunit!(out, _uppercase_u(ch)) + elseif _can_upper_bmp(ch) + set_codeunit!(out, _upper_bmp(ch)) end out += 2 end @@ -95,10 +85,13 @@ function uppercase(str::UTF16Str) while pnt < fin ch = get_codeunit(pnt) prv = pnt - (ch > 0xd7ff # May be surrogate pair - ? _islower_u(ch > 0xdfff ? ch%UInt32 : get_supplementary(ch, get_codeunit(pnt += 2))) - : _wide_lower_ch(ch)) && - return _upper(UTF16Str, beg, prv-beg, ncodeunits(str)) + if (ch <= 0x7f ? _islower_a(ch) : ch <= 0xff ? _can_upper_lat(ch) : + (is_surrogate_lead(ch) + ? 
(cp = get_supplementary(ch, get_codeunit(pnt += 2)); + cp <= 0x1ffff && _can_upper_slp(cp)) + : _can_upper_bmp(ch))) + return _upper(UTF16Str, beg, prv-beg, ncodeunits(str)) + end pnt += 2 end end diff --git a/src/utf8.jl b/src/utf8.jl index 2f20136..aaf6455 100644 --- a/src/utf8.jl +++ b/src/utf8.jl @@ -199,7 +199,7 @@ is_latin(str::SubString{<:Str{UTF8CSE}}) = is_latin(vec::Vector{T}) where {T<:Union{UInt16,UInt32}} = (cnt = sizeof(vec)) == 0 ? true : - @preserve str _check_mask_ul(pointer(vec), cnt, _latin_mask(T)) + @preserve vec _check_mask_ul(pointer(vec), cnt, _latin_mask(T)) is_latin(str::SubString{<:Str{C}}) where {C<:Union{Word_CSEs,Quad_CSEs}} = (cnt = sizeof(str)) == 0 ? true : diff --git a/src/utf8case.jl b/src/utf8case.jl index b191f51..ed615a9 100644 --- a/src/utf8case.jl +++ b/src/utf8case.jl @@ -3,6 +3,31 @@ Copyright 2018 Gandalf Software, Inc., Scott P. Jones Licensed under MIT License, see LICENSE.md =# +function _check_char(ch, tab) + cmp = ch & ~0x1f + for (c, mask) in tab + cmp < c && break + cmp == c && return ((mask%Int64) << (62 - ((ch & 0x1f)<<1))) >> 62 + end + 0 +end + +function _calc_len(pnt, fin, len, mask, tab) + while pnt < fin + ch = get_codeunit(pnt) + if ch < 0xf0 + if ch > 0x7f + c16 = ch < 0xe0 ? 
get_utf8_2byte(pnt += 1, ch) : get_utf8_3byte(pnt += 2, ch) + ((mask >>> (c16 >>> 9)) & 1) != 0 && (len += _check_char(c16, tab)) + end + pnt += 1 + else + pnt += 4 # no 4 byte changes size in UTF-8 + end + end + len +end + # These are more complex case folding functions, and maybe belong in a separate UTF8Str.jl package # Note: these only check for cases in Unicode where 2 byte sequences @@ -11,12 +36,14 @@ Licensed under MIT License, see LICENSE.md function _lower_utf8(beg, off, len) # Note, the final length may be larger or smaller - buf, out = _allocate(UInt8, len) - unsafe_copyto!(out, beg, off) fin = beg + len pnt = beg + off - outend = out + len + # Calculate final length + len = _calc_len(pnt, fin, len, ct.siz_l, CaseTables.sizvecl) + buf, out = _allocate(UInt8, len) + unsafe_copyto!(out, beg, off) out += off + pnt = beg + off while pnt < fin ch = get_codeunit(pnt) if ch < 0x80 @@ -28,23 +55,14 @@ function _lower_utf8(beg, off, len) elseif ch < 0xe0 # 2 byte c16 = get_utf8_2byte(pnt += 1, ch) - if _isupper_u(c16) - c16 = _lowercase_u(c16) - # Check if still 2 byte, could increase to 3 byte, decrease to 1 byte + if _can_lower_bmp(c16) + c16 = _lower_bmp(c16) if c16 < 0x80 set_codeunit!(out, c16%UInt8) out += 1 elseif c16 < 0x800 out = output_utf8_2byte!(out, c16) else - # Check to see if we need to resize - diff = (outend - out - 3) - (fin - pnt - 1) - if diff < 0 - outend -= diff - resize!(buf, outend - out) - out = pointer(buf) - outend = out + sizeof(buf) - end out = output_utf8_3byte!(out, c16) end else @@ -53,9 +71,8 @@ function _lower_utf8(beg, off, len) elseif ch < 0xf0 # 3 byte c16 = get_utf8_3byte(pnt += 2, ch) - if _isupper_u(c16) - c16 = _lowercase_u(c16) - # Check if still 3 byte, could drop to 2 byte + if _can_lower_bmp(c16) + c16 = _lower_bmp(c16) if c16 < 0x800 out = output_utf8_2byte!(out, c16) else @@ -67,22 +84,22 @@ function _lower_utf8(beg, off, len) else # 4 byte c32 = get_utf8_4byte(pnt += 3, ch) - _isupper_u(c32) && (c32 = 
_lowercase_u(c32)) + c32 <= 0x1ffff && _can_lower_slp(c32) && (c32 = _lower_slp(c32)) out = output_utf8_4byte!(out, c32) end pnt += 1 end - out < outend && (buf = resize!(buf, out - pointer(buf))) Str(UTF8CSE, buf) end function _upper_utf8(beg, off, len) + fin = beg + len + pnt = beg + off # Note, the final length may be larger or smaller + len = _calc_len(pnt, fin, len, ct.siz_u, CaseTables.sizevecu) + buf, out = _allocate(UInt8, len) unsafe_copyto!(out, beg, off) - fin = beg + len - pnt = beg + off - outend = out + len out += off while pnt < fin ch = get_codeunit(pnt) @@ -92,26 +109,15 @@ function _upper_utf8(beg, off, len) elseif ch < 0xc4 ch = (ch << 6) | (get_codeunit(pnt += 1) & 0x3f) if !V6_COMPAT && ch == 0xdf - # Increasing from 2 to 3 bytes, check to see if we need to resize - diff = (outend - out - 3) - (fin - pnt - 1) - if diff < 0 - outend -= diff - resize!(buf, outend - out) - out = pointer(buf) - outend = out + sizeof(buf) - end out = output_utf8_3byte!(out, 0x1e9e) else - out = output_utf8_2byte!(out, _can_upper_l(ch) ? (ch - 0x20)%UInt16 - : ch == 0xb5 ? 0x39c - : ch == 0xff ? 
0x178 - : ch%UInt16) + out = output_utf8_2byte!(out, _upper_bmp(ch)) end elseif ch < 0xe0 # 2 byte c16 = get_utf8_2byte(pnt += 1, ch) - if _islower_u(c16) - c16 = _uppercase_u(c16) + if _can_upper_bmp(c16) + c16 = _upper_bmp(c16) # Check if still 2 byte, could increase to 3 byte, or decrease to 1 byte if c16 < 0x80 set_codeunit!(out, c16%UInt8) @@ -119,14 +125,6 @@ function _upper_utf8(beg, off, len) elseif c16 < 0x800 out = output_utf8_2byte!(out, c16) else - # Increasing from 2 to 3 bytes, check to see if we need to resize - diff = (outend - out - 3) - (fin - pnt - 1) - if diff < 0 - outend -= diff - resize!(buf, outend - out) - out = pointer(buf) - outend = out + sizeof(buf) - end out = output_utf8_3byte!(out, c16) end else @@ -135,8 +133,8 @@ function _upper_utf8(beg, off, len) elseif ch < 0xf0 # 3 byte c16 = get_utf8_3byte(pnt += 2, ch) - if _islower_u(c16) - c16 = _uppercase_u(c16) + if _can_upper_bmp(c16) + c16 = _upper_bmp(c16) # Check if still 3 byte, uppercase form could drop to 2 byte if c16 < 0x800 out = output_utf8_2byte!(out, c16) @@ -149,12 +147,11 @@ function _upper_utf8(beg, off, len) else # 4 byte c32 = get_utf8_4byte(pnt += 3, ch) - _islower_u(c32) && (c32 = _uppercase_u(c32)) + c32 < 0x1ffff && _can_upper_slp(c32) && (c32 = _upper_slp(c32)) out = output_utf8_4byte!(out, c32) end pnt += 1 end - out < outend && (buf = resize!(buf, out - pointer(buf))) Str(UTF8CSE, buf) end @@ -169,12 +166,12 @@ function lowercase(str::Str{UTF8CSE}) ? _isupper_a(ch) : (ch < 0xc4 ? _isupper_l((ch << 6) | (get_codeunit(pnt += 1) & 0x3f)) - : _isupper_u(ch >= 0xf0 - ? get_utf8_4byte(pnt += 3, ch) - : (ch < 0xe0 - ? get_utf8_2byte(pnt += 1, ch) - : get_utf8_3byte(pnt += 2, ch))%UInt32))) && - return _lower_utf8(beg, prv-beg, ncodeunits(str)) + : _can_lower_ch(ch >= 0xf0 + ? get_utf8_4byte(pnt += 3, ch) + : (ch < 0xe0 + ? 
get_utf8_2byte(pnt += 1, ch) + : get_utf8_3byte(pnt += 2, ch))%UInt32))) && + return _lower_utf8(beg, prv-beg, ncodeunits(str)) pnt += 1 end str @@ -198,11 +195,11 @@ function uppercase(str::Str{UTF8CSE}) (ch < 0x80 ? _islower_a(ch) : (ch > 0xc3 - ? _islower_u(ch >= 0xf0 - ? get_utf8_4byte(pnt += 3, ch) - : (ch < 0xe0 - ? get_utf8_2byte(pnt += 1, ch) - : get_utf8_3byte(pnt += 2, ch))%UInt32) + ? _can_upper_ch(ch >= 0xf0 + ? get_utf8_4byte(pnt += 3, ch) + : (ch < 0xe0 + ? get_utf8_2byte(pnt += 1, ch) + : get_utf8_3byte(pnt += 2, ch))%UInt32) : _check_uppercase(ch, pnt += 1))) && return _upper_utf8(beg, prv-beg, ncodeunits(str)) pnt += 1 diff --git a/test/basic.jl b/test/basic.jl index d2435cb..98bcce1 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -34,23 +34,42 @@ end end end -@testset "casefold string" begin - for ST in (ASCIIStr, LatinStr, UCS2Str, UTF32Str, UTF8Str, UTF16Str) +hex(a) = string(a, base=16) + +const sb = StrBase +const ct = sb.ct + +for ST in (ASCIIStr, LatinStr, UCS2Str, UTF32Str, UTF8Str, UTF16Str, UniStr) + @testset "casefold string: $ST" begin C = eltype(ST) tm = typemax(C) for c = 0:Int(tm) # Skip surrogates 0xd800 <= c < 0xe000 && continue + # Handle 3 cases where LatinStr doesn't uppercase + ST === LatinStr && c in (0xb5, 0xdf, 0xff) && continue ch = C(c) # Check to make sure this character would still fit uppercased cu = uppercase(ch) cu > tm && continue - for str in ("$ch test Beg", "test End $ch", "test $ch Mid", "$ch") + cl = lowercase(ch) + #for (i, str) in enumerate(("$ch", "$ch test Beg", "test End $ch", "test $ch Mid")) + t = c >>> 9 + let i = 1, str = "$ch" cvtstr = convert(ST, str) - # Don't do this for LatinStr until ChrBase bug fixed - ST !== LatinStr && @test uppercase(str) == uppercase(cvtstr) + uppercase(str) == uppercase(cvtstr) || + println("uppercase not matching: 0x$(hex(c)), 0x$(hex(UInt32(cu))), ", + "$(sb._can_upper_ch(c)), $(hex(sb._upper_bmp(c))), ", + "$((ct.can_u >>> t) & 1), $(ct.u_tab[(t>>1)+1])") + @test 
uppercase(str) == uppercase(cvtstr) + lowercase(str) == lowercase(cvtstr) || + println("lowercase not matching: 0x$(hex(c)), 0x$(hex(UInt32(cl))), ", + "$(sb._can_lower_ch(c)) $(hex(sb._lower_bmp(c))), ", + "$((ct.can_l >>> t) & 1), $(ct.l_tab[(t>>1)+1])") @test lowercase(str) == lowercase(cvtstr) #@test titlecase(str) == titlecase(cvtstr) + i > 2 && continue + #@test lowercase_first(str) == lowercase_first(cvtstr) #@test uppercase_first(str) == uppercase_first(cvtstr) end end