Skip to content

Commit

Permalink
Update iterate/next for utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Oct 23, 2018
1 parent ee7a6bb commit b8e550a
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 28 deletions.
48 changes: 29 additions & 19 deletions src/support.jl
Original file line number Diff line number Diff line change
Expand Up @@ -935,29 +935,39 @@ repeat(ch::UTF32Chr, cnt::Integer) = _repeat(UTF32CSE, ch, cnt)
=#

# Build a string consisting of `cnt` copies of the single-byte character `ch`.
#
# For a positive `cnt`, a `UInt8` buffer of `cnt` code units is allocated and filled
# with the code unit of `ch`.  The result is tagged `ASCIICSE` when `C` is `ASCIIChr`
# or the code unit is `<= 0x7f`; otherwise `_LatinCSE` when `C` is `_LatinChr`, and
# `LatinCSE` in the remaining case.
# A `cnt` of zero gives `empty_ascii`; a negative `cnt` is handed to `repeaterr`.
function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr,_LatinChr}}
    if cnt > 0
        cu = ch%UInt8
        # Allocate and fill exactly once; the positive-count test above guards the allocation
        buf, pnt = _allocate(UInt8, cnt)
        _memset(pnt, cu, cnt)
        if C == ASCIIChr || cu <= 0x7f
            Str(ASCIICSE, buf)
        elseif C == _LatinChr
            Str(_LatinCSE, buf)
        else
            Str(LatinCSE, buf)
        end
    else
        cnt == 0 ? empty_ascii : repeaterr(cnt)
    end
end

# Build a string consisting of `cnt` copies of the character `ch`, stored with the
# narrowest code unit that can hold it.
#
# For a positive `cnt`:
#   * code point `<= 0xff`   -> `UInt8` buffer, tagged `ASCIICSE` (`<= 0x7f`) or `LatinCSE`
#   * `C == UCS2Chr` or code point `<= 0xffff` -> `UInt16` buffer, tagged `UCS2CSE`
#   * anything else          -> `UInt32` buffer, tagged `UTF32CSE`
# A `cnt` of zero gives `empty_ascii`; a negative `cnt` is handed to `repeaterr`.
function repeat(ch::C, cnt::Integer) where {C<:Union{UCS2Chr,UTF32Chr}}
    if cnt > 0
        if ch%UInt32 <= 0xff
            buf, pnt = _allocate(UInt8, cnt)
            # Must be a real ternary: `a && b : c` would parse `b : c` as a range
            cnt == 1 ? set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
            ch%UInt8 <= 0x7f ? Str(ASCIICSE, buf) : Str(LatinCSE, buf)
        elseif C == UCS2Chr || ch%UInt32 <= 0xffff
            buf, pnt = _allocate(UInt16, cnt)
            cnt == 1 ? set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
            Str(UCS2CSE, buf)
        else
            buf, pnt = _allocate(UInt32, cnt)
            cnt == 1 ? set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
            Str(UTF32CSE, buf)
        end
    else
        cnt == 0 ? empty_ascii : repeaterr(cnt)
    end
end

Expand Down
2 changes: 1 addition & 1 deletion src/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ end

# Return whatever `iterate(str, pos)` yields for code unit index `pos`.
# Unless bounds checking has been elided by the caller, a `pos` outside
# `1:ncodeunits(str)` is reported through `boundserr` before `iterate` is called.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF16, pos::Int) where {T}
    # Lower bound included so the check matches the one used by the UTF-8 `_next`
    @boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
    iterate(str, pos)
end

@inline _thisind(::MultiCU, str::MS_UTF16, len, pnt, pos) =
Expand Down
18 changes: 10 additions & 8 deletions src/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -361,27 +361,29 @@ function _iterate_utf8(ch, str, pnt, pos)
end
end

# Read the code unit at index `pos` of `str` without validating `pos`.
# A value `<= 0x7f` is returned directly as `(UTF32Chr(ch), pos + 1)`; any other
# value is passed on to the four-argument `_iterate_utf8(ch, str, pnt, pos)` method.
@inline function _iterate_utf8(str, pos)
    # `str` is kept alive while its raw pointer is in use
    @preserve str begin
        pnt = pointer(str) + pos - 1
        ch = get_codeunit(pnt)
        ch <= 0x7f ? (UTF32Chr(ch), pos + 1) : _iterate_utf8(ch, str, pnt, pos)
    end
end

# Iteration entry point for `MS_UTF8` strings.
# Returns `nothing` once `pos` is past `ncodeunits(str)`; otherwise returns the result
# of `_iterate_utf8(str, pos)`.  A non-positive `pos` is reported through `boundserr`
# unless bounds checking has been elided by the caller.
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
    pos > ncodeunits(str) && return nothing
    @boundscheck pos <= 0 && boundserr(str, pos)
    _iterate_utf8(str, pos)
end

# For a `Str{RawUTF8CSE}`, iteration is forwarded to `iterate` on the `data` field;
# the requested type `T` takes no part in the call.
function _iterate(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T}
    iterate(str.data, pos)
end
# For a `SubString` of a `Str{RawUTF8CSE}`, wrap the parent's `data` field in a new
# `SubString` running from `str.offset + pos` to `str.offset + ncodeunits(str)` and
# iterate that from index 1.  The requested type `T` takes no part in the call.
# NOTE(review): the index in the returned tuple is relative to the temporary
# `SubString` (which starts at 1), not to `str` -- confirm callers expect that.
function _iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}}, pos::Int) where {T}
    start = str.offset + pos
    stop = str.offset + ncodeunits(str)
    iterate(SubString(str.string.data, start, stop), 1)
end

# Gets next codepoint: returns the result of `_iterate_utf8(str, pos)` for code unit
# index `pos`.  Unless bounds checking has been elided by the caller, a `pos` outside
# `1:ncodeunits(str)` is reported through `boundserr` first.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8, pos::Int) where {T<:Chr}
    @boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
    _iterate_utf8(str, pos)
end

_next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =
Expand Down

0 comments on commit b8e550a

Please sign in to comment.