Skip to content

Commit c8bc88b

Browse files
committed
Update iterate/next for utf8
1 parent ee7a6bb commit c8bc88b

File tree

3 files changed

+45
-51
lines changed

3 files changed

+45
-51
lines changed

src/support.jl

+34-42
Original file line numberDiff line numberDiff line change
@@ -910,57 +910,49 @@ end
910910

911911
# `ch ^ cnt` is shorthand for `repeat(ch, cnt)` on any of the `Chrs` character types.
function (^)(ch::CP, cnt::Integer) where {CP <: Chrs}
    return repeat(ch, cnt)
end
912912

913-
#=
914-
function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{ASCIIChr,LatinChr}}
915-
cnt == 0 && return empty_str(CS)
916-
cnt < 0 && repeaterr(cnt)
917-
buf, pnt = _allocate(UInt8, cnt)
918-
cnt == 1 ? set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
919-
Str(CS, buf)
920-
end
921-
922-
function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{UCS2Chr,UTF32Chr}}
923-
cnt == 0 && return empty_str(CS)
924-
cnt < 0 && repeaterr(cnt)
925-
CU = codeunit(CS)
926-
buf, pnt = _allocate(CU, cnt)
927-
cnt == 1 ? set_codeunit!(pnt, ch%CU) : _aligned_set(pnt, ch%CU, cnt)
928-
Str(CS, buf)
929-
end
930-
931-
repeat(ch::ASCIIChr, cnt::Integer) = _repeat(ASCIICSE, ch, cnt)
932-
repeat(ch::LatinChr, cnt::Integer) = _repeat(LatinCSE, ch, cnt)
933-
repeat(ch::UCS2Chr, cnt::Integer) = _repeat(UCS2CSE, ch, cnt)
934-
repeat(ch::UTF32Chr, cnt::Integer) = _repeat(UTF32CSE, ch, cnt)
935-
=#
936-
937-
function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr,_LatinChr}}
938-
cnt == 0 && return empty_str(ASCIICSE)
939-
cnt < 0 && repeaterr(cnt)
940-
cu = ch%UInt8
941-
buf, pnt = _allocate(UInt8, cnt)
942-
_memset(pnt, cu, cnt)
943-
Str((C == ASCIIChr || cu <= 0x7f) ? ASCIICSE : (C == _LatinChr ? _LatinCSE : LatinCSE), buf)
944-
end
945-
946-
function repeat(ch::C, cnt::Integer) where {C<:Union{UCS2Chr,UTF32Chr}}
947-
cnt == 0 && return empty_str(ASCIICSE)
948-
cnt < 0 && repeaterr(cnt)
949-
if ch%UInt32 <= 0xff
913+
# Build a string made of `cnt` copies of the single-byte character `ch`.
#
# * `cnt > 0`:  allocates `cnt` UInt8 code units, fills them with the byte value
#               of `ch`, and wraps the buffer as ASCIICSE when `C` is `ASCIIChr`,
#               otherwise as LatinCSE.
# * `cnt == 0`: returns the empty string constant matching the character type
#               (`empty_ascii` for `ASCIIChr`, `empty_latin` for `LatinChr`).
# * `cnt < 0`:  delegates to `repeaterr(cnt)` (presumably throws - confirm).
function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr}}
    if cnt > 0
        cu = ch%UInt8
        buf, pnt = _allocate(UInt8, cnt)
        _memset(pnt, cu, cnt)
        C == ASCIIChr ? Str(ASCIICSE, buf) : Str(LatinCSE, buf)
    else
        # `C` is a character type, so it must be compared against `ASCIIChr`;
        # comparing against the string type `ASCIIStr` can never be true and made
        # the ASCII case return `empty_latin`.
        cnt < 0 ? repeaterr(cnt) : C == ASCIIChr ? empty_ascii : empty_latin
    end
end
923+
924+
# Build a string of `cnt` copies of a `_LatinChr`.
# The result is wrapped as ASCIICSE when the byte value is <= 0x7f, otherwise as
# _LatinCSE. A zero count yields `empty_ascii`; a negative count is handed to
# `repeaterr`.
function repeat(ch::_LatinChr, cnt::Integer)
    # Non-positive counts never allocate.
    cnt > 0 || return (cnt == 0 ? empty_ascii : repeaterr(cnt))
    byte = ch%UInt8
    buf, pnt = _allocate(UInt8, cnt)
    _memset(pnt, byte, cnt)
    if byte <= 0x7f
        return Str(ASCIICSE, buf)
    else
        return Str(_LatinCSE, buf)
    end
end
934+
935+
# Build a UCS2CSE string of `cnt` copies of `ch`.
# A zero count yields `empty_ucs2`; a negative count is handed to `repeaterr`.
function repeat(ch::UCS2Chr, cnt::Integer)
    # Non-positive counts never allocate.
    cnt > 0 || return (cnt == 0 ? empty_ucs2 : repeaterr(cnt))
    buf, pnt = _allocate(UInt16, cnt)
    if cnt == 1
        # Single code unit: store it directly.
        set_codeunit!(pnt, ch%UInt16)
    else
        _aligned_set(pnt, ch%UInt16, cnt)
    end
    return Str(UCS2CSE, buf)
end
944+
945+
# Build a UTF32CSE string of `cnt` copies of `ch`.
# A zero count yields `empty_utf32`; a negative count is handed to `repeaterr`.
function repeat(ch::UTF32Chr, cnt::Integer)
    # Non-positive counts never allocate.
    cnt > 0 || return (cnt == 0 ? empty_utf32 : repeaterr(cnt))
    buf, pnt = _allocate(UInt32, cnt)
    if cnt == 1
        # Single code unit: store it directly.
        set_codeunit!(pnt, ch%UInt32)
    else
        _aligned_set(pnt, ch%UInt32, cnt)
    end
    return Str(UTF32CSE, buf)
end
963954

955+
964956
# Definitions for C compatible strings, that don't allow embedded
965957
# '\0', and which are terminated by a '\0'
966958

src/utf16.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ end
186186

187187
# Return the result of `iterate(str, pos)` for a UTF-16 string, after checking
# (unless bounds checks are elided by the caller) that `pos` is a valid code-unit
# position. The lower bound is checked as well as the upper one, so that a
# non-positive `pos` is rejected here instead of being forwarded to `iterate`;
# this matches the `0 < pos <= ncodeunits(str)` check used by the UTF-8 `_next`.
# `T` is accepted for signature compatibility and is not used in the body.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF16, pos::Int) where {T}
    @boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
    iterate(str, pos)
end
191191

192192
@inline _thisind(::MultiCU, str::MS_UTF16, len, pnt, pos) =

src/utf8.jl

+10-8
Original file line numberDiff line numberDiff line change
@@ -361,27 +361,29 @@ function _iterate_utf8(ch, str, pnt, pos)
361361
end
362362
end
363363

364-
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
365-
pos > ncodeunits(str) && return nothing
366-
@boundscheck pos <= 0 && boundserr(str, pos)
364+
# Decode the character starting at code unit `pos` of `str`, without any bounds
# checking. Returns `(char, next_position)`.
# A lead byte <= 0x7f is returned directly as a `UTF32Chr`; anything else is
# passed on to the 4-argument `_iterate_utf8` method.
@inline function _iterate_utf8(str, pos)
    @preserve str begin
        ptr = pointer(str) + pos - 1
        lead = get_codeunit(ptr)
        if lead <= 0x7f
            (UTF32Chr(lead), pos + 1)
        else
            _iterate_utf8(lead, str, ptr, pos)
        end
    end
end
373371

372+
# Iteration protocol for UTF-8 strings.
# Returns `nothing` once `pos` is past the last code unit; otherwise returns the
# `(char, next_position)` pair produced by `_iterate_utf8`. A non-positive `pos`
# is reported through `boundserr` unless bounds checks are elided.
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
    if pos > ncodeunits(str)
        return nothing
    end
    @boundscheck pos > 0 || boundserr(str, pos)
    return _iterate_utf8(str, pos)
end
377+
374378
# Raw UTF-8 strings defer to `iterate` on the wrapped `data` field.
function _iterate(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T}
    return iterate(str.data, pos)
end
376380
# Substrings of raw UTF-8 strings: build a SubString over the parent's `data`
# field covering `offset + pos` through `offset + ncodeunits(str)`, then iterate
# it from position 1.
function _iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}},
                  pos::Int) where {T}
    lo = str.offset + pos
    hi = str.offset + ncodeunits(str)
    return iterate(SubString(str.string.data, lo, hi), 1)
end
378382

379383
# Gets next codepoint
380-
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8,
381-
pos::Int) where {T<:Chr}
382-
len = ncodeunits(str)
383-
@boundscheck 0 < pos <= len || boundserr(str, pos)
384-
_iterate(MultiCU(), T, str, pos)
384+
# Return `(char, next_position)` for the character starting at `pos` of a UTF-8
# string. Unless bounds checks are elided, `pos` must satisfy
# `0 < pos <= ncodeunits(str)`; otherwise `boundserr` is called.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8,
                                   pos::Int) where {T<:Chr}
    @boundscheck (pos > 0 && pos <= ncodeunits(str)) || boundserr(str, pos)
    return _iterate_utf8(str, pos)
end
386388

387389
_next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =

0 commit comments

Comments
 (0)