Skip to content

Commit 056b374

Browse files
authored
Improve performance of readuntil (#20621)
* Improve performance of readuntil using strings * Add backtracking * Improve performance of readuntil using strings. Heavily inspired by omus and #20621. * Revise test to be completely inoffensive. Looking over my original test I realized it potentially could be offensive. I've used a different example to avoid any potential issues. * readuntil with on-the-fly backtrack caching. Caches backtracking information as it is needed. Uses a SparseVector, which has a lower memory footprint than Vector but is more performant than Dict. * Add Unicode test for readuntil. Skip testing the I/O producers "File" and "PipeEndpoint" when working with Unicode. * Remove need for readuntil file * [wip] reduce code duplication, allow generalized Int indexes * refactor into single method instead of a type, makes it possible to use readuntil with any array (indexable) object and optimizes a few more cases
1 parent d1b6f78 commit 056b374

File tree

4 files changed

+116
-37
lines changed

4 files changed

+116
-37
lines changed

base/io.jl

Lines changed: 82 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,10 @@ flush(io::AbstractPipe) = flush(pipe_writer(io))
152152
read(io::AbstractPipe, byte::Type{UInt8}) = read(pipe_reader(io), byte)
153153
unsafe_read(io::AbstractPipe, p::Ptr{UInt8}, nb::UInt) = unsafe_read(pipe_reader(io), p, nb)
154154
read(io::AbstractPipe) = read(pipe_reader(io))
155-
readuntil(io::AbstractPipe, arg::UInt8) = readuntil(pipe_reader(io), arg)
156-
readuntil(io::AbstractPipe, arg::Char) = readuntil(pipe_reader(io), arg)
157-
readuntil(io::AbstractPipe, arg::AbstractString) = readuntil(pipe_reader(io), arg)
158-
readuntil(io::AbstractPipe, arg) = readuntil(pipe_reader(io), arg)
155+
readuntil(io::AbstractPipe, arg::UInt8) = readuntil(pipe_reader(io), arg)
156+
readuntil(io::AbstractPipe, arg::Char) = readuntil(pipe_reader(io), arg)
157+
readuntil_indexable(io::AbstractPipe, target#=::Indexable{T}=#, out) = readuntil_indexable(pipe_reader(io), target, out)
158+
159159
readavailable(io::AbstractPipe) = readavailable(pipe_reader(io))
160160

161161
isreadable(io::AbstractPipe) = isreadable(pipe_reader(io))
@@ -499,7 +499,7 @@ function readuntil(s::IO, delim::Char)
499499
end
500500

501501
function readuntil(s::IO, delim::T) where T
502-
out = T[]
502+
out = (T === UInt8 ? StringVector(0) : Vector{T}())
503503
while !eof(s)
504504
c = read(s, T)
505505
push!(out, c)
@@ -510,39 +510,89 @@ function readuntil(s::IO, delim::T) where T
510510
return out
511511
end
512512

513-
# based on code by Glen Hertz
514-
function readuntil(s::IO, t::AbstractString)
515-
l = length(t)
516-
if l == 0
517-
return ""
518-
end
519-
if l > 40
520-
warn("readuntil(IO,AbstractString) will perform poorly with a long string")
513+
# requires that indices for target are small ordered integers bounded by start and endof
514+
function readuntil_indexable(io::IO, target#=::Indexable{T}=#, out)
515+
T = eltype(target)
516+
first = start(target)
517+
if done(target, first)
518+
return
521519
end
522-
out = IOBuffer()
523-
m = Vector{Char}(l) # last part of stream to match
524-
t = collect(t)
525-
i = 0
526-
while !eof(s)
527-
i += 1
528-
c = read(s, Char)
529-
write(out, c)
530-
if i <= l
531-
m[i] = c
520+
len = endof(target)
521+
local cache # will be lazy initialized when needed
522+
second = next(target, first)[2]
523+
max_pos = second
524+
pos = first
525+
while !eof(io)
526+
c = read(io, T)
527+
# Backtrack until the next target character matches what was found
528+
if out isa IO
529+
write(out, c)
532530
else
533-
# shift to last part of s
534-
for j = 2:l
535-
m[j-1] = m[j]
536-
end
537-
m[l] = c
531+
push!(out, c)
538532
end
539-
if i >= l && m == t
540-
break
533+
while true
534+
c1, pos1 = next(target, pos)
535+
if c == c1
536+
pos = pos1
537+
break
538+
elseif pos == first
539+
break
540+
elseif pos == second
541+
pos = first
542+
else
543+
# grow cache to contain up to `pos`
544+
if !@isdefined(cache)
545+
cache = zeros(Int, len)
546+
end
547+
while max_pos < pos
548+
b = cache[max_pos] + first
549+
cb, b1 = next(target, b)
550+
ci, max_pos1 = next(target, max_pos)
551+
if ci == cb
552+
cache[max_pos1] = b1 - first
553+
end
554+
max_pos = max_pos1
555+
end
556+
pos = cache[pos] + first
557+
end
541558
end
559+
done(target, pos) && break
542560
end
543-
return String(take!(out))
544561
end
545562

563+
function readuntil(io::IO, target::AbstractString)
564+
# small-string target optimizations
565+
i = start(target)
566+
done(target, i) && return ""
567+
c, i = next(target, start(target))
568+
if done(target, i) && c < Char(0x80)
569+
return readuntil_string(io, c % UInt8)
570+
end
571+
# decide how we can index target
572+
if target isa String
573+
# convert String to a utf8-byte-iterator
574+
target = Vector{UInt8}(target)
575+
#elseif applicable(codeunit, target)
576+
# TODO: a more general version of above optimization
577+
# would be to permit accessing any string via codeunit
578+
# target = CodeUnitVector(target)
579+
elseif !(target isa SubString{String})
580+
# type with unknown indexing behavior: convert to array
581+
target = collect(target)
582+
end
583+
out = (eltype(target) === UInt8 ? StringVector(0) : IOBuffer())
584+
readuntil_indexable(io, target, out)
585+
out = isa(out, IO) ? take!(out) : out
586+
return String(out)
587+
end
588+
589+
function readuntil(io::IO, target::AbstractVector{T}) where T
590+
out = (T === UInt8 ? StringVector(0) : Vector{T}())
591+
readuntil_indexable(io, target, out)
592+
return out
593+
end
594+
595+
546596
"""
547597
readchomp(x)
548598
@@ -592,6 +642,7 @@ function read(s::IO, nb::Integer = typemax(Int))
592642
end
593643

594644
read(s::IO, ::Type{String}) = String(read(s))
645+
read(s::IO, T::Type) = error("The IO stream does not support reading objects of type $T.")
595646

596647
## high-level iterator interfaces ##
597648

base/strings/types.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j
3030
SubString(s::SubString, i::Int, j::Int) = SubString(s.string, s.offset+i, s.offset+j)
3131
SubString(s::AbstractString, i::Integer, j::Integer) = SubString(s, Int(i), Int(j))
3232
SubString(s::AbstractString, i::Integer) = SubString(s, i, endof(s))
33-
SubString{T}(s::T) where {T<:AbstractString} = SubString(s, 1, endof(s))
33+
SubString(s::AbstractString) = SubString(s, 1, endof(s))
34+
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))
3435

3536
String(p::SubString{String}) =
3637
unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)

src/jl_uv.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,9 +382,9 @@ JL_DLLEXPORT int jl_fs_read(int handle, char *data, size_t len)
382382
JL_DLLEXPORT int jl_fs_read_byte(int handle)
383383
{
384384
uv_fs_t req;
385-
char c;
385+
unsigned char c;
386386
uv_buf_t buf[1];
387-
buf[0].base = &c;
387+
buf[0].base = (char*)&c;
388388
buf[0].len = 1;
389389
int ret = uv_fs_read(jl_io_loop, &req, handle, buf, 1, -1, NULL);
390390
uv_fs_req_cleanup(&req);

test/read.jl

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ s = io(text)
9494
close(s)
9595
push!(l, ("PipeEndpoint", io))
9696

97-
9897
#FIXME See https://github.com/JuliaLang/julia/issues/14747
9998
# Reading from open(::Command) seems to deadlock on Linux/Travis
10099
#=
@@ -136,10 +135,38 @@ end
136135

137136
verbose = false
138137

139-
140138
for (name, f) in l
141139
local f
142-
io = ()->(s=f(text); push!(open_streams, s); s)
140+
local function io(text=text)
141+
local s = f(text)
142+
push!(open_streams, s)
143+
return s
144+
end
145+
146+
verbose && println("$name readuntil...")
147+
for (t, s, m) in [
148+
("a", "ab", "a"),
149+
("b", "ab", "b"),
150+
("α", "αγ", "α"),
151+
("ab", "abc", "ab"),
152+
("bc", "abc", "bc"),
153+
("αβ", "αβγ", "αβ"),
154+
("aaabc", "ab", "aaab"),
155+
("aaabc", "ac", "aaabc"),
156+
("aaabc", "aab", "aaab"),
157+
("aaabc", "aac", "aaabc"),
158+
("αααβγ", "αβ", "αααβ"),
159+
("αααβγ", "ααβ", "αααβ"),
160+
("αααβγ", "αγ", "αααβγ"),
161+
("barbarbarians", "barbarian", "barbarbarian")]
162+
local t, s, m
163+
@test readuntil(io(t), s) == m
164+
@test readuntil(io(t), SubString(s, start(s), endof(s))) == m
165+
@test readuntil(io(t), GenericString(s)) == m
166+
@test readuntil(io(t), Vector{UInt8}(s)) == Vector{UInt8}(m)
167+
@test readuntil(io(t), collect(s)::Vector{Char}) == Vector{Char}(m)
168+
end
169+
cleanup()
143170

144171
write(filename, text)
145172

0 commit comments

Comments
 (0)