Skip to content

Commit e1d6e56

Browse files
committed
Merge pull request #10560 from tanmaykm/readcsv
correctly parse integer types in readdlm. fixes #9289
2 parents 39e7015 + a68f950 commit e1d6e56

File tree

4 files changed

+88
-61
lines changed

4 files changed

+88
-61
lines changed

base/datafmt.jl

+46-34
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
module DataFmt
44

55
importall Base
6-
import Base: _default_delims
6+
import Base: _default_delims, tryparse_internal
77

88
export countlines, readdlm, readcsv, writedlm, writecsv
99

@@ -137,15 +137,14 @@ type DLMStore{T,S<:ByteString} <: DLMHandler
137137
sbuff::S
138138
auto::Bool
139139
eol::Char
140-
tmp64::Array{Float64,1}
141140
end
142141

143142
function DLMStore{T,S<:ByteString}(::Type{T}, dims::NTuple{2,Integer}, has_header::Bool, sbuff::S, auto::Bool, eol::Char)
144143
(nrows,ncols) = dims
145144
nrows <= 0 && throw(ArgumentError("number of rows in dims must be > 0, got $nrows"))
146145
ncols <= 0 && throw(ArgumentError("number of columns in dims must be > 0, got $ncols"))
147146
hdr_offset = has_header ? 1 : 0
148-
DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol, Array(Float64,1))
147+
DLMStore{T,S}(fill(SubString(sbuff,1,0), 1, ncols), Array(T, nrows-hdr_offset, ncols), nrows, ncols, 0, 0, hdr_offset, sbuff, auto, eol)
149148
end
150149

151150
_chrinstr(sbuff::ByteString, chr::UInt8, startpos::Int, endpos::Int) = (endpos >= startpos) && (C_NULL != ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff.data)+startpos-1, chr, endpos-startpos+1))
@@ -158,7 +157,6 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
158157
lastrow = dlmstore.lastrow
159158
cells::Array{T,2} = dlmstore.data
160159
sbuff::S = dlmstore.sbuff
161-
tmp64 = dlmstore.tmp64
162160

163161
endpos = prevind(sbuff, nextind(sbuff,endpos))
164162
(endpos > 0) && ('\n' == dlmstore.eol) && ('\r' == Char(sbuff[endpos])) && (endpos = prevind(sbuff, endpos))
@@ -189,9 +187,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
189187
# fill data
190188
if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
191189
unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
192-
fail = colval(unescaped, 1, length(unescaped), cells, drow, col, tmp64)
190+
fail = colval(unescaped, 1, length(unescaped), cells, drow, col)
193191
else
194-
fail = colval(sbuff, startpos, endpos, cells, drow, col, tmp64)
192+
fail = colval(sbuff, startpos, endpos, cells, drow, col)
195193
end
196194
if fail
197195
sval = SubString(sbuff,startpos,endpos)
@@ -204,9 +202,9 @@ function store_cell{T,S<:ByteString}(dlmstore::DLMStore{T,S}, row::Int, col::Int
204202
# fill header
205203
if quoted && _chrinstr(sbuff, UInt8('"'), startpos, endpos)
206204
unescaped = replace(SubString(sbuff,startpos,endpos), r"\"\"", "\"")
207-
colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col, tmp64)
205+
colval(unescaped, 1, length(unescaped), dlmstore.hdr, 1, col)
208206
else
209-
colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col, tmp64)
207+
colval(sbuff, startpos,endpos, dlmstore.hdr, 1, col)
210208
end
211209
end
212210

@@ -304,6 +302,8 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ
304302
idx = 1
305303
offidx = 1
306304
offsets = offarr[1]
305+
row = 0
306+
col = 0
307307
try
308308
dh = DLMStore(T, dims, has_header, sbuff, auto, eol)
309309
while idx <= length(offsets)
@@ -320,40 +320,52 @@ function dlm_fill(T::DataType, offarr::Vector{Vector{Int}}, dims::NTuple{2,Integ
320320
return result(dh)
321321
catch ex
322322
isa(ex, TypeError) && (ex.func == :store_cell) && (return dlm_fill(ex.expected, offarr, dims, has_header, sbuff, auto, eol))
323-
rethrow(ex)
323+
error("at row $row, column $col : $ex")
324324
end
325325
end
326326

327-
328-
function colval{T<:Bool, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1})
329-
len = endpos-startpos+1
330-
if (len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("true"), 4))
331-
cells[row,col] = true
332-
return false
333-
elseif (len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), pointer(sbuff)+startpos-1, pointer("false"), 5))
334-
cells[row,col] = false
335-
return false
336-
end
337-
true
327+
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Bool,2}, row::Int, col::Int)
328+
n = tryparse_internal(Bool, sbuff, startpos, endpos, false)
329+
isnull(n) || (cells[row,col] = get(n))
330+
isnull(n)
338331
end
339-
function colval{T<:Number, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1})
340-
if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64)
341-
cells[row,col] = tmp64[1]
342-
return false
343-
end
344-
true
332+
function colval{T<:Integer, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int)
333+
n = tryparse_internal(T, sbuff, startpos, endpos, 0, false)
334+
isnull(n) || (cells[row,col] = get(n))
335+
isnull(n)
345336
end
346-
colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false)
347-
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int, tmp64::Array{Float64,1})
348-
if 0 == ccall(:jl_substrtod, Int32, (Ptr{UInt8},Csize_t,Cint,Ptr{Float64}), sbuff, startpos-1, endpos-startpos+1, tmp64)
349-
cells[row,col] = tmp64[1]
350-
else
351-
cells[row,col] = SubString(sbuff, startpos, endpos)
337+
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float64,2}, row::Int, col::Int)
338+
n = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
339+
isnull(n) || (cells[row,col] = get(n))
340+
isnull(n)
341+
end
342+
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Float32,2}, row::Int, col::Int)
343+
n = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
344+
isnull(n) || (cells[row,col] = get(n))
345+
isnull(n)
346+
end
347+
colval{T<:AbstractString, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((cells[row,col] = SubString(sbuff,startpos,endpos)); false)
348+
function colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{Any,2}, row::Int, col::Int)
349+
# if array is of Any type, attempt parsing only the most common types: Int, Bool, Float64 and fallback to SubString
350+
len = endpos-startpos+1
351+
if len > 0
352+
# check Inteter
353+
ni64 = tryparse_internal(Int, sbuff, startpos, endpos, 0, false)
354+
isnull(ni64) || (cells[row,col] = get(ni64); return false)
355+
356+
# check Bool
357+
nb = tryparse_internal(Bool, sbuff, startpos, endpos, false)
358+
isnull(nb) || (cells[row,col] = get(nb); return false)
359+
360+
# check float64
361+
nf64 = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Cint), sbuff, startpos-1, endpos-startpos+1)
362+
isnull(nf64) || (cells[row,col] = get(nf64); return false)
352363
end
364+
cells[row,col] = SubString(sbuff, startpos, endpos)
353365
false
354366
end
355-
colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int, tmp64::Array{Float64,1}) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true)
356-
colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int, tmp64::Array{Float64,1}) = true
367+
colval{T<:Char, S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array{T,2}, row::Int, col::Int) = ((startpos==endpos) ? ((cells[row,col] = next(sbuff,startpos)[1]); false) : true)
368+
colval{S<:ByteString}(sbuff::S, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int) = true
357369

358370
dlm_parse(s::ASCIIString, eol::Char, dlm::Char, qchar::Char, cchar::Char, ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, skipstart::Int, skipblanks::Bool, dh::DLMHandler) = begin
359371
dlm_parse(s.data, UInt32(eol)%UInt8, UInt32(dlm)%UInt8, UInt32(qchar)%UInt8, UInt32(cchar)%UInt8,

base/gmp.jl

+3-4
Original file line numberDiff line numberDiff line change
@@ -76,18 +76,17 @@ signed(x::BigInt) = x
7676
BigInt(x::BigInt) = x
7777
BigInt(s::AbstractString) = parse(BigInt,s)
7878

79-
function tryparse_internal(::Type{BigInt}, s::AbstractString, base::Int, raise::Bool)
79+
function tryparse_internal(::Type{BigInt}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool)
8080
_n = Nullable{BigInt}()
81-
s = bytestring(s)
82-
sgn, base, i = Base.parseint_preamble(true,s,base)
81+
sgn, base, i = Base.parseint_preamble(true,base,s,startpos,endpos)
8382
if i == 0
8483
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
8584
return _n
8685
end
8786
z = BigInt()
8887
err = ccall((:__gmpz_set_str, :libgmp),
8988
Int32, (Ptr{BigInt}, Ptr{UInt8}, Int32),
90-
&z, SubString(s,i), base)
89+
&z, SubString(s,i,endpos), base)
9190
if err != 0
9291
raise && throw(ArgumentError("invalid BigInt: $(repr(s))"))
9392
return _n

base/string.jl

+34-23
Original file line numberDiff line numberDiff line change
@@ -1485,31 +1485,31 @@ function parse{T<:Integer}(::Type{T}, c::Char, base::Integer=36)
14851485
convert(T, d)
14861486
end
14871487

1488-
function parseint_next(s::AbstractString, i::Int=start(s))
1489-
done(s,i) && (return Char(0), 0, 0)
1490-
j = i
1491-
c, i = next(s,i)
1492-
c, i, j
1488+
function parseint_next(s::AbstractString, startpos::Int, endpos::Int)
1489+
(0 < startpos <= endpos) || (return Char(0), 0, 0)
1490+
j = startpos
1491+
c, startpos = next(s,startpos)
1492+
c, startpos, j
14931493
end
14941494

1495-
function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
1496-
c, i, j = parseint_next(s)
1495+
function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int)
1496+
c, i, j = parseint_next(s, startpos, endpos)
14971497

14981498
while isspace(c)
1499-
c, i, j = parseint_next(s,i)
1499+
c, i, j = parseint_next(s,i,endpos)
15001500
end
15011501
(j == 0) && (return 0, 0, 0)
15021502

15031503
sgn = 1
15041504
if signed
15051505
if c == '-' || c == '+'
15061506
(c == '-') && (sgn = -1)
1507-
c, i, j = parseint_next(s,i)
1507+
c, i, j = parseint_next(s,i,endpos)
15081508
end
15091509
end
15101510

15111511
while isspace(c)
1512-
c, i, j = parseint_next(s,i)
1512+
c, i, j = parseint_next(s,i,endpos)
15131513
end
15141514
(j == 0) && (return 0, 0, 0)
15151515

@@ -1518,7 +1518,7 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
15181518
c, i = next(s,i)
15191519
base = c=='b' ? 2 : c=='o' ? 8 : c=='x' ? 16 : 10
15201520
if base != 10
1521-
c, i, j = parseint_next(s,i)
1521+
c, i, j = parseint_next(s,i,endpos)
15221522
end
15231523
else
15241524
base = 10
@@ -1527,21 +1527,30 @@ function parseint_preamble(signed::Bool, s::AbstractString, base::Int)
15271527
return sgn, base, j
15281528
end
15291529

1530+
function tryparse_internal{S<:ByteString}(::Type{Bool}, sbuff::S, startpos::Int, endpos::Int, raise::Bool)
1531+
len = endpos-startpos+1
1532+
p = pointer(sbuff)+startpos-1
1533+
(len == 4) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "true", 4)) && (return Nullable(true))
1534+
(len == 5) && (0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), p, "false", 5)) && (return Nullable(false))
1535+
raise && throw(ArgumentError("invalid Bool representation: $(repr(SubString(s,startpos,endpos)))"))
1536+
Nullable{Bool}()
1537+
end
1538+
15301539
safe_add{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? (n1 > (typemax(T) - n2)) : (n1 < (typemin(T) - n2))) ? Nullable{T}() : Nullable{T}(n1 + n2)
15311540
safe_mul{T<:Integer}(n1::T, n2::T) = ((n2 > 0) ? ((n1 > div(typemax(T),n2)) || (n1 < div(typemin(T),n2))) :
15321541
(n2 < -1) ? ((n1 > div(typemin(T),n2)) || (n1 < div(typemax(T),n2))) :
15331542
((n2 == -1) && n1 == typemin(T))) ? Nullable{T}() : Nullable{T}(n1 * n2)
15341543

1535-
function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, a::Int, raise::Bool)
1544+
function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, a::Int, raise::Bool)
15361545
_n = Nullable{T}()
1537-
sgn, base, i = parseint_preamble(T<:Signed,s,base)
1546+
sgn, base, i = parseint_preamble(T<:Signed, base, s, startpos, endpos)
15381547
if i == 0
1539-
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
1548+
raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))"))
15401549
return _n
15411550
end
1542-
c, i = parseint_next(s,i)
1551+
c, i = parseint_next(s,i,endpos)
15431552
if i == 0
1544-
raise && throw(ArgumentError("premature end of integer: $(repr(s))"))
1553+
raise && throw(ArgumentError("premature end of integer: $(repr(SubString(s,startpos,endpos)))"))
15451554
return _n
15461555
end
15471556

@@ -1553,12 +1562,12 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
15531562
'A' <= c <= 'Z' ? c-'A'+10 :
15541563
'a' <= c <= 'z' ? c-'a'+a : base
15551564
if d >= base
1556-
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))"))
1565+
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))"))
15571566
return _n
15581567
end
15591568
n *= base
15601569
n += d
1561-
if done(s,i)
1570+
if i > endpos
15621571
n *= sgn
15631572
return Nullable{T}(n)
15641573
end
@@ -1571,7 +1580,7 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
15711580
'A' <= c <= 'Z' ? c-'A'+10 :
15721581
'a' <= c <= 'z' ? c-'a'+a : base
15731582
if d >= base
1574-
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(s))"))
1583+
raise && throw(ArgumentError("invalid base $base digit $(repr(c)) in $(repr(SubString(s,startpos,endpos)))"))
15751584
return _n
15761585
end
15771586
(T <: Signed) && (d *= sgn)
@@ -1583,20 +1592,22 @@ function tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int,
15831592
return _n
15841593
end
15851594
n = get(safe_n)
1586-
done(s,i) && return Nullable{T}(n)
1595+
(i > endpos) && return Nullable{T}(n)
15871596
c, i = next(s,i)
15881597
end
1589-
while !done(s,i)
1598+
while i <= endpos
15901599
c, i = next(s,i)
15911600
if !isspace(c)
1592-
raise && throw(ArgumentError("extra characters after whitespace in $(repr(s))"))
1601+
raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))"))
15931602
return _n
15941603
end
15951604
end
15961605
return Nullable{T}(n)
15971606
end
15981607
tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, base::Int, raise::Bool) =
1599-
tryparse_internal(T, s, base, base <= 36 ? 10 : 36, raise)
1608+
tryparse_internal(T,s,start(s),length(s),base,raise)
1609+
tryparse_internal{T<:Integer}(::Type{T}, s::AbstractString, startpos::Int, endpos::Int, base::Int, raise::Bool) =
1610+
tryparse_internal(T, s, startpos, endpos, base, base <= 36 ? 10 : 36, raise)
16001611
tryparse{T<:Integer}(::Type{T}, s::AbstractString, base::Int) =
16011612
2 <= base <= 62 ? tryparse_internal(T,s,Int(base),false) : throw(ArgumentError("invalid base: base must be 2 ≤ base ≤ 62, got $base"))
16021613
tryparse{T<:Integer}(::Type{T}, s::AbstractString) = tryparse_internal(T,s,0,false)

test/readdlm.jl

+5
Original file line numberDiff line numberDiff line change
@@ -197,3 +197,8 @@ let i18n_data = ["Origin (English)", "Name (English)", "Origin (Native)", "Name
197197
writedlm(i18n_buff, i18n_arr, '\t')
198198
@test (data, hdr) == readdlm(i18n_buff, '\t', header=true)
199199
end
200+
201+
@test isequaldlm(readcsv(IOBuffer("1,22222222222222222222222222222222222222,0x3,10e6\n2000.1,true,false,-10.34"), Any),
202+
reshape(Any[1,2000.1,Float64(22222222222222222222222222222222222222),true,0x3,false,10e6,-10.34], 2, 4), Any)
203+
204+
@test isequaldlm(readcsv(IOBuffer("-9223355253176920979,9223355253176920979"), Int64), Int64[-9223355253176920979 9223355253176920979], Int64)

0 commit comments

Comments
 (0)