Skip to content

Commit 8dac8c5

Browse files
committed
switch to DataFrames 0.11
- switch to Missings.jl/CategoricalArrays.jl
- add jlvec() methods handling conversion logic
- fix conversion of RLogicalVector into Vector{Bool}
- remove DataArrays.jl dependency
1 parent ebc1335 commit 8dac8c5

File tree

5 files changed

+116
-61
lines changed

5 files changed

+116
-61
lines changed

NEWS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
## RData v0.3.0 Release Notes
2+
3+
Updated to DataFrames v0.11, switched from [DataArrays](https://github.com/JuliaData/DataArrays.jl) to [Missings](https://github.com/JuliaData/Missings.jl) and [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl).
4+
5+
##### Changes
6+
* updated to DataFrames v0.11 [#28]
7+
* switched from `DataVector` to `Vector{Union{T,Missing}}` for NAs [#28]
8+
* R factors converted into `CategoricalVector` (instead of `PooledDataArray`) [#28]
9+
10+
[#28]: https://github.com/JuliaStats/RData.jl/issues/28
11+
112
## RData v0.2.0 Release Notes
213

314
Updated to Julia v0.6 (older versions not supported).

REQUIRE

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
julia 0.6
2-
DataFrames 0.9
3-
DataArrays 0.4
2+
DataFrames 0.11
3+
Missings 0.2
4+
CategoricalArrays 0.3
45
FileIO 0.1.2
56
CodecZlib 0.4

src/RData.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ __precompile__()
22

33
module RData
44

5-
using DataFrames, DataArrays, CodecZlib, FileIO
5+
using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO
66
import DataFrames: identifier
77
import FileIO: load
88

src/convert.jl

Lines changed: 79 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,83 +3,125 @@
33

44
function Base.convert(::Type{Hash}, pl::RPairList)
55
res = Hash()
6-
for i in 1:length(pl.items)
7-
setindex!(res, pl.items[i], pl.tags[i])
6+
for i in eachindex(pl.items)
7+
@inbounds setindex!(res, pl.items[i], pl.tags[i])
88
end
99
res
1010
end
1111

1212
##############################################################################
1313
##
14-
## Conversion of intermediate R objects into DataArray and DataFrame objects
14+
## Conversion of intermediate R objects into Vector{T} and DataFrame objects
1515
##
1616
##############################################################################
1717

18-
namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32)
19-
namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32)
20-
namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data)))
18+
isna(x::Int32) = x == R_NA_INT32
19+
isna(x::Float64) = isna_float64(reinterpret(UInt64, x))
2120
# if re or im is NA, the whole complex number is NA
22-
# FIXME avoid temporary Vector{Bool}
23-
namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)])
24-
namask(rv::RNullableVector) = rv.na
21+
isna(x::Complex128) = isna(real(x)) || isna(imag(x))
2522

26-
DataArrays.data(rv::RVEC) = DataArray(rv.data, namask(rv))
23+
# convert R vector into Vector holding elements of type T
24+
# if force_missing is true, the result is always Vector{Union{T,Missing}},
25+
# otherwise it's Vector{T} when `rv` contains no NAs (and Vector{Union{T,Missing}} when it does)
26+
function jlvec(::Type{T}, rv::RVEC, force_missing::Bool=true) where T
27+
anyna = any(isna, rv.data)
28+
if force_missing || anyna
29+
res = convert(Vector{Union{T,Missing}}, rv.data)
30+
if anyna
31+
@inbounds for (i,x) in enumerate(rv.data)
32+
isna(x) && (res[i] = missing)
33+
end
34+
end
35+
return res
36+
else
37+
return convert(Vector{T}, rv.data)
38+
end
39+
end
40+
41+
# convert R nullable vector (has an explicit NA mask) into Vector{T[?]}
42+
function jlvec(::Type{T}, rv::RNullableVector{R}, force_missing::Bool=true) where {T, R}
43+
anyna = any(rv.na)
44+
if force_missing || anyna
45+
res = convert(Vector{Union{T,Missing}}, rv.data)
46+
anyna && @inbounds res[rv.na] = missing
47+
return res
48+
else
49+
return convert(Vector{T}, rv.data)
50+
end
51+
end
52+
53+
# convert R vector into Vector of appropriate type
54+
jlvec(rv::RVEC, force_missing::Bool=true) = jlvec(eltype(rv.data), rv, force_missing)
2755

28-
function DataArrays.data(ri::RIntegerVector)
29-
if !isfactor(ri) return DataArray(ri.data, namask(ri)) end
30-
# convert factor into PooledDataArray
31-
pool = getattr(ri, "levels", emptystrvec)
32-
sz = length(pool)
56+
# convert R logical vector (uses Int32 to store values) into Vector{Bool[?]}
57+
function jlvec(rl::RLogicalVector, force_missing::Bool=true)
58+
anyna = any(isna, rl.data)
59+
if force_missing || anyna
60+
return Union{Bool,Missing}[ifelse(isna(x), missing, x != 0) for x in rl.data]
61+
else
62+
return Bool[x != 0 for x in rl.data]
63+
end
64+
end
65+
66+
# kernel method that converts Vector{Int32} into Vector{R} replacing R_NA_INT32 with 0
67+
# it's assumed that v fits into R
68+
na2zero(::Type{R}, v::Vector{Int32}) where R =
69+
[ifelse(!isna(x), x % R, zero(R)) for x in v]
70+
71+
# convert to CategoricalVector{String[?]} if `ri` is a factor,
72+
# or to Vector{Int32[?]} otherwise
73+
function jlvec(ri::RIntegerVector, force_missing::Bool=true)
74+
isfactor(ri) || return jlvec(eltype(ri.data), ri, force_missing)
75+
76+
rlevels = getattr(ri, "levels", emptystrvec)
77+
sz = length(rlevels)
3378
REFTYPE = sz <= typemax(UInt8) ? UInt8 :
3479
sz <= typemax(UInt16) ? UInt16 :
3580
sz <= typemax(UInt32) ? UInt32 :
3681
UInt64
37-
dd = ri.data
38-
dd[namask(ri)] = 0
39-
refs = convert(Vector{REFTYPE}, dd)
40-
return PooledDataArray(DataArrays.RefArray(refs), pool)
82+
# FIXME set ordered flag
83+
refs = na2zero(REFTYPE, ri.data)
84+
anyna = any(iszero, refs)
85+
pool = CategoricalPool{String, REFTYPE}(rlevels)
86+
if force_missing || anyna
87+
return CategoricalArray{Union{String, Missing}, 1}(refs, pool)
88+
else
89+
return CategoricalArray{String, 1}(refs, pool)
90+
end
4191
end
4292

43-
# convert R logical vector (uses Int32 to store values) into DataVector{Bool}
44-
DataArrays.data(rl::RLogicalVector) =
45-
return DataArray(Bool[x != 0 for x in rl.data], namask(rl))
46-
4793
function sexp2julia(rex::RSEXPREC)
4894
warn("Conversion of $(typeof(rex)) to Julia is not implemented")
4995
return nothing
5096
end
5197

5298
function sexp2julia(rv::RVEC)
53-
# FIXME dimnames
54-
# FIXME forceDataArrays option to always convert to DataArray
55-
nas = namask(rv)
56-
hasna = any(nas)
99+
# TODO dimnames?
100+
# FIXME add a force_missing option to control whether to always convert to Union{T, Missing}
101+
jv = jlvec(rv, false)
57102
if hasnames(rv)
58103
# if data has no NA, convert to simple Vector
59-
return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv))
104+
return DictoVec(jv, names(rv))
60105
else
61106
hasdims = hasdim(rv)
62107
if !hasdims && length(rv.data)==1
63108
# scalar
64-
# FIXME handle NAs
65-
# if hasna
66-
return rv.data[1]
109+
return jv[1]
67110
elseif !hasdims
68111
# vectors
69-
return hasna ? DataArray(rv.data, nas) : rv.data
112+
return jv
70113
else
71114
# matrices and so on
72-
dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...)
73-
return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) :
74-
reshape(rv.data, dims)
115+
dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...)
116+
return reshape(jv, dims)
75117
end
76118
end
77119
end
78120

79121
function sexp2julia(rl::RList)
80122
if isdataframe(rl)
81-
# FIXME remove Any type assertion workaround
82-
DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl)))
123+
# FIXME add a force_missing option to control whether to always convert to Union{T, Missing}
124+
DataFrame(Any[jlvec(col, false) for col in rl.data], identifier.(names(rl)))
83125
elseif hasnames(rl)
84126
DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl))
85127
else

test/RDA.jl

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,53 +6,54 @@ module TestRDA
66
# check for Float64 NA
77
@testset "Detect R floating-point NAs" begin
88
@test !RData.isna_float64(reinterpret(UInt64, 1.0))
9-
@test !RData.isna_float64(reinterpret(UInt64, NaN))
10-
@test !RData.isna_float64(reinterpret(UInt64, Inf))
11-
@test !RData.isna_float64(reinterpret(UInt64, -Inf))
12-
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
9+
@test !RData.isna(1.0)
10+
@test !RData.isna(NaN)
11+
@test !RData.isna(Inf)
12+
@test !RData.isna(-Inf)
13+
@test RData.isna_float64(RData.R_NA_FLOAT64)
1314
# check that alternative NA is also recognized (#10)
1415
@test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
1516
end
1617

1718
testdir = dirname(@__FILE__)
1819
@testset "Reading minimal RData" begin
1920
df = DataFrame(num = [1.1, 2.2])
20-
@test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df)
21-
@test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df)
22-
@test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df)
21+
@test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df
22+
@test load("$testdir/data/minimal.rda",convert=true)["df"] == df
23+
@test load("$testdir/data/minimal_ascii.rda")["df"] == df
2324
end
2425

2526
@testset "Conversion to Julia types" begin
2627
df = DataFrame(num = [1.1, 2.2],
2728
int = Int32[1, 2],
2829
logi = [true, false],
2930
chr = ["ab", "c"],
30-
factor = pool(["ab", "c"]),
31-
cplx = Complex128[1.1+0.5im, 1.0im])
31+
factor = categorical(["ab", "c"], true),
32+
cplx = [1.1+0.5im, 1.0im])
3233
rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"])
3334
@test eltypes(rdf) == eltypes(df)
34-
@test isequal(rdf, df)
35+
@test rdf == df
3536
rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"])
3637
@test eltypes(rdf_ascii) == eltypes(df)
37-
@test isequal(rdf_ascii, df)
38+
@test rdf_ascii == df
3839
end
3940

4041
@testset "NAs conversion" begin
41-
df = DataFrame(num = [1.1, 2.2],
42-
int = Int32[1, 2],
43-
logi = [true, false],
44-
chr = ["ab", "c"],
45-
factor = pool(["ab", "c"]),
46-
cplx = Complex128[1.1+0.5im, 1.0im])
42+
df = DataFrame(num = Union{Float64, Missing}[1.1, 2.2],
43+
int = Union{Int32, Missing}[1, 2],
44+
logi = Union{Bool, Missing}[true, false],
45+
chr = Union{String, Missing}["ab", "c"],
46+
factor = categorical(Union{String, Missing}["ab", "c"], true),
47+
cplx = Union{Complex128, Missing}[1.1+0.5im, 1.0im])
4748

48-
df[2, :] = NA
49+
df[2, :] = missing
4950
append!(df, df[2, :])
5051
df[3, :num] = NaN
51-
df[:, :cplx] = @data [NA, Complex128(1,NaN), NaN]
52+
df[:, :cplx] = [missing, Complex128(1,NaN), NaN]
5253
@test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
5354
# ASCII format saves NaN as NA
54-
df[3, :num] = NA
55-
df[:, :cplx] = @data [NA, NA, NA]
55+
df[3, :num] = missing
56+
df[:, :cplx] = missing
5657
@test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
5758
end
5859

0 commit comments

Comments
 (0)