diff --git a/.travis.yml b/.travis.yml
index 1e9eea5..11a43a7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,8 +8,8 @@ julia:
 notifications:
   email: false
 # uncomment the following lines to override the default test script
-#script:
-#  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-#  - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("RData"); Pkg.test("RData"; coverage=true)'
+script:
+  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
+  - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("RData"); Pkg.checkout("DataTables", "master"); Pkg.test("RData"; coverage=true)'
 after_success:
   - julia -e 'cd(Pkg.dir("RData")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
diff --git a/NEWS.md b/NEWS.md
index 8dcd3e7..b91e867 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,13 @@
+## RData v0.1.0 Release Notes
+
+Switched from `DataFrames` to `DataTables`, dropped Julia v0.4 support
+
+##### Changes
+* using `NullableArrays.jl` and `CategoricalArrays.jl`
+instead of `DataArrays.jl` ([#19], see [JuliaStats/DataFrames.jl#1008])
+* Julia v0.4 not supported (`DataTables.jl` requires v0.5)
+* R logical vectors converted to `Vector{Bool}` (instead of `Vector{Int32}`)
+
 ## RData v0.0.4 Release Notes
 
 Now the recommended way to load `.RData`/`.rda` files is by `FileIO.load()`.
@@ -15,5 +25,7 @@ Initial release based on `DataFrames.read_rda()` ([JuliaStats/DataFrames.jl#1031
 [#9]: https://github.com/JuliaStats/RData.jl/issues/9
 [#10]: https://github.com/JuliaStats/RData.jl/issues/10
 [#15]: https://github.com/JuliaStats/RData.jl/issues/15
+[#19]: https://github.com/JuliaStats/RData.jl/issues/19
+[JuliaStats/DataFrames.jl#1008]: https://github.com/JuliaStats/DataFrames.jl/pull/1008
 [JuliaStats/DataFrames.jl#1031]: https://github.com/JuliaStats/DataFrames.jl/pull/1031
 
diff --git a/REQUIRE b/REQUIRE
index 41080fc..7de741a 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,6 +1,5 @@
 julia 0.5
-DataFrames 0.7
-DataArrays 0.3
+DataTables
 FileIO 0.1.2
 GZip 0.2
 Compat 0.17
diff --git a/appveyor.yml b/appveyor.yml
index a9ae8a1..2d9e258 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -2,6 +2,8 @@ environment:
   matrix:
   - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
   - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
 
@@ -34,7 +36,7 @@ build_script:
 # Need to convert from shallow to complete for Pkg.clone to work
   - IF EXIST .git\shallow (git fetch --unshallow)
   - C:\projects\julia\bin\julia -e "versioninfo();
-      Pkg.clone(pwd(), \"RData\"); Pkg.build(\"RData\")"
+      Pkg.clone(pwd(), \"RData\"); Pkg.build(\"RData\"); Pkg.checkout(\"DataTables\", \"master\")"
 
 test_script:
   - C:\projects\julia\bin\julia -e "Pkg.test(\"RData\")"
diff --git a/src/RData.jl b/src/RData.jl
index 1dd7a42..53475e8 100644
--- a/src/RData.jl
+++ b/src/RData.jl
@@ -2,9 +2,8 @@ __precompile__()
 
 module RData
 
-using Compat, DataFrames, GZip, FileIO
-import DataArrays: data
-import DataFrames: identifier
+using Compat, DataTables, GZip, FileIO
+import DataTables: identifier
 import Compat: unsafe_string
 import FileIO: load
 
diff --git a/src/convert.jl b/src/convert.jl
index c0d0002..7c0c2da 100644
--- a/src/convert.jl
+++ b/src/convert.jl
@@ -11,33 +11,55 @@ end
 
 ##############################################################################
 ##
-## Conversion of intermediate R objects into DataArray and DataFrame objects
+## Conversion of intermediate R objects into NullableArray and DataTable objects
 ##
 ##############################################################################
 
-namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32)
-namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32)
-namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data)))
+namask(ri::RVector{Int32}) = [i == R_NA_INT32 for i in ri.data]
+namask(rn::RNumericVector) = map(isna_float64, reinterpret(UInt64, rn.data))
 # if re or im is NA, the whole complex number is NA
-# FIXME avoid temporary Vector{Bool}
-namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)])
+namask(rc::RComplexVector) = [isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)]
 namask(rv::RNullableVector) = rv.na
 
-DataArrays.data(rv::RVEC) = DataArray(rv.data, namask(rv))
+function _julia_vector{T}(::Type{T}, rv::RVEC, force_nullable::Bool)
+    na_mask = namask(rv)
+    (force_nullable || any(na_mask)) ? NullableArray(convert(Vector{T}, rv.data), na_mask) : rv.data
+end
+
+# convert R vector into either NullableArray
+# or Array if force_nullable=false and there are no NAs
+julia_vector(rv::RVEC, force_nullable::Bool) = _julia_vector(eltype(rv.data), rv, force_nullable)
+
+function julia_vector(rl::RLogicalVector, force_nullable::Bool)
+    v = Bool[flag != zero(eltype(rl.data)) for flag in rl.data]
+    na_mask = namask(rl)
+    (force_nullable || any(na_mask)) ? NullableArray(v, na_mask) : v
+end
 
-function DataArrays.data(ri::RIntegerVector)
-    if !isfactor(ri) return DataArray(ri.data, namask(ri)) end
-    # convert factor into PooledDataArray
-    pool = getattr(ri, "levels", emptystrvec)
-    sz = length(pool)
+# converts Vector{Int32} into Vector{R} replacing R_NA_INT32 with 0
+# it's assumed that v fits into R
+na2zero{R}(::Type{R}, v::Vector{Int32}) = [ifelse(x != R_NA_INT32, x % R, zero(R)) for x in v]
+
+# convert to [Nullable]CategoricalArray{String} if `ri` is a factor,
+# or to [Nullable]Array{Int32} otherwise
+function julia_vector(ri::RIntegerVector, force_nullable::Bool)
+    isfactor(ri) || return _julia_vector(eltype(ri.data), ri, force_nullable)
+
+    # convert factor into [Nullable]CategoricalArray
+    rlevels = getattr(ri, "levels", emptystrvec)
+    sz = length(rlevels)
     REFTYPE = sz <= typemax(UInt8) ? UInt8 :
               sz <= typemax(UInt16) ? UInt16 :
               sz <= typemax(UInt32) ? UInt32 :
                                       UInt64
-    dd = ri.data
-    dd[namask(ri)] = 0
-    refs = convert(Vector{REFTYPE}, dd)
-    return PooledDataArray(DataArrays.RefArray(refs), pool)
+    # FIXME set ordered flag
+    refs = na2zero(REFTYPE, ri.data)
+    pool = CategoricalPool{String, REFTYPE}(rlevels)
+    if force_nullable || (findfirst(refs, zero(REFTYPE)) > 0)
+        return NullableCategoricalArray{String, 1, REFTYPE}(refs, pool)
+    else
+        return CategoricalArray{String, 1, REFTYPE}(refs, pool)
+    end
 end
 
 function sexp2julia(rex::RSEXPREC)
@@ -46,36 +68,32 @@ function sexp2julia(rex::RSEXPREC)
 end
 
 function sexp2julia(rv::RVEC)
-    # FIXME dimnames
-    # FIXME forceDataArrays option to always convert to DataArray
-    nas = namask(rv)
-    hasna = any(nas)
+    # TODO dimnames?
+    # FIXME forceNullable option to always convert to NullableArray
+    jv = julia_vector(rv, false)
     if hasnames(rv)
         # if data has no NA, convert to simple Vector
-        return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv))
+        return DictoVec(jv, names(rv))
     else
         hasdims = hasdim(rv)
         if !hasdims && length(rv.data)==1
             # scalar
-            # FIXME handle NAs
-            # if hasna
-            return rv.data[1]
+            return jv[1]
         elseif !hasdims
             # vectors
-            return hasna ? DataArray(rv.data, nas) : rv.data
+            return jv
         else
             # matrices and so on
-            dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...)
-            return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) :
-                           reshape(rv.data, dims)
+            dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...)
+            return reshape(jv, dims)
         end
     end
 end
 
 function sexp2julia(rl::RList)
     if isdataframe(rl)
-        # FIXME remove Any type assertion workaround
-        DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl)))
+        # FIXME forceNullable option to always convert to NullableArray
+        DataTable(Any[julia_vector(col, true) for col in rl.data], map(identifier, names(rl)))
     elseif hasnames(rl)
         DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl))
     else
diff --git a/src/io/XDRIO.jl b/src/io/XDRIO.jl
index 48ba09a..29a959e 100644
--- a/src/io/XDRIO.jl
+++ b/src/io/XDRIO.jl
@@ -4,7 +4,7 @@ type XDRIO{T<:IO} <: RDAIO
     sub::T # underlying IO stream
     buf::Vector{UInt8} # buffer for strings
 
-    @compat (::Type{XDRIO}){T <: IO}(io::T) = new{T}(io, Vector{UInt8}(1024))
+    (::Type{XDRIO}){T <: IO}(io::T) = new{T}(io, Vector{UInt8}(1024))
 end
 
 readint32(io::XDRIO) = ntoh(read(io.sub, Int32))
diff --git a/src/sxtypes.jl b/src/sxtypes.jl
index 7bb3fb0..2a4774f 100644
--- a/src/sxtypes.jl
+++ b/src/sxtypes.jl
@@ -125,7 +125,7 @@ const RComplexVector = RVector{Complex128, CPLXSXP}
 """
 immutable RNullableVector{T, S} <: RVEC{T, S}
     data::Vector{T}
-    na::BitVector # mask of NA elements
+    na::Vector{Bool} # mask of NA elements
     attr::Hash # collection of R object attributes
 end
 
diff --git a/test/RDA.jl b/test/RDA.jl
index 4666772..0153f07 100644
--- a/test/RDA.jl
+++ b/test/RDA.jl
@@ -1,54 +1,76 @@
 module TestRDA
     using Base.Test
-    using DataFrames
+    using DataTables
     using RData
     using Compat
 
     # check for Float64 NA
-    @test !RData.isna_float64(reinterpret(UInt64, 1.0))
-    @test !RData.isna_float64(reinterpret(UInt64, NaN))
-    @test !RData.isna_float64(reinterpret(UInt64, Inf))
-    @test !RData.isna_float64(reinterpret(UInt64, -Inf))
-    @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
-    # check that alternative NA is also recognized (#10)
-    @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
+    @testset "Detect R floating-point NAs" begin
+        @test !RData.isna_float64(reinterpret(UInt64, 1.0))
+        @test !RData.isna_float64(reinterpret(UInt64, NaN))
+        @test !RData.isna_float64(reinterpret(UInt64, Inf))
+        @test !RData.isna_float64(reinterpret(UInt64, -Inf))
+        @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
+        # check that alternative NA is also recognized (#10)
+        @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
+    end
 
     testdir = dirname(@__FILE__)
+    @testset "Reading minimal RData" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]))
+        @test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df
+        @test load("$testdir/data/minimal.rda",convert=true)["df"] == df
+        @test load("$testdir/data/minimal_ascii.rda")["df"] == df
+    end
 
-    df = DataFrame(num = [1.1, 2.2])
-    @test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df)
-    @test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df)
-    @test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df)
-
-    df[:int] = Int32[1, 2]
-    df[:logi] = [true, false]
-    df[:chr] = ["ab", "c"]
-    df[:factor] = pool(df[:chr])
-    df[:cplx] = Complex128[1.1+0.5im, 1.0im]
-    @test isequal(sexp2julia(load("$testdir/data/types.rda",convert=false)["df"]), df)
-    @test isequal(sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"]), df)
-
-    df[2, :] = NA
-    append!(df, df[2, :])
-    df[3, :num] = NaN
-    df[:, :cplx] = @data [NA, @compat(Complex128(1,NaN)), NaN]
-    @test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
-    # ASCII format saves NaN as NA
-    df[3, :num] = NA
-    df[:, :cplx] = @data [NA, NA, NA]
-    @test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
-
-    rda_names = names(sexp2julia(load("$testdir/data/names.rda",convert=false)["df"]))
-    expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
-    @test rda_names == expected_names
-    rda_names = names(sexp2julia(load("$testdir/data/names_ascii.rda",convert=false)["df"]))
-    @test rda_names == [:_end, :x!, :x1, :_B_C_, :x, :x_1]
-
-    rda_envs = load("$testdir/data/envs.rda",convert=false)
-
-    rda_pairlists = load("$testdir/data/pairlists.rda",convert=false)
-
-    rda_closures = load("$testdir/data/closures.rda",convert=false)
-
-    rda_cmpfuns = load("$testdir/data/cmpfun.rda",convert=false)
+    @testset "Conversion to Julia types" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]),
+                       int = NullableArray(Int32[1, 2]),
+                       logi = NullableArray([true, false]),
+                       chr = NullableArray(["ab", "c"]),
+                       factor = categorical(NullableArray(["ab", "c"]), true),
+                       cplx = NullableArray(Complex128[1.1+0.5im, 1.0im]))
+        rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"])
+        @test eltypes(rdf) == eltypes(df)
+        @test rdf == df
+        rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"])
+        @test eltypes(rdf_ascii) == eltypes(df)
+        @test rdf_ascii == df
+    end
+
+    @testset "NAs conversion" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]),
+                       int = NullableArray(Int32[1, 2]),
+                       logi = NullableArray([true, false]),
+                       chr = NullableArray(["ab", "c"]),
+                       factor = categorical(NullableArray(["ab", "c"]), true),
+                       cplx = NullableArray(Complex128[1.1+0.5im, 1.0im]))
+        df[2, :] = Nullable()
+        append!(df, df[2, :])
+        df[3, :num] = NaN
+        df[:, :cplx] = NullableArray([Nullable(), Complex128(1,NaN), NaN])
+        @test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
+        # ASCII format saves NaN as NA
+        df[3, :num] = Nullable()
+        df[:, :cplx] = NullableArray{Complex128}(3)
+        @test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
+    end
+
+    @testset "Column names conversion" begin
+        rda_names = names(sexp2julia(load("$testdir/data/names.rda",convert=false)["df"]))
+        expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
+        @test rda_names == expected_names
+        rda_names = names(sexp2julia(load("$testdir/data/names_ascii.rda",convert=false)["df"]))
+        @test rda_names == expected_names
+    end
+
+    @testset "Reading RDA with complex types (environments, closures etc)" begin
+        rda_envs = load("$testdir/data/envs.rda",convert=false)
+
+        rda_pairlists = load("$testdir/data/pairlists.rda",convert=false)
+
+        rda_closures = load("$testdir/data/closures.rda",convert=false)
+
+        rda_cmpfuns = load("$testdir/data/cmpfun.rda",convert=false)
+    end
 end