JuliaData · alyst · Mar 9, 2017 · Jul 13, 2017 · Oct 1, 2016 · Mar 9, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -8,8 +8,8 @@ julia:
 notifications:
   email: false
 # uncomment the following lines to override the default test script
-#script:
-#  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
-#  - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("RData"); Pkg.test("RData"; coverage=true)'
+script:
+  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
+  - julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("RData"); Pkg.checkout("DataTables", "master"); Pkg.test("RData"; coverage=true)'
 after_success:
   - julia -e 'cd(Pkg.dir("RData")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,13 @@
+## RData v0.1.0 Release Notes
+
+Switched from `DataFrames` to `DataTables`; dropped Julia v0.4 support.
+
+##### Changes
+* using `NullableArrays.jl` and `CategoricalArrays.jl`
+  instead of `DataArrays.jl` ([#19], see [JuliaStats/DataFrames.jl#1008])
+* Julia v0.4 is no longer supported (`DataTables.jl` requires v0.5)
+* R logical vectors are converted to `Vector{Bool}` (instead of `Vector{Int32}`)
+
 ## RData v0.0.4 Release Notes
 
 Now the recommended way to load `.RData`/`.rda` files is by `FileIO.load()`.
@@ -15,5 +25,7 @@ Initial release based on `DataFrames.read_rda()` ([JuliaStats/DataFrames.jl#1031
 [#9]: https://github.com/JuliaStats/RData.jl/issues/9
 [#10]: https://github.com/JuliaStats/RData.jl/issues/10
 [#15]: https://github.com/JuliaStats/RData.jl/issues/15
+[#19]: https://github.com/JuliaStats/RData.jl/issues/19
 
+[JuliaStats/DataFrames.jl#1008]: https://github.com/JuliaStats/DataFrames.jl/pull/1008
 [JuliaStats/DataFrames.jl#1031]: https://github.com/JuliaStats/DataFrames.jl/pull/1031
diff --git a/REQUIRE b/REQUIRE
@@ -1,6 +1,5 @@
 julia 0.5
-DataFrames 0.7
-DataArrays 0.3
+DataTables
 FileIO 0.1.2
 GZip 0.2
 Compat 0.17
diff --git a/appveyor.yml b/appveyor.yml
@@ -2,6 +2,8 @@ environment:
   matrix:
   - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
   - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
   - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
 
@@ -34,7 +36,7 @@ build_script:
 # Need to convert from shallow to complete for Pkg.clone to work
   - IF EXIST .git\shallow (git fetch --unshallow)
   - C:\projects\julia\bin\julia -e "versioninfo();
-      Pkg.clone(pwd(), \"RData\"); Pkg.build(\"RData\")"
+      Pkg.clone(pwd(), \"RData\"); Pkg.build(\"RData\"); Pkg.checkout(\"DataTables\", \"master\")"
 
 test_script:
   - C:\projects\julia\bin\julia -e "Pkg.test(\"RData\")"
diff --git a/src/RData.jl b/src/RData.jl
@@ -2,9 +2,8 @@ __precompile__()
 
 module RData
 
-using Compat, DataFrames, GZip, FileIO
-import DataArrays: data
-import DataFrames: identifier
+using Compat, DataTables, GZip, FileIO
+import DataTables: identifier
 import Compat: unsafe_string
 import FileIO: load
 

diff --git a/src/convert.jl b/src/convert.jl
@@ -11,33 +11,55 @@ end
 
 ##############################################################################
 ##
-## Conversion of intermediate R objects into DataArray and DataFrame objects
+## Conversion of intermediate R objects into NullableArray and DataTable objects
 ##
 ##############################################################################
 
-namask(rl::RLogicalVector) = BitArray(rl.data .== R_NA_INT32)
-namask(ri::RIntegerVector) = BitArray(ri.data .== R_NA_INT32)
-namask(rn::RNumericVector) = BitArray(map(isna_float64, reinterpret(UInt64, rn.data)))
+namask(ri::RVector{Int32}) = [i == R_NA_INT32 for i in ri.data]
+namask(rn::RNumericVector) = map(isna_float64, reinterpret(UInt64, rn.data))
 # if re or im is NA, the whole complex number is NA
-# FIXME avoid temporary Vector{Bool}
-namask(rc::RComplexVector) = BitArray([isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)])
+namask(rc::RComplexVector) = [isna_float64(v.re) || isna_float64(v.im) for v in reinterpret(Complex{UInt64}, rc.data)]
 namask(rv::RNullableVector) = rv.na
 
-DataArrays.data(rv::RVEC) = DataArray(rv.data, namask(rv))
+function _julia_vector{T}(::Type{T}, rv::RVEC, force_nullable::Bool)
+    na_mask = namask(rv)
+    (force_nullable || any(na_mask)) ? NullableArray(convert(Vector{T}, rv.data), na_mask) : rv.data
+end
+
+# convert R vector into a NullableArray, or into a plain Array
+# when force_nullable=false and the vector contains no NAs
+julia_vector(rv::RVEC, force_nullable::Bool) = _julia_vector(eltype(rv.data), rv, force_nullable)
+
+function julia_vector(rl::RLogicalVector, force_nullable::Bool)
+    v = Bool[flag != zero(eltype(rl.data)) for flag in rl.data]
+    na_mask = namask(rl)
+    (force_nullable || any(na_mask)) ? NullableArray(v, na_mask) : v
+end
 
-function DataArrays.data(ri::RIntegerVector)
-    if !isfactor(ri) return DataArray(ri.data, namask(ri)) end
-    # convert factor into PooledDataArray
-    pool = getattr(ri, "levels", emptystrvec)
-    sz = length(pool)
+# converts Vector{Int32} into Vector{R}, replacing R_NA_INT32 with zero(R);
+# every non-NA element of v is assumed to fit into R (`x % R` is unchecked)
+na2zero{R}(::Type{R}, v::Vector{Int32}) = [ifelse(x != R_NA_INT32, x % R, zero(R)) for x in v]
+
+# convert to [Nullable]CategoricalArray{String} if `ri` is a factor,
+# or to [Nullable]Array{Int32} otherwise
+function julia_vector(ri::RIntegerVector, force_nullable::Bool)
+    isfactor(ri) || return _julia_vector(eltype(ri.data), ri, force_nullable)
+
+    # convert factor into [Nullable]CategoricalArray
+    rlevels = getattr(ri, "levels", emptystrvec)
+    sz = length(rlevels)
     REFTYPE = sz <= typemax(UInt8)  ? UInt8 :
               sz <= typemax(UInt16) ? UInt16 :
               sz <= typemax(UInt32) ? UInt32 :
                                       UInt64
-    dd = ri.data
-    dd[namask(ri)] = 0
-    refs = convert(Vector{REFTYPE}, dd)
-    return PooledDataArray(DataArrays.RefArray(refs), pool)
+    # FIXME set ordered flag
+    refs = na2zero(REFTYPE, ri.data)
+    pool = CategoricalPool{String, REFTYPE}(rlevels)
+    if force_nullable || (findfirst(refs, zero(REFTYPE)) > 0)
+        return NullableCategoricalArray{String, 1, REFTYPE}(refs, pool)
+    else
+        return CategoricalArray{String, 1, REFTYPE}(refs, pool)
+    end
 end
 
 function sexp2julia(rex::RSEXPREC)
@@ -46,36 +68,32 @@ function sexp2julia(rex::RSEXPREC)
 end
 
 function sexp2julia(rv::RVEC)
-    # FIXME dimnames
-    # FIXME forceDataArrays option to always convert to DataArray
-    nas = namask(rv)
-    hasna = any(nas)
+    # TODO dimnames?
+    # FIXME forceNullable option to always convert to NullableArray
+    jv = julia_vector(rv, false)
     if hasnames(rv)
         # if data has no NA, convert to simple Vector
-        return DictoVec(hasna ? DataArray(rv.data, nas) : rv.data, names(rv))
+        return DictoVec(jv, names(rv))
     else
         hasdims = hasdim(rv)
         if !hasdims && length(rv.data)==1
             # scalar
-            # FIXME handle NAs
-            # if hasna
-            return rv.data[1]
+            return jv[1]
         elseif !hasdims
             # vectors
-            return hasna ? DataArray(rv.data, nas) : rv.data
+            return jv
         else
             # matrices and so on
-            dims = tuple(convert(Vector{Int64}, getattr(rv, "dim"))...)
-            return hasna ? DataArray(reshape(rv.data, dims), reshape(nas, dims)) :
-                         reshape(rv.data, dims)
+            dims = tuple(convert(Vector{Int}, getattr(rv, "dim"))...)
+            return reshape(jv, dims)
         end
     end
 end
 
 function sexp2julia(rl::RList)
     if isdataframe(rl)
-        # FIXME remove Any type assertion workaround
-        DataFrame(Any[data(col) for col in rl.data], map(identifier, names(rl)))
+        # FIXME forceNullable option to always convert to NullableArray
+        DataTable(Any[julia_vector(col, true) for col in rl.data], map(identifier, names(rl)))
     elseif hasnames(rl)
         DictoVec(Any[sexp2julia(item) for item in rl.data], names(rl))
     else

diff --git a/src/io/XDRIO.jl b/src/io/XDRIO.jl
@@ -4,7 +4,7 @@
 type XDRIO{T<:IO} <: RDAIO
     sub::T             # underlying IO stream
     buf::Vector{UInt8} # buffer for strings
-    @compat (::Type{XDRIO}){T <: IO}(io::T) = new{T}(io, Vector{UInt8}(1024))
+    (::Type{XDRIO}){T <: IO}(io::T) = new{T}(io, Vector{UInt8}(1024))
 end
 
 readint32(io::XDRIO) = ntoh(read(io.sub, Int32))

diff --git a/src/sxtypes.jl b/src/sxtypes.jl
@@ -125,7 +125,7 @@ const RComplexVector = RVector{Complex128, CPLXSXP}
 """
 immutable RNullableVector{T, S} <: RVEC{T, S}
     data::Vector{T}
-    na::BitVector                # mask of NA elements
+    na::Vector{Bool}             # mask of NA elements
     attr::Hash                   # collection of R object attributes
 end
 

diff --git a/test/RDA.jl b/test/RDA.jl
@@ -1,54 +1,76 @@
 module TestRDA
     using Base.Test
-    using DataFrames
+    using DataTables
     using RData
     using Compat
 
     # check for Float64 NA
-    @test !RData.isna_float64(reinterpret(UInt64, 1.0))
-    @test !RData.isna_float64(reinterpret(UInt64, NaN))
-    @test !RData.isna_float64(reinterpret(UInt64, Inf))
-    @test !RData.isna_float64(reinterpret(UInt64, -Inf))
-    @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
-    # check that alternative NA is also recognized (#10)
-    @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
+    @testset "Detect R floating-point NAs" begin
+        @test !RData.isna_float64(reinterpret(UInt64, 1.0))
+        @test !RData.isna_float64(reinterpret(UInt64, NaN))
+        @test !RData.isna_float64(reinterpret(UInt64, Inf))
+        @test !RData.isna_float64(reinterpret(UInt64, -Inf))
+        @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64))
+        # check that alternative NA is also recognized (#10)
+        @test RData.isna_float64(reinterpret(UInt64, RData.R_NA_FLOAT64 | ((Base.significand_mask(Float64) + 1) >> 1)))
+    end
 
     testdir = dirname(@__FILE__)
+    @testset "Reading minimal RData" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]))
+        @test sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]) == df
+        @test load("$testdir/data/minimal.rda",convert=true)["df"] == df
+        @test load("$testdir/data/minimal_ascii.rda")["df"] == df
+    end
 
-    df = DataFrame(num = [1.1, 2.2])
-    @test isequal(sexp2julia(load("$testdir/data/minimal.rda",convert=false)["df"]), df)
-    @test isequal(load("$testdir/data/minimal.rda",convert=true)["df"], df)
-    @test isequal(load("$testdir/data/minimal_ascii.rda")["df"], df)
-
-    df[:int] = Int32[1, 2]
-    df[:logi] = [true, false]
-    df[:chr] = ["ab", "c"]
-    df[:factor] = pool(df[:chr])
-    df[:cplx] = Complex128[1.1+0.5im, 1.0im]
-    @test isequal(sexp2julia(load("$testdir/data/types.rda",convert=false)["df"]), df)
-    @test isequal(sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"]), df)
-
-    df[2, :] = NA
-    append!(df, df[2, :])
-    df[3, :num] = NaN
-    df[:, :cplx] = @data [NA, @compat(Complex128(1,NaN)), NaN]
-    @test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
-    # ASCII format saves NaN as NA
-    df[3, :num] = NA
-    df[:, :cplx] = @data [NA, NA, NA]
-    @test isequal(sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]), df)
-
-    rda_names = names(sexp2julia(load("$testdir/data/names.rda",convert=false)["df"]))
-    expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
-    @test rda_names == expected_names
-    rda_names = names(sexp2julia(load("$testdir/data/names_ascii.rda",convert=false)["df"]))
-    @test rda_names == [:_end, :x!, :x1, :_B_C_, :x, :x_1]
-
-    rda_envs = load("$testdir/data/envs.rda",convert=false)
-
-    rda_pairlists = load("$testdir/data/pairlists.rda",convert=false)
-
-    rda_closures = load("$testdir/data/closures.rda",convert=false)
-
-    rda_cmpfuns = load("$testdir/data/cmpfun.rda",convert=false)
+    @testset "Conversion to Julia types" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]),
+                       int = NullableArray(Int32[1, 2]),
+                       logi = NullableArray([true, false]),
+                       chr = NullableArray(["ab", "c"]),
+                       factor = categorical(NullableArray(["ab", "c"]), true),
+                       cplx = NullableArray(Complex128[1.1+0.5im, 1.0im]))
+        rdf = sexp2julia(load("$testdir/data/types.rda",convert=false)["df"])
+        @test eltypes(rdf) == eltypes(df)
+        @test rdf == df
+        rdf_ascii = sexp2julia(load("$testdir/data/types_ascii.rda",convert=false)["df"])
+        @test eltypes(rdf_ascii) == eltypes(df)
+        @test rdf_ascii == df
+    end
+
+    @testset "NAs conversion" begin
+        df = DataTable(num = NullableArray([1.1, 2.2]),
+                       int = NullableArray(Int32[1, 2]),
+                       logi = NullableArray([true, false]),
+                       chr = NullableArray(["ab", "c"]),
+                       factor = categorical(NullableArray(["ab", "c"]), true),
+                       cplx = NullableArray(Complex128[1.1+0.5im, 1.0im]))
+        df[2, :] = Nullable()
+        append!(df, df[2, :])
+        df[3, :num] = NaN
+        df[:, :cplx] = NullableArray([Nullable(), Complex128(1,NaN), NaN])
+        @test isequal(sexp2julia(load("$testdir/data/NAs.rda",convert=false)["df"]), df)
+        # ASCII format saves NaN as NA
+        df[3, :num] = Nullable()
+        df[:, :cplx] = NullableArray{Complex128}(3)
+        @test sexp2julia(load("$testdir/data/NAs_ascii.rda",convert=false)["df"]) == df
+    end
+
+    @testset "Column names conversion" begin
+        rda_names = names(sexp2julia(load("$testdir/data/names.rda",convert=false)["df"]))
+        expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
+        @test rda_names == expected_names
+        rda_names = names(sexp2julia(load("$testdir/data/names_ascii.rda",convert=false)["df"]))
+        @test rda_names == [:_end, :x!, :x1, :_B_C_, :x, :x_1]
+    end
+
+    @testset "Reading RDA with complex types (environments, closures etc)" begin
+        rda_envs = load("$testdir/data/envs.rda",convert=false)
+
+        rda_pairlists = load("$testdir/data/pairlists.rda",convert=false)
+
+        rda_closures = load("$testdir/data/closures.rda",convert=false)
+
+        rda_cmpfuns = load("$testdir/data/cmpfun.rda",convert=false)
+    end
 end