diff --git a/NEWS.md b/NEWS.md index 87f2e87..b753f8b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,9 +2,12 @@ ##### Changes * add support for `.rds` files (single object data files from R) [#22], [#33] +* add support for `Date` and `POSIXct` data (time zones of `POSIXct` values are recognized only if their codes are supported by [TimeZones](https://github.com/JuliaTime/TimeZones.jl)) [#34], [#35] [#22]: https://github.com/JuliaStats/RData.jl/issues/22 [#33]: https://github.com/JuliaStats/RData.jl/issues/33 +[#34]: https://github.com/JuliaStats/RData.jl/issues/34 +[#35]: https://github.com/JuliaStats/RData.jl/issues/35 ## RData v0.3.0 Release Notes diff --git a/README.md b/README.md index 7c29819..3a48fe6 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ convert R objects into Julia equivalents: | named vector, list | `DictoVec` | `DictoVec` allows indexing both by element index and by its name, just as R vectors and lists | | vector | `Vector{T}` | `T` is the appropriate Julia type. If R vector contains `NA` values, they are converted to [`missing`](https://github.com/JuliaData/Missings.jl), and the elements type of the resulting `Vector` is `Union{T, Missing}`. | factor | `CategoricalArray` | [CategoricalArrays.jl](https://github.com/JuliaData/CategoricalArrays.jl) | +| `Date` | `Dates.Date` | | +| `POSIXct` date time | `ZonedDateTime` | [TimeZones.jl](https://github.com/JuliaTime/TimeZones.jl) | | data frame | `DataFrame` | [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) | If conversion to the Julia type is not supported (e.g. R closure or language expression), `load()` will return the internal RData representation of the object (`RSEXPREC` subtype). 
diff --git a/REQUIRE b/REQUIRE index f9e2ce0..a3ee828 100644 --- a/REQUIRE +++ b/REQUIRE @@ -4,3 +4,4 @@ Missings 0.2 CategoricalArrays 0.3 FileIO 0.1.2 CodecZlib 0.4 +TimeZones diff --git a/src/RData.jl b/src/RData.jl index 31782f6..83b925a 100644 --- a/src/RData.jl +++ b/src/RData.jl @@ -2,7 +2,7 @@ __precompile__() module RData -using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO +using DataFrames, CategoricalArrays, Missings, CodecZlib, FileIO, TimeZones import DataFrames: identifier import FileIO: load diff --git a/src/config.jl b/src/config.jl index 5ad3594..cde0447 100644 --- a/src/config.jl +++ b/src/config.jl @@ -37,3 +37,7 @@ const Hash = Dict{RString, Any} const emptyhash = Hash() const emptyhashkey = RString("\0") + +const R_Date_Class = ["Date"] +const R_POSIXct_Class = ["POSIXct", "POSIXt"] + diff --git a/src/convert.jl b/src/convert.jl index ac20812..1558008 100644 --- a/src/convert.jl +++ b/src/convert.jl @@ -1,6 +1,8 @@ # converters from selected RSEXPREC to Hash # They are used to translate SEXPREC attributes into Hash +import TimeZones: istimezone, unix2zdt, ZonedDateTime + function Base.convert(::Type{Hash}, pl::RPairList) res = Hash() for i in eachindex(pl.items) @@ -51,7 +53,16 @@ function jlvec(::Type{T}, rv::RNullableVector{R}, force_missing::Bool=true) wher end # convert R vector into Vector of appropriate type -jlvec(rv::RVEC, force_missing::Bool=true) = jlvec(eltype(rv.data), rv, force_missing) +function jlvec(rv::RVEC, force_missing::Bool=true) + cls = class(rv) + if cls == R_Date_Class + return jlvec(Dates.Date, rv, force_missing) + elseif cls == R_POSIXct_Class + return jlvec(ZonedDateTime, rv, force_missing) + else + return jlvec(eltype(rv.data), rv, force_missing) + end +end # convert R logical vector (uses Int32 to store values) into Vector{Bool[?]} function jlvec(rl::RLogicalVector, force_missing::Bool=true) @@ -89,6 +100,33 @@ function jlvec(ri::RIntegerVector, force_missing::Bool=true) end end +# convert R Date to 
Dates.Date (NaN values become `missing`) +function jlvec(::Type{Dates.Date}, rv::RVEC, force_missing::Bool=true) + @assert class(rv) == R_Date_Class + nas = isnan.(rv.data) + if force_missing || any(nas) + dates = Union{Dates.Date, Missing}[isna ? missing : rdays2date(dtfloat) + for (isna, dtfloat) in zip(nas, rv.data)] + else + dates = rdays2date.(rv.data) + end + return dates +end + +# convert R POSIXct to ZonedDateTime (NaN values become `missing`) +function jlvec(::Type{ZonedDateTime}, rv::RVEC, force_missing::Bool=true) + @assert class(rv) == R_POSIXct_Class + tz, validtz = getjuliatz(rv) + nas = isnan.(rv.data) + if force_missing || any(nas) + datetimes = Union{ZonedDateTime, Missing}[isna ? missing : unix2zdt(dtfloat, tz=tz) + for (isna, dtfloat) in zip(nas, rv.data)] + else + datetimes = unix2zdt.(rv.data, tz=tz) + end + return datetimes +end + function sexp2julia(rex::RSEXPREC) warn("Conversion of $(typeof(rex)) to Julia is not implemented") return nothing @@ -128,3 +166,37 @@ function sexp2julia(rl::RList) map(sexp2julia, rl.data) end end + +function rdays2date(days::Real) + const epoch_conv = 719528 # Dates.date2epochdays(Date("1970-01-01")) + Dates.epochdays2date(days + epoch_conv) +end + +# gets R timezone from the `tzone` attribute of the data and converts it to TimeZones.TimeZone +# returns the same (timezone, flag) tuple as r2juliatz() +function getjuliatz(rv::RVEC, deftz=tz"UTC") + tzattr = getattr(rv, "tzone", [""])[1] + if tzattr == "" + return deftz, true # R will store a blank for tzone; fall back to `deftz` + else + return r2juliatz(tzattr, deftz) + end +end + +# converts R timezone code to TimeZones.TimeZone +# returns a tuple: +# - timezone (or `deftz` if `rtz` is not recognized as a valid time zone) +# - boolean flag: true if `rtz` is recognized as a valid time zone, false otherwise +function r2juliatz(rtz::AbstractString, deftz=tz"UTC") + valid = istimezone(rtz) + if !valid + warn("Could not determine the timezone of '$(rtz)', treating as $deftz.") + return deftz, false + else + return TimeZone(rtz), true + end +end + +function unix2zdt(seconds::Real; tz::TimeZone=tz"UTC") + 
ZonedDateTime(Dates.unix2datetime(seconds), tz, from_utc=true) +end diff --git a/test/RDS.jl b/test/RDS.jl index 985ef50..c5db53d 100644 --- a/test/RDS.jl +++ b/test/RDS.jl @@ -2,6 +2,7 @@ module TestRDS using Base.Test using DataFrames using RData + using TimeZones testdir = dirname(@__FILE__) @@ -42,5 +43,68 @@ module TestRDS @test eltypes(rdf_decomp) == eltypes(df) @test isequal(rdf_decomp, df) end -end + @testset "Test Date conversion" begin + dates = load("$testdir/data/dates.rds") + @test dates[1] == Date("2017-01-01") + Dates.Day.(1:4) + @test dates[2] == Date("2017-01-02") + @test dates[3] isa DictoVec + @test dates[3].data == Date("2017-01-01") + Dates.Day.(1:4) + @test [dates[3].index2name[i] for i in 1:length(dates[3])] == ["A", "B", "C", "D"] + @test dates[4] isa DictoVec + @test dates[4].data == [Date("2017-01-02")] + @test dates[4].index2name[1] == "A" + end + + @testset "Test DateTime conversion" begin + datetimes = load("$testdir/data/datetimes.rds") + testdts = ZonedDateTime.(DateTime("2017-01-01T13:23") + Dates.Second.(1:4), + TimeZone("UTC")) + @test datetimes[1] == testdts + @test datetimes[2] == testdts[1] + @test datetimes[3] isa DictoVec + @test datetimes[3].data == testdts + @test [datetimes[3].index2name[i] for i in 1:length(datetimes[3])] == ["A", "B", "C", "D"] + @test datetimes[4] isa DictoVec + @test datetimes[4].data == [testdts[1]] + @test datetimes[4].index2name[1] == "A" + end + + @testset "Test Date and DateTime in a DataFrame" begin + rdfs = load("$testdir/data/datedfs.rds") + df = DataFrame(date=Date("2017-01-01") + Dates.Day.(1:4), + datetime=ZonedDateTime.(DateTime("2017-01-01T13:23") + Dates.Second.(1:4), + tz"UTC")) + @test length(rdfs) == 2 + @test rdfs[1] isa DataFrame + @test rdfs[2] isa DataFrame + @test eltypes(df) == eltypes(rdfs[1]) + @test eltypes(df) == eltypes(rdfs[2]) + @test isequal(df[1, :], rdfs[1]) + @test isequal(df, rdfs[2]) + end + + @testset "Test NA Date and DateTime conversion" begin + dates = 
load("$testdir/data/datesNA.rds") + + testdates = [Date("2017-01-01") + Dates.Day.(1:4); missing] + @test all(dates[1] .=== testdates) + + testdts = [ZonedDateTime.(DateTime("2017-01-01T13:23") + Dates.Second.(1:4), tz"UTC"); + missing] + @test all(dates[2] .=== testdts) + end + + @testset "Test DateTime timezones" begin + # tz"CST" is not supported by TimeZones.jl + datetimes = @test_warn "Could not determine the timezone of 'CST', treating as UTC." begin + load("$testdir/data/datetimes_tz.rds") + end + # assumes generate_rda.R was run on a system set to PST! + @test datetimes[1] == ZonedDateTime(DateTime("2017-01-01T21:23"), tz"UTC") + # should be tz"CST", but gets substituted with tz"UTC" + # FIXME update the test when CST is supported + @test datetimes[2] == ZonedDateTime(DateTime("2017-01-01T13:23"), tz"UTC") + @test datetimes[3] == ZonedDateTime(DateTime("2017-01-01T13:23"), tz"America/Chicago") + end +end diff --git a/test/data/datedfs.rds b/test/data/datedfs.rds new file mode 100644 index 0000000..f41707d Binary files /dev/null and b/test/data/datedfs.rds differ diff --git a/test/data/dates.rds b/test/data/dates.rds new file mode 100644 index 0000000..1cb935f Binary files /dev/null and b/test/data/dates.rds differ diff --git a/test/data/datesNA.rds b/test/data/datesNA.rds new file mode 100644 index 0000000..1ee51e0 Binary files /dev/null and b/test/data/datesNA.rds differ diff --git a/test/data/datetimes.rds b/test/data/datetimes.rds new file mode 100644 index 0000000..062bca3 Binary files /dev/null and b/test/data/datetimes.rds differ diff --git a/test/data/datetimes_tz.rds b/test/data/datetimes_tz.rds new file mode 100644 index 0000000..9a9fc41 Binary files /dev/null and b/test/data/datetimes_tz.rds differ diff --git a/test/generate_rda.R b/test/generate_rda.R index 4998c11..a354c3a 100644 --- a/test/generate_rda.R +++ b/test/generate_rda.R @@ -55,3 +55,29 @@ save(test.cmpfun0, test.cmpfun1, test.cmpfun2, file = "data/cmpfun.rda") x <- factor(c("a", 
"b", "c")) y <- ordered(x, levels=c("b", "a", "c")) save(x, y, file="data/ord.rda") + +dates = as.Date("2017-01-01") + 1:4 +datetimes = as.POSIXct("2017-01-01 13:23", tz="UTC") + 1:4 +dateNAs = list(c(dates, NA), c(datetimes, NA)) +saveRDS(dateNAs, file="data/datesNA.rds") +datelst = list(dates, dates[1]) +names(dates) = LETTERS[1:length(dates)] +datelst = c(datelst, list(dates), list(dates[1])) +saveRDS(datelst, file="data/dates.rds") +dtlst = list(datetimes, datetimes[1]) +names(datetimes) = LETTERS[1:length(datetimes)] +dtlst = c(dtlst, list(datetimes), list(datetimes[1])) +saveRDS(dtlst, file="data/datetimes.rds") +datedfs = list(data.frame(date=dates[1], datetime=datetimes[1]), + data.frame(date=dates, datetime=datetimes)) +saveRDS(datedfs, file="data/datedfs.rds") + +# the first element here is assumed to be in the local timezone but is saved in +# UTC time, without any timezone attribute. When R reads it, it assumes local time. +# So the test associated with this first datapoint depends on the timezone in which +# the data was generated! (PST/-8) +saveRDS(list(as.POSIXct("2017-01-01 13:23"), + as.POSIXct("2017-01-01 13:23", tz="CST"), + as.POSIXct("2017-01-01 13:23", tz="America/Chicago")), + file="data/datetimes_tz.rds") +