reverse indices of mask_filled and improve tests

bkamins · bkamins · commit fd496dcfa681 · 2017-12-11T23:58:42.000+01:00
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -152,7 +152,7 @@ Unstacks a DataFrame; convert from a long to wide format
 ```julia
 unstack(df::AbstractDataFrame, rowkeys::Union{Symbol, Integer},
         colkey::Union{Symbol, Integer}, value::Union{Symbol, Integer})
-unstack(df::AbstractDataFrame, rowkeys::Union{AbstractVector{<:Union{Symbol, Integer}}},
+unstack(df::AbstractDataFrame, rowkeys::AbstractVector{<:Union{Symbol, Integer}},
         colkey::Union{Symbol, Integer}, value::Union{Symbol, Integer})
 unstack(df::AbstractDataFrame, colkey::Union{Symbol, Integer},
         value::Union{Symbol, Integer})
@@ -198,9 +198,9 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
 Note that there are some differences between the widened results above.
 """
 function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
-    refkeycol = deepcopy(categorical(df[rowkey]))
+    refkeycol = deepcopy(categorical(df[rowkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
     droplevels!(refkeycol)
-    keycol = deepcopy(categorical(df[colkey]))
+    keycol = deepcopy(categorical(df[colkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
     droplevels!(keycol)
     valuecol = df[value]
     _unstack(df, rowkey, colkey, value, keycol, valuecol, refkeycol)
@@ -212,9 +212,9 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
     Ncol = length(keycol.pool)
     unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
     hadmissing = false # have we encountered missing in refkeycol
-    mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
-    warned_dup = false # hawe we already printed duplicate entries warning?
-    warned_missing = false # hawe we already printed missing in keycol warning?
+    mask_filled = falses(Nrow+1, Ncol) # has a given [row,col] entry been filled?
+    warned_dup = false # have we already printed duplicate entries warning?
+    warned_missing = false # have we already printed missing in keycol warning?
     keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
     refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
     for k in 1:nrow(df)
@@ -240,13 +240,13 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
         else
             i = refkeycol_order[refkref]
         end
-        if !warned_dup && mask_filled[j, i]
+        if !warned_dup && mask_filled[i, j]
             warn("Duplicate entries in unstack at row $k for key "*
                  "$(refkeycol[k]) and variable $(keycol[k]).")
             warned_dup = true
         end
         unstacked_val[j][i] = valuecol[k]
-        mask_filled[j, i] = true
+        mask_filled[i, j] = true
     end
     levs = levels(refkeycol)
     # we have to handle a case with missings in refkeycol as levs will skip missing
@@ -279,7 +279,7 @@ function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey:
     length(rowkeys) == 0 && throw(ArgumentError("No key column found"))
     length(rowkeys) == 1 && return unstack(df, rowkeys[1], colkey, value)
     g = groupby(df, rowkeys, sort=true)
-    keycol = deepcopy(categorical(df[colkey]))
+    keycol = deepcopy(categorical(df[colkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
     droplevels!(keycol)
     valuecol = df[value]
     _unstack(df, rowkeys, colkey, value, keycol, valuecol, g)
@@ -296,7 +296,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
     Nrow = length(g)
     Ncol = length(levels(keycol))
     unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
-    mask_filled = falses(Ncol, Nrow)
+    mask_filled = falses(Nrow, Ncol)
     warned_dup = false
     warned_missing = false
     keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
@@ -311,13 +311,13 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
         end
         j = keycol_order[kref]
         i = rowkey[k]
-        if !warned_dup && mask_filled[j, i]
+        if !warned_dup && mask_filled[i, j]
             warn("Duplicate entries in unstack at row $k for key "*
                  "$(tuple((df[1,s] for s in rowkeys)...)) and variable $(keycol[k]).")
             warned_dup = true
         end
         unstacked_val[j][i] = valuecol[k]
-        mask_filled[j, i] = true
+        mask_filled[i, j] = true
     end
     df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
     hcat(df1, df2)
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -396,6 +396,7 @@ module TestDataFrame
         @test udf == DataFrame(Any[Union{Int, Missing}[1, 2], Union{Int, Missing}[1, 5],
                                    Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
                                    Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
+        @test isa(udf[1], Vector{Int})
         @test all(isa.(udf.columns[2:end], Vector{Union{Int, Missing}}))
         df = DataFrame(Any[categorical(repeat(1:2, inner=4)),
                            categorical(repeat('a':'d', outer=2)), categorical(1:8)],
@@ -405,6 +406,7 @@ module TestDataFrame
         @test udf == DataFrame(Any[Union{Int, Missing}[1, 2], Union{Int, Missing}[1, 5],
                                    Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
                                    Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
+        @test isa(udf[1], CategoricalVector{Int64})
         @test all(isa.(udf.columns[2:end], CategoricalVector{Union{Int, Missing}}))
     end
 
@@ -440,11 +442,17 @@ module TestDataFrame
                        variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
                        value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
         @test_warn "Missing value in variable variable at row 3. Skipping." unstack(df)
+        udf = unstack(df)
+        @test names(udf) == [:id, :a, :b, :missing]
+        @test udf[:missing] ≅ [missing, 9.0, missing]
         df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
                        id2=[1, 1, 1, missing, missing, missing, 2, 2, 2],
                        variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
                        value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
         @test_warn "Missing value in variable variable at row 3. Skipping." unstack(df, 3, 4)
+        udf = unstack(df, 3, 4)
+        @test names(udf) == [:id, :id2, :a, :b, :missing]
+        @test udf[:missing] ≅ [missing, 9.0, missing]
     end
 
     @testset "stack-unstack correctness" begin