Move unstacked_val allocation into _unstack and drop unused keycol levels

bkamins · bkamins · commit 285a0280e1a5 · 2017-12-10T20:50:08.000+01:00
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -201,17 +201,16 @@ function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
     refkeycol = CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey])
     droplevels!(refkeycol)
     keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
+    droplevels!(keycol)
     valuecol = df[value]
-    Nrow = length(refkeycol.pool)
-    Ncol = length(keycol.pool)
-    df2m = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
-    _unstack(df, rowkey, colkey, value, keycol, valuecol, df2m, refkeycol)
+    _unstack(df, rowkey, colkey, value, keycol, valuecol, refkeycol)
 end
 
 function _unstack(df::AbstractDataFrame, rowkey::Int,
-                  colkey::Int, value::Int, keycol, valuecol, df2m, refkeycol)
+                  colkey::Int, value::Int, keycol, valuecol, refkeycol)
     Nrow = length(refkeycol.pool)
     Ncol = length(keycol.pool)
+    unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
     hadmissing = false # have we encountered missing in refkeycol
     mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
     warned_dup = false # hawe we already printed duplicate entries warning?
@@ -233,28 +232,28 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
             if !hadmissing # if it is the first time we have to add a new row
                 hadmissing = true
                 # we use the fact that missing is greater than anything
-                for i in eachindex(df2m)
-                    push!(df2m[i], missing)
+                for i in eachindex(unstacked_val)
+                    push!(unstacked_val[i], missing)
                 end
             end
-            i = length(df2m[1])
+            i = length(unstacked_val[1])
         else
             i = refkeycol_order[refkref]
         end
-        if (!warned_dup) && mask_filled[j, i]
+        if !warned_dup && mask_filled[j, i]
             warn("Duplicate entries in unstack at row $k for key "*
                  "$(refkeycol[k]) and variable $(keycol[k]).")
             warned_dup = true
         end
-        df2m[j][i] = valuecol[k]
+        unstacked_val[j][i] = valuecol[k]
         mask_filled[j, i] = true
     end
     levs = levels(refkeycol)
     # we have to handle a case with missings in refkeycol as levs will skip missing
     col = similar_missing(df[rowkey], length(levs) + hadmissing)
     copy!(col, levs)
     hadmissing && (col[end] = missing)
-    df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
+    df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
     insert!(df2, 1, col, _names(df)[rowkey])
 end
 
@@ -281,15 +280,13 @@ function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey:
     length(rowkeys) == 1 && return unstack(df, rowkeys[1], colkey, value)
     g = groupby(df, rowkeys, sort=true)
     keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
+    droplevels!(keycol)
     valuecol = df[value]
-    Nrow = length(g)
-    Ncol = length(levels(keycol))
-    df2m = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
-    _unstack(df, rowkeys, colkey, value, keycol, valuecol, df2m, g)
+    _unstack(df, rowkeys, colkey, value, keycol, valuecol, g)
 end
 
 function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
-                  colkey::Int, value::Int, keycol, valuecol, df2m, g)
+                  colkey::Int, value::Int, keycol, valuecol, g)
     groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)]
     rowkey = zeros(Int, size(df, 1))
     for i in 1:length(groupidxs)
@@ -298,6 +295,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
     df1 = allowmissing!(df[g.idx[g.starts], g.cols], g.cols)
     Nrow = length(g)
     Ncol = length(levels(keycol))
+    unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
     mask_filled = falses(Ncol, Nrow)
     warned_dup = false
     warned_missing = false
@@ -313,15 +311,15 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
         end
         j = keycol_order[kref]
         i = rowkey[k]
-        if (!warned_dup) && mask_filled[j, i]
+        if !warned_dup && mask_filled[j, i]
             warn("Duplicate entries in unstack at row $k for key "*
                  "$(tuple((df[1,s] for s in rowkeys)...)) and variable $(keycol[k]).")
             warned_dup = true
         end
-        df2m[j][i] = valuecol[k]
+        unstacked_val[j][i] = valuecol[k]
         mask_filled[j, i] = true
     end
-    df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
+    df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
     hcat(df1, df2)
 end
 
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -296,10 +296,11 @@ module TestDataFrame
 
     #Check the output of unstack
     df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
-                   Key = Union{String, Missing}["Mass", "Color", "Mass", "Color"],
+                   Key = CategoricalArray{Union{String, Missing}}(["Mass", "Color", "Mass", "Color"]),
                    Value = Union{String, Missing}["12 g", "Red", "18 g", "Grey"])
     # Check that reordering levels does not confuse unstack
     levels!(df[1], ["XXX", "Bob", "Batman"])
+    levels!(df[2], ["YYY", "Color", "Mass"])
     #Unstack specifying a row column
     df2 = unstack(df, :Fish, :Key, :Value)
     @test levels(df[1]) == ["XXX", "Bob", "Batman"] # make sure we did not mess df[1] levels