code cleanup after review

bkamins · bkamins · commit 2b9d2ee32fcd · 2017-12-09T23:49:05.000+01:00
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -175,10 +175,10 @@ unstack(df::AbstractDataFrame)
 
 * `::DataFrame` : the wide-format DataFrame
 
-If `colkey` contains `missing` values then they will be skipped and warning will be printed.
+If `colkey` contains `missing` values then they will be skipped and a warning will be printed.
 
 If combination of `rowkeys` and `colkey` contains duplicate entries then last `value` will
-be retained and warnign will be printed.
+be retained and a warning will be printed.
 
 ### Examples
 
@@ -197,68 +197,66 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
 ```
 Note that there are some differences between the widened results above.
 """
-function unstack(df::AbstractDataFrame, rowkeys::Int, colkey::Int, value::Int)
-    refkeycol = copy(CategoricalArray{Union{eltype(df[rowkeys]), Missing}}(df[rowkeys]))
+function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
+    refkeycol = copy(CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey]))
     droplevels!(refkeycol)
     keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
     valuecol = df[value]
     Nrow = length(refkeycol.pool)
     Ncol = length(keycol.pool)
     df2m = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
-    _unstack(df, rowkeys, colkey, value, keycol, valuecol, df2m, refkeycol)
+    _unstack(df, rowkey, colkey, value, keycol, valuecol, df2m, refkeycol)
 end
 
-function _unstack(df::AbstractDataFrame, rowkeys::Int,
+function _unstack(df::AbstractDataFrame, rowkey::Int,
                   colkey::Int, value::Int, keycol, valuecol, df2m, refkeycol)
     Nrow = length(refkeycol.pool)
     Ncol = length(keycol.pool)
-    hadmissing = false # have we encounered missing in refkeycol
-    mask_filled = falses(Ncol, Nrow) # has a given [col,row] entry been filled?
-    nowarning = true # do we print duplicate entries warning
-    nowarning2 = true # do we print missing in keycol
-    keycol_pool = Vector{Int}(CategoricalArrays.order(keycol.pool))
-    refkeycol_pool = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
+    hadmissing = false # have we encountered missing in refkeycol
+    mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
+    warned_missing_1 = false # have we already warned about duplicate entries
+    warned_missing_2 = false # have we already warned about missing in keycol
+    keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
+    refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
     for k in 1:nrow(df)
-        keycol_refs = keycol.refs[k]
-        if keycol_refs == 0 # we have found missing in colkey
-            if nowarning2
-              warn("Missing value in colkey variable at row $k. Skipping.")
-              nowarning2 = false
+        kref = keycol.refs[k]
+        if kref <= 0 # we have found missing in colkey
+            if !warned_missing_2
+                warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
+                warned_missing_2 = true
             end
             continue # skip processing it
         end
-        j = keycol_pool[keycol_refs]
-        refkeycol_refs = refkeycol.refs[k]
-        if refkeycol_refs == 0 # we have found missing in rowkeys
+        j = keycol_order[kref]
+        refkref = refkeycol.refs[k]
+        if refkref <= 0 # we have found missing in rowkey
             if !hadmissing # if it is the first time we have to add a new row
                 hadmissing = true
                 # we use the fact that missing is greater than anything
                 for i in eachindex(df2m)
-                    push!(df2m, missing)
+                    push!(df2m[i], missing)
                 end
-                mask_filled = hcat(mask_filled, falses(length(df2m)))
             end
             i = length(df2m[1])
         else
-            i = refkeycol_pool[refkeycol_refs]
+            i = refkeycol_order[refkref]
         end
-        if nowarning && mask_filled[j, i]
+        if (!warned_missing_1) && mask_filled[j, i]
             warn("Duplicate entries in unstack at row $k.")
-            nowarning = false
+            warned_missing_1 = true
         end
         df2m[j][i] = valuecol[k]
         mask_filled[j, i] = true
     end
     levs = levels(refkeycol)
     # we have to handle a case with missings in refkeycol as levs will skip missing
+    col = similar_missing(df[rowkey], length(levs))
+    copy!(col, levs)
     if hadmissing
-        col = [levs; missing]
-    else
-        col = levs
+        push!(col, missing)
     end
-    col2 = similar_missing(df[rowkeys], length(col))
     df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
-    insert!(df2, 1, copy!(col2, col), _names(df)[rowkeys])
+    insert!(df2, 1, col, _names(df)[rowkey])
 end
 
 unstack(df::AbstractDataFrame, rowkey::ColumnIndex,
@@ -276,7 +274,7 @@ unstack(df::AbstractDataFrame, colkey::Int, value::Int) =
 unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, value::ColumnIndex) =
     unstack(df, rowkeys, index(df)[colkey], index(df)[value])
 
-unstack(df::AbstractDataFrame, rowkeys::AbstractVector{T}, colkey::Int, value::Int) where T<:Real =
+unstack(df::AbstractDataFrame, rowkeys::AbstractVector{<:Real}, colkey::Int, value::Int) =
     unstack(df, names(df)[rowkeys], colkey, value)
 
 function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey::Int, value::Int)
@@ -302,23 +300,23 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
     Nrow = length(g)
     Ncol = length(levels(keycol))
     mask_filled = falses(Ncol, Nrow)
-    nowarning = true
-    nowarning2 = true
-    keycol_pool = Vector{Int}(CategoricalArrays.order(keycol.pool))
+    warned_missing_1 = false
+    warned_missing_2 = false
+    keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
     for k in 1:nrow(df)
-        keycol_refs = keycol.refs[k]
-        if keycol_refs == 0
-            if nowarning2
-                warn("Missing value in colkey variable at row $k. Skipping.")
-                nowarning2 = false
+        kref = keycol.refs[k]
+        if kref <= 0
+            if !warned_missing_2
+                warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
+                warned_missing_2 = true
             end
             continue
         end
-        j = keycol_pool[keycol_refs]
+        j = keycol_order[kref]
         i = rowkey[k]
-        if nowarning && mask_filled[j, i]
+        if !warned_missing_1 && mask_filled[j, i] # warn only on the first duplicate
             warn("Duplicate entries in unstack at row $k.")
-            nowarning = false
+            warned_missing_1 = true
         end
         df2m[j][i] = valuecol[k]
         mask_filled[j, i] = true
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -399,21 +399,21 @@ module TestDataFrame
         b = unstack(df, :variable, :value)
         @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
 
-        df = DataFrame(variable=["x","x"], value=[missing,missing], id=[1,1])
+        df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
         @test_warn "Duplicate entries in unstack at row 2." unstack(df, :variable, :value)
         @test_warn "Duplicate entries in unstack at row 2." unstack(df)
     end
 
     @testset "missing values in colkey" begin
-        df = DataFrame(id=[1,1,1,missing,missing,missing,2,2,2],
-                       variable=["a","b",missing,"a","b","missing","a","b","missing"],
-                       value=[missing,2.0,3.0,4.0,5.0,missing,7.0,missing,9.0])
-        @test_warn "Missing value in colkey variable at row 3. Skipping." unstack(df)
-        df = DataFrame(id=[1,1,1,missing,missing,missing,2,2,2],
-                       id2=[1,1,1,missing,missing,missing,2,2,2],
-                       variable=["a","b",missing,"a","b","missing","a","b","missing"],
-                       value=[missing,2.0,3.0,4.0,5.0,missing,7.0,missing,9.0])
-        @test_warn "Missing value in colkey variable at row 3. Skipping." unstack(df, 3, 4)
+        df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
+                       variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
+                       value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
+        @test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df)
+        df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
+                       id2=[1, 1, 1, missing, missing, missing, 2, 2, 2],
+                       variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
+                       value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
+        @test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df, 3, 4)
     end
 
     @testset "stack-unstack correctness" begin