code cleanup after review

bkamins · bkamins · commit 7de450df40d8 · 2017-12-10T13:22:22.000+01:00
diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl
@@ -198,7 +198,7 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
 Note that there are some differences between the widened results above.
 """
 function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
-    refkeycol = copy(CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey]))
+    refkeycol = CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey])
     droplevels!(refkeycol)
     keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
     valuecol = df[value]
@@ -214,16 +214,16 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
     Ncol = length(keycol.pool)
     hadmissing = false # have we encountered missing in refkeycol
     mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
-    warned_missing_1 = false # do we print duplicate entries warning
-    warned_missing_2 = false # do we print missing in keycol
+    warned_dup = false # have we already printed duplicate entries warning?
+    warned_missing = false # have we already printed missing in keycol warning?
     keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
     refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
     for k in 1:nrow(df)
         kref = keycol.refs[k]
         if kref <= 0 # we have found missing in colkey
-            if !warned_missing_2
-                warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
-                warned_missing_2 = true
+            if !warned_missing
+                warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.")
+                warned_missing = true
             end
             continue # skip processing it
         end
@@ -241,21 +241,19 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
         else
             i = refkeycol_order[refkref]
         end
-        if (!warned_missing_1) && mask_filled[j, i]
+        if (!warned_dup) && mask_filled[j, i]
             warn("Duplicate entries in unstack at row $k for key "*
-                 "'$(refkeycol[k])' and variable '$(keycol[k])'.")
-            warned_missing_1 = true
+                 "$(refkeycol[k]) and variable $(keycol[k]).")
+            warned_dup = true
         end
         df2m[j][i] = valuecol[k]
         mask_filled[j, i] = true
     end
     levs = levels(refkeycol)
     # we have to handle a case with missings in refkeycol as levs will skip missing
-    col = similar_missing(df[rowkey], length(levs))
+    col = similar_missing(df[rowkey], length(levs) + hadmissing)
     copy!(col, levs)
-    if hadmissing
-        push!(col, missing)
-    end
+    hadmissing && (col[end] = missing)
     df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
     insert!(df2, 1, col, _names(df)[rowkey])
 end
@@ -301,24 +299,24 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
     Nrow = length(g)
     Ncol = length(levels(keycol))
     mask_filled = falses(Ncol, Nrow)
-    warned_missing_1 = false
-    warned_missing_2 = false
+    warned_dup = false
+    warned_missing = false
     keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
     for k in 1:nrow(df)
         kref = keycol.refs[k]
         if kref <= 0
-            if !warned_missing_2
-                warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
-                warned_missing_2 = true
+            if !warned_missing
+                warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.")
+                warned_missing = true
             end
             continue
         end
         j = keycol_order[kref]
         i = rowkey[k]
-        if (!warned_missing_1) && mask_filled[j, i]
+        if (!warned_dup) && mask_filled[j, i]
             warn("Duplicate entries in unstack at row $k for key "*
-                 "'$(tuple((df[1,s] for s in rowkeys)...))' and variable '$(keycol[k])'.")
-            warned_missing_1 = true
+                 "$(tuple((df[1,s] for s in rowkeys)...)) and variable $(keycol[k]).")
+            warned_dup = true
         end
         df2m[j][i] = valuecol[k]
         mask_filled[j, i] = true
diff --git a/test/dataframe.jl b/test/dataframe.jl
@@ -302,6 +302,7 @@ module TestDataFrame
     levels!(df[1], ["XXX", "Bob", "Batman"])
     #Unstack specifying a row column
     df2 = unstack(df, :Fish, :Key, :Value)
+    @test levels(df[1]) == ["XXX", "Bob", "Batman"] # make sure we did not mess df[1] levels
     #Unstack without specifying a row column
     df3 = unstack(df, :Key, :Value)
     #The expected output, XXX level should be dropped as it has no rows with this key
@@ -385,8 +386,8 @@ module TestDataFrame
                        id2=Union{Int, Missing}[1, 2, 1, 2], 
                        variable=["a", "b", "a", "b"], value=[3, 4, 5, 6])
         @static if VERSION >= v"0.6.0-dev.1980"
-            @test_warn "Duplicate entries in unstack at row 3 for key '1' and variable 'a'." unstack(df, :id, :variable, :value)
-            @test_warn "Duplicate entries in unstack at row 3 for key '(1, 1)' and variable 'a'." unstack(df, :variable, :value)
+            @test_warn "Duplicate entries in unstack at row 3 for key 1 and variable a." unstack(df, :id, :variable, :value)
+            @test_warn "Duplicate entries in unstack at row 3 for key (1, 1) and variable a." unstack(df, :variable, :value)
         end
         a = unstack(df, :id, :variable, :value)
         @test a ≅ DataFrame(id = [1, 2], a = [5, missing], b = [missing, 6])
@@ -403,20 +404,20 @@ module TestDataFrame
         @test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
 
         df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
-        @test_warn "Duplicate entries in unstack at row 2." unstack(df, :variable, :value)
-        @test_warn "Duplicate entries in unstack at row 2." unstack(df)
+        @test_warn "Duplicate entries in unstack at row 2 for key 1 and variable x." unstack(df, :variable, :value)
+        @test_warn "Duplicate entries in unstack at row 2 for key 1 and variable x." unstack(df)
     end
 
     @testset "missing values in colkey" begin
         df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
                        variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
                        value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
-        @test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df)
+        @test_warn "Missing value in variable variable at row 3. Skipping." unstack(df)
         df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
                        id2=[1, 1, 1, missing, missing, missing, 2, 2, 2],
                        variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
                        value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
-        @test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df, 3, 4)
+        @test_warn "Missing value in variable variable at row 3. Skipping." unstack(df, 3, 4)
     end
 
     @testset "stack-unstack correctness" begin