Skip to content

Commit 7de450d

Browse files
committed
code cleanup after review
1 parent 2c8ef46 commit 7de450d

File tree

2 files changed

+26
-27
lines changed

2 files changed

+26
-27
lines changed

src/abstractdataframe/reshape.jl

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
198198
Note that there are some differences between the widened results above.
199199
"""
200200
function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
201-
refkeycol = copy(CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey]))
201+
refkeycol = CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey])
202202
droplevels!(refkeycol)
203203
keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
204204
valuecol = df[value]
@@ -214,16 +214,16 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
214214
Ncol = length(keycol.pool)
215215
hadmissing = false # have we encountered missing in refkeycol
216216
mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
217-
warned_missing_1 = false # do we print duplicate entries warning
218-
warned_missing_2 = false # do we print missing in keycol
217+
warned_dup = false # have we already printed duplicate entries warning?
218+
warned_missing = false # have we already printed missing in keycol warning?
219219
keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
220220
refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
221221
for k in 1:nrow(df)
222222
kref = keycol.refs[k]
223223
if kref <= 0 # we have found missing in colkey
224-
if !warned_missing_2
225-
warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
226-
warned_missing_2 = true
224+
if !warned_missing
225+
warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.")
226+
warned_missing = true
227227
end
228228
continue # skip processing it
229229
end
@@ -241,21 +241,19 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
241241
else
242242
i = refkeycol_order[refkref]
243243
end
244-
if (!warned_missing_1) && mask_filled[j, i]
244+
if (!warned_dup) && mask_filled[j, i]
245245
warn("Duplicate entries in unstack at row $k for key "*
246-
"'$(refkeycol[k])' and variable '$(keycol[k])'.")
247-
warned_missing_1 = true
246+
"$(refkeycol[k]) and variable $(keycol[k]).")
247+
warned_dup = true
248248
end
249249
df2m[j][i] = valuecol[k]
250250
mask_filled[j, i] = true
251251
end
252252
levs = levels(refkeycol)
253253
# we have to handle a case with missings in refkeycol as levs will skip missing
254-
col = similar_missing(df[rowkey], length(levs))
254+
col = similar_missing(df[rowkey], length(levs) + hadmissing)
255255
copy!(col, levs)
256-
if hadmissing
257-
push!(col, missing)
258-
end
256+
hadmissing && (col[end] = missing)
259257
df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
260258
insert!(df2, 1, col, _names(df)[rowkey])
261259
end
@@ -301,24 +299,24 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
301299
Nrow = length(g)
302300
Ncol = length(levels(keycol))
303301
mask_filled = falses(Ncol, Nrow)
304-
warned_missing_1 = false
305-
warned_missing_2 = false
302+
warned_dup = false
303+
warned_missing = false
306304
keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
307305
for k in 1:nrow(df)
308306
kref = keycol.refs[k]
309307
if kref <= 0
310-
if !warned_missing_2
311-
warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
312-
warned_missing_2 = true
308+
if !warned_missing
309+
warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.")
310+
warned_missing = true
313311
end
314312
continue
315313
end
316314
j = keycol_order[kref]
317315
i = rowkey[k]
318-
if (!warned_missing_1) && mask_filled[j, i]
316+
if (!warned_dup) && mask_filled[j, i]
319317
warn("Duplicate entries in unstack at row $k for key "*
320-
"'$(tuple((df[1,s] for s in rowkeys)...))' and variable '$(keycol[k])'.")
321-
warned_missing_1 = true
318+
"$(tuple((df[1,s] for s in rowkeys)...)) and variable $(keycol[k]).")
319+
warned_dup = true
322320
end
323321
df2m[j][i] = valuecol[k]
324322
mask_filled[j, i] = true

test/dataframe.jl

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ module TestDataFrame
302302
levels!(df[1], ["XXX", "Bob", "Batman"])
303303
#Unstack specifying a row column
304304
df2 = unstack(df, :Fish, :Key, :Value)
305+
@test levels(df[1]) == ["XXX", "Bob", "Batman"] # make sure we did not mess df[1] levels
305306
#Unstack without specifying a row column
306307
df3 = unstack(df, :Key, :Value)
307308
#The expected output, XXX level should be dropped as it has no rows with this key
@@ -385,8 +386,8 @@ module TestDataFrame
385386
id2=Union{Int, Missing}[1, 2, 1, 2],
386387
variable=["a", "b", "a", "b"], value=[3, 4, 5, 6])
387388
@static if VERSION >= v"0.6.0-dev.1980"
388-
@test_warn "Duplicate entries in unstack at row 3 for key '1' and variable 'a'." unstack(df, :id, :variable, :value)
389-
@test_warn "Duplicate entries in unstack at row 3 for key '(1, 1)' and variable 'a'." unstack(df, :variable, :value)
389+
@test_warn "Duplicate entries in unstack at row 3 for key 1 and variable a." unstack(df, :id, :variable, :value)
390+
@test_warn "Duplicate entries in unstack at row 3 for key (1, 1) and variable a." unstack(df, :variable, :value)
390391
end
391392
a = unstack(df, :id, :variable, :value)
392393
@test a ≅ DataFrame(id = [1, 2], a = [5, missing], b = [missing, 6])
@@ -403,20 +404,20 @@ module TestDataFrame
403404
@test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
404405

405406
df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
406-
@test_warn "Duplicate entries in unstack at row 2." unstack(df, :variable, :value)
407-
@test_warn "Duplicate entries in unstack at row 2." unstack(df)
407+
@test_warn "Duplicate entries in unstack at row 2 for key 1 and variable x." unstack(df, :variable, :value)
408+
@test_warn "Duplicate entries in unstack at row 2 for key 1 and variable x." unstack(df)
408409
end
409410

410411
@testset "missing values in colkey" begin
411412
df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
412413
variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
413414
value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
414-
@test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df)
415+
@test_warn "Missing value in variable variable at row 3. Skipping." unstack(df)
415416
df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
416417
id2=[1, 1, 1, missing, missing, missing, 2, 2, 2],
417418
variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
418419
value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
419-
@test_warn "Missing value in 'variable' variable at row 3. Skipping." unstack(df, 3, 4)
420+
@test_warn "Missing value in variable variable at row 3. Skipping." unstack(df, 3, 4)
420421
end
421422

422423
@testset "stack-unstack correctness" begin

0 commit comments

Comments
 (0)