Skip to content

Commit fd496dc

Browse files
committed
reverse indices of mask_filled and improve tests
1 parent fbfe182 commit fd496dc

File tree

2 files changed

+20
-12
lines changed

2 files changed

+20
-12
lines changed

src/abstractdataframe/reshape.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ Unstacks a DataFrame; convert from a long to wide format
152152
```julia
153153
unstack(df::AbstractDataFrame, rowkeys::Union{Symbol, Integer},
154154
colkey::Union{Symbol, Integer}, value::Union{Symbol, Integer})
155-
unstack(df::AbstractDataFrame, rowkeys::Union{AbstractVector{<:Union{Symbol, Integer}}},
155+
unstack(df::AbstractDataFrame, rowkeys::AbstractVector{<:Union{Symbol, Integer}},
156156
colkey::Union{Symbol, Integer}, value::Union{Symbol, Integer})
157157
unstack(df::AbstractDataFrame, colkey::Union{Symbol, Integer},
158158
value::Union{Symbol, Integer})
@@ -198,9 +198,9 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
198198
Note that there are some differences between the widened results above.
199199
"""
200200
function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
201-
refkeycol = deepcopy(categorical(df[rowkey]))
201+
refkeycol = deepcopy(categorical(df[rowkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
202202
droplevels!(refkeycol)
203-
keycol = deepcopy(categorical(df[colkey]))
203+
keycol = deepcopy(categorical(df[colkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
204204
droplevels!(keycol)
205205
valuecol = df[value]
206206
_unstack(df, rowkey, colkey, value, keycol, valuecol, refkeycol)
@@ -212,9 +212,9 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
212212
Ncol = length(keycol.pool)
213213
unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
214214
hadmissing = false # have we encountered missing in refkeycol
215-
mask_filled = falses(Ncol, Nrow+1) # has a given [col,row] entry been filled?
216-
warned_dup = false # hawe we already printed duplicate entries warning?
217-
warned_missing = false # hawe we already printed missing in keycol warning?
215+
mask_filled = falses(Nrow+1, Ncol) # has a given [row,col] entry been filled?
216+
warned_dup = false # have we already printed duplicate entries warning?
217+
warned_missing = false # have we already printed missing in keycol warning?
218218
keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
219219
refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
220220
for k in 1:nrow(df)
@@ -240,13 +240,13 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
240240
else
241241
i = refkeycol_order[refkref]
242242
end
243-
if !warned_dup && mask_filled[j, i]
243+
if !warned_dup && mask_filled[i, j]
244244
warn("Duplicate entries in unstack at row $k for key "*
245245
"$(refkeycol[k]) and variable $(keycol[k]).")
246246
warned_dup = true
247247
end
248248
unstacked_val[j][i] = valuecol[k]
249-
mask_filled[j, i] = true
249+
mask_filled[i, j] = true
250250
end
251251
levs = levels(refkeycol)
252252
# we have to handle a case with missings in refkeycol as levs will skip missing
@@ -279,7 +279,7 @@ function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey:
279279
length(rowkeys) == 0 && throw(ArgumentError("No key column found"))
280280
length(rowkeys) == 1 && return unstack(df, rowkeys[1], colkey, value)
281281
g = groupby(df, rowkeys, sort=true)
282-
keycol = deepcopy(categorical(df[colkey]))
282+
keycol = deepcopy(categorical(df[colkey])) # TODO: remove deepcopy after CategoricalArrays #110 is merged
283283
droplevels!(keycol)
284284
valuecol = df[value]
285285
_unstack(df, rowkeys, colkey, value, keycol, valuecol, g)
@@ -296,7 +296,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
296296
Nrow = length(g)
297297
Ncol = length(levels(keycol))
298298
unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
299-
mask_filled = falses(Ncol, Nrow)
299+
mask_filled = falses(Nrow, Ncol)
300300
warned_dup = false
301301
warned_missing = false
302302
keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
@@ -311,13 +311,13 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
311311
end
312312
j = keycol_order[kref]
313313
i = rowkey[k]
314-
if !warned_dup && mask_filled[j, i]
314+
if !warned_dup && mask_filled[i, j]
315315
warn("Duplicate entries in unstack at row $k for key "*
316316
"$(tuple((df[1,s] for s in rowkeys)...)) and variable $(keycol[k]).")
317317
warned_dup = true
318318
end
319319
unstacked_val[j][i] = valuecol[k]
320-
mask_filled[j, i] = true
320+
mask_filled[i, j] = true
321321
end
322322
df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
323323
hcat(df1, df2)

test/dataframe.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ module TestDataFrame
396396
@test udf == DataFrame(Any[Union{Int, Missing}[1, 2], Union{Int, Missing}[1, 5],
397397
Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
398398
Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
399+
@test isa(udf[1], Vector{Int})
399400
@test all(isa.(udf.columns[2:end], Vector{Union{Int, Missing}}))
400401
df = DataFrame(Any[categorical(repeat(1:2, inner=4)),
401402
categorical(repeat('a':'d', outer=2)), categorical(1:8)],
@@ -405,6 +406,7 @@ module TestDataFrame
405406
@test udf == DataFrame(Any[Union{Int, Missing}[1, 2], Union{Int, Missing}[1, 5],
406407
Union{Int, Missing}[2, 6], Union{Int, Missing}[3, 7],
407408
Union{Int, Missing}[4, 8]], [:id, :a, :b, :c, :d])
409+
@test isa(udf[1], CategoricalVector{Int64})
408410
@test all(isa.(udf.columns[2:end], CategoricalVector{Union{Int, Missing}}))
409411
end
410412

@@ -440,11 +442,17 @@ module TestDataFrame
440442
variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
441443
value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
442444
@test_warn "Missing value in variable variable at row 3. Skipping." unstack(df)
445+
udf = unstack(df)
446+
@test names(udf) == [:id, :a, :b, :missing]
447+
@test udf[:missing] ≅ [missing, 9.0, missing]
443448
df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2],
444449
id2=[1, 1, 1, missing, missing, missing, 2, 2, 2],
445450
variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"],
446451
value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0])
447452
@test_warn "Missing value in variable variable at row 3. Skipping." unstack(df, 3, 4)
453+
udf = unstack(df, 3, 4)
454+
@test names(udf) == [:id, :id2, :a, :b, :missing]
455+
@test udf[:missing] ≅ [missing, 9.0, missing]
448456
end
449457

450458
@testset "stack-unstack correctness" begin

0 commit comments

Comments
 (0)