Skip to content

Commit 152183f

Browse files
committed
do not force unstack to create columns with missing
1 parent 285a028 commit 152183f

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

src/abstractdataframe/reshape.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,9 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
198198
Note that there are some differences between the widened results above.
199199
"""
200200
function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
201-
refkeycol = CategoricalArray{Union{eltype(df[rowkey]), Missing}}(df[rowkey])
201+
refkeycol = deepcopy(categorical(df[rowkey]))
202202
droplevels!(refkeycol)
203-
keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
203+
keycol = deepcopy(categorical(df[colkey]))
204204
droplevels!(keycol)
205205
valuecol = df[value]
206206
_unstack(df, rowkey, colkey, value, keycol, valuecol, refkeycol)
@@ -250,7 +250,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int,
250250
end
251251
levs = levels(refkeycol)
252252
# we have to handle a case with missings in refkeycol as levs will skip missing
253-
col = similar_missing(df[rowkey], length(levs) + hadmissing)
253+
col = similar(df[rowkey], length(levs) + hadmissing)
254254
copy!(col, levs)
255255
hadmissing && (col[end] = missing)
256256
df2 = DataFrame(unstacked_val, map(Symbol, levels(keycol)))
@@ -279,7 +279,7 @@ function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey:
279279
length(rowkeys) == 0 && throw(ArgumentError("No key column found"))
280280
length(rowkeys) == 1 && return unstack(df, rowkeys[1], colkey, value)
281281
g = groupby(df, rowkeys, sort=true)
282-
keycol = CategoricalArray{Union{eltype(df[colkey]), Missing}}(df[colkey])
282+
keycol = deepcopy(categorical(df[colkey]))
283283
droplevels!(keycol)
284284
valuecol = df[value]
285285
_unstack(df, rowkeys, colkey, value, keycol, valuecol, g)
@@ -292,7 +292,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
292292
for i in 1:length(groupidxs)
293293
rowkey[groupidxs[i]] = i
294294
end
295-
df1 = allowmissing!(df[g.idx[g.starts], g.cols], g.cols)
295+
df1 = df[g.idx[g.starts], g.cols]
296296
Nrow = length(g)
297297
Ncol = length(levels(keycol))
298298
unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]

test/dataframe.jl

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,14 +296,15 @@ module TestDataFrame
296296

297297
#Check the output of unstack
298298
df = DataFrame(Fish = CategoricalArray{Union{String, Missing}}(["Bob", "Bob", "Batman", "Batman"]),
299-
Key = CategoricalArray{Union{String, Missing}}["Mass", "Color", "Mass", "Color"],
299+
Key = CategoricalArray{Union{String, Missing}}(["Mass", "Color", "Mass", "Color"]),
300300
Value = Union{String, Missing}["12 g", "Red", "18 g", "Grey"])
301301
# Check that reordering levels does not confuse unstack
302302
levels!(df[1], ["XXX", "Bob", "Batman"])
303303
levels!(df[2], ["YYY", "Color", "Mass"])
304304
#Unstack specifying a row column
305305
df2 = unstack(df, :Fish, :Key, :Value)
306306
@test levels(df[1]) == ["XXX", "Bob", "Batman"] # make sure we did not mess df[1] levels
307+
@test levels(df[2]) == ["YYY", "Color", "Mass"] # make sure we did not mess df[2] levels
307308
#Unstack without specifying a row column
308309
df3 = unstack(df, :Key, :Value)
309310
#The expected output, XXX level should be dropped as it has no rows with this key
@@ -321,6 +322,31 @@ module TestDataFrame
321322
df4[1,:Mass] = missing
322323
@test df2 ≅ df4
323324

325+
#The same as above but without CategoricalArray
326+
df = DataFrame(Fish = ["Bob", "Bob", "Batman", "Batman"],
327+
Key = ["Mass", "Color", "Mass", "Color"],
328+
Value = ["12 g", "Red", "18 g", "Grey"])
329+
#Unstack specifying a row column
330+
df2 = unstack(df, :Fish, :Key, :Value)
331+
#Unstack without specifying a row column
332+
df3 = unstack(df, :Key, :Value)
333+
#The expected output; the columns are plain vectors here, so there is no unused level to drop
334+
df4 = DataFrame(Fish = ["Batman", "Bob"],
335+
Color = ["Grey", "Red"],
336+
Mass = ["18 g", "12 g"])
337+
@test df2 ≅ df4
338+
@test typeof(df2[:Fish]) <: Vector{String}
339+
# first column stays as CategoricalArray in df3
340+
@test df3 == df4
341+
#Make sure unstack works with missing values at the start of the value column
342+
allowmissing!(df, :Value)
343+
df[1,:Value] = missing
344+
df2 = unstack(df, :Fish, :Key, :Value)
345+
#This changes the expected result
346+
allowmissing!(df4, :Mass)
347+
df4[2,:Mass] = missing
348+
@test df2 ≅ df4
349+
324350
# test empty set of grouping variables
325351
@test_throws ArgumentError unstack(df, Int[], :Key, :Value)
326352
@test_throws ArgumentError unstack(df, Symbol[], :Key, :Value)

0 commit comments

Comments
 (0)