Skip to content

Commit 2b9d2ee

Browse files
committed
code cleanup after review
1 parent 9f33493 commit 2b9d2ee

File tree

2 files changed

+50
-52
lines changed

2 files changed

+50
-52
lines changed

src/abstractdataframe/reshape.jl

Lines changed: 40 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -175,10 +175,10 @@ unstack(df::AbstractDataFrame)
175175
176176
* `::DataFrame` : the wide-format DataFrame
177177
178-
If `colkey` contains `missing` values then they will be skipped and warning will be printed.
178+
If `colkey` contains `missing` values then they will be skipped and a warning will be printed.
179179
180180
If a combination of `rowkeys` and `colkey` contains duplicate entries then the last `value` will
181-
be retained and warnign will be printed.
181+
be retained and a warning will be printed.
182182
183183
### Examples
184184
@@ -197,68 +197,66 @@ wide3 = unstack(long, [:id, :a], :variable, :value)
197197
```
198198
Note that there are some differences between the widened results above.
199199
"""
function unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, value::Int)
    # Row keys become a categorical vector that admits `missing`; the copy is
    # taken before dropping unused levels.
    rowsrc = df[rowkey]
    refkeycol = copy(CategoricalArray{Union{eltype(rowsrc), Missing}}(rowsrc))
    droplevels!(refkeycol)

    # Column keys: one output column is created per entry of this pool.
    colsrc = df[colkey]
    keycol = CategoricalArray{Union{eltype(colsrc), Missing}}(colsrc)

    valuecol = df[value]
    nrows_out = length(refkeycol.pool)
    ncols_out = length(keycol.pool)

    # One all-missing vector per output column, each with one slot per row key.
    df2m = [similar_missing(valuecol, nrows_out) for _ in 1:ncols_out]
    return _unstack(df, rowkey, colkey, value, keycol, valuecol, df2m, refkeycol)
end
210210

# Fill the pre-allocated wide columns `df2m` from the long-format `df` and
# return the resulting wide DataFrame.
#
# Arguments
# * `rowkey`, `colkey`, `value` : integer column positions in `df`
# * `keycol`    : categorical version of `df[colkey]` (names the output columns)
# * `valuecol`  : the column of values being spread
# * `df2m`      : vector of output columns, one per entry of `keycol.pool`
# * `refkeycol` : categorical version of `df[rowkey]` (identifies output rows)
#
# Rows whose `colkey` is missing are skipped; rows whose `rowkey` is missing are
# collected into one extra output row appended after all the levels. Each of the
# two warnings (missing column key, duplicate entry) is printed at most once.
function _unstack(df::AbstractDataFrame, rowkey::Int,
                  colkey::Int, value::Int, keycol, valuecol, df2m, refkeycol)
    Nrow = length(refkeycol.pool)
    Ncol = length(keycol.pool)
    hadmissing = false                 # have we encountered missing in refkeycol?
    # has a given [col, row] entry been filled? The extra (Nrow+1)-th slot is
    # reserved for the row that holds missing row keys.
    mask_filled = falses(Ncol, Nrow + 1)
    warned_dup = false                 # duplicate-entries warning already printed?
    warned_missing = false             # missing-in-colkey warning already printed?
    keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
    refkeycol_order = Vector{Int}(CategoricalArrays.order(refkeycol.pool))
    for k in 1:nrow(df)
        kref = keycol.refs[k]
        if kref <= 0 # we have found missing in colkey
            if !warned_missing
                warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
                warned_missing = true
            end
            continue # skip processing it
        end
        j = keycol_order[kref]
        refkref = refkeycol.refs[k]
        if refkref <= 0 # we have found missing in rowkey
            if !hadmissing # the first time, every output column grows by one row
                hadmissing = true
                # the missing row goes last: missing is greater than anything
                for colvec in df2m
                    push!(colvec, missing)
                end
            end
            i = length(df2m[1])
        else
            i = refkeycol_order[refkref]
        end
        if !warned_dup && mask_filled[j, i]
            warn("Duplicate entries in unstack at row $k.")
            warned_dup = true
        end
        df2m[j][i] = valuecol[k] # the last value seen for a cell is retained
        mask_filled[j, i] = true
    end
    levs = levels(refkeycol)
    # levels() skips missing, so the row-key column is built from the levels and
    # the missing entry is appended separately when it was encountered
    col = similar_missing(df[rowkey], length(levs))
    copy!(col, levs)
    if hadmissing
        push!(col, missing)
    end
    df2 = DataFrame(df2m, map(Symbol, levels(keycol)))
    return insert!(df2, 1, col, _names(df)[rowkey])
end
263261

264262
unstack(df::AbstractDataFrame, rowkey::ColumnIndex,
@@ -276,7 +274,7 @@ unstack(df::AbstractDataFrame, colkey::Int, value::Int) =
276274
# Resolve `colkey` and `value` to integer column positions through the index of
# `df`, then forward to the method that takes integer positions.
function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, value::ColumnIndex)
    colpos = index(df)[colkey]
    valpos = index(df)[value]
    return unstack(df, rowkeys, colpos, valpos)
end
278276

# Translate numeric row-key positions into column names and forward to the
# method that takes the row keys by name.
function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{<:Real},
                 colkey::Int, value::Int)
    rowkey_names = names(df)[rowkeys]
    return unstack(df, rowkey_names, colkey, value)
end
281279

282280
function unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol}, colkey::Int, value::Int)
@@ -302,23 +300,23 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Symbol},
302300
Nrow = length(g)
303301
Ncol = length(levels(keycol))
304302
mask_filled = falses(Ncol, Nrow)
305-
nowarning = true
306-
nowarning2 = true
307-
keycol_pool = Vector{Int}(CategoricalArrays.order(keycol.pool))
303+
warned_missing_1 = false
304+
warned_missing_2 = false
305+
keycol_order = Vector{Int}(CategoricalArrays.order(keycol.pool))
308306
for k in 1:nrow(df)
309-
keycol_refs = keycol.refs[k]
310-
if keycol_refs == 0
311-
if nowarning2
312-
warn("Missing value in colkey variable at row $k. Skipping.")
313-
nowarning2 = false
307+
kref = keycol.refs[k]
308+
if kref <= 0
309+
if !warned_missing_2
310+
warn("Missing value in '$(_names(df)[colkey])' variable at row $k. Skipping.")
311+
warned_missing_2 = true
314312
end
315313
continue
316314
end
317-
j = keycol_pool[keycol_refs]
315+
j = keycol_order[kref]
318316
i = rowkey[k]
# Warn once, on the first cell that is written a second time. The flag starts
# out false, so it must be negated here or the warning can never fire.
if !warned_missing_1 && mask_filled[j, i]
    warn("Duplicate entries in unstack at row $k.")
    warned_missing_1 = true
end
323321
df2m[j][i] = valuecol[k]
324322
mask_filled[j, i] = true

test/dataframe.jl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -399,21 +399,21 @@ module TestDataFrame
399399
b = unstack(df, :variable, :value)
400400
@test a ≅ b ≅ DataFrame(id = [1, 2], a = [3, missing], b = [missing, 4])
401401

402-
df = DataFrame(variable=["x","x"], value=[missing,missing], id=[1,1])
402+
df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1,1])
403403
@test_warn "Duplicate entries in unstack at row 2." unstack(df, :variable, :value)
404404
@test_warn "Duplicate entries in unstack at row 2." unstack(df)
405405
end
406406

407407
@testset "missing values in colkey" begin
    # Shared long-format fixture: row 3 is the first row with a missing column
    # key. Note that "missing" (the string) is an ordinary key value here.
    idvals = [1, 1, 1, missing, missing, missing, 2, 2, 2]
    varvals = ["a", "b", missing, "a", "b", "missing", "a", "b", "missing"]
    vals = [missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]
    expected = "Missing value in 'variable' variable at row 3. Skipping."

    # default method: column key and value column are picked automatically
    df = DataFrame(id=idvals, variable=varvals, value=vals)
    @test_warn expected unstack(df)

    # explicit integer positions, with two remaining columns acting as row keys
    df = DataFrame(id=copy(idvals), id2=copy(idvals),
                   variable=copy(varvals), value=copy(vals))
    @test_warn expected unstack(df, 3, 4)
end
418418

419419
@testset "stack-unstack correctness" begin

0 commit comments

Comments
 (0)