Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GroupBy: Avoid guessing variable types #6906

Merged
merged 4 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 38 additions & 14 deletions Orange/data/aggregate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from functools import lru_cache
from typing import Callable, Dict, List, Tuple, Union
from typing import Callable, Dict, List, Tuple, Union, Type

import pandas as pd

Expand Down Expand Up @@ -39,15 +39,20 @@ def __init__(self, table: Table, by: List[Variable]):
df = table_to_frame(table, include_metas=True)
# observed=True keeps only groups with at least one instance
self.group_by = df.groupby([a.name for a in by], observed=True)
self.by = tuple(by)

# lru_cache that caches on the object level
self.compute_aggregation = lru_cache()(self._compute_aggregation)

AggDescType = Union[str,
Callable,
Tuple[str, Union[str, Callable]],
Tuple[str, Union[str, Callable], Union[Type[Variable], bool]]
]

def aggregate(
self,
aggregations: Dict[
Variable, List[Union[str, Callable, Tuple[str, Union[str, Callable]]]]
],
aggregations: Dict[Variable, List[AggDescType]],
callback: Callable = dummy_callback,
) -> Table:
"""
Expand All @@ -57,12 +62,16 @@ def aggregate(
----------
aggregations
The dictionary that defines aggregations that need to be computed
for variables. We support two formats:
for variables. We support three formats:
- {variable name: [agg function 1, agg function 2]}
- {variable name: [(agg name 1, agg function 1), (agg name 2, agg function 2)]}
- {variable name: [(agg name 1, agg function 1, output_variable_type1), ...]}
Where agg name is the aggregation name used in the output column name.
Aggregation function can be either function or string that defines
aggregation in Pandas (e.g. mean).
output_variable_type can be a type for a new variable, True to copy
the input variable, False to create a new variable of the same type
as the input, or None to let the type be inferred from the data.
callback
Callback function to report the progress

Expand All @@ -75,29 +84,44 @@ def aggregate(
count = 0

result_agg = []
output_variables = []
for col, aggs in aggregations.items():
for agg in aggs:
res = self._compute_aggregation(col, agg)
res, var = self._compute_aggregation(col, agg)
result_agg.append(res)
output_variables.append(var)
count += 1
callback(count / num_aggs * 0.8)

agg_table = self._aggregations_to_table(result_agg)
agg_table = self._aggregations_to_table(result_agg, output_variables)
callback(1)
return agg_table

def _compute_aggregation(
self, col: Variable, agg: Union[str, Callable, Tuple[str, Union[str, Callable]]]
) -> pd.Series:
self, col: Variable, agg: AggDescType) -> Tuple[pd.Series, Variable]:
# use named aggregation to avoid issues with same column names when reset_index
if isinstance(agg, tuple):
name, agg = agg
name, agg, var_type, *_ = (*agg, None)
else:
name = agg if isinstance(agg, str) else agg.__name__
var_type = None
col_name = f"{col.name} - {name}"
return self.group_by[col.name].agg(**{col_name: agg})

def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
agg_col = self.group_by[col.name].agg(**{col_name: agg})
if var_type is True:
var = col.copy(name=col_name)
elif var_type is False:
var = col.make(name=col_name)
elif var_type is None:
var = None
else:
assert issubclass(var_type, Variable)
var = var_type.make(name=col_name)
return agg_col, var

def _aggregations_to_table(
self,
aggregations: List[pd.Series],
output_variables: List[Union[Variable, None]]) -> Table:
"""Concatenate aggregation series and convert back to Table"""
if aggregations:
df = pd.concat(aggregations, axis=1)
Expand All @@ -107,7 +131,7 @@ def _aggregations_to_table(self, aggregations: List[pd.Series]) -> Table:
df = df.drop(columns=df.columns)
gb_attributes = df.index.names
df = df.reset_index() # move group by var that are in index to columns
table = table_from_frame(df)
table = table_from_frame(df, variables=(*self.by, *output_variables))

# group by variables should be last two columns in metas in the output
metas = table.domain.metas
Expand Down
83 changes: 53 additions & 30 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Pandas DataFrame↔Table conversion helpers"""
from functools import partial
from itertools import zip_longest

import numpy as np
from scipy import sparse as sp
Expand Down Expand Up @@ -255,7 +256,14 @@
return np.asarray(x)


def vars_from_df(df, role=None, force_nominal=False):
def to_numeric(s, _):
    """Convert a pandas Series to a NumPy array of numeric values.

    The second positional argument is accepted but not used.
    """
    numeric = pd.to_numeric(s)
    return np.asarray(numeric)


def vars_from_df(df, role=None, force_nominal=False, variables=None):
if variables is not None:
assert len(variables) == len(df.columns)

if role is None and hasattr(df, 'orange_role'):
role = df.orange_role
df = _reset_index(df)
Expand All @@ -264,39 +272,52 @@
exprs = [], [], []
vars_ = [], [], []

for column in df.columns:
def _convert_string(s, _):
return np.asarray(
# to object so that fillna can replace with nans if Unknown in nan
# replace nan with object Unknown assure that all values are string
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)

conversions = {
DiscreteVariable: to_categorical,
ContinuousVariable: to_numeric,
TimeVariable: _convert_datetime,
StringVariable: _convert_string
}

for column, var in zip_longest(df.columns, variables or [], fillvalue=None):
s = df[column]
_role = Role.Attribute if role is None else role
if hasattr(df, 'orange_variables') and column in df.orange_variables:
if var is not None:
if not var.is_primitive():
_role = Role.Meta
expr = conversions[type(var)]
elif hasattr(df, 'orange_variables') and column in df.orange_variables:
original_var = df.orange_variables[column]
var = original_var.copy(compute_value=None)
expr = None
elif _is_datetime(s):
var = TimeVariable(str(column))
expr = _convert_datetime
elif _is_discrete(s, force_nominal):
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
expr = to_categorical
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
expr = None
else:
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = lambda s, _: np.asarray(
# to object so that fillna can replace with nans if Unknown in nan
# replace nan with object Unknown assure that all values are string
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)
if _is_datetime(s):
var = TimeVariable(str(column))
elif _is_discrete(s, force_nominal):
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
else:
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")

Check warning on line 316 in Orange/data/pandas_compat.py

View check run for this annotation

Codecov / codecov/patch

Orange/data/pandas_compat.py#L316

Added line #L316 was not covered by tests
_role = Role.Meta
var = StringVariable(str(column))
expr = conversions[type(var)]


cols[_role].append(column)
exprs[_role].append(expr)
Expand Down Expand Up @@ -330,8 +351,10 @@
return xym, Domain(*vars_)


def table_from_frame(df, *, force_nominal=False):
XYM, domain = vars_from_df(df, force_nominal=force_nominal)
def table_from_frame(df, *, force_nominal=False, variables=None):
XYM, domain = vars_from_df(df,
force_nominal=force_nominal,
variables=variables)

if hasattr(df, 'orange_weights') and hasattr(df, 'orange_attributes'):
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
Expand Down
59 changes: 58 additions & 1 deletion Orange/data/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from unittest.mock import Mock

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -132,13 +133,69 @@ def test_aggregation(self):
def test_preserve_table_class(self):
"""
Test whether the result table has the same type as the input table,
e.g. if input table corpus the resutlitn table must be corpus too.
e.g. if input table corpus the resulting table must be corpus too.
"""
data = AlternativeTable.from_table(self.data.domain, self.data)
gb = data.groupby([data.domain["a"]])
output = gb.aggregate({data.domain["a"]: ["mean"]})
self.assertIsInstance(output, AlternativeTable)

def test_preserve_variables(self):
    """
    Check that the optional third element of an aggregation tuple
    (True, False, None or a variable class) determines the output variable.
    """
    # Use the first and the fourth attribute of the test data; group by the first.
    a, _, _, dvar = self.data.domain.attributes
    gb = self.data.groupby([a])

    # Distinct attributes let the assertions below tell a copied variable
    # (attributes carried over) from a newly made one.
    a.attributes = {"foo": "bar"}
    dvar.attributes = {"foo": "baz"}

    # Wrap copy/make in mocks that still call the real methods, so that
    # the number of calls can be asserted.
    a.copy = Mock(side_effect=a.copy)
    a.make = Mock(side_effect=a.make)

    # Aggregation function that ignores its input and always yields 0.
    def f(*_):
        return 0

    output = gb.aggregate(
        {a: [("copy", f, True),
             ("make", f, False),
             ("auto", f, None),
             ("string", f, StringVariable),
             ("number", f, ContinuousVariable)],
         dvar: [("copy", f, True),
                ("make", f, False),
                ("auto", f, None),
                ("string", f, StringVariable),
                ("discrete", f, DiscreteVariable)]}
    )
    # True: the input variable is copied, including its attributes.
    self.assertIsInstance(output.domain["a - copy"], ContinuousVariable)
    a.copy.assert_called_once()
    self.assertEqual(output.domain["a - copy"].attributes, {"foo": "bar"})

    # False: a variable of the same type is made; attributes are not carried over.
    self.assertIsInstance(output.domain["a - make"], ContinuousVariable)
    a.make.assert_called_once()
    self.assertNotEqual(output.domain["a - make"].attributes, {"foo": "bar"})

    # None: no variable is prescribed for the column.
    self.assertIsInstance(output.domain["a - auto"], ContinuousVariable)
    self.assertNotEqual(output.domain["a - auto"].attributes, {"foo": "bar"})

    # Explicit variable classes.
    self.assertIsInstance(output.domain["a - string"], StringVariable)

    self.assertIsInstance(output.domain["a - number"], ContinuousVariable)
    self.assertNotEqual(output.domain["a - number"].attributes, {"foo": "bar"})

    # The same checks for the discrete input variable.
    self.assertIsInstance(output.domain["dvar - copy"], DiscreteVariable)
    self.assertEqual(output.domain["dvar - copy"].attributes, {"foo": "baz"})

    self.assertIsInstance(output.domain["dvar - make"], DiscreteVariable)
    self.assertNotEqual(output.domain["dvar - make"].attributes, {"foo": "baz"})

    # f returns 0, so the column looks numeric! Let's test that it is
    # converted to numeric.
    self.assertIsInstance(output.domain["dvar - auto"], ContinuousVariable)

    self.assertIsInstance(output.domain["dvar - string"], StringVariable)

    self.assertIsInstance(output.domain["dvar - discrete"], DiscreteVariable)
    self.assertNotEqual(output.domain["dvar - discrete"].attributes, {"foo": "baz"})


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@ def test_table_from_frame(self):
self.assertEqual(names, ['0', '1', '2'])
self.assertEqual(types, [DiscreteVariable, ContinuousVariable, TimeVariable])

# Specify (some) variables
dvar = DiscreteVariable('x', values=tuple("dacb"))
cvar = ContinuousVariable('y')
table = table_from_frame(df, variables=[dvar, cvar, None])
self.assertIs(table.domain[0], dvar)
self.assertIs(table.domain[1], cvar)
self.assertIsInstance(table.domain[2], TimeVariable)

table = table_from_frame(df,
variables=[None, None, None],
force_nominal=True)
self.assertIsInstance(table.domain[0], DiscreteVariable)
self.assertIsInstance(table.domain[1], ContinuousVariable)
self.assertIsInstance(table.domain[2], TimeVariable)

self.assertRaises(AssertionError,
table_from_frame, df, variables=[None, None])

# Include index
df.index = list('abaa')
table = table_from_frame(df)
Expand Down
Loading
Loading