Skip to content

Commit

Permalink
Merge pull request #5273 from FederatedAI/feature-1.11.4-stats
Browse files Browse the repository at this point in the history
Select a random mode when multiple values have equal counts
  • Loading branch information
mgqa34 authored Nov 22, 2023
2 parents 921d32d + 7750ba2 commit 65461f3
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
8 changes: 5 additions & 3 deletions python/federatedml/feature/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,14 +216,16 @@ def __get_cols_transform_value(self, data, replace_method, replace_value=None, c
elif replace_method[feature] == consts.MEDIAN:
transform_value = summary_obj.get_median()[feature]
elif replace_method[feature] == consts.MODE:
mode = summary_obj.get_mode(to_compute_mode_col)[feature]
if len(mode['max_val']) > 1:
mode = summary_obj.get_mode(to_compute_mode_col, multi_mode=multi_mode)[feature]
transform_value = mode['max_val']
# LOGGER.debug(f"transform value is: {transform_value}")
"""if len(mode['max_val']) > 1:
if multi_mode == 'random':
transform_value = np.random.choice(mode['max_val'])
else:
raise ValueError("There are multiple modes in column {}, please check.".format(feature))
else:
transform_value = mode['max_val'][0]
transform_value = mode['max_val'][0]"""
elif replace_method[feature] == consts.DESIGNATED:
if isinstance(col_replace_value, dict):
transform_value = col_replace_value.get(feature, replace_value)
Expand Down
17 changes: 11 additions & 6 deletions python/federatedml/statistic/statics.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def aggregate_statics(s_dict1, s_dict2):
new_dict[col_name] = static_1
return new_dict

def compute_mode(self, data):
def compute_mode(self, data, raise_multi_mode=False):

def __mapper(kv_iterator):
for _, v in kv_iterator:
Expand All @@ -561,12 +561,12 @@ def __aggregate_count_per_val_in_col(kv_iterator, header):
continue
col_name = header[k[0]]
if col_name not in col_dict:
col_dict[col_name] = {'max_count': v, 'max_val': [k[1]]}
col_dict[col_name] = {'max_count': v, 'max_val': k[1]}
else:
if col_dict[col_name]['max_count'] < v:
col_dict[col_name] = {'max_count': v, 'max_val': [k[1]]}
col_dict[col_name] = {'max_count': v, 'max_val': k[1]}
elif col_dict[col_name]['max_count'] == v:
col_dict[col_name]['max_val'].append(k[1])
col_dict[col_name]['max_val'] = random.choice([k[1], col_dict[col_name]['max_val']])
return col_dict

func = functools.partial(__aggregate_count_per_val_in_col, header=data.schema.get('header'))
Expand All @@ -579,6 +579,8 @@ def merge_count_dict(x, y):
if v['max_count'] > x[k]['max_count']:
x[k] = v
elif v['max_count'] == x[k]['max_count']:
if raise_multi_mode:
raise ValueError(f"There are multiple modes in column {k}, please check.")
# x[k]['max_val'].extend(v['max_val'])
x[k]['max_val'] = random.choice([x[k]['max_val'], v['max_val']])

Expand All @@ -587,7 +589,7 @@ def merge_count_dict(x, y):
data_mode_summary = data_reduce_by_col_val.mapPartitions(func).reduce(merge_count_dict)
return data_mode_summary

def get_mode(self, col=None):
def get_mode(self, col=None, multi_mode='random'):
"""
Return the mode value(s) of the given column
Expand All @@ -607,7 +609,10 @@ def get_mode(self, col=None):
new_schema['header'] = new_header
data.schema = new_schema
if not self.mode_:
self.mode_ = self.compute_mode(data)
raise_multi_mode = False
if multi_mode == 'raise':
raise_multi_mode = True
self.mode_ = self.compute_mode(data, raise_multi_mode)
return self.mode_

def get_median(self):
Expand Down

0 comments on commit 65461f3

Please sign in to comment.