Skip to content

Commit

Permalink
refactor: replace pandas apply with more efficient means (#655)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyanyang committed May 1, 2020
1 parent 22dbece commit 11e4a4a
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 23 deletions.
4 changes: 4 additions & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ The CHANGELOG for the current development version is available at

- Implemented both `use_clones` and `fit_base_estimators` (previously `refit` in `EnsembleVoteClassifier`) for `EnsembleVoteClassifier` and `StackingClassifier`. ([#670](https://github.com/rasbt/mlxtend/pull/670) via [Katrina Ni](https://github.com/nilichen))

- Improve the runtime performance for apriori function and tests by replacing pandas' .apply with numpy's vectorize as the data show vectorize is faster than apply for the current use. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))

- Improve the efficiency for generate_itemsets function by replacing Python lists with Numpy's arrays and replacing iterative division with array division. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))

##### Bug Fixes

- Fix axis DeprecationWarning in matplotlib v3.1.0 and newer. ([#673](https://github.com/rasbt/mlxtend/pull/673))
Expand Down
4 changes: 2 additions & 2 deletions mlxtend/frequent_patterns/apriori.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,8 @@ def _support(_x, _n_rows, _is_sparse):
res_df.columns = ['support', 'itemsets']
if use_colnames:
mapping = {idx: item for idx, item in enumerate(df.columns)}
res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
mapping[i] for i in x]))
res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
mapping.get, res_df['itemsets'])]
res_df = res_df.reset_index(drop=True)

if verbose:
Expand Down
15 changes: 8 additions & 7 deletions mlxtend/frequent_patterns/fpcommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,19 @@ def setup_fptree(df, min_support):


def generate_itemsets(generator, num_itemsets, colname_map):
itemsets = []
supports = []
itemsets = np.array([])
supports = np.array([])
for sup, iset in generator:
itemsets.append(frozenset(iset))
supports.append(sup / num_itemsets)
itemsets = np.append(itemsets, frozenset(iset))
supports = np.append(supports, sup)

supports = np.divide(supports, num_itemsets)

res_df = pd.DataFrame({'support': supports, 'itemsets': itemsets})

if colname_map is not None:
res_df['itemsets'] = res_df['itemsets'] \
.apply(lambda x: frozenset([colname_map[i] for i in x]))

res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
colname_map.get, res_df['itemsets'])]
return res_df


Expand Down
18 changes: 8 additions & 10 deletions mlxtend/frequent_patterns/tests/test_association_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,8 @@

def test_default():
res_df = association_rules(df_freq_items)
res_df['antecedents'] = res_df['antecedents'].apply(
lambda x: str(frozenset(x)))
res_df['consequents'] = res_df['consequents'].apply(
lambda x: str(frozenset(x)))
res_df['antecedents'] = np.vectorize(str)(res_df['antecedents'])
res_df['consequents'] = np.vectorize(str)(res_df['consequents'])
res_df.sort_values(columns_ordered, inplace=True)
res_df.reset_index(inplace=True, drop=True)

Expand All @@ -46,10 +44,10 @@ def test_default():
columns=columns_ordered
)

expect['antecedents'] = expect['antecedents'].apply(
lambda x: str(frozenset(x)))
expect['consequents'] = expect['consequents'].apply(
lambda x: str(frozenset(x)))
expect['antecedents'] = np.vectorize(str)(
np.vectorize(frozenset)(expect['antecedents']))
expect['consequents'] = np.vectorize(str)(
np.vectorize(frozenset)(expect['consequents']))
expect.sort_values(columns_ordered, inplace=True)
expect.reset_index(inplace=True, drop=True)

Expand All @@ -68,8 +66,8 @@ def test_datatypes():
# check if association_rule converts it internally
# back to frozensets
df_freq_items_copy = df_freq_items.copy()
df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
.apply(lambda x: set(x))
df_freq_items_copy['itemsets'] = np.vectorize(set)(
df_freq_items_copy['itemsets'])

res_df = association_rules(df_freq_items)
for i in res_df['antecedents']:
Expand Down
4 changes: 2 additions & 2 deletions mlxtend/frequent_patterns/tests/test_fpbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,11 @@ def test_default(self):

def test_max_len(self):
res_df1 = self.fpalgo(self.df)
max_len = np.max(res_df1['itemsets'].apply(len))
max_len = np.vectorize(len)(res_df1['itemsets']).max()
assert max_len == 3

res_df2 = self.fpalgo(self.df, max_len=2)
max_len = np.max(res_df2['itemsets'].apply(len))
max_len = np.vectorize(len)(res_df2['itemsets']).max()
assert max_len == 2

def test_low_memory_flag(self):
Expand Down
4 changes: 2 additions & 2 deletions mlxtend/frequent_patterns/tests/test_fpmax.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ def test_default(self):

def test_max_len(self):
res_df1 = fpmax(self.df)
max_len = np.max(res_df1['itemsets'].apply(len))
max_len = np.vectorize(len)(res_df1['itemsets']).max()
assert max_len == 3

res_df2 = fpmax(self.df, max_len=2)
max_len = np.max(res_df2['itemsets'].apply(len))
max_len = np.vectorize(len)(res_df2['itemsets']).max()
assert max_len == 2


Expand Down

0 comments on commit 11e4a4a

Please sign in to comment.