From 11e4a4a1cf7bab240f3aca52a3fa2a89a43a6742 Mon Sep 17 00:00:00 2001
From: github kyleyang
Date: Thu, 30 Apr 2020 17:09:01 -0700
Subject: [PATCH] refactor: replace pandas apply with more efficient means
 (#655)

---
 docs/sources/CHANGELOG.md                       |  4 ++++
 mlxtend/frequent_patterns/apriori.py            |  4 ++--
 mlxtend/frequent_patterns/fpcommon.py           | 15 ++++++++-------
 .../tests/test_association_rules.py             | 18 ++++++++----------
 mlxtend/frequent_patterns/tests/test_fpbase.py  |  4 ++--
 mlxtend/frequent_patterns/tests/test_fpmax.py   |  4 ++--
 6 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
index f65271d00..b1266ab7e 100755
--- a/docs/sources/CHANGELOG.md
+++ b/docs/sources/CHANGELOG.md
@@ -25,6 +25,10 @@ The CHANGELOG for the current development version is available at
 
 - Implemented both `use_clones` and `fit_base_estimators` (previously `refit` in `EnsembleVoteClassifier`) for `EnsembleVoteClassifier` and `StackingClassifier`. ([#670](https://github.com/rasbt/mlxtend/pull/670) via [Katrina Ni](https://github.com/nilichen))
 
+- Improve the runtime performance of the `apriori` function and its tests by replacing pandas' `.apply` with NumPy's `vectorize`, since benchmarking showed `vectorize` to be faster than `apply` for this use case. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
+
+- Improve the efficiency of the `generate_itemsets` function by replacing Python lists with NumPy arrays and iterative division with array division. ([#655](https://github.com/rasbt/mlxtend/pull/655) via [Kyle Yang](https://github.com/keyanyang))
+
 ##### Bug Fixes
 
 - Fix axis DeprecationWarning in matplotlib v3.1.0 and newer. ([#673](https://github.com/rasbt/mlxtend/pull/673))

diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
index ee2e15e1f..76cb21795 100644
--- a/mlxtend/frequent_patterns/apriori.py
+++ b/mlxtend/frequent_patterns/apriori.py
@@ -324,8 +324,8 @@ def _support(_x, _n_rows, _is_sparse):
     res_df.columns = ['support', 'itemsets']
     if use_colnames:
         mapping = {idx: item for idx, item in enumerate(df.columns)}
-        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
-            mapping[i] for i in x]))
+        res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
+            mapping.get, res_df['itemsets'])]
     res_df = res_df.reset_index(drop=True)
 
     if verbose:

diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py
index a7cb87478..45dfcf19e 100644
--- a/mlxtend/frequent_patterns/fpcommon.py
+++ b/mlxtend/frequent_patterns/fpcommon.py
@@ -59,18 +59,19 @@ def setup_fptree(df, min_support):
 
 
 def generate_itemsets(generator, num_itemsets, colname_map):
-    itemsets = []
-    supports = []
+    itemsets = np.array([])
+    supports = np.array([])
     for sup, iset in generator:
-        itemsets.append(frozenset(iset))
-        supports.append(sup / num_itemsets)
+        itemsets = np.append(itemsets, frozenset(iset))
+        supports = np.append(supports, sup)
+
+    supports = np.divide(supports, num_itemsets)
 
     res_df = pd.DataFrame({'support': supports, 'itemsets': itemsets})
 
     if colname_map is not None:
-        res_df['itemsets'] = res_df['itemsets'] \
-            .apply(lambda x: frozenset([colname_map[i] for i in x]))
-
+        res_df['itemsets'] = [frozenset(a) for a in np.vectorize(map)(
+            colname_map.get, res_df['itemsets'])]
     return res_df
 
 

diff --git a/mlxtend/frequent_patterns/tests/test_association_rules.py b/mlxtend/frequent_patterns/tests/test_association_rules.py
index c841f6b5f..1bedd560f 100644
--- a/mlxtend/frequent_patterns/tests/test_association_rules.py
+++ b/mlxtend/frequent_patterns/tests/test_association_rules.py
@@ -26,10 +26,8 @@ def test_default():
 
     res_df = association_rules(df_freq_items)
 
-    res_df['antecedents'] = res_df['antecedents'].apply(
-        lambda x: str(frozenset(x)))
-    res_df['consequents'] = res_df['consequents'].apply(
-        lambda x: str(frozenset(x)))
+    res_df['antecedents'] = np.vectorize(str)(res_df['antecedents'])
+    res_df['consequents'] = np.vectorize(str)(res_df['consequents'])
     res_df.sort_values(columns_ordered, inplace=True)
     res_df.reset_index(inplace=True, drop=True)
 
@@ -46,10 +44,10 @@ def test_default():
         columns=columns_ordered
     )
 
-    expect['antecedents'] = expect['antecedents'].apply(
-        lambda x: str(frozenset(x)))
-    expect['consequents'] = expect['consequents'].apply(
-        lambda x: str(frozenset(x)))
+    expect['antecedents'] = np.vectorize(str)(
+        np.vectorize(frozenset)(expect['antecedents']))
+    expect['consequents'] = np.vectorize(str)(
+        np.vectorize(frozenset)(expect['consequents']))
     expect.sort_values(columns_ordered, inplace=True)
     expect.reset_index(inplace=True, drop=True)
 
@@ -68,8 +66,8 @@ def test_datatypes():
     # check if association_rule converts it internally
     # back to frozensets
     df_freq_items_copy = df_freq_items.copy()
-    df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\
-        .apply(lambda x: set(x))
+    df_freq_items_copy['itemsets'] = np.vectorize(set)(
+        df_freq_items_copy['itemsets'])
 
     res_df = association_rules(df_freq_items)
     for i in res_df['antecedents']:

diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py
index f834291f0..1b484f74b 100644
--- a/mlxtend/frequent_patterns/tests/test_fpbase.py
+++ b/mlxtend/frequent_patterns/tests/test_fpbase.py
@@ -201,11 +201,11 @@ def test_default(self):
 
     def test_max_len(self):
         res_df1 = self.fpalgo(self.df)
-        max_len = np.max(res_df1['itemsets'].apply(len))
+        max_len = np.vectorize(len)(res_df1['itemsets']).max()
         assert max_len == 3
 
         res_df2 = self.fpalgo(self.df, max_len=2)
-        max_len = np.max(res_df2['itemsets'].apply(len))
+        max_len = np.vectorize(len)(res_df2['itemsets']).max()
         assert max_len == 2
 
     def test_low_memory_flag(self):

diff --git a/mlxtend/frequent_patterns/tests/test_fpmax.py b/mlxtend/frequent_patterns/tests/test_fpmax.py
index 949fa5bc9..e03cf79ee 100644
--- a/mlxtend/frequent_patterns/tests/test_fpmax.py
+++ b/mlxtend/frequent_patterns/tests/test_fpmax.py
@@ -32,11 +32,11 @@ def test_default(self):
 
     def test_max_len(self):
         res_df1 = fpmax(self.df)
-        max_len = np.max(res_df1['itemsets'].apply(len))
+        max_len = np.vectorize(len)(res_df1['itemsets']).max()
         assert max_len == 3
 
         res_df2 = fpmax(self.df, max_len=2)
-        max_len = np.max(res_df2['itemsets'].apply(len))
+        max_len = np.vectorize(len)(res_df2['itemsets']).max()
        assert max_len == 2
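
The core trick in this patch can be exercised in isolation. The sketch below is illustrative only and not part of the patch or of mlxtend: the toy itemsets series, the RNG seed, and the item_<n> column names are invented for the example. It times pandas .apply against np.vectorize for the len() call used in the tests, then demonstrates the np.vectorize(map) renaming pattern from apriori.py and fpcommon.py:

# Illustrative sketch only -- not from the patch; data and names are invented.
import timeit

import numpy as np
import pandas as pd

# A column of frozensets standing in for an apriori/fpgrowth 'itemsets' result.
rng = np.random.default_rng(0)
itemsets = pd.Series(
    [frozenset(rng.choice(10, size=3, replace=False)) for _ in range(10_000)])

# Time the len() computation both ways, as in test_fpbase.py / test_fpmax.py.
apply_s = timeit.timeit(lambda: itemsets.apply(len), number=100)
vect_s = timeit.timeit(lambda: np.vectorize(len)(itemsets), number=100)
print('.apply: %.3fs   np.vectorize: %.3fs' % (apply_s, vect_s))

# The renaming pattern from apriori.py / fpcommon.py: np.vectorize(map) builds
# one lazy map(mapping.get, itemset) per element; frozenset() materializes
# each map object into a renamed itemset.
mapping = {i: 'item_%d' % i for i in range(10)}  # hypothetical column names
renamed = [frozenset(m) for m in np.vectorize(map)(mapping.get, itemsets)]
print(renamed[0])

Note that np.vectorize is still a Python-level loop under the hood (NumPy's documentation describes it as essentially a for loop), so the patch buys a constant-factor speedup on this workload rather than true vectorization, consistent with the changelog's hedge that the gain was measured "for this use case".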