scikit-learn-contrib · fayeab · Nov 16, 2020 · Nov 17, 2020 · Nov 18, 2020 · Nov 19, 2020
diff --git a/examples/plot_credit_default.py b/examples/plot_credit_default.py
@@ -151,7 +151,7 @@
 clf = SkopeRules(
     max_depth_duplication=3, max_depth=3, max_features=0.5,
     max_samples_features=0.5, random_state=rng, n_estimators=20,
-    feature_names=feature_names, recall_min=0.04, precision_min=0.6)
+    feature_names=feature_names, filtering_criteria={'recall': 0.04, 'precision': 0.6})
 clf.fit(X_train, y_train)
 
 # in the score_top_rules method, a score of k means that rule number k

diff --git a/notebooks/demo_clustering.ipynb b/notebooks/demo_clustering.ipynb
@@ -110,40 +110,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Cluster 0:\n",
-      "[('Agility <= 81.5 and Free_kick_accuracy > 56.0 and Heading_accuracy > 58.5', (0.93548387096774188, 0.8529411764705882, 10))]\n",
-      "Cluster 1:\n",
-      "[('Aggression <= 76.5 and Agility > 81.5 and Balance > 66.5', (1.0, 0.77419354838709675, 8))]\n",
-      "Cluster 2:\n",
-      "[('Curve <= 61.5 and Heading_accuracy > 82.5', (1.0, 0.7857142857142857, 8))]\n",
-      "Cluster 3:\n",
-      "[('Curve <= 28.0', (1.0, 1.0, 4))]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "warnings.filterwarnings('ignore') #To deals with warning raised by max_samples=1 (see below).\n",
-    "#With max_samples=1, there is no Out-Of-Bag sample to evaluate performance (it is evaluated on all samples. \n",
-    "#As there are less than 100 samples and this is a clustering-oriented task, the risk of overfitting is not \n",
-    "#dramatic here.\n",
+    "warnings.filterwarnings('ignore') # To deal with the warning raised by max_samples=1 (see below).\n",
+    "# With max_samples=1, there is no Out-Of-Bag sample to evaluate performance (it is evaluated on all samples).\n",
+    "# As there are fewer than 100 samples and this is a clustering-oriented task, the risk of overfitting is not\n",
+    "# dramatic here.\n",
     "\n",
     "i_cluster = 0\n",
     "for i_cluster in range(4):\n",
     "    X_train = data.drop(['Name', 'Preferred_Positions', 'cluster'], axis=1)\n",
-    "    y_train = (data['cluster']==i_cluster)*1\n",
-    "    skope_rules_clf = SkopeRules(feature_names=feature_names, random_state=42, n_estimators=5,\n",
-    "                                   recall_min=0.5, precision_min=0.5, max_depth_duplication=0,\n",
-    "                                   max_samples=1., max_depth=3)\n",
+    "    y_train = (data['cluster'] == i_cluster) * 1\n",
+    "    skope_rules_clf = SkopeRules(feature_names=feature_names, \n",
+    "                                 random_state=42,\n",
+    "                                 n_estimators=5,\n",
+    "                                 filtering_criteria={'precision': 0.5, 'recall': 0.5},\n",
+    "                                 duplication_criterion=\"f1\",\n",
+    "                                 max_depth_duplication=0,\n",
+    "                                 max_samples=1.,\n",
+    "                                 max_depth=3)\n",
     "    skope_rules_clf.fit(X_train, y_train)\n",
-    "    print('Cluster '+str(i_cluster)+':')\n",
-    "    #print(data.query('cluster=='+str(i_cluster))[['Name', 'Preferred_Positions']])\n",
+    "    print('Cluster ' + str(i_cluster) + ':')\n",
     "    print(skope_rules_clf.rules_)"
    ]
   },
@@ -186,8 +175,8 @@
    ],
    "source": [
     "for i_cluster in range(4):\n",
-    "    print('5 players from cluster '+str(i_cluster)+':')\n",
-    "    print(data.query(\"cluster==\"+str(i_cluster))['Name'].sample(5, random_state=42).tolist()) # Get 5 random players per cluster\n",
+    "    print('5 players from cluster {}:'.format(i_cluster))\n",
+    "    print(data.query(\"cluster == {}\".format(i_cluster))['Name'].sample(5, random_state=42).tolist()) # Get 5 random players per cluster\n",
     "    print()"
    ]
   },
@@ -224,7 +213,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.5.6"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/demo_titanic.ipynb b/notebooks/demo_titanic.ipynb
@@ -132,10 +132,14 @@
     "decision_tree_clf.fit(X_train, y_train)\n",
     "\n",
     "# Train a skope-rules-boosting classifier\n",
-    "skope_rules_clf = SkopeRules(feature_names=feature_names, random_state=42, n_estimators=30,\n",
-    "                               recall_min=0.05, precision_min=0.9,\n",
-    "                               max_samples=0.7,\n",
-    "                               max_depth_duplication= 4, max_depth = 5)\n",
+    "skope_rules_clf = SkopeRules(feature_names=feature_names,\n",
+    "                             random_state=42, \n",
+    "                             n_estimators=30,\n",
+    "                             filtering_criteria={'precision': 0.9, 'recall': 0.05},\n",
+    "                             max_samples=0.7,\n",
+    "                             max_depth_duplication=4,\n",
+    "                             duplication_criterion=\"f1\",\n",
+    "                             max_depth=5)\n",
     "skope_rules_clf.fit(X_train, y_train)\n",
     "\n",
     "\n",
@@ -665,7 +669,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.5.6"
   }
  },
  "nbformat": 4,

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
-numpy>=1.10.4
-scikit-learn>=0.17.1
+numpy>=1.11.0
+scikit-learn>=0.22
 scipy>=0.17.0
 pandas>=0.18.1
 numpydoc

diff --git a/skrules/__init__.py b/skrules/__init__.py
@@ -1,4 +1,4 @@
 from .skope_rules import SkopeRules
 from .rule import Rule, replace_feature_name
 
-__all__ = ['SkopeRules', 'Rule']
+__all__ = ['SkopeRules', 'Rule', 'replace_feature_name']
diff --git a/skrules/datasets/credit_data.py b/skrules/datasets/credit_data.py
@@ -16,28 +16,59 @@
 
 """
 
+
+from os.path import exists, join
+from urllib.request import urlretrieve
+from collections import namedtuple
+import hashlib
+
 import pandas as pd
 import numpy as np
-from sklearn.datasets.base import get_data_home, Bunch
-from sklearn.datasets.base import _fetch_remote, RemoteFileMetadata
-from os.path import exists, join
+
+from sklearn.datasets import get_data_home
+from sklearn.utils import Bunch
 
 
+# The sklearn.datasets.base module is deprecated in version 0.22
+# and will be removed in version 0.24, so we no longer use
+# "from sklearn.datasets.base import _fetch_remote, RemoteFileMetadata".
+# The function _sha256 of sklearn.datasets.base is redefined below.
+
+def calculate_sha256(file_path):
+    """Calculate the sha256 hash of the file at path."""
+    sha256hash = hashlib.sha256()
+    chunk_size = 8192
+    with open(file_path, "rb") as file:
+        while True:
+            buffer = file.read(chunk_size)
+            if not buffer:
+                break
+            sha256hash.update(buffer)
+    return sha256hash.hexdigest()
+
 def load_credit_data():
     sk_data_dir = get_data_home()
+    RemoteFileMetadata = namedtuple('RemoteFileMetadata',
+                                ['filename', 'url', 'checksum'])
     archive = RemoteFileMetadata(
         filename='default of credit card clients.xls',
         url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
             '00350/default%20of%20credit%20card%20clients.xls',
         checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                   '011238620f5369220bd60cfc82700933'))
-
-    if not exists(join(sk_data_dir, archive.filename)):
-        _fetch_remote(archive, dirname=sk_data_dir)
-
-    data = pd.read_excel(join(sk_data_dir, archive.filename),
-                         sheet_name='Data', header=1)
-
+    file_path = join(sk_data_dir, archive.filename)
+    if not exists(file_path):
+        urlretrieve(archive.url, file_path)
+        checksum = calculate_sha256(file_path)
+        if archive.checksum != checksum:
+            raise IOError("{} has an SHA256 checksum ({}) "
+                          "differing from expected ({}), "
+                          "file may be corrupted.".format(file_path,
+                                                          checksum,
+                                                          archive.checksum))
+    data = pd.read_excel(file_path,
+                         sheet_name='Data',
+                         header=1)
     dataset = Bunch(
         data=(data.drop('default payment next month', axis=1)),
         target=np.array(data['default payment next month'])