Add probabilistic classification to hiclass #minor (#119)
LukasDrews97 authored Nov 25, 2024
1 parent 5ee9b59 commit ee8cb75
Showing 36 changed files with 6,523 additions and 1,114 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -248,6 +248,8 @@ instance/
# Sphinx documentation
docs/_build/
doc/_build/
docs/examples/trained_model.sav
docs/source/auto_examples/

# PyBuilder
target/
28 changes: 21 additions & 7 deletions Pipfile
@@ -6,16 +6,30 @@ name = "pypi"
[packages]
networkx = "*"
numpy = "*"
scikit-learn = "*"
scikit-learn = "1.4.2"
scipy = "1.11.4"

[dev-packages]
pytest = "*"
pytest-flake8 = "*"
pytest-pydocstyle = "*"
pytest-cov = "*"
pytest = "7.1.2"
flake8 = "4.0.1"
pytest-flake8 = "1.1.1"
pydocstyle = "6.1.1"
pytest-pydocstyle = "2.3.0"
pytest-cov = "3.0.0"
twine = "*"
sphinx = "4.1.1"
sphinx-rtd-theme = "0.5.2"
sphinx = "5.0.0"
sphinx-rtd-theme = "1.0.0"
readthedocs-sphinx-search = "0.1.2"
sphinx_code_tabs = "0.5.3"
sphinx-gallery = "0.10.1"
matplotlib = "3.9.2"
pandas = "1.4.2"
bert-sklearn = {git = "https://github.com/charles9n/bert-sklearn.git@master", editable = true}
black = {version = "24.3.0", extras = ["colorama"]}
pre-commit = "2.20.0"
pyfakefs = "*"
shap = "0.44.1"
xarray = "2023.1.0"

[extras]
ray = "*"
3,323 changes: 2,313 additions & 1,010 deletions Pipfile.lock

Large diffs are not rendered by default.

181 changes: 181 additions & 0 deletions docs/examples/plot_calibration.py
@@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-
"""
========================
Calibrating a Classifier
========================

A minimalist example showing how to calibrate a HiClass LCN (LocalClassifierPerNode) model. The calibration method can be selected with the :literal:`calibration_method` parameter, for example:

.. tabs::

    .. code-tab:: python
        :caption: Isotonic Regression

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic'
        )

    .. code-tab:: python
        :caption: Platt scaling

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='platt'
        )

    .. code-tab:: python
        :caption: Beta scaling

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='beta'
        )

    .. code-tab:: python
        :caption: IVAP

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='ivap'
        )

    .. code-tab:: python
        :caption: CVAP

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='cvap'
        )
Furthermore, probabilities of multiple levels can be aggregated by defining a probability combiner:

.. tabs::

    .. code-tab:: python
        :caption: Multiply (Default)

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic',
            probability_combiner='multiply'
        )

    .. code-tab:: python
        :caption: Geometric Mean

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic',
            probability_combiner='geometric'
        )

    .. code-tab:: python
        :caption: Arithmetic Mean

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic',
            probability_combiner='arithmetic'
        )

    .. code-tab:: python
        :caption: No Aggregation

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic',
            probability_combiner=None
        )
A hierarchical classifier can be calibrated by calling :literal:`calibrate` on the model or by using a :literal:`Pipeline`:

.. tabs::

    .. code-tab:: python
        :caption: Default

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic'
        )

        classifier.fit(X_train, Y_train)
        classifier.calibrate(X_cal, Y_cal)
        classifier.predict_proba(X_test)

    .. code-tab:: python
        :caption: Pipeline

        from hiclass import Pipeline

        rf = RandomForestClassifier()
        classifier = LocalClassifierPerNode(
            local_classifier=rf,
            calibration_method='isotonic'
        )

        pipeline = Pipeline([
            ('classifier', classifier),
        ])

        pipeline.fit(X_train, Y_train)
        pipeline.calibrate(X_cal, Y_cal)
        pipeline.predict_proba(X_test)

In the code below, isotonic regression is used to calibrate the model.
"""
from sklearn.ensemble import RandomForestClassifier

from hiclass import LocalClassifierPerNode

# Define data
X_train = [[1], [2], [3], [4]]
X_test = [[4], [3], [2], [1]]
X_cal = [[5], [6], [7], [8]]
Y_train = [
["Animal", "Mammal", "Sheep"],
["Animal", "Mammal", "Cow"],
["Animal", "Reptile", "Snake"],
["Animal", "Reptile", "Lizard"],
]

Y_cal = [
["Animal", "Mammal", "Cow"],
["Animal", "Mammal", "Sheep"],
["Animal", "Reptile", "Lizard"],
["Animal", "Reptile", "Snake"],
]

# Use random forest classifiers for every node
rf = RandomForestClassifier()

# Use local classifier per node with isotonic regression as calibration method
classifier = LocalClassifierPerNode(
local_classifier=rf, calibration_method="isotonic", probability_combiner="multiply"
)

# Train local classifier per node
classifier.fit(X_train, Y_train)

# Calibrate local classifier per node
classifier.calibrate(X_cal, Y_cal)

# Predict probabilities
probabilities = classifier.predict_proba(X_test)

# Print probabilities and labels for the last level
print(classifier.classes_[2])
print(probabilities)
110 changes: 110 additions & 0 deletions docs/source/algorithms/calibration.rst
@@ -0,0 +1,110 @@
.. _calibration-overview:

===========================
Classifier Calibration
===========================
HiClass provides support for probability calibration using various post-hoc calibration methods.

++++++++++++++++++++++++++
Motivation
++++++++++++++++++++++++++
While many machine learning models can output uncertainty scores, these scores are often poorly calibrated [1]_ [2]_. Model calibration aims to improve the quality of probabilistic forecasts by learning a transformation of these scores on a separate calibration dataset.

++++++++++++++++++++++++++
Methods
++++++++++++++++++++++++++

HiClass supports the following calibration methods:

* Isotonic Regression [3]_

* Platt Scaling [4]_

* Beta Calibration [5]_

* Inductive Venn-Abers Calibration [6]_

* Cross Venn-Abers Calibration [6]_

++++++++++++++++++++++++++
Probability Aggregation
++++++++++++++++++++++++++

Combining the probabilities of multiple hierarchy levels is another way to improve probabilistic forecasts. The following aggregation methods are supported:

Conditional Probability Aggregation (Multiply Aggregation)
------------------------------------------------------------
Given a node hierarchy with :math:`n` levels, the probability of a node :math:`A_i`, where :math:`i` denotes the level, is calculated as:

:math:`\displaystyle{\mathbb{P}(A_1 \cap A_2 \cap \ldots \cap A_i) = \mathbb{P}(A_1) \cdot \mathbb{P}(A_2 \mid A_1) \cdot \mathbb{P}(A_3 \mid A_1 \cap A_2) \cdot \ldots}`
:math:`\displaystyle{\cdot \mathbb{P}(A_i \mid A_1 \cap A_2 \cap \ldots \cap A_{i-1})}`

Arithmetic Mean Aggregation
---------------------------
:math:`\displaystyle{\mathbb{P}(A_i) = \frac{1}{i} \sum_{j=1}^{i} \mathbb{P}(A_{j})}`

Geometric Mean Aggregation
--------------------------
:math:`\displaystyle{\mathbb{P}(A_i) = \exp{\left(\frac{1}{i} \sum_{j=1}^{i} \ln \mathbb{P}(A_{j})\right)}}`
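
The short sketch below illustrates the three aggregation rules only; it is not the HiClass implementation, and the NumPy usage and probability values are assumptions made for this example. It evaluates each rule for a single sample whose per-level probabilities are already calibrated:

.. code-block:: python

    import numpy as np

    # Hypothetical calibrated probabilities for one sample, one value per
    # hierarchy level (e.g. along the path Animal -> Mammal -> Cow).
    level_probs = np.array([0.9, 0.7, 0.6])

    # Multiply (conditional) aggregation: product over the levels.
    multiply = np.prod(level_probs)                    # 0.378

    # Arithmetic mean aggregation: average of the level probabilities.
    arithmetic = np.mean(level_probs)                  # ~0.733

    # Geometric mean aggregation: exp of the mean log-probability.
    geometric = np.exp(np.mean(np.log(level_probs)))   # ~0.723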

++++++++++++++++++++++++++
Code sample
++++++++++++++++++++++++++

.. code-block:: python

    from sklearn.ensemble import RandomForestClassifier
    from hiclass import LocalClassifierPerNode

    # Define data
    X_train = [[1], [2], [3], [4]]
    X_test = [[4], [3], [2], [1]]
    X_cal = [[5], [6], [7], [8]]
    Y_train = [
        ["Animal", "Mammal", "Sheep"],
        ["Animal", "Mammal", "Cow"],
        ["Animal", "Reptile", "Snake"],
        ["Animal", "Reptile", "Lizard"],
    ]
    Y_cal = [
        ["Animal", "Mammal", "Cow"],
        ["Animal", "Mammal", "Sheep"],
        ["Animal", "Reptile", "Lizard"],
        ["Animal", "Reptile", "Snake"],
    ]

    # Use random forest classifiers for every node
    rf = RandomForestClassifier()

    # Use local classifier per node with isotonic regression as calibration method
    classifier = LocalClassifierPerNode(
        local_classifier=rf, calibration_method="isotonic", probability_combiner="multiply"
    )

    # Train local classifier per node
    classifier.fit(X_train, Y_train)

    # Calibrate local classifier per node
    classifier.calibrate(X_cal, Y_cal)

    # Predict probabilities
    probabilities = classifier.predict_proba(X_test)

    # Print probabilities and labels for the last level
    print(classifier.classes_[2])
    print(probabilities)

.. [1] Niculescu-Mizil, Alexandru; Caruana, Rich (2005): Predicting good probabilities with supervised learning. In: Saso Dzeroski (ed.): Proceedings of the 22nd International Conference on Machine Learning - ICML '05, Bonn, Germany, 7-11 August 2005. New York, NY, USA: ACM Press, pp. 625-632.
.. [2] Guo, Chuan; Pleiss, Geoff; Sun, Yu; Weinberger, Kilian Q. (2017): On Calibration of Modern Neural Networks. In: Doina Precup and Yee Whye Teh (eds.): Proceedings of the 34th International Conference on Machine Learning, vol. 70: PMLR (Proceedings of Machine Learning Research), pp. 1321-1330.
.. [3] Zadrozny, Bianca; Elkan, Charles (2002): Transforming classifier scores into accurate multiclass probability estimates. In: Proceedings of the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. New York, NY, USA: Association for Computing Machinery (KDD '02), pp. 694-699.
.. [4] Platt, John (2000): Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods. In: Advances in Large Margin Classifiers 10.
.. [5] Kull, Meelis; Filho, Telmo Silva; Flach, Peter (2017): Beta calibration: a well-founded and easily implemented improvement on logistic calibration for binary classifiers. In: Aarti Singh and Jerry Zhu (eds.): Proceedings of the 20th International Conference on Artificial Intelligence and Statistics, vol. 54: PMLR (Proceedings of Machine Learning Research), pp. 623-631.
.. [6] Vovk, Vladimir; Petej, Ivan; Fedorova, Valentina (2015): Large-scale probabilistic predictors with and without guarantees of validity. In: C. Cortes, N. Lawrence, D. Lee, M. Sugiyama and R. Garnett (eds.): Advances in Neural Information Processing Systems, vol. 28: Curran Associates, Inc.
1 change: 1 addition & 0 deletions docs/source/algorithms/index.rst
@@ -17,3 +17,4 @@ HiClass provides implementations for the most popular machine learning models fo
multi_label
metrics
explainer
calibration
7 changes: 7 additions & 0 deletions docs/source/algorithms/metrics.rst
@@ -3,12 +3,19 @@
Metrics
====================

Classification Metrics
----------------------

According to [1]_, the use of flat classification metrics might not be adequate to give enough insight into which algorithm is better at classifying hierarchical data. Hence, in HiClass we implemented the metrics of hierarchical precision (hP), hierarchical recall (hR) and hierarchical F-score (hF), which are extensions of the renowned metrics of precision, recall and F-score, but tailored to the hierarchical classification scenario. These hierarchical counterparts were initially proposed by [2]_, and are defined as follows:

:math:`\displaystyle{hP = \frac{\sum_i|\alpha_i\cap\beta_i|}{\sum_i|\alpha_i|}}`, :math:`\displaystyle{hR = \frac{\sum_i|\alpha_i\cap\beta_i|}{\sum_i|\beta_i|}}`, :math:`\displaystyle{hF = \frac{2 \times hP \times hR}{hP + hR}}`

where :math:`\alpha_i` is the set consisting of the most specific classes predicted for test example :math:`i` and all their ancestor classes, while :math:`\beta_i` is the set containing the true most specific classes of test example :math:`i` and all their ancestors, with summations computed over all test examples.
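
As a hedged illustration of these definitions (not the HiClass implementation; the two test examples and their predicted and true label sets below are made up for this sketch), the formulas can be evaluated directly with Python sets:

.. code-block:: python

    # alpha_i: most specific predicted class of example i plus all its ancestors.
    # beta_i: true most specific class of example i plus all its ancestors.
    alpha = [
        {"Animal", "Mammal", "Cow"},
        {"Animal", "Reptile"},
    ]
    beta = [
        {"Animal", "Mammal", "Sheep"},
        {"Animal", "Reptile", "Snake"},
    ]

    overlap = sum(len(a & b) for a, b in zip(alpha, beta))
    hP = overlap / sum(len(a) for a in alpha)  # 4 / 5 = 0.8
    hR = overlap / sum(len(b) for b in beta)   # 4 / 6 ~ 0.67
    hF = 2 * hP * hR / (hP + hR)               # ~0.73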

Calibration Metrics
-------------------


.. [1] Silla, C. N., & Freitas, A. A. (2011). A survey of hierarchical classification across different application domains. Data Mining and Knowledge Discovery, 22(1), 31-72.
.. [2] Kiritchenko, S., Matwin, S., Nock, R., & Famili, A. F. (2006, June). Learning and evaluation in the presence of class hierarchies: Application to text categorization. In Conference of the Canadian Society for Computational Studies of Intelligence (pp. 395-406). Springer, Berlin, Heidelberg.
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -23,7 +23,7 @@
# -- Project information -----------------------------------------------------

project = "hiclass"
copyright = "2022, Fabio Malcher Miranda, Niklas Köhnecke"
copyright = "2024, Fabio Malcher Miranda, Niklas Köhnecke"
author = "Fabio Malcher Miranda, Niklas Köhnecke"

