diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml index fe18840f..2e977344 100644 --- a/.github/workflows/test-pr.yml +++ b/.github/workflows/test-pr.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] os: [ubuntu-latest, macOS-latest, windows-latest] steps: - uses: actions/checkout@v2 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f120b161..6bb5d9a5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -3,7 +3,7 @@ version: 2 build: os: "ubuntu-20.04" tools: - python: "3.8" + python: "3.12" # Build from the docs/ directory with Sphinx sphinx: diff --git a/Pipfile b/Pipfile index 70f6ab34..80576526 100644 --- a/Pipfile +++ b/Pipfile @@ -24,14 +24,9 @@ sphinx_code_tabs = "0.5.3" sphinx-gallery = "0.10.1" matplotlib = "3.9.2" pandas = "1.4.2" -bert-sklearn = {git = "https://github.com/charles9n/bert-sklearn.git@master", editable = true} black = {version = "24.3.0", extras = ["colorama"]} pre-commit = "2.20.0" pyfakefs = "*" -shap = "0.44.1" -xarray = "2023.1.0" [extras] ray = "*" -shap = "0.44.1" -xarray = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 2cbf59e1..c5ec1509 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -183,11 +183,6 @@ "markers": "python_version < '3.12'", "version": "==1.2.0" }, - "bert-sklearn": { - "editable": true, - "git": "https://github.com/charles9n/bert-sklearn.git@master", - "ref": "9cb510ae16209c1cb26b078e0e5037e1344600af" - }, "black": { "extras": [ "colorama" diff --git a/README.md b/README.md index 3491e72b..a2014a58 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,6 @@ HiClass is an open-source Python library for hierarchical classification compati - [Who is using HiClass?](#who-is-using-hiclass) - [Install](#install) - [Quick start](#quick-start) -- [Explaining Hierarchical Classifiers](#explaining-hierarchical-classifiers) - [Step-by-step walk-through](#step-by-step-walk-through) - [API documentation](#api-documentation) - [FAQ](#faq) @@ -34,8 +33,6 @@ HiClass is an open-source Python library for hierarchical classification compati - **[Build pipelines](https://hiclass.readthedocs.io/en/latest/auto_examples/plot_pipeline.html):** Since the hierarchical classifiers inherit from the BaseEstimator of scikit-learn, pipelines can be built to automate machine learning workflows. - **[Hierarchical metrics](https://hiclass.readthedocs.io/en/latest/api/utilities.html#hierarchical-metrics):** HiClass supports the computation of hierarchical precision, recall and f-score, which are more appropriate for hierarchical data than traditional metrics. - **[Compatible with pickle](https://hiclass.readthedocs.io/en/latest/auto_examples/plot_model_persistence.html):** Easily store trained models on disk for future use. -- **[BERT sklearn](https://hiclass.readthedocs.io/en/latest/auto_examples/plot_bert.html):** Compatible with the library [BERT sklearn](https://github.com/charles9n/bert-sklearn). -- **[Hierarchical Explainability](https://hiclass.readthedocs.io/en/latest/algorithms/explainer.html):** HiClass allows explaining hierarchical models using the [SHAP](https://github.com/shap/shap) package. **Any feature missing on this list?** Search our [issue tracker](https://github.com/scikit-learn-contrib/hiclass/issues) to see if someone has already requested it and add a comment to it explaining your use-case. Otherwise, please open a new issue describing the requested feature and possible use-case scenario. 
We prioritize our roadmap based on user feedback, so we would love to hear from you. @@ -115,7 +112,6 @@ pip install hiclass"[<extra_name>]" Replace <extra_name> with one of the following options: - ray: Installs the ray package, which is required for parallel processing support. -- xai: Installs the shap and xarray packages, which are required for explaining Hiclass' predictions. ### Option 2: Conda @@ -201,10 +197,6 @@ pipeline.fit(X_train, Y_train) predictions = pipeline.predict(X_test) ``` -## Explaining Hierarchical Classifiers - -Hierarchical classifiers can provide additional insights when combined with explainability methods. HiClass allows explaining hierarchical models using SHAP values. Different hierarchical models yield different insights. More information on explaining [Local classifier per parent node](https://colab.research.google.com/drive/1rVlYuRU_uO1jw5sD6qo2HoCpCz6E6z5J?usp=sharing), [Local classifier per node](https://colab.research.google.com/drive/1wqSl1t_Qn2f62WNZQ48mdB0mNeu1XSF1?usp=sharing), and [Local classifier per level](https://colab.research.google.com/drive/1VnGlJu-1wSG4wxHXL0Ijf2a7Pu3kklT-?usp=sharing) is available on [Read the Docs](https://hiclass.readthedocs.io/en/latest/algorithms/explainer.html). - ## Step-by-step walk-through A step-by-step walk-through is available on our documentation hosted on [Read the Docs](https://hiclass.readthedocs.io/en/latest/index.html). diff --git a/docs/examples/plot_bert.py b/docs/examples/plot_bert.py deleted file mode 100644 index 49aa334f..00000000 --- a/docs/examples/plot_bert.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -===================== -BERT sklearn -===================== - -In order to use `bert-sklearn <https://github.com/charles9n/bert-sklearn>`_ with HiClass, some of scikit-learn's checks need to be disabled. -The reason is that BERT expects text as input for the features, but scikit-learn expects numerical features. -Hence, the checks will fail. -To disable scikit-learn's checks, we can simply use the parameter :literal:`bert=True` in the constructor of the local hierarchical classifier. -""" -from bert_sklearn import BertClassifier -from hiclass import LocalClassifierPerParentNode - -# Define data -X_train = X_test = [ - "Batman", - "Rorschach", -] -Y_train = [ - ["Action", "The Dark Knight"], - ["Action", "Watchmen"], -] - -# Use BERT for every node -bert = BertClassifier() -classifier = LocalClassifierPerParentNode( - local_classifier=bert, - bert=True, -) - -# Train local classifier per node -classifier.fit(X_train, Y_train) - -# Predict -predictions = classifier.predict(X_test) -print(predictions) diff --git a/docs/examples/plot_lcpl_explainer.py b/docs/examples/plot_lcpl_explainer.py deleted file mode 100644 index d085c791..00000000 --- a/docs/examples/plot_lcpl_explainer.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -""" -========================================= -Explaining Local Classifier Per Level -========================================= - -A minimalist example showing how to use HiClass Explainer to obtain SHAP values of LCPL model. -A detailed summary of the Explainer class has been given at Algorithms Overview Section for :ref:`Hierarchical Explainability`. -SHAP values are calculated based on a synthetic platypus diseases dataset that can be downloaded `here `_. 
-""" -from sklearn.ensemble import RandomForestClassifier -from hiclass import LocalClassifierPerLevel, Explainer -import shap -from hiclass.datasets import load_platypus - -# Load train and test splits -X_train, X_test, Y_train, Y_test = load_platypus() - -# Use random forest classifiers for every level -rfc = RandomForestClassifier() -classifier = LocalClassifierPerLevel(local_classifier=rfc, replace_classifiers=False) - -# Train local classifiers per level -classifier.fit(X_train, Y_train) - -# Define Explainer -explainer = Explainer(classifier, data=X_train, mode="tree") -explanations = explainer.explain(X_test.values) -print(explanations) - -# Let's filter the Shapley values corresponding to the Covid (level 1) -# and 'Respiratory' (level 0) - -covid_idx = classifier.predict(X_test)[:, 1] == "Covid" - -shap_filter_covid = {"level": 1, "class": "Covid", "sample": covid_idx} -shap_filter_resp = {"level": 0, "class": "Respiratory", "sample": covid_idx} -shap_val_covid = explanations.sel(**shap_filter_covid) -shap_val_resp = explanations.sel(**shap_filter_resp) - - -# This code snippet demonstrates how to visually compare the mean absolute SHAP values for 'Covid' vs. 'Respiratory' diseases. - -# Feature names for the X-axis -feature_names = X_train.columns.values - -# SHAP values for 'Covid' -shap_values_covid = shap_val_covid.shap_values.values - -# SHAP values for 'Respiratory' -shap_values_resp = shap_val_resp.shap_values.values - -shap.summary_plot( - [shap_values_covid, shap_values_resp], - features=X_test.iloc[covid_idx], - feature_names=X_train.columns.values, - plot_type="bar", - class_names=["Covid", "Respiratory"], -) diff --git a/docs/examples/plot_lcpn_explainer.py b/docs/examples/plot_lcpn_explainer.py deleted file mode 100644 index 39494fbf..00000000 --- a/docs/examples/plot_lcpn_explainer.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- -""" -========================================= -Explaining Local Classifier Per Node -========================================= - -A minimalist example showing how to use HiClass Explainer to obtain SHAP values of LCPN model. -A detailed summary of the Explainer class has been given at Algorithms Overview Section for :ref:`Hierarchical Explainability`. -SHAP values are calculated based on a synthetic platypus diseases dataset that can be downloaded `here `_. 
-""" -import numpy as np -from sklearn.ensemble import RandomForestClassifier -from hiclass import LocalClassifierPerNode, Explainer -from hiclass.datasets import load_platypus -import shap - -# Load train and test splits -X_train, X_test, Y_train, Y_test = load_platypus() - -# Use random forest classifiers for every node -rfc = RandomForestClassifier() -classifier = LocalClassifierPerNode(local_classifier=rfc, replace_classifiers=False) - -# Train local classifier per node -classifier.fit(X_train, Y_train) - -# Define Explainer -explainer = Explainer(classifier, data=X_train.values, mode="tree") -explanations = explainer.explain(X_test.values) -print(explanations) - -# Filter samples which only predicted "Respiratory" at first level -respiratory_idx = classifier.predict(X_test)[:, 0] == "Respiratory" - -# Specify additional filters to obtain only level 0 -shap_filter = {"level": 0, "class": "Respiratory_1", "sample": respiratory_idx} - -# Use .sel() method to apply the filter and obtain filtered results -shap_val_respiratory = explanations.sel(shap_filter) - -# Plot feature importance on test set -shap.plots.violin( - shap_val_respiratory.shap_values, - feature_names=X_train.columns.values, - plot_size=(13, 8), -) diff --git a/docs/examples/plot_lcppn_explainer.py b/docs/examples/plot_lcppn_explainer.py deleted file mode 100644 index ab27ce38..00000000 --- a/docs/examples/plot_lcppn_explainer.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -""" -============================================ -Explaining Local Classifier Per Parent Node -============================================ - -A minimalist example showing how to use HiClass Explainer to obtain SHAP values of LCPPN model. -A detailed summary of the Explainer class has been given at Algorithms Overview Section for :ref:`Hierarchical Explainability`. -SHAP values are calculated based on a synthetic platypus diseases dataset that can be downloaded `here `_. 
-""" -from sklearn.ensemble import RandomForestClassifier -from hiclass import LocalClassifierPerParentNode, Explainer -import shap -from hiclass.datasets import load_platypus - -# Load train and test splits -X_train, X_test, Y_train, Y_test = load_platypus() - -# Use random forest classifiers for every node -rfc = RandomForestClassifier() -classifier = LocalClassifierPerParentNode( - local_classifier=rfc, replace_classifiers=False -) - -# Train local classifier per parent node -classifier.fit(X_train, Y_train) - -# Define Explainer -explainer = Explainer(classifier, data=X_train.values, mode="tree") -explanations = explainer.explain(X_test.values) -print(explanations) - -# Filter samples which only predicted "Respiratory" at first level -respiratory_idx = classifier.predict(X_test)[:, 0] == "Respiratory" - -# Specify additional filters to obtain only level 0 -shap_filter = {"level": 0, "class": "Respiratory", "sample": respiratory_idx} - -# Use .sel() method to apply the filter and obtain filtered results -shap_val_respiratory = explanations.sel(shap_filter) - -# Plot feature importance on test set -shap.plots.violin( - shap_val_respiratory.shap_values, - feature_names=X_train.columns.values, - plot_size=(13, 8), -) diff --git a/docs/examples/plot_multilabel.py b/docs/examples/plot_multilabel.py deleted file mode 100644 index 26a29c44..00000000 --- a/docs/examples/plot_multilabel.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -============================================== -Using Hierarchical Multi-Label Classification -============================================== - -A simple example to show how to use multi-label classification in HiClass. -Please have a look at Algorithms Overview Section for :ref:`Multi-Label Classification` for the motivation and background behind the implementation. 
-""" -import numpy as np -from sklearn.tree import DecisionTreeClassifier -from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode - -# Define data -X_train = [[1, 2], [3, 4], [5, 6]] -X_test = [[1, 2], [3, 4], [5, 6]] - -# Define labels -Y_train = np.array( - [ - [["Mammal", "Human"], ["Fish"]], # Mermaid - [["Mammal", "Human"], ["Mammal", "Bovine"]], # Minotaur - [["Mammal", "Human"]], # just a Human - ], - dtype=object, -) - -# Use decision tree classifiers for every node -tree = DecisionTreeClassifier() -classifier = MultiLabelLocalClassifierPerNode(local_classifier=tree) - -# Train local classifier per node -classifier.fit(X_train, Y_train) - -# Predict -predictions = classifier.predict(X_test) -print(predictions) diff --git a/docs/requirements.txt b/docs/requirements.txt index 50db5a0a..9c6c8edb 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,13 +1,8 @@ -# Defining the exact version will make sure things don't break -sphinx==5.0.0 -sphinx_rtd_theme==1.0.0 -readthedocs-sphinx-search==0.1.2 -sphinx_code_tabs==0.5.3 -sphinx-gallery==0.10.1 -matplotlib==3.5.2 -pandas==1.4.2 +sphinx +sphinx_rtd_theme +readthedocs-sphinx-search +sphinx_code_tabs +sphinx-gallery +matplotlib ray numpy -git+https://github.com/charles9n/bert-sklearn.git@master -shap==0.44.1 -xarray==2023.1.0 \ No newline at end of file diff --git a/docs/source/algorithms/explainer-indexing.png b/docs/source/algorithms/explainer-indexing.png deleted file mode 100644 index 60c177e7..00000000 Binary files a/docs/source/algorithms/explainer-indexing.png and /dev/null differ diff --git a/docs/source/algorithms/explainer.rst b/docs/source/algorithms/explainer.rst deleted file mode 100644 index b87ced9c..00000000 --- a/docs/source/algorithms/explainer.rst +++ /dev/null @@ -1,131 +0,0 @@ -.. _explainer-overview: - -=========================== -Hierarchical Explainability -=========================== -HiClass also provides support for eXplainable AI (XAI) using SHAP values. This section demonstrates the Explainer class along with examples and design principles. - -++++++++++++++++++++++++++ -Motivation -++++++++++++++++++++++++++ - -Explainability in machine learning refers to understanding and interpreting how a model arrives at a particular decision. Several explainability methods are available in the literature, which have found applications in various machine learning applications. - -SHAP values are one such approach that provides a unified measure of feature importance that considers the contribution of each feature to the model prediction. These values are based on cooperative game theory and provide a fair way to distribute the credit for the prediction among the features. - -Integrating explainability methods into Hierarchical classifiers can yield promising results depending on the application domain. Hierarchical explainability extends the concept of SHAP values to hierarchical classification models. - -++++++++++++++++++++++++++ -Dataset overview -++++++++++++++++++++++++++ -For the remainder of this section, we will utilize a synthetically generated dataset representing platypus diseases. This tabular dataset is created to visualize and test the essence of explainability using SHAP on hierarchical models. The diagram below illustrates the hierarchical structure of the dataset. 
With nine symptoms as features—fever, diarrhea, stomach pain, skin rash, cough, sniffles, shortness of breath, headache, and body size—the objective is to predict the disease based on these feature values. - -.. figure:: ../algorithms/platypus_diseases_hierarchy.svg - :align: center - :width: 100% - - Hierarchical structure of the synthetic dataset representing platypus diseases. - -++++++++++++++++++++++++++ -Background -++++++++++++++++++++++++++ -This section introduces two main concepts: hierarchical classification and SHAP values. Hierarchical classification leverages the hierarchical structure of data, breaking down the classification task into manageable sub-tasks using models organized in a tree or DAG structure. - -SHAP values, adapted from game theory, show the impact of features on model predictions, thus aiding model interpretation. The SHAP library offers practical implementation of these methods, supporting various machine learning algorithms for explanation generation. - -To demonstrate how SHAP values provide insights into model prediction, consider the following sample from the platypus disease dataset. - -.. code-block:: python - - test_sample = np.array([[35.5, 0. , 1. , 1. , 3. , 3. , 0. , 2. , 37.5]]) - sample_target = np.array([['Respiratory', 'Cold', '']]) - -We can calculate SHAP values using the SHAP python package and visualize them. SHAP values tell us how much each symptom "contributes" to the model's decision about which disease a platypus might have. The following diagram illustrates how SHAP values can be visualized using the :literal:`shap.force_plot`. - -.. figure:: ../algorithms/shap_explanation.png - :align: center - :width: 100% - - Force plot illustrating the influence of symptoms on predicting platypus diseases using SHAP values. Each bar represents a symptom, and its length indicates the magnitude of its impact on disease prediction. - -++++++++++++++++++++++++++ -API Design -++++++++++++++++++++++++++ - -Designing an API for hierarchical classifiers and SHAP value computation presents numerous challenges, including complex data structures, difficulties accessing correct SHAP values corresponding to a classifier, and slow computation. We addressed these issues by using :literal:`xarray.Dataset` for organization, filtering, and storage of SHAP values efficiency. We also utilized parallelization using Joblib for speed. These enhancements ensure a streamlined and user-friendly experience for users dealing with hierarchical classifiers and SHAP values. - -.. figure:: ../algorithms/explainer-indexing.png - :align: center - :width: 75% - - Pictorial representation of dimensions along which indexing of hierarchical SHAP values is required. - -The Explainer class takes a fitted HiClass model, training data, and some named parameters as input. After creating an instance of the Explainer, the :literal:`Explainer.explain` method can be called by providing the samples for which SHAP values need to be calculated. - -.. code-block:: python - - explainer = Explainer(fitted_hiclass_model, data=training_data) - -The Explainer returns an :literal:`xarray.Dataset` object which allows users to intuitively access, filter, slice, and plot SHAP values. This Explanation object can also be used interactively within the Jupyter notebook environment. The Explanation object along with its respective attributes are depicted in the following UML diagram. - -.. 
figure:: ../algorithms/hiclass-uml.png - :align: center - :width: 100% - - UML diagram showing the relationship between HiClass Explainer and the returned Explanation object. - -The Explanation object can be obtained by calling the :literal:`explain` method of the Explainer class. - -.. code-block:: python - - explanations = explainer.explain(sample_data) - - -++++++++++++++++++++++++++ -Code sample -++++++++++++++++++++++++++ - -.. code-block:: python - - from sklearn.ensemble import RandomForestClassifier - import numpy as np - from hiclass import LocalClassifierPerParentNode, Explainer - - rfc = RandomForestClassifier() - lcppn = LocalClassifierPerParentNode(local_classifier=rfc, replace_classifiers=False) - - x_train = np.array([ - [40.7, 1. , 1. , 2. , 5. , 2. , 1. , 5. , 34.3], - [39.2, 0. , 2. , 4. , 1. , 3. , 1. , 2. , 34.1], - [40.6, 0. , 3. , 1. , 4. , 5. , 0. , 6. , 27.7], - [36.5, 0. , 3. , 1. , 2. , 2. , 0. , 2. , 39.9], - ]) - y_train = np.array([ - ['Gastrointestinal', 'Norovirus', ''], - ['Respiratory', 'Covid', ''], - ['Allergy', 'External', 'Bee Allergy'], - ['Respiratory', 'Cold', ''], - ]) - - x_test = np.array([[35.5, 0. , 1. , 1. , 3. , 3. , 0. , 2. , 37.5]]) - - lcppn.fit(x_train, y_train) - explainer = Explainer(lcppn, data=x_train, mode="tree") - explanations = explainer.explain(x_test) - - -++++++++++++++++++++++++++ -Filtering and Manipulation -++++++++++++++++++++++++++ - -The Explanation object returned by the Explainer is built using the :literal:`xarray.Dataset` data structure, that enables the application of any xarray dataset operation. For example, filtering specific values can be quickly done. To illustrate the filtering operation, suppose we have SHAP values stored in the Explanation object named :literal:`explanation`. - -A common use case is to extract SHAP values for only the predicted nodes. In Local classifier per parent node approach, each node except the leaf nodes represents a classifier. Hence, to find the SHAP values, we can pass the prediction until the penultimate element to obtain the SHAP values. -To achieve this, we can use xarray's :literal:`.sel()` method: - -.. code-block:: python - - mask = {'class': lcppn.predict(x_test).flatten()[:-1]} - x = explanations.sel(mask).shap_values - -More advanced usage and capabilities can be found at the `Xarray.Dataset `_ documentation. diff --git a/docs/source/algorithms/index.rst b/docs/source/algorithms/index.rst index 0087781d..e84ff2ad 100644 --- a/docs/source/algorithms/index.rst +++ b/docs/source/algorithms/index.rst @@ -14,7 +14,5 @@ HiClass provides implementations for the most popular machine learning models fo local_classifier_per_node local_classifier_per_parent_node local_classifier_per_level - multi_label metrics - explainer calibration diff --git a/docs/source/algorithms/multi_label.rst b/docs/source/algorithms/multi_label.rst deleted file mode 100644 index 76eaae47..00000000 --- a/docs/source/algorithms/multi_label.rst +++ /dev/null @@ -1,242 +0,0 @@ -.. _hierarchical-multi-label-Classification-Overview: - -========================== -Multi-Label Classification -========================== - -HiClass supports hierarchical multi-label classification. -This means a sample can belong to multiple classes at the same hierarchy level. - -On this page, we motivate, explain, and demonstrate how hierarchical multi-label classification is implemented in HiClass. 
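The padded, fixed-dimension target array described in the Design section below is easiest to see in code. The following is a minimal sketch, not part of the original documentation: it uses the internal `make_leveled` helper from the `hiclass.MultiLabelHierarchicalClassifier` module whose deletion appears near the end of this diff, so it only runs against the pre-change code; the dog-breed labels mirror the example used throughout this page.

```python
# Sketch: HiClass pads ragged multi-label targets with empty strings so the
# result is a fixed (n_samples, n_labels, n_levels) NumPy array, with one
# "label" entry per path through the hierarchy.
from hiclass.MultiLabelHierarchicalClassifier import make_leveled

y = [
    [["Retriever", "Golden Retriever"], ["Hound", "Dachshund"]],  # two paths
    [["Hound", "Beagle"]],                                        # one path
]

leveled = make_leveled(y)
print(leveled.shape)  # (2, 2, 2): the single-path sample is padded with ["", ""]
```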
- -++++++++++++++++++++++++++ -Motivation -++++++++++++++++++++++++++ -In numerous hierarchical classification problems, it is possible for a sample to be associated with multiple classes at the same level of the hierarchy. -This occurs when the classes are not mutually exclusive. -For instance, let us consider a problem involving the classification of dog breeds, where we aim to determine a dog's breed based on available data. -Without allowing for multiple paths through the dog breed hierarchy, we would have to assign a single label to each sample, which means we have to choose a single path through the hierarchy, assigning a dog to a single breed. -However, this only sometimes reflects reality since a dog can be a mix of multiple breeds. -For example, a dog can be a mix of a Dachshund and a Golden Retriever. -In such a scenario, we aim to assign both the Dachshund and Golden Retriever labels to the sample, which requires at least two paths through the hierarchy. -The following figure illustrates this example. - -.. _example_dog_breed_hierarchy: - -.. figure:: ../algorithms/hc_dog_breed_hierarchy.png - :align: center - :width: 80% - - An example image of a dog that is a mix of a Dachshund and a Golden Retriever, thereby requiring multiple paths through the hierarchy for correct classification. - -Another multi-label classification example is document classification, in which we aim to classify a document based on its content. -The categories are often hierarchical in nature, such as classifying documents into broad topics like "Technology", "Sports", and "Politics", which further have subcategories like "Artificial Intelligence", "Football", and "International Relations". -A document can belong to multiple categories, for example, a text that deals with the influence of advancements in AI on International Relations, which can only be correctly classified by multiple paths through the hierarchy. - -++++++++++++++++++++++++++++++++++++++++ -Background - Classification Terminology -++++++++++++++++++++++++++++++++++++++++ -To explain what we mean by hierarchical multi-label classification, we first need to define some terminology. - -.. figure:: ../algorithms/hc_background.png - :align: left - :figwidth: 30% - - The set of classification problems from most generic (multi-class) to most specific (hierarchical multi-label classification). - -In a multi-class classification problem, a sample can be assigned to one class among several options. -In a multi-label classification problem, a sample can be associated with multiple classes simultaneously. -A hierarchical classification problem is a type of multi-label classification problem where classes are organized in a hierarchical structure represented as a graph, such as a tree or directed acyclic graph (DAG). -In this graph, the nodes correspond to the classes to be predicted. -If not specified, it is usually assumed that a sample can only belong to one class at each level of the hierarchy. -This means a sample can only be associated with a single path through the hierarchy, starting from the root node and ending at a leaf node. -In hierarchical multi-label classification, this restriction is lifted. -A sample can belong to multiple classes at any level of the hierarchy, i.e., a sample can be classified by multiple paths through the hierarchy. - -| -| - -++++++++++++++++++++++++++ -Design - Target Format -++++++++++++++++++++++++++ -HiClass is designed to be compatible with the scikit-learn API. 
-For the non-multi-label hierarchical classification case, the target array follows the sklearn format for a multi-label classification problem. -However, since there is no sklearn specific multi-label hierarchical format, HiClass implements its own format extension. -The HiClass target format extends the non-multi-label hierarchical classification format by adding a new dimension to the 2-dimensional array, which captures the different paths through the hierarchy. - -.. figure:: ../algorithms/hc_format.png - :align: center - :width: 80% - - HiClass hierarchical multi-label classification format extension for samples classified by the dog breed hierarchy. - -This is implemented as a nested list of lists, in which the last dimension specifies a path through the hierarchy. - -.. code-block:: python - - y = [ - [["Retriever", "Golden Retriever"], ["Hound", "Dachshund"]], # sample 1 - [["Hound", "Beagle"]] # sample 2 - ] - -It is important to note that we specify the whole list of nodes from the root to the most specific nodes for each path. -Even in cases where only the leaf nodes are different, we still need to specify the whole path. -For example, if sample 1 belonged to the Labrador class instead of the Dachshund class, we would still need to specify the whole path from the root to the Golden Retriever and Labrador nodes, which would be :code:`[["Retriever", "Golden Retriever"], ["Retriever", "Labrador"]]`. -This is a consequence of using NumPy arrays for the implementation, which require fixed dimensions for the target array. -Furthermore, by explicitly specifying the whole path from the root to the leaf node, the target format is readable and easy to comprehend and also works well for hierarchies that are not trees but DAGs. - - -++++++++++++++++++++++++++ -Fitting the Classifiers -++++++++++++++++++++++++++ -In this section, we outline how fitting of the local classifiers is implemented in HiClass for hierarchical multi-label classification. -Here, we only focus on the hierarchical multi-label classification case for the :class:`hiclass.MultiLabelLocalClassifierPerNode` and :class:`hiclass.MultiLabelLocalClassifierPerParentNode` classifiers. -For a recap on how the strategies work, visit the :ref:`Algorithms` section. - - -.. _hierarchical-multi-label-local-classifier-per-node: - -Local Classifier Per Node --------------------------- -The :class:`hiclass.MultiLabelLocalClassifierPerNode` strategy fits a binary local classifier for each node in the hierarchy. -:class:`hiclass.BinaryPolicy` defines which samples belong to the positive and which ones to the negative class for a given local classifier. -In HiClass, the positive and negative samples for a local classifier are mutually exclusive, i.e., a sample can only belong to a local classifier's positive or negative class. -In the hierarchical multi-label case, a sample belongs to the positive class if it belongs to any of the paths through the hierarchy that are associated with the local classifier. - -For instance, the :ref:`example image <example_dog_breed_hierarchy>` is assigned to the positive class for the Retriever classifier since it belongs to the Golden Retriever class, which is a child of the Retriever node. -It is also assigned to the positive class for the Hound classifier since it belongs to the Dachshund class, which is a child of the Hound node. - - -..
_hierarchical-multi-label-local-classifier-per-parent-node: - -Local Classifier Per Parent Node ---------------------------------- -The :class:`hiclass.MultiLabelLocalClassifierPerParentNode` trains a multi-class classifier for each non-leaf/parent node, i.e., a node with children in the hierarchy. -The classes to be predicted are the children of the node. -For the multi-label case, a sample can belong to multiple children of a node. -Internally, this is implemented by duplicating the sample and assigning each duplicate to one of the node's children. -The classifier does not need to support the sklearn multi-label format and can be a standard sklearn classifier. - -++++++++++++++++++++++++++ -Prediction -++++++++++++++++++++++++++ -So far, we have only discussed the fitting of the classifiers; in this section, we outline how the prediction is implemented in HiClass for multiple paths. -HiClass follows a top-down prediction strategy in which a data sample is classified by nodes in the hierarchy, starting from the root and going down to the leaf nodes. -In the single path case, the data sample is assigned the label with the highest probability at each level. -This leads to only a single path through the hierarchy for each data sample. - -.. figure:: ../algorithms/hc_prediction.png - :align: center - :width: 80% - - Predicting the labels for a sample using the top-down prediction strategy. Numeric values in red are the predicted probabilities for each node. - -In the example given above, the sample would be assigned the label :code:`["Retriever", "Golden Retriever"]`, since this is the path with the highest probability starting at the root node. -In contrast, when we want to allow for multiple paths through the hierarchy, we need to specify a criterion different from taking the highest probability to assign labels to data samples. -HiClass implements two strategies for this: Threshold and Tolerance. - -Threshold -------------------------- -The Threshold strategy assigns a label to a data sample if the probability of the label is above a given threshold. -The threshold :math:`\lambda \in [0, 1]` is a parameter passed to the predict function and specifies an absolute probability value. - -.. math:: - Predictions(Node) = \{c \in Children(Node): \mathbb{P}(c) \geq \lambda\} - -In the example given above, if we set :math:`\lambda = 0.6`, we would assign the label :code:`[["Retriever", "Golden Retriever"], ["Hound", "Dachshund"]]` to the sample since the probabilities of the assigned nodes are greater than 0.6. -While this strategy is simple to implement and understand, it has the disadvantage that it is impossible to specify a different threshold for each node in the hierarchy, requiring a global threshold for all nodes. -Furthermore, with the top-down prediction strategy, if the predicted probability is below the threshold for a node, the prediction stops regardless of the probabilities of the nodes further down the hierarchy. -For example, if :math:`\lambda = 0.85`, no label is assigned to the sample since the probabilities for the Retriever and Hound class are below the threshold value and traversing the hierarchy stops. - -Tolerance -------------------------- -The Tolerance strategy mitigates the problem that arises from the absolute probability value in the Threshold strategy by assigning a label to a data sample if the probability is within a given tolerance of the highest probability for neighboring nodes. 
-The tolerance :math:`\gamma \in [0, 1]` is a parameter that is passed to the predict function and specifies a relative probability value. - -.. math:: - Predictions(Node) = \{c \in Children(Node): \mathbb{P}(c) \geq \max(\mathbb{P}(children)) - \gamma\} - - -This strategy has the advantage of always predicting at least one class at each level since the tolerance is relative to the highest probability. -For example, with :math:`\gamma = 0.3` we would predict the labels :code:`[["Retriever", "Golden Retriever"], ["Hound", "Dachshund"], ["Hound", "Beagle"]]`. -Note that the Beagle label is assigned in the second level because its probability of 0.5 is within the tolerance of 0.3 of the highest probability of 0.8 (Dachshund class) of a neighboring node. - - -.. _hierarchical-multi-label-metrics: - -++++++++++++++++++++++++++ -Metrics -++++++++++++++++++++++++++ -We extend the hierarchical precision, recall, and F-Score metrics to evaluate the performance of the hierarchical multi-label classifiers. -The hierarchical precision, recall, and F-Score are defined as follows and are also defined in :ref:`Metrics `. - -Here, we give an example of the hierarchical precision and recall for the multi-label case. - -.. figure:: ../algorithms/hc_metrics.png - :align: center - :width: 100% - -Note that we can define micro and macro averages when calculating the hierarchical precision and recall for multiple samples. -In the micro precision/recall, all predictions are considered together, regardless of the sample. -In contrast, in the macro precision/recall, we first calculate a sample's hierarchical precision/recall and then aggregate the results. -Since samples can have differing numbers of labels assigned to them, micro and macro averages can lead to different results. - - -++++++++++++++++++++++++++++++++++++++++ -Code example - Putting it all together -++++++++++++++++++++++++++++++++++++++++ -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - [[['Retriever' 'Golden Retriever'] - ['Hound' 'Dachshund']] - - [['Retriever' 'Golden Retriever'] - ['' '']] - - [['Hound' 'Dachshund'] - ['Hound' 'Beagle']]] - - - - - - -| - -.. code-block:: default - - - import numpy as np - - from sklearn.tree import DecisionTreeClassifier - - from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode - - # Define data - X_train = [[1, 2], [3, 4], [5, 6]] - X_test = [[1, 2], [3, 4], [5, 6]] - - # Define Labels - Y_train = np.array([ - [["Retriever", "Golden Retriever"], ["Hound", "Dachshund"]], - [["Retriever", "Labrador"]], - [["Hound", "Dachshund"], ["Hound", "Beagle"]], - ], dtype=object) - - # Use decision tree classifiers for every node - tree = DecisionTreeClassifier() - classifier = MultiLabelLocalClassifierPerNode(local_classifier=tree) - - # Train local classifier per node - classifier.fit(X_train, Y_train) - - # Predict - predictions = classifier.predict(X_test) - print(predictions) - - -.. rst-class:: sphx-glr-timing - - **Total running time of the script:** ( 0 minutes 0.047 seconds) diff --git a/docs/source/api/explainer_api.rst b/docs/source/api/explainer_api.rst deleted file mode 100644 index 1cadc303..00000000 --- a/docs/source/api/explainer_api.rst +++ /dev/null @@ -1,10 +0,0 @@ -.. _explainer_api: - -Explainer -======================== - -Explainer ------------------------ -..
autoclass:: Explainer.Explainer - :members: - :special-members: __init__ \ No newline at end of file diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index d7ba18bf..379729a5 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -13,4 +13,3 @@ This is done in order to provide a complete list of the callable functions for e classifiers utilities - explainer_api diff --git a/docs/source/api/utilities.rst b/docs/source/api/utilities.rst index faf790f9..4e7723e2 100644 --- a/docs/source/api/utilities.rst +++ b/docs/source/api/utilities.rst @@ -95,13 +95,6 @@ F-score Datasets ---------- -Platypus diseases dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autofunction:: datasets.load_platypus - -.................................. - Hierarchical text classification dataset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/get_started/install.rst b/docs/source/get_started/install.rst index 7ed92400..8c8827fa 100644 --- a/docs/source/get_started/install.rst +++ b/docs/source/get_started/install.rst @@ -16,7 +16,6 @@ Additionally, it is also possible to install optional packages along. To install :literal:`<extra_name>` can have one of the following options: - ray: Installs the ray package, which is required for parallel processing support. -- xai: Installs the shap and xarray packages, which are required for explaining Hiclass' predictions. It is also possible to install HiClass using :literal:`conda`, as follows: diff --git a/hiclass/BinaryPolicy.py b/hiclass/BinaryPolicy.py index b6cf3001..9ee33b01 100644 --- a/hiclass/BinaryPolicy.py +++ b/hiclass/BinaryPolicy.py @@ -163,7 +163,7 @@ def get_binary_examples(self, node) -> tuple: elif isinstance(self.X, csr_matrix) or isinstance(self.X, csr_array): X = vstack([positive_x, negative_x]) sample_weights = ( - vstack([positive_weights, negative_weights]) + np.concatenate([positive_weights, negative_weights]) if self.sample_weight is not None else None ) diff --git a/hiclass/Explainer.py b/hiclass/Explainer.py deleted file mode 100644 index 8708527b..00000000 --- a/hiclass/Explainer.py +++ /dev/null @@ -1,367 +0,0 @@ -"""Explainer API for explaining predictions using Shapley values.""" - -from copy import deepcopy -from joblib import Parallel, delayed -import numpy as np -from sklearn.utils.validation import check_array, check_is_fitted -from hiclass import ( - LocalClassifierPerParentNode, - LocalClassifierPerNode, - LocalClassifierPerLevel, - HierarchicalClassifier, -) - -try: - import xarray as xr -except ImportError: - xarray_installed = False -else: - xarray_installed = True - -try: - import shap -except ImportError: - shap_installed = False -else: - shap_installed = True - - -class Explainer: - """Explainer class for returning shap values for each of the three hierarchical classifiers.""" - - def __init__( - self, - hierarchical_model: HierarchicalClassifier.HierarchicalClassifier, - data: None, - n_jobs: int = 1, - algorithm: str = "auto", - mode: str = "", - ): - """ - Initialize the SHAP explainer for a hierarchical model. - - Parameters - ---------- - hierarchical_model : HierarchicalClassifier - The hierarchical classification model to explain. - data : array-like or None, default=None - The dataset used for creating the SHAP explainer. - n_jobs : int, default=1 - The number of jobs to run in parallel. - algorithm : str, default="auto" - The algorithm to use for SHAP explainer. 
Possible values are 'linear', 'tree', 'auto', 'permutation', or 'partition' - mode : str, default="" - The mode of the SHAP explainer. Can be 'tree', 'gradient', 'deep', 'linear', or '' for default SHAP explainer. - - Examples - -------- - >>> from sklearn.ensemble import RandomForestClassifier - >>> import numpy as np - >>> from hiclass import LocalClassifierPerParentNode, Explainer - >>> rfc = RandomForestClassifier() - >>> lcppn = LocalClassifierPerParentNode(local_classifier=rfc, replace_classifiers=False) - >>> x_train = np.array([[1, 3], [2, 5]]) - >>> y_train = np.array([[1, 2], [3, 4]]) - >>> x_test = np.array([[4, 6]]) - >>> lcppn.fit(x_train, y_train) - >>> explainer = Explainer(lcppn, data=x_train, mode="tree") - >>> explanations = explainer.explain(x_test) - - Dimensions: (class: 3, sample: 1, level: 2, feature: 2) - Coordinates: - * class (class) 0: - successors = list( - self.hierarchical_model.hierarchy_.successors(predecessor) - ) - if len(successors) > 0: - classifier = self.hierarchical_model.hierarchy_.nodes[ - predecessor - ]["classifier"] - traversals[mask, level] = classifier.predict( - predecessor_x - ).flatten() - return traversals - - def _get_traversed_nodes_lcpn(self, samples): - """ - Return a list of all traversed nodes as per the provided LocalClassifierPerNode model. - - Parameters - ---------- - samples : array-like - Sample data for which to generate traversed nodes. - - Returns - ------- - traversals : list - A list of all traversed nodes as per LocalClassifierPerNode (LCPN) strategy. - """ - traversals = np.empty( - (samples.shape[0], self.hierarchical_model.max_levels_), - dtype=self.hierarchical_model.dtype_, - ) - - predictions = self.hierarchical_model.predict(samples) - - traversals[:, 0] = predictions[:, 0] - separator = np.full( - (samples.shape[0], 3), - self.hierarchical_model.separator_, - dtype=self.hierarchical_model.dtype_, - ) - - for level in range(1, traversals.shape[1]): - traversals[:, level] = np.char.add( - traversals[:, level - 1], - np.char.add(separator[:, 0], predictions[:, level]), - ) - - # For inconsistent hierarchies, levels with empty nodes should be ignored - mask = predictions == "" - traversals[mask] = "" - - return traversals - - def _get_traversed_nodes_lcpl(self, samples): - """ - Return a list of all traversed nodes as per the provided LocalClassifierPerLevel model. - - Parameters - ---------- - samples : array-like - Sample data for which to generate traversed nodes. - - Returns - ------- - traversals : list - A list of all traversed nodes as per LocalClassifierPerLevel (LCPL) strategy. - """ - traversals = [] - predictions = self.hierarchical_model.predict(samples) - for pred in predictions: - traversal_order = [] - filtered_pred = [p for p in pred if p.strip()] - for i in range(1, len(filtered_pred) + 1): - node = self.hierarchical_model.separator_.join(filtered_pred[:i]) - traversal_order.append(node) - traversals.append(traversal_order) - return traversals - - def _calculate_shap_values(self, X): - """ - Return an xarray.Dataset object for a single sample provided. This dataset is aligned on the `level` attribute. - - Parameters - ---------- - X : array-like - Data for single sample for which to generate SHAP values. - - Returns - ------- - explanation : xarray.Dataset - A single explanation for the prediction of given sample. 
- """ - traversed_nodes = [] - if isinstance(self.hierarchical_model, LocalClassifierPerLevel): - traversed_nodes = self._get_traversed_nodes_lcpl(X)[0] - elif isinstance(self.hierarchical_model, LocalClassifierPerParentNode): - traversed_nodes = self._get_traversed_nodes_lcppn(X)[0] - elif isinstance(self.hierarchical_model, LocalClassifierPerNode): - traversed_nodes = self._get_traversed_nodes_lcpn(X)[0] - datasets = [] - level = 0 - for node in traversed_nodes: - if node == "" or ( - ("classifier" not in self.hierarchical_model.hierarchy_.nodes[node]) - and (not isinstance(self.hierarchical_model, LocalClassifierPerLevel)) - ): - continue - - if isinstance(self.hierarchical_model, LocalClassifierPerLevel): - local_classifier = self.hierarchical_model.local_classifiers_[level] - else: - local_classifier = self.hierarchical_model.hierarchy_.nodes[node][ - "classifier" - ] - - # Create a SHAP explainer for the local classifier - local_explainer = deepcopy(self.explainer)(local_classifier, self.data) - - current_node = node.split(self.hierarchical_model.separator_)[-1] - - # Calculate SHAP values for the given sample X - shap_values = np.array( - local_explainer.shap_values(X, check_additivity=False) - ) - - if len(shap_values.shape) < 3: - shap_values = shap_values.reshape( - 1, shap_values.shape[0], shap_values.shape[1] - ) - - if isinstance(self.hierarchical_model, LocalClassifierPerNode): - simplified_labels = [ - f"{current_node}_{int(label)}" - for label in local_classifier.classes_ - ] - predicted_class = current_node - elif isinstance(self.hierarchical_model, LocalClassifierPerParentNode): - simplified_labels = [ - label.split(self.hierarchical_model.separator_)[-1] - for label in local_classifier.classes_ - ] - predicted_class = ( - local_classifier.predict(X) - .flatten()[0] - .split(self.hierarchical_model.separator_)[-1] - ) - else: - simplified_labels = [ - label.split(self.hierarchical_model.separator_)[-1] - for label in local_classifier.classes_ - ] - predicted_class = current_node - - classes = xr.DataArray( - simplified_labels, - dims=["class"], - coords={"class": simplified_labels}, - ) - - shap_val_local = xr.DataArray( - shap_values, - dims=["class", "sample", "feature"], - coords={"class": simplified_labels}, - ) - - prediction_probability = local_classifier.predict_proba(X)[0] - - predict_proba = xr.DataArray( - prediction_probability, - dims=["class"], - coords={ - "class": simplified_labels, - }, - ) - - local_dataset = xr.Dataset( - { - "node": current_node, - "predicted_class": predicted_class, - "predict_proba": predict_proba, - "classes": classes, - "shap_values": shap_val_local, - "level": level, - } - ) - level += 1 - datasets.append(local_dataset) - sample_explanation = xr.concat(datasets, dim="level") - return sample_explanation diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 1351fa0b..c7b5ab36 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -20,6 +20,10 @@ MultiplyCombiner, ) +from hiclass.probability_combiner import ( + init_strings as probability_combiner_init_strings, +) + try: import ray except ImportError: @@ -76,9 +80,9 @@ def __init__( edge_list: str = None, replace_classifiers: bool = True, n_jobs: int = 1, - bert: bool = False, classifier_abbreviation: str = "", calibration_method: str = None, + probability_combiner: str = "multiply", tmp_dir: str = None, ): """ @@ -101,12 +105,17 @@ def __init__( n_jobs : int, default=1 The number of jobs to run in parallel. 
Only :code:`fit` is parallelized. If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. classifier_abbreviation : str, default="" The abbreviation of the local hierarchical classifier to be displayed during logging. calibration_method : {"ivap", "cvap", "platt", "isotonic", "beta"}, str, default=None If set, use the desired method to calibrate probabilities returned by predict_proba(). + probability_combiner : {"geometric", "arithmetic", "multiply", None}, str, default="multiply" + Specify the rule for combining probabilities over multiple levels: + + - `geometric`: Each level's probabilities are the geometric mean of its own and its predecessors' probabilities; + - `arithmetic`: Each level's probabilities are the arithmetic mean of its own and its predecessors' probabilities; + - `multiply`: Each level's probabilities are the product of its own and its predecessors' probabilities; + - `None`: No aggregation. tmp_dir : str, default=None Temporary directory to persist local classifiers that are trained. If the job needs to be restarted, it will skip the pre-trained local classifier found in the temporary directory. @@ -116,9 +125,9 @@ self.edge_list = edge_list self.replace_classifiers = replace_classifiers self.n_jobs = n_jobs - self.bert = bert self.classifier_abbreviation = classifier_abbreviation self.calibration_method = calibration_method + self.probability_combiner = probability_combiner self.tmp_dir = tmp_dir def fit(self, X, y, sample_weight=None): @@ -152,18 +161,21 @@ self._clean_up() def _pre_fit(self, X, y, sample_weight): + # Check params + if ( + self.probability_combiner + and self.probability_combiner not in probability_combiner_init_strings + ): + raise ValueError( + f"probability_combiner must be one of {', '.join(probability_combiner_init_strings)} or None." 
+ ) + # Check that X and y have correct shape # and convert them to np.ndarray if need be - if not self.bert: - self.X_, self.y_ = self._validate_data( - X, y, multi_output=True, accept_sparse="csr", allow_nd=True - ) - else: - self.X_ = np.array(X) - self.y_ = check_array( - make_leveled(y), dtype=None, ensure_2d=False, allow_nd=True - ) + self.X_, self.y_ = self._validate_data( + X, y, multi_output=True, accept_sparse="csr", allow_nd=True + ) if sample_weight is not None: self.sample_weight_ = _check_sample_weight(sample_weight, X) @@ -265,10 +277,7 @@ def calibrate(self, X, y): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) if self.calibration_method == "cvap": # combine train and calibration dataset for cross validation diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 680e2146..9581934a 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -12,15 +12,12 @@ import numpy as np from joblib import Parallel, delayed from sklearn.base import BaseEstimator +from sklearn.utils._tags import ClassifierTags from sklearn.utils.validation import check_array, check_is_fitted +from hiclass._calibration.Calibrator import _Calibrator from hiclass.ConstantClassifier import ConstantClassifier from hiclass.HierarchicalClassifier import HierarchicalClassifier -from hiclass._calibration.Calibrator import _Calibrator - -from hiclass.probability_combiner import ( - init_strings as probability_combiner_init_strings, -) try: import ray @@ -56,7 +53,6 @@ def __init__( edge_list: str = None, replace_classifiers: bool = True, n_jobs: int = 1, - bert: bool = False, calibration_method: str = None, return_all_probabilities: bool = False, probability_combiner: str = "multiply", @@ -82,8 +78,6 @@ def __init__( n_jobs : int, default=1 The number of jobs to run in parallel. Only :code:`fit` is parallelized. If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. calibration_method : {"ivap", "cvap", "platt", "isotonic", "beta"}, str, default=None If set, use the desired method to calibrate probabilities returned by predict_proba(). return_all_probabilities : bool, default=False @@ -106,20 +100,18 @@ def __init__( replace_classifiers=replace_classifiers, n_jobs=n_jobs, classifier_abbreviation="LCPL", - bert=bert, calibration_method=calibration_method, tmp_dir=tmp_dir, ) self.return_all_probabilities = return_all_probabilities self.probability_combiner = probability_combiner - if ( - self.probability_combiner - and self.probability_combiner not in probability_combiner_init_strings - ): - raise ValueError( - f"probability_combiner must be one of {', '.join(probability_combiner_init_strings)} or None." 
- ) + def __sklearn_tags__(self): + """Configure annotations of estimator to allow inspection of capabilities, such as sparse matrix support.""" + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.classifier_tags = ClassifierTags() + return tags def fit(self, X, y, sample_weight=None): """ @@ -179,10 +171,7 @@ def predict(self, X): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) @@ -225,10 +214,7 @@ def predict_proba(self, X): # Check if fit has been called check_is_fitted(self) - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) if not self.calibration_method: self.logger_.info( @@ -458,12 +444,9 @@ def _fit_classifier(self, level, separator): unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: classifier = ConstantClassifier() - if not self.bert: - try: - classifier.fit(X, y, sample_weight) - except TypeError: - classifier.fit(X, y) - else: + try: + classifier.fit(X, y, sample_weight) + except TypeError: classifier.fit(X, y) self._save_tmp(level, classifier) return classifier diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index c32c0781..7e7cec46 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -12,18 +12,14 @@ import networkx as nx import numpy as np from sklearn.base import BaseEstimator +from sklearn.utils._tags import ClassifierTags from sklearn.utils.validation import check_array, check_is_fitted from hiclass import BinaryPolicy -from hiclass.ConstantClassifier import ConstantClassifier -from hiclass.HierarchicalClassifier import HierarchicalClassifier from hiclass._calibration.Calibrator import _Calibrator - -from hiclass.probability_combiner import ( - init_strings as probability_combiner_init_strings, -) - from hiclass._hiclass_utils import _normalize_probabilities +from hiclass.ConstantClassifier import ConstantClassifier +from hiclass.HierarchicalClassifier import HierarchicalClassifier class LocalClassifierPerNode(BaseEstimator, HierarchicalClassifier): @@ -53,7 +49,6 @@ def __init__( edge_list: str = None, replace_classifiers: bool = True, n_jobs: int = 1, - bert: bool = False, calibration_method: str = None, return_all_probabilities: bool = False, probability_combiner: str = "multiply", @@ -90,8 +85,6 @@ def __init__( n_jobs : int, default=1 The number of jobs to run in parallel. Only :code:`fit` is parallelized. If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. calibration_method : {"ivap", "cvap", "platt", "isotonic", "beta"}, str, default=None If set, use the desired method to calibrate probabilities returned by predict_proba(). 
return_all_probabilities : bool, default=False @@ -114,7 +107,6 @@ def __init__( replace_classifiers=replace_classifiers, n_jobs=n_jobs, classifier_abbreviation="LCPN", - bert=bert, calibration_method=calibration_method, tmp_dir=tmp_dir, ) @@ -122,13 +114,12 @@ def __init__( self.return_all_probabilities = return_all_probabilities self.probability_combiner = probability_combiner - if ( - self.probability_combiner - and self.probability_combiner not in probability_combiner_init_strings - ): - raise ValueError( - f"probability_combiner must be one of {', '.join(probability_combiner_init_strings)} or None." - ) + def __sklearn_tags__(self): + """Configure annotations of estimator to allow inspection of capabilities, such as sparse matrix support.""" + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.classifier_tags = ClassifierTags() + return tags def fit(self, X, y, sample_weight=None): """ @@ -190,10 +181,7 @@ def predict(self, X): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) @@ -259,10 +247,7 @@ def predict_proba(self, X): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) if not self.calibration_method: self.logger_.info( @@ -430,12 +415,9 @@ def _fit_classifier(self, node): if len(unique_y) == 1 and self.replace_classifiers: self.logger_.info("adding constant classifier") classifier = ConstantClassifier() - if not self.bert: - try: - classifier.fit(X, y, sample_weight) - except TypeError: - classifier.fit(X, y) - else: + try: + classifier.fit(X, y, sample_weight) + except TypeError: classifier.fit(X, y) self._save_tmp(node, classifier) return classifier diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index c5373922..49bd431f 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -12,17 +12,13 @@ import networkx as nx import numpy as np from sklearn.base import BaseEstimator +from sklearn.utils._tags import ClassifierTags from sklearn.utils.validation import check_array, check_is_fitted -from hiclass.ConstantClassifier import ConstantClassifier -from hiclass.HierarchicalClassifier import HierarchicalClassifier from hiclass._calibration.Calibrator import _Calibrator - -from hiclass.probability_combiner import ( - init_strings as probability_combiner_init_strings, -) - from hiclass._hiclass_utils import _normalize_probabilities +from hiclass.ConstantClassifier import ConstantClassifier +from hiclass.HierarchicalClassifier import HierarchicalClassifier class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier): @@ -51,7 +47,6 @@ def __init__( edge_list: str = None, replace_classifiers: bool = True, n_jobs: int = 1, - bert: bool = False, calibration_method: str = None, return_all_probabilities: bool = False, probability_combiner: str = "multiply", @@ -77,8 +72,6 @@ def __init__( n_jobs : int, default=1 The number of jobs to run in parallel. Only :code:`fit` is parallelized. If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. 
- bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. calibration_method : {"ivap", "cvap", "platt", "isotonic", "beta"}, str, default=None If set, use the desired method to calibrate probabilities returned by predict_proba(). return_all_probabilities : bool, default=False @@ -101,20 +94,18 @@ def __init__( replace_classifiers=replace_classifiers, n_jobs=n_jobs, classifier_abbreviation="LCPPN", - bert=bert, calibration_method=calibration_method, tmp_dir=tmp_dir, ) self.return_all_probabilities = return_all_probabilities self.probability_combiner = probability_combiner - if ( - self.probability_combiner - and self.probability_combiner not in probability_combiner_init_strings - ): - raise ValueError( - f"probability_combiner must be one of {', '.join(probability_combiner_init_strings)} or None." - ) + def __sklearn_tags__(self): + """Configure annotations of estimator to allow inspection of capabilities, such as sparse matrix support.""" + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + tags.classifier_tags = ClassifierTags() + return tags def fit(self, X, y, sample_weight=None): """ @@ -173,10 +164,7 @@ def predict(self, X): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) @@ -220,10 +208,7 @@ def predict_proba(self, X): check_is_fitted(self) # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) - else: - X = np.array(X) + X = check_array(X, accept_sparse="csr", allow_nd=True, ensure_2d=False) if not self.calibration_method: self.logger_.info( @@ -398,12 +383,9 @@ def _fit_classifier(self, node): unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: classifier = ConstantClassifier() - if not self.bert: - try: - classifier.fit(X, y, sample_weight) - except TypeError: - classifier.fit(X, y) - else: + try: + classifier.fit(X, y, sample_weight) + except TypeError: classifier.fit(X, y) self._save_tmp(node, classifier) return classifier diff --git a/hiclass/MultiLabelHierarchicalClassifier.py b/hiclass/MultiLabelHierarchicalClassifier.py deleted file mode 100644 index b9acaa68..00000000 --- a/hiclass/MultiLabelHierarchicalClassifier.py +++ /dev/null @@ -1,427 +0,0 @@ -"""Shared code for all classifiers.""" - -import abc -import logging - -import networkx as nx -import numpy as np -from joblib import Parallel, delayed -from scipy.sparse import csr_matrix -from sklearn.base import BaseEstimator -from sklearn.linear_model import LogisticRegression -from sklearn.utils.validation import _check_sample_weight - -import functools -import sklearn.utils.validation - -# TODO: Move to MultiLabelHierarchicalClassifier (Parent Class) -sklearn.utils.validation.check_array = functools.partial( - sklearn.utils.validation.check_array, allow_nd=True -) - -try: - import ray -except ImportError: - _has_ray = False -else: - _has_ray = True - - -def make_leveled(y): - """ - Add empty cells if columns' length differs. - - Parameters - ---------- - y : array-like of shape (n_samples, n_labels, n_levels) - The target values, i.e., multi-label hierarchical class labels for classification. 
- - Returns - ------- - leveled_y : array-like of shape (n_samples, n_labels, n_levels) - The leveled target values, i.e., multi-label hierarchical class labels for classification. - - Notes - ----- - If rows are not iterable, returns the current y without modifications. - - Examples - -------- - >>> from hiclass.HierarchicalClassifier import make_leveled - >>> y = [[['a']], [['b', 'c']]] - >>> make_leveled(y) - array([[['a', '']], - [['b', 'c']]]) - """ - rows = len(y) - try: - multi_labels = max([len(row) for row in y]) - levels = max([len(label) for row in y for label in row]) - except TypeError: - return y - leveled_y = np.full( - (rows, multi_labels, levels), "", dtype=object - ) # dtype object relevant to allow for string labels - for i, row in enumerate(y): - for j, multi_label in enumerate(row): - for k, label in enumerate(multi_label): - leveled_y[i, j, k] = label - return np.array(leveled_y) - - -class MultiLabelHierarchicalClassifier(abc.ABC): - """Abstract class for the local hierarchical classifiers. - - Offers mostly utility methods and common data initialization. - """ - - def __init__( - self, - local_classifier: BaseEstimator = None, - tolerance: float = None, - verbose: int = 0, - edge_list: str = None, - replace_classifiers: bool = True, - n_jobs: int = 1, - bert: bool = False, - classifier_abbreviation: str = "", - ): - r""" - Initialize a local hierarchical classifier. - - Parameters - ---------- - local_classifier : BaseEstimator, default=LogisticRegression - The local_classifier used to create the collection of local classifiers. Needs to have fit, predict and - clone methods. - tolerance : float, default=None - The tolerance used to determine multi-labels. If set to None, only the child class with highest probability is predicted. - Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted. - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - See https://verboselogs.readthedocs.io/en/latest/readme.html#overview-of-logging-levels - for more information. - edge_list : str, default=None - Path to write the hierarchy built. - replace_classifiers : bool, default=True - Turns on (True) the replacement of a local classifier with a constant classifier when trained on only - a single unique class. - n_jobs : int, default=1 - The number of jobs to run in parallel. Only :code:`fit` is parallelized. - If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. - classifier_abbreviation : str, default="" - The abbreviation of the local hierarchical classifier to be displayed during logging. - """ - self.local_classifier = local_classifier - self.tolerance = tolerance - self.verbose = verbose - self.edge_list = edge_list - self.replace_classifiers = replace_classifiers - self.n_jobs = n_jobs - self.bert = bert - self.classifier_abbreviation = classifier_abbreviation - - def fit(self, X, y, sample_weight=None): - """ - Fit a local hierarchical classifier. - - Needs to be subclassed by other classifiers as it only offers common methods. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csc_matrix``. 
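For orientation, this is the (n_samples, n_labels, n_levels) target layout the multi-label classes removed in this diff expected, after make_leveled has padded ragged label paths with empty strings (numpy-only illustration, data made up):

import numpy as np

y = np.array(
    [
        [["a", "b"], ["", ""]],    # sample 0: one path, second slot padded
        [["a", "b"], ["c", "d"]],  # sample 1: two label paths
    ],
    dtype=object,
)
print(y.shape)  # (2, 2, 2)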
- y : array-like of shape (n_samples, n_levels) - The target values, i.e., hierarchical class labels for classification. - sample_weight : array-like of shape (n_samples,), default=None - Array of weights that are assigned to individual samples. - If not provided, then each sample is given unit weight. - - Returns - ------- - self : object - Fitted estimator. - """ - # Fit local classifiers in DAG - self._fit_digraph() - - # Delete unnecessary variables - self._clean_up() - - def _pre_fit(self, X, y, sample_weight): - # Check that X and y have correct shape - # and convert them to np.ndarray if need be - - if not self.bert: - self.X_, self.y_ = self._validate_data( - X, y, multi_output=True, accept_sparse="csr", allow_nd=True - ) - else: - self.X_ = np.array(X) - self.y_ = np.array(y) - - if sample_weight is not None: - self.sample_weight_ = _check_sample_weight(sample_weight, X) - else: - self.sample_weight_ = None - - self.y_ = make_leveled(self.y_) - - # Create and configure logger - self._create_logger() - - # Avoids creating more columns in prediction if edges are a->b and b->c, - # which would generate the prediction a->b->c - self._disambiguate() - - # Create DAG from self.y_ and store to self.hierarchy_ - self._create_digraph() - - # If user passes edge_list, then export - # DAG to CSV file to visualize with Gephi - self._export_digraph() - - # Assert that graph is directed acyclic - self._assert_digraph_is_dag() - - # If y is 1D or 2D, convert to 3D for binary policies - self._convert_1d_or_2d_y_to_3d() - - # Detect root(s) and add artificial root to DAG - self._add_artificial_root() - - # Initialize local classifiers in DAG - self._initialize_local_classifiers() - - def _create_logger(self): - # Create logger - self.logger_ = logging.getLogger(self.classifier_abbreviation) - self.logger_.setLevel(self.verbose) - - # Create console handler and set verbose level - if not self.logger_.hasHandlers(): - ch = logging.StreamHandler() - ch.setLevel(self.verbose) - - # Create formatter - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - - # Add formatter to ch - ch.setFormatter(formatter) - - # Add ch to logger - self.logger_.addHandler(ch) - - def _disambiguate(self): - self.separator_ = "::HiClass::Separator::" - if self.y_.ndim == 3: - new_y = [] - for i in range(self.y_.shape[0]): - new_y.append([]) - for j in range(self.y_.shape[1]): - new_y[i].append([str(self.y_[i, j, 0])]) - for k in range(1, self.y_.shape[2]): - new_cell = "" - if new_y[i][j][k - 1] != "": - new_cell = ( - new_y[i][j][k - 1] - + self.separator_ - + str(self.y_[i, j, k]) - ) - new_y[i][j].append(new_cell) - self.y_ = np.array(new_y) - - def _create_digraph(self): - # Create DiGraph - self.hierarchy_ = nx.DiGraph() - - # Save dtype of y_ - self.dtype_ = self.y_.dtype - - self._create_digraph_1d() - - self._create_digraph_2d() - - self._create_digraph_3d() - - if self.y_.ndim > 3: - # Unsupported dimension - self.logger_.error(f"y with {self.y_.ndim} dimensions detected") - raise ValueError( - f"Creating graph from y with {self.y_.ndim} dimensions is not supported" - ) - - def _create_digraph_1d(self): - # Flatten 1D disguised as 2D - if self.y_.ndim == 2 and self.y_.shape[1] == 1: - self.logger_.info("Converting y to 1D") - self.y_ = self.y_.flatten() - if self.y_.ndim == 1: - # Create max_levels_ variable - self.max_levels_ = 1 - self.max_multi_labels_ = 1 - self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels") - for label in self.y_: - self.hierarchy_.add_node(label) - - def _create_digraph_2d(self): - if self.y_.ndim == 2: - # Create max_levels variable - self.max_levels_ = self.y_.shape[1] - self.max_multi_labels_ = 1 - rows, columns = self.y_.shape - self.logger_.info(f"Creating digraph from {rows} 2D labels") - for row in range(rows): - for column in range(columns - 1): - parent = self.y_[row, column].split(self.separator_)[-1] - child = self.y_[row, column + 1].split(self.separator_)[-1] - if parent != "" and child != "": - # Only add edge if both parent and child are not empty - self.hierarchy_.add_edge( - self.y_[row, column], self.y_[row, column + 1] - ) - elif parent != "" and column == 0: - self.hierarchy_.add_node(parent) - - def _create_digraph_3d(self): - if self.y_.ndim == 3: - self.max_levels_ = self.y_.shape[2] - self.max_multi_labels_ = self.y_.shape[1] - rows, multi_labels, columns = self.y_.shape - self.logger_.info(f"Creating digraph from {rows} 3D labels") - for row in range(rows): - for multi_label in range(multi_labels): - if columns == 1: - # If column is 1, then add node without children - self.hierarchy_.add_node(self.y_[row, multi_label, 0]) - else: - for column in range(columns - 1): - parent = self.y_[row, multi_label, column].split( - self.separator_ - )[-1] - child = self.y_[row, multi_label, column + 1].split( - self.separator_ - )[-1] - if parent != "" and child != "": - # Only add edge if both parent and child are not empty - self.hierarchy_.add_edge( - self.y_[row, multi_label, column], - self.y_[row, multi_label, column + 1], - ) - elif parent != "" and column == 0: - self.hierarchy_.add_node(parent) - - def _export_digraph(self): - # Check if edge_list is set - if self.edge_list: - # Add quotes to all nodes in case the text has commas - mapping = {} - for node in self.hierarchy_: - mapping[node] = '"{}"'.format(node.split(self.separator_)[-1]) - hierarchy = nx.relabel_nodes(self.hierarchy_, mapping, copy=True) - # Export DAG to CSV file - self.logger_.info(f"Writing edge list to file {self.edge_list}") - nx.write_edgelist(hierarchy, self.edge_list, delimiter=",") - - def _assert_digraph_is_dag(self): - # Assert that graph is directed acyclic - if not nx.is_directed_acyclic_graph(self.hierarchy_): - self.logger_.error("Cycle detected in graph") - raise ValueError("Graph is not directed acyclic") - - def _convert_1d_or_2d_y_to_3d( - self, - ): - # This conversion is necessary for the multi-label binary policies - if self.y_.ndim == 1: - self.y_ = np.reshape(self.y_, (-1, 1, 1)) - if self.y_.ndim == 2: - self.y_ = np.reshape(self.y_, (self.y_.shape[0], -1, self.y_.shape[1])) - - def _add_artificial_root(self): - # Detect root(s) - roots = [ - node for node, in_degree in self.hierarchy_.in_degree() if in_degree == 0 - ] - self.logger_.info(f"Detected {len(roots)} roots") - - # Add artificial root as predecessor to root(s) detected - self.root_ = "hiclass::root" - for old_root in roots: - self.hierarchy_.add_edge(self.root_, old_root) - - def _initialize_local_classifiers(self): - # Create a deep copy of the local classifier specified - # for each node in the hierarchy and save to attribute "classifier" - self.logger_.info("Initializing local classifiers") - if self.local_classifier is None: - self.local_classifier_ = LogisticRegression() - else: - self.local_classifier_ = self.local_classifier - - def _convert_to_2d(self, y): - # Convert predictions to 2D if there is only 1 label (should probably be deleted since this MultiLabel class should only be called when multiple y labels exist) -
if self.max_multi_labels_ == 1: - y = y[:, 0, :] # Remove multi-label dimension - return y - - def _convert_to_1d(self, y): - # Convert predictions to 1D if there is only 1 column - # TODO: Decide if to keep in case of multi-label - if self.max_levels_ == 1: - y = y.flatten() - - return y - - def _remove_separator(self, y): - # Remove separator from predictions - if y.ndim == 2: - for i in range(y.shape[0]): - for j in range(1, y.shape[1]): - y[i, j] = y[i, j].split(self.separator_)[-1] - elif y.ndim == 3: - for i in range(y.shape[0]): - for j in range(y.shape[1]): - for k in range(1, y.shape[2]): - y[i, j, k] = y[i, j, k].split(self.separator_)[-1] - - def _fit_node_classifier( - self, nodes, local_mode: bool = False, use_joblib: bool = False - ): - if self.n_jobs > 1: - if _has_ray and not use_joblib: - ray.init( - num_cpus=self.n_jobs, - local_mode=local_mode, - ignore_reinit_error=True, - ) - classifier = ray.put(self) - _parallel_fit = ray.remote(self._fit_classifier) - results = [_parallel_fit.remote(classifier, node) for node in nodes] - classifiers = ray.get(results) - else: - classifiers = Parallel(n_jobs=self.n_jobs)( - delayed(self._fit_classifier)(self, node) for node in nodes - ) - - else: - classifiers = [self._fit_classifier(self, node) for node in nodes] - for classifier, node in zip(classifiers, nodes): - self.hierarchy_.nodes[node]["classifier"] = classifier - - @staticmethod - def _fit_classifier(self, node): - raise NotImplementedError("Method should be implemented in the LCPN and LCPPN") - - def _clean_up(self): - self.logger_.info("Cleaning up variables that can take a lot of disk space") - del self.X_ - del self.y_ - if self.sample_weight_ is not None: - del self.sample_weight_ diff --git a/hiclass/MultiLabelLocalClassifierPerNode.py b/hiclass/MultiLabelLocalClassifierPerNode.py deleted file mode 100644 index 06a1baae..00000000 --- a/hiclass/MultiLabelLocalClassifierPerNode.py +++ /dev/null @@ -1,316 +0,0 @@ -""" -Local classifier per node approach. - -Numeric and string output labels are both handled. -""" - -from copy import deepcopy - -import functools -import networkx as nx -import numpy as np - -from hiclass import BinaryPolicy -from hiclass.ConstantClassifier import ConstantClassifier -from hiclass.MultiLabelHierarchicalClassifier import ( - MultiLabelHierarchicalClassifier, - make_leveled, -) - -from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_is_fitted - -# monkeypatching check_array to accept 3 dimensional arrays -import sklearn.utils.validation - -# TODO: Move to MultiLabelHierarchicalClassifier (Parent Class) -sklearn.utils.validation.check_array = functools.partial( - sklearn.utils.validation.check_array, allow_nd=True -) - - -class MultiLabelLocalClassifierPerNode(BaseEstimator, MultiLabelHierarchicalClassifier): - """ - Assign local classifiers to each node of the graph, except the root node. - - A local classifier per node is a local hierarchical classifier that fits one local binary classifier - for each node of the class hierarchy, except for the root node. 
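The same per-node binary decomposition survives in the single-label LocalClassifierPerNode that remains in the library, so the idea just described can still be tried after this removal; a small sketch with made-up data (see also the Examples block that follows):

from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

X = [[1, 2], [3, 4]]
y = [["a", "b"], ["a", "c"]]  # hierarchy: a -> b and a -> c
clf = LocalClassifierPerNode(local_classifier=LogisticRegression())
clf.fit(X, y)  # one binary classifier per node ("a", "b", "c"), none for the root
print(clf.predict(X))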
- - Examples - -------- - >>> from hiclass import MultiLabelLocalClassifierPerNode - >>> y = [[['1', '1.1'], ['', '']], [['1', '1.1'], ['2', '2.1']]] - >>> X = [[1, 2], [3, 4]] - >>> lcpn = MultiLabelLocalClassifierPerNode() - >>> lcpn.fit(X, y) - >>> lcpn.predict(X) - array([[['1', '1.1']], - [['2', '2.1']]]) - """ - - def __init__( - self, - local_classifier: BaseEstimator = None, - binary_policy: str = "siblings", - tolerance: float = None, - verbose: int = 0, - edge_list: str = None, - replace_classifiers: bool = True, - n_jobs: int = 1, - bert: bool = False, - ): - r""" - Initialize a local classifier per node. - - Parameters - ---------- - local_classifier : BaseEstimator, default=LogisticRegression - The local_classifier used to create the collection of local classifiers. Needs to have fit, predict and - clone methods. - binary_policy : {"exclusive", "less_exclusive", "exclusive_siblings", "inclusive", "less_inclusive", "siblings"}, str, default="siblings" - Specify the rule for defining positive and negative training examples, using one of the following options: - - - `exclusive`: Positive examples belong only to the class being considered. All classes are negative examples, except for the selected class; - - `less_exclusive`: Positive examples belong only to the class being considered. All classes are negative examples, except for the selected class and its descendants; - - `exclusive_siblings`: Positive examples belong only to the class being considered. All sibling classes are negative examples; - - `inclusive`: Positive examples belong only to the class being considered and its descendants. All classes are negative examples, except for the selected class, its descendants and ancestors; - - `less_inclusive`: Positive examples belong only to the class being considered and its descendants. All classes are negative examples, except for the selected class and its descendants; - - `siblings`: Positive examples belong only to the class being considered and its descendants. All siblings and their descendant classes are negative examples. - - See :ref:`Training Policies` for more information about the different policies. - tolerance : float, default=None - The tolerance used to determine multi-labels. If set to None, only the child class with highest probability is predicted. - Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted. - verbose : int, default=0 - Controls the verbosity when fitting and predicting. - See https://verboselogs.readthedocs.io/en/latest/readme.html#overview-of-logging-levels - for more information. - edge_list : str, default=None - Path to write the hierarchy built. - replace_classifiers : bool, default=True - Turns on (True) the replacement of a local classifier with a constant classifier when trained on only - a single unique class. - n_jobs : int, default=1 - The number of jobs to run in parallel. Only :code:`fit` is parallelized. - If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. - """ - super().__init__( - local_classifier=local_classifier, - verbose=verbose, - edge_list=edge_list, - replace_classifiers=replace_classifiers, - n_jobs=n_jobs, - classifier_abbreviation="LCPN", - bert=bert, - ) - self.binary_policy = binary_policy - self.tolerance = tolerance - - def fit(self, X, y, sample_weight=None): - """ - Fit a local classifier per node.
- - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csc_matrix``. - y : array-like of shape (n_samples, n_levels) - The target values, i.e., hierarchical class labels for classification. - sample_weight : array-like of shape (n_samples,), default=None - Array of weights that are assigned to individual samples. - If not provided, then each sample is given unit weight. - - Returns - ------- - self : object - Fitted estimator. - """ - # Execute common methods necessary before fitting - super()._pre_fit(X, y, sample_weight) - - # Initialize policy - self._initialize_binary_policy() - - # Fit local classifiers in DAG - super().fit(X, y) - - # TODO: Store the classes seen during fit - - # TODO: Add function to allow user to change local classifier - - # TODO: Add parameter to receive hierarchy as parameter in constructor - - # Return the classifier - return self - - def predict(self, X, tolerance: float = None) -> np.ndarray: - r""" - Predict classes for the given data. - - Hierarchical labels are returned. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csr_matrix``. - tolerance : float, default=None - The tolerance used to determine multi-labels. - If set to None, only the child class with highest probability is predicted. - Overrides the tolerance set in the constructor. - Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted. - Returns - ------- - y : ndarray of shape (n_samples,) or (n_samples, n_outputs) - The predicted classes. 
- """ - # Check if fit has been called - check_is_fitted(self) - _tolerance = (tolerance if tolerance is not None else self.tolerance) or 0.0 - - # Input validation - if not self.bert: - X = sklearn.utils.validation.check_array( - X, accept_sparse="csr" - ) # TODO: Decide allow_nd True or False - else: - X = np.array(X) - - # Initialize array that holds predictions - y = [[[]] for _ in range(X.shape[0])] - - bfs = nx.bfs_successors(self.hierarchy_, source=self.root_) - - self.logger_.info("Predicting") - - for predecessor, successors in bfs: - if predecessor == self.root_: - mask = [True] * X.shape[0] - subset_x = X[mask] - y_row_indices = [[i, [0]] for i in range(X.shape[0])] - else: - # get indices of rows that have the predecessor - y_row_indices = [] - for row in range(X.shape[0]): - # create list of indices - _t = [z for z, ls in enumerate(y[row]) if ls[-1] == predecessor] - - # y_row_indices is a list of lists, each list contains the index of the row and a list of column indices - y_row_indices.append([row, _t]) - - # Filter - mask = [True if ld[1] else False for ld in y_row_indices] - y_row_indices = [k for k in y_row_indices if k[1]] - subset_x = X[mask] - - if subset_x.shape[0] > 0: - probabilities = np.zeros((subset_x.shape[0], len(successors))) - for row, successor in enumerate(successors): - successor_name = str(successor).split(self.separator_)[-1] - self.logger_.info(f"Predicting for node '{successor_name}'") - classifier = self.hierarchy_.nodes[successor]["classifier"] - positive_index = np.where(classifier.classes_ == 1)[0] - probabilities[:, row] = classifier.predict_proba(subset_x)[ - :, positive_index - ][:, 0] - - # get indices of probabilities that are within tolerance of max - - highest_probabilities = np.max(probabilities, axis=1).reshape(-1, 1) - indices_probabilities_within_tolerance = np.argwhere( - np.greater_equal(probabilities, highest_probabilities - _tolerance) - ) - - prediction = [[] for _ in range(subset_x.shape[0])] - for row, column in indices_probabilities_within_tolerance: - prediction[row].append(successors[column]) - - k = 0 # index of prediction - for row, col_list in y_row_indices: - for j in col_list: - if not prediction[k]: - y[row][j].append("") - else: - for pi, _suc in enumerate(prediction[k]): - if pi == 0: - y[row][j].append(_suc) - else: - # in case of mulitple predictions, copy the previous prediction up to (but not including) the last prediction and add the new one - _old_y = y[row][j][:-1].copy() - y[row].insert(j + 1, _old_y + [_suc]) - k += 1 - - y = make_leveled(y) - self._remove_separator(y) - y = np.array(y, dtype=self.dtype_) - - return y - - def _initialize_binary_policy(self): - if isinstance(self.binary_policy, str): - self.logger_.info(f"Initializing {self.binary_policy} binary policy") - try: - self.binary_policy_ = BinaryPolicy.IMPLEMENTED_POLICIES[ - self.binary_policy.lower() - ](self.hierarchy_, self.X_, self.y_, self.sample_weight_) - except KeyError: - self.logger_.error( - f"Policy {self.binary_policy} not implemented. Available policies are:\n" - + f"{list(BinaryPolicy.IMPLEMENTED_POLICIES.keys())}" - ) - raise KeyError(f"Policy {self.binary_policy} not implemented.") - else: - self.logger_.error("Binary policy is not a string") - raise ValueError( - f"Binary policy type must str, not {type(self.binary_policy)}." 
- ) - - def _initialize_local_classifiers(self): - super()._initialize_local_classifiers() - local_classifiers = {} - for node in self.hierarchy_.nodes: - # Skip only root node - if node != self.root_: - local_classifiers[node] = { - "classifier": deepcopy(self.local_classifier_) - } - nx.set_node_attributes(self.hierarchy_, local_classifiers) - - def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False): - self.logger_.info("Fitting local classifiers") - nodes = list(self.hierarchy_.nodes) - # Remove root because it does not need to be fitted - nodes.remove(self.root_) - self._fit_node_classifier(nodes, local_mode, use_joblib) - - @staticmethod - def _fit_classifier(self, node): - classifier = self.hierarchy_.nodes[node]["classifier"] - X, y, sample_weight = self.binary_policy_.get_binary_examples(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - classifier = ConstantClassifier() - if not self.bert: - try: - classifier.fit(X, y, sample_weight) - except TypeError: - classifier.fit(X, y) - else: - classifier.fit(X, y) - return classifier - - def _clean_up(self): - super()._clean_up() - del self.binary_policy_ - - def _more_tags(self): - return { - "_xfail_checks": { - "check_estimator_sparse_data": "Multi-label multi-output prediction format is not support in sklearn" - }, - } diff --git a/hiclass/MultiLabelLocalClassifierPerParentNode.py b/hiclass/MultiLabelLocalClassifierPerParentNode.py deleted file mode 100644 index b61a83e8..00000000 --- a/hiclass/MultiLabelLocalClassifierPerParentNode.py +++ /dev/null @@ -1,306 +0,0 @@ -""" -Local classifier per parent node approach. - -Numeric and string output labels are both handled. -""" - -from copy import deepcopy -from collections import defaultdict - -import networkx as nx -import numpy as np -from scipy.sparse import csr_matrix, vstack -from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_array, check_is_fitted - -from hiclass.ConstantClassifier import ConstantClassifier -from hiclass.MultiLabelHierarchicalClassifier import ( - MultiLabelHierarchicalClassifier, - make_leveled, -) - - -class MultiLabelLocalClassifierPerParentNode( - BaseEstimator, MultiLabelHierarchicalClassifier -): - """ - Assign local classifiers to each parent node of the graph. - - A local classifier per parent node is a local hierarchical classifier that fits one multi-class classifier - for each parent node of the class hierarchy. - - Examples - -------- - >>> from hiclass import MultiLabelLocalClassifierPerParentNode - >>> y = [[['1', '1.1'], ['2', '2.1']]] - >>> X = [[1, 2]] - >>> mllcppn = MultiLabelLocalClassifierPerParentNode() - >>> mllcppn.fit(X, y) - >>> mllcppn.predict(X) - array([[['1', '1.1'], - ['2', '2.1']]]) - """ - - def __init__( - self, - local_classifier: BaseEstimator = None, - tolerance: float = None, - verbose: int = 0, - edge_list: str = None, - replace_classifiers: bool = True, - n_jobs: int = 1, - bert: bool = False, - ): - r""" - Initialize a multi-label local classifier per parent node. - - Parameters - ---------- - local_classifier : BaseEstimator, default=LogisticRegression - The local_classifier used to create the collection of local classifiers. Needs to have fit, predict and - clone methods. - tolerance : float, default=None - The tolerance used to determine multi-labels. If set to None, only the child class with highest probability is predicted. - Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted. 
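The tolerance rule just described keeps every child whose probability lies within tolerance of the best child; a self-contained numeric illustration (values made up, using the same comparison the predict code below applies):

import numpy as np

probabilities = np.array([0.60, 0.50, 0.30])
classes = np.array(["b", "c", "d"])
tolerance = 0.15
print(classes[np.greater_equal(probabilities, probabilities.max() - tolerance)])
# -> ['b' 'c']: both fall within 0.15 of the maximum 0.60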
- verbose : int, default=0 - Controls the verbosity when fitting and predicting. - See https://verboselogs.readthedocs.io/en/latest/readme.html#overview-of-logging-levels - for more information. - edge_list : str, default=None - Path to write the hierarchy built. - replace_classifiers : bool, default=True - Turns on (True) the replacement of a local classifier with a constant classifier when trained on only - a single unique class. - n_jobs : int, default=1 - The number of jobs to run in parallel. Only :code:`fit` is parallelized. - If :code:`Ray` is installed it is used, otherwise it defaults to :code:`Joblib`. - bert : bool, default=False - If True, skip scikit-learn's checks and sample_weight passing for BERT. - """ - super().__init__( - local_classifier=local_classifier, - tolerance=tolerance, - verbose=verbose, - edge_list=edge_list, - replace_classifiers=replace_classifiers, - n_jobs=n_jobs, - classifier_abbreviation="LCPPN", - bert=bert, - ) - - def fit(self, X, y, sample_weight=None): - """ - Fit a local classifier per parent node. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csc_matrix``. - y : array-like of shape (n_samples, n_levels) - The target values, i.e., hierarchical class labels for classification. - sample_weight : array-like of shape (n_samples,), default=None - Array of weights that are assigned to individual samples. - If not provided, then each sample is given unit weight. - - Returns - ------- - self : object - Fitted estimator. - """ - # Execute common methods necessary before fitting - super()._pre_fit(X, y, sample_weight) - - # Fit local classifiers in DAG - super().fit(X, y) - - # TODO: Store the classes seen during fit - - # TODO: Add function to allow user to change local classifier - - # TODO: Add parameter to receive hierarchy as parameter in constructor - - # Return the classifier - return self - - def predict(self, X, tolerance: float = None): - r""" - Predict classes for the given data. - - Hierarchical labels are returned. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The input samples. Internally, its dtype will be converted - to ``dtype=np.float32``. If a sparse matrix is provided, it will be - converted into a sparse ``csr_matrix``. - tolerance : float, default=None - The tolerance used to determine multi-labels. - If set to None, only the child class with highest probability is predicted. - Overrides the tolerance set in the constructor. - Otherwise, all child classes with :math:`probability >= max\_prob - tolerance` are predicted. - Returns - ------- - y : ndarray of shape (n_samples,) or (n_samples, n_outputs) - The predicted classes. 
- """ - # Check if fit has been called - check_is_fitted(self) - _tolerance = (tolerance if tolerance is not None else self.tolerance) or 0.0 - - # Input validation - if not self.bert: - X = check_array(X, accept_sparse="csr") - else: - X = np.array(X) - - # TODO: Add threshold to stop prediction halfway if need be - - self.logger_.info("Predicting") - - y = [[] for _ in range(X.shape[0])] - # Predict first level - classifier = self.hierarchy_.nodes[self.root_]["classifier"] - - probabilities = classifier.predict_proba(X) - for probs, ls in zip(probabilities, y): - prediction = classifier.classes_[ - np.greater_equal(probs, np.max(probs) - _tolerance) - ] - for pred in prediction: - ls.append([pred]) - - y = self._predict_remaining_(X, y, _tolerance) - y = make_leveled(y) - y = np.array(y, dtype=self.dtype_) - - # TODO: this only needed for the sklearn check_estimator_sparse_data test to pass :( - # However it also breaks a bunch of other tests - # y = self._convert_to_1d( - # self._convert_to_2d(y) - # ) - self._remove_separator(y) - - return y - - def _get_mask_and_indices(self, y, node): - mask = np.zeros(len(y), dtype=bool) - indicies = defaultdict(lambda: []) - for i, multi_label in enumerate(y): - for j, label in enumerate(multi_label): - if label[-1] == node: - # if label[-1].split(self.separator_)[-1] == node: - mask[i] = True - indicies[i].append(j) - return mask, indicies - - def _get_nodes_to_predict(self, y): - last_predictions = set() - for multi_label in y: - for label in multi_label: - last_predictions.add(label[-1]) - - nodes_to_predict = [] - for node in last_predictions: - if node in self.hierarchy_.nodes and self.hierarchy_.nodes[node].get( - "classifier" - ): - nodes_to_predict.append(node) - - return nodes_to_predict - - def _predict_remaining_(self, X, y, tolerance): - nodes_to_predict = self._get_nodes_to_predict(y) - while nodes_to_predict: - for node in nodes_to_predict: - classifier = self.hierarchy_.nodes[node]["classifier"] - mask, indices = self._get_mask_and_indices(y, node) - subset_x = X[mask] - probabilities = classifier.predict_proba(subset_x) - for probs, (i, ls) in zip(probabilities, indices.items()): - prediction = classifier.classes_[ - np.greater_equal(probs, np.max(probs) - tolerance) - ] - for j in ls: - y[i][j].append(prediction[0]) - for pred in prediction[1:]: - _old_y = y[i][j][:-1].copy() - y[i].insert(j + 1, _old_y + [pred]) - nodes_to_predict = self._get_nodes_to_predict(y) - return y - - def _initialize_local_classifiers(self): - super()._initialize_local_classifiers() - local_classifiers = {} - nodes = self._get_parents() - for node in nodes: - local_classifiers[node] = {"classifier": deepcopy(self.local_classifier_)} - nx.set_node_attributes(self.hierarchy_, local_classifiers) - - def _get_parents(self): - nodes = [] - for node in self.hierarchy_.nodes: - # Skip only leaf nodes - successors = list(self.hierarchy_.successors(node)) - if len(successors) > 0: - nodes.append(node) - return nodes - - def _get_successors(self, node): - successors = list(self.hierarchy_.successors(node)) - mask = np.isin(self.y_, successors).any(axis=(2, 1)) - y = [] - if isinstance(self.X_, csr_matrix): - X = csr_matrix((0, self.X_.shape[1]), dtype=self.X_.dtype) - else: - X = [] - sample_weight = [] if self.sample_weight_ is not None else None - for i in range(self.y_.shape[0]): - if mask[i]: - row = self.y_[i] - labels = np.unique( - row[np.isin(row, successors)] - ) # We do not want to double count the same row, e.g [["a", "b"], ["a", "c"]] should only count 
once for the root classifier with y label "a" - y.extend(labels) - for _ in range(labels.shape[0]): - if isinstance(self.X_, csr_matrix): - X = vstack([X, self.X_[i]]) - else: - X.append(self.X_[i]) - if self.sample_weight_ is not None: - sample_weight.append(self.sample_weight_[i]) - y = np.array(y) - if isinstance(self.X_, np.ndarray): - X = np.array(X) - return X, y, sample_weight - - @staticmethod - def _fit_classifier(self, node): - classifier = self.hierarchy_.nodes[node]["classifier"] - # get children examples - X, y, sample_weight = self._get_successors(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - classifier = ConstantClassifier() - if not self.bert: - try: - classifier.fit(X, y, sample_weight) - except TypeError: - classifier.fit(X, y) - else: - classifier.fit(X, y) - return classifier - - def _fit_digraph(self, local_mode: bool = False, use_joblib: bool = False): - self.logger_.info("Fitting local classifiers") - nodes = self._get_parents() - self._fit_node_classifier(nodes, local_mode, use_joblib) - - def _more_tags(self): - return { - "_xfail_checks": { - "check_estimator_sparse_data": "Multi-label multi-output prediction format is not supported in sklearn" - }, - } diff --git a/hiclass/__init__.py b/hiclass/__init__.py index 57778bac..4986d12e 100644 --- a/hiclass/__init__.py +++ b/hiclass/__init__.py @@ -1,18 +1,13 @@ """Init module for the library.""" import os + from ._version import get_versions +from .FlatClassifier import FlatClassifier from .LocalClassifierPerLevel import LocalClassifierPerLevel from .LocalClassifierPerNode import LocalClassifierPerNode from .LocalClassifierPerParentNode import LocalClassifierPerParentNode from .Pipeline import Pipeline -from .FlatClassifier import FlatClassifier -from .MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode -from .MultiLabelLocalClassifierPerParentNode import ( - MultiLabelLocalClassifierPerParentNode, -) -from .Explainer import Explainer -from ._version import get_versions __version__ = get_versions()["version"] del get_versions @@ -23,8 +18,5 @@ "LocalClassifierPerLevel", "Pipeline", "FlatClassifier", - "Explainer", - "MultiLabelLocalClassifierPerNode", - "MultiLabelLocalClassifierPerParentNode", "datasets", ] diff --git a/hiclass/datasets.py b/hiclass/datasets.py index 55819a8f..01111d45 100644 --- a/hiclass/datasets.py +++ b/hiclass/datasets.py @@ -1,10 +1,12 @@ """Datasets util for downloading and maintaining sample datasets.""" -import requests -import pandas as pd +import csv +import logging import os import tempfile -import logging + +import numpy as np +import requests from sklearn.model_selection import train_test_split # Configure logging @@ -18,7 +20,6 @@ os.makedirs(CACHE_DIR, exist_ok=True) # Dataset urls -PLATYPUS_URL = "https://gist.githubusercontent.com/ashishpatel16/9306f8ed3ed101e7ddcb519776bcbd80/raw/1152c0b9613c2bda144a38fc4f74b5fe12255f4d/platypus_diseases.csv" HIERARCHICAL_TEXT_CLASSIFICATION_URL = ( "https://zenodo.org/record/6657410/files/train_40k.csv?download=1" ) @@ -36,57 +37,6 @@ def _download_file(url, destination): raise RuntimeError(f"Failed to download file from {url}: {str(e)}") - -def load_platypus(test_size=0.3, random_state=42): - """ - Load platypus diseases dataset. - - Parameters - ---------- - test_size : float, default=0.3 - The proportion of the dataset to include in the test split. - random_state : int or None, default=42 - Controls the randomness of the dataset.
Pass an int for reproducible output across multiple function calls. - - Returns - ------- - list - List containing train-test split of inputs. - - Raises - ------ - RuntimeError - If failed to access or process the dataset. - Examples - -------- - >>> from hiclass.datasets import load_platypus - >>> X_train, X_test, Y_train, Y_test = load_platypus() - >>> X_train[:3] - fever diarrhea stomach pain skin rash cough sniffles short breath headache size - 220 37.8 0 3 5 1 1 0 2 27.6 - 539 37.2 0 6 1 1 1 0 3 28.4 - 326 39.9 0 2 5 1 1 1 2 30.7 - >>> X_train.shape, X_test.shape, Y_train.shape, Y_test.shape - (572, 9) (246, 9) (572,) (246,) - """ - dataset_name = "platypus_diseases.csv" - cached_file_path = os.path.join(CACHE_DIR, dataset_name) - - # Check if the file exists in the cache - if not os.path.exists(cached_file_path): - try: - logger.info("Downloading platypus diseases dataset..") - _download_file(PLATYPUS_URL, cached_file_path) - except Exception as e: - raise RuntimeError(f"Failed to access or download dataset: {str(e)}") - - data = pd.read_csv(cached_file_path).fillna(" ") - X = data.drop(["label"], axis=1) - y = pd.Series([eval(val) for val in data["label"]]) - - # Return tuple (X_train, X_test, y_train, y_test) - return train_test_split(X, y, test_size=test_size, random_state=random_state) - - def load_hierarchical_text_classification(test_size=0.3, random_state=42): """ Load hierarchical text classification dataset. @@ -130,9 +80,11 @@ def load_hierarchical_text_classification(test_size=0.3, random_state=42): except Exception as e: raise RuntimeError(f"Failed to access or download dataset: {str(e)}") - data = pd.read_csv(cached_file_path).fillna(" ") - X = data["Title"] - y = data[["Cat1", "Cat2", "Cat3"]] + data = [row for row in csv.reader(open(cached_file_path))] + data.pop(0) + data = np.array(data, dtype=object) + X = data[:, 1] + y = data[:, 7:] # Return tuple (X_train, X_test, y_train, y_test) return train_test_split(X, y, test_size=test_size, random_state=random_state) diff --git a/hiclass/metrics.py b/hiclass/metrics.py index d281bc9e..a41f9d92 100644 --- a/hiclass/metrics.py +++ b/hiclass/metrics.py @@ -278,12 +278,6 @@ def f1( 1.0 >>> f1(y_true, y_pred, zero_division=np.nan) nan - - >>> # multilabel hierarchical classification - >>> y_true = [[["a", "b", "c"]], [["d", "e", "f"]], [["g", "h", "i"]]] - >>> y_pred = [[["a", "b", "c"]], [["d", "e", "f"]], [["g", "h", "i"]]] - >>> f1(y_true, y_pred) - 1.0 """ y_true, y_pred = _validate_input(y_true, y_pred) functions = { diff --git a/setup.cfg b/setup.cfg index 27599a0b..a74e8139 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ exclude = **/__init__.py, docs/source/conf.py ;file.py: error [requires] -python_version = ">=3.8,<3.13" +python_version = ">=3.9,<3.13" # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the diff --git a/setup.py b/setup.py index 7d674908..f9eefadc 100644 --- a/setup.py +++ b/setup.py @@ -23,17 +23,16 @@ URL_ISSUES = "https://github.com/scikit-learn-contrib/hiclass/issues" EMAIL = "fabio.malchermiranda@hpi.de, Niklas.Koehnecke@student.hpi.uni-potsdam.de" AUTHOR = "Fabio Malcher Miranda, Niklas Koehnecke" -REQUIRES_PYTHON = ">=3.8,<3.13" +REQUIRES_PYTHON = ">=3.9,<3.13" KEYWORDS = ["hierarchical classification"] DACS_SOFTWARE = "https://gitlab.com/dacs-hpi" # What packages are required for this module to be executed? 
-REQUIRED = ["networkx", "numpy", "scikit-learn<1.5", "scipy<1.13"] +REQUIRED = ["networkx", "numpy", "scikit-learn>=1.5", "scipy"] # What packages are optional? # 'fancy feature': ['django'],} EXTRAS = { "ray": ["ray>=1.11.0"], - "xai": ["shap==0.44.1", "xarray==2023.1.0"], "dev": [ "flake8", "pytest", @@ -43,9 +42,6 @@ "black==24.2.0", "pre-commit==2.20.0", "ray", - "shap==0.44.1", - "xarray==2023.1.0", - "bert-sklearn", ], } @@ -156,9 +152,10 @@ def run(self): "Operating System :: Unix", "Operating System :: MacOS", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ], diff --git a/tests/test_BinaryPolicy.py b/tests/test_BinaryPolicy.py index 7783536d..ec2dd90c 100644 --- a/tests/test_BinaryPolicy.py +++ b/tests/test_BinaryPolicy.py @@ -76,6 +76,22 @@ def features_2d(): @pytest.fixture def features_sparse(): + return csr_matrix( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + [13, 14], + [15, 16], + ] + ) + + +@pytest.fixture +def features_sparse_3d(): return csr_matrix( [ [1, 2], @@ -1064,9 +1080,9 @@ def test_siblings_get_binary_examples_sparse_labels_2d_3( def test_siblings_get_binary_examples_sparse_labels_3d_1( - digraph, features_sparse, labels_3d + digraph, features_sparse_3d, labels_3d ): - policy = SiblingsPolicy(digraph, features_sparse, labels_3d) + policy = SiblingsPolicy(digraph, features_sparse_3d, labels_3d) ground_truth_x = [ [1, 2], [3, 4], @@ -1088,9 +1104,9 @@ def test_siblings_get_binary_examples_sparse_labels_3d_1( def test_siblings_get_binary_examples_sparse_labels_3d_2( - digraph, features_sparse, labels_3d + digraph, features_sparse_3d, labels_3d ): - policy = SiblingsPolicy(digraph, features_sparse, labels_3d) + policy = SiblingsPolicy(digraph, features_sparse_3d, labels_3d) ground_truth_x = [ [5, 6], [7, 8], @@ -1112,9 +1128,9 @@ def test_siblings_get_binary_examples_sparse_labels_3d_2( def test_siblings_get_binary_examples_sparse_labels_3d_3( - digraph, features_sparse, labels_3d + digraph, features_sparse_3d, labels_3d ): - policy = SiblingsPolicy(digraph, features_sparse, labels_3d) + policy = SiblingsPolicy(digraph, features_sparse_3d, labels_3d) ground_truth_x = [ [5, 6], [9, 10], diff --git a/tests/test_Datasets.py b/tests/test_Datasets.py index 1d47ba5e..b86c0307 100644 --- a/tests/test_Datasets.py +++ b/tests/test_Datasets.py @@ -1,29 +1,12 @@ -import numpy as np -import pytest - -import hiclass.datasets -from hiclass.datasets import load_platypus, load_hierarchical_text_classification import os import tempfile +import numpy as np +import pytest +from numpy.testing import assert_array_equal -def test_load_platypus_output_shape(): - X_train, X_test, y_train, y_test = load_platypus(test_size=0.2, random_state=42) - assert X_train.shape[0] == y_train.shape[0] - assert X_test.shape[0] == y_test.shape[0] - - -def test_load_platypus_random_state(): - X_train_1, X_test_1, y_train_1, y_test_1 = load_platypus( - test_size=0.2, random_state=42 - ) - X_train_2, X_test_2, y_train_2, y_test_2 = load_platypus( - test_size=0.2, random_state=42 - ) - assert (X_train_1.values == X_train_2.values).all() - assert (X_test_1.values == X_test_2.values).all() - assert (y_train_1.index == y_train_2.index).all() - assert (y_test_1.index 
== y_test_2.index).all() +import hiclass.datasets +from hiclass.datasets import load_hierarchical_text_classification def test_load_hierarchical_text_classification_shape(): @@ -41,10 +24,10 @@ def test_load_hierarchical_text_classification_random_state(): X_train_2, X_test_2, y_train_2, y_test_2 = load_hierarchical_text_classification( test_size=0.2, random_state=42 ) - assert (X_train_1 == X_train_2).all() - assert (X_test_1 == X_test_2).all() - assert (y_train_1.index == y_train_2.index).all() - assert (y_test_1.index == y_test_2.index).all() + assert_array_equal(X_train_1, X_train_2) + assert_array_equal(X_test_1, X_test_2) + assert_array_equal(y_train_1, y_train_2) + assert_array_equal(y_test_1, y_test_2) def test_load_hierarchical_text_classification_file_exists(): @@ -59,21 +42,9 @@ def test_load_hierarchical_text_classification_file_exists(): assert os.path.exists(cached_file_path) -def test_load_platypus_file_exists(): - dataset_name = "platypus_diseases.csv" - cached_file_path = os.path.join(tempfile.gettempdir(), dataset_name) - - if os.path.exists(cached_file_path): - os.remove(cached_file_path) - - if not os.path.exists(cached_file_path): - load_platypus() - assert os.path.exists(cached_file_path) - - def test_download_dataset(): - dataset_name = "platypus_diseases_test.csv" - url = hiclass.datasets.PLATYPUS_URL + dataset_name = "hierarchical_text_classification.csv" + url = hiclass.datasets.HIERARCHICAL_TEXT_CLASSIFICATION_URL cached_file_path = os.path.join(tempfile.gettempdir(), dataset_name) if os.path.exists(cached_file_path): @@ -84,22 +55,6 @@ def test_download_dataset(): assert os.path.exists(cached_file_path) -def test_download_error_load_platypus(): - dataset_name = "platypus_diseases.csv" - backup_url = hiclass.datasets.PLATYPUS_URL - hiclass.datasets.PLATYPUS_URL = "" - cached_file_path = os.path.join(tempfile.gettempdir(), dataset_name) - - if os.path.exists(cached_file_path): - os.remove(cached_file_path) - - if not os.path.exists(cached_file_path): - with pytest.raises(RuntimeError): - load_platypus() - - hiclass.datasets.PLATYPUS_URL = backup_url - - def test_download_error_load_hierarchical_text(): dataset_name = "hierarchical_text_classification.csv" backup_url = hiclass.datasets.HIERARCHICAL_TEXT_CLASSIFICATION_URL @@ -117,5 +72,4 @@ def test_download_error_load_hierarchical_text(): def test_url_links(): - assert hiclass.datasets.PLATYPUS_URL != "" assert hiclass.datasets.HIERARCHICAL_TEXT_CLASSIFICATION_URL != "" diff --git a/tests/test_Explainer.py b/tests/test_Explainer.py deleted file mode 100644 index 303216f6..00000000 --- a/tests/test_Explainer.py +++ /dev/null @@ -1,245 +0,0 @@ -import numpy as np -import pytest -from sklearn.ensemble import RandomForestClassifier -from hiclass import ( - LocalClassifierPerLevel, - LocalClassifierPerParentNode, - LocalClassifierPerNode, - Explainer, -) - -try: - import shap -except ImportError: - shap_installed = False -else: - shap_installed = True - -try: - import xarray -except ImportError: - xarray_installed = False -else: - xarray_installed = True - - -@pytest.fixture -def explainer_data(): - x_train = np.random.randn(4, 3) - y_train = np.array( - [["a", "b", "d"], ["a", "b", "e"], ["a", "c", "f"], ["a", "c", "g"]] - ) - x_test = np.random.randn(5, 3) - - return x_train, x_test, y_train - - -@pytest.fixture -def explainer_data_no_root(): - x_train = np.random.randn(6, 3) - y_train = np.array( - [ - ["a", "b", "c"], - ["x", "y", "z"], - ["a", "b", "c"], - ["x", "y", "z"], - ["a", "b", "c"], - ["x", "y", "z"], - ] 
- ) - x_test = np.random.randn(5, 3) - return x_train, x_test, y_train - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_explainer_tree_lcppn(data, request): - rfc = RandomForestClassifier() - lcppn = LocalClassifierPerParentNode( - local_classifier=rfc, replace_classifiers=False - ) - - x_train, x_test, y_train = request.getfixturevalue(data) - - lcppn.fit(x_train, y_train) - - explainer = Explainer(lcppn, data=x_train, mode="tree") - explanations = explainer.explain(x_test) - - # Assert if explainer returns an xarray.Dataset object - assert isinstance(explanations, xarray.Dataset) - - # Assert if predictions made are consistent with the explanation object - y_preds = lcppn.predict(x_test) - for i in range(len(x_test)): - y_pred = y_preds[i] - explanation = explanations["predicted_class"][i] - for j in range(len(y_pred)): - assert explanation.data[j].split(lcppn.separator_)[-1] == y_pred[j] - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.skipif(not xarray_installed, reason="xarray not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_explainer_tree_lcpn(data, request): - rfc = RandomForestClassifier() - lcpn = LocalClassifierPerNode(local_classifier=rfc, replace_classifiers=False) - - x_train, x_test, y_train = request.getfixturevalue(data) - - lcpn.fit(x_train, y_train) - - explainer = Explainer(lcpn, data=x_train, mode="tree") - explanations = explainer.explain(x_test) - - # Assert if explainer returns an xarray.Dataset object - assert isinstance(explanations, xarray.Dataset) - y_preds = lcpn.predict(x_test) - - # Assert if predictions made are consistent with the explanation object - for i in range(len(x_test)): - y_pred = y_preds[i] - for j in range(len(y_pred)): - assert str(explanations["node"][i].data[j]) == y_pred[j] - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_explainer_tree_lcpl(data, request): - rfc = RandomForestClassifier() - lcpl = LocalClassifierPerLevel(local_classifier=rfc, replace_classifiers=False) - - x_train, x_test, y_train = request.getfixturevalue(data) - - lcpl.fit(x_train, y_train) - - explainer = Explainer(lcpl, data=x_train, mode="tree") - explanations = explainer.explain(x_test) - assert explanations is not None - y_preds = lcpl.predict(x_test) - for i in range(len(x_test)): - y_pred = y_preds[i] - for j in range(len(y_pred)): - assert str(explanations["node"][i].data[j]) == y_pred[j] - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_traversal_path_lcppn(data, request): - x_train, x_test, y_train = request.getfixturevalue(data) - rfc = RandomForestClassifier() - lcppn = LocalClassifierPerParentNode( - local_classifier=rfc, replace_classifiers=False - ) - - lcppn.fit(x_train, y_train) - explainer = Explainer(lcppn, data=x_train, mode="tree") - traversals = explainer._get_traversed_nodes_lcppn(x_test) - preds = lcppn.predict(x_test) - assert len(preds) == len(traversals) - for i in range(len(x_test)): - for j in range(len(traversals[i])): - if traversals[i][j] == lcppn.root_: - continue - label = traversals[i][j].split(lcppn.separator_)[-1] - assert label == preds[i][j - 1] - - -@pytest.mark.skipif(not shap_installed, reason="shap 
not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_traversal_path_lcpn(data, request): - x_train, x_test, y_train = request.getfixturevalue(data) - rfc = RandomForestClassifier() - lcpn = LocalClassifierPerNode(local_classifier=rfc, replace_classifiers=False) - - lcpn.fit(x_train, y_train) - explainer = Explainer(lcpn, data=x_train, mode="tree") - traversals = explainer._get_traversed_nodes_lcpn(x_test) - preds = lcpn.predict(x_test) - - # Assert if predictions and traversals are of same length - assert len(preds) == len(traversals) - - # Assert if traversal path in predictions is same as the computed traversal path - for i in range(len(x_test)): - for j in range(len(traversals[i])): - label = traversals[i][j].split(lcpn.separator_)[-1] - assert label == preds[i][j] - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -def test_traversal_path_lcpl(data, request): - x_train, x_test, y_train = request.getfixturevalue(data) - rfc = RandomForestClassifier() - lcpl = LocalClassifierPerLevel(local_classifier=rfc, replace_classifiers=False) - - lcpl.fit(x_train, y_train) - explainer = Explainer(lcpl, data=x_train, mode="tree") - traversals = explainer._get_traversed_nodes_lcpl(x_test) - preds = lcpl.predict(x_test) - assert len(preds) == len(traversals) - for i in range(len(x_test)): - for j in range(len(traversals[i])): - label = traversals[i][j].split(lcpl.separator_)[-1] - assert label == preds[i][j] - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.skipif(not xarray_installed, reason="xarray not installed") -@pytest.mark.parametrize("data", ["explainer_data", "explainer_data_no_root"]) -@pytest.mark.parametrize( - "classifier", - [LocalClassifierPerLevel, LocalClassifierPerParentNode, LocalClassifierPerNode], -) -def test_explain_with_xr(data, request, classifier): - x_train, x_test, y_train = request.getfixturevalue(data) - rfc = RandomForestClassifier() - clf = classifier(local_classifier=rfc, replace_classifiers=False) - - clf.fit(x_train, y_train) - explainer = Explainer(clf, data=x_train, mode="tree") - explanations = explainer._explain_with_xr(x_test) - - # Assert if explainer returns an xarray.Dataset object - assert isinstance(explanations, xarray.Dataset) - - -@pytest.mark.parametrize( - "classifier", - [LocalClassifierPerParentNode, LocalClassifierPerLevel, LocalClassifierPerNode], -) -def test_imports(classifier): - x_train = [[76, 12, 49], [88, 63, 31], [5, 42, 24], [17, 90, 55]] - y_train = [["a", "b", "d"], ["a", "b", "e"], ["a", "c", "f"], ["a", "c", "g"]] - - rfc = RandomForestClassifier() - clf = classifier(local_classifier=rfc, replace_classifiers=False) - clf.fit(x_train, y_train) - - explainer = Explainer(clf, data=x_train, mode="tree") - assert isinstance(explainer.data, np.ndarray) - - -@pytest.mark.skipif(not shap_installed, reason="shap not installed") -@pytest.mark.parametrize( - "classifier", - [LocalClassifierPerLevel, LocalClassifierPerParentNode, LocalClassifierPerNode], -) -@pytest.mark.parametrize("data", ["explainer_data"]) -@pytest.mark.parametrize("mode", ["linear", "gradient", "deep", "tree", ""]) -def test_explainers(data, request, classifier, mode): - x_train, x_test, y_train = request.getfixturevalue(data) - rfc = RandomForestClassifier() - clf = classifier(local_classifier=rfc, replace_classifiers=False) - - clf.fit(x_train, y_train) - explainer = Explainer(clf, 
-
-
-@pytest.mark.skipif(not shap_installed, reason="shap not installed")
-@pytest.mark.parametrize(
-    "classifier",
-    [LocalClassifierPerLevel, LocalClassifierPerParentNode, LocalClassifierPerNode],
-)
-@pytest.mark.parametrize("data", ["explainer_data"])
-@pytest.mark.parametrize("mode", ["linear", "gradient", "deep", "tree", ""])
-def test_explainers(data, request, classifier, mode):
-    x_train, x_test, y_train = request.getfixturevalue(data)
-    rfc = RandomForestClassifier()
-    clf = classifier(local_classifier=rfc, replace_classifiers=False)
-
-    clf.fit(x_train, y_train)
-    explainer = Explainer(clf, data=x_train, mode=mode)
-    mode_mapping = {
-        "linear": shap.LinearExplainer,
-        "gradient": shap.GradientExplainer,
-        "deep": shap.DeepExplainer,
-        "tree": shap.TreeExplainer,
-        "": shap.Explainer,
-    }
-    assert explainer.explainer == mode_mapping[mode]
diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py
index 6d02571b..0a87745d 100644
--- a/tests/test_HierarchicalClassifier.py
+++ b/tests/test_HierarchicalClassifier.py
@@ -219,13 +219,3 @@ def test_fit_classifier():
 def test_fit_digraph():
     with pytest.raises(NotImplementedError):
         HierarchicalClassifier._fit_digraph(None, None)
-
-
-def test_pre_fit_bert():
-    classifier = HierarchicalClassifier()
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.bert = True
-    x = [[0, 1], [2, 3]]
-    y = [["a", "b"], ["c", "d"]]
-    sample_weight = None
-    classifier._pre_fit(x, y, sample_weight)
diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py
index 880fc904..58e37787 100644
--- a/tests/test_LocalClassifierPerLevel.py
+++ b/tests/test_LocalClassifierPerLevel.py
@@ -298,22 +298,3 @@ def test_fit_calibrate_predict_proba():
     assert_array_almost_equal(
         np.sum(proba[1], axis=1), np.ones(len(proba[1])), decimal=10
     )
-
-
-def test_fit_calibrate_predict_predict_proba_bert():
-    classifier = LocalClassifierPerLevel(
-        local_classifier=LogisticRegression(),
-        return_all_probabilities=True,
-        calibration_method="ivap",
-        probability_combiner="geometric",
-    )
-
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.bert = True
-    x = [[0, 1], [2, 3]]
-    y = [["a", "b"], ["c", "d"]]
-    sample_weight = None
-    classifier.fit(x, y, sample_weight)
-    classifier.calibrate(x, y)
-    classifier.predict(x)
-    classifier.predict_proba(x)
diff --git a/tests/test_LocalClassifierPerNode.py b/tests/test_LocalClassifierPerNode.py
index 2bcdf45c..dfb99713 100644
--- a/tests/test_LocalClassifierPerNode.py
+++ b/tests/test_LocalClassifierPerNode.py
@@ -410,22 +410,3 @@ def test_fit_calibrate_predict_proba():
     assert_array_almost_equal(
         np.sum(proba[1], axis=1), np.ones(len(proba[1])), decimal=10
     )
-
-
-def test_fit_calibrate_predict_predict_proba_bert():
-    classifier = LocalClassifierPerNode(
-        local_classifier=LogisticRegression(),
-        return_all_probabilities=True,
-        calibration_method="ivap",
-        probability_combiner="geometric",
-    )
-
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.bert = True
-    x = [[0, 1], [2, 3]]
-    y = [["a", "b"], ["c", "d"]]
-    sample_weight = None
-    classifier.fit(x, y, sample_weight)
-    classifier.calibrate(x, y)
-    classifier.predict(x)
-    classifier.predict_proba(x)
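The three *_bert variants deleted above shared one flow: fit, calibrate with
the "ivap" method, then predict and predict_proba. The same flow still works
with a plain scikit-learn estimator; a minimal sketch with toy data
(illustrative only, not a drop-in replacement test, and it assumes hiclass is
installed):

from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

classifier = LocalClassifierPerNode(
    local_classifier=LogisticRegression(),
    return_all_probabilities=True,
    calibration_method="ivap",
    probability_combiner="geometric",
)

x = [[0, 1], [2, 3]]
y = [["a", "b"], ["c", "d"]]
classifier.fit(x, y)  # trains one binary classifier per node
classifier.calibrate(x, y)  # calibrates the local probability estimates
predictions = classifier.predict(x)
probabilities = classifier.predict_proba(x)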
diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py
index 268fc990..2d4a49da 100644
--- a/tests/test_LocalClassifierPerParentNode.py
+++ b/tests/test_LocalClassifierPerParentNode.py
@@ -4,7 +4,6 @@
 import networkx as nx
 import numpy as np
 import pytest
-from bert_sklearn import BertClassifier
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 from scipy.sparse import csr_matrix
 from sklearn.exceptions import NotFittedError
@@ -376,56 +375,3 @@ def test_fit_calibrate_predict_proba():
     assert_array_almost_equal(
         np.sum(proba[1], axis=1), np.ones(len(proba[1])), decimal=10
     )
-
-
-def test_fit_calibrate_predict_predict_proba_bert():
-    classifier = LocalClassifierPerParentNode(
-        local_classifier=LogisticRegression(),
-        return_all_probabilities=True,
-        calibration_method="ivap",
-        probability_combiner="geometric",
-    )
-
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.bert = True
-    x = [[0, 1], [2, 3]]
-    y = [["a", "b"], ["c", "d"]]
-    sample_weight = None
-    classifier.fit(x, y, sample_weight)
-    classifier.calibrate(x, y)
-    classifier.predict(x)
-    classifier.predict_proba(x)
-
-
-# Note: BERT only works with the local classifier per parent node.
-# It does not have the attribute classes_, which is necessary
-# for the local classifiers per level and per node.
-def test_fit_bert():
-    bert = BertClassifier()
-    clf = LocalClassifierPerParentNode(
-        local_classifier=bert,
-        bert=True,
-    )
-    x = ["Batman", "rorschach"]
-    y = [
-        ["Action", "The Dark Night"],
-        ["Action", "Watchmen"],
-    ]
-    clf.fit(x, y)
-    check_is_fitted(clf)
-    predictions = clf.predict(x)
-    assert_array_equal(y, predictions)
-
-
-def test_bert_unleveled():
-    clf = LocalClassifierPerParentNode(
-        local_classifier=BertClassifier(),
-        bert=True,
-    )
-    x = ["Batman", "Jaws"]
-    y = [["Action", "The Dark Night"], ["Thriller"]]
-    ground_truth = [["Action", "The Dark Night"], ["Action", "The Dark Night"]]
-    clf.fit(x, y)
-    check_is_fitted(clf)
-    predictions = clf.predict(x)
-    assert_array_equal(ground_truth, predictions)
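The note above is why only these two tests used BertClassifier with the
per-parent-node classifier: the per-level and per-node classifiers rely on the
local estimator's classes_ attribute (for example, to map predict_proba
columns back to labels), which BertClassifier does not expose. A hypothetical
guard illustrating the requirement (has_classes_ is illustrative, not HiClass
API):

from sklearn.linear_model import LogisticRegression

def has_classes_(estimator) -> bool:
    """Return True if a fitted estimator exposes scikit-learn's classes_."""
    return hasattr(estimator, "classes_")

clf = LogisticRegression().fit([[0], [1]], ["a", "b"])
assert has_classes_(clf)  # scikit-learn classifiers set classes_ during fit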
diff --git a/tests/test_MultiLabelHierarchicalClassifier.py b/tests/test_MultiLabelHierarchicalClassifier.py
deleted file mode 100644
index 9f72c928..00000000
--- a/tests/test_MultiLabelHierarchicalClassifier.py
+++ /dev/null
@@ -1,287 +0,0 @@
-import logging
-import tempfile
-
-import networkx as nx
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-from sklearn.linear_model import LogisticRegression
-
-from hiclass.MultiLabelHierarchicalClassifier import (
-    MultiLabelHierarchicalClassifier,
-    make_leveled,
-)
-
-
-@pytest.fixture
-def ambiguous_node_str():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array(
-        [
-            [["a", "b"], ["", ""]],
-            [["b", "c"], ["", ""]],
-            [["d", "e"], ["f", "g"]],
-        ]
-    )
-    return classifier
-
-
-def test_disambiguate_str(ambiguous_node_str):
-    ground_truth = np.array(
-        [
-            [["a", "a::HiClass::Separator::b"], ["", ""]],
-            [["b", "b::HiClass::Separator::c"], ["", ""]],
-            [["d", "d::HiClass::Separator::e"], ["f", "f::HiClass::Separator::g"]],
-        ]
-    )
-    ambiguous_node_str._disambiguate()
-    assert_array_equal(ground_truth, ambiguous_node_str.y_)
-
-
-@pytest.fixture
-def ambiguous_node_int():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array(
-        [
-            [[1, 2], ["", ""]],
-            [[2, 3], ["", ""]],
-            [[4, 5], [6, 7]],
-        ]
-    )
-    return classifier
-
-
-def test_disambiguate_int(ambiguous_node_int):
-    ground_truth = np.array(
-        [
-            [["1", "1::HiClass::Separator::2"], ["", ""]],
-            [["2", "2::HiClass::Separator::3"], ["", ""]],
-            [["4", "4::HiClass::Separator::5"], ["6", "6::HiClass::Separator::7"]],
-        ]
-    )
-    ambiguous_node_int._disambiguate()
-    assert_array_equal(ground_truth, ambiguous_node_int.y_)
-
-
-@pytest.fixture
-def graph_1d():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array(["a", "b", "c", "d"])
-    classifier.logger_ = logging.getLogger("HC")
-    return classifier
-
-
-def test_create_digraph_1d(graph_1d):
-    ground_truth = nx.DiGraph()
-    ground_truth.add_nodes_from(np.array(["a", "b", "c", "d"]))
-    graph_1d._create_digraph()
-    assert nx.is_isomorphic(ground_truth, graph_1d.hierarchy_)
-    assert list(ground_truth.nodes) == list(graph_1d.hierarchy_.nodes)
-    assert list(ground_truth.edges) == list(graph_1d.hierarchy_.edges)
-
-
-@pytest.fixture
-def graph_1d_disguised_as_2d():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array([["a"], ["b"], ["c"], ["d"]])
-    classifier.logger_ = logging.getLogger("HC")
-    return classifier
-
-
-def test_create_digraph_1d_disguised_as_2d(graph_1d_disguised_as_2d):
-    ground_truth = nx.DiGraph()
-    ground_truth.add_nodes_from(np.array(["a", "b", "c", "d"]))
-    graph_1d_disguised_as_2d._create_digraph()
-    assert nx.is_isomorphic(ground_truth, graph_1d_disguised_as_2d.hierarchy_)
-    assert list(ground_truth.nodes) == list(graph_1d_disguised_as_2d.hierarchy_.nodes)
-    assert list(ground_truth.edges) == list(graph_1d_disguised_as_2d.hierarchy_.edges)
-
-
-@pytest.fixture
-def digraph_2d():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array([["a", "b", "c"], ["d", "e", "f"]])
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.separator_ = "::HiClass::Separator::"
-    return classifier
-
-
-def test_create_digraph_2d(digraph_2d):
-    ground_truth = nx.DiGraph([("a", "b"), ("b", "c"), ("d", "e"), ("e", "f")])
-    digraph_2d._create_digraph()
-    assert nx.is_isomorphic(ground_truth, digraph_2d.hierarchy_)
-    assert list(ground_truth.nodes) == list(digraph_2d.hierarchy_.nodes)
-    assert list(ground_truth.edges) == list(digraph_2d.hierarchy_.edges)
-
-
-@pytest.fixture
-def digraph_3d():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array(
-        [
-            [["a", "b", "c"], ["d", "e", "f"]],
-            [["g", "h", "i"], ["j", "k", "l"]],
-        ]
-    )
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.separator_ = "::HiClass::Separator::"
-    return classifier
-
-
-def test_create_digraph_3d(digraph_3d):
-    ground_truth = nx.DiGraph(
-        [
-            ("a", "b"),
-            ("b", "c"),
-            ("d", "e"),
-            ("e", "f"),
-            ("g", "h"),
-            ("h", "i"),
-            ("j", "k"),
-            ("k", "l"),
-        ]
-    )
-    digraph_3d._create_digraph()
-    assert nx.is_isomorphic(ground_truth, digraph_3d.hierarchy_)
-    assert list(ground_truth.nodes) == list(digraph_3d.hierarchy_.nodes)
-    assert list(ground_truth.edges) == list(digraph_3d.hierarchy_.edges)
-
-
-@pytest.fixture
-def digraph_3d_no_children():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.y_ = np.array([[["a"]], [["b"]]])
-    classifier.logger_ = logging.getLogger("HC")
-    classifier.separator_ = "::HiClass::Separator::"
-    return classifier
-
-
-def test_create_digraph_3d_single_column(digraph_3d_no_children):
-    ground_truth = nx.DiGraph()
-    ground_truth.add_node("a")
-    ground_truth.add_node("b")
-
-    digraph_3d_no_children._create_digraph()
-    assert nx.is_isomorphic(ground_truth, digraph_3d_no_children.hierarchy_)
-    assert list(ground_truth.nodes) == list(digraph_3d_no_children.hierarchy_.nodes)
-    assert list(ground_truth.edges) == list(digraph_3d_no_children.hierarchy_.edges)
-
-
-def test_export_digraph(digraph_2d):
-    digraph_2d.hierarchy_ = nx.DiGraph([("a", "b"), ("b", "c"), ("d", "e"), ("e", "f")])
-    digraph_2d.edge_list = tempfile.TemporaryFile()
-    ground_truth = b'"a","b",{}\n"b","c",{}\n"d","e",{}\n"e","f",{}\n'
-    digraph_2d._export_digraph()
-    digraph_2d.edge_list.seek(0)
-    assert digraph_2d.edge_list.read() == ground_truth
-
-
-@pytest.fixture
-def cyclic_graph():
-    classifier = MultiLabelHierarchicalClassifier()
-    classifier.hierarchy_ = nx.DiGraph([("a", "b"), ("b", "c"), ("c", "a")])
-    classifier.logger_ = logging.getLogger("HC")
-    return classifier
-
-
-def test_assert_digraph_is_dag(cyclic_graph):
-    with pytest.raises(ValueError):
-        cyclic_graph._assert_digraph_is_dag()
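The two disambiguation tests above pin down the transform: within each label
path, a child is prefixed with its already disambiguated parent and the
"::HiClass::Separator::" string, so equal names under different parents become
distinct graph nodes, and integer labels are converted to strings. A
pure-Python sketch of one label path; the real _disambiguate operates on the
whole numpy array:

SEPARATOR = "::HiClass::Separator::"

def disambiguate_path(path):
    """Prefix each label with its parent chain, leaving "" padding untouched."""
    result, prefix = [], ""
    for label in path:
        label = str(label)
        if label == "":
            result.append("")
            continue
        node = label if prefix == "" else prefix + SEPARATOR + label
        result.append(node)
        prefix = node
    return result

assert disambiguate_path(["a", "b"]) == ["a", "a" + SEPARATOR + "b"]
assert disambiguate_path([4, 5]) == ["4", "4" + SEPARATOR + "5"]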
[["d"]], - ] - ) - graph_1d._convert_1d_or_2d_y_to_3d() - assert_array_equal(ground_truth, graph_1d.y_) - - -def test_convert_2d_y_to_3d(digraph_2d): - ground_truth = np.array( - [ - [["a", "b", "c"]], - [["d", "e", "f"]], - ] - ) - digraph_2d._convert_1d_or_2d_y_to_3d() - assert_array_equal(ground_truth, digraph_2d.y_) - - -@pytest.fixture -def digraph_one_root(): - classifier = MultiLabelHierarchicalClassifier() - classifier.logger_ = logging.getLogger("HC") - classifier.hierarchy_ = nx.DiGraph([("a", "b"), ("b", "c"), ("c", "d")]) - return classifier - - -def test_add_artificial_root(digraph_one_root): - digraph_one_root._add_artificial_root() - successors = list(digraph_one_root.hierarchy_.successors("hiclass::root")) - assert ["a"] == successors - assert "hiclass::root" == digraph_one_root.root_ - - -@pytest.fixture -def digraph_multiple_roots(): - classifier = MultiLabelHierarchicalClassifier() - classifier.logger_ = logging.getLogger("HC") - classifier.hierarchy_ = nx.DiGraph([("a", "b"), ("c", "d"), ("e", "f")]) - classifier.X_ = np.array([[1, 2], [3, 4], [5, 6]]) - classifier.y_ = np.array([["a", "b"], ["c", "d"], ["e", "f"]]) - classifier.sample_weight_ = None - return classifier - - -def test_add_artificial_root_multiple_roots(digraph_multiple_roots): - digraph_multiple_roots._add_artificial_root() - successors = list(digraph_multiple_roots.hierarchy_.successors("hiclass::root")) - assert ["a", "c", "e"] == successors - assert "hiclass::root" == digraph_multiple_roots.root_ - - -def test_initialize_local_classifiers_2(digraph_multiple_roots): - digraph_multiple_roots.local_classifier = None - digraph_multiple_roots._initialize_local_classifiers() - assert isinstance(digraph_multiple_roots.local_classifier_, LogisticRegression) - - -def test_clean_up(digraph_multiple_roots): - digraph_multiple_roots._clean_up() - with pytest.raises(AttributeError): - assert digraph_multiple_roots.X_ is None - with pytest.raises(AttributeError): - assert digraph_multiple_roots.y_ is None - - -@pytest.fixture -def noniterable_y(): - y = [1, 2, 3] - return y - - -def test_make_leveled_non_iterable_y(noniterable_y): - assert noniterable_y == make_leveled(noniterable_y) - - -def test_make_leveled_example_y(): - y = [[["a"]], [["b", "c"]]] - ground_truth = np.array([[["a", ""]], [["b", "c"]]]) - assert_array_equal(ground_truth, make_leveled(y)) - - -def test_make_leveled_multicharacter_nodes_y(): - y = [[["node1"]], [["node2", "node3"]]] - ground_truth = np.array([[["node1", ""]], [["node2", "node3"]]]) - assert_array_equal(ground_truth, make_leveled(y)) - - -def test_fit_classifier(): - with pytest.raises(NotImplementedError): - MultiLabelHierarchicalClassifier._fit_classifier(None, None) diff --git a/tests/test_MultiLabelLocalClassifierPerNode.py b/tests/test_MultiLabelLocalClassifierPerNode.py deleted file mode 100644 index 2fb3ee36..00000000 --- a/tests/test_MultiLabelLocalClassifierPerNode.py +++ /dev/null @@ -1,357 +0,0 @@ -import logging - -import networkx as nx -import numpy as np -import pytest -from numpy.testing import assert_array_equal -from scipy.sparse import csr_matrix -from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LogisticRegression -from sklearn.utils.estimator_checks import parametrize_with_checks -from sklearn.utils.validation import check_is_fitted - -from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode -from hiclass.BinaryPolicy import ExclusivePolicy -from hiclass.ConstantClassifier import ConstantClassifier 
diff --git a/tests/test_MultiLabelLocalClassifierPerNode.py b/tests/test_MultiLabelLocalClassifierPerNode.py
deleted file mode 100644
index 2fb3ee36..00000000
--- a/tests/test_MultiLabelLocalClassifierPerNode.py
+++ /dev/null
@@ -1,357 +0,0 @@
-import logging
-
-import networkx as nx
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-from scipy.sparse import csr_matrix
-from sklearn.exceptions import NotFittedError
-from sklearn.linear_model import LogisticRegression
-from sklearn.utils.estimator_checks import parametrize_with_checks
-from sklearn.utils.validation import check_is_fitted
-
-from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
-from hiclass.BinaryPolicy import ExclusivePolicy
-from hiclass.ConstantClassifier import ConstantClassifier
-import sklearn.utils.validation
-import functools
-
-
-@parametrize_with_checks([MultiLabelLocalClassifierPerNode()])
-def test_sklearn_compatible_estimator(estimator, check):
-    check(estimator)
-
-
-@pytest.fixture
-def digraph_with_policy():
-    digraph = MultiLabelLocalClassifierPerNode(binary_policy="exclusive")
-    digraph.hierarchy_ = nx.DiGraph([("a", "b")])
-    digraph.X_ = np.array([1, 2])
-    digraph.y_ = np.array([[["a", "b"]]])
-    digraph.logger_ = logging.getLogger("LCPN")
-    digraph.sample_weight_ = None
-    return digraph
-
-
-def test_initialize_binary_policy(digraph_with_policy):
-    digraph_with_policy._initialize_binary_policy()
-    assert isinstance(digraph_with_policy.binary_policy_, ExclusivePolicy)
-
-
-@pytest.fixture
-def digraph_with_unknown_policy():
-    digraph = MultiLabelLocalClassifierPerNode(binary_policy="unknown")
-    digraph.hierarchy_ = nx.DiGraph([("a", "b")])
-    digraph.y_ = np.array([[["a", "b"]]])
-    digraph.logger_ = logging.getLogger("LCPN")
-    return digraph
-
-
-def test_initialize_unknown_binary_policy(digraph_with_unknown_policy):
-    with pytest.raises(KeyError):
-        digraph_with_unknown_policy._initialize_binary_policy()
-
-
-@pytest.fixture
-def digraph_with_object_policy():
-    digraph = MultiLabelLocalClassifierPerNode(binary_policy=ExclusivePolicy)
-    digraph.hierarchy_ = nx.DiGraph([("a", "b")])
-    digraph.y_ = np.array([[["a", "b"]]])
-    digraph.logger_ = logging.getLogger("LCPN")
-    return digraph
-
-
-def test_initialize_object_binary_policy(digraph_with_object_policy):
-    with pytest.raises(ValueError):
-        digraph_with_object_policy._initialize_binary_policy()
-
-
-@pytest.fixture
-def digraph_logistic_regression():
-    digraph = MultiLabelLocalClassifierPerNode(local_classifier=LogisticRegression())
-    digraph.hierarchy_ = nx.DiGraph([("a", "b"), ("a", "c")])
-    digraph.y_ = np.array(
-        [[["a", "b"], ["", ""]], [["a", "c"], ["", ""]], [["a", "b"], ["a", "c"]]]
-    )
-    digraph.X_ = np.array([[1, 2], [3, 4], [5, 6]])
-    digraph.logger_ = logging.getLogger("LCPN")
-    digraph.root_ = "a"
-    digraph.separator_ = "::HiClass::Separator::"
-    digraph.binary_policy_ = ExclusivePolicy(digraph.hierarchy_, digraph.X_, digraph.y_)
-    digraph.sample_weight_ = None
-    return digraph
-
-
-def test_initialize_local_classifiers(digraph_logistic_regression):
-    digraph_logistic_regression._initialize_local_classifiers()
-    for node in digraph_logistic_regression.hierarchy_.nodes:
-        if node != digraph_logistic_regression.root_:
-            assert isinstance(
-                digraph_logistic_regression.hierarchy_.nodes[node]["classifier"],
-                LogisticRegression,
-            )
-        else:
-            with pytest.raises(KeyError):
-                isinstance(
-                    digraph_logistic_regression.hierarchy_.nodes[node]["classifier"],
-                    LogisticRegression,
-                )
-
-
-@pytest.mark.parametrize("n_jobs", [1, 2])
-def test_fit_digraph(digraph_logistic_regression, n_jobs):
-    classifiers = {
-        "b": {"classifier": LogisticRegression()},
-        "c": {"classifier": LogisticRegression()},
-    }
-    digraph_logistic_regression.n_jobs = n_jobs
-    nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers)
-    digraph_logistic_regression._fit_digraph(local_mode=True)
-    with pytest.raises(KeyError):
-        check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"])
-    for node in ["b", "c"]:
-        try:
-            check_is_fitted(
-                digraph_logistic_regression.hierarchy_.nodes[node]["classifier"]
-            )
-        except NotFittedError as e:
-            pytest.fail(repr(e))
-    assert 1
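The fixtures above wire an ExclusivePolicy into the classifier before calling
_fit_digraph. The binary policy decides which samples count as positive
examples for each node's binary classifier; under the exclusive policy a
sample is positive for a node exactly when the node appears in one of its
label paths. A pure-Python illustration of that idea (the real ExclusivePolicy
works on the hierarchy digraph and numpy arrays):

def exclusive_positives(node, y):
    """Boolean mask: which samples mention `node` in any of their label paths."""
    return [any(node in path for path in sample) for sample in y]

y = [[["a", "b"]], [["a", "c"]], [["a", "b"], ["a", "c"]]]
assert exclusive_positives("b", y) == [True, False, True]
assert exclusive_positives("c", y) == [False, True, True]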
-
-
-def test_fit_digraph_joblib_multiprocessing(digraph_logistic_regression):
-    classifiers = {
-        "b": {"classifier": LogisticRegression()},
-        "c": {"classifier": LogisticRegression()},
-    }
-    digraph_logistic_regression.n_jobs = 2
-    nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers)
-    digraph_logistic_regression._fit_digraph(local_mode=True, use_joblib=True)
-    with pytest.raises(KeyError):
-        check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"])
-    for node in ["b", "c"]:
-        try:
-            check_is_fitted(
-                digraph_logistic_regression.hierarchy_.nodes[node]["classifier"]
-            )
-        except NotFittedError as e:
-            pytest.fail(repr(e))
-    assert 1
-
-
-def test_fit_1_label():
-    # Test that predict removes the multilabel dimension when there is only one label
-    lcpn = MultiLabelLocalClassifierPerNode(
-        local_classifier=LogisticRegression(), n_jobs=2
-    )
-    y = np.array([[["1", "2"]]])
-    X = np.array([[1, 2]])
-    ground_truth = np.array(
-        [[["1", "2"]]]
-    )  # TODO: decide if dimension should be removed
-    lcpn.fit(X, y)
-    prediction = lcpn.predict(X)
-    assert_array_equal(ground_truth, prediction)
-
-
-def test_clean_up(digraph_logistic_regression):
-    digraph_logistic_regression._clean_up()
-    with pytest.raises(AttributeError):
-        assert digraph_logistic_regression.X_ is None
-    with pytest.raises(AttributeError):
-        assert digraph_logistic_regression.y_ is None
-    with pytest.raises(AttributeError):
-        assert digraph_logistic_regression.binary_policy_ is None
-
-
-@pytest.fixture
-def fitted_logistic_regression():
-    digraph = MultiLabelLocalClassifierPerNode(local_classifier=LogisticRegression())
-    digraph.hierarchy_ = nx.DiGraph(
-        [("r", "1"), ("r", "2"), ("1", "1.1"), ("1", "1.2"), ("2", "2.1"), ("2", "2.2")]
-    )
-    digraph.X_ = np.array([[1, -1], [1, 1], [2, -1], [2, 1], [1, 0]])
-    digraph.y_ = [
-        [["1", "1.1"]],
-        [["1", "1.2"]],
-        [["2", "2.1"]],
-        [["2", "2.2"]],
-        [["1", "1.1"], ["1", "1.2"]],
-    ]
-
-    digraph.logger_ = logging.getLogger("LCPN")
-    digraph.max_levels_ = 2
-    digraph.dtype_ = "