diff --git a/.bumpversion.cfg b/.bumpversion.cfg index df50016f..35b29ab6 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.7 +current_version = 0.1.8 commit = False tag = False allow_dirty = False diff --git a/.gitignore b/.gitignore index aca3fcd8..5c5766d2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,9 +12,18 @@ docs/sensai dist *.egg-info* /temp +/temp*.py /default_console.py /build /git-split.sh /.pytest_cache data .coverage +/*.env +/docs-build +/code.sh +/lightning_logs +*.code-workspace +/TODO.txt +notebooks/temp.ipynb +/docs/build \ No newline at end of file diff --git a/README-dev.md b/README-dev.md index 384b11b5..7a1b89c4 100644 --- a/README-dev.md +++ b/README-dev.md @@ -1,15 +1,59 @@ +# Development Environment + +This section explains the steps required to set up an environment in order to develop sensAI further. + +## Clone Large Files + +Clone the full repo, including large files, using [git LFS](https://git-lfs.github.com): + + git lfs pull + +This adds, in particular, data that is used in notebooks. + +## Create the Python Virtual Environment + +Use conda to set up the Python environment: + + conda env create -f environment.yml + +Solving the environment may take several minutes (but should ultimately work). + +NOTE: versions are mostly unpinned in the environment specification, because this facilitates conda dependency resolution. Also, sensAI is intended to be compatible with all (newer) versions of the dependencies. If it isn't, we need to specify an upper version bound in `setup.py` (where it matters the most) as well as in `environment.yml`. Compatibility with old (pinned) versions and the latest versions is tested in the tox build (see below). + # Build and Test Pipeline The tests and docs build are executed via **tox** in several environments: * `py`: the "regular" test environment, where we test against the pinned dependencies which we also use for development (by explicitly including `requirements.txt` with the pinned versions; this is also the environment in which we test the execution of notebooks * `py_latest_dependencies`: the environment where we use the latest versions of all dependencies (except where we have identified an incompatibility; see `setup.py` definitions `DEPS_VERSION_LOWER_BOUND` and `DEPS_VERSION_UPPER_BOUND_EXCLUSIVE`); by not including `requirements.txt`, we depend on the latest admissible versions according to `setup.py` -* `docs`: the environment in which docs are built via sphinx (by executing `build_scripts/update_docs.py`) +* `docs`: the environment in which docs are built via sphinx ## Docs Build -Docs are automatically created, all .rst files are auto-generated; only `index.rst` is manually defined. +Docs are automatically created during the GitHub build via tox. + +All .rst files are auto-generated (by `build_scripts/update_docs.py`), with the exception of the root index file `index.rst`. + +### Declaring Optional Dependencies + +**Attention**: Make sure that any optional sensAI dependencies (which are not included in the `docs` tox environment) are added to `docs/conf.py` under `autodoc_mock_imports`. Otherwise the tox build will fail. + +### Notebooks + +`docs/index.rst` includes the names of notebooks which reside in the `notebooks/` folder. They are not initially present in the `docs/` folder, but any notebooks whose names are referenced in `index.rst` will be executed and saved with outputs to the `docs/` folder by a test in `notebooks/test_notebooks.py`. 
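+
+For illustration, such a reference is simply a toctree entry naming the notebook; the entries added by this change can be seen in the `docs/index.rst` diff further below. Abbreviated, the relevant part looks like this:
+
+    .. toctree::
+       :caption: Guides and Tutorials
+       :glob:
+
+       intro.ipynb
+       clustering_evaluation.ipynb
+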
+ +Therefore, in order for the docs build to work (without temporarily removing the notebook inclusions), it is necessary to run the aforementioned test at least once via + + sh run_pytest_notebooks.sh + +For changes in notebooks to be reflected in the docs build, the test needs to be rerun. + +### Manually Running the Docs Build -Make sure that any optional sensAI dependencies (which are not included in the `docs` tox environment) are added to `docs/conf.py` under `autodoc_mock_imports`. +The docs build can be run without tox via + + sh build-docs.sh + +Results will be stored in `docs/build/`. # Creating a New Release @@ -42,3 +86,53 @@ Make sure that any optional sensAI dependencies (which are not included in the ` `bumpversion build --commit` * Continue with step 3. +# Source-Level Directory Sync + +#### Details on the Synchronisation of a Source Directory within Your Project with the sensAI Repository + +We support the synchronisation of a branch in the sensAI repository with a directory within the git repository of your project which is to contain the sensAI source code (i.e. alternative #2 of the integration options described in the main README) via a convenient scripting solution. + +We consider two local repositories: the sensAI repository in directory `sensAI/` and your project in, for instance, directory `sensAI/../myprj/`. Let us assume that we want to synchronise branch `myprj-branch` in the sensAI repository with directory `myprj/src/sensai`. + +##### Synchronisation Script + +To perform the synchronisation, please create a script as follows, which you should save to `sensAI/sync.py`: + +```python +import os +from repo_dir_sync import LibRepo, OtherRepo + +r = LibRepo() +r.add(OtherRepo("myprj", "myprj-branch", os.path.join("..", "myprj", "src", "sensai"))) +r.runMain() +``` + +You can add multiple other repositories if you so desire in the future. + +From directory `sensAI/` you can use the script in order to + +* ***Push***: Update your project (i.e. `myprj/src/sensai`) with changes that were made in other projects by running `python sync.py myprj push` +* ***Pull***: Update `myprj-branch` in the sensAI repository with changes made in your project by running `python sync.py myprj pull` + +##### Initialisation + +To initialise the synchronisation, proceed as follows: + +1. Create the branch `myprj-branch` in the sensAI repository, i.e. in `sensAI/` run this command: + `git branch myprj-branch master` +2. Create the directory `myprj/src/sensai`. +3. Make sure you have a `.gitignore` file in `myprj/` with at least the following entries: + + *.pyc + __pycache__ + *.bak + *.orig + + Otherwise you may end up with unwanted tracked files after a synchronisation. +4. Perform the initial *push*, i.e. in `sensAI/` run this command: + `python sync.py myprj push` + +##### Things to Keep in Mind + +* Both *push* and *pull* operations are always performed based on the branch that is currently checked out in `myprj/`. The best practice is to only use one branch for synchronisation, e.g. master. +* *Push* and *pull* operations will make git commits in both repositories. Should an operation ever go wrong/not do what you intended, use `git reset --hard` to go back to the commits before the operation in both repositories. \ No newline at end of file diff --git a/README.md b/README.md index f2e4ad3c..c517b870 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ In particular, sensAI provides ... 
## Documentation -Source code documentation and tutorials can be found [here](https://sensai.readthedocs.io/) +Reference documentation and tutorials can be found [here](https://jambit.github.io/sensAI/docs/). ### Integrating sensAI into a Project @@ -34,62 +34,14 @@ sensAI may be integrated into your project in several ways: Choose this option if you do not intend to make changes to sensAI in the context of your project. 2. **Include sensAI's source code as a package within your project** (e.g. in `src/sensai`), which you synchronise with a sensAI branch. Choose this option if you intend to make changes to sensAI as you develop your project. When using this option, you (and others) may even make changes to sensAI in several branches of your project and even several projects using the same inclusion mechanism at the same time. - See below for details on how synchronisation works. + See developer documentation in README-dev.md for details on how synchronisation works. 3. **Clone sensAI and add its source directory to your `PYTHONPATH`**. Choose this option if you potentially intend to make changes to sensAI but no one else working on your project will do the same and you will be modifying sensAI's source in no more than one branch at a time. -#### Details on the Synchonisation of a Source Directory within Your Project with the sensAI Repository - -We support the synchronisation of a branch in the sensAI repository with a directory within the git repository of your project which is to contain the sensAI source code (i.e. alternative #2 from above) via a convenient scripting solution. - -We consider two local repositories: the sensAI repository in directory `sensAI/` and your project in, for instance, directory `sensAI/../myprj/`. Let us assume that we want to synchronise branch `myprj-branch` in the sensAI repository with directory `myprj/src/sensai`. - -##### Synchronisation Script - -To perform the synchronisation, please create a script as follows, which you should save to `sensAI/sync.py`: - -```python -import os -from repo_dir_sync import LibRepo, OtherRepo - -r = LibRepo() -r.add(OtherRepo("myprj", "myprj-branch", os.path.join("..", "myprj", "src", "sensai"))) -r.runMain() -``` - -You can add multiple other repositories if you so desire in the future. - -From directory `sensAI/` you can use the script in order to - -* ***Push***: Update your project (i.e. `myprj/src/sensai`) with changes that were made in other projects by running `python sync.py myprj push` -* ***Pull***: Update `myprj-branch` in the sensAI repository with changes made in your project by running `python sync.py myprj pull` - -##### Initialisation - -To initialise the synchronisation, proceed as follows: - -1. Create the branch `myprj-branch` in the sensAI repository, i.e. in `sensAI/` run this command: - `git branch myprj-branch master` -2. Create the directory `myprj/src/sensai`. -3. Make sure you have a `.gitignore` file in `myprj/` with at least the following entries: - - *.pyc - __pycache__ - *.bak - *.orig - - Otherwise you may end up with unwanted tracked files after a synchronisation. -4. Perform the initial *push*, i.e. in `sensAI/` run this command: - `python sync.py myprj push` - -##### Things to Keep in Mind - -* Both *push* and *pull* operations are always performed based on the branch that is currently checked out in `myprj/`. The best practice is to only use one branch for synchronisation, e.g. master. -* *Push* and *pull* operations will make git commits in both repositories. 
Should an operation ever go wrong/not do what you intended, use `git reset --hard` to go back to the commits before the operation in both repositories. ## Contributors -sensAI is being developed by the artificial intelligence group at jambit GmbH. +sensAI is being developed by the artificial intelligence group at [jambit GmbH](http://www.jambit.com) and by members of [appliedAI](https://www.appliedai.de). The main contributors are Dominik Jain, Michael Panchenko, Kristof Schröder and Magnus Winter. diff --git a/build-docs.sh b/build-docs.sh index 4662b9d0..d9e69328 100644 --- a/build-docs.sh +++ b/build-docs.sh @@ -1,3 +1,4 @@ +rm -rf docs/build python build_scripts/update_docs.py sphinx-build -W -b html docs docs/build diff --git a/config.py b/config.py index 5681406d..3cf4b891 100644 --- a/config.py +++ b/config.py @@ -9,7 +9,7 @@ __config_instance = None -topLevelDirectory = os.path.dirname(__file__) +topLevelDirectory = os.path.abspath(os.path.dirname(__file__)) class __Configuration: @@ -66,7 +66,7 @@ def _get_path(self, key: Union[str, List[str]], create=False) -> str: :return: the queried path """ path_string = self._get_non_empty_entry(key) - path = os.path.abspath(path_string) + path = os.path.abspath(os.path.join(topLevelDirectory, path_string)) if not os.path.exists(path): if isinstance(key, list): key = ".".join(key) # purely for logging diff --git a/docs/conf.py b/docs/conf.py index cc989b07..fff3c2e4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -121,7 +121,8 @@ def findLineFromObjectName(sourceFile, objectName): "azure", "geopandas", "shapely", - "networkx" + "networkx", + "utm" ] # Render docu of __init__ methods @@ -155,10 +156,14 @@ def findLineFromObjectName(sourceFile, objectName): # built documents. # # The full version, including alpha/beta/rc tags. -release = pkg_resources.get_distribution(project).version -# The short X.Y version. -major_v, minor_v = release.split(".")[:2] -version = f"{major_v}.{minor_v}" +try: + version = pkg_resources.get_distribution(project).version + # The short X.Y version. + #major_v, minor_v = release.split(".")[:2] + #version = f"{major_v}.{minor_v}" +except Exception: + version = "dev" + log.warning(f"The {project} distribution was not found; using dummy version string '{version}'") # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/getting-started.rst b/docs/getting-started.rst deleted file mode 100644 index 061f6114..00000000 --- a/docs/getting-started.rst +++ /dev/null @@ -1,9 +0,0 @@ -Getting started -=============== - -This library works with python>=3.7. Install it by executing \n -``python setup.py install`` \n -from the root directory. - -For developing the usage of tox is encouraged. Run ``tox`` from the root directory in order to build the package, -these docs and perform several tests. You should not merge to master without tox having executed successfully! diff --git a/docs/index.rst b/docs/index.rst index 9ed3f2a6..41d07053 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,8 +5,17 @@ sensAI - the Python library for sensible AI :caption: Guides and Tutorials :glob: - * - + intro.ipynb + intro_old.ipynb + neural_networks.ipynb + tensor_models_pytorch_lightning.ipynb + coordinate_clustering.ipynb + clustering_evaluation.ipynb + tracking_experiments.ipynb + +.. + Above, we can include any notebooks from ../notebooks; within the build process, they will be copied + here by test_notebooks if they are referenced in this file. .. 
toctree:: :caption: Modules diff --git a/environment.yml b/environment.yml index a51d3e9e..bc0220b9 100644 --- a/environment.yml +++ b/environment.yml @@ -1,26 +1,46 @@ +# environment definition for development +# NOTE: versions are mostly unpinned, because this facilitates conda dependency resolution. Also, +# sensAI should be compatible to all (newer) versions of the dependencies. If it isn't, we need to specify +# an upper version bound in setup.py (where it matters the most) as well as here. +# Compatibility with old (pinned) versions and the latest versions is tested in the tox build. name: sensai channels: - pytorch - - conda-forge - - anaconda - defaults + - conda-forge dependencies: - - python=3.7 - - pytorch=1.4.0 - - tensorflow=1.14.0 - - pyqt=5.12 - - psutil=5.6.7 - - pytorch=1.4.0 - - catboost=0.23 - - xgboost=1.4.0 - - pytest=5.4.1 - - tox=3.14.6 - - bump2version - - jupyter - - pyyaml=5.3 - - lightgbm=3.1.1 - - sphinx_rtd_theme + # basics + # we use Python 3.8 because it has better conda dependency support; the code must still be compatible to 3.7 (which is tested by the github build) + - python=3.8 - pip + # optional sensai dependencies for development + # NOTE: catboost is disabled, because it causes conflicts + #- catboost + - pytorch + - pyqt + - psutil + - xgboost + - lightgbm + - utm=0.7.0 + - geopandas + # for notebook support + - jupyter + # for tests + - pytest + # for docs build + - sphinx + - sphinx_rtd_theme>=0.5.1 + - nbsphinx + # for release process + - bump2version - pip: - - -r file:requirements.txt - - nbsphinx \ No newline at end of file + # non-optional sensai dependencies + - -r requirements-relaxed.txt + # optional sensai pip dependencies + # clearml is only available through pip, and pulling tensorflow from conda causes dependency issues + - clearml + - tensorflow-cpu + - pytorch-lightning + # required for locally running the tox build (which will work on Linux only at this time) + - tox + - virtualenv \ No newline at end of file diff --git a/notebooks/Clustering Evaluation.ipynb b/notebooks/clustering_evaluation.ipynb similarity index 76% rename from notebooks/Clustering Evaluation.ipynb rename to notebooks/clustering_evaluation.ipynb index b2cd9d0a..ca9ec965 100644 --- a/notebooks/Clustering Evaluation.ipynb +++ b/notebooks/clustering_evaluation.ipynb @@ -1,50 +1,44 @@ { "cells": [ { - "cell_type": "markdown", - "metadata": { - "collapsed": true, - "pycharm": { - "name": "#%% md\n" - } - }, + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# Evaluating clustering algorithms\n", - "\n", - "The present library contains utilities for evaluating different clustering algorithms\n", - "(with or without ground truth labels). On top of the evaluation utilities there are classes for\n", - "performing parameters sweeps and model selection. Here we give an overview of the most important functionality\n", - "\n", - "\n", - "## Before running the notebook\n", - "\n", - "Install the library and its dependencies with, if you haven't done so already\n", - "```\n", - "pip install -e .\n", - "```\n", - "from the root directory. 
You can also execute this command directly in the notebook but will need to reload the\n", - "kernel afterwards" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "import sensai\n", + "import logging\n", + "import config\n", + "\n", + "cfg = config.get_config(reload=True)\n", + "sensai.util.logging.configureLogging(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", "metadata": { + "collapsed": true, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } }, - "outputs": [], "source": [ - "# Note - this cell should be executed only once per session\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import sys, os\n", + "# Evaluating Clustering Algorithms\n", "\n", - "# in order to get the config, it is not part of the library\n", - "os.chdir(\"..\")\n", - "sys.path.append(os.path.abspath(\".\"))" + "The present library contains utilities for evaluating different clustering algorithms\n", + "(with or without ground truth labels). On top of the evaluation utilities there are classes for\n", + "performing parameters sweeps and model selection. Here we give an overview of the most important functionality" ] }, { @@ -63,20 +57,17 @@ "from sklearn.cluster import DBSCAN\n", "import seaborn as sns\n", "import geopandas as gp\n", + "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import logging\n", "\n", - "from sensai.geoanalytics.coordinate_clustering import SkLearnCoordinateClustering\n", + "from sensai.geoanalytics.geopandas.coordinate_clustering import SkLearnCoordinateClustering\n", "from sensai.hyperopt import GridSearch\n", "from sensai.evaluation.evaluator_clustering import ClusteringModelSupervisedEvaluator, \\\n", " ClusteringModelUnsupervisedEvaluator\n", "from sensai.evaluation.eval_stats import ClusteringUnsupervisedEvalStats, ClusteringSupervisedEvalStats, \\\n", " AdjustedMutualInfoScore\n", - "from sensai.geoanalytics.coordinate_clustering_ground_truth import PolygonAnnotatedCoordinates\n", - "\n", - "from config import get_config\n", - "\n", - "logging.basicConfig(level=logging.INFO)" + "from sensai.geoanalytics.geopandas.coordinate_clustering_ground_truth import PolygonAnnotatedCoordinates" ] }, { @@ -89,9 +80,8 @@ }, "outputs": [], "source": [ - "# loading data and config\n", - "c = get_config(reload=True)\n", - "sampleFile = c.datafile_path(\"sample\", stage=c.RAW) # this can point to a directory or a shp/geojson file\n", + "# loading data \n", + "sampleFile = cfg.datafile_path(\"sample\", stage=cfg.RAW) # this can point to a directory or a shp/geojson file\n", "coordinatesDF = gp.read_file(sampleFile)" ] }, @@ -103,7 +93,7 @@ } }, "source": [ - "## Evaluating a single model\n", + "## Evaluating a Single Model\n", "\n", "For a single model that was already fitted, evaluation statistics can be extracted with `ClusteringEvalStats`, see the\n", "example below (the eval_stats object can also be used to retrieve evaluation results one by one)\n" @@ -121,8 +111,11 @@ "source": [ "dbscan = SkLearnCoordinateClustering(DBSCAN(eps=150, min_samples=20))\n", "dbscan.fit(coordinatesDF)\n", + "\n", "evalStats = ClusteringUnsupervisedEvalStats.fromModel(dbscan)\n", + "\n", "pprint(evalStats.getAll())\n", + "\n", "plt.hist(evalStats.clusterSizeDistribution)\n", "plt.show()" ] @@ -135,13 +128,13 @@ } }, "source": [ - "## Model selection\n", + "## Unsupervised Model Selection\n", "\n", "For model selection 
we need to compare different (or differently parametrized) models that were\n", "trained on the same dataset. The `ClusteringEvaluator` abstraction was designed with this goal in mind.\n", "The evaluator can be used to obtain evaluation statistics for different models that are guaranteed\n", - "to be comparable with each other (always computed by the same object in the same way). Here an example evaluating\n", - "a dbscan performance on metrics that don't necessitate ground truth labels" + "to be comparable with each other (always computed by the same object in the same way). Here is an example evaluating\n", + "DBSCAN's performance on metrics that don't necessitate ground truth labels." ] }, { @@ -182,8 +175,8 @@ }, "source": [ "One of the main purposes of evaluators is to be used within classes that perform a parameter sweep, e.g.\n", - "a `GridSearch`. All such objects return a data frame and (optionally but recommended!) persist all evaluation results\n", - "in a csv." + "a `GridSearch`. All such objects return a data frame and (optionally) persist all evaluation results\n", + "in a CSV file." ] }, { @@ -206,7 +199,7 @@ "def dbscanFactory(**kwargs):\n", " return SkLearnCoordinateClustering(DBSCAN(**kwargs))\n", "\n", - "dbscanGridSearch = GridSearch(dbscanFactory, parameterOptions, csvResultsPath=os.path.join(c.temp, \"dbscanGridSearchCsv\"))" + "dbscanGridSearch = GridSearch(dbscanFactory, parameterOptions, csvResultsPath=os.path.join(cfg.temp, \"dbscanGridSearchCsv\"))" ] }, { @@ -219,7 +212,7 @@ }, "outputs": [], "source": [ - "# the results of the grid-search are saved as csv under the path provided above\n", + "# the results of the grid-search are saved as a CSV file under the path provided above\n", "resultDf = dbscanGridSearch.run(modelEvaluator, sortColumnName=\"numClusters\", ascending=False)\n", "resultDf.head()" ] @@ -292,7 +285,9 @@ } }, "source": [ - "## Dealing with ground truth labels\n", + "## Supervised Model Selection\n", + "\n", + "### Obtaining Ground Truth Labels\n", "\n", "\n", "The evaluation classes can take ground truth labels for all coordinates and use them for calculating related metrics.\n", @@ -313,7 +308,7 @@ "outputs": [], "source": [ "# The polygons can be read directly from a file, see the documentation for more details\n", - "groundTruthClusters = PolygonAnnotatedCoordinates(coordinatesDF, c.datafile_path(\"sample\", stage=c.GROUND_TRUTH))" + "groundTruthClusters = PolygonAnnotatedCoordinates(coordinatesDF, cfg.datafile_path(\"sample\", stage=cfg.GROUND_TRUTH))" ] }, { @@ -344,6 +339,13 @@ "groundTruthClusters.toGeoDF().head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Supervised Evaluation Metrics" + ] + }, { "cell_type": "markdown", "metadata": { @@ -352,9 +354,9 @@ } }, "source": [ - "We can extract the coordinates and labels for the annotated region and use them in evaluation. In the following\n", - "we will train our own adaption of DBSCAN, namely `boundedDBSCAN` on datapoints in the ground truth region and\n", - "evaluate the results against the true labels" + "We can extract the coordinates and labels for the annotated region and use them in evaluation. In the following,\n", + "we will evaluate a slight adaptation of DBSCAN which uses an additional bound, i.e. it will ultimately reject clusters that do not reach a minimum size.\n", + "We will train it on datapoints in the ground truth region and evaluate the results against the true labels." 
] }, { @@ -368,7 +370,9 @@ "outputs": [], "source": [ "boundedDbscan = SkLearnCoordinateClustering(DBSCAN(eps=150, min_samples=20), minClusterSize=100)\n", + "\n", "groundTruthCoordinates, groundTruthLabels = groundTruthClusters.getCoordinatesLabels()\n", + "\n", "supervisedEvaluator = ClusteringModelSupervisedEvaluator(groundTruthCoordinates, trueLabels=groundTruthLabels)\n", "supervisedEvalStats = supervisedEvaluator.evalModel(boundedDbscan)\n", "\n", @@ -376,6 +380,20 @@ "pprint(supervisedEvalStats.getAll())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparing Unsupervised Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It can also be instructive to compare unsupervised evaluation metrics." + ] + }, { "cell_type": "code", "execution_count": null, @@ -386,11 +404,11 @@ }, "outputs": [], "source": [ - "print(\"Unsupervised evaluation metrics of bounded dbscan:\")\n", - "pprint(ClusteringUnsupervisedEvalStats(groundTruthCoordinates, groundTruthLabels).getAll())\n", - "print(\"\")\n", - "print(\"Unsupervised evaluation metrics of annotated data\")\n", - "pprint(ClusteringUnsupervisedEvalStats.fromModel(boundedDbscan).getAll())" + "groundTruthUnsupervisedMetrics = ClusteringUnsupervisedEvalStats(groundTruthCoordinates, groundTruthLabels).metricsDict()\n", + "boundedDbscanUnsupervisedMetrics = ClusteringUnsupervisedEvalStats.fromModel(boundedDbscan).metricsDict()\n", + "\n", + "pd.DataFrame({\"bounded DBSCAN\": boundedDbscanUnsupervisedMetrics, \"ground truth\": groundTruthUnsupervisedMetrics}, \n", + " index=groundTruthUnsupervisedMetrics.keys())" ] }, { @@ -401,9 +419,9 @@ } }, "source": [ - "The bounded dbscan is performing quite OK with the given parameters, although we see that it segregates clusters too\n", + "The bounded DBSCAN is already performing quite well with the given parameters, although we see that it segregates clusters too\n", "much and has a general tendency towards smaller clusters. These tendencies can be seen visually by comparing the ground\n", - "truth and the bounded dbscan cluster plots" + "truth and the bounded DBSCAN cluster plots." ] }, { @@ -429,10 +447,10 @@ } }, "source": [ - "## Supervised parameter estimation\n", + "### Parameter Search\n", "\n", "We can now bring everything together by running a grid search and evaluating against ground truth. 
Very little code\n", - "is needed for that, so we will write it entirely in the cell below" + "is needed for that:" ] }, { @@ -451,22 +469,9 @@ "}\n", "\n", "supervisedGridSearch = GridSearch(dbscanFactory, parameterOptions,\n", - " csvResultsPath=os.path.join(c.temp, \"bounded_dbscan_grid_search.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# we will sort the results by mutual information store\n", + " csvResultsPath=os.path.join(cfg.temp, \"bounded_dbscan_grid_search.csv\"))\n", "supervisedResultDf = supervisedGridSearch.run(supervisedEvaluator, sortColumnName=AdjustedMutualInfoScore.name,\n", - " ascending=False)\n", + " ascending=False)\n", "supervisedResultDf" ] }, @@ -478,15 +483,21 @@ } }, "source": [ - "It seems like we were lucky to already have picked the optimal parameters for the dbscan above.\n", - "It is also interesting to notice that the supervised scores are in\n", - "stark disagreement with the unsupervised ones" + "According to the adjusted mutual information score, we have now found a new parameter combination (see rightmost columns of first row) which yields results even closer to the ground truth." ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.8.13 ('sensai')", "language": "python", "name": "python3" }, @@ -500,9 +511,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/notebooks/Intro to Coordinate Clustering.ipynb b/notebooks/coordinate_clustering.ipynb similarity index 70% rename from notebooks/Intro to Coordinate Clustering.ipynb rename to notebooks/coordinate_clustering.ipynb index 0753b9bd..6cd5843e 100644 --- a/notebooks/Intro to Coordinate Clustering.ipynb +++ b/notebooks/coordinate_clustering.ipynb @@ -1,25 +1,13 @@ { "cells": [ { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# The Coordinate Clustering Module\n", - "\n", - "On top of support for different clustering algorithms, sensAI provides useful methods specific to\n", - "clustering of geospatial data. They include utilities for wrangling geometrical data, spanning trees and for persisting and\n", - "visualizing the results. It seamlessly interoperates with geopandas and shapely.\n", - "This notebook gives an overview of the coordinate clustering's main functions\n", - "\n", - "\n", - "## Before running the notebook\n", - "\n", - "Install the library and its dependencies with, if you haven't done so already\n", - "```\n", - "pip install -e .\n", - "```\n", - "from the root directory. 
You can also execute this command directly in the notebook but will need to reload the\n", - "kernel afterwards" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { @@ -28,24 +16,42 @@ "metadata": {}, "outputs": [], "source": [ - "# Note - this cell should be executed only once per session\n", - "%load_ext autoreload\n", - "%autoreload 2\n", + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "import os\n", + "import config\n", + "import sensai\n", + "import logging\n", "\n", - "import sys, os\n", + "c = config.get_config(reload=True)\n", + "sensai.util.logging.configureLogging(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Coordinate Clustering\n", "\n", - "# in order to get the config, it is not part of the library\n", - "os.chdir(\"..\")\n", - "sys.path.append(os.path.abspath(\".\"))" + "On top of support for different clustering algorithms, sensAI provides useful methods specific to\n", + "clustering of geospatial data. They include utilities for wrangling geometrical data, spanning trees and for persisting and\n", + "visualizing the results. It seamlessly interoperates with geopandas and shapely.\n", + "This notebook gives an overview of the coordinate clustering's main functions" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ - "import os\n", "import geopandas as gp\n", "from pprint import pprint\n", "import numpy as np\n", @@ -53,20 +59,20 @@ "from sklearn.cluster import DBSCAN\n", "\n", "import logging\n", - "from sensai.geoanalytics.graph import CoordinateSpanningTree\n", - "from sensai.geoanalytics.coordinate_clustering import SkLearnCoordinateClustering\n", - "from sensai.geoanalytics.geometry import alphaShape\n", - "from config import get_config\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "c = get_config(reload=True)" + "from sensai.geoanalytics.geopandas.graph import CoordinateSpanningTree\n", + "from sensai.geoanalytics.geopandas.coordinate_clustering import SkLearnCoordinateClustering\n", + "from sensai.geoanalytics.geopandas.geometry import alphaShape" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "## Loading and Fitting" + "## Loading Data and Fitting a Clusterer" ] }, { @@ -77,35 +83,56 @@ } }, "source": [ - "The library contains utils for loading coordinates from files and for wrapping arbitrary scikit-learn compatible\n", - "clustering algorithms. Custom clustering algorithms can be implemented easily buy inheriting from the baseclass\n", - "`ClusteringModel`\n" + "The library contains utilities for loading coordinates from files and for wrapping arbitrary scikit-learn-compatible\n", + "clustering algorithms. Custom clustering algorithms can be implemented by inheriting from the base class\n", + "`EuclideanClusterer`." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "sampleFile = c.datafile_path(\"sample\", stage=c.RAW) # this can point to a directory or a shp/geojson file\n", "sampleGeoDF = gp.read_file(sampleFile)\n", + "sampleGeoDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "dbscan = SkLearnCoordinateClustering(DBSCAN(eps=150, min_samples=20))\n", "dbscan.fit(sampleGeoDF)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "The resulting `CoordinateClusteringAlgorithm` instance has many useful methods.\n", - "You can retrieve clusters individually or via a generator. The noise cluster can be accessed individually" + "The instance has many useful methods.\n", + "You can retrieve clusters individually or via a generator. The noise cluster can be accessed individually." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "print(f\"Clusters found: {dbscan.numClusters}\")\n", @@ -117,23 +144,35 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Analysis and Visualization" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "From the dbscan single clusters which are instances of `CoordinateClusteringAlgorithm.Cluster` \n", - "can be retrieved and visualized. Most objects, including the dbscan itself, have an inbuilt plot method" + "From the instance, individual clusters, which are instances of `EuclidianClusterer.Cluster`,\n", + "can be retrieved and visualized. Most objects, including the clusterer itself, have a built-in plot method." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dbscan.plot(markersize=0.2)" @@ -141,15 +180,23 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "We can condition before plotting as well as pass custom arguments" + "We can apply a condition to the clusters to be plotted and pass additional arguments affecting the display." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dbscan.plot(condition=lambda x: len(x) >= 50, cmap='plasma')" @@ -157,17 +204,25 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "### Properties of a single cluster\n", + "### Properties of Individual Clusters\n", "\n", - "Single clusters can be plotted too" + "Individual clusters can be plotted, too." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "sampleCluster = dbscan.getCluster(0)\n", @@ -177,7 +232,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Clusters have an identifier and coordinates. It is easy to extract additional information,\n", "e.g. 
via the summary method" @@ -186,7 +245,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "pprint(sampleCluster.summaryDict())" @@ -194,92 +257,96 @@ }, { "cell_type": "markdown", - "source": [ - "A single cluster is just a wrapper around its coordinates. They can be\n", - "retrieved either as a numpy array, a geodataframe or a MultiPoint object.\n", - "The latter is useful for geometric operations, e.g. computing hulls" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "A single cluster is just a wrapper around its coordinates. They can be\n", + "retrieved either as a numpy array, a geodataframe or a MultiPoint object.\n", + "The latter is useful for geometric operations, e.g. computing hulls" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "clusterMultipoint = sampleCluster.asMultipoint()\n", - "clusterMultipoint.convex_hull" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "clusterMultipoint = sampleCluster.asMultipoint()\n", + "clusterMultipoint.convex_hull" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "# we also provide a utility for computing alpha shapes for such objects\n", - "\n", - "alphaShape(clusterMultipoint)" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "# we also provide a utility for computing alpha shapes for such objects\n", + "\n", + "alphaShape(clusterMultipoint)" + ] }, { "cell_type": "markdown", - "source": [ - "sensAI also provides utilities for computing trees, e.g. here for the minimal spanning tree" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "sensAI also provides utilities for computing trees, e.g. here for the minimal spanning tree" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "sampleTree = CoordinateSpanningTree(sampleCluster)\n", - "sampleTree.plot()" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "sampleTree = CoordinateSpanningTree(sampleCluster)\n", + "sampleTree.plot()" + ] }, { "cell_type": "markdown", - "source": [ - "Most objects provide a way for extracting a summary from them, either as a dict or as a data frame" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "Most objects provide a way for extracting a summary from them, either as a dict or as a data frame" + ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "print(\"cluster summary:\")\n", @@ -289,7 +356,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dbscan.summaryDF().head()" @@ -297,14 +368,22 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Saving and Loading" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "All of the objects used above can be exported to a GeoDataFrame using the `toGeoDF` method. 
This geodataframe\n", "can then be persisted as usual.\n", @@ -320,7 +399,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dbscanGeoDF = dbscan.toGeoDF() # here again a condition for filtering clusters can be passed\n", @@ -332,7 +415,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dbscanSavedPath = os.path.join(c.temp, f\"{dbscan}_sample.pickle\")\n", @@ -346,7 +433,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "loadedDBSCAN = SkLearnCoordinateClustering.load(dbscanSavedPath)\n", @@ -377,8 +468,11 @@ } ], "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.8.13 ('sensai')", "language": "python", "name": "python3" }, @@ -392,9 +486,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebooks/intro.ipynb b/notebooks/intro.ipynb new file mode 100644 index 00000000..dcba1db8 --- /dev/null +++ b/notebooks/intro.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../src\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to sensAI: Supervised Learning with VectorModels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sensai\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Logging\n", + "\n", + "sensAI will log relevant activies and inform about ongoing processes as well as results via the log. It is therefore highly recommended that logging be enabled when using sensAI.\n", + "\n", + "sensAI provides a `logging` module which includes Python's standard logging module and adds some additional functionality. To enable logging, simply use its `configureLogging` function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sensai.util import logging\n", + "\n", + "logging.configureLogging(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To additionally write log output to a file, use the function `logging.addFileLogger`.\n", + "\n", + "## VectorModels\n", + "\n", + "The central base class for supervised learning problems in sensAI is `VectorModel`. A VectorModel is any model which operates on data points that can be reprsented as vectors of data. Here, vector is to be understood not in the mathematical sense but in the computer science sense, where a vector is simply an array of (potentially arbitaririly complex) data. (The mathematical equivalent is a tuple.) Models are typically expected to be able to process more than one data point at a time and thus should be able to process a sequence of vectors. \n", + "\n", + "We use pandas DataFrames to represent such sequences of data points. 
Note that pandas DataFrames are not limited to primitive data types but can hold arbitrary objects in every cell. When dealing with a large number of inputs, DataFrames also provide at least limited meta-information in the form of column names, so we do not lose track of what is contained in which element of a data point (vector).\n", + "\n", + "VectorModel itself is an abstract base class, which provides a lot of useful functionality that all its specialisations inherit (as we will see later, particularly in the more advanced tutorials). The class is specialised in `VectorClassificationModel` and `VectorRegressionModel`, which in turn are specialised for various machine learning frameworks (such as sklearn and PyTorch) or can be directly subclassed to create your own model. \n", + "\n", + "In this tutorial, we will be dealing with a classification problem. Therefore, we will apply subclasses of `VectorClassificationModel` such as `SkLearnRandomForestVectorClassificationModel`. As an sklearn classification model which uses a well-defined training and inference interface, the implementation of the class is essentially justa few lines of code (given the intermediate abstraction `AbstractSkLearnVectorClassificationModel` for all classification models that use the sklearn protocol)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training and Evaluating Models\n", + "\n", + "First, let us load a dataset which we can experiment. sklearn provides, for example, the Iris classification dataset, where the task is to differentiate three different types of flowers based on measurements of their petals and sepals." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn.datasets\n", + "import pandas as pd\n", + "\n", + "irisData = sklearn.datasets.load_iris()\n", + "irisInputDF = pd.DataFrame(irisData[\"data\"], columns=irisData[\"feature_names\"]).reset_index(drop=True)\n", + "irisOutputDF = pd.DataFrame({\"class\": [irisData[\"target_names\"][idx] for idx in irisData[\"target\"]]}).reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's a sample of the data, combining both the inputs and outputs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "irisCombinedDF = pd.concat((irisInputDF, irisOutputDF), axis=1)\n", + "irisCombinedDF.sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When working with sensAI, we typically use DataFrames such as this as the starting point.\n", + "\n", + "We create an instance of **InputOutputData** from the two data frames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "irisInputOutputData = sensai.InputOutputData(irisInputDF, irisOutputDF)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Low-Level Training and Inference " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use a **DataSplitter** (see subclasses) to split the data into a training and test set, specifically a **DataSplitterFractional**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataSplitter = sensai.data.DataSplitterFractional(0.8, shuffle=True)\n", + "trainingIoData, testIoData = dataSplitter.split(irisInputOutputData)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to train a model. Let us train a random forest classifier, which should work well for this sort of problem. sensAI provides models from various libraries, including scikit-learn, PyTorch, lightgbm, xgboost, catboost, and TensorFlow.\n", + "\n", + "In this case, let us use the random forest implementation from sklearn, which is provided via the wrapper class SkLearnRandomForestVectorClassificationModel.\n", + "\n", + "sensAI's **VectorModel** classes (specialised for classification and regression) provide a common interface with a lot of useful functionality, which we will see later." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "randomForestModel = sensai.sklearn.classification.SkLearnRandomForestVectorClassificationModel(\n", + " min_samples_leaf=2).withName(\"RandomForest\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The class suppports all the parameters supported by the original sklearn model. In this case, we only set the minimum number of samples that must end up in each leaf.\n", + "\n", + "We train the model using the `fitInputOutputData` method; we could also use the `fit` method, which is analogous to the sklearn interface and takes two arguments (input, output)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "randomForestModel.fitInputOutputData(trainingIoData)\n", + "randomForestModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now apply the trained model and predict the outputs for the test set we reserved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "predictedOutputsDF = randomForestModel.predict(testIoData.inputs)\n", + "predictedOutputsDF.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's compare some of the predictions to the ground truth." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.concat((predictedOutputsDF.rename(columns={\"class\": \"predictedClass\"}), testIoData.outputs), axis=1).sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the ground truth and predicted values, we could now compute the metrics we're interested in. We could, for example, use the metrics implemented in sklearn to analyse the result. Yet sensAI already provides abstractions that facilitate the generation of metrics and the collection of results. Read on!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Evaluators\n", + "\n", + "sensAI provides evaluator abstractions which facilitate the training and evaluation of models.\n", + "\n", + "For a classification problem, we instantiate a VectorClassificationModelEvaluator. An evaluator serves to evaluate one or more models based on the same data, so we construct it with the data and instructions on how to handle/split the data for evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluatorParams = sensai.evaluation.VectorClassificationModelEvaluatorParams(dataSplitter=dataSplitter, computeProbabilities=True)\n", + "evaluator = sensai.evaluation.VectorClassificationModelEvaluator(irisInputOutputData, params=evaluatorParams)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use this evaluator to evaluate one or more models. Let us evaluate the random forest model from above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator.fitModel(randomForestModel)\n", + "evalData = evaluator.evalModel(randomForestModel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The evaluation data holds, in particular, an **EvalStats** object, which can provide data on the quality of the results.\n", + "Depending on the type of problem, many metrics will already be computed by default." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalStats = evalData.getEvalStats()\n", + "evalStats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can get the metrics in a dictionary as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalStats.metricsDict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can compute additional metrics by passing a metric to the `computeMetricValue` method, but we could also have added additional metrics to the `evaluatorParams` above and have the metric included in all results.\n", + "\n", + "Let's see how frequently the true class is among the top two most probable classes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalStats.computeMetricValue(sensai.eval_stats_classification.ClassificationMetricTopNAccuracy(2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The EvalStats object can also be used to generate plots, such as a confusion matrix or a precision-recall plot for binary classification." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalStats.plotConfusionMatrix(normalize=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using the Fully-Integrated Evaluation Utilities\n", + "\n", + "sensAI's evaluation utilities take things one step further and assist you in out all the evaluation steps and results computations in a single call.\n", + "\n", + "You can perform evaluations based on a single split or cross-validation. We simply declare the necessary parameters for both types of computations (or the one type we seek to carry out)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluatorParams = sensai.evaluation.VectorClassificationModelEvaluatorParams(\n", + " dataSplitter=dataSplitter, computeProbabilities=True, \n", + " additionalMetrics=[sensai.eval_stats_classification.ClassificationMetricTopNAccuracy(2)])\n", + "crossValidatorParams = sensai.evaluation.crossval.VectorModelCrossValidatorParams(folds=10, \n", + " evaluatorParams=evaluatorParams)\n", + "evalUtil = sensai.evaluation.ClassificationEvaluationUtil(irisInputOutputData, \n", + " evaluatorParams=evaluatorParams, crossValidatorParams=crossValidatorParams)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In practice, we will usually want to save evaluation results. The evaluation methods of `evalUtil` take a parameter `resultWriter` which allows us to define where results shall be written. Within this notebook, we shall simply inspect the resulting metrics in the log that is printed, and we shall configure plots to be shown directly.\n", + "\n", + "#### Simple Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can perform the same evaluation as above (which uses a single split) like so:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.performSimpleEvaluation(randomForestModel, showPlots=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Customising the Set of Plots" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we decide that we don't really want to have the normalised confusion matrix, we can disable it for any further experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.evalStatsPlotCollector.getEnabledPlots()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some of these are only active for binary classification. The one we don't want is \"confusion-matrix-rel\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.evalStatsPlotCollector.disablePlots(\"confusion-matrix-rel\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could also define our own plot class (by creating a new subclass of `ClassificationEvalStatsPlot`) and add it to the `evalStatsPlotCollector` in order to have the plot auto-generated whenever we apply one of `evalUtil`'s methods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Cross-Validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can similarly run cross-validation and produce the respective evaluation metrics with a single call." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.performCrossValidation(randomForestModel, showPlots=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the plot we disabled earlier is no longer being generated." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Comparing Models\n", + "\n", + "A most common use case is to compare the performance of several models. 
The evaluation utility makes it very simple to compare any number of models.\n", + "\n", + "Let's say we want to compare the random forest we have been using thus far to a simple decision tree." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = evalUtil.compareModels([\n", + " randomForestModel, \n", + " sensai.sklearn.classification.SkLearnDecisionTreeVectorClassificationModel(min_samples_leaf=2).withName(\"DecisionTree\")], \n", + " useCrossValidation=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to the data frame with the aggregated metrics, which was already printed to the log, the results object contains all the data that was generated during the evaluation. We can, for example, use it to plot the distribution of one of the metrics across all the folds for one of our models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(results.resultsDF)\n", + "\n", + "escRandomForest = results.resultByModelName[\"RandomForest\"].crossValData.getEvalStatsCollection()\n", + "escRandomForest.plotDistribution(\"accuracy\", bins=np.linspace(0,1,21), stat=\"count\", kde=False);\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also compute additional aggregations or inspect the full list of metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "escRandomForest.aggMetricsDict(aggFns=[np.max, np.min])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "escRandomForest.getValues(\"accuracy\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, + "kernelspec": { + "display_name": "Python 3.7.9 ('sensai')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Intro to sensAI.ipynb b/notebooks/intro_old.ipynb similarity index 68% rename from notebooks/Intro to sensAI.ipynb rename to notebooks/intro_old.ipynb index cd72b13a..42ef6d73 100644 --- a/notebooks/Intro to sensAI.ipynb +++ b/notebooks/intro_old.ipynb @@ -1,5 +1,27 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "from sensai.util import logging\n", + "\n", + "logging.configureLogging(level=logging.INFO)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -9,57 +31,25 @@ } }, "source": [ - "# Lightning intro to sensAI\n", + "# Custom Models and Feature Generators\n", "\n", "In this notebook we will demonstrate some of sensAI's main features by training a model together\n", "with feature extractors and custom normalization rules. This will also demonstrate how easy it is to wrap one's\n", "own model declaration into a sensAI model." 
] }, - { - "cell_type": "markdown", - "source": [ - "### Before running the notebook\n", - "\n", - "Install the package and its dependencies, if you haven't done so already. E.g. for an editable install call\n", - "```\n", - "pip install -e .\n", - "```\n", - "from the root directory. You can also execute this command directly in the notebook but will need to reload the\n", - "kernel afterwards" - ], - "metadata": { - "collapsed": false - } - }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "# Note - this cell should be executed only once per session\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import sys, os\n", - "\n", - "# in order to get the top level modules; they are not part of the package\n", - "os.chdir(\"..\")\n", - "sys.path.append(os.path.abspath(\".\"))" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ + "import sensai\n", "import pandas as pd\n", "import numpy as np\n", "import sensai as sn\n", @@ -72,95 +62,78 @@ "from sensai.tracking.clearml_tracking import ClearMLExperiment\n", "import sensai.featuregen as fgen\n", "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "import logging\n", - "logging.basicConfig(level=logging.INFO)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ "from config import get_config\n", "\n", - "c = get_config(reload=True)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "cfg = get_config(reload=True)" + ] }, { "cell_type": "markdown", - "source": [ - "## Loading the dataset" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "First, let us load a dataset." + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ - "housing_data = c.datafile_path(\"boston_housing.csv\", stage=c.RAW)\n", + "housing_data = cfg.datafile_path(\"boston_housing.csv\", stage=cfg.RAW)\n", "housing_df = pd.read_csv(housing_data)\n", "\n", "housing_df.head()" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "X = housing_df.copy()\n", "y = pd.DataFrame({\"nox\": X.pop(\"nox\")})" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "print(\"We will use this as target\")\n", "y.head()" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Creating a Custom Model\n", "\n", @@ -172,17 +145,17 @@ "tutorial in TBA.\n", "\n", "We will use VectorModel to wrap scikit-learn's implementation of a multi layer perceptron." 
- ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "class CustomModel(VectorRegressionModel):\n", @@ -196,16 +169,16 @@ "\n", " def _fit(self, X: pd.DataFrame, Y: pd.DataFrame):\n", " self.model.fit(X, Y.values.ravel())" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Feature Generation and Normalization\n", "\n", @@ -227,16 +200,16 @@ "explaining exactly what they do and what the intended use case looks like.\n", "\n", "Below we will show an example of feature engineering.\n" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Defining Feature Generators\n", "\n", @@ -245,17 +218,17 @@ "is extracted from the dataframe when the feature generator is fit.\n", "\n", "The second feature generator simply takes the columns \"crim\" and \"age\" as is and marks that they should be normalized." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "class TaxFraudFeaturegen(fgen.FeatureGenerator):\n", @@ -287,18 +260,15 @@ " columns=[\"crim\", \"age\"],\n", " normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True),\n", ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ - "### The Feature Registry\n", + "### The Feature Generator Registry\n", "\n", "We could simply take the feature generators as they are and plug them into our model but instead we demonstrate\n", "one more class in sensAI: the feature registry. Creating a registry is convenient for rapid experimentation\n", @@ -311,14 +281,17 @@ "The collector is pinned to a registry and allows to call the registered features by name (if desired).\n", "This might not make much sense in a notebook but imagine having a central feature registry somewhere in you code. This\n", "way you can combine the registered features with some features that you cooked up in a script, all in a few lines of code." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "housing_feature_registry = fgen.FeatureGeneratorRegistry(useSingletons=True)\n", @@ -326,16 +299,16 @@ "housing_feature_registry.tax = TaxFraudFeaturegen\n", "\n", "feature_collector = fgen.FeatureCollector(\"tax\", crime_age_featuregen, registry=housing_feature_registry)" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Normalization of Input and Target\n", "\n", @@ -347,52 +320,51 @@ "all normalization rules to the feature generators themselves, just to be sure that nothing is missing.\n", "\n", "For normalizing the target we have to use an invertible transformer, we will take the MaxAbsScaler here." 
- ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "dft_normalisation = sn.data_transformation.DFTNormalisation(\n", " feature_collector.getNormalizationRules(),\n", - " requireAllHandled=True,\n", - ")\n", + " requireAllHandled=True)\n", "\n", "target_transformer = sn.data_transformation.DFTSkLearnTransformer(MaxAbsScaler())\n" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ - "### Combining Everything with the Model\n", + "## Combining Everything with the Model\n", "\n", "Now we can plug all these components into our vector model and enjoy a safe and robust that will\n", "work during training and inference. The model already has methods for saving and loading and is ready to\n", "be deployed." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "custom_model = CustomModel()\n", @@ -402,193 +374,79 @@ " .withInputTransformers(dft_normalisation) \\\n", " .withTargetTransformer(target_transformer) \\\n", " .withName(\"housing_predictor\")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "custom_model.fit(X, y)\n", - "custom_model.predict(X).head()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ - "## SensAI Featuregen vs. Sklearn Pipelines\n", - "\n", - "TBA" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + "### Evaluating the Model, Tracking Results Online" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ - "## Model Evaluation\n", - "\n", - "sensAI has extensive support for evaluating different types of models with different methods, including\n", - "cross validation. The evaluation has native support for experiment tracking frameworks, like clearML or MLflow.\n", - "Here we will use clearML, so after running this notebook you will be able to see the result in the\n", - "clearML demo-server.\n", - "\n", - "The evaluation is generally based on the following structure: an `Evaluator` object holds a dataset.\n", - "An `Evaluator` can evaluate multiple models by calling `Evaluator.evalModel(model)`,\n", - "this ensures that the same kind of evaluation is performed and thus the results can be compared in meaningful way\n", - "(the latter is crucial for model selection). This `.evalModel(model)` call returns an EvalData object, h\n", - "olding the evaluation data and containing methods for computing metrics and visualization.\n", - "\n", - "Similarly, a `CrossValidator` holds data and can perform cross validation,\n", - "\n", - "Below we will show a simple example for that, using the lower-level evaluation interfaces. There is also\n", - "a higher level evaluation interfaces in the eval_util module, we will leave that to a separate intro." 
- ], - "metadata": { - "collapsed": false - } + "We evaluate the model using an evaluation util as usual, but this time we will additionally track the results online using ClearML." + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "io_data = InputOutputData(X, y)\n", - "\n", - "clearml_experiment = ClearMLExperiment(projectName=\"sensai_demo\", taskName=\"custom_model\")\n", - "evaluator = createVectorModelEvaluator(io_data, isRegression=custom_model.isRegressionModel(),\n", - " testFraction=0.2)\n", - "evaluator.setTrackedExperiment(clearml_experiment)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "try: \n", + " clearmlExperiment = ClearMLExperiment(projectName=\"sensai_demo\", taskName=\"custom_model\")\n", + "except:\n", + " # allow to run without ClearML credentials being present\n", + " clearmlExperiment = None\n", + "\n", + "evalUtil = sensai.evaluation.RegressionEvaluationUtil(InputOutputData(X, y))\n", + "evalData = evalUtil.performSimpleEvaluation(custom_model, showPlots=True, trackedExperiment=clearmlExperiment)" + ] }, { - "cell_type": "code", - "execution_count": null, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "new_custom_model = CustomModel() \\\n", - " .withFeatureCollector(feature_collector) \\\n", - " .withInputTransformers(dft_normalisation) \\\n", - " .withTargetTransformer(target_transformer) \\\n", - " .withName(\"housing_predictor\")\n", + "You will find the URL under which the results are stored online in the log.\n", "\n", - "evaluator.fitModel(new_custom_model)\n", - "eval_stats = evaluator.evalModel(new_custom_model).getEvalStats()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "print(eval_stats.getAll())" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "eval_stats.plotScatterGroundTruthPredictions()\n", - "eval_stats.plotErrorDistribution()\n", - "eval_stats.plotHeatmapGroundTruthPredictions()\n", - "plt.show()\n", - "print(\"Demonstrating plotting capabilities\")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "If you missed the evaluation metrics in the log output, here they are:" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "eval_stats.plotScatterGroundTruthPredictions()\n", - "plt.show()" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Things we left out in this intro\n", - "\n", - " - Already implemented models and feature generators\n", - " - Caching (this is actually one of the central features)\n", - " - Support for ensembling and parallelization\n", - " - The local search and hyperopt modules, including grid-search, simulated-annealing and other stuff\n", - " - kNN and clustering implementations\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + "evalData.getEvalStats().metricsDict()" + ] } ], "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.8.13 ('sensai')", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/notebooks/neural_networks.ipynb b/notebooks/neural_networks.ipynb new file mode 100644 index 00000000..6461bba1 --- /dev/null +++ b/notebooks/neural_networks.ipynb @@ -0,0 +1,481 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "import sensai\n", + "import pandas as pd\n", + "import numpy as np\n", + "from typing import *\n", + "import config\n", + "import warnings\n", + "\n", + "cfg = config.get_config()\n", + "warnings.filterwarnings(\"ignore\")\n", + "sensai.util.logging.configureLogging()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neural Networks\n", + "\n", + "Neural networks being a very powerful class of models, especially in cases where the learning of representations from low-level information (such as pixels, audio samples or text) is key, sensAI provides many useful abstractions for dealing with this class of models, facilitating data handling, learning and evaluation.\n", + "\n", + "sensAI mainly provides abstractions for PyTorch, but there is also rudimentary support for TensorFlow.\n", + "\n", + "## Image Classification\n", + "\n", + "As an example use case, let us solve the classification problem of classifying digits in pixel images from the MNIST dataset. Images are greyscale (no colour information) and 28x28 pixels in size." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mnistDF = pd.read_csv(cfg.datafile_path(\"mnist_train.csv.zip\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data frame contains one column for every pixel, each pixel being represented by an 8-bit integer (0 to 255)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mnistDF.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the I/O data for our experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mnistIoData = sensai.InputOutputData.fromDataFrame(mnistDF, \"label\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have the image data separated from the labels, let's write a function to restore the 2D image arrays and take a look at some of the images." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def reshape2DImage(series):\n", + " return series.values.reshape(28, 28)\n", + "\n", + "fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(10, 5))\n", + "for i in range(5):\n", + " axs[i].imshow(reshape2DImage(mnistIoData.inputs.iloc[i]), cmap=\"binary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Applying Predefined Models\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create an evaluator in order to test the performance of our models, randomly splitting the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluatorParams = sensai.evaluation.VectorClassificationModelEvaluatorParams(fractionalSplitTestFraction=0.2)\n", + "evalUtil = sensai.evaluation.ClassificationEvaluationUtil(mnistIoData, evaluatorParams=evaluatorParams)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One pre-defined model we could try is a simple multi-layer perceptron. A PyTorch-based implementation is provided via class `MultiLayerPerceptronVectorClassificationModel`. This implementation supports CUDA-accelerated computations (on Nvidia GPUs), yet we shall stick to CPU-based computation (cuda=False) in this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sensai.torch\n", + "\n", + "nnOptimiserParams = sensai.torch.NNOptimiserParams(earlyStoppingEpochs=2, batchSize=54)\n", + "torchMLPModel = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel(hiddenDims=(50, 20), \n", + " cuda=False, normalisationMode=sensai.torch.NormalisationMode.MAX_ALL, \n", + " nnOptimiserParams=nnOptimiserParams, pDropout=0.0) \\\n", + " .withName(\"MLP\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Neural networks work best on **normalised inputs**, so we have opted to apply basic normalisation by specifying a normalisation mode which will transforms inputs by dividing by the maximum value found across all columns in the training data. For more elaborate normalisation options, we could have used a data frame transformer (DFT), particularly `DFTNormalisation` or `DFTSkLearnTransformer`.\n", + "\n", + "sensAI's default **neural network training algorithm** is based on early stopping, which involves checking, in regular intervals, the performance of the model on a validation set (which is split from the training set) and ultimately selecting the model that performed best on the validation set. You have full control over the loss evaluation method used to select the best model (by passing a respective `NNLossEvaluator` instance to NNOptimiserParams) as well as the method that is used to split the training set into the actual training set and the validation set (by adding a `DataFrameSplitter` to the model or using a custom `TorchDataSetProvider`).\n", + "\n", + "Given the vectorised nature of our MNIST dataset, we can apply any type of model which can accept the numeric inputs. Let's compare the neural network we defined above against another pre-defined model, which is based on a scikit-learn implementation and uses decision trees rather than neural networks." 
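Before that comparison, a brief illustration of the degree of control over the training procedure: the following is a minimal, hypothetical sketch (not part of the notebook) which varies the optimiser settings via `NNOptimiserParams` and compares two MLP configurations with the evaluation utility defined above. The hyperparameter values are purely illustrative.

```python
# Hypothetical sketch: two MLP variants that differ only in their training configuration,
# compared with the evaluation utility (evalUtil) defined above. The hyperparameter
# values are illustrative assumptions, not settings taken from the notebook.
import sensai.torch

mlpSmall = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel(
        hiddenDims=(50, 20), cuda=False, normalisationMode=sensai.torch.NormalisationMode.MAX_ALL,
        nnOptimiserParams=sensai.torch.NNOptimiserParams(earlyStoppingEpochs=2, batchSize=54)) \
    .withName("MLP-small")

mlpLarge = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel(
        hiddenDims=(200, 50), cuda=False, normalisationMode=sensai.torch.NormalisationMode.MAX_ALL,
        nnOptimiserParams=sensai.torch.NNOptimiserParams(optimiser=sensai.torch.Optimiser.ADAMW,
            optimiserLR=0.01, batchSize=1024, earlyStoppingEpochs=3)) \
    .withName("MLP-large")

evalUtil.compareModels([mlpSmall, mlpLarge])
```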
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "randomForestModel = sensai.sklearn.classification.SkLearnRandomForestVectorClassificationModel(min_samples_leaf=1, n_estimators=10) \\\n", + " .withName(\"RandomForest\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's compare the two models using our evaluation utility." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.compareModels([randomForestModel, torchMLPModel])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both models perform reasonably well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating a Custom CNN Model\n", + "\n", + "Given that this is an image recognition problem, it can be sensible to apply convolutional neural networks (CNNs), which can analyse patches of the image in order to generate more high-level features from them.\n", + "Specifically, we shall apply a neural network model which uses multiple convolutions, a max-pooling layer and a multi-layer perceptron at the end in order to produce the classification.\n", + "\n", + "For classification and regression, sensAI provides the fundamental classes `TorchVectorClassificationModel` and `TorchVectorRegressionModel` respectively. Ultimately, these classes will wrap an instance of `torch.nn.Module`, the base class for neural networks in PyTorch.\n", + "\n", + "#### Wrapping a Custom torch.nn.Module Instance\n", + "\n", + "If we already had an implementation of a `torch.nn.Module`, it can be straightforwardly adapted to become a sensAI ``VectorModel``.\n", + "\n", + "Let's say we had the following implementation of a torch module, which performs the steps described above.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "class MnistCnnModule(torch.nn.Module):\n", + " def __init__(self, imageDim: int, outputDim: int, numConv: int, kernelSize: int, poolingKernelSize: int, \n", + " mlpHiddenDims: Sequence[int], outputActivationFn: sensai.torch.ActivationFunction, pDropout=0.0):\n", + " super().__init__()\n", + " k = kernelSize\n", + " p = poolingKernelSize\n", + " self.cnn = torch.nn.Conv2d(1, numConv, (k, k))\n", + " self.pool = torch.nn.MaxPool2d((p, p))\n", + " self.dropout = torch.nn.Dropout(p=pDropout)\n", + " reducedDim = (imageDim-k+1)/p\n", + " if int(reducedDim) != reducedDim:\n", + " raise ValueError(f\"Pooling kernel size {p} is not a divisor of post-convolution dimension {imageDim-k+1}\")\n", + " self.mlp = sensai.torch.models.MultiLayerPerceptron(numConv * int(reducedDim)**2, outputDim, mlpHiddenDims,\n", + " outputActivationFn=outputActivationFn.getTorchFunction(),\n", + " hidActivationFn=sensai.torch.ActivationFunction.RELU.getTorchFunction(),\n", + " pDropout=pDropout)\n", + "\n", + " def forward(self, x):\n", + " x = self.cnn(x.unsqueeze(1))\n", + " x = self.pool(x)\n", + " x = x.view(x.shape[0], -1)\n", + " x = self.dropout(x)\n", + " return self.mlp(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since this module requires 2D images as input, we will need a component that transforms the vector input that is given in our data frame into a tensor that will serve as input to the module.\n", + "In sensAI, the abstraction for this purpose is a ``sensai.torch.Tensoriser``. 
A **Tensoriser** can, in principle, perform arbitrary computations in order to produce, from a data frame with N rows, one or more tensors of length N (first dimension equal to N) that will ultimately be fed to the neural network.\n", + "\n", + "Luckily, for the case at hand, we already have the function ``reshape2DImage`` from above to assist in the implementation of the tensoriser." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ImageReshapingInputTensoriser(sensai.torch.RuleBasedTensoriser):\n", + " def _tensorise(self, df: pd.DataFrame) -> Union[torch.Tensor, List[torch.Tensor]]:\n", + " images = [reshape2DImage(row) for _, row in df.iterrows()]\n", + " return torch.tensor(np.stack(images)).float() / 255" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we derived the class from ``RuleBasedTensorised`` rather than ``Tensoriser``, because our tensoriser does not require fitting. We additionally took care of the normalisation.\n", + "\n", + "Now we have all we need to create a sensAI ``TorchVectorClassificationModel`` that will work on the input/output data we loaded earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cnnModule = MnistCnnModule(28, 10, 32, 5, 2, (200, 20), sensai.torch.ActivationFunction.LOG_SOFTMAX)\n", + "nnOptimiserParams = sensai.torch.NNOptimiserParams(optimiser=sensai.torch.Optimiser.ADAMW, optimiserLR=0.01, batchSize=1024, \n", + " earlyStoppingEpochs=3)\n", + "cnnModelFromModule = sensai.torch.TorchVectorClassificationModel.fromModule(\n", + " cnnModule, sensai.torch.ClassificationOutputMode.LOG_PROBABILITIES, \n", + " cuda=False, nnOptimiserParams=nnOptimiserParams) \\\n", + " .withInputTensoriser(ImageReshapingInputTensoriser()) \\\n", + " .withName(\"CNN\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have now fully defined all the necessary parameters, including parameters controlling the training of the model.\n", + "\n", + "We are now ready to evaluate the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evalUtil.performSimpleEvaluation(cnnModelFromModule);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Creating an Input-/Output-Adaptive Custom Model\n", + "\n", + "While the above approach allows us to straightforwardly encapsulate a ``torch.nn.Module``, it really doesn't follow sensAI's principle of adapting model hyperparameters based on the inputs and outputs we receive during training - whenever possible. Notice that in the above example, we had to hard-code the image dimension (``28``) as well as the number of classes (``10``), even though these parameters could have been easily determined from the data. Especially in other domains where feature engineering is possible, we might want to experiment with different combinations of features, and therefore automatically adapting to inputs is key if we want to avoid editing the model hyperparameters time and time again; similarly, we might change the set of target labels in our classification problem and the model should simply adapt to a changed output dimension.\n", + "\n", + "To design a model that can fully adapt to the inputs and outputs, we can simply subclass ``TorchVectorClassificationModel``, where the late instantiation of the underlying model is catered for. 
Naturally, delayed construction of the underlying model necessitates the use of factories and thus results in some indirections. \n", + "\n", + "If we had designed the above model to be within the sensAI ``VectorModel`` realm from the beginning, here's what we might have written:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "class CnnModel(sensai.torch.TorchVectorClassificationModel):\n", + " def __init__(self, cuda: bool, kernelSize: int, numConv: int, poolingKernelSize: int, mlpHiddenDims: Sequence[int], \n", + " nnOptimiserParams: sensai.torch.NNOptimiserParams, pDropout=0.0):\n", + " self.cuda = cuda\n", + " self.outputActivationFn = sensai.torch.ActivationFunction.LOG_SOFTMAX\n", + " self.kernelSize = kernelSize\n", + " self.numConv = numConv\n", + " self.poolingKernelSize = poolingKernelSize\n", + " self.mlpHiddenDims = mlpHiddenDims\n", + " self.pDropout = pDropout\n", + " super().__init__(sensai.torch.ClassificationOutputMode.forActivationFn(self.outputActivationFn),\n", + " modelClass=self.VectorTorchModel, modelArgs=[self], nnOptimiserParams=nnOptimiserParams)\n", + "\n", + " class VectorTorchModel(sensai.torch.VectorTorchModel):\n", + " def __init__(self, parent: \"CnnModel\"):\n", + " super().__init__(parent.cuda)\n", + " self._parent = parent\n", + "\n", + " def createTorchModuleForDims(self, inputDim: int, outputDim: int) -> torch.nn.Module:\n", + " return self.Module(int(np.sqrt(inputDim)), outputDim, self._parent)\n", + "\n", + " class Module(torch.nn.Module):\n", + " def __init__(self, imageDim, outputDim, parent: \"CnnModel\"):\n", + " super().__init__()\n", + " k = parent.kernelSize\n", + " p = parent.poolingKernelSize\n", + " self.cnn = torch.nn.Conv2d(1, parent.numConv, (k, k))\n", + " self.pool = torch.nn.MaxPool2d((p, p))\n", + " self.dropout = torch.nn.Dropout(p=parent.pDropout)\n", + " reducedDim = (imageDim-k+1)/p\n", + " if int(reducedDim) != reducedDim:\n", + " raise ValueError(f\"Pooling kernel size {p} is not a divisor of post-convolution dimension {imageDim-k+1}\")\n", + " self.mlp = sensai.torch.models.MultiLayerPerceptron(parent.numConv * int(reducedDim)**2, outputDim, parent.mlpHiddenDims,\n", + " outputActivationFn=parent.outputActivationFn.getTorchFunction(),\n", + " hidActivationFn=sensai.torch.ActivationFunction.RELU.getTorchFunction(),\n", + " pDropout=parent.pDropout)\n", + "\n", + " def forward(self, x):\n", + " x = self.cnn(x.unsqueeze(1))\n", + " x = self.pool(x)\n", + " x = x.view(x.shape[0], -1)\n", + " x = self.dropout(x)\n", + " return self.mlp(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is only insignificantly more code than in the previous implementation.\n", + "The outer class, which provides the sensAI `VectorModel` features, serves mainly to hold the parameters, and the inner class inheriting from `VectorTorchModel` serves as a factory for the `torch.nn.Module`, providing us with the input and output dimensions (number of input columns and number of classes respectively) based on the data, thus enabling the model to adapt. If we had required even more adaptiveness, we could have learnt more about the data from within the fitting process of a custom input tensoriser (i.e. we could have added an inner ``Tensoriser`` class, which could have derived further hyperparameters from the data in its implementation of the fitting method.)\n", + "\n", + "Let's instantiate our model and evaluate it." 
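As a brief, purely illustrative aside on the preceding remark about adaptive tensorisers: a fitted input tensoriser might look roughly like the sketch below. It is not part of the notebook, and in particular the name and signature of the fitting hook (`fit`) on `sensai.torch.Tensoriser` are assumptions; the notebook itself only demonstrates the fit-free `RuleBasedTensoriser`.

```python
# Hypothetical sketch of a fitted input tensoriser that derives a hyperparameter
# (the square image dimension) from the training data. The fitting hook name `fit`
# is an assumption about the Tensoriser interface and is not taken from this notebook.
import numpy as np
import pandas as pd
import torch
import sensai.torch

class FittedImageReshapingTensoriser(sensai.torch.Tensoriser):
    def __init__(self):
        self.imageDim = None

    def fit(self, df: pd.DataFrame, model=None):
        # infer the (square) image dimension from the number of input columns
        self.imageDim = int(np.sqrt(df.shape[1]))

    def _tensorise(self, df: pd.DataFrame) -> torch.Tensor:
        # reshape each row into a 2D image and scale pixel values to [0, 1]
        images = [row.values.reshape(self.imageDim, self.imageDim) for _, row in df.iterrows()]
        return torch.tensor(np.stack(images)).float() / 255
```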
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cnnModel = CnnModel(cuda=False, kernelSize=5, numConv=32, poolingKernelSize=2, mlpHiddenDims=(200,20),\n", + " nnOptimiserParams=nnOptimiserParams) \\\n", + " .withName(\"CNN'\") \\\n", + " .withInputTensoriser(ImageReshapingInputTensoriser())\n", + "\n", + "evalData = evalUtil.performSimpleEvaluation(cnnModel)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our CNN models do improve upon the MLP model we evaluated earlier. Let's do a comparison of all the models we trained thus far:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comparisonData = evalUtil.compareModels([torchMLPModel, cnnModelFromModule, cnnModel, randomForestModel], fitModels=False)\n", + "comparisonData.resultsDF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that any differences between the two CNN models are due only to randomness in the parameter initialisation; they are functionally identical.\n", + "\n", + "Could the CNN model have produced even better results? Let's take a look at some examples where the CNN model went wrong by inspecting the evaluation data that was returned earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "misclassified = evalData.getMisclassifiedTriplesPredTrueInput()\n", + "fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(9,9))\n", + "for i, (predClass, trueClass, input) in enumerate(misclassified[:9]):\n", + " axs[i//3][i%3].imshow(reshape2DImage(input), cmap=\"binary\")\n", + " axs[i//3][i%3].set_title(f\"{trueClass} misclassified as {predClass}\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While some of these examples are indeed ambiguous, there still is room for improvement." + ] + } + ], + "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, + "kernelspec": { + "display_name": "Python 3.8.13 ('sensai')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Tensor Models with Torchlightning.ipynb b/notebooks/tensor_models_pytorch_lightning.ipynb similarity index 76% rename from notebooks/Tensor Models with Torchlightning.ipynb rename to notebooks/tensor_models_pytorch_lightning.ipynb index 1dd3071f..8b5fb9dc 100644 --- a/notebooks/Tensor Models with Torchlightning.ipynb +++ b/notebooks/tensor_models_pytorch_lightning.ipynb @@ -1,35 +1,14 @@ { "cells": [ { - "cell_type": "markdown", - "source": [ - "# Tensor Models with PyTorch-Lightning\n", - "\n", - "In this notebook we show how sensAI's TensorModel wrappers can be used together with pytorch-lightning models\n", - "and trainers for even faster development and experimentation." 
- ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "### Before running the notebook\n", - "\n", - "Install the package and its dependencies, if you haven't done so already. E.g. for an editable install call\n", - "```\n", - "pip install -e .\n", - "```\n", - "from the root directory. You can also execute this command directly in the notebook but will need to reload the\n", - "kernel afterwards\n" - ], - "metadata": { - "collapsed": false - } + "%load_ext autoreload\n", + "%autoreload 2" + ] }, { "cell_type": "code", @@ -37,16 +16,28 @@ "metadata": {}, "outputs": [], "source": [ - "# Note - this cell should be executed only once per session\n", - "\n", - "%load_ext autoreload\n", - "%autoreload 2\n", + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "import sensai\n", + "import logging\n", + "import config\n", "\n", - "import sys, os\n", + "c = config.get_config(reload=True)\n", + "sensai.util.logging.configureLogging(level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Tensor Models with PyTorch-Lightning\n", "\n", - "# in order to get the top level modules; they are not part of the package\n", - "os.chdir(\"..\")\n", - "sys.path.append(os.path.abspath(\".\"))" + "In this notebook we show how sensAI's TensorModel wrappers can be used together with pytorch-lightning models\n", + "and trainers for even faster development and experimentation." ] }, { @@ -60,6 +51,7 @@ "from torch.nn import functional as F\n", "import pytorch_lightning as pl\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", "from sensai.data import InputOutputArrays, DataSplitterFractional\n", "\n", @@ -76,18 +68,18 @@ }, { "cell_type": "markdown", - "source": [ - "## Loading the Data\n", - "\n", - "Unlike in the mnist-based torch-lightning tutorial, here we will load the data in a more \"realistic\" way,\n", - "namely with pandas from disc." - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "## Loading the Data\n", + "\n", + "Unlike in the mnist-based torch-lightning tutorial, here we will load the data in a more \"realistic\" way,\n", + "namely with pandas from disc." + ] }, { "cell_type": "code", @@ -100,7 +92,7 @@ "outputs": [], "source": [ "X = pd.read_csv(c.datafile_path(\"mnist_train.csv.zip\"))\n", - "labels = pd.DataFrame(X.pop(\"label\"))\n", + "labels = pd.DataFrame(X.pop(\"label\")).astype(np.int64)\n", "X = X.values.reshape(len(X), 28, 28) / 2 ** 8\n", "X = pd.DataFrame({\"mnist_image\": list(X)}, index=labels.index)\n", "\n", @@ -115,57 +107,67 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "## Using Data Loaders in pure PyTorch Lightning\n", + "## Using Data Loaders in Pure PyTorch Lightning\n", "\n", "First, let us see how training would proceed in pure pytorch-lightning.\n", "\n", "We will use sensaAI only for obtaining torch data loaders (which otherwise would require a few more lines of code)\n", "by transforming the data frames to arrays, splitting them and converting them to loaders." 
- ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { - "name": "#%% md\n" + "name": "#%%\n" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ + "TEST_FRACTION = 0.2\n", "VALIDATION_FRACTION = 0.1\n", "\n", "full_ds = InputOutputArrays(extractArray(X), extractArray(labels))\n", - "splitter = DataSplitterFractional(1-VALIDATION_FRACTION)\n", "\n", - "train_ds, val_ds = splitter.split(full_ds)\n", + "full_train_ds, test_ds = DataSplitterFractional(1-VALIDATION_FRACTION).split(full_ds)\n", + "train_ds, val_ds = DataSplitterFractional(1-VALIDATION_FRACTION).split(full_train_ds)\n", "train_dataloader = train_ds.toTorchDataLoader()\n", - "val_dataloader = val_ds.toTorchDataLoader()" - ], + "val_dataloader = val_ds.toTorchDataLoader()\n", + "test_dataloader = test_ds.toTorchDataLoader()" + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", "source": [ "Now that we have the data loaders, let us forget about sensAI for the moment. We create the model declaration and\n", "trainer with pytorch-lightning and fit on the MNIST data" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { - "name": "#%% md\n" + "name": "#%%\n" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "class MNISTModel(pl.LightningModule):\n", "\n", @@ -176,79 +178,88 @@ " def forward(self, x: torch.Tensor):\n", " x = x.float()\n", " x = torch.relu(self.l1(x.view(x.size(0), -1)))\n", - " return F.softmax(x, dim=1)\n", + " return F.log_softmax(x, dim=1)\n", "\n", " def training_step(self, batch, *args):\n", " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", + " loss = F.nll_loss(self(x), y)\n", " return loss\n", "\n", " def validation_step(self, batch, *args):\n", " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", + " loss = F.nll_loss(self(x), y)\n", " return loss\n", "\n", " def configure_optimizers(self):\n", " return torch.optim.Adam(self.parameters(), lr=0.02)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", + "outputs": [], "source": [ "mnist_model = MNISTModel()\n", "\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20)\n", + "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20)\n", "trainer.fit(mnist_model, train_dataloader, val_dataloader)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "Let us pick some images from the validation set and look at the results" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "Let us pick some images from the validation set and look at the results" + ] }, { "cell_type": "code", - "source": [ - "mini_test_set = val_dataloader.dataset[10:20]\n", - "test_images, test_labels = mini_test_set\n", - "\n", - "display(mnist_model(test_images).argmax(axis=1))\n", - "display(test_labels)" - ], + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, + "outputs": [], + "source": [ + "mini_test_set = test_dataloader.dataset[10:20]\n", + "test_images, test_labels 
= mini_test_set\n", + "\n", + "display(mnist_model(test_images).argmax(axis=1))\n", + "display(test_labels)" + ] + }, + { + "cell_type": "code", "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "\n", + "sklearn.metrics.accuracy_score(test_ds.outputs, mnist_model(test_dataloader.dataset[:][0]).argmax(axis=1))" + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Wrapping the Model with sensAI\n", "\n", @@ -257,122 +268,138 @@ "\n", "This model maps a tensor to a single label, so the correct class to wrap it with is `PLTensorToScalarClassificationModel`,\n", "where the `PL` prefix stands for pytorch-lightning." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "mnist_model = MNISTModel()\n", "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20)\n", "sensaiMnistModel = PLTensorToScalarClassificationModel(mnist_model, trainer, validationFraction=VALIDATION_FRACTION)" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "NB: Even without dedicated wrappers, it would require only a few more lines of code to get a custom implementation of\n", "a suitable sensAI base class that wraps one's model." - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "With the wrapped model, we can fit directly on the data frames. We don't lose any of the niceties that pytorch-lightning\n", "brings to the game (both the original model and the trainer are available in `sensaiMnistModel`). By wrapping the\n", "model and trainer we gain all the safety, transparency, flexibility in feature engineering as well\n", "as extensive support for model evaluation that sensAI is all about." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "sensaiMnistModel.fit(X, labels)" - ], + "display(labels.dtypes)\n", + "np.stack(np.stack(labels.values, axis=1).squeeze(), axis=0).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "ioData = sensai.InputOutputData(X, labels)\n", + "trainData, testData = DataSplitterFractional(0.8).split(ioData)\n", + "\n", + "sensaiMnistModel.fitInputOutputData(trainData)" + ] }, { "cell_type": "markdown", - "source": [ - "The wrapped model performs predictions on data frames. Let us take some points from the training set,\n", - "perform a prediction on them and have a look at the true labels" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "The wrapped model performs predictions on data frames. 
Let us take some points from the training set,\n", + "perform a prediction on them and have a look at the true labels" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "display(\"Predicted data frame\")\n", - "display(sensaiMnistModel.predict(X.iloc[:10]))\n", - "display(\"True labels data frame\")\n", - "display(labels.iloc[:10])" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "display(\"Predicted data frame\")\n", + "display(sensaiMnistModel.predict(testData.inputs.iloc[:10]))\n", + "display(\"True labels data frame\")\n", + "display(testData.outputs.iloc[:10])" + ] }, { "cell_type": "markdown", - "source": [ - "## Evaluating Tensor Models\n", - "\n", - "TODO - the evaluation part is unfinished yet (although we could already the above classifier with the standard\n", - "vector model evaluators).\n", - "We should also include TensorToTensor models here and show how to evaluate them\n" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "## Evaluating the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator = sensai.evaluation.VectorClassificationModelEvaluator(trainData, testData)\n", + "evaluator.evalModel(sensaiMnistModel).getEvalStats().metricsDict()" + ] } ], "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.9 ('sensai')", "language": "python", "name": "python3" }, @@ -386,9 +413,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/notebooks/test_notebooks.py b/notebooks/test_notebooks.py index 52948be9..af304e51 100644 --- a/notebooks/test_notebooks.py +++ b/notebooks/test_notebooks.py @@ -1,17 +1,28 @@ import logging import os +import pathlib +import re import nbformat import pytest from nbconvert.preprocessors import ExecutePreprocessor -NOTEBOOKS_DIR = "notebooks" -DOCS_DIR = "docs" -resources = {"metadata": {"path": NOTEBOOKS_DIR}} +ROOT_DIR = pathlib.Path(__file__).parent.parent.absolute() +DOCS_DIR = ROOT_DIR / "docs" +NOTEBOOKS_DIR = ROOT_DIR / "notebooks" log = logging.getLogger(__name__) +def notebooksUsedInDocs(): + with open(ROOT_DIR / "docs/index.rst", "r") as f: + content = f.read() + return re.findall(r"\s(\w+\.ipynb)", content) + + +NOTEBOOKS_TO_COPY = notebooksUsedInDocs() + + class LoggingExecutePreprocessor(ExecutePreprocessor): def __init__(self, notebookName, **kw): self._notebookName = notebookName @@ -26,15 +37,18 @@ def preprocess_cell(self, cell, resources, index): "notebook", [file for file in os.listdir(NOTEBOOKS_DIR) if file.endswith(".ipynb")] ) def test_notebook(notebook): - notebook_path = os.path.join(NOTEBOOKS_DIR, notebook) + notebook_path = NOTEBOOKS_DIR / notebook log.info(f"Reading jupyter notebook from {notebook_path}") with open(notebook_path) as f: nb = nbformat.read(f, as_version=4) ep = LoggingExecutePreprocessor(notebook, timeout=600) - ep.preprocess(nb, resources=resources) + ep.preprocess(nb, resources={"metadata": {"path": str(NOTEBOOKS_DIR)}}) # saving the executed notebook to docs - output_path = os.path.join(DOCS_DIR, notebook) - log.info(f"Saving executed notebook to {output_path} for documentation purposes") - 
with open(output_path, "w", encoding="utf-8") as f: - nbformat.write(nb, f) + if notebook in NOTEBOOKS_TO_COPY: + output_path = os.path.join(DOCS_DIR, notebook) + log.info(f"Saving executed notebook to {output_path} for documentation purposes") + with open(output_path, "w", encoding="utf-8") as f: + nbformat.write(nb, f) + else: + log.info(f"Notebook {notebook} is not used in docs; not copied") \ No newline at end of file diff --git a/notebooks/Tracking Experiments.ipynb b/notebooks/tracking_experiments.ipynb similarity index 77% rename from notebooks/Tracking Experiments.ipynb rename to notebooks/tracking_experiments.ipynb index fa8f0b95..d5855071 100644 --- a/notebooks/Tracking Experiments.ipynb +++ b/notebooks/tracking_experiments.ipynb @@ -1,5 +1,30 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys; sys.path.extend([\"../src\", \"..\"])\n", + "import sensai\n", + "import logging\n", + "import config\n", + "\n", + "c = config.get_config(reload=True)\n", + "sensai.util.logging.configureLogging(level=logging.INFO)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -9,7 +34,7 @@ } }, "source": [ - "# Tracking sensAI experiments\n", + "# Tracking Experiments\n", "\n", "In this notebook we will demonstrate how to use sensAI's tracking utilities with evaluators\n", "and parameter sweeps. Several backends are supported and it is very easy to write a new custom adapter\n", @@ -21,167 +46,152 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "# Note - this cell should be executed only once per session\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import sys, os\n", - "\n", - "# in order to get the config, it is not part of the library\n", - "os.chdir(\"..\")\n", - "sys.path.append(os.path.abspath(\".\"))" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ - "import geopandas as gp\n", + "import os\n", "\n", - "from sensai.hyperopt import GridSearch\n", + "import geopandas as gp\n", "from sklearn.cluster import DBSCAN\n", - "import logging\n", "\n", - "from sensai.geoanalytics.coordinate_clustering import SkLearnCoordinateClustering\n", + "from sensai.hyperopt import GridSearch\n", + "from sensai.geoanalytics.geopandas.coordinate_clustering import SkLearnCoordinateClustering\n", "from sensai.evaluation.evaluator_clustering import ClusteringModelSupervisedEvaluator\n", - "from sensai.geoanalytics.coordinate_clustering_ground_truth import PolygonAnnotatedCoordinates\n", + "from sensai.geoanalytics.geopandas.coordinate_clustering_ground_truth import PolygonAnnotatedCoordinates\n", "from sensai.tracking.clearml_tracking import ClearMLExperiment\n", "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from config import get_config\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "c = get_config(reload=True)" - ], + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Evaluators\n", "\n", "The main entrypoint to reproducible experiments is the evaluator api. 
We will use clustering evaluation for\n", "demonstration purposes. We load the data and create a SupervisedClusteringEvaluator, see\n", - "[intro to evaluation](Clustering%20Evaluation.ipynb) for more details.\n", + "[intro to evaluation](clustering_evaluation.ipynb) for more details.\n", "\n", "[comment]: <> (TODO - use some VectorModel with an sklearn dataset instead, move the notebook to sensAI repo)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "# loading the data and ground truth labels\n", "sampleFile = c.datafile_path(\"sample\", stage=c.RAW) # this can point to a directory or a shp/geojson file\n", "sampleGeoDF = gp.read_file(sampleFile)\n", "groundTruthClusters = PolygonAnnotatedCoordinates(sampleGeoDF, c.datafile_path(\"sample\", stage=c.GROUND_TRUTH))" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "# creating the evaluator\n", "groundTruthCoordinates, groundTruthLabels = groundTruthClusters.getCoordinatesLabels()\n", "supervisedEvaluator = ClusteringModelSupervisedEvaluator(groundTruthCoordinates, trueLabels=groundTruthLabels)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "markdown", - "source": [ - "### Setup tracking\n", - "\n", - "Now comes the new part - we create a tracking experiment and set it in the evaluator" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "### Setup tracking\n", + "\n", + "Now comes the new part - we create a tracking experiment and set it in the evaluator" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "experiment = ClearMLExperiment(projectName=\"Demos\", taskName=\"notebook_experiment\")\n", - "supervisedEvaluator.setTrackedExperiment(experiment)" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "def createExperiment(projectName, taskName):\n", + " try:\n", + " return ClearMLExperiment(projectName=projectName, taskName=taskName)\n", + " except: # allow to run in contexts without ClearML credentials\n", + " return None\n", + "\n", + "experiment = createExperiment(projectName=\"Demos\", taskName=\"notebook_experiment\")\n", + "supervisedEvaluator.setTrackedExperiment(experiment)" + ] }, { "cell_type": "markdown", - "source": [ - "As simple as that! Whenever we perform an evaluation, the results will be tracked. Depending on\n", - "the backend and the particular implementation of the experiment, the code and other information\n", - "like images will get tracked as well. We will demonstrated the tracking of the evaluation of a dbscan." - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "As simple as that! Whenever we perform an evaluation, the results will be tracked. Depending on\n", + "the backend and the particular implementation of the experiment, the code and other information\n", + "like images will get tracked as well. We will demonstrated the tracking of the evaluation of a dbscan." 
+ ] }, { "cell_type": "code", - "source": [ - "boundedDbscan = SkLearnCoordinateClustering(DBSCAN(eps=150, min_samples=20), minClusterSize=100)\n", - "supervisedEvaluator.computeMetrics(boundedDbscan)" - ], + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "boundedDbscan = SkLearnCoordinateClustering(DBSCAN(eps=150, min_samples=20), minClusterSize=100)\n", + "supervisedEvaluator.computeMetrics(boundedDbscan)" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "# plots are tracked automatically on creation.\n", @@ -191,67 +201,68 @@ "ax.set_title(\"Sample Ground Truth clusters\")\n", "groundTruthClusters.plot(includeNoise=False, markersize=0.2, cmap=\"plasma\", ax=ax)\n", "fig.show()" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=[6, 8])\n", "ax.set_title(\"Predicted clusters\")\n", "boundedDbscan.plot(includeNoise=False, markersize=0.2, cmap=\"plasma\", ax=ax, figsize=10)\n", "fig.show()\n" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "# We can also add the summaries df to the experiment through explicit tracking\n", "\n", - "logger = supervisedEvaluator.trackedExperiment.logger\n", + "if supervisedEvaluator.trackedExperiment:\n", + " logger = supervisedEvaluator.trackedExperiment.logger\n", "\n", - "logger.report_table(title=\"Clusters Summaries\", series=\"pandas DataFrame\", iteration=0,\n", - " table_plot=boundedDbscan.summaryDF().sort_values(\"numMembers\"))" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + " logger.report_table(title=\"Clusters Summaries\", series=\"pandas DataFrame\", iteration=0,\n", + " table_plot=boundedDbscan.summaryDF().sort_values(\"numMembers\"))" + ] }, { "cell_type": "markdown", - "source": [ - "The same mechanism works in the hyperopts module. The experiment can be set for GridSearch\n", - "or simulated annealing. One can also set the experiment in the evaluator that is passed to\n", - "the hyperopt objects and use that one for tracking instead. Here an example\n" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "The same mechanism works in the hyperopts module. The experiment can be set for GridSearch\n", + "or simulated annealing. One can also set the experiment in the evaluator that is passed to\n", + "the hyperopt objects and use that one for tracking instead. 
Here an example\n" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "# because of how trains works and because we are using it in jupyter, we need to manually close the existing task\n", @@ -259,8 +270,9 @@ "# this step is unnecessary if one has one experiment per script execution\n", "# we also unset the tracked experiment in the evaluator and prepare a new one for the grid search\n", "\n", - "supervisedEvaluator.trackedExperiment.task.close()\n", - "supervisedEvaluator.unsetTrackedExperiment()\n", + "if supervisedEvaluator.trackedExperiment:\n", + " supervisedEvaluator.trackedExperiment.task.close()\n", + " supervisedEvaluator.unsetTrackedExperiment()\n", "\n", "\n", "def dbscanFactory(**kwargs):\n", @@ -271,70 +283,68 @@ " \"eps\": [50, 150]\n", "}\n", "\n", + "gridExperiment = createExperiment(projectName=\"Demos\", taskName=\"notebook_grid_search\")\n", "dbscanGridSearch = GridSearch(dbscanFactory, parameterOptions,\n", " csvResultsPath=os.path.join(c.temp, \"dbscanGridSearchCsv\"))\n", - "gridExperiment = ClearMLExperiment(projectName=\"Demos\", taskName=\"notebook_grid_search\")\n", "dbscanGridSearch.setTrackedExperiment(gridExperiment)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "searchResults = dbscanGridSearch.run(supervisedEvaluator, sortColumnName=\"numClusters\")" + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "searchResults = dbscanGridSearch.run(supervisedEvaluator, sortColumnName=\"numClusters\")" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "# unfortunately, the trains experiment interface is at conflict with the grid search\n", "# the most pragmatic solution is to simply attach the dataframe to the experiment and to use it for further evaluation\n", "\n", - "dbscanGridSearch.trackedExperiment.logger.report_table(title=\"Results\", series=\"pandas DataFrame\", iteration=0,\n", - " table_plot=searchResults)\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "if dbscanGridSearch.trackedExperiment:\n", + " dbscanGridSearch.trackedExperiment.logger.report_table(title=\"Results\", series=\"pandas DataFrame\", iteration=0,\n", + " table_plot=searchResults)\n" + ] } ], "metadata": { + "interpreter": { + "hash": "9b3442ae4bdb9561e722e28424c33a03c16d40b3aa50369b79d367cad7b1adea" + }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.8.13 ('sensai')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/requirements-dev.txt b/requirements-dev.txt index fc4f6fcd..5d623e77 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ +# This file has been superseded by environment.yml; use conda to install a developer environment PyQt5==5.14.1 psutil==5.6.7 torch==1.4.0 diff --git a/requirements-relaxed.txt b/requirements-relaxed.txt new 
file mode 100644 index 00000000..b496490c --- /dev/null +++ b/requirements-relaxed.txt @@ -0,0 +1,7 @@ +# this is a relaxed version of requirements.txt where newer versions are allowed +pandas>=1.0.0 +scipy>=1.4 +numpy>=1.18.0 +scikit-learn>=0.22.0 +seaborn>=0.11.0 +typing-extensions>=3.7 diff --git a/requirements.txt b/requirements.txt index 9cd84cbe..86cccaa9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ +# sensAI main requirements +# These are the pinned lowest versions we test; setup.py relaxes them, changing them to lower bounds pandas~=1.0.0 scipy~=1.4 numpy~=1.18.0 scikit-learn~=0.22.0 -seaborn~=0.10.0 +seaborn~=0.11.0 typing-extensions~=3.7 diff --git a/requirements_geoanalytics.txt b/requirements_geoanalytics.txt new file mode 100644 index 00000000..6395d261 --- /dev/null +++ b/requirements_geoanalytics.txt @@ -0,0 +1,6 @@ +# Requirements for extra 'geoanalytics' +# These are the pinned lowest versions we test; setup.py relaxes them, changing them to lower bounds +networkx==2.4 +Shapely~=1.7.0 +geopandas==0.7.0 +utm==0.7.0 diff --git a/requirements_lightgbm.txt b/requirements_lightgbm.txt new file mode 100644 index 00000000..cbe441d5 --- /dev/null +++ b/requirements_lightgbm.txt @@ -0,0 +1,3 @@ +# Requirements for extra 'lightgbm' +# These are the pinned lowest versions we test; setup.py relaxes them, changing them to lower bounds +lightgbm==2.3.0 diff --git a/requirements_tensorflow.txt b/requirements_tensorflow.txt new file mode 100644 index 00000000..dc5d55bb --- /dev/null +++ b/requirements_tensorflow.txt @@ -0,0 +1,3 @@ +# Requirements for extra 'tensorflow' +# These are the pinned lowest versions we test; setup.py relaxes them, changing them to lower bounds +tensorflow~=2.0 diff --git a/requirements_torch.txt b/requirements_torch.txt new file mode 100644 index 00000000..cf3fedf3 --- /dev/null +++ b/requirements_torch.txt @@ -0,0 +1,4 @@ +# Requirements for extra 'torch' +# These are the pinned lowest versions we test; setup.py relaxes them, changing them to lower bounds +torch==1.4.0 +torchtext==0.5.0 diff --git a/run_pytest_notebooks.sh b/run_pytest_notebooks.sh new file mode 100644 index 00000000..f1895b65 --- /dev/null +++ b/run_pytest_notebooks.sh @@ -0,0 +1,6 @@ +if ! 
git lfs pull; then + printf "\n\nERROR: git lfs pull failed\n\n" + exit +fi +export PYTHONPATH="`realpath src`" +pytest notebooks \ No newline at end of file diff --git a/run_pytest_tests.sh b/run_pytest_tests.sh new file mode 100644 index 00000000..ed02adaa --- /dev/null +++ b/run_pytest_tests.sh @@ -0,0 +1,2 @@ +export PYTHONPATH=src +pytest tests \ No newline at end of file diff --git a/setup.py b/setup.py index 4a6aea54..095c112a 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,45 @@ +import functools import re from typing import Iterable, Dict +from glob import glob from setuptools import setup, find_namespace_packages tf_requirements = ['tensorflow==1.15.0'] torch_requirements = ['torch==1.4.0', 'torchtext==0.5.0'] lightgbm_requirements = ['lightgbm==2.3.0'] -geoanalytics_requirements = ['networkx==2.4', 'Shapely~=1.7.0', 'geopandas==0.7.0'] +geoanalytics_requirements = ['networkx==2.4', 'Shapely~=1.7.0', 'geopandas==0.7.0', 'utm==0.7.0'] -# list of dependencies where ==/~= dependencies (used by us, particularly in requirements.txt) are relaxed: +# list of dependencies where ==/~= dependencies (used in requirements.txt and for the extras in requirements_*.txt) are relaxed: # any later version is OK (as long as we are not aware of a concrete limitation - and once we are, we shall define # the respective upper bound below) -DEPS_VERSION_LOWER_BOUND = ["pandas", "scipy", "numpy", "scikit-learn", "seaborn", "typing-extensions"] +DEPS_VERSION_LOWER_BOUND = [ + # main + "pandas", "scipy", "numpy", "scikit-learn", "seaborn", "typing-extensions", + # extra "torch" + "torch", "torchtext", + # extra "tensorflow" + "tensorflow", + # extra "lightgbm" + "lightgbm", + # extra "geoanalytics" + "networkx", "Shapely", "geopandas", "utm", +] +# upper bound: map dependency name to lowest exluded version DEPS_VERSION_UPPER_BOUND_EXCLUSIVE: Dict[str, str] = {} -def required_packages(deps: Iterable[str]): +def relaxed_requirements(deps: Iterable[str]): + """ + :param deps: the set of requirements + :return: the set of updated requirements with the relaxations defined above applied + """ updated_deps = [] for dep in deps: dep = dep.strip() + if dep.startswith("#"): + continue m = re.match(r'([\w-]+)[=~]=', dep) # match package with == or ~= version spec if m: package = m.group(1) @@ -32,6 +52,20 @@ def required_packages(deps: Iterable[str]): return updated_deps +def relaxed_requirements_from_file(path): + with open(path, "r") as f: + return relaxed_requirements(f.readlines()) + + +# create extras requirements from requirements_*.txt, and add "full" extras which combines them all +extras_require = {} +for extras_requirements_file in glob("requirements_*.txt"): + m = re.match(r"requirements_(\w+).txt", extras_requirements_file) + extra_name = m.group(1) + extras_require[extra_name] = relaxed_requirements_from_file(extras_requirements_file) +extras_require["full"] = functools.reduce(lambda x, y: x + y, list(extras_require.values())) + + setup( name='sensai', package_dir={"": "src"}, @@ -39,17 +73,11 @@ def required_packages(deps: Iterable[str]): url="https://github.com/jambit/sensAI", packages=find_namespace_packages(where="src"), include_package_data=True, - version='0.1.7', + version='0.1.8', description='Library for sensible AI', - install_requires=required_packages(open("requirements.txt").readlines()), + install_requires=relaxed_requirements_from_file("requirements.txt"), dependency_links=["https://download.pytorch.org/whl/torch_stable.html"], setup_requires=["wheel"], - extras_require={ - "torch": 
torch_requirements, - "tensorflow": tf_requirements, - "lightgbm": lightgbm_requirements, - "geoanalytics": geoanalytics_requirements, - "full": tf_requirements + torch_requirements + lightgbm_requirements + geoanalytics_requirements - }, + extras_require=extras_require, author='jambit GmbH' -) \ No newline at end of file +) diff --git a/src/sensai/__init__.py b/src/sensai/__init__.py index db7fe56e..3820c298 100644 --- a/src/sensai/__init__.py +++ b/src/sensai/__init__.py @@ -16,7 +16,7 @@ TensorToTensorClassificationModel, TensorToScalarClassificationModel from .vector_model import VectorModelBase, VectorModel, VectorRegressionModel, VectorClassificationModel -__version__ = "0.1.7" +__version__ = "0.1.8" # The following submodules are not imported by default to avoid necessarily requiring their dependencies: # tensorflow diff --git a/src/sensai/clustering/greedy_clustering.py b/src/sensai/clustering/greedy_clustering.py index cfb43165..475f0ae7 100644 --- a/src/sensai/clustering/greedy_clustering.py +++ b/src/sensai/clustering/greedy_clustering.py @@ -8,7 +8,6 @@ log = logging.getLogger(__name__) -# TODO: implement an adapter or wrapper such that we get a GreedyAgglomerativeClusteringModel class class GreedyAgglomerativeClustering(object): """ An implementation of greedy agglomerative clustering which avoids unnecessary @@ -69,7 +68,7 @@ def applyClustering(self) -> List[Cluster]: clusters merged into them) """ # compute all possible merges, adding them to the priority queue - self.log.info("Computing initial merges") + self.log.debug("Computing initial merges") for idx, wc in enumerate(self.wrappedClusters): self.log.debug("Computing potential merges for cluster index %d" % idx) wc.computeMerges(False) diff --git a/src/sensai/data.py b/src/sensai/data.py index f9a6713a..7f2d5091 100644 --- a/src/sensai/data.py +++ b/src/sensai/data.py @@ -1,10 +1,15 @@ +import logging from abc import ABC, abstractmethod from typing import Tuple, Sequence, TypeVar, List, Generic import numpy as np import pandas as pd import scipy.stats +from sklearn.model_selection import StratifiedShuffleSplit +from sensai.util.string import ToStringMixin + +log = logging.getLogger(__name__) T = TypeVar("T") @@ -47,11 +52,27 @@ def toTorchDataLoader(self, batchSize=64, shuffle=True): return DataLoader(dataSet, batch_size=batchSize, shuffle=shuffle) -# TODO: Rename to InputOutputDataFrames when the time for breaking changes has come -class InputOutputData(BaseInputOutputData[pd.DataFrame]): +class InputOutputData(BaseInputOutputData[pd.DataFrame], ToStringMixin): + """ + Holds input and output data for learning problems + """ def __init__(self, inputs: pd.DataFrame, outputs: pd.DataFrame): super().__init__(inputs, outputs) + def _toStringObjectInfo(self) -> str: + return f"N={len(self.inputs)}, numInputColumns={len(self.inputs.columns)}, numOutputColumns={len(self.outputs.columns)}" + + @classmethod + def fromDataFrame(cls, df: pd.DataFrame, *outputColumns: str) -> "InputOutputData": + """ + :param df: a data frame containing both input and output columns + :param outputColumns: the output column name(s) + :return: an InputOutputData instance with inputs and outputs separated + """ + inputs = df[[c for c in df.columns if c not in outputColumns]] + outputs = df[list(outputColumns)] + return cls(inputs, outputs) + def filterIndices(self, indices: Sequence[int]) -> __qualname__: inputs = self.inputs.iloc[indices] outputs = self.outputs.iloc[indices] @@ -134,18 +155,41 @@ def split(self, data: InputOutputData) -> 
Tuple[InputOutputData, InputOutputData return A, B +class DataSplitterFromSkLearnSplitter(DataSplitter): + def __init__(self, skLearnSplitter): + """ + :param skLearnSplitter: an instance of one of the splitter classes from sklearn.model_selection, + see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection + """ + self.skLearnSplitter = skLearnSplitter + + def split(self, data: TInputOutputData) -> Tuple[TInputOutputData, TInputOutputData]: + splitterResult = self.skLearnSplitter.split(data.inputs, data.outputs) + split = next(iter(splitterResult)) + firstIndices, secondIndices = split + return data.filterIndices(firstIndices), data.filterIndices(secondIndices) + + +class DataSplitterStratifiedShuffleSplit(DataSplitterFromSkLearnSplitter): + def __init__(self, fractionalSizeOfFirstSet: float, randomSeed=42): + super().__init__(StratifiedShuffleSplit(n_splits=1, train_size=fractionalSizeOfFirstSet, random_state=randomSeed)) + + class DataFrameSplitter(ABC): @abstractmethod def computeSplitIndices(self, df: pd.DataFrame, fractionalSizeOfFirstSet: float) -> Tuple[Sequence[int], Sequence[int]]: pass @staticmethod - def split(df: pd.DataFrame, indicesPair: Tuple[Sequence[int], Sequence[int]]) -> Tuple[pd.DataFrame, pd.DataFrame]: + def splitWithIndices(df: pd.DataFrame, indicesPair: Tuple[Sequence[int], Sequence[int]]) -> Tuple[pd.DataFrame, pd.DataFrame]: indicesA, indicesB = indicesPair A = df.iloc[indicesA] B = df.iloc[indicesB] return A, B + def split(self, df: pd.DataFrame, fractionalSizeOfFirstSet: float) -> Tuple[pd.DataFrame, pd.DataFrame]: + return self.splitWithIndices(df, self.computeSplitIndices(df, fractionalSizeOfFirstSet)) + class DataFrameSplitterFractional(DataFrameSplitter): def __init__(self, shuffle=False, randomSeed=42): diff --git a/src/sensai/data_transformation/__init__.py b/src/sensai/data_transformation/__init__.py index ab73e9be..fc9bc42f 100644 --- a/src/sensai/data_transformation/__init__.py +++ b/src/sensai/data_transformation/__init__.py @@ -1,2 +1,3 @@ from .dft import * -from . 
import sklearn_transformer \ No newline at end of file +from .sklearn_transformer import * +from .value_transformation import * diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index 95401b65..28b78dd4 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -10,11 +10,16 @@ from .sklearn_transformer import SkLearnTransformerProtocol from ..columngen import ColumnGenerator -from ..util import flattenArguments +from ..util import flattenArguments, countNotNone from ..util.pandas import DataFrameColumnChangeTracker from ..util.pickle import setstate from ..util.string import orRegexGroup, ToStringMixin +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..featuregen import FeatureGenerator + log = logging.getLogger(__name__) @@ -48,9 +53,13 @@ def getName(self) -> str: """ return self._name - def setName(self, name): + def setName(self, name: str): self._name = name + def withName(self, name: str): + self.setName(name) + return self + @abstractmethod def _fit(self, df: pd.DataFrame): pass @@ -86,6 +95,30 @@ def fitApply(self, df: pd.DataFrame) -> pd.DataFrame: self.fit(df) return self.apply(df) + def toFeatureGenerator(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, + normalisationRules: Sequence['DFTNormalisation.Rule'] = (), + normalisationRuleTemplate: 'DFTNormalisation.RuleTemplate' = None, + addCategoricalDefaultRules=True): + # need to import here to prevent circular imports + from ..featuregen import FeatureGeneratorFromDFT + return FeatureGeneratorFromDFT( + self, categoricalFeatureNames=categoricalFeatureNames, normalisationRules=normalisationRules, + normalisationRuleTemplate=normalisationRuleTemplate, addCategoricalDefaultRules=addCategoricalDefaultRules + ) + + +class DFTFromFeatureGenerator(DataFrameTransformer): + def _fit(self, df: pd.DataFrame): + self.fgen.fit(df, ctx=None) + + def _apply(self, df: pd.DataFrame) -> pd.DataFrame: + return self.fgen.generate(df) + + def __init__(self, fgen: "FeatureGenerator"): + super().__init__() + self.fgen = fgen + self.setName(f"{self.__class__.__name__}[{self.fgen.getName()}]") + class InvertibleDataFrameTransformer(DataFrameTransformer, ABC): @abstractmethod @@ -437,13 +470,28 @@ class DFTNormalisation(DataFrameTransformer): """ Applies normalisation/scaling to a data frame by applying a set of transformation rules, where each rule defines a set of columns to which it applies (learning a single transformer based on the values - of all applicable columns) + of all applicable columns). + DFTNormalisation ignores N/A values during fitting and application. """ class RuleTemplate: def __init__(self, skip=False, unsupported=False, transformer: SkLearnTransformerProtocol = None, transformerFactory: Callable[[], SkLearnTransformerProtocol] = None, independentColumns=False): """ + Creates a rule template which applies to one or more features/columns (depending on context). + Use parameters as follows: + + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters, i.e. ``RuleTemplate()``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. 
+ Otherwise, specify either ``transformerFactory`` or ``transformer``. + * all relevant features are to be normalised in the same way. + Otherwise, specify ``independentColumns=True``. + :param skip: flag indicating whether no transformation shall be performed on all of the columns :param unsupported: flag indicating whether normalisation of all columns is unsupported (shall trigger an exception if attempted) :param transformer: a transformer instance (from sklearn.preprocessing, e.g. StandardScaler) to apply to the matching column(s) @@ -453,12 +501,12 @@ def __init__(self, skip=False, unsupported=False, transformer: SkLearnTransforme feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. :param transformerFactory: a factory for the generation of the transformer instance, which will only be applied if `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default factory will - be used. See `SkLearnTransformerFactoryFactory` for convenient construction options. + be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. :param independentColumns: whether a separate transformation is to be learned for each of the columns for the case where the rule matches multiple columns. """ - if skip and transformer is not None: - raise ValueError("skip==True while transformer is not None") + if (skip or unsupported) and countNotNone(transformer, transformerFactory) > 0: + raise ValueError("Passed transformer or transformerFactory while skip=True or unsupported=True") self.skip = skip self.unsupported = unsupported self.transformer = transformer @@ -496,7 +544,7 @@ def __init__(self, regex: Optional[str], skip=False, unsupported=False, transfor feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. :param transformerFactory: a factory for the generation of the transformer instance, which will only be applied if `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default factory will - be used. See `SkLearnTransformerFactoryFactory` for convenient construction options. + be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. :param arrayValued: whether the column values are not scalars but arrays (of arbitrary lengths). It is assumed that all entries in such arrays are to be normalised in the same way. If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. @@ -541,7 +589,9 @@ def matchingColumns(self, columns: Sequence[str]): def __init__(self, rules: Sequence[Rule], defaultTransformerFactory=None, requireAllHandled=True, inplace=False): """ - :param rules: the set of rules; rules are always fitted and applied in the given order + :param rules: the set of rules; rules are always fitted and applied in the given order. + A convenient way to obtain a set of rules in the :class:`sensai.vector_model.VectorModel` context is from a + :class:`sensai.featuregen.FeatureCollector` or :class:`sensai.featuregen.MultiFeatureGenerator`. :param defaultTransformerFactory: a factory for the creation of transformer instances (from sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all rules that don't specify a particular transformer. 
The default transformer will only be applied to columns matched by such rules, unmatched columns will @@ -653,6 +703,9 @@ def findRule(self, colName: str) -> "DFTNormalisation.Rule": class DFTFromColumnGenerators(RuleBasedDataFrameTransformer): + """ + Extends a data frame with columns generated from ColumnGenerator instances + """ def __init__(self, columnGenerators: Sequence[ColumnGenerator], inplace=False): super().__init__() self.columnGenerators = columnGenerators @@ -674,7 +727,9 @@ def info(self): class DFTCountEntries(RuleBasedDataFrameTransformer): """ - Adds a new column with counts of the values on a selected column + Transforms a data frame, based on one of its columns, into a new data frame containing two columns that indicate the counts + of unique values in the input column. It is the "DataFrame output version" of pd.Series.value_counts. + Each row of the output column holds a unique value of the input column and the number of times it appears in the input column. """ def __init__(self, columnForEntryCount: str, columnNameForResultingCounts: str = "counts"): super().__init__() @@ -799,3 +854,66 @@ class DFTSortColumns(RuleBasedDataFrameTransformer): """ def _apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[sorted(df.columns)] + + +class DFTFillNA(RuleBasedDataFrameTransformer): + """ + Fills NA/NaN values with the given value + """ + def __init__(self, fillValue, inplace: bool = False): + super().__init__() + self.fillValue = fillValue + self.inplace = inplace + + def _apply(self, df: pd.DataFrame) -> pd.DataFrame: + if self.inplace: + df.fillna(value=self.fillValue, inplace=True) + return df + else: + return df.fillna(value=self.fillValue) + + +class DFTCastCategoricalColumns(RuleBasedDataFrameTransformer): + """ + Casts columns with dtype category to the given type. + This can be useful in cases where categorical columns are not accepted by the model but the column values are actually numeric, + in which case the cast to a numeric value yields an acceptable label encoding. 
+ """ + def __init__(self, columns: Optional[List[str]] = None, dtype=float): + """ + :param columns: the columns to convert; if None, convert all that have dtype category + :param dtype: the data type to which categorical columns are to be converted + """ + super().__init__() + self.columns = columns + self.dtype = dtype + + def _apply(self, df: pd.DataFrame) -> pd.DataFrame: + df = df.copy() + columns = self.columns if self.columns is not None else df.columns + for col in columns: + s = df[col] + if s.dtype.name == "category": + df[col] = s.astype(self.dtype) + return df + + +class DFTDropNA(RuleBasedDataFrameTransformer): + """ + Drops rows or columns containing NA/NaN values + """ + def __init__(self, axis=0, inplace=False): + """ + :param axis: 0 to drop rows, 1 to drop columns containing an N/A value + :param inplace: whether to perform the operation in-place on the input data frame + """ + super().__init__() + self.axis = axis + self.inplace = inplace + + def _apply(self, df: pd.DataFrame) -> pd.DataFrame: + if self.inplace: + df.dropna(axis=self.axis, inplace=True) + return df + else: + return df.dropna(axis=self.axis) diff --git a/src/sensai/data_transformation/sklearn_transformer.py b/src/sensai/data_transformation/sklearn_transformer.py index f3d081a3..fe73033e 100644 --- a/src/sensai/data_transformation/sklearn_transformer.py +++ b/src/sensai/data_transformation/sklearn_transformer.py @@ -2,7 +2,7 @@ import logging from typing import Optional, Sequence, Union, Any, Callable -from sklearn.preprocessing import MaxAbsScaler, StandardScaler, RobustScaler +from sklearn.preprocessing import MaxAbsScaler, StandardScaler, RobustScaler, MinMaxScaler import numpy as np from typing_extensions import Protocol @@ -59,6 +59,10 @@ class SkLearnTransformerFactoryFactory: def MaxAbsScaler() -> Callable[[], MaxAbsScaler]: return MaxAbsScaler + @staticmethod + def MinMaxScaler() -> Callable[[], MinMaxScaler]: + return MinMaxScaler + @staticmethod def StandardScaler(with_mean=True, with_std=True) -> Callable[[], StandardScaler]: return functools.partial(StandardScaler, with_mean=with_mean, with_std=with_std) @@ -74,10 +78,15 @@ def RobustScaler(quantile_range=(25, 75), with_scaling=True, with_centering=True `min` being mapped to -1 and `max` being mapped to 1. :param with_scaling: whether to apply scaling based on quantile_range. :param with_centering: whether to apply centering by subtracting the median. - :return: a function, which when called without any arguments, produces the respective RobustScaler instace. + :return: a function, which when called without any arguments, produces the respective RobustScaler instance. """ return functools.partial(RobustScaler, quantile_range=quantile_range, with_scaling=with_scaling, with_centering=with_centering) @staticmethod - def ManualScaler(centre: Optional[float] = None, scale: Optional[float] = None): + def ManualScaler(centre: Optional[float] = None, scale: Optional[float] = None) -> Callable[[], ManualScaler]: + """ + :param centre: the value to subtract from all values (if any) + :param scale: the value with which to scale all values (after removing the centre) + :return: a function, which when called without any arguments, produces the respective scaler instance. 
+ """ return functools.partial(ManualScaler, centre=centre, scale=scale) \ No newline at end of file diff --git a/src/sensai/evaluation/__init__.py b/src/sensai/evaluation/__init__.py index 868cea30..c3d11de9 100644 --- a/src/sensai/evaluation/__init__.py +++ b/src/sensai/evaluation/__init__.py @@ -1,8 +1,10 @@ from .crossval import VectorClassificationModelCrossValidator, VectorRegressionModelCrossValidator, \ - VectorClassificationModelCrossValidationData, VectorRegressionModelCrossValidationData + VectorClassificationModelCrossValidationData, VectorRegressionModelCrossValidationData, \ + VectorModelCrossValidatorParams from .eval_util import RegressionEvaluationUtil, ClassificationEvaluationUtil, MultiDataEvaluationUtil, \ evalModelViaEvaluator, createEvaluationUtil, createVectorModelEvaluator, createVectorModelCrossValidator from .evaluator import VectorClassificationModelEvaluator, VectorRegressionModelEvaluator, \ + VectorRegressionModelEvaluatorParams, VectorClassificationModelEvaluatorParams, \ VectorRegressionModelEvaluationData, VectorClassificationModelEvaluationData, \ RuleBasedVectorClassificationModelEvaluator, RuleBasedVectorRegressionModelEvaluator diff --git a/src/sensai/evaluation/crossval.py b/src/sensai/evaluation/crossval.py index a2552b83..1b339d3a 100644 --- a/src/sensai/evaluation/crossval.py +++ b/src/sensai/evaluation/crossval.py @@ -68,7 +68,7 @@ def createFolds(self, data: InputOutputData, numFolds: int) -> List[Tuple[Sequen """ :param data: the data from which to obtain the folds :param numFolds: the number of splits/folds - :return: a list containing numSplits tuples (t, e) where t and e are sequences of data point indices to use for training + :return: a list containing numFolds tuples (t, e) where t and e are sequences of data point indices to use for training and evaluation respectively """ pass @@ -193,7 +193,7 @@ def _computeMetrics(self, model: VectorModel, **kwargs): def _computeMetricsForVarName(self, model, predictedVarName: Optional[str]): data = self.evalModel(model) - return data.getEvalStatsCollection(predictedVarName=predictedVarName).aggStats() + return data.getEvalStatsCollection(predictedVarName=predictedVarName).aggMetricsDict() def createMetricsDictProvider(self, predictedVarName: Optional[str]) -> MetricsDictProvider: """ diff --git a/src/sensai/evaluation/eval_stats/eval_stats_base.py b/src/sensai/evaluation/eval_stats/eval_stats_base.py index 48ee2cc9..07f542f0 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_base.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_base.py @@ -3,8 +3,9 @@ import seaborn as sns from abc import ABC, abstractmethod from matplotlib import pyplot as plt -from typing import Generic, TypeVar, List, Union, Dict, Sequence, Optional +from typing import Generic, TypeVar, List, Union, Dict, Sequence, Optional, Tuple, Callable +from ...util.plot import ScatterPlot, HistogramPlot, Plot, HeatMapPlot from ...util.string import ToStringMixin, dictString from ...vector_model import VectorModel @@ -40,15 +41,23 @@ def addMetric(self, metric: TMetric): def computeMetricValue(self, metric: TMetric) -> float: return metric.computeValueForEvalStats(self) - def getAll(self) -> Dict[str, float]: - """Gets a dictionary with all metrics""" + def metricsDict(self) -> Dict[str, float]: + """ + Computes all metrics + + :return: a dictionary mapping metric names to values + """ d = {} for metric in self.metrics: d[metric.name] = self.computeMetricValue(metric) return d + def getAll(self) -> Dict[str, float]: + """Alias for 
metricsDict; may be deprecated in the future""" + return self.metricsDict() + def _toStringObjectInfo(self) -> str: - return dictString(self.getAll()) + return dictString(self.metricsDict()) TEvalStats = TypeVar("TEvalStats", bound=EvalStats) @@ -57,52 +66,149 @@ def _toStringObjectInfo(self) -> str: class Metric(Generic[TEvalStats], ABC): name: str - def __init__(self, name: str = None): + def __init__(self, name: str = None, bounds: Optional[Tuple[float, float]] = None): """ :param name: the name of the metric; if None use the class' name attribute + :param bounds: the minimum and maximum values the metric can take on (or None if the bounds are not specified) """ # this raises an attribute error if a subclass does not specify a name as a static attribute nor as parameter self.name = name if name is not None else self.__class__.name + self.bounds = bounds @abstractmethod def computeValueForEvalStats(self, evalStats: TEvalStats) -> float: pass + def getPairedMetrics(self) -> List[TMetric]: + """ + Gets a list of metrics that should be considered together with this metric (e.g. for paired visualisations/plots). + The direction of the pairing should be such that if this metric is "x", the other is "y" for x-y type visualisations. -class EvalStatsCollection(Generic[TEvalStats], ABC): + :return: a list of metrics + """ + return [] + + def hasFiniteBounds(self) -> bool: + return self.bounds is not None and not any((np.isinf(x) for x in self.bounds)) + + +class EvalStatsCollection(Generic[TEvalStats, TMetric], ABC): def __init__(self, evalStatsList: List[TEvalStats]): self.statsList = evalStatsList - metricsList = [es.getAll() for es in evalStatsList] + metricNamesSet = None + metricsList = [] + for es in evalStatsList: + metrics = es.metricsDict() + currentMetricNamesSet = set(metrics.keys()) + if metricNamesSet is None: + metricNamesSet = currentMetricNamesSet + else: + if metricNamesSet != currentMetricNamesSet: + raise Exception(f"Inconsistent set of metrics in evaluation stats collection: Got {metricNamesSet} for one instance, {currentMetricNamesSet} for another") + metricsList.append(metrics) metricNames = sorted(metricsList[0].keys()) - self.metrics = {metric: [d[metric] for d in metricsList] for metric in metricNames} + self._valuesByMetricName = {metric: [d[metric] for d in metricsList] for metric in metricNames} + self._metrics: List[TMetric] = evalStatsList[0].metrics + + def getValues(self, metricName: str): + return self._valuesByMetricName[metricName] + + def getMetricNames(self) -> List[str]: + return list(self._valuesByMetricName.keys()) - def getValues(self, metric): - return self.metrics[metric] + def getMetrics(self) -> List[TMetric]: + return self._metrics - def aggStats(self): + def getMetricByName(self, name: str) -> Optional[TMetric]: + for m in self._metrics: + if m.name == name: + return m + return None + + def hasMetric(self, metric: Union[Metric, str]) -> bool: + if type(metric) != str: + metric = metric.name + return metric in self._valuesByMetricName + + def aggMetricsDict(self, aggFns=(np.mean, np.std)) -> Dict[str, float]: agg = {} - for metric, values in self.metrics.items(): - agg[f"mean[{metric}]"] = float(np.mean(values)) - agg[f"std[{metric}]"] = float(np.std(values)) + for metric, values in self._valuesByMetricName.items(): + for aggFn in aggFns: + agg[f"{aggFn.__name__}[{metric}]"] = float(aggFn(values)) return agg - def meanStats(self): - metrics = {metric: np.mean(values) for (metric, values) in self.metrics.items()} - metrics.update({f"StdDev[{metric}]": 
np.std(values) for (metric, values) in self.metrics.items()}) + def meanMetricsDict(self) -> Dict[str, float]: + metrics = {metric: np.mean(values) for (metric, values) in self._valuesByMetricName.items()} return metrics - def plotDistribution(self, metric): - values = self.metrics[metric] - plt.figure() - plt.title(metric) - sns.distplot(values) + def plotDistribution(self, metricName: str, subtitle: Optional[str] = None, bins=None, kde=False, cdf=False, + cdfComplementary=False, stat="proportion", **kwargs) -> plt.Figure: + """ + Plots the distribution of a metric as a histogram + + :param metricName: name of the metric for which to plot the distribution (histogram) across evaluations + :param subtitle: the subtitle to add, if any + :param bins: the histogram bins (number of bins or boundaries); metrics bounds will be used to define the x limits. + If None, use 'auto' bins + :param kde: whether to add a kernel density estimator plot + :param cdf: whether to add the cumulative distribution function (cdf) + :param cdfComplementary: whether to plot, if ``cdf`` is True, the complementary cdf instead of the regular cdf + :param stat: the statistic to compute for each bin ('percent', 'probability'='proportion', 'count', 'frequency' or 'density'), y-axis value + :param kwargs: additional parameters to pass to seaborn.histplot (see https://seaborn.pydata.org/generated/seaborn.histplot.html) + :return: + """ + # define bins based on metric bounds where available + xTick = None + if bins is None or type(bins) == int: + metric = self.getMetricByName(metricName) + if metric.bounds == (0, 1): + xTick = 0.1 + if bins is None: + numBins = 10 if cdf else 20 + else: + numBins = bins + bins = np.linspace(0, 1, numBins+1) + + values = self._valuesByMetricName[metricName] + title = metricName + if subtitle is not None: + title += "\n" + subtitle + plot = HistogramPlot(values, bins=bins, stat=stat, kde=kde, cdf=cdf, cdfComplementary=cdfComplementary, **kwargs).title(title) + if xTick is not None: + plot.xtickMajor(xTick) + return plot.fig + + def _plotXY(self, metricNameX, metricNameY, plotFactory: Callable[[Sequence, Sequence], Plot], adjustBounds: bool) -> plt.Figure: + def axlim(bounds): + minValue, maxValue = bounds + diff = maxValue - minValue + return (minValue - 0.05 * diff, maxValue + 0.05 * diff) + + x = self._valuesByMetricName[metricNameX] + y = self._valuesByMetricName[metricNameY] + plot = plotFactory(x, y) + plot.xlabel(metricNameX) + plot.ylabel(metricNameY) + mx = self.getMetricByName(metricNameX) + if adjustBounds and mx.hasFiniteBounds(): + plot.xlim(*axlim(mx.bounds)) + my = self.getMetricByName(metricNameY) + if adjustBounds and my.hasFiniteBounds(): + plot.ylim(*axlim(my.bounds)) + return plot.fig + + def plotScatter(self, metricNameX: str, metricNameY: str) -> plt.Figure: + return self._plotXY(metricNameX, metricNameY, ScatterPlot, adjustBounds=True) + + def plotHeatMap(self, metricNameX: str, metricNameY: str) -> plt.Figure: + return self._plotXY(metricNameX, metricNameY, HeatMapPlot, adjustBounds=False) def toDataFrame(self) -> pd.DataFrame: """ :return: a DataFrame with the evaluation metrics from all contained EvalStats objects; the EvalStats' name field being used as the index if it is set """ - data = dict(self.metrics) + data = dict(self._valuesByMetricName) index = [stats.name for stats in self.statsList] if len([n for n in index if n is not None]) == 0: index = None @@ -110,11 +216,14 @@ def toDataFrame(self) -> pd.DataFrame: @abstractmethod def getGlobalStats(self) -> 
TEvalStats: + """ + :return: an EvalStats object that combines the data from all contained EvalStats objects + """ pass def __str__(self): return f"{self.__class__.__name__}[" + \ - ", ".join([f"{key}={self.aggStats()[key]:.4f}" for key in self.metrics]) + "]" + ", ".join([f"{key}={self.aggMetricsDict()[key]:.4f}" for key in self._valuesByMetricName]) + "]" class PredictionEvalStats(EvalStats[TMetric], ABC): @@ -203,6 +312,17 @@ def meanStats(evalStatsList: Sequence[EvalStats]) -> Dict[str, float]: For a list of EvalStats objects compute the mean values of all metrics in a dictionary. Assumes that all provided EvalStats have the same metrics """ - dicts = [s.getAll() for s in evalStatsList] + dicts = [s.metricsDict() for s in evalStatsList] metrics = dicts[0].keys() return {m: np.mean([d[m] for d in dicts]) for m in metrics} + + +class EvalStatsPlot(Generic[TEvalStats], ABC): + @abstractmethod + def createFigure(self, evalStats: TEvalStats, subtitle: str) -> Optional[plt.Figure]: + """ + :param evalStats: the evaluation stats from which to generate the plot + :param subtitle: the plot's subtitle + :return: the figure or None if this plot is not applicable/cannot be created + """ + pass diff --git a/src/sensai/evaluation/eval_stats/eval_stats_classification.py b/src/sensai/evaluation/eval_stats/eval_stats_classification.py index f1c9be04..988981f1 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_classification.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_classification.py @@ -1,17 +1,38 @@ +import logging +from abc import ABC, abstractmethod +from typing import List, Sequence, Optional, Dict, Any, Tuple + +import matplotlib.ticker as plticker import numpy as np import pandas as pd import sklearn -from abc import ABC, abstractmethod -from sklearn.metrics import confusion_matrix, accuracy_score -from typing import List, Sequence +from matplotlib import pyplot as plt +from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve, \ + balanced_accuracy_score, f1_score -from .eval_stats_base import PredictionArray, PredictionEvalStats, EvalStatsCollection, Metric +from .eval_stats_base import PredictionArray, PredictionEvalStats, EvalStatsCollection, Metric, EvalStatsPlot, TMetric +from ...util.aggregation import RelativeFrequencyCounter +from ...util.pickle import getstate from ...util.plot import plotMatrix +log = logging.getLogger(__name__) + + +GUESS = ("__guess",) +BINARY_CLASSIFICATION_POSITIVE_LABEL_CANDIDATES = [1, True, "1", "True"] + class ClassificationMetric(Metric["ClassificationEvalStats"], ABC): requiresProbabilities = False + def __init__(self, name=None, bounds: Tuple[float, float] = (0, 1), requiresProbabilities=None): + """ + :param name: the name of the metric; if None use the class' name attribute + :param bounds: the minimum and maximum values the metric can take on + """ + super().__init__(name=name, bounds=bounds) + self.requiresProbabilities = requiresProbabilities if requiresProbabilities is not None else self.__class__.requiresProbabilities + def computeValueForEvalStats(self, evalStats: "ClassificationEvalStats"): return self.computeValue(evalStats.y_true, evalStats.y_predicted, evalStats.y_predictedClassProbabilities) @@ -26,14 +47,60 @@ def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): class ClassificationMetricAccuracy(ClassificationMetric): - name = "ACC" + name = "accuracy" def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): return 
accuracy_score(y_true=y_true, y_pred=y_predicted) +class ClassificationMetricBalancedAccuracy(ClassificationMetric): + name = "balancedAccuracy" + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + return balanced_accuracy_score(y_true=y_true, y_pred=y_predicted) + + +class ClassificationMetricAccuracyWithoutLabels(ClassificationMetric): + """ + Accuracy score with set of data points limited to the ones where the ground truth label is not one of the given labels + """ + def __init__(self, *labels: Any, probabilityThreshold=None): + """ + :param labels: one or more labels which are not to be considered (all data points where the ground truth is + one of these labels will be ignored) + :param probabilityThreshold: a probability threshold: the probability of the most likely class must be at least this value for a data point + to be considered in the metric computation (analogous to :class:`ClassificationMetricAccuracyMaxProbabilityBeyondThreshold`) + """ + if probabilityThreshold is not None: + nameAdd = f", p_max >= {probabilityThreshold}" + else: + nameAdd = "" + name = f"{ClassificationMetricAccuracy.name}Without[{','.join(map(str, labels))}{nameAdd}]" + super().__init__(name, requiresProbabilities=probabilityThreshold is not None) + self.labels = set(labels) + self.probabilityThreshold = probabilityThreshold + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + y_true = np.array(y_true) + y_predicted = np.array(y_predicted) + indices = [] + for i, (trueLabel, predictedLabel) in enumerate(zip(y_true, y_predicted)): + if trueLabel not in self.labels: + if self.probabilityThreshold is not None: + if y_predictedClassProbabilities[predictedLabel].iloc[i] < self.probabilityThreshold: + continue + indices.append(i) + return accuracy_score(y_true=y_true[indices], y_pred=y_predicted[indices]) + + def getPairedMetrics(self) -> List[TMetric]: + if self.probabilityThreshold is not None: + return [ClassificationMetricRelFreqMaxProbabilityBeyondThreshold(self.probabilityThreshold)] + else: + return [] + + class ClassificationMetricGeometricMeanOfTrueClassProbability(ClassificationMetric): - name = "GeoMeanTrueClassProb" + name = "geoMeanTrueClassProb" requiresProbabilities = True def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): @@ -54,7 +121,7 @@ class ClassificationMetricTopNAccuracy(ClassificationMetric): def __init__(self, n: int): self.n = n - super().__init__(name=f"Top{n}Accuracy") + super().__init__(name=f"top{n}Accuracy") def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): labels = y_predictedClassProbabilities.columns @@ -66,13 +133,199 @@ def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): return cnt / len(y_true) +class ClassificationMetricAccuracyMaxProbabilityBeyondThreshold(ClassificationMetric): + """ + Accuracy limited to cases where the probability of the most likely class is at least a given threshold + """ + requiresProbabilities = True + + def __init__(self, threshold: float, zeroValue=0.0): + """ + :param threshold: minimum probability of the most likely class + :param zeroValue: the value of the metric for the case where the probability of the most likely class never reaches the threshold + """ + self.threshold = threshold + self.zeroValue = zeroValue + super().__init__(name=f"accuracy[p_max >= {threshold}]") + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + labels = y_predictedClassProbabilities.columns + 
labelToColIdx = {l: i for i, l in enumerate(labels)} + relFreq = RelativeFrequencyCounter() + for i, probabilities in enumerate(y_predictedClassProbabilities.values.tolist()): + classIdx_predicted = np.argmax(probabilities) + prob_predicted = probabilities[classIdx_predicted] + if prob_predicted >= self.threshold: + classIdx_true = labelToColIdx[y_true[i]] + relFreq.count(classIdx_predicted == classIdx_true) + if relFreq.numTotal == 0: + return self.zeroValue + else: + return relFreq.getRelativeFrequency() + + def getPairedMetrics(self) -> List[TMetric]: + return [ClassificationMetricRelFreqMaxProbabilityBeyondThreshold(self.threshold)] + + +class ClassificationMetricRelFreqMaxProbabilityBeyondThreshold(ClassificationMetric): + """ + Relative frequency of cases where the probability of the most likely class is at least a given threshold + """ + requiresProbabilities = True + + def __init__(self, threshold: float): + """ + :param threshold: minimum probability of the most likely class + """ + self.threshold = threshold + super().__init__(name=f"relFreq[p_max >= {threshold}]") + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + relFreq = RelativeFrequencyCounter() + for i, probabilities in enumerate(y_predictedClassProbabilities.values.tolist()): + pMax = np.max(probabilities) + relFreq.count(pMax >= self.threshold) + return relFreq.getRelativeFrequency() + + +class BinaryClassificationMetric(ClassificationMetric, ABC): + def __init__(self, positiveClassLabel, name: str = None): + name = name if name is not None else self.__class__.name + if positiveClassLabel not in BINARY_CLASSIFICATION_POSITIVE_LABEL_CANDIDATES: + name = f"{name}[{positiveClassLabel}]" + super().__init__(name) + self.positiveClassLabel = positiveClassLabel + + +class BinaryClassificationMetricPrecision(BinaryClassificationMetric): + name = "precision" + + def __init__(self, positiveClassLabel): + super().__init__(positiveClassLabel) + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + return precision_score(y_true, y_predicted, pos_label=self.positiveClassLabel, zero_division=0) + + def getPairedMetrics(self) -> List[BinaryClassificationMetric]: + return [BinaryClassificationMetricRecall(self.positiveClassLabel)] + + +class BinaryClassificationMetricRecall(BinaryClassificationMetric): + name = "recall" + + def __init__(self, positiveClassLabel): + super().__init__(positiveClassLabel) + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + return recall_score(y_true, y_predicted, pos_label=self.positiveClassLabel) + + +class BinaryClassificationMetricF1Score(BinaryClassificationMetric): + name = "F1" + + def __init__(self, positiveClassLabel): + super().__init__(positiveClassLabel) + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + return f1_score(y_true, y_predicted, pos_label=self.positiveClassLabel) + + +class BinaryClassificationMetricRecallForPrecision(BinaryClassificationMetric): + """ + Computes the maximum recall that can be achieved (by varying the decision threshold) in cases where at least the given precision + is reached. The given precision may not be achievable at all, in which case the metric value is ``zeroValue``. 
+ """ + def __init__(self, precision: float, positiveClassLabel, zeroValue=0.0): + """ + :param precision: the minimum precision value that must be reached + :param positiveClassLabel: the positive class label + :param zeroValue: the value to return for the case where the minimum precision is never reached + """ + self.minPrecision = precision + self.zeroValue = zeroValue + super().__init__(positiveClassLabel, name=f"recallForPrecision[{precision}]") + + def computeValueForEvalStats(self, evalStats: "ClassificationEvalStats"): + varData = evalStats.getBinaryClassificationProbabilityThresholdVariationData() + bestRecall = None + for c in varData.counts: + precision = c.getPrecision() + if precision >= self.minPrecision: + recall = c.getRecall() + if bestRecall is None or recall > bestRecall: + bestRecall = recall + return self.zeroValue if bestRecall is None else bestRecall + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + raise NotImplementedError(f"{self.__class__.__qualname__} only supports computeValueForEvalStats") + + +class BinaryClassificationMetricPrecisionThreshold(BinaryClassificationMetric): + """ + Precision for the case where predictions are considered "positive" if predicted probability of the positive class is beyond the + given threshold + """ + requiresProbabilities = True + + def __init__(self, threshold: float, positiveClassLabel: Any, zeroValue=0.0): + """ + :param threshold: the minimum predicted probability of the positive class for the prediction to be considered "positive" + :param zeroValue: the value of the metric for the case where a positive class probability beyond the threshold is never predicted + (denominator = 0) + """ + self.threshold = threshold + self.zeroValue = zeroValue + super().__init__(positiveClassLabel, name=f"precision[{threshold}]") + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + relFreqCorrect = RelativeFrequencyCounter() + classIdx_positive = list(y_predictedClassProbabilities.columns).index(self.positiveClassLabel) + for i, (probabilities, classLabel_true) in enumerate(zip(y_predictedClassProbabilities.values.tolist(), y_true)): + prob_predicted = probabilities[classIdx_positive] + if prob_predicted >= self.threshold: + relFreqCorrect.count(classLabel_true == self.positiveClassLabel) + f = relFreqCorrect.getRelativeFrequency() + return f if f is not None else self.zeroValue + + def getPairedMetrics(self) -> List[BinaryClassificationMetric]: + return [BinaryClassificationMetricRecallThreshold(self.threshold, self.positiveClassLabel)] + + +class BinaryClassificationMetricRecallThreshold(BinaryClassificationMetric): + """ + Recall for the case where predictions are considered "positive" if predicted probability of the positive class is beyond the + given threshold + """ + requiresProbabilities = True + + def __init__(self, threshold: float, positiveClassLabel: Any, zeroValue=0.0): + """ + :param threshold: the minimum predicted probability of the positive class for the prediction to be considered "positive" + :param zeroValue: the value of the metric for the case where there are no positive instances in the data set (denominator = 0) + """ + self.threshold = threshold + self.zeroValue = zeroValue + super().__init__(positiveClassLabel, name=f"recall[{threshold}]") + + def _computeValue(self, y_true, y_predicted, y_predictedClassProbabilities): + relFreqRecalled = RelativeFrequencyCounter() + classIdx_positive = 
list(y_predictedClassProbabilities.columns).index(self.positiveClassLabel) + for i, (probabilities, classLabel_true) in enumerate(zip(y_predictedClassProbabilities.values.tolist(), y_true)): + if self.positiveClassLabel == classLabel_true: + prob_predicted = probabilities[classIdx_positive] + relFreqRecalled.count(prob_predicted >= self.threshold) + f = relFreqRecalled.getRelativeFrequency() + return f if f is not None else self.zeroValue + + class ClassificationEvalStats(PredictionEvalStats["ClassificationMetric"]): def __init__(self, y_predicted: PredictionArray = None, - y_true: PredictionArray = None, - y_predictedClassProbabilities: pd.DataFrame = None, - labels: PredictionArray = None, - metrics: Sequence["ClassificationMetric"] = None, - additionalMetrics: Sequence["ClassificationMetric"] = None): + y_true: PredictionArray = None, + y_predictedClassProbabilities: pd.DataFrame = None, + labels: PredictionArray = None, + metrics: Sequence["ClassificationMetric"] = None, + additionalMetrics: Sequence["ClassificationMetric"] = None, + binaryPositiveLabel=GUESS): """ :param y_predicted: the predicted class labels :param y_true: the true class labels @@ -80,6 +333,11 @@ def __init__(self, y_predicted: PredictionArray = None, :param labels: the list of class labels :param metrics: the metrics to compute for evaluation; if None, use default metrics :param additionalMetrics: the metrics to additionally compute + :param binaryPositiveLabel: the label of the positive class for the case where it is a binary classification, adding further + binary metrics by default; + if GUESS (default), check `labels` (if length 2) for occurrence of one of BINARY_CLASSIFICATION_POSITIVE_LABEL_CANDIDATES in + the respective order and use the first one found (if any); + if None, treat the problem as non-binary, regardless of the labels being used. 
""" self.labels = labels self.y_predictedClassProbabilities = y_predictedClassProbabilities @@ -87,12 +345,40 @@ def __init__(self, y_predicted: PredictionArray = None, if self._probabilitiesAvailable: colSet = set(y_predictedClassProbabilities.columns) if colSet != set(labels): - raise ValueError(f"Set of columns in class probabilities data frame ({colSet}) does not correspond to labels ({labels}") + raise ValueError(f"Columns in class probabilities data frame ({y_predictedClassProbabilities.columns}) do not correspond to labels ({labels}") if len(y_predictedClassProbabilities) != len(y_true): raise ValueError("Row count in class probabilities data frame does not match ground truth") + numLabels = len(labels) + if binaryPositiveLabel == GUESS: + foundCandidateLabel = False + if numLabels == 2: + for c in BINARY_CLASSIFICATION_POSITIVE_LABEL_CANDIDATES: + if c in labels: + binaryPositiveLabel = c + foundCandidateLabel = True + break + if not foundCandidateLabel: + binaryPositiveLabel = None + elif binaryPositiveLabel is not None: + if numLabels != 2: + log.warning(f"Passed binaryPositiveLabel for non-binary classification (labels={self.labels})") + if binaryPositiveLabel not in self.labels: + log.warning(f"The binary positive label {binaryPositiveLabel} does not appear in labels={labels}") + if numLabels == 2 and binaryPositiveLabel is None: + log.warning(f"Binary classification (labels={labels}) without specification of positive class label; binary classification metrics will not be considered") + self.binaryPositiveLabel = binaryPositiveLabel + self.isBinary = binaryPositiveLabel is not None + if metrics is None: - metrics = [ClassificationMetricAccuracy(), ClassificationMetricGeometricMeanOfTrueClassProbability()] + metrics = [ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(), + ClassificationMetricGeometricMeanOfTrueClassProbability()] + if self.isBinary: + metrics.extend([ + BinaryClassificationMetricPrecision(self.binaryPositiveLabel), + BinaryClassificationMetricRecall(self.binaryPositiveLabel), + BinaryClassificationMetricF1Score(self.binaryPositiveLabel)]) + metrics = list(metrics) if additionalMetrics is not None: for m in additionalMetrics: @@ -101,41 +387,79 @@ def __init__(self, y_predicted: PredictionArray = None, super().__init__(y_predicted, y_true, metrics, additionalMetrics=additionalMetrics) + # transient members + self._binaryClassificationProbabilityThresholdVariationData = None + + def __getstate__(self): + return getstate(ClassificationEvalStats, self, transientProperties=["_binaryClassificationProbabilityThresholdVariationData"]) + def getConfusionMatrix(self) -> "ConfusionMatrix": return ConfusionMatrix(self.y_true, self.y_predicted) + def getBinaryClassificationProbabilityThresholdVariationData(self) -> "BinaryClassificationProbabilityThresholdVariationData": + if self._binaryClassificationProbabilityThresholdVariationData is None: + self._binaryClassificationProbabilityThresholdVariationData = BinaryClassificationProbabilityThresholdVariationData(self) + return self._binaryClassificationProbabilityThresholdVariationData + def getAccuracy(self): return self.computeMetricValue(ClassificationMetricAccuracy()) - def getAll(self): - """Gets a dictionary with all metrics""" + def metricsDict(self) -> Dict[str, float]: d = {} for metric in self.metrics: if not metric.requiresProbabilities or self._probabilitiesAvailable: d[metric.name] = self.computeMetricValue(metric) return d + def getMisclassifiedIndices(self) -> List[int]: + return [i for i, 
(predClass, trueClass) in enumerate(zip(self.y_predicted, self.y_true)) if predClass != trueClass] + def plotConfusionMatrix(self, normalize=True, titleAdd: str = None): # based on https://scikit-learn.org/0.20/auto_examples/model_selection/plot_confusion_matrix.html confusionMatrix = self.getConfusionMatrix() return confusionMatrix.plot(normalize=normalize, titleAdd=titleAdd) + def plotPrecisionRecallCurve(self, titleAdd: str = None): + from sklearn.metrics import PrecisionRecallDisplay # only supported by newer versions of sklearn + if not self._probabilitiesAvailable: + raise Exception("Precision-recall curve requires probabilities") + if not self.isBinary: + raise Exception("Precision-recall curve is not applicable to non-binary classification") + probabilities = self.y_predictedClassProbabilities[self.binaryPositiveLabel] + precision, recall, thresholds = precision_recall_curve(y_true=self.y_true, probas_pred=probabilities, + pos_label=self.binaryPositiveLabel) + disp = PrecisionRecallDisplay(precision, recall) + disp.plot() + ax: plt.Axes = disp.ax_ + ax.set_xlabel("recall") + ax.set_ylabel("precision") + title = "Precision-Recall Curve" + if titleAdd is not None: + title += "\n" + titleAdd + ax.set_title(title) + ax.xaxis.set_major_locator(plticker.MultipleLocator(base=0.1)) + ax.yaxis.set_major_locator(plticker.MultipleLocator(base=0.1)) + return disp.figure_ + -class ClassificationEvalStatsCollection(EvalStatsCollection[ClassificationEvalStats]): +class ClassificationEvalStatsCollection(EvalStatsCollection[ClassificationEvalStats, ClassificationMetric]): def __init__(self, evalStatsList: List[ClassificationEvalStats]): super().__init__(evalStatsList) self.globalStats = None - # TODO once we moved to python 3.8: move to base class and use the new get_args method to infer the generic type at runtime - # https://docs.python.org/3/library/typing.html#typing.get_args def getGlobalStats(self) -> ClassificationEvalStats: - """ - Gets an evaluation statistics object that combines the data from all contained eval stats objects - """ if self.globalStats is None: y_true = np.concatenate([evalStats.y_true for evalStats in self.statsList]) y_predicted = np.concatenate([evalStats.y_predicted for evalStats in self.statsList]) - self.globalStats = ClassificationEvalStats(y_predicted, y_true) + es0 = self.statsList[0] + if es0.y_predictedClassProbabilities is not None: + y_probs = pd.concat([evalStats.y_predictedClassProbabilities for evalStats in self.statsList]) + labels = list(y_probs.columns) + else: + y_probs = None + labels = es0.labels + self.globalStats = ClassificationEvalStats(y_predicted=y_predicted, y_true=y_true, y_predictedClassProbabilities=y_probs, + labels=labels, binaryPositiveLabel=es0.binaryPositiveLabel, metrics=es0.metrics) return self.globalStats @@ -148,3 +472,129 @@ def plot(self, normalize=True, titleAdd: str = None): title = 'Normalized Confusion Matrix' if normalize else 'Confusion Matrix (Counts)' return plotMatrix(self.confusionMatrix, title, self.labels, self.labels, 'true class', 'predicted class', normalize=normalize, titleAdd=titleAdd) + + +class BinaryClassificationCounts: + def __init__(self, isPositivePrediction: Sequence[bool], isPositiveGroundTruth: Sequence[bool], zeroDenominatorMetricValue=0): + """ + :param isPositivePrediction: the sequence of Booleans indicating whether the model predicted the positive class + :param isPositiveGroundTruth: the sequence of Booleans indicating whether the true class is the positive class + :param 
zeroDenominatorMetricValue: the result to return for metrics such as precision and recall in case the denominator + is zero (i.e. zero counted cases) + """ + self.zeroDenominatorMetricValue = zeroDenominatorMetricValue + self.tp = 0 + self.tn = 0 + self.fp = 0 + self.fn = 0 + for predPositive, gtPositive in zip(isPositivePrediction, isPositiveGroundTruth): + if gtPositive: + if predPositive: + self.tp += 1 + else: + self.fn += 1 + else: + if predPositive: + self.fp += 1 + else: + self.tn += 1 + + @classmethod + def fromProbabilityThreshold(cls, probabilities: Sequence[float], threshold: float, isPositiveGroundTruth: Sequence[bool]) -> "BinaryClassificationCounts": + return cls([p >= threshold for p in probabilities], isPositiveGroundTruth) + + @classmethod + def fromEvalStats(cls, evalStats: ClassificationEvalStats, threshold=0.5) -> "BinaryClassificationCounts": + if not evalStats.isBinary: + raise ValueError("Probability threshold variation data can only be computed for binary classification problems") + if evalStats.y_predictedClassProbabilities is None: + raise ValueError("No probability data") + posClassLabel = evalStats.binaryPositiveLabel + probs = evalStats.y_predictedClassProbabilities[posClassLabel] + isPositiveGT = [gtLabel == posClassLabel for gtLabel in evalStats.y_true] + return cls.fromProbabilityThreshold(probabilities=probs, threshold=threshold, isPositiveGroundTruth=isPositiveGT) + + def _frac(self, numerator, denominator): + if denominator == 0: + return self.zeroDenominatorMetricValue + return numerator / denominator + + def getPrecision(self): + return self._frac(self.tp, self.tp + self.fp) + + def getRecall(self): + return self._frac(self.tp, self.tp + self.fn) + + def getF1(self): + return self._frac(self.tp, self.tp + 0.5 * (self.fp + self.fn)) + + +class BinaryClassificationProbabilityThresholdVariationData: + def __init__(self, evalStats: ClassificationEvalStats): + self.thresholds = np.linspace(0, 1, 101) + self.counts: List[BinaryClassificationCounts] = [] + for threshold in self.thresholds: + self.counts.append(BinaryClassificationCounts.fromEvalStats(evalStats, threshold=threshold)) + + def plotPrecisionRecall(self, subtitle=None) -> plt.Figure: + fig = plt.figure() + title = "Probability Threshold-Dependent Precision & Recall" + if subtitle is not None: + title += "\n" + subtitle + plt.title(title) + plt.xlabel("probability threshold") + precision = [c.getPrecision() for c in self.counts] + recall = [c.getRecall() for c in self.counts] + f1 = [c.getF1() for c in self.counts] + plt.plot(self.thresholds, precision, label="precision") + plt.plot(self.thresholds, recall, label="recall") + plt.plot(self.thresholds, f1, label="F1-score") + plt.legend() + return fig + + def plotCounts(self, subtitle=None): + fig = plt.figure() + title = "Probability Threshold-Dependent Counts" + if subtitle is not None: + title += "\n" + subtitle + plt.title(title) + plt.xlabel("probability threshold") + plt.stackplot(self.thresholds, + [c.tp for c in self.counts], [c.tn for c in self.counts], [c.fp for c in self.counts], [c.fn for c in self.counts], + labels=["true positives", "true negatives", "false positives", "false negatives"], + colors=["#4fa244", "#79c36f", "#a25344", "#c37d6f"]) + plt.legend() + return fig + + +class ClassificationEvalStatsPlot(EvalStatsPlot[ClassificationEvalStats], ABC): + pass + + +class ClassificationEvalStatsPlotConfusionMatrix(ClassificationEvalStatsPlot): + def __init__(self, normalise=True): + self.normalise = normalise + + def createFigure(self, 
evalStats: ClassificationEvalStats, subtitle: str) -> plt.Figure: + return evalStats.plotConfusionMatrix(normalize=self.normalise, titleAdd=subtitle) + + +class ClassificationEvalStatsPlotPrecisionRecall(ClassificationEvalStatsPlot): + def createFigure(self, evalStats: ClassificationEvalStats, subtitle: str) -> Optional[plt.Figure]: + if not evalStats.isBinary: + return None + return evalStats.plotPrecisionRecallCurve(titleAdd=subtitle) + + +class ClassificationEvalStatsPlotProbabilityThresholdPrecisionRecall(ClassificationEvalStatsPlot): + def createFigure(self, evalStats: ClassificationEvalStats, subtitle: str) -> Optional[plt.Figure]: + if not evalStats.isBinary: + return None + return evalStats.getBinaryClassificationProbabilityThresholdVariationData().plotPrecisionRecall(subtitle=subtitle) + + +class ClassificationEvalStatsPlotProbabilityThresholdCounts(ClassificationEvalStatsPlot): + def createFigure(self, evalStats: ClassificationEvalStats, subtitle: str) -> Optional[plt.Figure]: + if not evalStats.isBinary: + return None + return evalStats.getBinaryClassificationProbabilityThresholdVariationData().plotCounts(subtitle=subtitle) \ No newline at end of file diff --git a/src/sensai/evaluation/eval_stats/eval_stats_clustering.py b/src/sensai/evaluation/eval_stats/eval_stats_clustering.py index 680fc848..0838fa22 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_clustering.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_clustering.py @@ -47,8 +47,8 @@ def getDistributionSummary(self) -> Dict[str, float]: result[self.NOISE_SIZE] = int(self.noiseClusterSize) return result - def getAll(self) -> Dict[str, float]: - metricsDict = super().getAll() + def metricsDict(self) -> Dict[str, float]: + metricsDict = super().metricsDict() metricsDict.update(self.getDistributionSummary()) return metricsDict diff --git a/src/sensai/evaluation/eval_stats/eval_stats_regression.py b/src/sensai/evaluation/eval_stats/eval_stats_regression.py index 9e8b7a2e..f121d4eb 100644 --- a/src/sensai/evaluation/eval_stats/eval_stats_regression.py +++ b/src/sensai/evaluation/eval_stats/eval_stats_regression.py @@ -1,12 +1,13 @@ import logging -import numpy as np -import seaborn as sns from abc import abstractmethod, ABC +from typing import List, Sequence, Optional + +import numpy as np from matplotlib import pyplot as plt from matplotlib.colors import LinearSegmentedColormap -from typing import List, Sequence, Optional -from .eval_stats_base import PredictionEvalStats, Metric, EvalStatsCollection, PredictionArray +from .eval_stats_base import PredictionEvalStats, Metric, EvalStatsCollection, PredictionArray, EvalStatsPlot +from ...util.plot import HistogramPlot log = logging.getLogger(__name__) @@ -168,26 +169,25 @@ def getEvalStatsCollection(self) -> "RegressionEvalStatsCollection": statsList.append(stats) return RegressionEvalStatsCollection(statsList) - def plotErrorDistribution(self, bins=None, figure=True, titleAdd=None) -> Optional[plt.Figure]: + def plotErrorDistribution(self, bins="auto", titleAdd=None) -> Optional[plt.Figure]: """ - :param bins: if None, seaborns default binning will be used + :param bins: bin specification (see :class:`HistogramPlot`) :param figure: whether to plot in a separate figure and return that figure :param titleAdd: a string to add to the title (on a second line) :return: the resulting figure object or None """ errors = np.array(self.y_predicted) - np.array(self.y_true) - fig = None title = "Prediction Error Distribution" if titleAdd is not None: title += "\n" + titleAdd - 
if figure: - fig = plt.figure(title.replace("\n", " ")) - sns.distplot(errors, bins=bins) - plt.title(title) - plt.xlabel("error (prediction - ground truth)") - plt.ylabel("probability density") - return fig + if bins == "auto" and len(errors) < 100: + bins = 10 # seaborn can crash with low number of data points and bins="auto" (tries to allocate vast amounts of memory) + plot = HistogramPlot(errors, bins=bins, kde=True) + plot.title(title) + plot.xlabel("error (prediction - ground truth)") + plot.ylabel("probability density") + return plot.fig def plotScatterGroundTruthPredictions(self, figure=True, titleAdd=None, **kwargs) -> Optional[plt.Figure]: """ @@ -198,7 +198,7 @@ def plotScatterGroundTruthPredictions(self, figure=True, titleAdd=None, **kwargs :return: the resulting figure object or None """ fig = None - title = "Scatter Plot of Ground Truth vs. Predicted Values" + title = "Scatter Plot of Predicted Values vs. Ground Truth" if titleAdd is not None: title += "\n" + titleAdd if figure: @@ -228,7 +228,7 @@ def plotHeatmapGroundTruthPredictions(self, figure=True, cmap=None, bins=60, tit :return: the resulting figure object or None """ fig = None - title = "Heat Map of Ground Truth vs. Predicted Values" + title = "Heat Map of Predicted Values vs. Ground Truth" if titleAdd: title += "\n" + titleAdd if figure: @@ -260,17 +260,33 @@ def plotHeatmapGroundTruthPredictions(self, figure=True, cmap=None, bins=60, tit return fig -class RegressionEvalStatsCollection(EvalStatsCollection): +class RegressionEvalStatsCollection(EvalStatsCollection[RegressionEvalStats, RegressionMetric]): def __init__(self, evalStatsList: List[RegressionEvalStats]): super().__init__(evalStatsList) self.globalStats = None def getGlobalStats(self) -> RegressionEvalStats: - """ - Gets an evaluation statistics object that combines the data from all contained eval stats objects - """ if self.globalStats is None: y_true = np.concatenate([evalStats.y_true for evalStats in self.statsList]) y_predicted = np.concatenate([evalStats.y_predicted for evalStats in self.statsList]) self.globalStats = RegressionEvalStats(y_predicted, y_true) return self.globalStats + + +class RegressionEvalStatsPlot(EvalStatsPlot[RegressionEvalStats], ABC): + pass + + +class RegressionEvalStatsPlotErrorDistribution(RegressionEvalStatsPlot): + def createFigure(self, evalStats: RegressionEvalStats, subtitle: str) -> plt.Figure: + return evalStats.plotErrorDistribution(titleAdd=subtitle) + + +class RegressionEvalStatsPlotHeatmapGroundTruthPredictions(RegressionEvalStatsPlot): + def createFigure(self, evalStats: RegressionEvalStats, subtitle: str) -> plt.Figure: + return evalStats.plotHeatmapGroundTruthPredictions(titleAdd=subtitle) + + +class RegressionEvalStatsPlotScatterGroundTruthPredictions(RegressionEvalStatsPlot): + def createFigure(self, evalStats: RegressionEvalStats, subtitle: str) -> plt.Figure: + return evalStats.plotScatterGroundTruthPredictions(titleAdd=subtitle) diff --git a/src/sensai/evaluation/eval_util.py b/src/sensai/evaluation/eval_util.py index 1c1ccb0a..e1c8f76e 100644 --- a/src/sensai/evaluation/eval_util.py +++ b/src/sensai/evaluation/eval_util.py @@ -3,36 +3,45 @@ workflow for evaluation is to use these higher-level functionalities instead of instantiating the evaluation classes directly. 
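As a quick illustration of the reworked error-distribution plot shown above (now based on `HistogramPlot` rather than seaborn's `distplot`), a minimal sketch; the import path and the numbers are invented for illustration:

```python
from sensai.evaluation.eval_stats.eval_stats_regression import RegressionEvalStats  # assumed path

y_true      = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
y_predicted = [1.1, 1.9, 3.4, 3.8, 5.2, 5.7, 7.3, 8.1]
evalStats = RegressionEvalStats(y_predicted=y_predicted, y_true=y_true)

# bins="auto" is the new default; for fewer than 100 data points it falls back to 10 bins
# to avoid the seaborn issue noted in the code above
fig = evalStats.plotErrorDistribution(titleAdd="my model")
fig.savefig("error-dist.png")
```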
""" -# TODO: provide a notebook (and possibly an rst file) that illustrates standard evaluation scenarios and at the same -# time serves as an integration test - +import functools import logging from abc import ABC, abstractmethod -from typing import Tuple, Dict, Any, Union, Generic, TypeVar, Optional, Sequence, Callable +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, Any, Union, Generic, TypeVar, Optional, Sequence, Callable, Set, Iterable, List import matplotlib.figure import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns from .crossval import VectorModelCrossValidationData, VectorRegressionModelCrossValidationData, \ VectorClassificationModelCrossValidationData, \ VectorClassificationModelCrossValidator, VectorRegressionModelCrossValidator, VectorModelCrossValidator, VectorModelCrossValidatorParams -from .eval_stats.eval_stats_base import EvalStats, EvalStatsCollection +from .eval_stats import RegressionEvalStatsCollection, ClassificationEvalStatsCollection, RegressionEvalStatsPlotErrorDistribution, \ + RegressionEvalStatsPlotHeatmapGroundTruthPredictions, RegressionEvalStatsPlotScatterGroundTruthPredictions, \ + ClassificationEvalStatsPlotConfusionMatrix, ClassificationEvalStatsPlotPrecisionRecall, RegressionEvalStatsPlot, \ + ClassificationEvalStatsPlotProbabilityThresholdPrecisionRecall, ClassificationEvalStatsPlotProbabilityThresholdCounts +from .eval_stats.eval_stats_base import EvalStats, EvalStatsCollection, EvalStatsPlot from .eval_stats.eval_stats_classification import ClassificationEvalStats from .eval_stats.eval_stats_regression import RegressionEvalStats from .evaluator import VectorModelEvaluator, VectorModelEvaluationData, VectorRegressionModelEvaluator, \ VectorRegressionModelEvaluationData, VectorClassificationModelEvaluator, VectorClassificationModelEvaluationData, \ VectorRegressionModelEvaluatorParams, VectorClassificationModelEvaluatorParams, VectorModelEvaluatorParams from ..data import InputOutputData +from ..feature_importance import AggregatedFeatureImportance, FeatureImportanceProvider, plotFeatureImportance, FeatureImportance +from ..tracking import TrackedExperiment +from ..util.deprecation import deprecated from ..util.io import ResultWriter from ..util.string import prettyStringRepr -from ..vector_model import VectorClassificationModel, VectorRegressionModel, VectorModel +from ..vector_model import VectorClassificationModel, VectorRegressionModel, VectorModel, VectorModelBase log = logging.getLogger(__name__) TModel = TypeVar("TModel", bound=VectorModel) TEvalStats = TypeVar("TEvalStats", bound=EvalStats) +TEvalStatsPlot = TypeVar("TEvalStatsPlot", bound=EvalStatsPlot) TEvalStatsCollection = TypeVar("TEvalStatsCollection", bound=EvalStatsCollection) TEvaluator = TypeVar("TEvaluator", bound=VectorModelEvaluator) TCrossValidator = TypeVar("TCrossValidator", bound=VectorModelCrossValidator) @@ -125,15 +134,82 @@ def evalModelViaEvaluator(model: TModel, inputOutputData: InputOutputData, testF return ev.performSimpleEvaluation(model, showPlots=True, logResults=True) +class EvaluationResultCollector: + def __init__(self, showPlots: bool = True, resultWriter: Optional[ResultWriter] = None): + self.showPlots = showPlots + self.resultWriter = resultWriter + + def addFigure(self, name: str, fig: matplotlib.figure.Figure): + if self.resultWriter is not None: + self.resultWriter.writeFigure(name, fig, closeFigure=not self.showPlots) + + def addDataFrameCsvFile(self, name: str, df: 
pd.DataFrame): + if self.resultWriter is not None: + self.resultWriter.writeDataFrameCsvFile(name, df) + + def child(self, addedFilenamePrefix): + resultWriter = self.resultWriter + if resultWriter: + resultWriter = resultWriter.childWithAddedPrefix(addedFilenamePrefix) + return self.__class__(showPlots=self.showPlots, resultWriter=resultWriter) + + +class EvalStatsPlotCollector(Generic[TEvalStats, TEvalStatsPlot]): + def __init__(self): + self.plots: Dict[str, EvalStatsPlot] = {} + self.disabledPlots: Set[str] = set() + + def addPlot(self, name: str, plot: EvalStatsPlot): + self.plots[name] = plot + + def getEnabledPlots(self) -> List[str]: + return [p for p in self.plots if p not in self.disabledPlots] + + def disablePlots(self, *names: str): + self.disabledPlots.update(names) + + def createPlots(self, evalStats: EvalStats, subtitle: str, resultCollector: EvaluationResultCollector): + knownPlots = set(self.plots.keys()) + unknownDisabledPlots = self.disabledPlots.difference(knownPlots) + if len(unknownDisabledPlots) > 0: + log.warning(f"Plots were disabled which are not registered: {unknownDisabledPlots}; known plots: {knownPlots}") + for name, plot in self.plots.items(): + if name not in self.disabledPlots: + fig = plot.createFigure(evalStats, subtitle) + if fig is not None: + resultCollector.addFigure(name, fig) + + +class RegressionEvalStatsPlotCollector(EvalStatsPlotCollector[RegressionEvalStats, RegressionEvalStatsPlot]): + def __init__(self): + super().__init__() + self.addPlot("error-dist", RegressionEvalStatsPlotErrorDistribution()) + self.addPlot("heatmap-gt-pred", RegressionEvalStatsPlotHeatmapGroundTruthPredictions()) + self.addPlot("scatter-gt-pred", RegressionEvalStatsPlotScatterGroundTruthPredictions()) + + +class ClassificationEvalStatsPlotCollector(EvalStatsPlotCollector[RegressionEvalStats, RegressionEvalStatsPlot]): + def __init__(self): + super().__init__() + self.addPlot("confusion-matrix-rel", ClassificationEvalStatsPlotConfusionMatrix(normalise=True)) + self.addPlot("confusion-matrix-abs", ClassificationEvalStatsPlotConfusionMatrix(normalise=False)) + # the plots below apply to the binary case only (skipped for non-binary case) + self.addPlot("precision-recall", ClassificationEvalStatsPlotPrecisionRecall()) + self.addPlot("threshold-precision-recall", ClassificationEvalStatsPlotProbabilityThresholdPrecisionRecall()) + self.addPlot("threshold-counts", ClassificationEvalStatsPlotProbabilityThresholdCounts()) + + class EvaluationUtil(ABC, Generic[TModel, TEvaluator, TEvalData, TCrossValidator, TCrossValData, TEvalStats]): """ Utility class for the evaluation of models based on a dataset """ def __init__(self, inputOutputData: InputOutputData, + evalStatsPlotCollector: Union[RegressionEvalStatsPlotCollector, ClassificationEvalStatsPlotCollector], evaluatorParams: Optional[Union[VectorRegressionModelEvaluatorParams, VectorClassificationModelEvaluatorParams, Dict[str, Any]]] = None, crossValidatorParams: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None): """ :param inputOutputData: the data set to use for evaluation + :param evalStatsPlotCollector: a collector for plots generated from evaluation stats objects :param evaluatorParams: parameters with which to instantiate evaluators :param crossValidatorParams: parameters with which to instantiate cross-validators """ @@ -144,21 +220,7 @@ def __init__(self, inputOutputData: InputOutputData, self.evaluatorParams = evaluatorParams self.crossValidatorParams = crossValidatorParams self.inputOutputData = 
inputOutputData - - class ResultCollector: - def __init__(self, showPlots: bool = True, resultWriter: Optional[ResultWriter] = None): - self.showPlots = showPlots - self.resultWriter = resultWriter - - def addFigure(self, name, fig: matplotlib.figure.Figure): - if self.resultWriter is not None: - self.resultWriter.writeFigure(name, fig, closeFigure=not self.showPlots) - - def child(self, addedFilenamePrefix): - resultWriter = self.resultWriter - if resultWriter: - resultWriter = resultWriter.childWithAddedPrefix(addedFilenamePrefix) - return self.__class__(showPlots=self.showPlots, resultWriter=resultWriter) + self.evalStatsPlotCollector = evalStatsPlotCollector def createEvaluator(self, model: TModel = None, isRegression: bool = None) -> TEvaluator: """ @@ -183,11 +245,14 @@ def createCrossValidator(self, model: TModel = None, isRegression: bool = None) return createVectorModelCrossValidator(self.inputOutputData, model=model, isRegression=isRegression, params=self.crossValidatorParams) def performSimpleEvaluation(self, model: TModel, createPlots=True, showPlots=False, logResults=True, resultWriter: ResultWriter = None, - additionalEvaluationOnTrainingData=False, fitModel=True, writeEvalStats=False) -> TEvalData: + additionalEvaluationOnTrainingData=False, fitModel=True, writeEvalStats=False, + trackedExperiment: TrackedExperiment = None) -> TEvalData: if showPlots and not createPlots: raise ValueError("showPlots=True requires createPlots=True") resultWriter = self._resultWriterForModel(resultWriter, model) evaluator = self.createEvaluator(model) + if trackedExperiment is not None: + evaluator.setTrackedExperiment(trackedExperiment) log.info(f"Evaluating {model} via {evaluator}") if fitModel: evaluator.fitModel(model) @@ -221,9 +286,10 @@ def gatherResults(evalResultData: VectorModelEvaluationData, resultWriter, subti def _resultWriterForModel(resultWriter: Optional[ResultWriter], model: TModel) -> Optional[ResultWriter]: if resultWriter is None: return None - return resultWriter.childWithAddedPrefix(model.getName() + "-") + return resultWriter.childWithAddedPrefix(model.getName() + "_") - def performCrossValidation(self, model: TModel, showPlots=False, logResults=True, resultWriter: Optional[ResultWriter] = None) -> TCrossValData: + def performCrossValidation(self, model: TModel, showPlots=False, logResults=True, resultWriter: Optional[ResultWriter] = None, + trackedExperiment: TrackedExperiment = None) -> TCrossValData: """ Evaluates the given model via cross-validation @@ -232,14 +298,18 @@ def performCrossValidation(self, model: TModel, showPlots=False, logResults=True :param logResults: whether to log evaluation results :param resultWriter: a writer with which to store text files and plots. 
The evaluated model's name is added to each filename automatically + :param trackedExperiment: a tracked experiment with which results shall be associated :return: cross-validation result data """ resultWriter = self._resultWriterForModel(resultWriter, model) crossValidator = self.createCrossValidator(model) + if trackedExperiment is not None: + crossValidator.setTrackedExperiment(trackedExperiment) crossValidationData = crossValidator.evalModel(model) - aggStatsByVar = {varName: crossValidationData.getEvalStatsCollection(predictedVarName=varName).aggStats() + aggStatsByVar = {varName: crossValidationData.getEvalStatsCollection(predictedVarName=varName).aggMetricsDict() for varName in crossValidationData.predictedVarNames} - strEvalResults = str(pd.DataFrame.from_dict(aggStatsByVar, orient="index")) + df = pd.DataFrame.from_dict(aggStatsByVar, orient="index") + strEvalResults = df.to_string() if logResults: log.info(f"Cross-validation results:\n{strEvalResults}") if resultWriter is not None: @@ -248,7 +318,10 @@ def performCrossValidation(self, model: TModel, showPlots=False, logResults=True return crossValidationData def compareModels(self, models: Sequence[TModel], resultWriter: Optional[ResultWriter] = None, useCrossValidation=False, - fitModels=True, writeIndividualResults=True, sortColumn: Optional[str] = None, sortAscending: bool = True) -> pd.DataFrame: + fitModels=True, writeIndividualResults=True, sortColumn: Optional[str] = None, sortAscending: bool = True, + alsoIncludeUnsortedResults: bool = False, + visitors: Optional[Iterable["ModelComparisonVisitor"]] = None, + writeVisitorResults=False) -> "ModelComparisonData": """ Compares several models via simple evaluation or cross-validation @@ -261,42 +334,70 @@ def compareModels(self, models: Sequence[TModel], resultWriter: Optional[ResultW summary) :param sortColumn: column/metric name by which to sort :param sortAscending: whether to sort in ascending order - :return: a data frame containing evaluation metrics on all models + :param alsoIncludeUnsortedResults: whether to also include, for the case where the results are sorted, the unsorted table of + results in the results text + :param visitors: visitors which may process individual results + :param writeVisitorResults: whether to collect results from visitors (if any) after the comparison + :return: the comparison results """ statsList = [] - for model in models: + resultByModelName = {} + for i, model in enumerate(models, start=1): + modelName = model.getName() + log.info(f"Evaluating model {i}/{len(models)} named '{modelName}' ...") if useCrossValidation: if not fitModels: raise ValueError("Cross-validation necessitates that models be retrained; got fitModels=False") - crossValidationResult = self.performCrossValidation(model, resultWriter=resultWriter if writeIndividualResults else None) - statsDict = crossValidationResult.getEvalStatsCollection().aggStats() + crossValData = self.performCrossValidation(model, resultWriter=resultWriter if writeIndividualResults else None) + modelResult = ModelComparisonData.Result(crossValData=crossValData) + resultByModelName[modelName] = modelResult + evalStatsCollection = crossValData.getEvalStatsCollection() + statsDict = evalStatsCollection.aggMetricsDict() else: - evalStats: EvalStats = self.performSimpleEvaluation(model, resultWriter=resultWriter if writeIndividualResults else None, - fitModel=fitModels).getEvalStats() - statsDict = evalStats.getAll() - statsDict["modelName"] = model.getName() + evalData = 
self.performSimpleEvaluation(model, resultWriter=resultWriter if writeIndividualResults else None, + fitModel=fitModels) + modelResult = ModelComparisonData.Result(evalData=evalData) + resultByModelName[modelName] = modelResult + evalStats = evalData.getEvalStats() + statsDict = evalStats.metricsDict() + statsDict["modelName"] = modelName statsList.append(statsDict) + if visitors is not None: + for visitor in visitors: + visitor.visit(modelName, modelResult) resultsDF = pd.DataFrame(statsList).set_index("modelName") + unsortedResultsDF = resultsDF if sortColumn is not None: if sortColumn not in resultsDF.columns: - log.warning(f"Requested sort column '{sortColumn}' not in list of columns {list(resultsDF.columns)}") - else: - resultsDF.sort_values(sortColumn, ascending=sortAscending, inplace=True) + altSortColumn = f"mean[{sortColumn}]" + if altSortColumn in resultsDF.columns: + sortColumn = altSortColumn + else: + sortColumn = None + log.warning(f"Requested sort column '{sortColumn}' (or '{altSortColumn}') not in list of columns {list(resultsDF.columns)}") + if sortColumn is not None: + resultsDF = resultsDF.sort_values(sortColumn, ascending=sortAscending, inplace=False) strResults = f"Model comparison results:\n{resultsDF.to_string()}" + if alsoIncludeUnsortedResults and sortColumn is not None: + strResults += f"\n\nModel comparison results (unsorted):\n{unsortedResultsDF.to_string()}" log.info(strResults) if resultWriter is not None: suffix = "crossval" if useCrossValidation else "simple-eval" - strResults += "\n\n" + "\n\n".join([f"{model.getName()} = {str(model)}" for model in models]) + strResults += "\n\n" + "\n\n".join([f"{model.getName()} = {model.pprints()}" for model in models]) resultWriter.writeTextFile(f"model-comparison-results-{suffix}", strResults) - return resultsDF + if visitors is not None and writeVisitorResults: + resultCollector = EvaluationResultCollector(showPlots=False, resultWriter=resultWriter) + for visitor in visitors: + visitor.collectResults(resultCollector) + return ModelComparisonData(resultsDF, resultByModelName) - def compareModelsCrossValidation(self, models: Sequence[TModel], resultWriter: Optional[ResultWriter] = None) -> pd.DataFrame: + def compareModelsCrossValidation(self, models: Sequence[TModel], resultWriter: Optional[ResultWriter] = None) -> "ModelComparisonData": """ Compares several models via cross-validation :param models: the models to compare :param resultWriter: a writer with which to store results of the comparison - :return: a data frame containing evaluation metrics on all models + :return: the comparison results """ return self.compareModels(models, resultWriter=resultWriter, useCrossValidation=True) @@ -311,10 +412,10 @@ def createPlots(self, data: Union[TEvalData, TCrossValData], showPlots=True, res """ if not showPlots and resultWriter is None: return - resultCollector = self.ResultCollector(showPlots=showPlots, resultWriter=resultWriter) + resultCollector = EvaluationResultCollector(showPlots=showPlots, resultWriter=resultWriter) self._createPlots(data, resultCollector, subtitle=subtitlePrefix + data.modelName) - def _createPlots(self, data: Union[TEvalData, TCrossValData], resultCollector: ResultCollector, subtitle=None): + def _createPlots(self, data: Union[TEvalData, TCrossValData], resultCollector: EvaluationResultCollector, subtitle=None): def createPlots(predVarName, rc, subt): if isinstance(data, VectorModelCrossValidationData): @@ -332,82 +433,356 @@ def createPlots(predVarName, rc, subt): for predictedVarName in 
predictedVarNames: createPlots(predictedVarName, resultCollector.child(predictedVarName+"-"), f"{predictedVarName}, {subtitle}") - @abstractmethod - def _createEvalStatsPlots(self, evalStats: TEvalStats, resultCollector: ResultCollector, subtitle=None): + def _createEvalStatsPlots(self, evalStats: TEvalStats, resultCollector: EvaluationResultCollector, subtitle=None): """ :param evalStats: the evaluation results for which to create plots :param resultCollector: the collector to which all plots are to be passed :param subtitle: the subtitle to use for generated plots (if any) """ - pass + self.evalStatsPlotCollector.createPlots(evalStats, subtitle, resultCollector) class RegressionEvaluationUtil(EvaluationUtil[VectorRegressionModel, VectorRegressionModelEvaluator, VectorRegressionModelEvaluationData, VectorRegressionModelCrossValidator, VectorRegressionModelCrossValidationData, RegressionEvalStats]): - def _createEvalStatsPlots(self, evalStats: RegressionEvalStats, resultCollector: EvaluationUtil.ResultCollector, subtitle=None): - resultCollector.addFigure("error-dist", evalStats.plotErrorDistribution(titleAdd=subtitle)) - resultCollector.addFigure("heatmap-gt-pred", evalStats.plotHeatmapGroundTruthPredictions(titleAdd=subtitle)) - resultCollector.addFigure("scatter-gt-pred", evalStats.plotScatterGroundTruthPredictions(titleAdd=subtitle)) + def __init__(self, inputOutputData: InputOutputData, + evaluatorParams: Optional[Union[VectorRegressionModelEvaluatorParams, Dict[str, Any]]] = None, + crossValidatorParams: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None): + """ + :param inputOutputData: the data set to use for evaluation + :param evaluatorParams: parameters with which to instantiate evaluators + :param crossValidatorParams: parameters with which to instantiate cross-validators + """ + super().__init__(inputOutputData, evalStatsPlotCollector=RegressionEvalStatsPlotCollector(), evaluatorParams=evaluatorParams, + crossValidatorParams=crossValidatorParams) class ClassificationEvaluationUtil(EvaluationUtil[VectorClassificationModel, VectorClassificationModelEvaluator, VectorClassificationModelEvaluationData, VectorClassificationModelCrossValidator, VectorClassificationModelCrossValidationData, ClassificationEvalStats]): - def _createEvalStatsPlots(self, evalStats: ClassificationEvalStats, resultCollector: EvaluationUtil.ResultCollector, subtitle=None): - resultCollector.addFigure("confusion-matrix", evalStats.plotConfusionMatrix(titleAdd=subtitle)) + def __init__(self, inputOutputData: InputOutputData, + evaluatorParams: Optional[Union[VectorClassificationModelEvaluatorParams, Dict[str, Any]]] = None, + crossValidatorParams: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None): + """ + :param inputOutputData: the data set to use for evaluation + :param evaluatorParams: parameters with which to instantiate evaluators + :param crossValidatorParams: parameters with which to instantiate cross-validators + """ + super().__init__(inputOutputData, evalStatsPlotCollector=ClassificationEvalStatsPlotCollector(), evaluatorParams=evaluatorParams, + crossValidatorParams=crossValidatorParams) class MultiDataEvaluationUtil: - def __init__(self, inputOutputDataDict: Dict[str, InputOutputData], keyName: str = "dataset"): + def __init__(self, inputOutputDataDict: Dict[str, InputOutputData], keyName: str = "dataset", + metaDataDict: Optional[Dict[str, Dict[str, Any]]] = None): """ :param inputOutputDataDict: a dictionary mapping from names to the data sets with 
which to evaluate models - :param keyName: a name for the key value used in inputOutputDataDict + :param keyName: a name for the key value used in inputOutputDataDict, which will be used as a column name in result data frames + :param metaDataDict: a dictionary which maps from a name (same keys as in inputOutputDataDict) to a dictionary, which maps + from a column name to a value and which is to be used to extend the result data frames containing per-dataset results """ self.inputOutputDataDict = inputOutputDataDict self.keyName = keyName + if metaDataDict is not None: + self.metaDF = pd.DataFrame(metaDataDict.values(), index=metaDataDict.keys()) + else: + self.metaDF = None - def compareModelsCrossValidation(self, modelFactories: Sequence[Callable[[], VectorModel]], + def compareModelsCrossValidation(self, modelFactories: Sequence[Callable[[], Union[VectorRegressionModel, VectorClassificationModel]]], resultWriter: Optional[ResultWriter] = None, writePerDatasetResults=True, - crossValidatorParams: Optional[Dict[str, Any]] = None, columnNameForModelRanking: str = None, rankMax=True) -> Tuple[pd.DataFrame, pd.DataFrame]: + crossValidatorParams: Optional[Dict[str, Any]] = None, columnNameForModelRanking: str = None, rankMax=True) -> "MultiDataModelComparisonData": """ - :param modelFactories: a sequence of factory functions for the creation of models to evaluate + Deprecated. Use compareModels instead. + """ + return self.compareModels(modelFactories, useCrossValidation=True, resultWriter=resultWriter, writePerDatasetResults=writePerDatasetResults, + crossValidatorParams=crossValidatorParams, + columnNameForModelRanking=columnNameForModelRanking, rankMax=rankMax) + + def compareModels(self, modelFactories: Sequence[Callable[[], Union[VectorRegressionModel, VectorClassificationModel]]], + useCrossValidation=False, + resultWriter: Optional[ResultWriter] = None, + writePerDatasetResults=True, + evaluatorParams: Optional[Union[VectorRegressionModelEvaluatorParams, VectorClassificationModelEvaluatorParams, Dict[str, Any]]] = None, + crossValidatorParams: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None, + columnNameForModelRanking: str = None, + rankMax=True, + createMetricDistributionPlots=True, + createCombinedEvalStatsPlots=False, + distributionPlots_cdf = True, + distributionPlots_cdfComplementary = False, + visitors: Optional[Iterable["ModelComparisonVisitor"]] = None) -> Union["RegressionMultiDataModelComparisonData", "ClassificationMultiDataModelComparisonData"]: + """ + :param modelFactories: a sequence of factory functions for the creation of models to evaluate; every factory must result + in a model with a fixed model name (otherwise results cannot be correctly aggregated) + :param useCrossValidation: whether to use cross-validation (rather than a single split) for model evaluation :param resultWriter: a writer with which to store results :param writePerDatasetResults: whether to use resultWriter (if not None) in order to generate detailed results for each dataset in a subdirectory named according to the name of the dataset - :param crossValidatorParams: parameters to use for the instantiation of cross-validators + :param evaluatorParams: parameters to use for the instantiation of evaluators (relevant if useCrossValidation==False) + :param crossValidatorParams: parameters to use for the instantiation of cross-validators (relevant if useCrossValidation==True) :param columnNameForModelRanking: column name to use for ranking models :param rankMax: if true, use max 
for ranking, else min - :return: a pair of data frames (allDF, meanDF) where allDF contains all the individual cross-validation results - for every dataset and meanDF contains one row for each model with results averaged across datasets + :param createMetricDistributionPlots: whether to create, for each model, plots of the distribution of each metric across the datasets + :param createCombinedEvalStatsPlots: whether to combine, for each type of model, the EvalStats objects from the individual experiments + into a single objects that holds all results and use it to create plots reflecting the overall result. + Note that for classification, this is only possible if all individual experiments use the same set of class labels. + :param visitors: visitors which may process individual results. Plots generated by visitors are created/collected at the end of the + comparison. + :return: a pair of data frames (allDF, meanDF) where allDF contains all the individual evaluation results (one row per data set) + and meanDF contains one row for each model with results averaged across datasets """ - allResults = pd.DataFrame() - for key, inputOutputData in self.inputOutputDataDict.items(): - log.info(f"Evaluating models for {key}") + allResultsDF = pd.DataFrame() + evalStatsByModelName = defaultdict(list) + isRegression = None + plotCollector: Optional[EvalStatsPlotCollector] = None + modelNames = None + modelName2StringRepr = None + + for i, (key, inputOutputData) in enumerate(self.inputOutputDataDict.items(), start=1): + log.info(f"Evaluating models for data set #{i}/{len(self.inputOutputDataDict)}: {self.keyName}={key}") models = [f() for f in modelFactories] - modelsAreRegression = [model.isRegressionModel() for model in models] - if all(modelsAreRegression): - isRegression = True - elif not any(modelsAreRegression): - isRegression = False - else: - raise ValueError("The models have to be either all regression models or all classification, not a mixture") - ev = createEvaluationUtil(inputOutputData, isRegression=isRegression, crossValidatorParams=crossValidatorParams) - childResultWriter = resultWriter.childForSubdirectory(key) if writePerDatasetResults else None - df = ev.compareModelsCrossValidation(models, resultWriter=childResultWriter) + + currentModelNames = [model.getName() for model in models] + if modelNames is None: + modelNames = currentModelNames + elif modelNames != currentModelNames: + log.warning(f"Model factories do not produce fixed names; use model.withName to name your models. 
Got {currentModelNames}, previously got {modelNames}") + + if isRegression is None: + modelsAreRegression = [model.isRegressionModel() for model in models] + if all(modelsAreRegression): + isRegression = True + elif not any(modelsAreRegression): + isRegression = False + else: + raise ValueError("The models have to be either all regression models or all classification, not a mixture") + + ev = createEvaluationUtil(inputOutputData, isRegression=isRegression, evaluatorParams=evaluatorParams, + crossValidatorParams=crossValidatorParams) + + if plotCollector is None: + plotCollector = ev.evalStatsPlotCollector + + # compute data frame with results for current data set + childResultWriter = resultWriter.childForSubdirectory(key) if (writePerDatasetResults and resultWriter is not None) else None + comparisonData = ev.compareModels(models, useCrossValidation=useCrossValidation, resultWriter=childResultWriter, + visitors=visitors, writeVisitorResults=False) + df = comparisonData.resultsDF + + # augment data frame df[self.keyName] = key df["modelName"] = df.index - if columnNameForModelRanking is not None: - if columnNameForModelRanking not in df.columns: - raise ValueError(f"Rank metric {columnNameForModelRanking} not contained in columns {df.columns}") - df["bestModel"] = 0 - if rankMax: - df["bestModel"].loc[df[columnNameForModelRanking].idxmax()] = 1 - else: - df["bestModel"].loc[df[columnNameForModelRanking].idxmin()] = 1 df = df.reset_index(drop=True) - allResults = pd.concat((allResults, df)) - strAllResults = f"All results:\n{allResults.to_string()}" + + # collect eval stats objects by model name and remove from data frame + for modelName, result in comparisonData.resultByModelName.items(): + if useCrossValidation: + evalStats = result.crossValData.getEvalStatsCollection().getGlobalStats() + else: + evalStats = result.evalData.getEvalStats() + evalStatsByModelName[modelName].append(evalStats) + + allResultsDF = pd.concat((allResultsDF, df)) + + if modelName2StringRepr is None: + modelName2StringRepr = {model.getName(): model.pprints() for model in models} + + if self.metaDF is not None: + allResultsDF = allResultsDF.join(self.metaDF, on=self.keyName, how="left") + + strAllResults = f"All results:\n{allResultsDF.to_string()}" log.info(strAllResults) - meanResults = allResults.groupby("modelName").mean() - strMeanResults = f"Mean results:\n{meanResults.to_string()}" + + # create mean result by model, removing any metrics/columns that produced NaN values + # (because the mean would be computed without them, skipna parameter unsupported) + allResultsGrouped = allResultsDF.dropna(axis=1).groupby("modelName") + meanResultsDF: pd.DataFrame = allResultsGrouped.mean() + for colName in [columnNameForModelRanking, f"mean[{columnNameForModelRanking}]"]: + if colName in meanResultsDF: + meanResultsDF.sort_values(columnNameForModelRanking, inplace=True, ascending=not rankMax) + break + strMeanResults = f"Mean results (averaged across {len(self.inputOutputDataDict)} data sets):\n{meanResultsDF.to_string()}" log.info(strMeanResults) + + # create further aggregations + aggDFs = [] + for opName, aggFn in [("mean", lambda x: x.mean()), ("std", lambda x: x.std()), ("min", lambda x: x.min()), ("max", lambda x: x.max())]: + aggDF = aggFn(allResultsGrouped) + aggDF.columns = [f"{opName}[{c}]" for c in aggDF.columns] + aggDFs.append(aggDF) + furtherAggsDF = pd.concat(aggDFs, axis=1) + furtherAggsDF = furtherAggsDF.loc[meanResultsDF.index] # apply same sort order (index is modelName) + columnOrder = 
functools.reduce(lambda a, b: a + b, [list(t) for t in zip(*[df.columns for df in aggDFs])]) + furtherAggsDF = furtherAggsDF[columnOrder] + strFurtherAggs = f"Further aggregations:\n{furtherAggsDF.to_string()}" + log.info(strFurtherAggs) + if resultWriter is not None: - resultWriter.writeTextFile("model-comparison-results", strMeanResults + "\n\n" + strAllResults) - return allResults, meanResults + comparisonContent = strMeanResults + "\n\n" + strFurtherAggs + "\n\n" + strAllResults + comparisonContent += "\n\nModels [example instance]:\n\n" + comparisonContent += "\n\n".join(f"{name} = {s}" for name, s in modelName2StringRepr.items()) + resultWriter.writeTextFile("model-comparison-results", comparisonContent) + + # create plots from combined data for each model + if createCombinedEvalStatsPlots: + for modelName, evalStatsList in evalStatsByModelName.items(): + childResultWriter = resultWriter.childWithAddedPrefix(modelName + "_") if resultWriter is not None else None + resultCollector = EvaluationResultCollector(showPlots=False, resultWriter=childResultWriter) + if isRegression: + evalStats = RegressionEvalStatsCollection(evalStatsList).getGlobalStats() + else: + evalStats = ClassificationEvalStatsCollection(evalStatsList).getGlobalStats() + plotCollector.createPlots(evalStats, subtitle=modelName, resultCollector=resultCollector) + + # collect results from visitors (if any) + resultCollector = EvaluationResultCollector(showPlots=False, resultWriter=resultWriter) + if visitors is not None: + for visitor in visitors: + visitor.collectResults(resultCollector) + + # create result + if isRegression: + mdmcData = RegressionMultiDataModelComparisonData(allResultsDF, meanResultsDF, furtherAggsDF, evalStatsByModelName) + else: + mdmcData = ClassificationMultiDataModelComparisonData(allResultsDF, meanResultsDF, furtherAggsDF, evalStatsByModelName) + + # plot distributions + if createMetricDistributionPlots and resultWriter is not None: + mdmcData.createDistributionPlots(resultWriter, cdf=distributionPlots_cdf, cdfComplementary=distributionPlots_cdfComplementary) + + return mdmcData + + +class ModelComparisonData: + @dataclass + class Result: + evalData: Union[VectorClassificationModelEvaluationData, VectorRegressionModelEvaluationData] = None + crossValData: Union[VectorClassificationModelCrossValidationData, VectorRegressionModelCrossValidationData] = None + + def __init__(self, resultsDF: pd.DataFrame, resultsByModelName: Dict[str, Result]): + self.resultsDF = resultsDF + self.resultByModelName = resultsByModelName + + def getBestModelName(self, metricName: str) -> str: + idx = np.argmax(self.resultsDF[metricName]) + return self.resultsDF.index[idx] + + def getBestModel(self, metricName: str) -> Union[VectorClassificationModel, VectorRegressionModel, VectorModelBase]: + result = self.resultByModelName[self.getBestModelName(metricName)] + if result.evalData is None: + raise ValueError("The best model is not well-defined when using cross-validation") + return result.evalData.model + + +class ModelComparisonVisitor(ABC): + @abstractmethod + def visit(self, modelName: str, result: ModelComparisonData.Result): + pass + + @abstractmethod + def collectResults(self, resultCollector: EvaluationResultCollector) -> None: + """ + Collects results (such as figures) at the end of the model comparison, based on the results collected + + :param resultCollector: the collector to which figures are to be added + """ + pass + + +class ModelComparisonVisitorAggregatedFeatureImportance(ModelComparisonVisitor): + """ + 
During a model comparison, computes aggregated feature importance values for the model with the given name + """ + def __init__(self, modelName: str, featureAggRegEx: Sequence[str] = (), writeFigure=True, writeDataFrameCSV=False): + """ + :param modelName: the name of the model for which to compute the aggregated feature importance values + :param featureAggRegEx: a sequence of regular expressions describing which feature names to sum as one. Each regex must + contain exactly one group. If a regex matches a feature name, the feature importance will be summed under the key + of the matched group instead of the full feature name. For example, the regex r"(\w+)_\d+$" will cause "foo_1" and "foo_2" + to be summed under "foo" and similarly "bar_1" and "bar_2" to be summed under "bar". + """ + self.modelName = modelName + self.aggFeatureImportance = AggregatedFeatureImportance(featureAggRegEx=featureAggRegEx) + self.writeFigure = writeFigure + self.writeDataFrameCSV = writeDataFrameCSV + + def visit(self, modelName: str, result: ModelComparisonData.Result): + if modelName == self.modelName: + if result.crossValData is not None: + models = result.crossValData.trainedModels + if models is not None: + for model in models: + self._collect(model) + else: + raise ValueError("Models were not returned in cross-validation results") + elif result.evalData is not None: + self._collect(result.evalData.model) + + def _collect(self, model: Union[FeatureImportanceProvider, VectorModelBase]): + if not isinstance(model, FeatureImportanceProvider): + raise ValueError(f"Got model which does inherit from {FeatureImportanceProvider.__qualname__}: {model}") + self.aggFeatureImportance.add(model.getFeatureImportanceDict()) + + @deprecated("Use getFeatureImportance and create the plot using the returned object") + def plotFeatureImportance(self) -> plt.Figure: + featureImportanceDict = self.aggFeatureImportance.getAggregatedFeatureImportance().getFeatureImportanceDict() + return plotFeatureImportance(featureImportanceDict, subtitle=self.modelName) + + def getFeatureImportance(self) -> FeatureImportance: + return self.aggFeatureImportance.getAggregatedFeatureImportance() + + def collectResults(self, resultCollector: EvaluationResultCollector): + featureImportance = self.getFeatureImportance() + if self.writeFigure: + resultCollector.addFigure(f"{self.modelName}_feature-importance", featureImportance.plot()) + if self.writeDataFrameCSV: + resultCollector.addDataFrameCsvFile(f"{self.modelName}_feature-importance", featureImportance.getDataFrame()) + + +class MultiDataModelComparisonData(Generic[TEvalStats, TEvalStatsCollection], ABC): + def __init__(self, allResultsDF: pd.DataFrame, meanResultsDF: pd.DataFrame, aggResultsDF: pd.DataFrame, + evalStatsByModelName: Dict[str, List[TEvalStats]]): + self.allResultsDF = allResultsDF + self.meanResultsDF = meanResultsDF + self.aggResultsDF = aggResultsDF + self.evalStatsByModelName = evalStatsByModelName + + def getModelNames(self) -> List[str]: + return list(self.evalStatsByModelName.keys()) + + def getEvalStatsList(self, modelName: str) -> List[TEvalStats]: + return self.evalStatsByModelName[modelName] + + @abstractmethod + def getEvalStatsCollection(self, modelName: str) -> TEvalStatsCollection: + pass + + def createDistributionPlots(self, resultWriter: ResultWriter, cdf=True, cdfComplementary=False): + """ + Creates plots of distributions of metrics across datasets for each model as a histogram, and additionally + any x-y plots (scatter plots & heat maps) for metrics that 
have associated paired metrics that were also computed + + :param resultWriter: the result writer + :param cdf: whether to additionally plot, for each distribution, the cumulative distribution function + :param cdfComplementary: whether to plot the complementary cdf, provided that ``cdf`` is True + """ + for modelName in self.getModelNames(): + evalStatsCollection = self.getEvalStatsCollection(modelName) + for metricName in evalStatsCollection.getMetricNames(): + # plot distribution + fig = evalStatsCollection.plotDistribution(metricName, cdf=cdf, cdfComplementary=cdfComplementary) + resultWriter.writeFigure(f"{modelName}_dist-{metricName}", fig) + # scatter plot with paired metrics + metric = evalStatsCollection.getMetricByName(metricName) + for pairedMetric in metric.getPairedMetrics(): + if evalStatsCollection.hasMetric(pairedMetric): + fig = evalStatsCollection.plotScatter(metric.name, pairedMetric.name) + resultWriter.writeFigure(f"{modelName}_scatter-{metric.name}-{pairedMetric.name}", fig) + fig = evalStatsCollection.plotHeatMap(metric.name, pairedMetric.name) + resultWriter.writeFigure(f"{modelName}_heatmap-{metric.name}-{pairedMetric.name}", fig) + + +class ClassificationMultiDataModelComparisonData(MultiDataModelComparisonData[ClassificationEvalStats, ClassificationEvalStatsCollection]): + def getEvalStatsCollection(self, modelName: str): + return ClassificationEvalStatsCollection(self.getEvalStatsList(modelName)) + + +class RegressionMultiDataModelComparisonData(MultiDataModelComparisonData[RegressionEvalStats, RegressionEvalStatsCollection]): + def getEvalStatsCollection(self, modelName: str): + return RegressionEvalStatsCollection(self.getEvalStatsList(modelName)) \ No newline at end of file diff --git a/src/sensai/evaluation/evaluator.py b/src/sensai/evaluation/evaluator.py index 4f39d0e7..2760c87e 100644 --- a/src/sensai/evaluation/evaluator.py +++ b/src/sensai/evaluation/evaluator.py @@ -5,12 +5,13 @@ import pandas as pd +from .eval_stats import GUESS from .eval_stats.eval_stats_base import EvalStats, EvalStatsCollection from .eval_stats.eval_stats_classification import ClassificationEvalStats, ClassificationMetric from .eval_stats.eval_stats_regression import RegressionEvalStats, RegressionEvalStatsCollection, RegressionMetric -from ..data_transformation import DataFrameTransformer from ..data import DataSplitter, DataSplitterFractional, InputOutputData -from ..tracking import TrackingMixin +from ..data_transformation import DataFrameTransformer +from ..tracking import TrackingMixin, TrackedExperiment from ..util.string import ToStringMixin from ..util.typing import PandasNamedTuple from ..vector_model import VectorClassificationModel, VectorModel, VectorModelBase, VectorModelFittableBase @@ -46,9 +47,7 @@ def computeMetrics(self, model, **kwargs) -> Optional[Dict[str, float]]: """ valuesDict = self._computeMetrics(model, **kwargs) if self.trackedExperiment is not None: - trackedDict = valuesDict.copy() - trackedDict["str(model)"] = str(model) - self.trackedExperiment.trackValues(trackedDict) + self.trackedExperiment.trackValues(valuesDict, addValuesDict={"str(model)": str(model)}) return valuesDict @@ -61,13 +60,13 @@ def _computeMetrics(self, model, **kwargs) -> Dict[str, float]: class VectorModelEvaluationData(ABC, Generic[TEvalStats]): - def __init__(self, statsDict: Dict[str, TEvalStats], inputData: pd.DataFrame, model: VectorModelBase): + def __init__(self, statsDict: Dict[str, TEvalStats], ioData: InputOutputData, model: VectorModelBase): """ :param statsDict: a 
dictionary mapping from output variable name to the evaluation statistics object - :param inputData: the input data that was used to produce the results + :param ioData: the input/output data that was used to produce the results :param model: the model that was used to produce predictions """ - self.inputData = inputData + self.ioData = ioData self.evalStatsByVarName = statsDict self.predictedVarNames = list(self.evalStatsByVarName.keys()) self.model = model @@ -76,6 +75,10 @@ def __init__(self, statsDict: Dict[str, TEvalStats], inputData: pd.DataFrame, mo def modelName(self): return self.model.getName() + @property + def inputData(self): # for backward compatibility + return self.ioData.inputs + def getEvalStats(self, predictedVarName=None) -> TEvalStats: if predictedVarName is None: if len(self.evalStatsByVarName) != 1: @@ -96,7 +99,7 @@ def getDataFrame(self): statsDicts = [] varNames = [] for predictedVarName, evalStats in self.evalStatsByVarName.items(): - statsDicts.append(evalStats.getAll()) + statsDicts.append(evalStats.metricsDict()) varNames.append(predictedVarName) df = pd.DataFrame(statsDicts, index=varNames) df.index.name = "predictedVar" @@ -174,16 +177,31 @@ def __init__(self, data: Optional[InputOutputData], testData: InputOutputData = self.trainingData = data self.testData = testData - def evalModel(self, model: VectorModelBase, onTrainingData=False) -> TEvalData: + def setTrackedExperiment(self, trackedExperiment: TrackedExperiment): + """ + Sets a tracked experiment which will result in metrics being saved whenever computeMetrics is called + or evalModel is called with track=True. + + :param trackedExperiment: the experiment in which to track evaluation metrics. + """ + super().setTrackedExperiment(trackedExperiment) + + def evalModel(self, model: VectorModelBase, onTrainingData=False, track=True) -> TEvalData: """ Evaluates the given model :param model: the model to evaluate :param onTrainingData: if True, evaluate on this evaluator's training data rather than the held-out test data + :param track: whether to track the evaluation metrics for the case where a tracked experiment was set on this object :return: the evaluation result """ data = self.trainingData if onTrainingData else self.testData - return self._evalModel(model, data) + result: VectorModelEvaluationData = self._evalModel(model, data) + if track and self.trackedExperiment is not None: + for predVarName in result.predictedVarNames: + addValuesDict = {"str(model)": str(model), "predVarName": predVarName} + self.trackedExperiment.trackValues(result.getEvalStats(predVarName).metricsDict(), addValuesDict=addValuesDict) + return result @abstractmethod def _evalModel(self, model: VectorModelBase, data: InputOutputData) -> TEvalData: @@ -194,8 +212,9 @@ def _computeMetrics(self, model: VectorModel, onTrainingData=False) -> Dict[str, def _computeMetricsForVarName(self, model, predictedVarName: Optional[str], onTrainingData=False): self.fitModel(model) - evalData = self.evalModel(model, onTrainingData=onTrainingData) - return evalData.getEvalStats(predictedVarName=predictedVarName).getAll() + track = False # avoid duplicate tracking (as this function is only called by computeMetrics, which already tracks) + evalData: VectorModelEvaluationData = self.evalModel(model, onTrainingData=onTrainingData, track=track) + return evalData.getEvalStats(predictedVarName=predictedVarName).metricsDict() def createMetricsDictProvider(self, predictedVarName: Optional[str]) -> MetricsDictProvider: """ @@ -216,7 +235,7 @@ def 
fitModel(self, model: VectorModelFittableBase): class VectorRegressionModelEvaluatorParams(VectorModelEvaluatorParams): def __init__(self, dataSplitter: DataSplitter = None, fractionalSplitTestFraction: float = None, fractionalSplitRandomSeed=42, - fractionalSplitShuffle=True, additionalMetrics: Sequence[RegressionMetric] = None, + fractionalSplitShuffle=True, metrics: Sequence[RegressionMetric] = None, additionalMetrics: Sequence[RegressionMetric] = None, outputDataFrameTransformer: DataFrameTransformer = None): """ :param dataSplitter: [if test data must be obtained via split] a splitter to use in order to obtain; if None, must specify @@ -225,12 +244,14 @@ def __init__(self, dataSplitter: DataSplitter = None, fractionalSplitTestFractio :param fractionalSplitRandomSeed: [if dataSplitter is none, test data must be obtained via split] the random seed to use for the fractional split of the data :param fractionalSplitShuffle: [if dataSplitter is None, test data must be obtained via split] whether to randomly (based on randomSeed) shuffle the dataset before splitting it + :param additionalMetrics: additional regression metrics to apply :param outputDataFrameTransformer: a data frame transformer to apply to all output data frames (both model outputs and ground truth), such that evaluation metrics are computed on the transformed data frame """ super().__init__(dataSplitter, fractionalSplitTestFraction=fractionalSplitTestFraction, fractionalSplitRandomSeed=fractionalSplitRandomSeed, fractionalSplitShuffle=fractionalSplitShuffle) + self.metrics = metrics self.additionalMetrics = additionalMetrics self.outputDataFrameTransformer = outputDataFrameTransformer @@ -240,10 +261,10 @@ def fromDictOrInstance(cls, params: Optional[Union[Dict[str, Any], "VectorRegres return VectorRegressionModelEvaluatorParams() elif type(params) == dict: return cls.fromOldKwArgs(**params) - elif isinstance(params, VectorRegressionModelEvaluatorParams): + elif isinstance(params, cls): return params else: - raise ValueError(f"Must provide dictionary or instance, got {params}") + raise ValueError(f"Must provide dictionary or {cls} instance, got {params}, type {type(params)}") @classmethod def fromOldKwArgs(cls, dataSplitter=None, testFraction=None, randomSeed=42, shuffle=True, additionalMetrics: Sequence[RegressionMetric] = None, @@ -283,9 +304,10 @@ def _evalModel(self, model: VectorModelBase, data: InputOutputData) -> VectorReg predictions, groundTruth = self._computeOutputs(model, data) for predictedVarName in predictions.columns: evalStats = RegressionEvalStats(y_predicted=predictions[predictedVarName], y_true=groundTruth[predictedVarName], + metrics=self.params.metrics, additionalMetrics=self.params.additionalMetrics) evalStatsByVarName[predictedVarName] = evalStats - return VectorRegressionModelEvaluationData(evalStatsByVarName, data.inputs, model) + return VectorRegressionModelEvaluationData(evalStatsByVarName, data, model) def computeTestDataOutputs(self, model: VectorModelBase) -> Tuple[pd.DataFrame, pd.DataFrame]: """ @@ -313,13 +335,22 @@ def _computeOutputs(self, model: VectorModelBase, inputOutputData: InputOutputDa class VectorClassificationModelEvaluationData(VectorModelEvaluationData[ClassificationEvalStats]): - pass + def getMisclassifiedInputsDataFrame(self) -> pd.DataFrame: + return self.inputData.iloc[self.getEvalStats().getMisclassifiedIndices()] + + def getMisclassifiedTriplesPredTrueInput(self) -> List[Tuple[Any, Any, pd.Series]]: + """ + :return: a list containing a triple (predicted class, true 
class, input series) for each misclassified data point + """ + evalStats = self.getEvalStats() + indices = evalStats.getMisclassifiedIndices() + return [(evalStats.y_predicted[i], evalStats.y_true[i], self.inputData.iloc[i]) for i in indices] class VectorClassificationModelEvaluatorParams(VectorModelEvaluatorParams): def __init__(self, dataSplitter: DataSplitter = None, fractionalSplitTestFraction: float = None, fractionalSplitRandomSeed=42, fractionalSplitShuffle=True, additionalMetrics: Sequence[ClassificationMetric] = None, - computeProbabilities: bool = False): + computeProbabilities: bool = False, binaryPositiveLabel=GUESS): """ :param dataSplitter: [if test data must be obtained via split] a splitter to use in order to obtain; if None, must specify fractionalSplitTestFraction for fractional split (default) @@ -329,11 +360,14 @@ def __init__(self, dataSplitter: DataSplitter = None, fractionalSplitTestFractio splitting it :param additionalMetrics: additional metrics to apply :param computeProbabilities: whether to compute class probabilities + :param binaryPositiveLabel: the positive class label for binary classification; if GUESS, try to detect it from the labels; + if None, no detection (non-binary classification) """ super().__init__(dataSplitter, fractionalSplitTestFraction=fractionalSplitTestFraction, fractionalSplitRandomSeed=fractionalSplitRandomSeed, fractionalSplitShuffle=fractionalSplitShuffle) self.additionalMetrics = additionalMetrics self.computeProbabilities = computeProbabilities + self.binaryPositiveLabel = binaryPositiveLabel @classmethod def fromOldKwArgs(cls, dataSplitter=None, testFraction=None, @@ -382,9 +416,9 @@ def _evalModel(self, model: VectorClassificationModel, data: InputOutputData) -> raise ValueError(f"Expected a classification model, got {model}") predictions, predictions_proba, groundTruth = self._computeOutputs(model, data) evalStats = ClassificationEvalStats(y_predictedClassProbabilities=predictions_proba, y_predicted=predictions, y_true=groundTruth, - labels=model.getClassLabels(), additionalMetrics=self.params.additionalMetrics) + labels=model.getClassLabels(), additionalMetrics=self.params.additionalMetrics, binaryPositiveLabel=self.params.binaryPositiveLabel) predictedVarName = model.getPredictedVariableNames()[0] - return VectorClassificationModelEvaluationData({predictedVarName: evalStats}, data.inputs, model) + return VectorClassificationModelEvaluationData({predictedVarName: evalStats}, data, model) def computeTestDataOutputs(self, model) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ @@ -417,16 +451,17 @@ class RuleBasedVectorClassificationModelEvaluator(VectorClassificationModelEvalu def __init__(self, data: InputOutputData): super().__init__(data, testData=data) - def evalModel(self, model: VectorModelBase, onTrainingData=False) -> VectorClassificationModelEvaluationData: + def evalModel(self, model: VectorModelBase, onTrainingData=False, track=True) -> VectorClassificationModelEvaluationData: """ Evaluate the rule based model. The training data and test data coincide, thus fitting the model will fit the model's preprocessors on the full data set and evaluating it will evaluate the model on the same data set. - :param model: + :param model: the model to evaluate :param onTrainingData: has to be False here. 
Setting to True is not supported and will lead to an exception - :return: + :param track: whether to track the evaluation metrics for the case where a tracked experiment was set on this object + :return: the evaluation result """ if onTrainingData: raise Exception("Evaluating rule based models on training data is not supported. In this evaluator" @@ -438,16 +473,17 @@ class RuleBasedVectorRegressionModelEvaluator(VectorRegressionModelEvaluator): def __init__(self, data: InputOutputData): super().__init__(data, testData=data) - def evalModel(self, model: VectorModelBase, onTrainingData=False) -> VectorRegressionModelEvaluationData: + def evalModel(self, model: VectorModelBase, onTrainingData=False, track=True) -> VectorRegressionModelEvaluationData: """ Evaluate the rule based model. The training data and test data coincide, thus fitting the model will fit the model's preprocessors on the full data set and evaluating it will evaluate the model on the same data set. - :param model: + :param model: the model to evaluate :param onTrainingData: has to be False here. Setting to True is not supported and will lead to an exception - :return: + :param track: whether to track the evaluation metrics for the case where a tracked experiment was set on this object + :return: the evaluation result """ if onTrainingData: raise Exception("Evaluating rule based models on training data is not supported. In this evaluator" diff --git a/src/sensai/evaluation/evaluator_clustering.py b/src/sensai/evaluation/evaluator_clustering.py index 704ceffe..b3b29525 100644 --- a/src/sensai/evaluation/evaluator_clustering.py +++ b/src/sensai/evaluation/evaluator_clustering.py @@ -21,7 +21,7 @@ def _computeMetrics(self, model: EuclideanClusterer, **kwargs) -> Dict[str, floa :return: """ evalStats = self.evalModel(model, **kwargs) - return evalStats.getAll() + return evalStats.metricsDict() @abstractmethod def evalModel(self, model: EuclideanClusterer, **kwargs) -> TClusteringEvalStats: diff --git a/src/sensai/feature_importance.py b/src/sensai/feature_importance.py new file mode 100644 index 00000000..4f277de6 --- /dev/null +++ b/src/sensai/feature_importance.py @@ -0,0 +1,228 @@ +import collections +import copy +import logging +import re +from abc import ABC, abstractmethod +from typing import Dict, Union, Sequence, List, Tuple + +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib import pyplot as plt +from sklearn.inspection import permutation_importance + +from .data import InputOutputData +from .evaluation.crossval import VectorModelCrossValidationData +from .util.deprecation import deprecated +from .util.plot import MATPLOTLIB_DEFAULT_FIGURE_SIZE +from .util.string import ToStringMixin +from .vector_model import VectorModel + +log = logging.getLogger(__name__) + + +class FeatureImportance: + def __init__(self, featureImportanceDict: Union[Dict[str, float], Dict[str, Dict[str, float]]]): + self.featureImportanceDict = featureImportanceDict + self._isMultiVar = self._isDict(next(iter(featureImportanceDict.values()))) + + @staticmethod + def _isDict(x): + return hasattr(x, "get") + + def getFeatureImportanceDict(self, predictedVarName=None) -> Dict[str, float]: + if self._isMultiVar: + self.featureImportanceDict: Dict[str, Dict[str, float]] + if predictedVarName is not None: + return self.featureImportanceDict[predictedVarName] + else: + if len(self.featureImportanceDict) > 1: + raise ValueError("Must provide predicted variable name (multiple output variables)") + else: + return 
next(iter(self.featureImportanceDict.values())) + else: + return self.featureImportanceDict + + def getSortedTuples(self, predictedVarName=None, reverse=False) -> List[Tuple[str, float]]: + """ + :param predictedVarName: the predicted variable name for which to retrieve the sorted feature importance values + :param reverse: whether to reverse the order (i.e. descending order of importance values, where the most important feature comes first, + rather than ascending order) + :return: a sorted list of tuples (feature name, feature importance) + """ + # noinspection PyTypeChecker + tuples: List[Tuple[str, float]] = list(self.getFeatureImportanceDict(predictedVarName).items()) + tuples.sort(key=lambda t: t[1], reverse=reverse) + return tuples + + def plot(self, predictedVarName=None) -> plt.Figure: + return plotFeatureImportance(self.getFeatureImportanceDict(predictedVarName=predictedVarName)) + + def getDataFrame(self, predictedVarName=None) -> pd.DataFrame: + """ + :param predictedVarName: the predicted variable name + :return: a data frame with two columns, "feature" and "importance" + """ + namesAndImportance = self.getSortedTuples(predictedVarName=predictedVarName, reverse=True) + return pd.DataFrame(namesAndImportance, columns=["feature", "importance"]) + + +class FeatureImportanceProvider(ABC): + """ + Interface for models that can provide feature importance values + """ + @abstractmethod + def getFeatureImportanceDict(self) -> Union[Dict[str, float], Dict[str, Dict[str, float]]]: + """ + Gets the feature importance values + + :return: either a dictionary mapping feature names to importance values or (for models predicting multiple + variables (independently)) a dictionary which maps predicted variable names to such dictionaries + """ + pass + + def getFeatureImportance(self) -> FeatureImportance: + return FeatureImportance(self.getFeatureImportanceDict()) + + @deprecated("Use getFeatureImportanceDict or the high-level interface getFeatureImportance instead.") + def getFeatureImportances(self) -> Union[Dict[str, float], Dict[str, Dict[str, float]]]: + return self.getFeatureImportanceDict() + + +def plotFeatureImportance(featureImportanceDict: Dict[str, float], subtitle: str = None) -> plt.Figure: + numFeatures = len(featureImportanceDict) + defaultWidth, defaultHeight = MATPLOTLIB_DEFAULT_FIGURE_SIZE + height = max(defaultHeight, defaultHeight * numFeatures / 20) + fig, ax = plt.subplots(figsize=(defaultWidth, height)) + sns.barplot(x=list(featureImportanceDict.values()), y=list(featureImportanceDict.keys()), ax=ax) + title = "Feature Importance" + if subtitle is not None: + title += "\n" + subtitle + plt.title(title) + plt.tight_layout() + return fig + + +class AggregatedFeatureImportance: + """ + Aggregates feature importance values (e.g. from models implementing FeatureImportanceProvider, such as sklearn's RandomForest + models and compatible models from lightgbm, etc.) + """ + def __init__(self, *items: Union[FeatureImportanceProvider, Dict[str, float], Dict[str, Dict[str, float]]], + featureAggRegEx: Sequence[str] = (), aggFn=np.mean): + r""" + :param items: (optional) initial list of feature importance providers or dictionaries to aggregate; further + values can be added via method add + :param featureAggRegEx: a sequence of regular expressions describing which feature names to sum as one. Each regex must + contain exactly one group. If a regex matches a feature name, the feature importance will be summed under the key + of the matched group instead of the full feature name. 
For example, the regex r"(\w+)_\d+$" will cause "foo_1" and "foo_2" + to be summed under "foo" and similarly "bar_1" and "bar_2" to be summed under "bar". + """ + self._aggDict = None + self._isNested = None + self._numDictsAdded = 0 + self._featureAggRegEx = [re.compile(p) for p in featureAggRegEx] + self._aggFn = aggFn + for item in items: + self.add(item) + + @staticmethod + def _isDict(x): + return hasattr(x, "get") + + def add(self, featureImportance: Union[FeatureImportanceProvider, Dict[str, float], Dict[str, Dict[str, float]]]): + """ + Adds the feature importance values from the given dictionary + + :param featureImportance: the dictionary obtained via a model's getFeatureImportanceDict method + """ + if isinstance(featureImportance, FeatureImportanceProvider): + featureImportance = featureImportance.getFeatureImportanceDict() + if self._isNested is None: + self._isNested = self._isDict(next(iter(featureImportance.values()))) + if self._isNested: + if self._aggDict is None: + self._aggDict = collections.defaultdict(lambda: collections.defaultdict(list)) + for targetName, d in featureImportance.items(): + d: dict + for featureName, value in d.items(): + self._aggDict[targetName][self._aggFeatureName(featureName)].append(value) + else: + if self._aggDict is None: + self._aggDict = collections.defaultdict(list) + for featureName, value in featureImportance.items(): + self._aggDict[self._aggFeatureName(featureName)].append(value) + self._numDictsAdded += 1 + + def _aggFeatureName(self, featureName: str): + for regex in self._featureAggRegEx: + m = regex.match(featureName) + if m is not None: + return m.group(1) + return featureName + + def getAggregatedFeatureImportanceDict(self) -> Union[Dict[str, float], Dict[str, Dict[str, float]]]: + def aggregate(d: dict): + return {k: self._aggFn(l) for k, l in d.items()} + + if self._isNested: + return {k: aggregate(d) for k, d in self._aggDict.items()} + else: + return aggregate(self._aggDict) + + def getAggregatedFeatureImportance(self) -> FeatureImportance: + return FeatureImportance(self.getAggregatedFeatureImportanceDict()) + + +def computePermutationFeatureImportanceDict(model, ioData: InputOutputData, scoring, numRepeats: int, randomState, + excludeInputPreprocessors=False): + if excludeInputPreprocessors: + inputs = model.computeModelInputs(ioData.inputs) + model = copy.copy(model) + model.removeInputPreprocessors() + else: + inputs = ioData.inputs + featureNames = inputs.columns + pi = permutation_importance(model, inputs, ioData.outputs, n_repeats=numRepeats, random_state=randomState, scoring=scoring) + importanceValues = pi.importances_mean + assert len(importanceValues) == len(featureNames) + featureImportanceDict = dict(zip(featureNames, importanceValues)) + return featureImportanceDict + + +class AggregatedPermutationFeatureImportance(ToStringMixin): + def __init__(self, aggregatedFeatureImportance: AggregatedFeatureImportance, scoring, numRepeats=5, randomSeed=42, + excludeModelInputPreprocessors=False): + """ + :param aggregatedFeatureImportance: the object in which to aggregate the feature importance (to which no feature importance + values should have yet been added) + :param scoring: the scoring method; see https://scikit-learn.org/stable/modules/model_evaluation.html; e.g. 
"r2" for regression or + "accuracy" for classification + :param numRepeats: the number of data permutations to apply for each model + :param randomSeed: the random seed for shuffling the data + :param excludeModelInputPreprocessors: whether to exclude model input preprocessors, such that the + feature importance will be reported on the transformed inputs that are actually fed to the model rather than the original + inputs. + Enabling this can, for example, help save time in cases where the input preprocessors discard many of the raw input + columns, but it may not be a good idea if the preprocessors generate multiple columns from the original input columns. + """ + self._agg = aggregatedFeatureImportance + self.scoring = scoring + self.numRepeats = numRepeats + self.randomSeed = randomSeed + self.excludeModelInputPreprocessors = excludeModelInputPreprocessors + + def add(self, model: VectorModel, ioData: InputOutputData): + featureImportanceDict = computePermutationFeatureImportanceDict(model, ioData, self.scoring, numRepeats=self.numRepeats, + randomState=self.randomSeed, excludeInputPreprocessors=self.excludeModelInputPreprocessors) + self._agg.add(featureImportanceDict) + + def addCrossValidationData(self, crossValData: VectorModelCrossValidationData): + if crossValData.trainedModels is None: + raise ValueError("No models in cross-validation data; enable model collection during cross-validation") + for i, (model, evalData) in enumerate(zip(crossValData.trainedModels, crossValData.evalDataList), start=1): + log.info(f"Computing permutation feature importance for model #{i}/{len(crossValData.trainedModels)}") + self.add(model, evalData.ioData) + + def getFeatureImportance(self) -> FeatureImportance: + return self._agg.getAggregatedFeatureImportance() diff --git a/src/sensai/feature_selection/rfe.py b/src/sensai/feature_selection/rfe.py new file mode 100644 index 00000000..9d1fcd20 --- /dev/null +++ b/src/sensai/feature_selection/rfe.py @@ -0,0 +1,137 @@ +import logging +from copy import copy +from dataclasses import dataclass +from typing import Union, List + +import matplotlib.pyplot as plt +import numpy as np + +from sensai import VectorModel, InputOutputData +from sensai.evaluation import VectorModelCrossValidatorParams, createVectorModelCrossValidator +from sensai.feature_importance import FeatureImportanceProvider, AggregatedFeatureImportance +from sensai.util.plot import ScatterPlot + +log = logging.getLogger(__name__) + + +class RecursiveFeatureEliminationCV: + """ + Recursive feature elimination, using cross-validation to select the best set of features: + In each step, the model is first evaluated using cross-validation. + Then the feature importance values are aggregated across the models that were trained during cross-validation, + and the least important feature is discarded. For the case where the lowest feature importance is 0, all + features with 0 importance are discarded. + This process is repeated until a point is reached where only `minFeatures` (or fewer) remain. + The selected set of features is the one from the step where cross-validation yielded the best evaluation metric value. + + Feature importance is computed at the level of model input features, i.e. after feature generation and transformation. + + NOTE: This implementation differs markedly from sklearn's RFECV, which performs an independent RFE for each fold. + RFECV determines the number of features to use by determining the elimination step in each fold that yielded the best + metric value on average. 
Because the eliminations are independent, the actual features that were being used in those steps + could have been completely different. Using the selected number of features n, RFECV then performs another RFE, eliminating features + until n features remain and returns these features as the result. + """ + def __init__(self, crossValidatorParams: VectorModelCrossValidatorParams, minFeatures=1): + """ + :param crossValidatorParams: the parameters for cross-validation + :param minFeatures: the minimum number of features to evaluate + """ + self.crossValidatorParams = crossValidatorParams + self.minFeatures = minFeatures + + @dataclass + class Step: + metricValue: float + features: List[str] + + class Result: + def __init__(self, steps: List["RecursiveFeatureEliminationCV.Step"], metricName: str, minimise: bool): + self.steps = steps + self.metricName = metricName + self.minimise = minimise + + def getSortedSteps(self) -> List["RecursiveFeatureEliminationCV.Step"]: + """ + :return: the elimination step results, sorted from best to worst + """ + return sorted(self.steps, key=lambda s: s.metricValue, reverse=not self.minimise) + + def getSelectedFeatures(self) -> List[str]: + return self.getSortedSteps()[0].features + + def getNumFeaturesArray(self) -> np.ndarray: + """ + :return: array containing the number of features that was considered in each step + """ + return np.array([len(s.features) for s in self.steps]) + + def getMetricValuesArray(self) -> np.ndarray: + """ + :return: array containing the metric value that resulted in each step + """ + return np.array([s.metricValue for s in self.steps]) + + def plotMetricValues(self) -> plt.Figure: + """ + Plots the metric values vs. the number of features for each step of the elimination + + :return: the figure + """ + return ScatterPlot(self.getNumFeaturesArray(), self.getMetricValuesArray(), c_opacity=1, x_label="number of features", + y_label=f"cross-validation mean metric value ({self.metricName})").fig + + def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: InputOutputData, metricName: str, minimise: bool) -> Result: + """ + Runs the optimisation for the given model and data. + + :param model: the model + :param ioData: the data + :param metricName: the metric to optimise + :param minimise: whether the metric shall be minimised; if False, maximise. 
+ :return: a result object, which provides access to the selected features and data on all elimination steps + """ + metricKey = f"mean[{metricName}]" + + model = copy(model) + model.fitInputOutputData(ioData, fitPreprocessors=True, fitModel=False) + inputs = model.computeModelInputs(ioData.inputs) + model.removeInputPreprocessors() + ioData = InputOutputData(inputs, ioData.outputs) + + features = list(inputs.columns) + steps = [] + while True: + # evaluate model + crossValidator = createVectorModelCrossValidator(ioData, model=model, params=self.crossValidatorParams) + crossValData = crossValidator.evalModel(model) + aggMetricsDict = crossValData.getEvalStatsCollection().aggMetricsDict() + metricValue = aggMetricsDict[metricKey] + + steps.append(self.Step(metricValue=metricValue, features=features)) + + # eliminate feature(s) + log.info(f"Model performance with {len(features)} features: {metricKey}={metricValue}") + aggImportance = AggregatedFeatureImportance(*crossValData.trainedModels) + fi = aggImportance.getAggregatedFeatureImportance() + tuples = fi.getSortedTuples() + minImportance = tuples[0][1] + if minImportance == 0: + eliminatedFeatures = [] + for i, (fname, importance) in enumerate(tuples): + if importance > 0: + break + eliminatedFeatures.append(fname) + log.info(f"Eliminating {len(eliminatedFeatures)} features with 0 importance: {eliminatedFeatures}") + else: + eliminatedFeatures = [tuples[0][0]] + log.info(f"Eliminating feature {eliminatedFeatures[0]}") + features = [f for f in features if f not in eliminatedFeatures] + ioData.inputs = ioData.inputs[features] + log.info(f"{len(features)} features remain") + + if len(features) < self.minFeatures: + log.info("Minimum number of features reached/exceeded") + break + + return self.Result(steps, metricName, minimise) \ No newline at end of file diff --git a/src/sensai/featuregen.py b/src/sensai/featuregen.py index 67d5c3da..48df2618 100644 --- a/src/sensai/featuregen.py +++ b/src/sensai/featuregen.py @@ -9,8 +9,10 @@ from . import util, data_transformation from .columngen import ColumnGenerator +from .data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer from .util import flattenArguments -from .util.string import orRegexGroup, ToStringMixin, dictString, listString +from .util.string import orRegexGroup, ToStringMixin, listString +from .util.typing import PandasNamedTuple if TYPE_CHECKING: from .vector_model import VectorModel @@ -31,7 +33,9 @@ def __init__(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] normalisationRules: Sequence[data_transformation.DFTNormalisation.Rule] = (), normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None, addCategoricalDefaultRules=True): """ - :param categoricalFeatureNames: either a sequence of column names or a regex that is to match all categorical feature names. + :param categoricalFeatureNames: either a sequence of column names or a regex that is to match all categorical feature names + (which must not only work for the feature generated by this feature generator, i.e. it should not match feature names generated + by other feature generators). It will be ensured that the respective columns in the generated data frames will have dtype 'category'. Furthermore, presence of meta-information can later be leveraged for further transformations, e.g. one-hot encoding. :param normalisationRules: Rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model). 
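As a minimal sketch of how the `categoricalFeatureNames` and `normalisationRuleTemplate` parameters documented above are meant to be used by a custom feature generator (the subclass and column names below are hypothetical, chosen purely for illustration):

```python
import pandas as pd

from sensai.data_transformation import DFTNormalisation
from sensai.featuregen import FeatureGenerator


class ColourAndSizeFeatureGenerator(FeatureGenerator):
    def __init__(self):
        # "colour" is declared as categorical; the rule template skips normalisation for the remaining columns
        super().__init__(categoricalFeatureNames=["colour"],
            normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True))

    def _fit(self, X: pd.DataFrame, Y: pd.DataFrame = None, ctx=None):
        pass  # nothing to learn from the data

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        # only "colour" is matched by categoricalFeatureNames; "size_squared" remains numeric
        return pd.DataFrame({"colour": df["colour"], "size_squared": df["size"] ** 2}, index=df.index)
```

The generated "colour" column would then be given dtype 'category' by the encapsulating call, while the rule template keeps "size_squared" from being normalised by a downstream DFTNormalisation.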
@@ -43,6 +47,11 @@ def __init__(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] If True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding one-hot encoded features (with "_" appended) will be added. """ + # NOTE: While it would be more elegant to not have all of the above constructor arguments and instead provide + # them later using "with*" methods, this would have the significant drawback that it would enable + # all such attributes to be provided in all subclasses, even in ones where we know settings exactly + # and can provide them directly in the subclass constructor implementation. Thus it would enable + # non-sensical settings which should be avoided. if len(normalisationRules) > 0 and normalisationRuleTemplate is not None: raise ValueError(f"normalisationRules should be empty when a normalisationRuleTemplate is provided") @@ -134,6 +143,9 @@ def getGeneratedColumnNames(self) -> Optional[List[str]]: """ return self._generatedColumnNames + def toDFT(self): + return DFTFromFeatureGenerator(self) + @abstractmethod def _fit(self, X: pd.DataFrame, Y: pd.DataFrame = None, ctx=None): """ @@ -218,9 +230,10 @@ def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: :param df: the input data frame for which to generate features :param ctx: a context object whose functionality may be required for feature generation; this is typically the model instance that this feature generator is to generate inputs for - :return: a data frame containing the generated features, which uses the same index as X (and Y). - The data frame's columns holding categorical columns are not required to have dtype 'category'; - this will be ensured by the encapsulating call. + :return: a data frame containing the generated features, which uses the same index as ``df``. + The data frame's columns holding categorical columns are not required to have dtype ``category``; + this will be ensured by the encapsulating call as long as the respective columns' names + were appropriately provided at construction. """ pass @@ -240,18 +253,52 @@ def fitGenerate(self, X: pd.DataFrame, Y: pd.DataFrame = None, ctx=None) -> pd.D def flattened(self, columnsToFlatten: List[str] = None, normalisationRules=(), - normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None) -> "ChainedFeatureGenerator": + normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None, + keepOtherColumns=True) -> "ChainedFeatureGenerator": """ Returns a new feature generator which returns flattened versions of one or more of the vector-valued columns generated - by this feature generator + by this feature generator. 
:param columnsToFlatten: the list of columns to flatten; if None, flatten all columns :param normalisationRules: a list of normalisation rules which apply to the flattened columns :param normalisationRuleTemplate: a normalisation rule template which applies to all generated flattened columns + :param keepOtherColumns: if True, any additional columns that are not to be flattened are to be retained + by the returned feature generator; if False, additional columns are to be discarded :return: a feature generator which generates the flattened columns """ return flattenedFeatureGenerator(self, columnsToFlatten=columnsToFlatten, normalisationRules=normalisationRules, - normalisationRuleTemplate=normalisationRuleTemplate) + keepOtherColumns=keepOtherColumns, normalisationRuleTemplate=normalisationRuleTemplate) + + def concat(self, *others: "FeatureGenerator") -> "MultiFeatureGenerator": + """ + Concatenates this feature generator with one or more other feature generator in order to produce a feature generator that + jointly generates all features + + :param others: other feature generators + :return: a :class:`MultiFeatureGenerator` + """ + if isinstance(self, MultiFeatureGenerator): + fgens = list(self.featureGenerators) + else: + fgens = [self] + fgens.extend(others) + return MultiFeatureGenerator(fgens) + + def chain(self, *others: "FeatureGenerator") -> "ChainedFeatureGenerator": + """ + Chains this feature generator with one or more other feature generators such that each feature generator + receives as input the output of the preceding feature generator. The resulting feature generator + produces the features of the last element in the chain. + + :param others: other feature generator + :return: a :class:`ChainedFeatureGenerator` + """ + if isinstance(self, ChainedFeatureGenerator): + fgens = self.featureGenerators + else: + fgens = [self] + fgens.extend(others) + return ChainedFeatureGenerator(fgens) class RuleBasedFeatureGenerator(FeatureGenerator, ABC): @@ -297,7 +344,10 @@ def _generateFromMultiple(self, generateFeatures: Callable[[FeatureGenerator], p if len(dfs) == 0: return pd.DataFrame(index=index) else: - return pd.concat(dfs, axis=1) + combinedDF = pd.concat(dfs, axis=1) + if len(combinedDF.columns) != len(set(combinedDF.columns)): + raise Exception(f"At least one column was generated more than once: {list(combinedDF.columns)}; check feature generators for correctness!") + return combinedDF def _generate(self, inputDF: pd.DataFrame, ctx=None): def generateFeatures(fg: FeatureGenerator): @@ -342,6 +392,7 @@ def __init__(self, cache: util.cache.PersistentKeyValueCache = None, categorical def _generate(self, df: pd.DataFrame, ctx=None): dicts = [] for idx, nt in enumerate(df.itertuples()): + nt: PandasNamedTuple if idx % 100 == 0: log.debug(f"Generating feature via {self.__class__.__name__} for index {idx}") value = None @@ -366,31 +417,47 @@ def _generateFeatureDict(self, namedTuple) -> Dict[str, Any]: class FeatureGeneratorTakeColumns(RuleBasedFeatureGenerator): - def __init__(self, columns: Union[str, List[str]] = None, exceptColumns: Sequence[str] = (), categoricalFeatureNames: Sequence[str] = (), - normalisationRules: Sequence[data_transformation.DFTNormalisation.Rule] = (), - normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None): + def __init__(self, columns: Union[str, List[str]] = None, exceptColumns: Sequence[str] = (), + categoricalFeatureNames: Optional[Union[Sequence[str], str]] = (), + normalisationRules: 
Sequence[data_transformation.DFTNormalisation.Rule] = (), + normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None, + verifyColumnNames=True): """ - :param columns: name of the column or list of names of columns to be taken. If None, all columns will be taken. :param exceptColumns: list of names of columns to not take if present in the input df - :param categoricalFeatureNames: - :param normalisationRules: - :param normalisationRuleTemplate: + :param categoricalFeatureNames: either a sequence of column names or a regex that is to match all categorical feature names + (which must not only work for the feature generated by this feature generator, i.e. it should not match feature names generated + by other feature generators). + It will be ensured that the respective columns in the generated data frames will have dtype 'category'. + Furthermore, presence of meta-information can later be leveraged for further transformations, e.g. one-hot encoding. + :param normalisationRules: Rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model). + These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used + within a data processing pipeline. They do not affect feature generation. + :param normalisationRuleTemplate: This parameter can be supplied instead of normalisationRules for the case where + there shall be a single rule that applies to all columns generated by this feature generator that were not labeled as categorical. + :param verifyColumnNames: if True and columns to take were specified, will raise an error in case said columns + are missing during feature generation. If False, will log on info level instead """ super().__init__(categoricalFeatureNames=categoricalFeatureNames, normalisationRules=normalisationRules, normalisationRuleTemplate=normalisationRuleTemplate) if isinstance(columns, str): columns = [columns] self.columns = columns self.exceptColumns = exceptColumns + self.verifyColumnNames = verifyColumnNames def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: columnsToTake = self.columns if self.columns is not None else df.columns columnsToTake = [col for col in columnsToTake if col not in self.exceptColumns] - missingCols = set(columnsToTake).difference(df.columns) - if len(missingCols) > 0: - raise Exception(f"Columns {missingCols} not present in data frame; available columns: {list(df.columns)}") - + if self.columns is not None: + missingCols = set(columnsToTake).difference(df.columns) + if len(missingCols) > 0: + missing_cols_notification = f"Columns {missingCols} were specified but are not present in data frame. 
" \ + f"verifyColumnNames was set to {self.verifyColumnNames}; " \ + f"available columns: {list(df.columns)}" + if self.verifyColumnNames: + raise RuntimeError(missing_cols_notification) + log.info(missing_cols_notification) return df[columnsToTake] def info(self): @@ -734,10 +801,15 @@ def __init__(self, *featureGeneratorsOrNames: Union[str, FeatureGenerator], regi def getMultiFeatureGenerator(self) -> MultiFeatureGenerator: return self._multiFeatureGenerator - def getNormalizationRules(self, includeGeneratedCategoricalRules=True): + def getNormalisationRules(self, includeGeneratedCategoricalRules=True): return self.getMultiFeatureGenerator().getNormalisationRules( - includeGeneratedCategoricalRules=includeGeneratedCategoricalRules - ) + includeGeneratedCategoricalRules=includeGeneratedCategoricalRules) + + def getNormalizationRules(self, includeGeneratedCategoricalRules=True): # for backward compatibility + return self.getNormalisationRules(includeGeneratedCategoricalRules=includeGeneratedCategoricalRules) + + def getCategoricalFeatureNameRegex(self) -> str: + return self.getMultiFeatureGenerator().getCategoricalFeatureNameRegex() def _createMultiFeatureGenerator(self): featureGenerators = [] @@ -802,15 +874,103 @@ def info(self): return info -def flattenedFeatureGenerator(fgen: FeatureGenerator, columnsToFlatten: List[str] = None, - normalisationRules=(), normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None): +class FeatureGeneratorMapColumn(RuleBasedFeatureGenerator, ABC): """ - Return a flattening version of the input feature generator, leaving additional columns (if any) that are not to be flattened - but are also generated by the input feature generator untouched. + Creates a single feature from a single input column by applying a function to each element of the input column + """ + def __init__(self, inputColName: str, featureColName: str, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, + normalisationRules: Sequence[data_transformation.DFTNormalisation.Rule] = (), + normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None, addCategoricalDefaultRules=True): + super().__init__(categoricalFeatureNames=categoricalFeatureNames, normalisationRules=normalisationRules, + normalisationRuleTemplate=normalisationRuleTemplate, addCategoricalDefaultRules=addCategoricalDefaultRules) + self._inputColName = inputColName + self._featureColName = featureColName + + def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: + if self._inputColName not in df.columns: + raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: {list(df.columns)}") + inputSeries = df[self._inputColName] + values = inputSeries.apply(self._createValue) + return pd.DataFrame({self._featureColName: values}, index=df.index) + + @abstractmethod + def _createValue(self, value): + """ + Maps a value from the input column to a feature value + + :param value: a value from the input column + :return: the feature value + """ + pass + + +class FeatureGeneratorMapColumnDict(RuleBasedFeatureGenerator, ABC): + """ + Creates an arbitrary number of features from a single input column by applying a function to each element of the input column + """ + def __init__(self, inputColName: str, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, + normalisationRules: Sequence[data_transformation.DFTNormalisation.Rule] = (), + normalisationRuleTemplate: 
data_transformation.DFTNormalisation.RuleTemplate = None, addCategoricalDefaultRules=True): + super().__init__(categoricalFeatureNames=categoricalFeatureNames, normalisationRules=normalisationRules, + normalisationRuleTemplate=normalisationRuleTemplate, addCategoricalDefaultRules=addCategoricalDefaultRules) + self._inputColName = inputColName + + def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: + if self._inputColName not in df.columns: + raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: {list(df.columns)}") + inputSeries = df[self._inputColName] + values = [self._createFeaturesDict(v) for v in inputSeries] + return pd.DataFrame(values, index=df.index) + + @abstractmethod + def _createFeaturesDict(self, value) -> Dict[str, Any]: + """ + Maps a value from the input column to a dictionary containing one or more features. + + :param value: a value from the input column + :return: a dictionary mapping feature names to values + """ + pass + + +class FeatureGeneratorNAMarker(RuleBasedFeatureGenerator): + """ + Creates features indicating whether another feature is N/A (not available). + It can be practical to use this feature generator in conjunction with DFTFillNA for models that cannot handle missing values. + """ + def __init__(self, columns: List[str], valueA=0, valueNA=1): + """ + Note: When changing the default values used, use only values that are considered to be normalised when using this + feature generation in a context where DFTNormalisation is used (no normalisation is applied to features generated + by this feature generator). + + :param columns: the columns for which to generate + :param valueA: the feature value if the input feature is available + :param valueNA: the feature value if the input feature is not available + """ + super().__init__(normalisationRuleTemplate=DFTNormalisation.RuleTemplate(skip=True)) + self.columns = columns + self.valueA = valueA + self.valueNA = valueNA + + def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: + newCols = {} + valueMap = {True: self.valueNA, False: self.valueA} + for col in self.columns: + newCols[f"{col}_na"] = [valueMap[isNA] for isNA in df[col].isna()] + return pd.DataFrame(newCols, index=df.index) + + +def flattenedFeatureGenerator(fgen: FeatureGenerator, columnsToFlatten: List[str] = None, keepOtherColumns=True, + normalisationRules: Sequence[DFTNormalisation.Rule] = (), + normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None): + """ + Return a flattening version of the input feature generator. :param fgen: the feature generator which generates columns that are to be flattened - :param columnsToFlatten: list of names of output columns to be flattened. - If None, all output columns will be flattened. 
+ :param columnsToFlatten: list of names of output columns to be flattened; if None, flatten all columns + :param keepOtherColumns: whether any additional columns that are not to be flattened are to be retained + by the returned feature generator :param normalisationRules: additional normalisation rules for the flattened output columns :param normalisationRuleTemplate: This parameter can be supplied instead of normalisationRules for the case where there shall be a single rule that applies to all flattened output columns @@ -830,8 +990,24 @@ def flattenedFeatureGenerator(fgen: FeatureGenerator, columnsToFlatten: List[str """ flatteningGenerator = FeatureGeneratorFlattenColumns(columns=columnsToFlatten, normalisationRules=normalisationRules, normalisationRuleTemplate=normalisationRuleTemplate) - if columnsToFlatten is None: + if columnsToFlatten is None or not keepOtherColumns: return ChainedFeatureGenerator(fgen, flatteningGenerator) else: return ChainedFeatureGenerator(fgen, MultiFeatureGenerator(flatteningGenerator, FeatureGeneratorTakeColumns(exceptColumns=columnsToFlatten))) + + +class FeatureGeneratorFromDFT(FeatureGenerator): + def __init__(self, dft: DataFrameTransformer, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, + normalisationRules: Sequence[data_transformation.DFTNormalisation.Rule] = (), + normalisationRuleTemplate: data_transformation.DFTNormalisation.RuleTemplate = None, + addCategoricalDefaultRules=True): + super().__init__(categoricalFeatureNames=categoricalFeatureNames, normalisationRules=normalisationRules, + normalisationRuleTemplate=normalisationRuleTemplate, addCategoricalDefaultRules=addCategoricalDefaultRules) + self.dft = dft + + def _fit(self, X: pd.DataFrame, Y: pd.DataFrame = None, ctx=None): + self.dft.fit(X) + + def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame: + return self.dft.apply(df) diff --git a/src/sensai/geoanalytics/__init__.py b/src/sensai/geoanalytics/__init__.py index ee3ddaa4..e69de29b 100644 --- a/src/sensai/geoanalytics/__init__.py +++ b/src/sensai/geoanalytics/__init__.py @@ -1 +0,0 @@ -from . import coordinates, coordinate_clustering, coordinate_clustering_ground_truth, geometry \ No newline at end of file diff --git a/src/sensai/geoanalytics/_globalmaptiles.py b/src/sensai/geoanalytics/_globalmaptiles.py new file mode 100644 index 00000000..da3d20bb --- /dev/null +++ b/src/sensai/geoanalytics/_globalmaptiles.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python +############################################################################### +# $Id$ +# +# Project: GDAL2Tiles, Google Summer of Code 2007 & 2008 +# Global Map Tiles Classes +# Purpose: Convert a raster into TMS tiles, create KML SuperOverlay EPSG:4326, +# generate a simple HTML viewers based on Google Maps and OpenLayers +# Author: Klokan Petr Pridal, klokan at klokan dot cz +# Web: http://www.klokan.cz/projects/gdal2tiles/ +# +############################################################################### +# Copyright (c) 2008 Klokan Petr Pridal. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +############################################################################### + +""" +_globalmaptiles.py + +Global Map Tiles as defined in MapTile Map Service (TMS) Profiles +============================================================== + +Functions necessary for generation of global tiles used on the web. +It contains classes implementing coordinate conversions for: + + - GlobalMercator (based on EPSG:900913 = EPSG:3785) + for Google Maps, Yahoo Maps, Microsoft Maps compatible tiles + - GlobalGeodetic (based on EPSG:4326) + for OpenLayers Base Map and Google Earth compatible tiles + +More info at: + +http://wiki.osgeo.org/wiki/Tile_Map_Service_Specification +http://wiki.osgeo.org/wiki/WMS_Tiling_Client_Recommendation +http://msdn.microsoft.com/en-us/library/bb259689.aspx +http://code.google.com/apis/maps/documentation/overlays.html#Google_Maps_Coordinates + +Created by Klokan Petr Pridal on 2008-07-03. +Google Summer of Code 2008, project GDAL2Tiles for OSGEO. + +In case you use this class in your product, translate it to another language +or find it usefull for your project please let me know. +My email: klokan at klokan dot cz. +I would like to know where it was used. + +Class is available under the open-source GDAL license (www.gdal.org). +""" + +import math +from typing import Tuple + + +class GlobalMercator(object): + """ + TMS Global Mercator Profile + --------------------------- + + Functions necessary for generation of tiles in Spherical Mercator projection, + EPSG:900913 (EPSG:gOOglE, Google Maps Global Mercator), EPSG:3785, OSGEO:41001. + + Such tiles are compatible with Google Maps, Microsoft Virtual Earth, Yahoo Maps, + UK Ordnance Survey OpenSpace API, ... + and you can overlay them on top of base maps of those web mapping applications. + + Pixel and tile coordinates are in TMS notation (origin [0,0] in bottom-left). + + What coordinate conversions do we need for TMS Global Mercator tiles:: + + LatLon <-> Meters <-> Pixels <-> MapTile + + WGS84 coordinates Spherical Mercator Pixels in pyramid Tiles in pyramid + lat/lon XY in metres XY pixels Z zoom XYZ from TMS + EPSG:4326 EPSG:900913 + .----. --------- -- TMS + / \ <-> | | <-> /----/ <-> Google + \ / | | /--------/ QuadTree + ----- --------- /------------/ + KML, public WebMapService Web Clients TileMapService + + What is the coordinate extent of Earth in EPSG:900913? 
+ + [-20037508.342789244, -20037508.342789244, 20037508.342789244, 20037508.342789244] + Constant 20037508.342789244 comes from the circumference of the Earth in meters, + which is 40 thousand kilometers, the coordinate origin is in the middle of extent. + In fact you can calculate the constant as: 2 * math.pi * 6378137 / 2.0 + $ echo 180 85 | gdaltransform -s_srs EPSG:4326 -t_srs EPSG:900913 + Polar areas with abs(latitude) bigger then 85.05112878 are clipped off. + + What are zoom level constants (pixels/meter) for pyramid with EPSG:900913? + + whole region is on top of pyramid (zoom=0) covered by 256x256 pixels tile, + every lower zoom level resolution is always divided by two + initialResolution = 20037508.342789244 * 2 / 256 = 156543.03392804062 + + What is the difference between TMS and Google Maps/QuadTree tile name convention? + + The tile raster itself is the same (equal extent, projection, pixel size), + there is just different identification of the same raster tile. + Tiles in TMS are counted from [0,0] in the bottom-left corner, id is XYZ. + Google placed the origin [0,0] to the top-left corner, reference is XYZ. + Microsoft is referencing tiles by a QuadTree name, defined on the website: + http://msdn2.microsoft.com/en-us/library/bb259689.aspx + + The lat/lon coordinates are using WGS84 datum, yeh? + + Yes, all lat/lon we are mentioning should use WGS84 Geodetic Datum. + Well, the web clients like Google Maps are projecting those coordinates by + Spherical Mercator, so in fact lat/lon coordinates on sphere are treated as if + the were on the WGS84 ellipsoid. + + From MSDN documentation: + To simplify the calculations, we use the spherical form of projection, not + the ellipsoidal form. Since the projection is used only for map display, + and not for displaying numeric coordinates, we don't need the extra precision + of an ellipsoidal projection. The spherical projection causes approximately + 0.33 percent scale distortion in the Y direction, which is not visually noticable. + + How do I create a raster in EPSG:900913 and convert coordinates with PROJ.4? + + You can use standard GIS tools like gdalwarp, cs2cs or gdaltransform. + All of the tools supports -t_srs 'epsg:900913'. + + For other GIS programs check the exact definition of the projection: + More info at http://spatialreference.org/ref/user/google-projection/ + The same projection is degined as EPSG:3785. WKT definition is in the official + EPSG database. 
+ + Proj4 Text: + +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 + +k=1.0 +units=m +nadgrids=@null +no_defs + + Human readable WKT format of EPGS:900913: + PROJCS["Google Maps Global Mercator", + GEOGCS["WGS 84", + DATUM["WGS_1984", + SPHEROID["WGS 84",6378137,298.2572235630016, + AUTHORITY["EPSG","7030"]], + AUTHORITY["EPSG","6326"]], + PRIMEM["Greenwich",0], + UNIT["degree",0.0174532925199433], + AUTHORITY["EPSG","4326"]], + PROJECTION["Mercator_1SP"], + PARAMETER["central_meridian",0], + PARAMETER["scale_factor",1], + PARAMETER["false_easting",0], + PARAMETER["false_northing",0], + UNIT["metre",1, + AUTHORITY["EPSG","9001"]]] + """ + + def __init__(self, tileSize=256): + """ + Initialize the TMS Global Mercator pyramid + + :param tileSize: the tile size in pixels + """ + self.tileSize = tileSize + self.initialResolution = 2 * math.pi * 6378137 / self.tileSize + # 156543.03392804062 for tileSize 256 pixels + self.originShift = 2 * math.pi * 6378137 / 2.0 + # 20037508.342789244 + + def LatLonToMeters(self, lat: float, lon: float): + """"Converts given lat/lon in WGS84 Datum to XY in Spherical Mercator EPSG:900913""" + + mx = lon * self.originShift / 180.0 + my = math.log(math.tan((90 + lat) * math.pi / 360.0)) / (math.pi / 180.0) + + my = my * self.originShift / 180.0 + return mx, my + + def MetersToLatLon(self, mx, my): + """Converts XY point from Spherical Mercator EPSG:900913 to lat/lon in WGS84 Datum""" + + lon = (mx / self.originShift) * 180.0 + lat = (my / self.originShift) * 180.0 + + lat = 180 / math.pi * (2 * math.atan(math.exp(lat * math.pi / 180.0)) - math.pi / 2.0) + return lat, lon + + def PixelsToMeters(self, px, py, zoom): + """Converts pixel coordinates in given zoom level of pyramid to EPSG:900913""" + + res = self.Resolution(zoom) + mx = px * res - self.originShift + my = py * res - self.originShift + return mx, my + + def MetersToPixels(self, mx, my, zoom): + """Converts EPSG:900913 to pyramid pixel coordinates in given zoom level""" + + res = self.Resolution(zoom) + px = (mx + self.originShift) / res + py = (my + self.originShift) / res + return px, py + + def PixelsToTile(self, px, py): + """Returns a tile covering region in given pixel coordinates""" + + tx = int(math.ceil(px / float(self.tileSize)) - 1) + ty = int(math.ceil(py / float(self.tileSize)) - 1) + return tx, ty + + def PixelsToRaster(self, px, py, zoom): + """Move the origin of pixel coordinates to top-left corner""" + + mapSize = self.tileSize << zoom + return px, mapSize - py + + def MetersToTile(self, mx, my, zoom): + """Returns tile for given mercator coordinates""" + + px, py = self.MetersToPixels(mx, my, zoom) + return self.PixelsToTile(px, py) + + def LatLonToTile(self, lat, lon, zoom) -> Tuple[int, int]: + return self.MetersToTile(*self.LatLonToMeters(lat, lon), zoom) + + def TileBounds(self, tx, ty, zoom): + """Returns bounds of the given tile in EPSG:900913 coordinates""" + + minx, miny = self.PixelsToMeters(tx*self.tileSize, ty*self.tileSize, zoom) + maxx, maxy = self.PixelsToMeters((tx+1)*self.tileSize, (ty+1)*self.tileSize, zoom) + return minx, miny, maxx, maxy + + def TileLatLonBounds(self, tx, ty, zoom): + """Returns bounds of the given tile in latutude/longitude using WGS84 datum""" + + bounds = self.TileBounds(tx, ty, zoom) + minLat, minLon = self.MetersToLatLon(bounds[0], bounds[1]) + maxLat, maxLon = self.MetersToLatLon(bounds[2], bounds[3]) + + return minLat, minLon, maxLat, maxLon + + def Resolution(self, zoom): + """Resolution (meters/pixel) for given zoom 
level (measured at Equator)""" + + # return (2 * math.pi * 6378137) / (self.tileSize * 2**zoom) + return self.initialResolution / (2**zoom) + + def ZoomForPixelSize(self, pixelSize): + """Maximal scaledown zoom of the pyramid closest to the pixelSize.""" + + for i in range(30): + if pixelSize > self.Resolution(i): + return i-1 if i != 0 else 0 # We don't want to scale up + + @staticmethod + def GoogleTile(tx, ty, zoom): + """Converts TMS tile coordinates to Google MapTile coordinates""" + + # coordinate origin is moved from bottom-left to top-left corner of the extent + return tx, (2**zoom - 1) - ty + + @staticmethod + def QuadTree(tx, ty, zoom): + """Converts TMS tile coordinates to Microsoft QuadTree""" + + quadKey = "" + ty = (2**zoom - 1) - ty + for i in range(zoom, 0, -1): + digit = 0 + mask = 1 << (i-1) + if (tx & mask) != 0: + digit += 1 + if (ty & mask) != 0: + digit += 2 + quadKey += str(digit) + + return quadKey + + +class GlobalGeodetic(object): + """ + TMS Global Geodetic Profile + --------------------------- + + Functions necessary for generation of global tiles in Plate Carre projection, + EPSG:4326, "unprojected profile". + + Such tiles are compatible with Google Earth (as any other EPSG:4326 rasters) + and you can overlay the tiles on top of OpenLayers base map. + + Pixel and tile coordinates are in TMS notation (origin [0,0] in bottom-left). + + What coordinate conversions do we need for TMS Global Geodetic tiles? + + Global Geodetic tiles are using geodetic coordinates (latitude,longitude) + directly as planar coordinates XY (it is also called Unprojected or Plate + Carre). We need only scaling to pixel pyramid and cutting to tiles. + Pyramid has on top level two tiles, so it is not square but rectangle. + Area [-180,-90,180,90] is scaled to 512x256 pixels. + TMS has coordinate origin (for pixels and tiles) in bottom-left corner. + Rasters are in EPSG:4326 and therefore are compatible with Google Earth. + + LatLon <-> Pixels <-> Tiles + + WGS84 coordinates Pixels in pyramid Tiles in pyramid + lat/lon XY pixels Z zoom XYZ from TMS + EPSG:4326 + .----. 
---- + / \ <-> /--------/ <-> TMS + \ / /--------------/ + ----- /--------------------/ + WMS, KML Web Clients, Google Earth TileMapService + """ + + def __init__(self, tileSize=256): + self.tileSize = tileSize + + @staticmethod + def LatLonToPixels(lat, lon, zoom): + """Converts lat/lon to pixel coordinates in given zoom of the EPSG:4326 pyramid""" + + res = 180 / 256.0 / 2**zoom + px = (180 + lat) / res + py = (90 + lon) / res + return px, py + + def PixelsToTile(self, px, py): + """Returns coordinates of the tile covering region in pixel coordinates""" + + tx = int(math.ceil(px / float(self.tileSize)) - 1) + ty = int(math.ceil(py / float(self.tileSize)) - 1) + return tx, ty + + @staticmethod + def Resolution(zoom): + """Resolution (arc/pixel) for given zoom level (measured at Equator)""" + + return 180 / 256.0 / 2**zoom + #return 180 / float(1 << (8+zoom)) + + @staticmethod + def TileBounds(tx, ty, zoom): + """Returns bounds of the given tile""" + res = 180 / 256.0 / 2**zoom + return ( + tx*256*res - 180, + ty*256*res - 90, + (tx+1)*256*res - 180, + (ty+1)*256*res - 90) diff --git a/src/sensai/geoanalytics/geo_clustering.py b/src/sensai/geoanalytics/geo_clustering.py new file mode 100644 index 00000000..1094a315 --- /dev/null +++ b/src/sensai/geoanalytics/geo_clustering.py @@ -0,0 +1,151 @@ +import collections +import itertools +import math +from abc import abstractmethod, ABC +from typing import List, Tuple + +import numpy as np +import sklearn.cluster + +from .geo_coords import GeoCoord +from .local_coords import LocalCoordinateSystem +from ..clustering import GreedyAgglomerativeClustering + + +class GeoCoordClusterer(ABC): + @abstractmethod + def fitGeoCoords(self, geoCoords: List[GeoCoord]): + """ + :param geoCoords: the coordinates to be clustered + """ + pass + + @abstractmethod + def clustersIndices(self) -> Tuple[List[List[int]], List[int]]: + """ + :return: a tuple (clusters, outliers), where clusters is a dictionary mapping from cluster index to + the list of original point indices within the cluster and outliers is the list of indices of points not within + clusters + """ + pass + + +class GreedyAgglomerativeGeoCoordClusterer(GeoCoordClusterer): + def __init__(self, maxMinDistanceForMergeM: float, maxDistanceM: float, minClusterSize: int, lcs: LocalCoordinateSystem = None): + """ + :param maxMinDistanceForMergeM: the maximum distance, in metres, for the minimum distance between two existing clusters for a merge + to be admissible + :param maxDistanceM: the maximum distance, in metres, between any two points for the points to be allowed to be in the same cluster + :param minClusterSize: the minimum number of points any valid cluster must ultimately contain; the points in any smaller clusters + shall be considered as outliers + :param lcs: the local coordinate system to use for clustering; if None, compute based on mean coordinates passed when fitting + """ + self.lcs = lcs + self.minClusterSize = minClusterSize + self.maxMinDistanceForMerge = maxMinDistanceForMergeM + self.squaredMaxMinDistanceForMerge = maxMinDistanceForMergeM * maxMinDistanceForMergeM + self.squaredMaxDistance = maxDistanceM * maxDistanceM + self.localPoints = None + + class LocalPoint: + def __init__(self, xy: np.ndarray, idx: int): + self.idx = idx + self.xy = xy + + class Cluster(GreedyAgglomerativeClustering.Cluster): + def __init__(self, point: "GreedyAgglomerativeGeoCoordClusterer.LocalPoint", clusterer: 'GreedyAgglomerativeGeoCoordClusterer'): + self.clusterer = clusterer + self.points = [point] + + 
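# Merge cost semantics: the cost is the minimum squared distance between any point of this cluster and any
+ # point of the other cluster; math.inf is returned to mark a merge as inadmissible (some pair of points
+ # exceeds maxDistanceM, or the closest pair exceeds maxMinDistanceForMergeM).
+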
def mergeCost(self, other): + cartesianProduct = itertools.product(self.points, other.points) + minSquaredDistance = math.inf + for p1, p2 in cartesianProduct: + diff = p1.xy - p2.xy + squaredDistance = np.dot(diff, diff) + if squaredDistance > self.clusterer.squaredMaxDistance: + return math.inf + else: + minSquaredDistance = min(squaredDistance, minSquaredDistance) + if minSquaredDistance <= self.clusterer.squaredMaxMinDistanceForMerge: + return minSquaredDistance + return math.inf + + def merge(self, other): + self.points += other.points + + def fitGeoCoords(self, geoCoords: List[GeoCoord]) -> None: + if self.lcs is None: + meanCoord = GeoCoord.meanCoord(geoCoords) + self.lcs = LocalCoordinateSystem(meanCoord.lat, meanCoord.lon) + self.localPoints = [self.LocalPoint(np.array(self.lcs.getLocalCoords(p.lat, p.lon)), idx) for idx, p in enumerate(geoCoords)] + clusters = [self.Cluster(lp, self) for lp in self.localPoints] + clusters = GreedyAgglomerativeClustering(clusters).applyClustering() + self.clusters = clusters + + def clustersIndices(self) -> Tuple[List[List[int]], List[int]]: + outliers = [] + clusters = [] + for c in self.clusters: + indices = [p.idx for p in c.points] + if len(c.points) < self.minClusterSize: + outliers.extend(indices) + else: + clusters.append(indices) + return clusters, outliers + + +class SkLearnGeoCoordClusterer(GeoCoordClusterer): + def __init__(self, clusterer, lcs: LocalCoordinateSystem = None): + """ + :param clusterer: a clusterer from sklearn.cluster + :param lcs: the local coordinate system to use for Euclidian conversion; if None, determine from data (using mean coordinate as centre) + """ + self.lcs = lcs + self.clusterer = clusterer + self.localPoints = None + + def fitGeoCoords(self, geoCoords: List[GeoCoord]): + if self.lcs is None: + meanCoord = GeoCoord.meanCoord(geoCoords) + self.lcs = LocalCoordinateSystem(meanCoord.lat, meanCoord.lon) + self.localPoints = [self.lcs.getLocalCoords(p.lat, p.lon) for p in geoCoords] + self.clusterer.fit(self.localPoints) + + def _clusters(self, mode): + clusters = collections.defaultdict(list) + outliers = [] + for idxPoint, idxCluster in enumerate(self.clusterer.labels_): + if mode == "localPoints": + item = self.localPoints[idxPoint] + elif mode == "indices": + item = idxPoint + else: + raise ValueError() + if idxCluster >= 0: + clusters[idxCluster].append(item) + else: + outliers.append(item) + return list(clusters.values()), outliers + + def clustersLocalPoints(self) -> Tuple[List[List[Tuple[float, float]]], List[Tuple[float, float]]]: + """ + :return: a tuple (clusters, outliers), where clusters is a dictionary mapping from cluster index to + the list of local points within the cluster and outliers is a list of local points not within + clusters + """ + return self._clusters("localPoints") + + def clustersIndices(self) -> Tuple[List[List[int]], List[int]]: + return self._clusters("indices") + + +class DBSCANGeoCoordClusterer(SkLearnGeoCoordClusterer): + def __init__(self, eps, min_samples, lcs: LocalCoordinateSystem = None, **kwargs): + """ + :param eps: the maximum distance between two samples for one to be considered as in the neighbourhood of the other + :param min_samples: the minimum number of samples that must be within a neighbourhood for a cluster to be formed + :param lcs: the local coordinate system for conversion to a Euclidian space + :param kwargs: additional arguments to pass to DBSCAN (see https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) + """ + 
super().__init__(sklearn.cluster.DBSCAN(eps=eps, min_samples=min_samples, **kwargs), lcs) \ No newline at end of file diff --git a/src/sensai/geoanalytics/geo_coords.py b/src/sensai/geoanalytics/geo_coords.py new file mode 100644 index 00000000..6878583b --- /dev/null +++ b/src/sensai/geoanalytics/geo_coords.py @@ -0,0 +1,190 @@ +""" +Utility functions and classes for geographic coordinates +""" + +import math +from typing import Tuple, Iterable + +import numpy as np +import pandas as pd + +from ..util.string import ToStringMixin + +EARTH_RADIUS = 6371000 +EARTH_CIRCUMFERENCE = 2 * math.pi * EARTH_RADIUS +LATITUDE_PER_METRE = 360.0 / EARTH_CIRCUMFERENCE + + +def longitudePerM(latitude): + return LATITUDE_PER_METRE / math.cos(math.radians(latitude)) + + +def latitudePerM(): + return LATITUDE_PER_METRE + + +def approximateSquaredDistance(p1: Tuple[float, float], p2: Tuple[float, float]): + """ + :param p1: a tuple (latitude, longitude) + :param p2: a tuple (latitude, longitude) + :return: the approximate squared distance (in m²) between p1 and p2 + """ + latPerM = latitudePerM() + p1lat, p1lon = p1 + p2lat, p2lon = p2 + lonPerM = longitudePerM((p1lat + p2lat) / 2) + dx = (p2lon - p1lon) / lonPerM + dy = (p2lat - p1lat) / latPerM + return dx * dx + dy * dy + + +def closestPointOnSegment(searchPos: Tuple[float, float], segPoint1: Tuple[float, float], segPoint2: Tuple[float, float]): + """ + Gets the point on the line segment connecting segPoint1 and segPoint2 that is closest to searchPos + + :param searchPos: the position for which to search for the closest point on the line segment + :param segPoint1: the first point defining the line segment on which to search + :param segPoint2: the second point defining the line segment on which to search + :return: the closest point, which is on the line connecting segPoint1 and segPoint2 (and may be one of the two points) + """ + seg1lat, seg1lon = segPoint1 + seg2lat, seg2lon = segPoint2 + srchlat, srchlon = searchPos + latPerM = latitudePerM() + lonPerM = longitudePerM(srchlat) + sp1x = (seg1lon - srchlon) / lonPerM + sp1y = (seg1lat - srchlat) / latPerM + sp2x = (seg2lon - srchlon) / lonPerM + sp2y = (seg2lat - srchlat) / latPerM + vx = sp2x - sp1x + vy = sp2y - sp1y + c1 = -vx * sp1x - vy * sp1y + if c1 <= 0: + return segPoint1 + c2 = vx * vx + vy * vy + if c2 <= c1: + return segPoint2 + b = 0 if c2 == 0 else c1 / c2 + lon = seg1lon + b * vx * lonPerM + lat = seg1lat + b * vy * latPerM + return [lat, lon] + + +def orientation(p1: Tuple[float, float], p2: Tuple[float, float]) -> float: + """ + Gets the orientation angle for the vector from p1 to p2 + + :param p1: a (lat, lon) pair + :param p2: a (lat, lon) pair + :return: the orientation angle in rad + """ + p1Lat, p1Lon = p1 + p2Lat, p2Lon = p2 + centerLat = (p1Lat + p2Lat) / 2 + dx = (p2Lon - p1Lon) / longitudePerM(centerLat) + dy = (p2Lat - p1Lat) / latitudePerM() + return math.atan2(dy, dx) + + +def absAngleDifference(a1: float, a2: float) -> float: + """ + Computes the absolute angle difference in ]-pi, pi] between two angles + + :param a1: an angle in rad + :param a2: an angle in rad + :return: the difference in rad + """ + d = a1 - a2 + while d > math.pi: + d -= 2*math.pi + while d <= -math.pi: + d += 2*math.pi + return abs(d) + + +def closestPointOnPolyline(searchPos, polyline, searchOrientationAngle=None, maxAngleDifference=0) -> Tuple[Tuple[float, float], float, int]: + """ + Gets the point on the given polyline that is closest to the given search position along with the + distance (in 
metres) to the polyline + + :param searchPos: a (lat, lon) pair indicating the position for which to find the closest math on the polyline + :param polyline: list of (lat, lon) pairs that make up the polyline on which to search + :param searchOrientationAngle: if not None, defines the orientation with which to compute angle differences (if maxAngleDifference > 0) + :param maxAngleDifference: the maximum absolute angle difference (in rad) that is admissible (between the orientation of the + respective line segment and the orientation given in searchOrientationAngle) + :return: a tuple (optPoint, optDist, optSegmentStartIdx) where + optPoint is the closest point (with admissible orientation - or None if there is none), + optDist is the distance from the polyline to the closest point, + optSegmentStartIdx is the index of the first point of the segment on the polyline for which the closest point was found + """ + if len(polyline) < 2: + raise Exception("Polyline must consist of at least two points") + optSegmentStartIdx = None + optPoint = None + optSqDist = None + for i in range(len(polyline)-1): + if maxAngleDifference > 0: + orientationAngle = orientation(polyline[i], polyline[i+1]) + angDiff = absAngleDifference(orientationAngle, searchOrientationAngle) + if angDiff > maxAngleDifference: + continue + optSegPoint = closestPointOnSegment(searchPos, polyline[i], polyline[i + 1]) + sqDist = approximateSquaredDistance(searchPos, optSegPoint) + if optSqDist is None or sqDist < optSqDist: + optPoint = optSegPoint + optSqDist = sqDist + optSegmentStartIdx = i + return optPoint, math.sqrt(optSqDist), optSegmentStartIdx + + +class GeoCoord(ToStringMixin): + """ + Represents geographic coordinates (WGS84) + """ + def __init__(self, lat: float, lon: float): + self.lat = lat + self.lon = lon + + def latlon(self): + return self.lat, self.lon + + def distanceTo(self, gpsPosition: 'GeoCoord'): + return math.sqrt(self.squaredDistanceTo(gpsPosition)) + + def squaredDistanceTo(self, gpsPosition: 'GeoCoord'): + return approximateSquaredDistance(self.latlon(), gpsPosition.latlon()) + + def localCoords(self, lcs): + return lcs.getLocalCoords(self.lat, self.lon) + + @classmethod + def meanCoord(cls, geoCoords: Iterable["GeoCoord"]): + meanLat = np.mean([c.lat for c in geoCoords]) + meanLon = np.mean([c.lon for c in geoCoords]) + # noinspection PyTypeChecker + return GeoCoord(meanLat, meanLon) + + +class GpsTracePoint(GeoCoord): + def __init__(self, lat, lon, time: pd.Timestamp): + super().__init__(lat, lon) + self.time = time + + +class GeoRect: + def __init__(self, minLat: float, minLon: float, maxLat: float, maxLon: float): + if maxLat < minLat or maxLon < minLon: + raise ValueError() + self.minLat = minLat + self.minLon = minLon + self.maxLat = maxLat + self.maxLon = maxLon + + @staticmethod + def fromCircle(centreLat, centreLon, radiusM): + """Creates the bounding rectangle for the given circular area""" + from .local_coords import LocalCoordinateSystem + lcs = LocalCoordinateSystem(centreLat, centreLon) + minLat, minLon = lcs.getLatLon(-radiusM, -radiusM) + maxLat, maxLon = lcs.getLatLon(radiusM, radiusM) + return GeoRect(minLat, minLon, maxLat, maxLon) diff --git a/src/sensai/geoanalytics/geopandas/__init__.py b/src/sensai/geoanalytics/geopandas/__init__.py new file mode 100644 index 00000000..ee3ddaa4 --- /dev/null +++ b/src/sensai/geoanalytics/geopandas/__init__.py @@ -0,0 +1 @@ +from . 
import coordinates, coordinate_clustering, coordinate_clustering_ground_truth, geometry \ No newline at end of file diff --git a/src/sensai/geoanalytics/coordinate_clustering.py b/src/sensai/geoanalytics/geopandas/coordinate_clustering.py similarity index 93% rename from src/sensai/geoanalytics/coordinate_clustering.py rename to src/sensai/geoanalytics/geopandas/coordinate_clustering.py index 229213e2..7a877396 100644 --- a/src/sensai/geoanalytics/coordinate_clustering.py +++ b/src/sensai/geoanalytics/geopandas/coordinate_clustering.py @@ -1,15 +1,17 @@ -import geopandas as gp import logging +from typing import Callable, Union, Iterable + +import geopandas as gp import numpy as np +import pandas as pd from shapely.geometry import MultiPoint -from typing import Callable, Union, Iterable -from ..clustering.clustering_base import EuclideanClusterer -from ..clustering import SkLearnEuclideanClusterer -from ..clustering.sklearn_clustering import SkLearnClustererProtocol -from ..util.cache import LoadSaveInterface from .coordinates import validateCoordinates, extractCoordinatesArray, TCoordinates, GeoDataFrameWrapper -from ..util.profiling import timed +from ...clustering import SkLearnEuclideanClusterer +from ...clustering.clustering_base import EuclideanClusterer +from ...clustering.sklearn_clustering import SkLearnClustererProtocol +from ...util.cache import LoadSaveInterface +from ...util.profiling import timed log = logging.getLogger(__name__) @@ -114,9 +116,9 @@ def toGeoDF(self, condition: Callable[[Cluster], bool] = None, crs='epsg:3857', geodf.crs = crs # TODO or not TODO: parallelize this or improve performance some another way for cluster in self.clusters(condition): - geodf = geodf.append(cluster.toGeoDF(crs=crs)) + geodf = pd.concat((geodf, cluster.toGeoDF(crs=crs))) if includeNoise: - geodf = geodf.append(self.noiseCluster().toGeoDF(crs=crs)) + geodf = pd.concat((geodf, self.noiseCluster().toGeoDF(crs=crs))) return geodf def plot(self, includeNoise=False, condition=None, **kwargs): diff --git a/src/sensai/geoanalytics/coordinate_clustering_ground_truth.py b/src/sensai/geoanalytics/geopandas/coordinate_clustering_ground_truth.py similarity index 100% rename from src/sensai/geoanalytics/coordinate_clustering_ground_truth.py rename to src/sensai/geoanalytics/geopandas/coordinate_clustering_ground_truth.py diff --git a/src/sensai/geoanalytics/coordinates.py b/src/sensai/geoanalytics/geopandas/coordinates.py similarity index 97% rename from src/sensai/geoanalytics/coordinates.py rename to src/sensai/geoanalytics/geopandas/coordinates.py index 6867bfad..c2414b75 100644 --- a/src/sensai/geoanalytics/coordinates.py +++ b/src/sensai/geoanalytics/geopandas/coordinates.py @@ -4,7 +4,7 @@ from shapely.geometry import MultiPoint from typing import Union -from ..clustering import EuclideanClusterer +from ...clustering import EuclideanClusterer TCoordinates = Union[np.ndarray, MultiPoint, gp.GeoDataFrame, EuclideanClusterer.Cluster] diff --git a/src/sensai/geoanalytics/geometry.py b/src/sensai/geoanalytics/geopandas/geometry.py similarity index 100% rename from src/sensai/geoanalytics/geometry.py rename to src/sensai/geoanalytics/geopandas/geometry.py diff --git a/src/sensai/geoanalytics/graph.py b/src/sensai/geoanalytics/geopandas/graph.py similarity index 100% rename from src/sensai/geoanalytics/graph.py rename to src/sensai/geoanalytics/geopandas/graph.py diff --git a/src/sensai/geoanalytics/local_coords.py b/src/sensai/geoanalytics/local_coords.py new file mode 100644 index 00000000..b0e69173 
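The `geodf.append(...)` to `pd.concat(...)` change in `coordinate_clustering.py` above is presumably motivated by `DataFrame.append` having been deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat` is the drop-in replacement for row-wise concatenation. A standalone illustration with plain data frames:

```python
import pandas as pd

df1 = pd.DataFrame({"x": [1, 2]})
df2 = pd.DataFrame({"x": [3]})
# equivalent of the former df1.append(df2): concatenate rows, keeping the original indices
combined = pd.concat((df1, df2))
print(combined)
```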
--- /dev/null +++ b/src/sensai/geoanalytics/local_coords.py @@ -0,0 +1,181 @@ +""" +Local coordinate systems (for geographic data) +""" +import math +from functools import reduce +from typing import Tuple, Union, List + +import numpy as np +import utm +from shapely.geometry import polygon, multipolygon, point, LineString, mapping +from shapely.ops import polygonize, unary_union + + +class LocalCoordinateSystem(object): + """ + Represents a local coordinate system for the conversion of geo-coordinates + (latitude, longitude) to a local Cartesian coordinate system (unit=metre) and vice versa + using the UTM transform + """ + + def __init__(self, lat, lon): + """ + Parameters: + lat: the latitude of the origin of the coordinate system + lon: the longitude of the origin of the coordinate system + """ + self.uRef = utm.from_latlon(lat, lon) + self.uRefE = self.uRef[0] + self.uRefN = self.uRef[1] + self.uRefPseudoN = self._pseudoNorthing(self.uRefN) + + def getLocalCoords(self, lat, lon) -> Tuple[float, float]: + uE, uN, zM, zL = utm.from_latlon(lat, lon) + x = uE - self.uRefE + y = self._pseudoNorthing(uN) - self.uRefPseudoN + return x, y + + def getLatLon(self, localX, localY) -> Tuple[float, float]: + easting = localX + self.uRefE + pseudoNorthing = localY + self.uRefPseudoN + return utm.to_latlon(easting, self._realNorthing(pseudoNorthing), self.uRef[2], self.uRef[3]) + + @staticmethod + def _pseudoNorthing(realNorthing): + if realNorthing >= 10000000: + return realNorthing - 10000000 + else: + return realNorthing + + @staticmethod + def _realNorthing(pseudoNorthing): + if pseudoNorthing < 0: + return pseudoNorthing + 10000000 + else: + return pseudoNorthing + + +class LocalHexagonalGrid: + """ + A local hexagonal grid, where hex cells can be referenced by two integer coordinates relative to + the central grid cell, whose centre is at local coordinate (0, 0) and where positive x-coordinates/columns + are towards the east and positive y-coordinates/rows are towards the north. + Every odd row of cells is shifted half a hexagon to the right, i.e. column x for row 1 is half a grid cell + further to the right than column x for row 0. 
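A minimal round-trip sketch for the `LocalCoordinateSystem` defined above; the import path is assumed from the file location in this diff, the coordinates are arbitrary, and the `utm` package used by the class must be installed.

```python
from sensai.geoanalytics.local_coords import LocalCoordinateSystem

lcs = LocalCoordinateSystem(52.52, 13.405)   # origin of the local Cartesian system (lat, lon)
x, y = lcs.getLocalCoords(52.53, 13.41)      # offsets in metres relative to the origin
lat, lon = lcs.getLatLon(x, y)               # back to WGS84, recovering approximately (52.53, 13.41)
```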
+ + For visualisation purposes, see https://www.redblobgames.com/grids/hexagons/ + """ + def __init__(self, radiusM): + """ + :param radiusM: the radius, in metres, of each hex cell + """ + self.radiusM = radiusM + startAngle = math.pi / 6 + stepAngle = math.pi / 3 + self.offsetVectors = [] + for i in range(6): + angle = startAngle + i * stepAngle + x = math.cos(angle) * radiusM + y = math.sin(angle) * radiusM + self.offsetVectors.append(np.array([x, y])) + self.hexagonWidth = 2 * self.offsetVectors[0][0] + self.hexagonHeight = 2 * self.offsetVectors[1][1] + self.rowStep = 0.75 * self.hexagonHeight + self.polygonArea = 6 * self.hexagonHeight * self.hexagonWidth / 8 + + def getHexagon(self, xColumn: int, yRow: int) -> polygon.Polygon: + """ + Gets the hexagon (polygon) for the given integer hex cell coordinates + :param xColumn: the column coordinate + :param yRow: the row coordinate + :return: the hexagon + """ + centreX = xColumn * self.hexagonWidth + centreY = yRow * self.rowStep + if yRow % 2 == 1: + centreX += 0.5 * self.hexagonWidth + centre = np.array([centreX, centreY]) + return polygon.Polygon([centre + o for o in self.offsetVectors]) + + def getMinHexagonColumn(self, x): + lowestXDefinitelyInColumn0 = 0 + return math.floor((x - lowestXDefinitelyInColumn0) / self.hexagonWidth) + + def getMaxHexagonColumn(self, x): + highestXDefinitelyInColumn0 = self.hexagonWidth / 2 + return math.ceil((x - highestXDefinitelyInColumn0) / self.hexagonWidth) + + def getMinHexagonRow(self, y): + lowestYDefinitelyInRow0 = -self.hexagonHeight / 4 + return math.floor((y - lowestYDefinitelyInRow0) / self.rowStep) + + def getMaxHexagonRow(self, y): + highestYDefinitelyInRow0 = self.hexagonHeight / 4 + return math.ceil((y - highestYDefinitelyInRow0) / self.rowStep) + + def getHexagonCoordSpanForBoundingBox(self, minX, minY, maxX, maxY) -> Tuple[Tuple[int, int], Tuple[int, int]]: + """ + Gets the range of hex-cell coordinates that cover the given bounding box + + :param minX: minimum x-coordinate of bounding box + :param minY: minimum y-coordinate of bounding box + :param maxX: maximum x-coordinate of bounding box + :param maxY: maximum y-coordinate of bounding box + :return: a pair of pairs ((minCol, minRow), (maxCol, maxRow)) indicating the span of cell coordinates + """ + if minX > maxX or minY > maxY: + raise ValueError() + minColumn = self.getMinHexagonColumn(minX) + maxColumn = self.getMaxHexagonColumn(maxX) + minRow = self.getMinHexagonRow(minY) + maxRow = self.getMaxHexagonRow(maxY) + return ((minColumn, minRow), (maxColumn, maxRow)) + + def getHexagonCoordsForPoint(self, x, y): + ((minColumn, minRow), (maxColumn, maxRow)) = self.getHexagonCoordSpanForBoundingBox(x, y, x, y) + for xCol in range(minColumn, maxColumn+1): + for yRow in range(minRow, maxRow+1): + if self.getHexagon(xCol, yRow).contains(point.Point(x, y)): + return xCol, yRow + raise Exception("No Hexagon matched; possible edge case (point on hexagon boundary)") + + +def fixPolygon(poly: Union[polygon.Polygon, multipolygon.MultiPolygon], maxAreaDiff=1e-2) -> Union[polygon.Polygon, multipolygon.MultiPolygon]: + """ + Fix invalid shapely polygons or multipolygons. 
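To make the hex-cell addressing of `LocalHexagonalGrid` concrete, here is a small lookup sketch under the same assumptions as above (assumed import path; arbitrary local coordinates in metres):

```python
from shapely.geometry import Point
from sensai.geoanalytics.local_coords import LocalHexagonalGrid

grid = LocalHexagonalGrid(radiusM=100)
col, row = grid.getHexagonCoordsForPoint(250.0, 120.0)  # integer (column, row) of the cell containing the point
assert grid.getHexagon(col, row).contains(Point(250.0, 120.0))
```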
+ + Reference: + https://stackoverflow.com/questions/35110632/splitting-self-intersecting-polygon-only-returned-one-polygon-in-shapely + + :param poly: the polygon to fix + :param maxAreaDiff: the maximum change in area + :return: the fixed polygon or None if it cannot be fixed given the area change constraint + """ + def _fixPolygonComponent(coords: List[Tuple[float, float]]): + res = list(polygonize(unary_union(LineString(list(coords) + [coords[0]])))) + return reduce(lambda p1, p2: p1.union(p2), res) + + if poly.is_valid: + return poly + else: + if isinstance(poly, polygon.Polygon): + exteriorCoords = poly.exterior.coords[:] + fixedExterior = _fixPolygonComponent(exteriorCoords) + fixedInterior = polygon.Polygon() + for interior in poly.interiors: + coords = interior.coords[:] + fixedInterior = fixedInterior.union(_fixPolygonComponent(coords)) + fixedPolygon = fixedExterior.difference(fixedInterior) + elif isinstance(poly, multipolygon.MultiPolygon): + polys = list(poly) + fixedPolys = [fixPolygon(p, maxAreaDiff=maxAreaDiff) for p in polys] + fixedPolygon = reduce(lambda p1, p2: p1.union(p2), fixedPolys) + else: + raise Exception(f"Unsupported type {type(poly)}") + areaDiff = float('Inf') if poly.area == 0 else abs(poly.area - fixedPolygon.area) / poly.area + #log.info(f"Invalid polygon\n{poly}\nComputed fix:\n{fixedPolygon}.\nArea error: {areaDiff}") + if areaDiff > maxAreaDiff: + return None + else: + return fixedPolygon + diff --git a/src/sensai/geoanalytics/map_tiles.py b/src/sensai/geoanalytics/map_tiles.py new file mode 100644 index 00000000..afb91fc9 --- /dev/null +++ b/src/sensai/geoanalytics/map_tiles.py @@ -0,0 +1,57 @@ +""" +Utility functions and classes for geographic coordinates +""" + +import math +from typing import Tuple, List, Generator + +from ._globalmaptiles import GlobalMercator +from .geo_coords import GeoRect + +EARTH_RADIUS = 6371000 +EARTH_CIRCUMFERENCE = 2 * math.pi * EARTH_RADIUS +LATITUDE_PER_METRE = 360.0 / EARTH_CIRCUMFERENCE + + + +class MapTile: + def __init__(self, tx: int, ty: int, rect: GeoRect, zoom: int): + self.tx = tx + self.ty = ty + self.rect = rect + self.zoom = zoom + + +class MapTiles: + def __init__(self, zoom=13): + self.zoom = zoom + self._mercator = GlobalMercator() + self._tiles = {} + + def _getTile(self, tx, ty): + key = (tx, ty) + tile = self._tiles.get(key) + if tile is None: + tile = MapTile(tx, ty, GeoRect(*self._mercator.TileLatLonBounds(tx, ty, self.zoom)), self.zoom) + self._tiles[key] = tile + return tile + + def iterTileCoordinatesInRect(self, rect: GeoRect) -> Generator[Tuple[int, int], None, None]: + tx1, ty1 = self._mercator.LatLonToTile(rect.minLat, rect.minLon, self.zoom) + tx2, ty2 = self._mercator.LatLonToTile(rect.maxLat, rect.maxLon, self.zoom) + txMin = min(tx1, tx2) + txMax = max(tx1, tx2) + tyMin = min(ty1, ty2) + tyMax = max(ty1, ty2) + for tx in range(txMin, txMax+1): + for ty in range(tyMin, tyMax+1): + yield tx, ty + + def getTilesInRect(self, rect: GeoRect) -> List[MapTile]: + return [self._getTile(tx, ty) for tx, ty in self.iterTileCoordinatesInRect(rect)] + + def getTile(self, lat: float, lon: float) -> MapTile: + return self._getTile(*self.getTileCoordinates(lat, lon)) + + def getTileCoordinates(self, lat: float, lon: float) -> Tuple[int, int]: + return self._mercator.LatLonToTile(lat, lon, self.zoom) \ No newline at end of file diff --git a/src/sensai/lightgbm.py b/src/sensai/lightgbm.py index 2c9189aa..4f09a47e 100644 --- a/src/sensai/lightgbm.py +++ b/src/sensai/lightgbm.py @@ -1,24 +1,40 @@ -from typing 
import Sequence, Union, Optional, Dict import logging +import re +from typing import Sequence, Union, Optional + import lightgbm import pandas as pd -import re +from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \ + FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification from .util.string import orRegexGroup -from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel log = logging.getLogger(__name__) -class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel): +# noinspection PyUnusedLocal +def _updateFitArgs(fitArgs: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categoricalFeatureNameRegex: Optional[str]): + if categoricalFeatureNameRegex is not None: + cols = list(inputs.columns) + categoricalFeatureNames = [col for col in cols if re.match(categoricalFeatureNameRegex, col)] + colIndices = [cols.index(f) for f in categoricalFeatureNames] + args = {"categorical_feature": colIndices} + log.info(f"Updating fit parameters with {args}") + fitArgs.update(args) + else: + fitArgs.pop("categorical_feature", None) + + +class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel, FeatureImportanceProviderSkLearnRegressionMultipleOneDim): log = log.getChild(__qualname__) def __init__(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31, max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **modelArgs): """ - :param categoricalFeatureNames: sequence of feature names in the input data that are categorical. + :param categoricalFeatureNames: sequence of feature names in the input data that are categorical or a single string containing + a regex matching the categorical feature names. Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators) - need not be specified (should be inferred automatically). + need not be specified (will be inferred automatically). In general, passing categorical features is preferable to using one-hot encoding, for example. 
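The `_updateFitArgs` helper above maps a categorical-feature-name regex to column indices and passes them to LightGBM's `fit` via the `categorical_feature` keyword. A standalone sketch of that mapping (the column names and regex are made up):

```python
import re
import pandas as pd

inputs = pd.DataFrame({"colour": ["red", "blue"], "size_cm": [3.0, 5.0], "shape": ["round", "square"]})
categoricalFeatureNameRegex = "colour|shape"
cols = list(inputs.columns)
categoricalFeatureNames = [col for col in cols if re.match(categoricalFeatureNameRegex, col)]
fitArgs = {"categorical_feature": [cols.index(f) for f in categoricalFeatureNames]}
print(fitArgs)  # {'categorical_feature': [0, 2]}, later passed through to LGBMRegressor.fit / LGBMClassifier.fit
```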
:param random_state: the random seed to use :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31) @@ -43,34 +59,37 @@ def __init__(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] categoricalFeatureNameRegex = None self._categoricalFeatureNameRegex: str = categoricalFeatureNameRegex - def _updateModelArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): - if self._categoricalFeatureNameRegex is not None: - cols = list(inputs.columns) - categoricalFeatureNames = [col for col in cols if re.match(self._categoricalFeatureNameRegex, col)] - colIndices = [cols.index(f) for f in categoricalFeatureNames] - args = {"cat_column": colIndices} - self.log.info(f"Updating model parameters with {args}") - self.modelArgs.update(args) - - def getFeatureImportances(self) -> Dict[str, Dict[str, int]]: - return {targetFeature: dict(zip(model.feature_name_, model.feature_importances_)) for targetFeature, model in self.models.items()} + def _updateFitArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): + _updateFitArgs(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex) -class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel): +class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification): log = log.getChild(__qualname__) - def __init__(self, categoricalFeatureNames: Sequence[str] = None, random_state=42, num_leaves=31, **modelArgs): + def __init__(self, categoricalFeatureNames: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31, + max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", useComputedClassWeights=False, + **modelArgs): """ - :param categoricalFeatureNames: sequence of feature names in the input data that are categorical + :param categoricalFeatureNames: sequence of feature names in the input data that are categorical or a single string containing + a regex matching the categorical feature names. Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators) - need not be specified (should be inferred automatically, but we have never actually tested this behaviour - successfully for a classification model). + need not be specified (will be inferred automatically). In general, passing categorical features may be preferable to using one-hot encoding, for example. :param random_state: the random seed to use :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31) - :param modelArgs: see https://lightgbm.readthedocs.io/en/latest/Parameters.html + :param max_depth: maximum tree depth for base learners, <=0 means no limit + :param n_estimators: number of boosted trees to fit + :param min_child_samples: minimum number of data needed in a child (leaf) + :param importance_type: the type of feature importance to be set in the respective property of the wrapped model. + If ‘split’, result contains numbers of times the feature is used in a model. + If ‘gain’, result contains total gains of splits which use the feature. 
+ :param useComputedClassWeights: whether to compute class weights from the training data that is given and pass it on to the + classifier's fit method; weighted data points may not be supported for all types of models + :param modelArgs: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier """ - super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves, **modelArgs) + super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves, + max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples, importance_type=importance_type, + useComputedClassWeights=useComputedClassWeights, **modelArgs) if type(categoricalFeatureNames) == str: categoricalFeatureNameRegex = categoricalFeatureNames @@ -81,14 +100,14 @@ def __init__(self, categoricalFeatureNames: Sequence[str] = None, random_state=4 categoricalFeatureNameRegex = None self._categoricalFeatureNameRegex: str = categoricalFeatureNameRegex - def _updateModelArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): - if self._categoricalFeatureNameRegex is not None: - cols = list(inputs.columns) - categoricalFeatureNames = [col for col in cols if re.match(self._categoricalFeatureNameRegex, col)] - colIndices = [cols.index(f) for f in categoricalFeatureNames] - args = {"cat_column": colIndices} - self.log.info(f"Updating model parameters with {args}") - self.modelArgs.update(args) - - def getFeatureImportances(self) -> Dict[str, Dict[str, int]]: - return dict(zip(self.model.feature_name_, self.model.feature_importances_)) + def _updateFitArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): + _updateFitArgs(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex) + + def _predictClassProbabilities(self, x: pd.DataFrame): + if len(self._labels) == 1: + # special handling required because LGBMClassifier will return values for two classes even if there is only one + Y = self.model.predict_proba(self._transformInput(x)) + Y = Y[:, 0] + return pd.DataFrame(Y, columns=self._labels) + else: + return super()._predictClassProbabilities(x) \ No newline at end of file diff --git a/src/sensai/local_search.py b/src/sensai/local_search.py index 622a94c4..eaafc052 100644 --- a/src/sensai/local_search.py +++ b/src/sensai/local_search.py @@ -10,6 +10,8 @@ import pandas as pd from matplotlib import pyplot as plt +from sensai.util.aggregation import RelativeFrequencyCounter + log = logging.getLogger(__name__) @@ -389,23 +391,6 @@ def chooseParams(self) -> Optional[Tuple[Tuple, Optional[SACostValue]]]: pass -class RelativeFrequencyCounter: - def __init__(self): - self.numTotal = 0 - self.numRelevant = 0 - - def count(self, isRelevantEvent): - self.numTotal += 1 - if isRelevantEvent: - self.numRelevant += 1 - - def __str__(self): - info = f"{self.numRelevant}/{self.numTotal}" - if self.numTotal > 0: - info += f", {100 * self.numRelevant / self.numTotal:.2f}%" - return f"RelativeFrequencyCounter[{info}]" - - class SAChain(Generic[TSAState]): """Manages the progression of one state during simulated annealing""" diff --git a/src/sensai/sklearn/sklearn_base.py b/src/sensai/sklearn/sklearn_base.py index 9c8023fe..d6c2e21c 100644 --- a/src/sensai/sklearn/sklearn_base.py +++ b/src/sensai/sklearn/sklearn_base.py @@ -1,13 +1,16 @@ import copy import logging -from abc import ABC, abstractmethod -from typing import List, Any, Dict import re +from abc import ABC, abstractmethod +from typing import List, Any, Dict, 
Optional import numpy as np import pandas as pd from sklearn import compose +from ..feature_importance import FeatureImportanceProvider +from ..util.pickle import setstate +from ..util.string import dictString from ..vector_model import VectorRegressionModel, VectorClassificationModel log = logging.getLogger(__name__) @@ -30,6 +33,21 @@ def strSkLearnModel(model): return re.sub(r",\s*", ", ", str(model)) +def _applySkLearnInputTransformer(inputs: pd.DataFrame, sklearnInputTransformer: Optional, fit: bool) -> pd.DataFrame: + if sklearnInputTransformer is None: + return inputs + else: + inputValues = inputs.values + shapeBefore = inputValues.shape + if fit: + inputValues = sklearnInputTransformer.fit_transform(inputValues) + else: + inputValues = sklearnInputTransformer.transform(inputValues) + if inputValues.shape != shapeBefore: + raise Exception("sklearnInputTransformer changed the shape of the input, which is unsupported. Consider using an a DFTSkLearnTransformer in inputTransformers instead.") + return pd.DataFrame(inputValues, index=inputs.index, columns=inputs.columns) + + class AbstractSkLearnVectorRegressionModel(VectorRegressionModel, ABC): """ Base class for models built upon scikit-learn's model implementations @@ -46,6 +64,7 @@ def __init__(self, modelConstructor, **modelArgs): self.sklearnOutputTransformer = None self.modelConstructor = modelConstructor self.modelArgs = modelArgs + self.fitArgs = {} def _toStringExcludes(self) -> List[str]: return super()._toStringExcludes() + ["sklearnInputTransformer", "sklearnOutputTransformer", "modelConstructor", "modelArgs"] @@ -67,18 +86,7 @@ def withSkLearnOutputTransformer(self, sklearnOutputTransformer): return self def _transformInput(self, inputs: pd.DataFrame, fit=False) -> pd.DataFrame: - if self.sklearnInputTransformer is None: - return inputs - else: - inputValues = inputs.values - shapeBefore = inputValues.shape - if fit: - inputValues = self.sklearnInputTransformer.fit_transform(inputValues) - else: - inputValues = self.sklearnInputTransformer.transform(inputValues) - if inputValues.shape != shapeBefore: - raise Exception("sklearnInputTransformer changed the shape of the input, which is unsupported. 
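The `useComputedClassWeights` option documented above derives per-sample weights from inverse relative class frequencies (mirroring the `_computeClassWeights` and `_fitClassifier` logic added further below in `sklearn_base.py`). A standalone sketch of that computation, with made-up labels:

```python
import numpy as np
import pandas as pd

outputs = pd.DataFrame({"label": ["a", "a", "a", "b"]})
classes = outputs.iloc[:, 0]
rfreqs = classes.value_counts() / len(classes)
class2weight = (1.0 / rfreqs).to_dict()                       # {'a': 1.33..., 'b': 4.0}
sample_weight = np.array([class2weight[c] for c in classes])  # passed on as fit(..., sample_weight=...)
```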
Consider using an a DFTSkLearnTransformer in inputTransformers instead.") - return pd.DataFrame(inputValues, index=inputs.index, columns=inputs.columns) + return _applySkLearnInputTransformer(inputs, self.sklearnInputTransformer, fit) def _updateModelArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): """ @@ -89,9 +97,20 @@ def _updateModelArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): """ pass + def _updateFitArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): + """ + Designed to be overridden in order to make input data-specific changes to fitArgs (arguments to be passed to the + underlying model's fit method) + + :param inputs: the training input data + :param outputs: the training output data + """ + pass + def _fit(self, inputs: pd.DataFrame, outputs: pd.DataFrame): inputs = self._transformInput(inputs, fit=True) self._updateModelArgs(inputs, outputs) + self._updateFitArgs(inputs, outputs) self._fitSkLearn(inputs, outputs) @abstractmethod @@ -124,7 +143,7 @@ def _toStringAdditionalEntries(self) -> Dict[str, Any]: if len(self.models) > 0: d["model[0]"] = strSkLearnModel(next(iter(self.models.values()))) else: - d["modelConstructor"] = f"{self.modelConstructor.__name__}{self.modelArgs}" + d["modelConstructor"] = f"{self.modelConstructor.__name__}({dictString(self.modelArgs)})" return d def _fitSkLearn(self, inputs: pd.DataFrame, outputs: pd.DataFrame): @@ -133,7 +152,7 @@ def _fitSkLearn(self, inputs: pd.DataFrame, outputs: pd.DataFrame): model = createSkLearnModel(self.modelConstructor, self.modelArgs, outputTransformer=copy.deepcopy(self.sklearnOutputTransformer)) - model.fit(inputs, outputs[predictedVarName]) + model.fit(inputs, outputs[predictedVarName], **self.fitArgs) self.models[predictedVarName] = model def _predictSkLearn(self, inputs: pd.DataFrame) -> pd.DataFrame: @@ -159,7 +178,7 @@ def _toStringAdditionalEntries(self) -> Dict[str, Any]: if self.model is not None: d["model"] = strSkLearnModel(self.model) else: - d["modelConstructor"] = f"{self.modelConstructor.__name__}{self.modelArgs}" + d["modelConstructor"] = f"{self.modelConstructor.__name__}({dictString(self.modelArgs)})" return d def _fitSkLearn(self, inputs: pd.DataFrame, outputs: pd.DataFrame): @@ -169,7 +188,7 @@ def _fitSkLearn(self, inputs: pd.DataFrame, outputs: pd.DataFrame): outputValues = outputs.values if outputValues.shape[1] == 1: # for 1D output, shape must be (numSamples,) rather than (numSamples, 1) outputValues = np.ravel(outputValues) - self.model.fit(inputs, outputValues) + self.model.fit(inputs, outputValues, **self.fitArgs) def _predictSkLearn(self, inputs: pd.DataFrame) -> pd.DataFrame: Y = self.model.predict(inputs) @@ -177,26 +196,33 @@ def _predictSkLearn(self, inputs: pd.DataFrame) -> pd.DataFrame: class AbstractSkLearnVectorClassificationModel(VectorClassificationModel, ABC): - def __init__(self, modelConstructor, **modelArgs): + def __init__(self, modelConstructor, useComputedClassWeights=False, **modelArgs): """ :param modelConstructor: the sklearn model constructor :param modelArgs: arguments to be passed to the sklearn model constructor + :param useComputedClassWeights: whether to compute class weights from the training data that is given and pass it on to the + classifier's fit method; weighted data points may not be supported for all types of models """ super().__init__() self.modelConstructor = modelConstructor self.sklearnInputTransformer = None self.sklearnOutputTransformer = None self.modelArgs = modelArgs + self.fitArgs = {} + self.useComputedClassWeights = 
useComputedClassWeights self.model = None + def __setstate__(self, state): + setstate(AbstractSkLearnVectorClassificationModel, self, state, newDefaultProperties={"useComputedClassWeights": False}) + def _toStringExcludes(self) -> List[str]: return super()._toStringExcludes() + ["modelConstructor", "sklearnInputTransformer", "sklearnOutputTransformer", - "modelArgs", "model"] + "modelArgs", "model"] def _toStringAdditionalEntries(self) -> Dict[str, Any]: d = super()._toStringAdditionalEntries() if self.model is None: - d["modelConstructor"] = f"{self.modelConstructor.__name__}{self.modelArgs}" + d["modelConstructor"] = f"{self.modelConstructor.__name__}({dictString(self.modelArgs)})" else: d["model"] = strSkLearnModel(self.model) return d @@ -226,21 +252,33 @@ def _updateModelArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): """ pass + def _updateFitArgs(self, inputs: pd.DataFrame, outputs: pd.DataFrame): + """ + Designed to be overridden in order to make input data-specific changes to fitArgs (arguments to be passed to the + underlying model's fit method) + + :param inputs: the training input data + :param outputs: the training output data + """ + pass + def _fitClassifier(self, inputs: pd.DataFrame, outputs: pd.DataFrame): - inputValues = self._transformInput(inputs, fit=True) + inputs = self._transformInput(inputs, fit=True) self._updateModelArgs(inputs, outputs) + self._updateFitArgs(inputs, outputs) self.model = createSkLearnModel(self.modelConstructor, self.modelArgs, self.sklearnOutputTransformer) log.info(f"Fitting sklearn classifier of type {self.model.__class__.__name__}") - self.model.fit(inputValues, np.ravel(outputs.values)) + kwargs = dict(self.fitArgs) + if self.useComputedClassWeights: + class2weight = self._computeClassWeights(outputs) + classes = outputs.iloc[:, 0] + weights = [class2weight[cls] for cls in classes] + kwargs["sample_weight"] = np.array(weights) + outputValues = np.ravel(outputs.values) + self.model.fit(inputs, outputValues, **kwargs) - def _transformInput(self, inputs: pd.DataFrame, fit=False) -> np.ndarray: - inputValues = inputs.values - if self.sklearnInputTransformer is not None: - if fit: - inputValues = self.sklearnInputTransformer.fit_transform(inputValues) - else: - inputValues = self.sklearnInputTransformer.transform(inputValues) - return inputValues + def _transformInput(self, inputs: pd.DataFrame, fit=False) -> pd.DataFrame: + return _applySkLearnInputTransformer(inputs, self.sklearnInputTransformer, fit) def _predict(self, x: pd.DataFrame): inputValues = self._transformInput(x) @@ -258,3 +296,42 @@ def get_params(self, deep=True): def set_params(self, **params): self.model.set_params(**params) + def _computeClassWeights(self, outputs: pd.DataFrame): + """ + :param outputs: the output data frame containing the class labels as the first column + :return: the dictionary of class weights mapping class to weight value + """ + classes: pd.Series = outputs.iloc[:,0] + counts = classes.value_counts() + rfreqs = counts / counts.sum() + weights: pd.Series = 1.0 / rfreqs + return weights.to_dict() + + +def _getModelFeatureImportanceVector(model): + candAttributes = ("feature_importances_", "coef_") + for attr in candAttributes: + if hasattr(model, attr): + importanceValues = getattr(model, attr) + if attr == "coef_": + importanceValues = np.abs(importanceValues) # for coefficients in linear models, use the absolute values + return importanceValues + raise ValueError(f"Model {model} has none of the attributes {candAttributes}") + + +class 
FeatureImportanceProviderSkLearnRegressionMultipleOneDim(FeatureImportanceProvider): + def getFeatureImportanceDict(self) -> Dict[str, Dict[str, int]]: + self: AbstractSkLearnMultipleOneDimVectorRegressionModel + return {targetFeature: dict(zip(self._modelInputVariableNames, _getModelFeatureImportanceVector(model))) for targetFeature, model in self.models.items()} + + +class FeatureImportanceProviderSkLearnRegressionMultiDim(FeatureImportanceProvider): + def getFeatureImportanceDict(self) -> Dict[str, float]: + self: AbstractSkLearnMultiDimVectorRegressionModel + return dict(zip(self._modelInputVariableNames, _getModelFeatureImportanceVector(self.model))) + + +class FeatureImportanceProviderSkLearnClassification(FeatureImportanceProvider): + def getFeatureImportanceDict(self) -> Dict[str, float]: + self: AbstractSkLearnVectorClassificationModel + return dict(zip(self._modelInputVariableNames, _getModelFeatureImportanceVector(self.model))) diff --git a/src/sensai/sklearn/sklearn_classification.py b/src/sensai/sklearn/sklearn_classification.py index 49e8c14f..51a060fd 100644 --- a/src/sensai/sklearn/sklearn_classification.py +++ b/src/sensai/sklearn/sklearn_classification.py @@ -6,8 +6,7 @@ import sklearn.neural_network import sklearn.tree -from .sklearn_base import AbstractSkLearnVectorClassificationModel - +from .sklearn_base import AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification log = logging.getLogger(__name__) @@ -18,10 +17,12 @@ def __init__(self, min_samples_leaf=8, random_state=42, **modelArgs): min_samples_leaf=min_samples_leaf, random_state=random_state, **modelArgs) -class SkLearnRandomForestVectorClassificationModel(AbstractSkLearnVectorClassificationModel): - def __init__(self, min_samples_leaf=8, random_state=42, **modelArgs): +class SkLearnRandomForestVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification): + def __init__(self, min_samples_leaf=8, random_state=42, useComputedClassWeights=False, **modelArgs): super().__init__(sklearn.ensemble.RandomForestClassifier, - random_state=random_state, min_samples_leaf=min_samples_leaf, **modelArgs) + random_state=random_state, min_samples_leaf=min_samples_leaf, + useComputedClassWeights=useComputedClassWeights, + **modelArgs) class SkLearnMLPVectorClassificationModel(AbstractSkLearnVectorClassificationModel): @@ -51,3 +52,8 @@ def __init__(self, **modelArgs): class SkLearnSVCVectorClassificationModel(AbstractSkLearnVectorClassificationModel): def __init__(self, random_state=42, **modelArgs): super().__init__(sklearn.svm.SVC, random_state=random_state, **modelArgs) + + +class SkLearnLogisticRegressionVectorClassificationModel(AbstractSkLearnVectorClassificationModel): + def __init__(self, random_state=42, **modelArgs): + super().__init__(sklearn.linear_model.LogisticRegression, random_state=random_state, **modelArgs) diff --git a/src/sensai/sklearn/sklearn_regression.py b/src/sensai/sklearn/sklearn_regression.py index 9072c2a6..23c4b5e4 100644 --- a/src/sensai/sklearn/sklearn_regression.py +++ b/src/sensai/sklearn/sklearn_regression.py @@ -1,5 +1,5 @@ import logging -from typing import Union, Optional, Dict +from typing import Union, Optional import sklearn.ensemble import sklearn.linear_model @@ -7,27 +7,55 @@ import sklearn.neural_network import sklearn.svm -from .sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnMultiDimVectorRegressionModel - +from .sklearn_base import 
AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnMultiDimVectorRegressionModel, \ + FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnRegressionMultiDim log = logging.getLogger(__name__) -class SkLearnRandomForestVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel): +class SkLearnRandomForestVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel, FeatureImportanceProviderSkLearnRegressionMultipleOneDim): def __init__(self, n_estimators=100, min_samples_leaf=10, random_state=42, **modelArgs): super().__init__(sklearn.ensemble.RandomForestRegressor, n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, random_state=random_state, **modelArgs) - def getFeatureImportances(self) -> Dict[str, Dict[str, float]]: - return {targetFeature: dict(zip(self._modelInputVariableNames, model.feature_importances_)) for targetFeature, model in self.models.items()} + +class SkLearnLinearRegressionVectorRegressionModel(AbstractSkLearnMultiDimVectorRegressionModel, FeatureImportanceProviderSkLearnRegressionMultiDim): + def __init__(self, fit_intercept=True, **modelArgs): + """ + :param fit_intercept: whether to determine the intercept, i.e. the constant term which is not scaled with an input feature value; + set to False if the data is already centred + :param modelArgs: see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html + """ + super().__init__(sklearn.linear_model.LinearRegression, fit_intercept=fit_intercept, **modelArgs) -class SkLearnLinearRegressionVectorRegressionModel(AbstractSkLearnMultiDimVectorRegressionModel): - def __init__(self, **modelArgs): - super().__init__(sklearn.linear_model.LinearRegression, **modelArgs) +class SkLearnLinearRidgeRegressionVectorRegressionModel(AbstractSkLearnMultiDimVectorRegressionModel, FeatureImportanceProviderSkLearnRegressionMultiDim): + """ + Linear least squares with L2 regularisation + """ + def __init__(self, alpha=1.0, fit_intercept=True, solver="auto", max_iter=None, tol=1e-3, **modelArgs): + """ + :param alpha: multiplies the L2 term, controlling regularisation strength + :param fit_intercept: whether to determine the intercept, i.e. the constant term which is not scaled with an input feature value; + set to False if the data is already centred + :param modelArgs: see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge + """ + super().__init__(sklearn.linear_model.Ridge, alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + solver=solver, **modelArgs) + - def getFeatureImportances(self) -> Dict[str, float]: - return dict(zip(self._modelInputVariableNames, self.model.feature_importances_)) +class SkLearnLinearLassoRegressionVectorRegressionModel(AbstractSkLearnMultiDimVectorRegressionModel, FeatureImportanceProviderSkLearnRegressionMultiDim): + """ + Linear least squares with L1 regularisation, a.k.a. the lasso + """ + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=1000, tol=0.0001, **modelArgs): + """ + :param alpha: multiplies the L1 term, controlling regularisation strength + :param fit_intercept: whether to determine the intercept, i.e. 
the constant term which is not scaled with an input feature value; + set to False if the data is already centred + :param modelArgs: see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso + """ + super().__init__(sklearn.linear_model.Lasso, alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, **modelArgs) class SkLearnMultiLayerPerceptronVectorRegressionModel(AbstractSkLearnMultiDimVectorRegressionModel): @@ -75,3 +103,9 @@ class SkLearnExtraTreesVectorRegressionModel(AbstractSkLearnMultipleOneDimVector def __init__(self, n_estimators=100, min_samples_leaf=10, random_state=42, **modelArgs): super().__init__(sklearn.ensemble.ExtraTreesRegressor, n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, random_state=random_state, **modelArgs) + + +class SkLearnDummyVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel): + def __init__(self, strategy='mean', constant=None, quantile=None): + super().__init__(sklearn.dummy.DummyRegressor, + strategy=strategy, constant=constant, quantile=quantile) diff --git a/src/sensai/tensor_model.py b/src/sensai/tensor_model.py index c853ec8f..77f858f2 100644 --- a/src/sensai/tensor_model.py +++ b/src/sensai/tensor_model.py @@ -262,15 +262,15 @@ def isRegressionModel(self) -> bool: def getNumPredictedClasses(self): return self._numPredictedClasses - def fit(self, X: pd.DataFrame, Y: pd.DataFrame, fitPreprocessors=True): + def fit(self, X: pd.DataFrame, Y: pd.DataFrame, fitPreprocessors=True, fitModel=True): """ :param X: data frame containing input tensors on which to train :param Y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape `(*predictionShape, numLabels)` - :param fitPreprocessors: - :return: + :param fitPreprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted + :param fitModel: whether the model itself shall be fitted """ if len(Y.columns) != 1: raise ValueError(f"{self.__class__.__name__} requires exactly one output " @@ -290,7 +290,7 @@ def fit(self, X: pd.DataFrame, Y: pd.DataFrame, fitPreprocessors=True): f"predictionShape. If the predictions are scalars, a TensorToScalarClassificationModel " f"should be used instead of {self.__class__.__name__}") self._numPredictedClasses = dfYToCheck.shape[-1] - super().fit(X, Y, fitPreprocessors=fitPreprocessors) + super().fit(X, Y, fitPreprocessors=fitPreprocessors, fitModel=fitModel) def getModelOutputShape(self): # The ground truth contains one-hot-encoded labels in the last dimension diff --git a/src/sensai/torch/__init__.py b/src/sensai/torch/__init__.py index 153c6431..a08a8031 100644 --- a/src/sensai/torch/__init__.py +++ b/src/sensai/torch/__init__.py @@ -1,5 +1,5 @@ from .
import torch_modules as modules, torch_models as models -from .torch_data import TensorScaler, DataUtil, VectorDataUtil -from .torch_opt import NNLossEvaluatorRegression, NNLossEvaluator, NNOptimiser -from .torch_base import TorchModel, VectorTorchModel, TorchVectorRegressionModel, \ - TorchVectorClassificationModel +from .torch_data import * +from .torch_opt import * +from .torch_base import * +from .torch_enums import * \ No newline at end of file diff --git a/src/sensai/torch/torch_base.py b/src/sensai/torch/torch_base.py index 1dc18613..b11d551f 100644 --- a/src/sensai/torch/torch_base.py +++ b/src/sensai/torch/torch_base.py @@ -10,12 +10,14 @@ from torch.nn import functional as F from .torch_data import TensorScaler, VectorDataUtil, ClassificationVectorDataUtil, TorchDataSet, \ - TorchDataSetProviderFromDataUtil, TorchDataSetProvider, Tensoriser, TorchDataSetFromDataFrames, RuleBasedTensoriser + TorchDataSetProvider, Tensoriser, TorchDataSetFromDataFrames, RuleBasedTensoriser, \ + TorchDataSetProviderFromVectorDataUtil from .torch_enums import ClassificationOutputMode from .torch_opt import NNOptimiser, NNLossEvaluatorRegression, NNLossEvaluatorClassification, NNOptimiserParams, TrainingInfo from ..data import DataFrameSplitter from ..normalisation import NormalisationMode from ..util.dtype import toFloatArray +from ..util.pickle import setstate from ..util.string import ToStringMixin from ..vector_model import VectorRegressionModel, VectorClassificationModel, TrainingContext @@ -107,6 +109,9 @@ def __init__(self, cuda=True) -> None: self.trainingInfo: Optional[TrainingInfo] = None self._gpu: Optional[int] = None + def _toStringExcludePrivate(self) -> bool: + return True + def setTorchModule(self, module: torch.nn.Module) -> None: self.module = module @@ -185,7 +190,7 @@ def apply(self, X: Union[torch.Tensor, np.ndarray, TorchDataSet, Sequence[torch. mcDropoutSamples: Optional[int] = None, mcDropoutProbability: Optional[float] = None, scaleOutput: bool = False, scaleInput: bool = False) -> Union[torch.Tensor, np.ndarray, Tuple]: """ - Applies the model to the given input tensor and returns the result (normalized) + Applies the model to the given input tensor and returns the result :param X: the input tensor (either a batch or, if createBatch=True, a single data point), a data set or a tuple/list of tensors (if the model accepts more than one input). @@ -194,8 +199,8 @@ def apply(self, X: Union[torch.Tensor, np.ndarray, TorchDataSet, Sequence[torch. 
:param createBatch: whether to add an additional tensor dimension for a batch containing just one data point :param mcDropoutSamples: if not None, apply MC-Dropout-based inference with the respective number of samples; if None, apply regular inference :param mcDropoutProbability: the probability with which to apply dropouts in MC-Dropout-based inference; if None, use model's default - :param scaleOutput: whether to scale the output that is produced by the underlying model (using this instance's output scaler) - :param scaleInput: whether to scale the input (using this instance's input scaler) before applying the underlying model + :param scaleOutput: whether to scale the output that is produced by the underlying model (using this instance's output scaler, if any) + :param scaleInput: whether to scale the input (using this instance's input scaler, if any) before applying the underlying model :return: an output tensor or, if MC-Dropout is applied, a pair (y, sd) where y the mean output tensor and sd is a tensor of the same dimension containing standard deviations @@ -234,13 +239,11 @@ def extract(z): # check input normalisation if self.NORMALISATION_CHECK_THRESHOLD is not None: - maxValue = 0.0 - for t in inputs: + for i, t in enumerate(inputs): if t.is_floating_point() and t.numel() > 0: # skip any integer tensors (which typically contain lengths) and empty tensors - maxValue = max(t.abs().max().item(), maxValue) - if maxValue > self.NORMALISATION_CHECK_THRESHOLD: - log.warning("Received input which is likely to not be correctly normalised: maximum abs. value in input tensor is %f" % maxValue) - + maxValue = t.abs().max().item() + if maxValue > self.NORMALISATION_CHECK_THRESHOLD: + log.warning(f"Received value in input tensor {i} which is likely to not be correctly normalised: maximum abs. 
value in tensor is %f" % maxValue) if mcDropoutSamples is None: y = model(*inputs) return extract(y) @@ -332,6 +335,15 @@ def createTorchModule(self) -> torch.nn.Module: return self.moduleFactory(*self.args, **self.kwargs) +class TorchModelFromModule(TorchModel): + def __init__(self, module: torch.nn.Module, cuda: bool = True): + super().__init__(cuda=cuda) + self.module = module + + def createTorchModule(self) -> torch.nn.Module: + return self.module + + class VectorTorchModel(TorchModel, ABC): """ Base class for TorchModels that can be used within VectorModels, where the input and output dimensions @@ -352,6 +364,13 @@ def createTorchModule(self) -> torch.nn.Module: @abstractmethod def createTorchModuleForDims(self, inputDim: int, outputDim: int) -> torch.nn.Module: + """ + :param inputDim: the number of input dimensions as reported by the data set provider (number of columns + in input data frame for default providers) + :param outputDim: the number of output dimensions as reported by the data set provider (for default providers, + this will be the nnumber of columns in the output data frame or, for classification, the number of classes) + :return: the torch module + """ pass @@ -365,9 +384,9 @@ def __init__(self, modelClass: Callable[..., TorchModel], modelArgs: Sequence = normalisationMode: NormalisationMode = NormalisationMode.NONE, nnOptimiserParams: Union[dict, NNOptimiserParams, None] = None) -> None: """ - :param modelClass: the constructor with which to create the wrapped torch vector model - :param modelArgs: the constructor argument list to pass to modelClass - :param modelKwArgs: the dictionary of constructor keyword arguments to pass to modelClass + :param modelClass: the constructor/factory function with which to create the contained TorchModel instance + :param modelArgs: the constructor argument list to pass to ``modelClass`` + :param modelKwArgs: the dictionary of constructor keyword arguments to pass to ``modelClass`` :param normalisationMode: the normalisation mode to apply to input data frames :param nnOptimiserParams: the parameters to apply in NNOptimiser during training """ @@ -390,7 +409,7 @@ def __init__(self, modelClass: Callable[..., TorchModel], modelArgs: Sequence = self.model: Optional[TorchModel] = None self.inputTensoriser: Optional[Tensoriser] = None self.outputTensoriser: Optional[Tensoriser] = None - self.outputTensorToArrayConverter = None + self.outputTensorToArrayConverter: Optional[OutputTensorToArrayConverter] = None self.torchDataSetProviderFactory: Optional[TorchDataSetProviderFactory] = None self.dataFrameSplitter: Optional[DataFrameSplitter] = None @@ -398,36 +417,46 @@ def __setstate__(self, state) -> None: state["nnOptimiserParams"] = NNOptimiserParams.fromDictOrInstance(state["nnOptimiserParams"]) newOptionalMembers = ["inputTensoriser", "torchDataSetProviderFactory", "dataFrameSplitter", "outputTensoriser", "outputTensorToArrayConverter"] - for m in newOptionalMembers: - if m not in state: - state[m] = None - s = super() - if hasattr(s, '__setstate__'): - s.__setstate__(state) - else: - self.__dict__ = state + setstate(TorchVectorRegressionModel, self, state, newOptionalProperties=newOptionalMembers) + + @classmethod + def fromModule(cls, module: torch.nn.Module, cuda=True, normalisationMode: NormalisationMode = NormalisationMode.NONE, + nnOptimiserParams: Optional[NNOptimiserParams] = None) -> "TorchVectorRegressionModel": + return cls(TorchModelFromModule, modelKwArgs=dict(module=module, cuda=cuda), normalisationMode=normalisationMode, + 
nnOptimiserParams=nnOptimiserParams) def withInputTensoriser(self, tensoriser: Tensoriser) -> __qualname__: + """ + :param tensoriser: tensoriser to use in order to convert input data frames to (one or more) tensors. + The default tensoriser directly converts the data frame's values (which is assumed to contain only scalars that + can be coerced to floats) to a float tensor. + The use of a custom tensoriser is necessary if a non-trivial conversion is necessary or if the data frame + is to be converted to more than one input tensor. + :return: self + """ self.inputTensoriser = tensoriser return self def withOutputTensoriser(self, tensoriser: RuleBasedTensoriser) -> __qualname__: """ :param tensoriser: tensoriser to use in order to convert the output data frame to a tensor. + The default output tensoriser directly converts the data frame's values to a float tensor. + NOTE: It is required to be a rule-based tensoriser, because mechanisms that require fitting on the data (and thus perform a data-dependent conversion) are likely to cause problems, as they would need to be reversed at inference time (since the model will be trained on the converted values). If you require a transformation, use a target transformer, which will be applied before the tensoriser. + :return: self """ self.outputTensoriser = tensoriser return self - def withOutputTensorToArrayConverter(self, outputTensorToArrayConverter) -> __qualname__: + def withOutputTensorToArrayConverter(self, outputTensorToArrayConverter: "OutputTensorToArrayConverter") -> __qualname__: """ Configures the use of a custom converter from tensors to numpy arrays, which is applied during inference. A custom converter can be required, for example, to handle variable-length outputs (where the output tensor will typically contain unwanted padding). Note that since the converter is for inference only, it may be - required to use a custom loss evaluator during training. + required to use a custom loss evaluator during training when such a converter is used. :param outputTensorToArrayConverter: the converter :return: self @@ -435,10 +464,23 @@ def withOutputTensorToArrayConverter(self, outputTensorToArrayConverter) -> __qu self.outputTensorToArrayConverter = outputTensorToArrayConverter def withTorchDataSetProviderFactory(self, torchDataSetProviderFactory: "TorchDataSetProviderFactory") -> __qualname__: + """ + :param torchDataSetProviderFactory: the torch data set provider factory, which is used to instantiate the provider which + will provide the training and validation data sets from the input data frame that is passed in for learning. + By default, TorchDataSetProviderFactoryRegressionDefault is used. + :return: self + """ self.torchDataSetProviderFactory = torchDataSetProviderFactory return self def withDataFrameSplitter(self, dataFrameSplitter: DataFrameSplitter) -> __qualname__: + """ + :param dataFrameSplitter: the data frame splitter which is used to split the input/output data frames that are passed for + learning into a data frame that is used for training and a data frame that is used for validation. + The input data frame is the data frame that is passed as input to the splitter, and the returned indices + are used to split both the input and output data frames in the same way. 
+ :return: self + """ self.dataFrameSplitter = dataFrameSplitter return self @@ -479,7 +521,11 @@ def _predict(self, inputs: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame(yArray, columns=self.getModelOutputVariableNames()) def _toStringExcludes(self) -> List[str]: - return super()._toStringExcludes() + ["modelClass", "modelArgs", "modelKwArgs", "inputTensoriser"] + excludes = super()._toStringExcludes() + if self.model is not None: + return excludes + ["modelClass", "modelArgs", "modelKwArgs"] + else: + return excludes class TorchVectorClassificationModel(VectorClassificationModel): @@ -493,7 +539,7 @@ def __init__(self, outputMode: ClassificationOutputMode, nnOptimiserParams: Union[dict, NNOptimiserParams, None] = None) -> None: """ :param outputMode: specifies the nature of the output of the underlying neural network model - :param modelClass: the constructor with which to create the wrapped torch vector model + :param modelClass: the constructor with which to create the wrapped torch model :param modelArgs: the constructor argument list to pass to modelClass :param modelKwArgs: the dictionary of constructor keyword arguments to pass to modelClass :param normalisationMode: the normalisation mode to apply to input data frames @@ -526,18 +572,26 @@ def __init__(self, outputMode: ClassificationOutputMode, def __setstate__(self, state) -> None: state["nnOptimiserParams"] = NNOptimiserParams.fromDictOrInstance(state["nnOptimiserParams"]) newOptionalMembers = ["inputTensoriser", "torchDataSetProviderFactory", "dataFrameSplitter", "outputTensoriser"] - for m in newOptionalMembers: - if m not in state: - state[m] = None - if "outputMode" not in state: - state["outputMode"] = ClassificationOutputMode.PROBABILITIES - s = super() - if hasattr(s, '__setstate__'): - s.__setstate__(state) - else: - self.__dict__ = state + newDefaultProperties = {"outputMode": ClassificationOutputMode.PROBABILITIES} + setstate(TorchVectorClassificationModel, self, state, newOptionalProperties=newOptionalMembers, + newDefaultProperties=newDefaultProperties) + + @classmethod + def fromModule(cls, module: torch.nn.Module, outputMode: ClassificationOutputMode, cuda=True, + normalisationMode: NormalisationMode = NormalisationMode.NONE, + nnOptimiserParams: Optional[NNOptimiserParams] = None) -> "TorchVectorClassificationModel": + return cls(outputMode, TorchModelFromModule, modelKwArgs=dict(module=module, cuda=cuda), + normalisationMode=normalisationMode, nnOptimiserParams=nnOptimiserParams) def withInputTensoriser(self, tensoriser: Tensoriser) -> __qualname__: + """ + :param tensoriser: tensoriser to use in order to convert input data frames to (one or more) tensors. + The default tensoriser directly converts the data frame's values (which is assumed to contain only scalars that + can be coerced to floats) to a float tensor. + The use of a custom tensoriser is necessary if a non-trivial conversion is necessary or if the data frame + is to be converted to more than one input tensor. + :return: self + """ self.inputTensoriser = tensoriser return self @@ -553,10 +607,23 @@ def withOutputTensoriser(self, tensoriser: RuleBasedTensoriser) -> __qualname__: return self def withTorchDataSetProviderFactory(self, torchDataSetProviderFactory: "TorchDataSetProviderFactory") -> __qualname__: + """ + :param torchDataSetProviderFactory: the torch data set provider factory, which is used to instantiate the provider which + will provide the training and validation data sets from the input data frame that is passed in for learning. 
+ By default, TorchDataSetProviderFactoryClassificationDefault is used. + :return: self + """ self.torchDataSetProviderFactory = torchDataSetProviderFactory return self def withDataFrameSplitter(self, dataFrameSplitter: DataFrameSplitter) -> __qualname__: + """ + :param dataFrameSplitter: the data frame splitter which is used to split the input/output data frames that are passed for + learning into a data frame that is used for training and a data frame that is used for validation. + The input data frame is the data frame that is passed as input to the splitter, and the returned indices + are used to split both the input and output data frames in the same way. + :return: self + """ self.dataFrameSplitter = dataFrameSplitter return self @@ -608,7 +675,11 @@ def _predictClassProbabilities(self, inputs: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame(y.numpy(), columns=self._labels) def _toStringExcludes(self) -> List[str]: - return super()._toStringExcludes() + ["modelClass", "modelArgs", "modelKwArgs", "inputTensoriser"] + excludes = super()._toStringExcludes() + if self.model is not None: + return excludes + ["modelClass", "modelArgs", "modelKwArgs"] + else: + return excludes class TorchDataSetProviderFactory(ABC): @@ -621,22 +692,36 @@ def createDataSetProvider(self, inputs: pd.DataFrame, outputs: pd.DataFrame, class TorchDataSetProviderFactoryClassificationDefault(TorchDataSetProviderFactory): + def __init__(self, tensoriseDynamically=False): + """ + :param tensoriseDynamically: whether tensorisation shall take place on the fly whenever the provided data sets are iterated; + if False, tensorisation takes place once in a precomputation stage (tensors must jointly fit into memory) + """ + self.tensoriseDynamically = tensoriseDynamically + def createDataSetProvider(self, inputs: pd.DataFrame, outputs: pd.DataFrame, model: TorchVectorClassificationModel, trainingContext: TrainingContext, inputTensoriser: Optional[Tensoriser], outputTensoriser: Optional[Tensoriser], dataFrameSplitter: Optional[DataFrameSplitter]) -> TorchDataSetProvider: dataUtil = ClassificationVectorDataUtil(inputs, outputs, model.model.cuda, len(model._labels), normalisationMode=model.normalisationMode, inputTensoriser=inputTensoriser, outputTensoriser=outputTensoriser, dataFrameSplitter=dataFrameSplitter) - return TorchDataSetProviderFromDataUtil(dataUtil, model.model.cuda) + return TorchDataSetProviderFromVectorDataUtil(dataUtil, model.model.cuda, tensoriseDynamically=self.tensoriseDynamically) class TorchDataSetProviderFactoryRegressionDefault(TorchDataSetProviderFactory): + def __init__(self, tensoriseDynamically=False): + """ + :param tensoriseDynamically: whether tensorisation shall take place on the fly whenever the provided data sets are iterated; + if False, tensorisation takes place once in a precomputation stage (tensors must jointly fit into memory) + """ + self.tensoriseDynamically = tensoriseDynamically + def createDataSetProvider(self, inputs: pd.DataFrame, outputs: pd.DataFrame, model: TorchVectorRegressionModel, trainingContext: TrainingContext, inputTensoriser: Optional[Tensoriser], outputTensoriser: Optional[Tensoriser], dataFrameSplitter: Optional[DataFrameSplitter]) -> TorchDataSetProvider: dataUtil = VectorDataUtil(inputs, outputs, model.model.cuda, normalisationMode=model.normalisationMode, inputTensoriser=inputTensoriser, outputTensoriser=outputTensoriser, dataFrameSplitter=dataFrameSplitter) - return TorchDataSetProviderFromDataUtil(dataUtil, model.model.cuda) + return 
TorchDataSetProviderFromVectorDataUtil(dataUtil, model.model.cuda, tensoriseDynamically=self.tensoriseDynamically) class OutputTensorToArrayConverter(ABC): diff --git a/src/sensai/torch/torch_data.py b/src/sensai/torch/torch_data.py index 51f58630..1a0dac6b 100644 --- a/src/sensai/torch/torch_data.py +++ b/src/sensai/torch/torch_data.py @@ -1,5 +1,6 @@ import logging from abc import ABC, abstractmethod +import math from typing import Tuple, Sequence, Optional, Union, List, Iterator import numpy as np @@ -485,7 +486,15 @@ def _get_batches(self, tensorTuples: Sequence[TensorTuple], batch_size, shuffle) index = torch.LongTensor(range(length)) start_idx = 0 while start_idx < length: - end_idx = min(length, start_idx + batch_size) + remaining_items = length - start_idx + is_second_last_batch = remaining_items <= 2*batch_size and remaining_items > batch_size + if is_second_last_batch: + # to avoid cases where the last batch is excessively small (1 item in the worst case, where e.g. batch + # normalisation would not be applicable), we evenly distribute the items across the last two batches + adjusted_batch_size = math.ceil(remaining_items / 2) + end_idx = min(length, start_idx + adjusted_batch_size) + else: + end_idx = min(length, start_idx + batch_size) excerpt = index[start_idx:end_idx] batch = [] for tensorTuple in tensorTuples: @@ -504,7 +513,7 @@ def _get_batches(self, tensorTuples: Sequence[TensorTuple], batch_size, shuffle) yield batch[0] else: yield tuple(batch) - start_idx += batch_size + start_idx = end_idx def size(self): return len(self.x) @@ -598,6 +607,18 @@ def provideSplit(self, fractionalSizeOfFirstSet: float) -> Tuple[TorchDataSet, T return TorchDataSetFromTensors(x1, y1, self.cuda), TorchDataSetFromTensors(x2, y2, self.cuda) +class TorchDataSetProviderFromVectorDataUtil(TorchDataSetProvider): + def __init__(self, dataUtil: VectorDataUtil, cuda: bool, tensoriseDynamically=False): + super().__init__(inputTensorScaler=dataUtil.getInputTensorScaler(), outputTensorScaler=dataUtil.getOutputTensorScaler(), + inputDim=dataUtil.inputDim(), modelOutputDim=dataUtil.modelOutputDim()) + self.dataUtil = dataUtil + self.cuda = cuda + self.tensoriseDynamically = tensoriseDynamically + + def provideSplit(self, fractionalSizeOfFirstSet: float) -> Tuple[TorchDataSet, TorchDataSet]: + return self.dataUtil.splitIntoDataSets(fractionalSizeOfFirstSet, self.cuda, tensoriseDynamically=self.tensoriseDynamically) + + class TensorTransformer(ABC): @abstractmethod def transform(self, t: torch.Tensor) -> torch.Tensor: diff --git a/src/sensai/torch/torch_enums.py b/src/sensai/torch/torch_enums.py index 3025b725..c8f6b899 100644 --- a/src/sensai/torch/torch_enums.py +++ b/src/sensai/torch/torch_enums.py @@ -20,7 +20,7 @@ def fromName(cls, name) -> "ActivationFunction": return item raise ValueError(f"No function found for name '{name}'") - def getTorchFunction(self) -> Callable: + def getTorchFunction(self) -> Optional[Callable]: return { ActivationFunction.NONE: None, ActivationFunction.SIGMOID: F.sigmoid, @@ -62,9 +62,13 @@ class ClassificationOutputMode(Enum): UNNORMALISED_LOG_PROBABILITIES = "unnormalised_log_probabilities" @classmethod - def forActivationFn(cls, fn: Optional[Callable]): + def forActivationFn(cls, fn: Optional[Union[Callable, ActivationFunction]]): + if isinstance(fn, ActivationFunction): + fn = fn.getTorchFunction() if fn is None: return cls.UNNORMALISED_LOG_PROBABILITIES + if not callable(fn): + raise ValueError(fn) if isinstance(fn, functools.partial): fn = fn.func name = fn.__name__ 
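The new `fromModule` factory methods on `TorchVectorRegressionModel` and `TorchVectorClassificationModel` allow a plain `torch.nn.Module` to be used as a sensAI vector model without defining a dedicated `TorchModel` subclass. A minimal regression sketch based on the signatures introduced in this diff; the data, the column names and the reliance on default optimiser settings (including a default loss evaluator, as for the bundled MLP models) are assumptions rather than part of the change:

```python
import numpy as np
import pandas as pd
import torch

from sensai.torch import TorchVectorRegressionModel

# a plain torch module mapping 3 input features to 1 output (dimensions are hypothetical)
module = torch.nn.Sequential(
    torch.nn.Linear(3, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 1),
)

# wrap the module directly instead of defining a TorchModel subclass
model = TorchVectorRegressionModel.fromModule(module, cuda=False)

# hypothetical training data: three feature columns and one target column
X = pd.DataFrame(np.random.randn(200, 3), columns=["f1", "f2", "f3"])
Y = pd.DataFrame({"target": X.sum(axis=1)})

model.fit(X, Y)  # assumes default NNOptimiser settings (incl. a default loss evaluator) are applied
print(model.predict(X.head()))
```

The classification variant works analogously via `TorchVectorClassificationModel.fromModule`, which additionally takes the `ClassificationOutputMode` matching the module's final activation.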
diff --git a/src/sensai/torch/torch_eval_util.py b/src/sensai/torch/torch_eval_util.py index 5bfc152b..1cb59839 100644 --- a/src/sensai/torch/torch_eval_util.py +++ b/src/sensai/torch/torch_eval_util.py @@ -1,22 +1,23 @@ from typing import Union -from ..evaluation import RegressionEvaluationUtil -from ..evaluation.crossval import VectorModelCrossValidationData -from ..evaluation.eval_util import TEvalData, TCrossValData -from ..evaluation.evaluator import VectorModelEvaluationData from . import TorchVectorRegressionModel +from ..evaluation import RegressionEvaluationUtil +from ..evaluation.crossval import VectorModelCrossValidationData, VectorRegressionModelCrossValidationData +from ..evaluation.eval_util import EvaluationResultCollector +from ..evaluation.evaluator import VectorModelEvaluationData, VectorRegressionModelEvaluationData class TorchVectorRegressionModelEvaluationUtil(RegressionEvaluationUtil): - def _createPlots(self, data: Union[TEvalData, TCrossValData], resultCollector: RegressionEvaluationUtil.ResultCollector, subtitle=None): + def _createPlots(self, data: Union[VectorRegressionModelEvaluationData, VectorRegressionModelCrossValidationData], resultCollector: EvaluationResultCollector, + subtitle=None): super()._createPlots(data, resultCollector, subtitle) if isinstance(data, VectorModelEvaluationData): self._addLossProgressionPlotIfTorchVectorRegressionModel(data.model, "loss-progression", resultCollector) elif isinstance(data, VectorModelCrossValidationData): if data.trainedModels is not None: for i, model in enumerate(data.trainedModels, start=1): - self._addLossProgressionPlotIfTorchVectorRegressionModel(model, "loss-progression-{i}", resultCollector) + self._addLossProgressionPlotIfTorchVectorRegressionModel(model, f"loss-progression-{i}", resultCollector) @staticmethod def _addLossProgressionPlotIfTorchVectorRegressionModel(model, plotName, resultCollector): diff --git a/src/sensai/torch/torch_models/mlp/mlp_models.py b/src/sensai/torch/torch_models/mlp/mlp_models.py index dae5bf73..f1caec26 100644 --- a/src/sensai/torch/torch_models/mlp/mlp_models.py +++ b/src/sensai/torch/torch_models/mlp/mlp_models.py @@ -14,15 +14,25 @@ class MultiLayerPerceptronTorchModel(VectorTorchModel): def __init__(self, cuda: bool, hiddenDims: Sequence[int], hidActivationFunction: Callable[[torch.Tensor], torch.Tensor], - outputActivationFunction: Optional[Callable[[torch.Tensor], torch.Tensor]], pDropout: Optional[float] = None) -> None: + outputActivationFunction: Optional[Callable[[torch.Tensor], torch.Tensor]], pDropout: Optional[float] = None, + inputDim: Optional[int] = None) -> None: + """ + :param cuda: whether to enable CUDA + :param hiddenDims: the sequence of hidden layer dimensions + :param hidActivationFunction: the activation function to use for hidden layers + :param outputActivationFunction: the output activation function + :param pDropout: the dropout probability for training + :param inputDim: the input dimension; if None, use dimensions determined by the input data (number of columns in data frame) + """ super().__init__(cuda=cuda) self.hidActivationFunction = ActivationFunction.torchFunctionFromAny(hidActivationFunction) self.outputActivationFunction = ActivationFunction.torchFunctionFromAny(outputActivationFunction) self.hiddenDims = hiddenDims self.pDropout = pDropout + self.overrideInputDim = inputDim def createTorchModuleForDims(self, inputDim: int, outputDim: int) -> torch.nn.Module: - return MultiLayerPerceptron(inputDim, outputDim, self.hiddenDims, + return
MultiLayerPerceptron(inputDim if self.overrideInputDim is None else self.overrideInputDim, outputDim, self.hiddenDims, hidActivationFn=self.hidActivationFunction, outputActivationFn=self.outputActivationFunction, pDropout=self.pDropout) @@ -30,13 +40,15 @@ def createTorchModuleForDims(self, inputDim: int, outputDim: int) -> torch.nn.Mo class MultiLayerPerceptronVectorRegressionModel(TorchVectorRegressionModel): def __init__(self, hiddenDims: Sequence[int] = (5, 5), hidActivationFunction: Callable[[torch.Tensor], torch.Tensor] = torch.sigmoid, outputActivationFunction: Optional[Callable[[torch.Tensor], torch.Tensor]] = None, - normalisationMode: NormalisationMode = NormalisationMode.MAX_BY_COLUMN, + inputDim: Optional[int] = None, + normalisationMode: NormalisationMode = NormalisationMode.NONE, cuda: bool = True, pDropout: Optional[float] = None, nnOptimiserParams: Optional[NNOptimiserParams] = None, **nnOptimiserDictParams) -> None: """ :param hiddenDims: sequence containing the number of neurons to use in hidden layers :param hidActivationFunction: the activation function (torch.nn.functional.* or torch.*) to use for all hidden layers :param outputActivationFunction: the output activation function (torch.nn.functional.* or torch.* or None) + :param inputDim: the input dimension; if None, use dimensions determined by the input data (number of columns in data frame) :param normalisationMode: the normalisation mode to apply to input and output data :param cuda: whether to use CUDA (GPU acceleration) :param pDropout: the probability with which to apply dropouts after each hidden layer @@ -45,19 +57,21 @@ def __init__(self, hiddenDims: Sequence[int] = (5, 5), hidActivationFunction: Ca """ nnOptimiserParams = NNOptimiserParams.fromEitherDictOrInstance(nnOptimiserDictParams, nnOptimiserParams) super().__init__(MultiLayerPerceptronTorchModel, [cuda, hiddenDims, hidActivationFunction, outputActivationFunction], - dict(pDropout=pDropout), normalisationMode, nnOptimiserParams) + dict(pDropout=pDropout, inputDim=inputDim), normalisationMode, nnOptimiserParams) class MultiLayerPerceptronVectorClassificationModel(TorchVectorClassificationModel): def __init__(self, hiddenDims: Sequence[int] = (5, 5), hidActivationFunction: Callable[[torch.Tensor], torch.Tensor] = torch.sigmoid, - outputActivationFunction: Optional[Union[Callable[[torch.Tensor], torch.Tensor], str, ActivationFunction]] = torch.nn.functional.log_softmax, - normalisationMode: NormalisationMode = NormalisationMode.MAX_BY_COLUMN, cuda: bool = True, pDropout: Optional[float] = None, + outputActivationFunction: Optional[Union[Callable[[torch.Tensor], torch.Tensor], str, ActivationFunction]] = ActivationFunction.LOG_SOFTMAX, + inputDim: Optional[int] = None, + normalisationMode: NormalisationMode = NormalisationMode.NONE, cuda: bool = True, pDropout: Optional[float] = None, nnOptimiserParams: Optional[NNOptimiserParams] = None, **nnOptimiserDictParams) -> None: """ :param hiddenDims: sequence containing the number of neurons to use in hidden layers :param hidActivationFunction: the activation function (torch.nn.functional.* or torch.*) to use for all hidden layers :param outputActivationFunction: the output activation function (function from torch.nn.functional.*, function name, enum instance or None) + :param inputDim: the input dimension; if None, use dimensions determined by the input data (number of columns in data frame) :param normalisationMode: the normalisation mode to apply to input and output data :param cuda: whether to use CUDA (GPU 
acceleration) :param pDropout: the probability with which to apply dropouts after each hidden layer @@ -67,4 +81,4 @@ def __init__(self, hiddenDims: Sequence[int] = (5, 5), nnOptimiserParams = NNOptimiserParams.fromEitherDictOrInstance(nnOptimiserDictParams, nnOptimiserParams) outputMode = ClassificationOutputMode.forActivationFn(ActivationFunction.torchFunctionFromAny(outputActivationFunction)) super().__init__(outputMode, MultiLayerPerceptronTorchModel, [cuda, hiddenDims, hidActivationFunction, outputActivationFunction], - dict(pDropout=pDropout), normalisationMode, nnOptimiserParams) + dict(pDropout=pDropout, inputDim=inputDim), normalisationMode, nnOptimiserParams) diff --git a/src/sensai/torch/torch_models/mlp/mlp_modules.py b/src/sensai/torch/torch_models/mlp/mlp_modules.py index 52c5eaf5..7f46d692 100644 --- a/src/sensai/torch/torch_models/mlp/mlp_modules.py +++ b/src/sensai/torch/torch_models/mlp/mlp_modules.py @@ -4,7 +4,7 @@ from torch import nn from ...torch_base import MCDropoutCapableNNModule -from ....util.string import objectRepr +from ....util.string import objectRepr, functionName class MultiLayerPerceptron(MCDropoutCapableNNModule): @@ -31,8 +31,8 @@ def __init__(self, inputDim: float, outputDim: float, hiddenDims: Sequence[int], def __str__(self): return objectRepr(self, dict(inputDim=self.inputDim, outputDim=self.outputDim, hiddenDims=self.hiddenDims, - hidActivationFn=self.hidActivationFn.__name__ if self.hidActivationFn is not None else None, - outputActivationFn=self.outputActivationFn.__name__ if self.outputActivationFn is not None else None, + hidActivationFn=functionName(self.hidActivationFn) if self.hidActivationFn is not None else None, + outputActivationFn=functionName(self.outputActivationFn) if self.outputActivationFn is not None else None, pDropout=self.pDropout)) def forward(self, x): diff --git a/src/sensai/torch/torch_opt.py b/src/sensai/torch/torch_opt.py index 3e155159..168d41ce 100644 --- a/src/sensai/torch/torch_opt.py +++ b/src/sensai/torch/torch_opt.py @@ -41,13 +41,20 @@ class Optimiser(enum.Enum): LBFGS = ("lbfgs", optim.LBFGS) @classmethod - def fromName(cls, name: str): + def fromName(cls, name: str) -> "Optimiser": lname = name.lower() for o in cls: if o.value[0] == lname: return o raise ValueError(f"Unknown optimiser name '{name}'; known names: {[o.value[0] for o in cls]}") + @classmethod + def fromNameOrInstance(cls, nameOrInstance: Union[str, "Optimiser"]) -> "Optimiser": + if type(nameOrInstance) == str: + return cls.fromName(nameOrInstance) + else: + return nameOrInstance + class _Optimiser(object): """ @@ -61,10 +68,7 @@ def __init__(self, params, method: Union[str, Optimiser], lr, max_grad_norm, use :param max_grad_norm: gradient norm value beyond which to apply gradient shrinkage :param optimiserArgs: keyword arguments to be used in actual torch optimiser """ - if type(method) == str: - self.method = Optimiser.fromName(method) - else: - self.method = method + self.method = Optimiser.fromNameOrInstance(method) self.params = list(params) # careful: params may be a generator self.last_ppl = None self.lr = lr @@ -119,7 +123,7 @@ def startEpoch(self) -> None: def computeTrainBatchLoss(self, modelOutput, groundTruth, X, Y) -> torch.Tensor: """ Computes the loss for the given model outputs and ground truth values for a batch - and aggregates the computed loss values such that getEpochLoss can return an appropriate + and aggregates the computed loss values such that :meth:``getEpochTrainLoss`` can return an appropriate result for the entire 
epoch. The original batch tensors X and Y are provided as meta-information only. @@ -538,7 +542,7 @@ def __init__(self, lossEvaluator: NNLossEvaluator = None, gpu=None, optimiser: U """ :param lossEvaluator: the loss evaluator to use :param gpu: the index of the GPU to be used (if CUDA is enabled for the model to be trained); if None, default to first GPU - :param optimiser: the name of the optimizer to be used; defaults to "adam" + :param optimiser: the optimiser to use :param optimiserLR: the optimiser's learning rate :param earlyStoppingEpochs: the number of epochs without validation score improvement after which to abort training and use the best epoch's model (early stopping); if None, never abort training before all epochs are completed @@ -548,12 +552,12 @@ def __init__(self, lossEvaluator: NNLossEvaluator = None, gpu=None, optimiser: U If no validation is to be performed, pass 1.0. :param scaledOutputs: whether to scale all outputs, resulting in computations of the loss function based on scaled values rather than normalised values. Enabling scaling may not be appropriate in cases where there are multiple outputs on different scales/with completely different units. - :param useShrinkage: whether to apply shrinkage to gradients whose norm exceeds optimiserClip - :param shrinkageClip: the maximum gradient norm beyond which to apply shrinkage (if useShrinkage is True) + :param useShrinkage: whether to apply shrinkage to gradients whose norm exceeds ``shrinkageClip``, scaling the gradient down to ``shrinkageClip`` + :param shrinkageClip: the maximum gradient norm beyond which to apply shrinkage (if ``useShrinkage`` is True) :param shuffle: whether to shuffle the training data :param optimiserArgs: keyword arguments to be passed on to the actual torch optimiser """ - if optimiser == 'lbfgs': + if Optimiser.fromNameOrInstance(optimiser) == Optimiser.LBFGS: largeBatchSize = 1e12 if batchSize is not None: log.warning(f"LBFGS does not make use of batches, therefore using large batch size {largeBatchSize} to achieve use of a single batch") @@ -614,20 +618,7 @@ class NNOptimiser: def __init__(self, params: NNOptimiserParams): """ - :param cuda: whether to use CUDA - :param lossEvaluator: the loss evaluator to use - :param gpu: index of the gpu to be used (if CUDA is enabled in the model to be trained) - :param optimiser: the optimizer to be used; defaults to "adam" - :param optimiserClip: the maximum gradient norm beyond which to apply shrinkage (if useShrinkage is True) - :param optimiserLR: the optimiser's learning rate - :param batchSize: the batch size to use; for algorithms L-BFGS (optimiser='lbfgs'), which do not use batches, leave this at None. - If the algorithm uses batches and None is specified, batch size 64 will be used by default. - :param trainFraction: the fraction of the data used for training (with the remainder being used for validation). - If no validation is to be performed, pass 1.0. - :param scaledOutputs: whether to scale all outputs, resulting in computations of the loss function based on scaled values rather than normalised values. - Enabling scaling may not be appropriate in cases where there are multiple outputs on different scales/with completely different units. 
- :param useShrinkage: whether to apply shrinkage to gradients whose norm exceeds optimiserClip - :param optimiserArgs: keyword arguments to be passed on to the actual torch optimiser + :param params: parameters """ if params.lossEvaluator is None: raise ValueError("Must provide a loss evaluator") diff --git a/src/sensai/tracking/tracking_base.py b/src/sensai/tracking/tracking_base.py index 314b5a9a..dcb97a7b 100644 --- a/src/sensai/tracking/tracking_base.py +++ b/src/sensai/tracking/tracking_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, Any +from typing import Dict, Any, Optional class TrackedExperiment(ABC): @@ -10,8 +10,10 @@ def __init__(self, additionalLoggingValuesDict=None): """ self.additionalLoggingValuesDict = additionalLoggingValuesDict - def trackValues(self, valuesDict: Dict[str, Any]): + def trackValues(self, valuesDict: Dict[str, Any], addValuesDict: Dict[str, Any] = None): valuesDict = dict(valuesDict) + if addValuesDict is not None: + valuesDict.update(addValuesDict) if self.additionalLoggingValuesDict is not None: valuesDict.update(self.additionalLoggingValuesDict) self._trackValues(valuesDict) @@ -22,15 +24,14 @@ def _trackValues(self, valuesDict): class TrackingMixin(ABC): - # hackidy hack - _trackedExperimentAttributeName = "_trackedExperiment" + _objectId2trackedExperiment = {} - def setTrackedExperiment(self, trackedExperiment: TrackedExperiment): - setattr(self, self._trackedExperimentAttributeName, trackedExperiment) + def setTrackedExperiment(self, trackedExperiment: Optional[TrackedExperiment]): + self._objectId2trackedExperiment[id(self)] = trackedExperiment def unsetTrackedExperiment(self): - setattr(self, self._trackedExperimentAttributeName, None) + self.setTrackedExperiment(None) @property - def trackedExperiment(self): - return getattr(self, self._trackedExperimentAttributeName, None) + def trackedExperiment(self) -> Optional[TrackedExperiment]: + return self._objectId2trackedExperiment.get(id(self)) diff --git a/src/sensai/util/aggregation.py b/src/sensai/util/aggregation.py index 3abf4343..e875a6d2 100644 --- a/src/sensai/util/aggregation.py +++ b/src/sensai/util/aggregation.py @@ -1,5 +1,5 @@ import collections -from typing import Hashable, Dict +from typing import Hashable, Dict, Optional from .string import ToStringMixin @@ -39,7 +39,12 @@ def add(self, relativeFrequencyCounter: __qualname__) -> None: self.numTotal += relativeFrequencyCounter.numTotal self.numRelevant += relativeFrequencyCounter.numRelevant - def getRelativeFrequency(self) -> float: + def getRelativeFrequency(self) -> Optional[float]: + """ + :return: the relative frequency (between 0 and 1) or None if nothing was counted (0 events considered) + """ + if self.numTotal == 0: + return None return self.numRelevant / self.numTotal @@ -70,7 +75,7 @@ def _toStringObjectInfo(self): return ", ".join([f"{str(k)}: {v} ({v/self.totalCount:.3f})" for k, v in self.counts.items()]) -class WeightedMean: +class WeightedMean(ToStringMixin): """ Computes a weighted mean of values """ @@ -78,12 +83,15 @@ def __init__(self): self.weightedValueSum = 0 self.weightSum = 0 - def add(self, value, weight) -> None: + def _toStringObjectInfo(self) -> str: + return f"{self.weightedValueSum/self.weightSum}" + + def add(self, value, weight=1) -> None: """ Adds the given value with the the given weight to the calculation :param value: the value - :param weight: the weight with which to consider tha value + :param weight: the weight with which to consider the value """ 
self.weightedValueSum += value * weight self.weightSum += weight diff --git a/src/sensai/util/cache.py b/src/sensai/util/cache.py index 50a7e100..c0542e11 100644 --- a/src/sensai/util/cache.py +++ b/src/sensai/util/cache.py @@ -565,18 +565,20 @@ def _computeValue(self, key, data): def cached(fn: Callable[[], T], picklePath, functionName=None, validityCheckFn: Optional[Callable[[T], bool]] = None, - backend="pickle", protocol=pickle.HIGHEST_PROTOCOL) -> T: + backend="pickle", protocol=pickle.HIGHEST_PROTOCOL, load=True, version=None) -> T: """ :param fn: the function whose result is to be cached :param picklePath: the path in which to store the cached result :param functionName: the name of the function fn (for the case where its __name__ attribute is not informative) :param validityCheckFn: an optional function to call in order to check whether a cached result is still valid; - the function shall return True if the res is still valid and false otherwise. If a cached result is invalid, + the function shall return True if the result is still valid and false otherwise. If a cached result is invalid, the function fn is called to compute the result and the cached result is updated. :param backend: pickle or joblib :param protocol: the pickle protocol version - :return: the res (either obtained from the cache or the function) + :param load: whether to load a previously persisted result; if False, do not load an old result but store the newly computed result + :param version: if not None, previously persisted data will only be returned if it was stored with the same version + :return: the result (either obtained from the cache or the function) """ if functionName is None: functionName = fn.__name__ @@ -584,17 +586,34 @@ def cached(fn: Callable[[], T], picklePath, functionName=None, validityCheckFn: def callFnAndCacheResult(): res = fn() log.info(f"Saving cached result in {picklePath}") - dumpPickle(res, picklePath, backend=backend, protocol=protocol) + if version is not None: + persistedRes = {"__cacheVersion": version, "obj": res} + else: + persistedRes = res + dumpPickle(persistedRes, picklePath, backend=backend, protocol=protocol) return res if os.path.exists(picklePath): - log.info(f"Loading cached result of function '{functionName}' from {picklePath}") - result = loadPickle(picklePath, backend=backend) - if validityCheckFn is not None: - if not validityCheckFn(result): - log.info(f"Cached result is no longer valid, recomputing ...") - result = callFnAndCacheResult() - return result + if load: + log.info(f"Loading cached result of function '{functionName}' from {picklePath}") + result = loadPickle(picklePath, backend=backend) + if validityCheckFn is not None: + if not validityCheckFn(result): + log.info(f"Cached result is no longer valid, recomputing ...") + result = callFnAndCacheResult() + if version is not None: + cachedVersion = None + if type(result) == dict: + cachedVersion = result.get("__cacheVersion") + if cachedVersion != version: + log.info(f"Cached result has incorrect version ({cachedVersion}, expected {version}), recomputing ...") + result = callFnAndCacheResult() + else: + result = result["obj"] + return result + else: + log.info(f"Ignoring previously stored result in {picklePath}, calling function '{functionName}' ...") + return callFnAndCacheResult() else: log.info(f"No cached result found in {picklePath}, calling function '{functionName}' ...") return callFnAndCacheResult() @@ -605,21 +624,26 @@ class PickleCached(object): Function decorator for caching function results via 
pickle """ def __init__(self, cacheBasePath: str, filenamePrefix: str = None, filename: str = None, backend="pickle", - protocol=pickle.HIGHEST_PROTOCOL): + protocol=pickle.HIGHEST_PROTOCOL, load=True, version=None): """ :param cacheBasePath: the directory where the pickle cache file will be stored :param filenamePrefix: a prefix of the name of the cache file to be created, to which the function name and, where applicable, a hash code of the function arguments will be appended and ".cache.pickle" will be appended; if None, use "" (if filename has not been provided) - :param filename: the full file name of the cache file to be created; this is admissible only if the function has no arguments + :param filename: the full file name of the cache file to be created; if the function takes arguments, the filename must + contain a placeholder '%s' for the argument hash :param backend: the serialisation backend to use (see dumpPickle) :param protocol: the pickle protocol version to use + :param load: whether to load a previously persisted result; if False, do not load an old result but store the newly computed result + :param version: if not None, previously persisted data will only be returned if it was stored with the same version """ self.filename = filename self.cacheBasePath = cacheBasePath self.filenamePrefix = filenamePrefix self.backend = backend self.protocol = protocol + self.load = load + self.version = version if self.filenamePrefix is None: self.filenamePrefix = "" @@ -629,18 +653,27 @@ def __init__(self, cacheBasePath: str, filenamePrefix: str = None, filename: str def __call__(self, fn: Callable, *_args, **_kwargs): def wrapped(*args, **kwargs): + hashCodeStr = None haveArgs = len(args) > 0 or len(kwargs) > 0 + if haveArgs: + hashCodeStr = pickleHash((args, kwargs)) if self.filename is None: - filename = self.filenamePrefix + fn.__qualname__ - if haveArgs: - filename += "-" + pickleHash((args, kwargs)) + filename = self.filenamePrefix + fn.__qualname__.replace("..", ".") + if hashCodeStr is not None: + filename += "-" + hashCodeStr filename += ".cache.pickle" else: - if haveArgs: - raise Exception("Function called with arguments but full cache filename specified: specify a cache filename prefix only to account for argument values") - filename = self.filename + if hashCodeStr is not None: + if not "%s" in self.filename: + raise Exception("Function called with arguments but full cache filename contains no placeholder (%s) for argument hash") + filename = self.filename % hashCodeStr + else: + if "%s" in self.filename: + raise Exception("Function without arguments but full cache filename with placeholder (%s) was specified") + filename = self.filename picklePath = os.path.join(self.cacheBasePath, filename) - return cached(lambda: fn(*args, **kwargs), picklePath, functionName=fn.__name__, backend=self.backend) + return cached(lambda: fn(*args, **kwargs), picklePath, functionName=fn.__name__, backend=self.backend, load=self.load, + version=self.version) return wrapped diff --git a/src/sensai/util/datastruct.py b/src/sensai/util/datastruct.py index 27ca13d2..1b0f22dc 100644 --- a/src/sensai/util/datastruct.py +++ b/src/sensai/util/datastruct.py @@ -229,10 +229,10 @@ def closestKeyAndValue(self, key) -> Optional[Tuple[TKey, TValue]]: return None if idx is None else (self.keys[idx], self.values[idx]) def valueSliceInner(self, lowerBoundKey, upperBoundKey): - return array_util.valueSliceOuter(self.keys, lowerBoundKey, upperBoundKey, values=self.values) + return 
array_util.valueSliceInner(self.keys, lowerBoundKey, upperBoundKey, values=self.values) - def valueSliceOuter(self, lowerBoundKey, upperBoundKey): - return array_util.valueSliceOuter(self.keys, lowerBoundKey, upperBoundKey, values=self.values) + def valueSliceOuter(self, lowerBoundKey, upperBoundKey, fallback=False): + return array_util.valueSliceOuter(self.keys, lowerBoundKey, upperBoundKey, values=self.values, fallbackBounds=fallback) class SortedKeyValuePairs(Generic[TKey, TValue]): diff --git a/src/sensai/util/deprecation.py b/src/sensai/util/deprecation.py new file mode 100644 index 00000000..9cfe6907 --- /dev/null +++ b/src/sensai/util/deprecation.py @@ -0,0 +1,13 @@ +import warnings + + +def deprecated(message): + def deprecated_decorator(func): + def deprecated_func(*args, **kwargs): + warnings.warn("{} is a deprecated function. {}".format(func.__name__, message), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) + return func(*args, **kwargs) + return deprecated_func + return deprecated_decorator diff --git a/src/sensai/util/helper.py b/src/sensai/util/helper.py index 076b71b0..03cf0f78 100644 --- a/src/sensai/util/helper.py +++ b/src/sensai/util/helper.py @@ -1,9 +1,9 @@ """ This module contains various helper functions. """ +import math from typing import Any, Sequence, Union, TypeVar, List - T = TypeVar("T") @@ -21,6 +21,16 @@ def countNone(*args: Any) -> int: return c +def countNotNone(*args: Any) -> int: + """ + Counts the number of arguments that are not None + + :param args: various arguments + :return: the number of arguments that are not None + """ + return len(args) - countNone(*args) + + def anyNone(*args: Any) -> bool: """ :param args: various arguments @@ -37,6 +47,17 @@ def allNone(*args: Any) -> bool: return countNone(*args) == len(args) +def checkNotNaNDict(d: dict): + """ + Raises ValueError if any of the values in the given dictionary are NaN, reporting the respective keys + + :param d: a dictionary mapping to floats that are to be checked for NaN + """ + invalidKeys = [k for k, v in d.items() if math.isnan(v)] + if len(invalidKeys) > 0: + raise ValueError(f"Got one or more NaN values: {invalidKeys}") + + def markUsed(*args): """ Utility function to mark identifiers as used. 
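The `version` and `load` parameters added to `cached` and `PickleCached` make it possible to invalidate previously persisted results without deleting cache files by hand. A minimal usage sketch based on the signatures in this diff (the file names and functions are hypothetical):

```python
from sensai.util.cache import PickleCached, cached

def computeStatistics():
    # stand-in for an expensive computation (hypothetical)
    return {"mean": 0.5, "std": 0.1}

# The first call computes the result and persists it together with the version tag;
# subsequent calls reload it from disk unless the stored version differs from the
# one given here, in which case the function is re-run and the cache is overwritten.
stats = cached(computeStatistics, "statistics.cache.pickle", version=2)

# Decorator variant: since the decorated function takes arguments, the explicitly
# given filename must contain a '%s' placeholder for the argument hash.
@PickleCached(".", filename="square-%s.cache.pickle", version=1)
def square(x):
    return x * x

print(stats, square(3))
```

Passing `load=False` additionally skips reading an existing cache file and simply overwrites it with the newly computed result.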
diff --git a/src/sensai/util/io.py b/src/sensai/util/io.py index e6cf6a11..def4dcf0 100644 --- a/src/sensai/util/io.py +++ b/src/sensai/util/io.py @@ -1,6 +1,6 @@ import logging import os -from typing import Sequence, Optional, Tuple +from typing import Sequence, Optional, Tuple, List import matplotlib.figure from matplotlib import pyplot as plt @@ -64,6 +64,12 @@ def writeTextFile(self, filenameSuffix, content): f.write(content) return p + def writeTextFileLines(self, filenameSuffix, lines: List[str]): + p = self.path(filenameSuffix, extensionToAdd="txt") + self.log.info(f"Saving text file {p}") + writeTextFileLines(lines, p) + return p + def writeDataFrameTextFile(self, filenameSuffix, df: pd.DataFrame): p = self.path(filenameSuffix, extensionToAdd="df.txt", validOtherExtensions="txt") self.log.info(f"Saving data frame text file {p}") @@ -75,6 +81,7 @@ def writeDataFrameCsvFile(self, filenameSuffix, df: pd.DataFrame): p = self.path(filenameSuffix, extensionToAdd="csv") self.log.info(f"Saving data frame CSV file {p}") df.to_csv(p) + return p def writeFigure(self, filenameSuffix, fig, closeFigure=False): """ @@ -98,3 +105,32 @@ def writePickle(self, filenameSuffix, obj): p = self.path(filenameSuffix, extensionToAdd="pickle") self.log.info(f"Saving pickle {p}") dumpPickle(obj, p) + return p + + +def writeTextFileLines(lines: List[str], path): + """ + :param lines: the lines to write (without a trailing newline, which will be added) + :param path: the path of the text file to write to + """ + with open(path, "w") as f: + for line in lines: + f.write(line) + f.write("\n") + + +def readTextFileLines(path, strip=True, skipEmpty=True) -> List[str]: + """ + :param path: the path of the text file to read from + :param strip: whether to strip each line, removing whitespace/newline characters + :param skipEmpty: whether to skip any lines that are empty (after stripping) + :return: the list of lines + """ + lines = [] + with open(path, "r") as f: + for line in f.readlines(): + if strip: + line = line.strip() + if not skipEmpty or line != "": + lines.append(line) + return lines \ No newline at end of file diff --git a/src/sensai/util/logging.py b/src/sensai/util/logging.py index b90e533b..adad5ce6 100644 --- a/src/sensai/util/logging.py +++ b/src/sensai/util/logging.py @@ -7,7 +7,6 @@ import pandas as pd - log = getLogger(__name__) LOG_DEFAULT_FORMAT = '%(levelname)-5s %(asctime)-15s %(name)s:%(funcName)s - %(message)s' @@ -69,6 +68,16 @@ def restart(self): def getElapsedTimeSecs(self) -> float: return time.time() - self.startTime + def getElapsedTimedelta(self) -> pd.Timedelta: + return pd.Timedelta(self.getElapsedTimeSecs(), unit="s") + + def getElapsedTimeString(self) -> str: + secs = self.getElapsedTimeSecs() + if secs < 60: + return f"{secs:.3f} seconds" + else: + return str(pd.Timedelta(secs, unit="s")) + class StopWatchManager: """ @@ -93,7 +102,7 @@ def start(self, name): def stop(self, name) -> float: """ - :param name: the name of the time + :param name: the name of the stopwatch :return: the time that has passed in seconds """ timePassedSecs = time.time() - self._stopWatches[name] @@ -105,18 +114,41 @@ def isRunning(self, name): class LogTime: - def __init__(self, name): + """ + An execution time logger which can be conveniently applied using a with-statement - in order to log the executing time of the respective + with-block. 
+ """ + + def __init__(self, name, enabled=True, logger: Logger = None): + """ + :param name: the name of the event whose time is to be logged upon completion as " completed in