diff --git a/CHANGELOG.md b/CHANGELOG.md index 984c6600..573c5b00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,39 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- `distance` method +- 30 new distance/similarity + 1. AMPLE + 2. Anderberg's D + 3. Andres & Marzo's Delta + 4. Baroni-Urbani & Buser I + 5. Baroni-Urbani & Buser II + 6. Batagelj & Bren + 7. Baulieu I + 8. Baulieu II + 9. Baulieu III + 10. Baulieu IV + 11. Baulieu V + 12. Baulieu VI + 13. Baulieu VII + 14. Baulieu VIII + 15. Baulieu IX + 16. Baulieu X + 17. Baulieu XI + 18. Baulieu XII + 19. Baulieu XIII + 20. Baulieu XIV + 21. Baulieu XV + 22. Benini I + 23. Benini II + 24. Canberra + 25. Clement + 26. Consonni & Todeschini I + 27. Consonni & Todeschini II + 28. Consonni & Todeschini III + 29. Consonni & Todeschini IV + 30. Consonni & Todeschini V ### Changed - `README.md` modified - Document modified diff --git a/Document/Distance.ipynb b/Document/Distance.ipynb new file mode 100644 index 00000000..2cd9f832 --- /dev/null +++ b/Document/Distance.ipynb @@ -0,0 +1,1652 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Please cite us if you use the software

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distance/Similarity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "PyCM's `distance` method provides users with a wide range of string distance/similarity metrics to evaluate a confusion matrix by measuring its distance to a perfect confusion matrix. Distance/Similarity metrics measure the distance between two vectors of numbers. Small distances between two objects indicate similarity. In PyCM's `distance` method, a distance measure can be chosen from `DistanceType`. The measures' names are chosen based on the naming style suggested in [[1]](#ref1)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pycm import ConfusionMatrix, DistanceType" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cm = ConfusionMatrix(matrix={0: {0: 3, 1: 0, 2: 0}, 1: {0: 0, 1: 1, 2: 2}, 2: {0: 2, 1: 1, 2: 3}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$TP \\rightarrow True Positive$$\n", + "$$TN \\rightarrow True Negative$$\n", + "$$FP \\rightarrow False Positive$$\n", + "$$FN \\rightarrow False Negative$$\n", + "$$POP \\rightarrow Population$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## AMPLE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "AMPLE similarity [[2]](#ref2) [[3]](#ref3)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{AMPLE}=|\\frac{TP}{TP+FP}-\\frac{FN}{FN+TN}|$$" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.6, 1: 0.3, 2: 0.17142857142857143}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.AMPLE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Anderberg's D" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anderberg's D [[4]](#ref4)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{Anderberg} =\n", + "\\frac{(max(TP,FP)+max(FN,TN)+max(TP,FN)+max(FP,TN))-\n", + "(max(TP+FN,FP+TN)+max(TP+FP,FN+TN))}{2\\times POP}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.16666666666666666, 1: 0.0, 2: 0.041666666666666664}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.Anderberg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Andres & Marzo's Delta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Andres & Marzo's Delta correlation [[5]](#ref5)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{AndresMarzo_\\Delta} = \\Delta =\n", + "\\frac{TP+TN-2 \\times \\sqrt{FP \\times FN}}{POP}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.8333333333333334, 1: 0.5142977396044842, 2: 0.17508504286947035}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.AndresMarzoDelta)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baroni-Urbani & Buser I" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baroni-Urbani & Buser I similarity [[6]](#ref6)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{BaroniUrbaniBuserI} =\n", + "\\frac{\\sqrt{TP\\times TN}+TP}{\\sqrt{TP\\times TN}+TP+FP+FN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.79128784747792, 1: 0.5606601717798213, 2: 0.5638559245324765}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaroniUrbaniBuserI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baroni-Urbani & Buser II" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baroni-Urbani & Buser II correlation [[6]](#ref6)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{BaroniUrbaniBuserII} =\n", + "\\frac{\\sqrt{TP \\times TN}+TP-FP-FN}{\\sqrt{TP \\times TN}+TP+FP+FN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.58257569495584, 1: 0.12132034355964261, 2: 0.1277118490649528}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaroniUrbaniBuserII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Batagelj & Bren" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Batagelj & Bren distance [[7]](#ref7)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BatageljBren} =\n", + "\\frac{FP \\times FN}{TP \\times TN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.0, 1: 0.25, 2: 0.5}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BatageljBren)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu I" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu I distance [[8]](#ref8)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuI} =\n", + "\\frac{(TP+FP) \\times (TP+FN)-TP^2}{(TP+FP) \\times (TP+FN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.4, 1: 0.8333333333333334, 2: 0.7}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu II" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu II similarity [[8]](#ref8)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{BaulieuII} =\n", + "\\frac{TP^2 \\times TN^2}{(TP+FP) \\times (TP+FN) \\times (FP+TN) \\times (FN+TN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.4666666666666667, 1: 0.11851851851851852, 2: 0.11428571428571428}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu III" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu III distance [[8]](#ref8)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuIII} =\n", + "\\frac{POP^2 - 4 \\times (TP \\times TN-FP \\times FN)}{2 \\times POP^2}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.20833333333333334, 1: 0.4166666666666667, 2: 0.4166666666666667}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuIII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu IV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu IV distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuIV} = \\frac{FP+FN-(TP+\\frac{1}{2})\\times(TN+\\frac{1}{2})\\times TN \\times k}{POP}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: -41.45702383161246, 1: -22.855395541901885, 2: -13.85431293274332}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuIV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The default value of k is Euler's number $e$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu V" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu V distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuV} = \\frac{FP+FN+1}{TP+FP+FN+1}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.5, 1: 0.8, 2: 0.6666666666666666}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu VI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu VI distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuVI} = \\frac{FP+FN}{TP+FP+FN+1}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.3333333333333333, 1: 0.6, 2: 0.5555555555555556}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuVI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu VII" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu VII distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuVII} = \\frac{FP+FN}{POP + TP \\times (TP-4)^2}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.13333333333333333, 1: 0.14285714285714285, 2: 0.3333333333333333}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuVII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu VIII" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu VIII distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuVIII} = \\frac{(FP-FN)^2}{POP^2}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.027777777777777776, 1: 0.006944444444444444, 2: 0.006944444444444444}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuVIII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu IX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu IX distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuIX} = \\frac{FP+2 \\times FN}{TP+FP+2 \\times FN+TN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.16666666666666666, 1: 0.35714285714285715, 2: 0.5333333333333333}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuIX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu X distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuX} = \\frac{FP+FN+max(FP,FN)}{POP+max(FP,FN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.2857142857142857, 1: 0.35714285714285715, 2: 0.5333333333333333}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu XI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu XI distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuXI} = \\frac{FP+FN}{FP+FN+TN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.2222222222222222, 1: 0.2727272727272727, 2: 0.5555555555555556}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuXI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu XII" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu XII distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuXII} = \\frac{FP+FN}{TP+FP+FN-1}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.5, 1: 1.0, 2: 0.7142857142857143}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuXII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu XIII" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu XIII distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuXIII} = \\frac{FP+FN}{TP+FP+FN+TP \\times (TP-4)^2}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.25, 1: 0.23076923076923078, 2: 0.45454545454545453}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuXIII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu XIV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu XIV distance [[9]](#ref9)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuXIV} = \\frac{FP+2 \\times FN}{TP+FP+2 \\times FN}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.4, 1: 0.8333333333333334, 2: 0.7272727272727273}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuXIV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baulieu XV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Baulieu XV distance [[9]](#ref9)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{BaulieuXV} = \\frac{FP+FN+max(FP, FN)}{TP+FP+FN+max(FP, FN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.5714285714285714, 1: 0.8333333333333334, 2: 0.7272727272727273}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BaulieuXV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benini I" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Benini I correlation [[10]](#ref10)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{BeniniI} = \\frac{TP \\times TN-FP \\times FN}{(TP+FN)\\times(FN+TN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 1.0, 1: 0.2, 2: 0.14285714285714285}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BeniniI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benini II" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Benini II correlation [[10]](#ref10)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{BeniniII} = \\frac{TP \\times TN-FP \\times FN}{min((TP+FN)\\times(FN+TN), (TP+FP)\\times(FP+TN))}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 1.0, 1: 0.3333333333333333, 2: 0.2}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.BeniniII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Canberra" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Canberra distance [[11]](#ref11) [[12]](#ref12)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$dist_{Canberra} =\n", + "\\frac{FP+FN}{(TP+FP)+(TP+FN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.25, 1: 0.6, 2: 0.45454545454545453}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.Canberra)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clement" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clement similarity [[13]](#ref13)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{Clement} =\n", + "\\frac{TP}{TP+FP}\\times\\Big(1 - \\frac{TP+FP}{POP}\\Big) +\n", + "\\frac{TN}{FN+TN}\\times\\Big(1 - \\frac{FN+TN}{POP}\\Big)$$" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.7666666666666666, 1: 0.55, 2: 0.588095238095238}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.Clement)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consonni & Todeschini I" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consonni & Todeschini I similarity [[14]](#ref14)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{ConsonniTodeschiniI} =\n", + "\\frac{log(1+TP+TN)}{log(1+POP)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.9348704159880586, 1: 0.8977117175026231, 2: 0.8107144632819592}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.ConsonniTodeschiniI)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consonni & Todeschini II" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consonni & Todeschini II similarity [[14]](#ref14)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{ConsonniTodeschiniII} =\n", + "\\frac{log(1+POP)-log(1+FP+FN)}{log(1+POP)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.5716826589686053, 1: 0.4595236911453605, 2: 0.3014445045412856}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.ConsonniTodeschiniII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consonni & Todeschini III" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consonni & Todeschini III similarity [[14]](#ref14)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{ConsonniTodeschiniIII} =\n", + "\\frac{log(1+TP)}{log(1+POP)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.5404763088546395, 1: 0.27023815442731974, 2: 0.5404763088546395}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.ConsonniTodeschiniIII)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consonni & Todeschini IV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consonni & Todeschini IV similarity [[14]](#ref14)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$sim_{ConsonniTodeschiniIV} =\n", + "\\frac{log(1+TP)}{log(1+TP+FP+FN)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.7737056144690831, 1: 0.43067655807339306, 2: 0.6309297535714574}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.ConsonniTodeschiniIV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consonni & Todeschini V" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Consonni & Todeschini V correlation [[14]](#ref14)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$$corr_{ConsonniTodeschiniV} =\n", + "\\frac{log(1+TP \\times TN)-log(1+FP \\times FN)}{log(1+\\frac{POP^2}{4})}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 0.8560267854703983, 1: 0.30424737289682985, 2: 0.17143541431350617}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cm.distance(metric=DistanceType.ConsonniTodeschiniV)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
1- C. C. Little, \"Abydos Documentation,\" 2018.
\n", + "\n", + "
2- V. Dallmeier, C. Lindig, and A. Zeller, \"Lightweight defect localization for Java,\" in European conference on object-oriented programming, 2005: Springer, pp. 528-550.
\n", + "\n", + "
3- R. Abreu, P. Zoeteweij, and A. J. Van Gemund, \"An evaluation of similarity coefficients for software fault localization,\" in 2006 12th Pacific Rim International Symposium on Dependable Computing (PRDC'06), 2006: IEEE, pp. 39-46.
\n", + "\n", + "
4- M. R. Anderberg, Cluster analysis for applications: probability and mathematical statistics: a series of monographs and textbooks. Academic press, 2014.
\n", + "\n", + "
5- A. M. Andrés and P. F. Marzo, \"Delta: A new measure of agreement between two raters,\" British journal of mathematical and statistical psychology, vol. 57, no. 1, pp. 1-19, 2004.
\n", + "\n", + "
6- C. Baroni-Urbani and M. W. Buser, \"Similarity of binary data,\" Systematic Zoology, vol. 25, no. 3, pp. 251-259, 1976.
\n", + "\n", + "
7- V. Batagelj and M. Bren, \"Comparing resemblance measures,\" Journal of classification, vol. 12, no. 1, pp. 73-90, 1995.
\n", + "\n", + "
8- F. B. Baulieu, \"A classification of presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 6, no. 1, pp. 233-246, 1989.
\n", + "\n", + "
9- F. B. Baulieu, \"Two variant axiom systems for presence/absence based dissimilarity coefficients,\" Journal of Classification, vol. 14, no. 1, pp. 159-170, 1997.
\n", + "\n", + "
10- R. Benini, Principii di demografia. Barbera, 1901.
\n", + "\n", + "
11- G. N. Lance and W. T. Williams, \"Computer programs for hierarchical polythetic classification (“similarity analyses”),\" The Computer Journal, vol. 9, no. 1, pp. 60-64, 1966.
\n", + "\n", + "
12- G. N. Lance and W. T. Williams, \"Mixed-Data Classificatory Programs I - Agglomerative Systems,\" Australian Computer Journal, vol. 1, no. 1, pp. 15-20, 1967.
\n", + "\n", + "
13- P. W. Clement, \"A formula for computing inter-observer agreement,\" Psychological Reports, vol. 39, no. 1, pp. 257-258, 1976.
\n", + "\n", + "
14- V. Consonni and R. Todeschini, \"New similarity coefficients for binary data,\" Match-Communications in Mathematical and Computer Chemistry, vol. 68, no. 2, p. 581, 2012.
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Distance/Similarity", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Document/Document.ipynb b/Document/Document.ipynb index b79cdba0..35376e97 100644 --- a/Document/Document.ipynb +++ b/Document/Document.ipynb @@ -71,6 +71,7 @@ "
  • To Array
  • \n", "
  • Combine
  • \n", "
  • Plot
  • \n", + "
  • Distance/Similarity
  • \n", "
  • Parameter Recommender
  • \n", "
  • Compare
  • \n", "
  • ROC Curve
  • \n", @@ -2293,6 +2294,30 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distance/Similarity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [Jupyter Notebook](https://nbviewer.jupyter.org/github/sepandhaghighi/pycm/blob/master/Document/Distance.ipynb)\n", + "- [HTML](http://www.pycm.io/doc/Distance.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/Document/README.md b/Document/README.md index c91b74b9..d0d9d936 100644 --- a/Document/README.md +++ b/Document/README.md @@ -8,7 +8,12 @@ ## Document - [Jupyter Notebook](https://nbviewer.jupyter.org/github/sepandhaghighi/pycm/blob/master/Document/Document.ipynb) -- [HTML](http://www.pycm.io/doc/) +- [HTML](http://www.pycm.io/doc/) + +## Distance + +- [Jupyter Notebook](https://nbviewer.jupyter.org/github/sepandhaghighi/pycm/blob/master/Document/Distance.ipynb) +- [HTML](http://www.pycm.io/doc/Distance.html) ## Example-1 (Comparison of three different classifiers) diff --git a/Otherfiles/notebook_check.py b/Otherfiles/notebook_check.py index 8545f8ab..a8f8ae75 100644 --- a/Otherfiles/notebook_check.py +++ b/Otherfiles/notebook_check.py @@ -7,6 +7,7 @@ NOTEBOOKS_LIST = [ "Document", + "Distance", "Example1", "Example2", "Example3", diff --git a/Test/error_test.py b/Test/error_test.py index b57b7fe2..512b5b7e 100644 --- a/Test/error_test.py +++ b/Test/error_test.py @@ -53,6 +53,10 @@ >>> y_actu = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2] >>> y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2] >>> cm = ConfusionMatrix(y_actu,y_pred) +>>> cm.distance(metric = 2) +Traceback (most recent call last): + ... +pycm.pycm_error.pycmMatrixError: The metric type must be DistanceType >>> cm.relabel([1,2,3]) Traceback (most recent call last): ... 
diff --git a/Test/function_test.py b/Test/function_test.py index 116e9d4f..dd789717 100644 --- a/Test/function_test.py +++ b/Test/function_test.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ >>> from pycm import * +>>> from pycm.pycm_distance import DISTANCE_MAPPER >>> import os >>> import json >>> import numpy as np @@ -717,4 +718,9 @@ >>> cm4.to_array() array([[3, 1], [0, 0]]) +>>> result = [] +>>> for item in DISTANCE_MAPPER.values(): +... result.append(item(TP=2, TN=2, FP=1, FN="2")) +>>> all(list(map(lambda x: x=="None", result))) +True """ diff --git a/Test/verified_test.py b/Test/verified_test.py index 3c963640..ffb959d9 100644 --- a/Test/verified_test.py +++ b/Test/verified_test.py @@ -427,4 +427,127 @@ True >>> abs(crv.area(method="midpoint")[2] - 0.2916) < 0.001 True +>>> cm1 = ConfusionMatrix(matrix = {1:{1:2,0:2},0:{0:778,1:2}}) # Verified Case -- (https://bit.ly/3vVMWRT) +>>> cm2 = ConfusionMatrix(matrix = {1:{1:2,0:3},0:{0:775,1:4}}) # Verified Case -- (https://bit.ly/3vVMWRT) +>>> cm1.distance(metric=DistanceType.AMPLE)[1] +0.49743589743589745 +>>> cm2.distance(metric=DistanceType.AMPLE)[1] +0.32947729220222793 +>>> cm1.distance(metric=DistanceType.Anderberg)[1] +0.0 +>>> cm2.distance(metric=DistanceType.Anderberg)[1] +0.0 +>>> cm1.distance(metric=DistanceType.AndresMarzoDelta)[1] +0.9897959183673469 +>>> cm2.distance(metric=DistanceType.AndresMarzoDelta)[1] +0.9822344346552608 +>>> cm1.distance(metric=DistanceType.BaroniUrbaniBuserI)[1] +0.9119837740878104 +>>> cm2.distance(metric=DistanceType.BaroniUrbaniBuserI)[1] +0.8552823175014205 +>>> cm1.distance(metric=DistanceType.BaroniUrbaniBuserII)[1] +0.8239675481756209 +>>> cm2.distance(metric=DistanceType.BaroniUrbaniBuserII)[1] +0.7105646350028408 +>>> cm1.distance(metric=DistanceType.BatageljBren)[1] +0.002570694087403599 +>>> cm2.distance(metric=DistanceType.BatageljBren)[1] +0.007741935483870968 +>>> cm1.distance(metric=DistanceType.BaulieuI)[1] +0.75 +>>> 
cm2.distance(metric=DistanceType.BaulieuI)[1] +0.8666666666666667 +>>> cm1.distance(metric=DistanceType.BaulieuII)[1] +0.24871959237343852 +>>> cm2.distance(metric=DistanceType.BaulieuII)[1] +0.13213719608444902 +>>> cm1.distance(metric=DistanceType.BaulieuIII)[1] +0.4949500208246564 +>>> cm2.distance(metric=DistanceType.BaulieuIII)[1] +0.4949955747605165 +>>> cm1.distance(metric=DistanceType.BaulieuIV)[1] +-5249.96272285802 +>>> cm2.distance(metric=DistanceType.BaulieuIV)[1] +-5209.561726488335 +>>> cm1.distance(metric=DistanceType.BaulieuV)[1] +0.7142857142857143 +>>> cm2.distance(metric=DistanceType.BaulieuV)[1] +0.8 +>>> cm1.distance(metric=DistanceType.BaulieuVI)[1] +0.5714285714285714 +>>> cm2.distance(metric=DistanceType.BaulieuVI)[1] +0.7 +>>> cm1.distance(metric=DistanceType.BaulieuVII)[1] +0.005050505050505051 +>>> cm2.distance(metric=DistanceType.BaulieuVII)[1] +0.008838383838383838 +>>> cm1.distance(metric=DistanceType.BaulieuVIII)[1] +0.0 +>>> cm2.distance(metric=DistanceType.BaulieuVIII)[1] +1.6269262807163682e-06 +>>> cm1.distance(metric=DistanceType.BaulieuIX)[1] +0.007633587786259542 +>>> cm2.distance(metric=DistanceType.BaulieuIX)[1] +0.012706480304955527 +>>> cm1.distance(metric=DistanceType.BaulieuX)[1] +0.007633587786259542 +>>> cm2.distance(metric=DistanceType.BaulieuX)[1] +0.013959390862944163 +>>> cm1.distance(metric=DistanceType.BaulieuXI)[1] +0.005115089514066497 +>>> cm2.distance(metric=DistanceType.BaulieuXI)[1] +0.008951406649616368 +>>> cm1.distance(metric=DistanceType.BaulieuXII)[1] +0.8 +>>> cm2.distance(metric=DistanceType.BaulieuXII)[1] +0.875 +>>> cm1.distance(metric=DistanceType.BaulieuXIII)[1] +0.2857142857142857 +>>> cm2.distance(metric=DistanceType.BaulieuXIII)[1] +0.4117647058823529 +>>> cm1.distance(metric=DistanceType.BaulieuXIV)[1] +0.75 +>>> cm2.distance(metric=DistanceType.BaulieuXIV)[1] +0.8333333333333334 +>>> cm1.distance(metric=DistanceType.BaulieuXV)[1] +0.75 +>>> cm2.distance(metric=DistanceType.BaulieuXV)[1] 
+0.8461538461538461 +>>> cm1.distance(metric=DistanceType.BeniniI)[1] +0.49743589743589745 +>>> cm2.distance(metric=DistanceType.BeniniI)[1] +0.3953727506426735 +>>> cm1.distance(metric=DistanceType.BeniniII)[1] +0.49743589743589745 +>>> cm2.distance(metric=DistanceType.BeniniII)[1] +0.3953727506426735 +>>> cm1.distance(metric=DistanceType.Canberra)[1] +0.5 +>>> cm2.distance(metric=DistanceType.Canberra)[1] +0.6363636363636364 +>>> cm1.distance(metric=DistanceType.Clement)[1] +0.5025379382522239 +>>> cm2.distance(metric=DistanceType.Clement)[1] +0.33840586363079933 +>>> cm1.distance(metric=DistanceType.ConsonniTodeschiniI)[1] +0.9992336018090547 +>>> cm2.distance(metric=DistanceType.ConsonniTodeschiniI)[1] +0.998656222829757 +>>> cm1.distance(metric=DistanceType.ConsonniTodeschiniII)[1] +0.7585487129939101 +>>> cm2.distance(metric=DistanceType.ConsonniTodeschiniII)[1] +0.6880377723094788 +>>> cm1.distance(metric=DistanceType.ConsonniTodeschiniIII)[1] +0.16481614417697044 +>>> cm2.distance(metric=DistanceType.ConsonniTodeschiniIII)[1] +0.16481614417697044 +>>> cm1.distance(metric=DistanceType.ConsonniTodeschiniIV)[1] +0.5645750340535797 +>>> cm2.distance(metric=DistanceType.ConsonniTodeschiniIV)[1] +0.47712125471966244 +>>> cm1.distance(metric=DistanceType.ConsonniTodeschiniV)[1] +0.48072545510682463 +>>> cm2.distance(metric=DistanceType.ConsonniTodeschiniV)[1] +0.4003930264973547 + """ diff --git a/pycm/__init__.py b/pycm/__init__.py index 7074fd4f..6d185842 100644 --- a/pycm/__init__.py +++ b/pycm/__init__.py @@ -3,6 +3,7 @@ from .pycm_param import PYCM_VERSION, OVERALL_BENCHMARK_LIST, CLASS_BENCHMARK_LIST from .pycm_error import * from .pycm_output import pycm_help, online_help +from .pycm_distance import DistanceType from .pycm_obj import ConfusionMatrix from .pycm_compare import Compare from .pycm_curve import Curve, ROCCurve, PRCurve diff --git a/pycm/pycm_curve.py b/pycm/pycm_curve.py index c714acc4..9a187aa0 100644 --- a/pycm/pycm_curve.py +++ 
b/pycm/pycm_curve.py @@ -93,8 +93,10 @@ def area(self, method="trapezoidal"): dx = numpy.diff(x) if numpy.any(dx < 0) and numpy.any(dx > 0): sort_indices = numpy.argsort(x, kind="mergesort") - self.data[c][self.plot_x_axis] = x = numpy.array(x)[sort_indices].tolist() - self.data[c][self.plot_y_axis] = y = numpy.array(y)[sort_indices].tolist() + self.data[c][self.plot_x_axis] = x = numpy.array(x)[ + sort_indices].tolist() + self.data[c][self.plot_y_axis] = y = numpy.array(y)[ + sort_indices].tolist() if method == "trapezoidal": self.auc[c] = __trapezoidal_numeric_integral__(x, y) elif method == "midpoint": diff --git a/pycm/pycm_distance.py b/pycm/pycm_distance.py new file mode 100644 index 00000000..19298b1b --- /dev/null +++ b/pycm/pycm_distance.py @@ -0,0 +1,713 @@ +# -*- coding: utf-8 -*- +"""Distance/Similarity functions.""" +from __future__ import division +from enum import Enum +import math + + +class DistanceType(Enum): + """ + Distance metric type class. + + >>> pycm.DistanceType.AMPLE + """ + + AMPLE = "AMPLE" + Anderberg = "Anderberg" + AndresMarzoDelta = "AndresMarzoDelta" + BaroniUrbaniBuserI = "BaroniUrbaniBuserI" + BaroniUrbaniBuserII = "BaroniUrbaniBuserII" + BatageljBren = "BatageljBren" + BaulieuI = "BaulieuI" + BaulieuII = "BaulieuII" + BaulieuIII = "BaulieuIII" + BaulieuIV = "BaulieuIV" + BaulieuV = "BaulieuV" + BaulieuVI = "BaulieuVI" + BaulieuVII = "BaulieuVII" + BaulieuVIII = "BaulieuVIII" + BaulieuIX = "BaulieuIX" + BaulieuX = "BaulieuX" + BaulieuXI = "BaulieuXI" + BaulieuXII = "BaulieuXII" + BaulieuXIII = "BaulieuXIII" + BaulieuXIV = "BaulieuXIV" + BaulieuXV = "BaulieuXV" + BeniniI = "BeniniI" + BeniniII = "BeniniII" + Canberra = "Canberra" + Clement = "Clement" + ConsonniTodeschiniI = "ConsonniTodeschiniI" + ConsonniTodeschiniII = "ConsonniTodeschiniII" + ConsonniTodeschiniIII = "ConsonniTodeschiniIII" + ConsonniTodeschiniIV = "ConsonniTodeschiniIV" + ConsonniTodeschiniV = "ConsonniTodeschiniV" + + +def AMPLE_calc(TP, FP, FN, TN): + """ + 
Calculate AMPLE. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: AMPLE as float + """ + try: + part1 = TP / (TP + FP) + part2 = FN / (FN + TN) + return abs(part1 - part2) + except Exception: + return "None" + + +def Anderberg_calc(TP, FP, FN, TN): + """ + Calculate Anderberg's D. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Anderberg's D as float + """ + try: + part1 = max(TP, FP) + max(FN, TN) + max(TP, FN) + max(FP, TN) + part2 = max(TP + FP, FP + TN) + max(TP + FP, FN + TN) + n = TP + FP + FN + TN + return (part1 - part2) / (2 * n) + except Exception: + return "None" + + +def AndresMarzoDelta_calc(TP, FP, FN, TN): + """ + Calculate Andres & Marzo's Delta. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Andres & Marzo's Delta as float + """ + try: + part1 = TP + TN - 2 * math.sqrt(FP * FN) + n = TP + FP + FN + TN + return part1 / n + except Exception: + return "None" + + +def BaroniUrbaniBuserI_calc(TP, FP, FN, TN): + """ + Calculate Baroni-Urbani & Buser I. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baroni-Urbani & Buser I as float + """ + try: + part1 = math.sqrt(TP * TN) + TP + part2 = part1 + FP + FN + return part1 / part2 + except Exception: + return "None" + + +def BaroniUrbaniBuserII_calc(TP, FP, FN, TN): + """ + Calculate Baroni-Urbani & Buser II. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baroni-Urbani & Buser II as float + """ + try: + part1 = math.sqrt(TP * TN) + TP - FP - FN + part2 = math.sqrt(TP * TN) + TP + FP + FN + return part1 / part2 + except Exception: + return "None" + + +def BatageljBren_calc(TP, FP, FN, TN): + """ + Calculate Batagelj & Bren. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Batagelj & Bren as float + """ + try: + return (FP * FN) / (TP * TN) + except Exception: + return "None" + + +def BaulieuI_calc(TP, FP, FN, TN): + """ + Calculate Baulieu I. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu I as float + """ + try: + part1 = (TP + FP) * (TP + FN) + return (part1 - TP * TP) / part1 + except Exception: + return "None" + + +def BaulieuII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu II. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu II as float + """ + try: + part1 = TP * TP * TN * TN + part2 = (TP + FP) * (TP + FN) * (FP + TN) * (FN + TN) + return part1 / part2 + except Exception: + return "None" + + +def BaulieuIII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu III. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu III as float + """ + try: + n = TP + FP + FN + TN + part1 = n * n - 4 * (TP * TN - FP * FN) + return part1 / (2 * n * n) + except Exception: + return "None" + + +def BaulieuIV_calc(TP, FP, FN, TN): + """ + Calculate Baulieu IV. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu IV as float + """ + try: + n = TP + FP + FN + TN + part1 = FP + FN - (TP + 0.5) * (TN + 0.5) * TN * math.e + return part1 / n + except Exception: + return "None" + + +def BaulieuV_calc(TP, FP, FN, TN): + """ + Calculate Baulieu V. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu V as float + """ + try: + return (FP + FN + 1) / (TP + FP + FN + 1) + except Exception: + return "None" + + +def BaulieuVI_calc(TP, FP, FN, TN): + """ + Calculate Baulieu VI. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu VI as float + """ + try: + return (FP + FN) / (TP + FP + FN + 1) + except Exception: + return "None" + + +def BaulieuVII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu VII. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu VII as float + """ + try: + n = TP + FP + FN + TN + return (FP + FN) / (n + TP * (TP - 4) * (TP - 4)) + except Exception: + return "None" + + +def BaulieuVIII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu VIII. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu VIII as float + """ + try: + n = TP + FP + FN + TN + return ((FP - FN) * (FP - FN)) / (n * n) + except Exception: + return "None" + + +def BaulieuIX_calc(TP, FP, FN, TN): + """ + Calculate Baulieu IX. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu IX as float + """ + try: + return (FP + 2 * FN) / (TP + FP + 2 * FN + TN) + except Exception: + return "None" + + +def BaulieuX_calc(TP, FP, FN, TN): + """ + Calculate Baulieu X. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu X as float + """ + try: + n = TP + FP + FN + TN + max_bc = max(FP, FN) + return (FP + FN + max_bc) / (n + max_bc) + except Exception: + return "None" + + +def BaulieuXI_calc(TP, FP, FN, TN): + """ + Calculate Baulieu XI. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu XI as float + """ + try: + return (FP + FN) / (FP + FN + TN) + except Exception: + return "None" + + +def BaulieuXII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu XII. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu XII as float + """ + try: + return (FP + FN) / (TP + FP + FN - 1) + except Exception: + return "None" + + +def BaulieuXIII_calc(TP, FP, FN, TN): + """ + Calculate Baulieu XIII. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu XIII as float + """ + try: + part2 = TP + FP + FN + TP * (TP - 4) * (TP - 4) + return (FP + FN) / part2 + except Exception: + return "None" + + +def BaulieuXIV_calc(TP, FP, FN, TN): + """ + Calculate Baulieu XIV. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu XIV as float + """ + try: + return (FP + 2 * FN) / (TP + FP + 2 * FN) + except Exception: + return "None" + + +def BaulieuXV_calc(TP, FP, FN, TN): + """ + Calculate Baulieu XV. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Baulieu XV as float + """ + try: + max_bc = max(FP, FN) + return (FP + FN + max_bc) / (TP + FP + FN + max_bc) + except Exception: + return "None" + + +def BeniniI_calc(TP, FP, FN, TN): + """ + Calculate Benini I correlation. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Benini I correlation as float + """ + try: + return (TP * TN - FP * FN) / ((TP + FN) * (FN + TN)) + except Exception: + return "None" + + +def BeniniII_calc(TP, FP, FN, TN): + """ + Calculate Benini II correlation. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Benini II correlation as float + """ + try: + part2 = min((TP + FN) * (FN + TN), (TP + FP) * (FP + TN)) + return (TP * TN - FP * FN) / part2 + except Exception: + return "None" + + +def Canberra_calc(TP, FP, FN, TN): + """ + Calculate Canberra distance. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Canberra distance as float + """ + try: + return (FP + FN) / ((TP + FP) + (TP + FN)) + except Exception: + return "None" + + +def Clement_calc(TP, FP, FN, TN): + """ + Calculate Clement similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Clement similarity as float + """ + try: + n = TP + FP + FN + TN + term1 = (TP / (TP + FP)) * (1 - (TP + FP) / n) + term2 = (TN / (FN + TN)) * (1 - (FN + TN) / n) + return term1 + term2 + except Exception: + return "None" + + +def ConsonniTodeschiniI_calc(TP, FP, FN, TN): + """ + Calculate Consonni & Todeschini I similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Consonni & Todeschini I similarity as float + """ + try: + n = TP + FP + FN + TN + return math.log(1 + TP + TN) / math.log(1 + n) + except Exception: + return "None" + + +def ConsonniTodeschiniII_calc(TP, FP, FN, TN): + """ + Calculate Consonni & Todeschini II similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Consonni & Todeschini II similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = math.log(1 + n) - math.log(1 + FP + FN) + return part1 / math.log(1 + n) + except Exception: + return "None" + + +def ConsonniTodeschiniIII_calc(TP, FP, FN, TN): + """ + Calculate Consonni & Todeschini III similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Consonni & Todeschini III similarity as float + """ + try: + n = TP + FP + FN + TN + return math.log(1 + TP) / math.log(1 + n) + except Exception: + return "None" + + +def ConsonniTodeschiniIV_calc(TP, FP, FN, TN): + """ + Calculate Consonni & Todeschini IV similarity. + + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Consonni & Todeschini IV similarity as float + """ + try: + return math.log(1 + TP) / math.log(1 + TP + FP + FN) + except Exception: + return "None" + + +def ConsonniTodeschiniV_calc(TP, FP, FN, TN): + """ + Calculate Consonni & Todeschini V similarity. 
+ + :param TP: true positive + :type TP: int + :param TN: true negative + :type TN: int + :param FP: false positive + :type FP: int + :param FN: false negative + :type FN: int + :return: Consonni & Todeschini V similarity as float + """ + try: + n = TP + FP + FN + TN + part1 = math.log(1 + TP * TN) - math.log(1 + FP * FN) + part2 = math.log(1 + n * n / 4) + return part1 / part2 + except Exception: + return "None" + + +DISTANCE_MAPPER = { + DistanceType.AMPLE: AMPLE_calc, + DistanceType.Anderberg: Anderberg_calc, + DistanceType.AndresMarzoDelta: AndresMarzoDelta_calc, + DistanceType.BaroniUrbaniBuserI: BaroniUrbaniBuserI_calc, + DistanceType.BaroniUrbaniBuserII: BaroniUrbaniBuserII_calc, + DistanceType.BatageljBren: BatageljBren_calc, + DistanceType.BaulieuI: BaulieuI_calc, + DistanceType.BaulieuII: BaulieuII_calc, + DistanceType.BaulieuIII: BaulieuIII_calc, + DistanceType.BaulieuIV: BaulieuIV_calc, + DistanceType.BaulieuV: BaulieuV_calc, + DistanceType.BaulieuVI: BaulieuVI_calc, + DistanceType.BaulieuVII: BaulieuVII_calc, + DistanceType.BaulieuVIII: BaulieuVIII_calc, + DistanceType.BaulieuIX: BaulieuIX_calc, + DistanceType.BaulieuX: BaulieuX_calc, + DistanceType.BaulieuXI: BaulieuXI_calc, + DistanceType.BaulieuXII: BaulieuXII_calc, + DistanceType.BaulieuXIII: BaulieuXIII_calc, + DistanceType.BaulieuXIV: BaulieuXIV_calc, + DistanceType.BaulieuXV: BaulieuXV_calc, + DistanceType.BeniniI: BeniniI_calc, + DistanceType.BeniniII: BeniniII_calc, + DistanceType.Canberra: Canberra_calc, + DistanceType.Clement: Clement_calc, + DistanceType.ConsonniTodeschiniI: ConsonniTodeschiniI_calc, + DistanceType.ConsonniTodeschiniII: ConsonniTodeschiniII_calc, + DistanceType.ConsonniTodeschiniIII: ConsonniTodeschiniIII_calc, + DistanceType.ConsonniTodeschiniIV: ConsonniTodeschiniIV_calc, + DistanceType.ConsonniTodeschiniV: ConsonniTodeschiniV_calc, +} diff --git a/pycm/pycm_obj.py b/pycm/pycm_obj.py index e2072e2f..bbf6fe6b 100644 --- a/pycm/pycm_obj.py +++ b/pycm/pycm_obj.py @@ -6,6 
+6,7 @@ from .pycm_handler import __obj_assign_handler__, __obj_file_handler__, __obj_matrix_handler__, __obj_vector_handler__, __obj_array_handler__ from .pycm_class_func import F_calc, IBA_calc, TI_calc, NB_calc, sensitivity_index_calc from .pycm_overall_func import weighted_kappa_calc, weighted_alpha_calc, alpha2_calc, brier_score_calc +from .pycm_distance import DistanceType, DISTANCE_MAPPER from .pycm_output import * from .pycm_util import * from .pycm_param import * @@ -591,6 +592,22 @@ def NB(self, w=1): except Exception: return {} + def distance(self, metric): + """ + Calculate distance/similarity for all classes. + + :param metric: metric + :type metric: DistanceType + :return: result as dict + """ + distance_dict = {} + if not isinstance(metric, DistanceType): + raise pycmMatrixError(DISTANCE_METRIC_TYPE_ERROR) + for i in self.classes: + distance_dict[i] = DISTANCE_MAPPER[metric]( + TP=self.TP[i], FP=self.FP[i], FN=self.FN[i], TN=self.TN[i]) + return distance_dict + def CI( self, param, diff --git a/pycm/pycm_param.py b/pycm/pycm_param.py index f22c99a2..d94bae7f 100644 --- a/pycm/pycm_param.py +++ b/pycm/pycm_param.py @@ -117,6 +117,8 @@ CURVE_NONE_WARNING = "The curve axes contain non-numerical value(s)." +DISTANCE_METRIC_TYPE_ERROR = "The metric type must be DistanceType" + CLASS_NUMBER_THRESHOLD = 10 BALANCE_RATIO_THRESHOLD = 3