diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index 27d5188..7eca9af 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -60,6 +60,26 @@ FMT = "{:.2f}" + +class MockNumpy: + def __init__(self, int_type=None, float_type=None): + self.int32 = int_type or int + self.float32 = float_type or float + + self.inf = self.float32("inf") + + def zeros(self, dims, dtype=int): + if len(dims) == 1: + zero = dtype(0) + return [zero for __ in range(dims[0])] + + return [self.zeros(dims[1:], dtype) for __ in range(dims[0])] + + @staticmethod + def delete(arr, index): + return arr[:index] + arr[index + 1 :] + + try: from numba import njit @@ -637,6 +657,72 @@ def _fisher_jenks_means(values, classes=5): return np.delete(kclass, 0) +def _fisher_jenks_means_without_numpy(values, classes=5, np=None): + """ + As for _fisher_jenks_means above, to keep the code as far as possible + exactly the same, except with np passable in as a dependency, and with + matrix[i, j] replaced with matrix[i][j] for speed. + + + Jenks Optimal (Natural Breaks) algorithm implemented in Python. + + Notes + ----- + + The original Python code comes from here: + http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/ + and is based on a JAVA and Fortran code available here: + https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html + + + + """ + if np is None: + np = MockNumpy() + + n_data = len(values) + mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32) + mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32) + + for j in range(1, classes + 1): + mat1[1][j] = 1 + for i in range(2, n_data + 1): + mat2[i][j] = np.inf + v = 0 + for _l in range(2, len(values) + 1): + s1 = 0 + s2 = 0 + w = 0 + for m in range(1, _l + 1): + i3 = _l - m + 1 + val = values[i3 - 1] + s2 += val * val + s1 += val + w += 1 + v = s2 - (s1 * s1) / np.float32(w) + i4 = i3 - 1 + if i4 != 0: + for j in range(2, classes + 1): + if mat2[_l][j] >= (v + mat2[i4][j - 1]): + mat1[_l][j] = i3 + mat2[_l][j] = v + mat2[i4][j - 1] + + mat1[_l][1] = 1 + mat2[_l][1] = v + + k = len(values) + + kclass = np.zeros((classes + 1,), dtype=type(values[0])) + kclass[classes] = values[len(values) - 1] + kclass[0] = values[0] + for countNum in range(classes, 1, -1): + pivot = mat1[k][countNum] + _id = int(pivot - 2) + kclass[countNum - 1] = values[_id] + k = int(pivot - 1) + return np.delete(kclass, 0) + + class MapClassifier: r""" Abstract class for all map classifications :cite:`Slocum_2009` @@ -1926,7 +2012,7 @@ class FisherJenks(MapClassifier): Parameters ---------- - y : numpy.array + y : collections.abc.Iterable[numbers.Real] :math:`(n,1)`, values to classify. k : int (default 5) The number of classes required. @@ -1936,7 +2022,7 @@ class FisherJenks(MapClassifier): yb : numpy.array :math:`(n,1)`, bin IDs for observations. - bins : numpy.array + bins : collections.abc.Sequence[numbers.Real] :math:`(k,1)`, the upper bounds of each class. k : int The number of classes. @@ -1962,8 +2048,9 @@ class FisherJenks(MapClassifier): def __init__(self, y, k=K): if not HAS_NUMBA: + self._set_bins = self._set_bins_without_numpy warnings.warn( - "Numba not installed. Using slow pure python version.", + "Numba not installed. Using a less slow, pure python version.", UserWarning, stacklevel=3, ) @@ -1981,6 +2068,10 @@ def _set_bins(self): x = np.sort(self.y).astype("f8") self.bins = _fisher_jenks_means(x, classes=self.k) + def _set_bins_without_numpy(self): + x = sorted(self.y) + self.bins = np.asarray(_fisher_jenks_means_without_numpy(x, classes=self.k)) + class FisherJenksSampled(MapClassifier): """ @@ -2019,14 +2110,20 @@ class FisherJenksSampled(MapClassifier): """ + ids = None + def __init__(self, y, k=K, pct=0.10, truncate=True): + print(f"Got: {k=}, {pct=}, {truncate=}") self.k = k n = y.size if (pct * n > 1000) and truncate: pct = 1000.0 / n + # if FisherJenksSampled.ids is None: + # FisherJenksSampled.ids = np.random.randint(0, n, int(n * pct)) ids = np.random.randint(0, n, int(n * pct)) y = np.asarray(y) + # yr = y[FisherJenksSampled.ids] yr = y[ids] yr[-1] = max(y) # make sure we have the upper bound yr[0] = min(y) # make sure we have the min diff --git a/mapclassify/tests/time_fisher_jenkss.py b/mapclassify/tests/time_fisher_jenkss.py new file mode 100644 index 0000000..7d5d26c --- /dev/null +++ b/mapclassify/tests/time_fisher_jenkss.py @@ -0,0 +1,121 @@ +import random +import timeit +import functools +import warnings + + +try: + import matplotlib.pyplot as plt + + HAS_MATPLOTLIB = True +except ImportError: + HAS_MATPLOTLIB = False + +import mapclassify +import mapclassify.classifiers + +import numpy as np + +try: + import numba + + warnings.warn( + f"""This test is to compare execution times of two alternatives + to the Numba-optimised function (both of which we already + know are far slower). + + Please run {__file__} again in a venv, + in which Numba is not installed. """, + UserWarning, + stacklevel=3, + ) +except ImportError: + pass + + +number_tests = 1 +k = 8 + + +def test_fisher_jenks_means(N, HAS_NUMBA): + data = [random.uniform(1.0, 1000.0) for __ in range(N)] + + if HAS_NUMBA: + func = mapclassify.classifiers._fisher_jenks_means( + np.sort(data).astype("f8"), classes=k + ) + else: + func = mapclassify.classifiers._fisher_jenks_means_without_numpy( + sorted(data), classes=k + ) + + # data = np.sort([random.uniform(1.0, 1000.0) for __ in range(N)]).astype("f8") + + # if HAS_NUMBA: + # func = mapclassify.classifiers._fisher_jenks_means(data, classes=k) + # else: + # func = mapclassify.classifiers._fisher_jenks_means_without_numpy(data, classes=k) + + +def test_mapclassify_classify_fisherjenks(N, HAS_NUMBA): + data = [random.uniform(1.0, 1000.0) for __ in range(N)] + + # This hack avoids changing the interface of the + # FisherJenks class, just for this timing code. + mapclassify.classifiers.HAS_NUMBA = HAS_NUMBA + + mapclassify.classify(y=data, scheme="fisherjenks", k=k) + + +data_sizes = [100, 300, 1000, 2800, 8000] + + +def compare_times(test_runner, descriptions, title): + print(f"{title}\n") + + if HAS_MATPLOTLIB: + fig, ax = plt.subplots(figsize=(8.5, 5), layout="constrained") + + for HAS_NUMBA, description in zip([False, True], descriptions): + times = [] + + for N in data_sizes: + t = timeit.timeit( + functools.partial(test_runner, N=N, HAS_NUMBA=HAS_NUMBA), + number=number_tests, + ) + + print( + f"Time: {t:.3f} seconds, data points: {N} {description}, {number_tests=}" + ) + + times.append(t) + + if HAS_MATPLOTLIB: + ax.plot(data_sizes, times, "o-", label=description) + + if HAS_MATPLOTLIB: + ax.set_xlabel("Number of random data points classified") + ax.set_ylabel("Run time (seconds)") + ax.set_title(title) + ax.legend() + + plt.show() + + +compare_times( + test_fisher_jenks_means, + title="Run times of the proposed function vs the original (excluding MapClassifier overhead)", + descriptions=[ + " _fjm_without_numpy", + " _fisher_jenks_means", + ], +) + +# compare_times( +# test_mapclassify_classify_fisherjenks, +# title="Run times for end user, of the proposed code vs the original (inc MapClassifier overhead)", +# descriptions = ["without Numpy, much less slow, pure python code", +# 'with Numpy, existing "slow pure python" code', +# ], +# )