pysal · JamesParrott · Dec 13, 2023 · Dec 13, 2023 · Dec 13, 2023 · Dec 14, 2023
diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py
@@ -60,6 +60,26 @@
 
 FMT = "{:.2f}"
 
+
+class MockNumpy:
+    def __init__(self, int_type=None, float_type=None):
+        self.int32 = int_type or int
+        self.float32 = float_type or float
+
+        self.inf = self.float32("inf")
+
+    def zeros(self, dims, dtype=int):
+        if len(dims) == 1:
+            zero = dtype(0)
+            return [zero for __ in range(dims[0])]
+
+        return [self.zeros(dims[1:], dtype) for __ in range(dims[0])]
+
+    @staticmethod
+    def delete(arr, index):
+        return arr[:index] + arr[index + 1 :]
+
+
 try:
     from numba import njit
 
@@ -637,6 +657,72 @@ def _fisher_jenks_means(values, classes=5):
     return np.delete(kclass, 0)
 
 
+def _fisher_jenks_means_without_numpy(values, classes=5, np=None):
+    """
+    As for _fisher_jenks_means above, to keep the code as far as possible
+    exactly the same, except with np passable in as a dependency, and with
+    matrix[i, j] replaced with matrix[i][j] for speed.
+
+
+    Jenks Optimal (Natural Breaks) algorithm implemented in Python.
+
+    Notes
+    -----
+
+    The original Python code comes from here:
+    http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/
+    and is based on a JAVA and Fortran code available here:
+    https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html
+
+
+
+    """
+    if np is None:
+        np = MockNumpy()
+
+    n_data = len(values)
+    mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32)
+    mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32)
+
+    for j in range(1, classes + 1):
+        mat1[1][j] = 1
+        for i in range(2, n_data + 1):
+            mat2[i][j] = np.inf
+    v = 0
+    for _l in range(2, len(values) + 1):
+        s1 = 0
+        s2 = 0
+        w = 0
+        for m in range(1, _l + 1):
+            i3 = _l - m + 1
+            val = values[i3 - 1]
+            s2 += val * val
+            s1 += val
+            w += 1
+            v = s2 - (s1 * s1) / np.float32(w)
+            i4 = i3 - 1
+            if i4 != 0:
+                for j in range(2, classes + 1):
+                    if mat2[_l][j] >= (v + mat2[i4][j - 1]):
+                        mat1[_l][j] = i3
+                        mat2[_l][j] = v + mat2[i4][j - 1]
+
+        mat1[_l][1] = 1
+        mat2[_l][1] = v
+
+    k = len(values)
+
+    kclass = np.zeros((classes + 1,), dtype=type(values[0]))
+    kclass[classes] = values[len(values) - 1]
+    kclass[0] = values[0]
+    for countNum in range(classes, 1, -1):
+        pivot = mat1[k][countNum]
+        _id = int(pivot - 2)
+        kclass[countNum - 1] = values[_id]
+        k = int(pivot - 1)
+    return np.delete(kclass, 0)
+
+
 class MapClassifier:
     r"""
     Abstract class for all map classifications :cite:`Slocum_2009`
@@ -1926,7 +2012,7 @@ class FisherJenks(MapClassifier):
     Parameters
     ----------
 
-    y : numpy.array
+    y : collections.abc.Iterable[numbers.Real]
         :math:`(n,1)`, values to classify.
     k : int (default 5)
         The number of classes required.
@@ -1936,7 +2022,7 @@ class FisherJenks(MapClassifier):
 
     yb : numpy.array
         :math:`(n,1)`, bin IDs for observations.
-    bins : numpy.array
+    bins : collections.abc.Sequence[numbers.Real]
         :math:`(k,1)`, the upper bounds of each class.
     k : int
         The number of classes.
@@ -1962,8 +2048,9 @@ class FisherJenks(MapClassifier):
 
     def __init__(self, y, k=K):
         if not HAS_NUMBA:
+            self._set_bins = self._set_bins_without_numpy
             warnings.warn(
-                "Numba not installed. Using slow pure python version.",
+                "Numba not installed. Using a less slow, pure python version.",
                 UserWarning,
                 stacklevel=3,
             )
@@ -1981,6 +2068,10 @@ def _set_bins(self):
         x = np.sort(self.y).astype("f8")
         self.bins = _fisher_jenks_means(x, classes=self.k)
 
+    def _set_bins_without_numpy(self):
+        x = sorted(self.y)
+        self.bins = np.asarray(_fisher_jenks_means_without_numpy(x, classes=self.k))
+
 
 class FisherJenksSampled(MapClassifier):
     """
@@ -2019,14 +2110,20 @@ class FisherJenksSampled(MapClassifier):
 
     """
 
+    ids = None
+
     def __init__(self, y, k=K, pct=0.10, truncate=True):
+        print(f"Got: {k=}, {pct=}, {truncate=}")
         self.k = k
         n = y.size
 
         if (pct * n > 1000) and truncate:
             pct = 1000.0 / n
+        # if FisherJenksSampled.ids is None:
+        # FisherJenksSampled.ids = np.random.randint(0, n, int(n * pct))
         ids = np.random.randint(0, n, int(n * pct))
         y = np.asarray(y)
+        # yr = y[FisherJenksSampled.ids]
         yr = y[ids]
         yr[-1] = max(y)  # make sure we have the upper bound
         yr[0] = min(y)  # make sure we have the min

diff --git a/mapclassify/tests/time_fisher_jenkss.py b/mapclassify/tests/time_fisher_jenkss.py
@@ -0,0 +1,121 @@
+import random
+import timeit
+import functools
+import warnings
+
+
+try:
+    import matplotlib.pyplot as plt
+
+    HAS_MATPLOTLIB = True
+except ImportError:
+    HAS_MATPLOTLIB = False
+
+import mapclassify
+import mapclassify.classifiers
+
+import numpy as np
+
+try:
+    import numba
+
+    warnings.warn(
+        f"""This test is to compare execution times of two alternatives 
+        to the Numba-optimised function (both of which we already 
+        know are far slower).  
+
+        Please run {__file__} again in a venv, 
+        in which Numba is not installed. """,
+        UserWarning,
+        stacklevel=3,
+    )
+except ImportError:
+    pass
+
+
+number_tests = 1
+k = 8
+
+
+def test_fisher_jenks_means(N, HAS_NUMBA):
+    data = [random.uniform(1.0, 1000.0) for __ in range(N)]
+
+    if HAS_NUMBA:
+        func = mapclassify.classifiers._fisher_jenks_means(
+            np.sort(data).astype("f8"), classes=k
+        )
+    else:
+        func = mapclassify.classifiers._fisher_jenks_means_without_numpy(
+            sorted(data), classes=k
+        )
+
+    # data = np.sort([random.uniform(1.0, 1000.0) for __ in range(N)]).astype("f8")
+
+    # if HAS_NUMBA:
+    #     func = mapclassify.classifiers._fisher_jenks_means(data, classes=k)
+    # else:
+    #     func = mapclassify.classifiers._fisher_jenks_means_without_numpy(data, classes=k)
+
+
+def test_mapclassify_classify_fisherjenks(N, HAS_NUMBA):
+    data = [random.uniform(1.0, 1000.0) for __ in range(N)]
+
+    # This hack avoids changing the interface of the
+    # FisherJenks class, just for this timing code.
+    mapclassify.classifiers.HAS_NUMBA = HAS_NUMBA
+
+    mapclassify.classify(y=data, scheme="fisherjenks", k=k)
+
+
+data_sizes = [100, 300, 1000, 2800, 8000]
+
+
+def compare_times(test_runner, descriptions, title):
+    print(f"{title}\n")
+
+    if HAS_MATPLOTLIB:
+        fig, ax = plt.subplots(figsize=(8.5, 5), layout="constrained")
+
+    for HAS_NUMBA, description in zip([False, True], descriptions):
+        times = []
+
+        for N in data_sizes:
+            t = timeit.timeit(
+                functools.partial(test_runner, N=N, HAS_NUMBA=HAS_NUMBA),
+                number=number_tests,
+            )
+
+            print(
+                f"Time: {t:.3f} seconds, data points: {N} {description}, {number_tests=}"
+            )
+
+            times.append(t)
+
+        if HAS_MATPLOTLIB:
+            ax.plot(data_sizes, times, "o-", label=description)
+
+    if HAS_MATPLOTLIB:
+        ax.set_xlabel("Number of random data points classified")
+        ax.set_ylabel("Run time (seconds)")
+        ax.set_title(title)
+        ax.legend()
+
+        plt.show()
+
+
+compare_times(
+    test_fisher_jenks_means,
+    title="Run times of the proposed function vs the original (excluding MapClassifier overhead)",
+    descriptions=[
+        " _fjm_without_numpy",
+        " _fisher_jenks_means",
+    ],
+)
+
+# compare_times(
+#     test_mapclassify_classify_fisherjenks,
+#     title="Run times for end user, of the proposed code vs the original (inc MapClassifier overhead)",
+#     descriptions = ["without Numpy, much less slow, pure python code",
+#                     'with Numpy, existing "slow pure python" code',
+#                    ],
+#     )