Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up the "slow", not compiled with numba.jit, _fisher_jenks_means by 4 to 5 times. Saves 3 minutes classifiying 8000 data points. #201

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 100 additions & 3 deletions mapclassify/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,26 @@

FMT = "{:.2f}"


class MockNumpy:
def __init__(self, int_type=None, float_type=None):
self.int32 = int_type or int
self.float32 = float_type or float

self.inf = self.float32("inf")

def zeros(self, dims, dtype=int):
if len(dims) == 1:
zero = dtype(0)
return [zero for __ in range(dims[0])]

return [self.zeros(dims[1:], dtype) for __ in range(dims[0])]

@staticmethod
def delete(arr, index):
return arr[:index] + arr[index + 1 :]


try:
from numba import njit

Expand Down Expand Up @@ -637,6 +657,72 @@ def _fisher_jenks_means(values, classes=5):
return np.delete(kclass, 0)


def _fisher_jenks_means_without_numpy(values, classes=5, np=None):
"""
As for _fisher_jenks_means above, to keep the code as far as possible
exactly the same, except with np passable in as a dependency, and with
matrix[i, j] replaced with matrix[i][j] for speed.


Jenks Optimal (Natural Breaks) algorithm implemented in Python.

Notes
-----

The original Python code comes from here:
http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/
and is based on a JAVA and Fortran code available here:
https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html



"""
if np is None:
np = MockNumpy()

n_data = len(values)
mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32)
mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32)

for j in range(1, classes + 1):
mat1[1][j] = 1
for i in range(2, n_data + 1):
mat2[i][j] = np.inf
v = 0
for _l in range(2, len(values) + 1):
s1 = 0
s2 = 0
w = 0
for m in range(1, _l + 1):
i3 = _l - m + 1
val = values[i3 - 1]
s2 += val * val
s1 += val
w += 1
v = s2 - (s1 * s1) / np.float32(w)
i4 = i3 - 1
if i4 != 0:
for j in range(2, classes + 1):
if mat2[_l][j] >= (v + mat2[i4][j - 1]):
mat1[_l][j] = i3
mat2[_l][j] = v + mat2[i4][j - 1]

mat1[_l][1] = 1
mat2[_l][1] = v

k = len(values)

kclass = np.zeros((classes + 1,), dtype=type(values[0]))
kclass[classes] = values[len(values) - 1]
kclass[0] = values[0]
for countNum in range(classes, 1, -1):
pivot = mat1[k][countNum]
_id = int(pivot - 2)
kclass[countNum - 1] = values[_id]
k = int(pivot - 1)
return np.delete(kclass, 0)


class MapClassifier:
r"""
Abstract class for all map classifications :cite:`Slocum_2009`
Expand Down Expand Up @@ -1926,7 +2012,7 @@ class FisherJenks(MapClassifier):
Parameters
----------

y : numpy.array
y : collections.abc.Iterable[numbers.Real]
:math:`(n,1)`, values to classify.
k : int (default 5)
The number of classes required.
Expand All @@ -1936,7 +2022,7 @@ class FisherJenks(MapClassifier):

yb : numpy.array
:math:`(n,1)`, bin IDs for observations.
bins : numpy.array
bins : collections.abc.Sequence[numbers.Real]
:math:`(k,1)`, the upper bounds of each class.
k : int
The number of classes.
Expand All @@ -1962,8 +2048,9 @@ class FisherJenks(MapClassifier):

def __init__(self, y, k=K):
if not HAS_NUMBA:
self._set_bins = self._set_bins_without_numpy
warnings.warn(
"Numba not installed. Using slow pure python version.",
"Numba not installed. Using a less slow, pure python version.",
UserWarning,
stacklevel=3,
)
Expand All @@ -1981,6 +2068,10 @@ def _set_bins(self):
x = np.sort(self.y).astype("f8")
self.bins = _fisher_jenks_means(x, classes=self.k)

def _set_bins_without_numpy(self):
x = sorted(self.y)
self.bins = np.asarray(_fisher_jenks_means_without_numpy(x, classes=self.k))


class FisherJenksSampled(MapClassifier):
"""
Expand Down Expand Up @@ -2019,14 +2110,20 @@ class FisherJenksSampled(MapClassifier):

"""

ids = None

def __init__(self, y, k=K, pct=0.10, truncate=True):
print(f"Got: {k=}, {pct=}, {truncate=}")
self.k = k
n = y.size

if (pct * n > 1000) and truncate:
pct = 1000.0 / n
# if FisherJenksSampled.ids is None:
# FisherJenksSampled.ids = np.random.randint(0, n, int(n * pct))
ids = np.random.randint(0, n, int(n * pct))
y = np.asarray(y)
# yr = y[FisherJenksSampled.ids]
yr = y[ids]
yr[-1] = max(y) # make sure we have the upper bound
yr[0] = min(y) # make sure we have the min
Expand Down
121 changes: 121 additions & 0 deletions mapclassify/tests/time_fisher_jenkss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import random
import timeit
import functools
import warnings


try:
import matplotlib.pyplot as plt

HAS_MATPLOTLIB = True
except ImportError:
HAS_MATPLOTLIB = False

import mapclassify
import mapclassify.classifiers

import numpy as np

try:
import numba

warnings.warn(
f"""This test is to compare execution times of two alternatives
to the Numba-optimised function (both of which we already
know are far slower).

Please run {__file__} again in a venv,
in which Numba is not installed. """,
UserWarning,
stacklevel=3,
)
except ImportError:
pass


number_tests = 1
k = 8


def test_fisher_jenks_means(N, HAS_NUMBA):
data = [random.uniform(1.0, 1000.0) for __ in range(N)]

if HAS_NUMBA:
func = mapclassify.classifiers._fisher_jenks_means(
np.sort(data).astype("f8"), classes=k
)
else:
func = mapclassify.classifiers._fisher_jenks_means_without_numpy(
sorted(data), classes=k
)

# data = np.sort([random.uniform(1.0, 1000.0) for __ in range(N)]).astype("f8")

# if HAS_NUMBA:
# func = mapclassify.classifiers._fisher_jenks_means(data, classes=k)
# else:
# func = mapclassify.classifiers._fisher_jenks_means_without_numpy(data, classes=k)


def test_mapclassify_classify_fisherjenks(N, HAS_NUMBA):
data = [random.uniform(1.0, 1000.0) for __ in range(N)]

# This hack avoids changing the interface of the
# FisherJenks class, just for this timing code.
mapclassify.classifiers.HAS_NUMBA = HAS_NUMBA

mapclassify.classify(y=data, scheme="fisherjenks", k=k)


data_sizes = [100, 300, 1000, 2800, 8000]


def compare_times(test_runner, descriptions, title):
print(f"{title}\n")

if HAS_MATPLOTLIB:
fig, ax = plt.subplots(figsize=(8.5, 5), layout="constrained")

for HAS_NUMBA, description in zip([False, True], descriptions):
times = []

for N in data_sizes:
t = timeit.timeit(
functools.partial(test_runner, N=N, HAS_NUMBA=HAS_NUMBA),
number=number_tests,
)

print(
f"Time: {t:.3f} seconds, data points: {N} {description}, {number_tests=}"
)

times.append(t)

if HAS_MATPLOTLIB:
ax.plot(data_sizes, times, "o-", label=description)

if HAS_MATPLOTLIB:
ax.set_xlabel("Number of random data points classified")
ax.set_ylabel("Run time (seconds)")
ax.set_title(title)
ax.legend()

plt.show()


compare_times(
test_fisher_jenks_means,
title="Run times of the proposed function vs the original (excluding MapClassifier overhead)",
descriptions=[
" _fjm_without_numpy",
" _fisher_jenks_means",
],
)

# compare_times(
# test_mapclassify_classify_fisherjenks,
# title="Run times for end user, of the proposed code vs the original (inc MapClassifier overhead)",
# descriptions = ["without Numpy, much less slow, pure python code",
# 'with Numpy, existing "slow pure python" code',
# ],
# )