Skip to content

Refactoring pairwise into pairwise and gemm #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
44 changes: 44 additions & 0 deletions gemm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Authors: James Bergstra
# License: MIT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a description of the benchmark and some motivation in a the module level docstring.


"""Computes the GEMM routine from the BLAS standard.

BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as
dense matrix multiplication and accumulation as follows:

C <- alpha A x B + beta C

Here (alpha, beta) are scalars and (A, B, C) are matrices.

This operation is one of the most widely-used and most-studied kernels in
high-performance computing, and BLAS implementations such as OpenBLAS, ATLAS,
and MKL provide highly optimized implementations of this operation. GEMM
implementations provide a real-life measure of peak performance on a
particular platform.

Note that the GEMM interface does not actually describe an algorithm, and the
standard does not require particular numerical accuracy. There are sub-cubic
algorithms (e.g. Strassen), and there are also cubic algorithms that are
"blocked" to be more cache-friendly. I believe that OpenBLAS and ATLAS use
blocked cubic algorithms, based on the asymptotic GFLOP/s attributed to MKL,
I would guess it uses blocked cubic algorithms too.

My hope with this benchmark is that it can be used to develop fast, readable
GEMM implementations. I'm curious, for example, if a readable, blocked
algorithm in pure Python could be compiled to a reasonable-performing
implementation.

"""

import numpy as np


def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64,
alpha=1.5,
beta=0.5):
rng = np.random.RandomState(seed)
A = np.asarray(rng.normal(size=(M, K)), dtype=dtype)
B = np.asarray(rng.normal(size=(K, N)), dtype=dtype)
C = np.asarray(rng.normal(size=(M, N)), dtype=dtype)
return (alpha, A, B, beta, C), {}

32 changes: 32 additions & 0 deletions gemm/gemm_cython.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Authors: Jake Vanderplas, Olivier Grisel
# License: MIT

import numpy as np
cimport cython
from libc.math cimport sqrt

@cython.boundscheck(False)
@cython.wraparound(False)
def gemm_cython_for_loops(
double alpha,
double[:, ::1] A,
double[:, ::1] B,
double beta,
double[:, ::1] C,
):
cdef int M = C.shape[0]
cdef int N = C.shape[1]
cdef int K = A.shape[1]
cdef double tmp, d
for i in range(M):
for j in range(N):
d = 0.0
for k in range(K):
d += A[i, k] * B[k, j]
C[i, j] = alpha * d + beta * C[i, j]


benchmarks = (
gemm_cython_for_loops,
)
11 changes: 11 additions & 0 deletions gemm/gemm_numba.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Authors: Olivier Grisel
# License: MIT

from gemm import gemm_python
from numba import autojit


benchmarks = (
("gemm_numba_nested_for_loops",
autojit(gemm_python.gemm_python_nested_for_loops)),
)
13 changes: 13 additions & 0 deletions gemm/gemm_parakeet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Authors: Olivier Grisel
# License: MIT

from gemm import gemm_python
from parakeet import jit


benchmarks = (
("gemm_parakeet_nested_for_loops",
jit(gemm_python.gemm_python_nested_for_loops)),
("gemm_parakeet_inner_numpy",
jit(gemm_python.gemm_python_inner_numpy)),
)
10 changes: 10 additions & 0 deletions gemm/gemm_pyopencl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Author: James Bergstra
# License: MIT

# -- https://github.com/jaberg/python-benchmarks-pyopencl
from pybench_pyopencl import gemm_pyopencl

benchmarks = (
gemm_pyopencl.gemm_pyopencl_cpu,
)

45 changes: 45 additions & 0 deletions gemm/gemm_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Authors: James Bergstra
# License: MIT

import numpy as np


# -- too slow to run directly, but good for JIT by other systems
def gemm_python_nested_for_loops(alpha, A, B, beta, C):
M, N = C.shape
K = A.shape[1]
#"omp parallel for private(j, d, k, tmp)"
for i in range(M):
for j in range(N):
d = 0.0
for k in range(K):
tmp = A[i, k] * B[j, k]
d += tmp
C[i, j] = alpha * d + beta * C[i, j]
return C


def gemm_python_inner_numpy(alpha, A, B, beta, C):
M, N = C.shape
for i in xrange(M):
for j in xrange(N):
C[i, j] *= beta
C[i, j] += alpha * np.dot(A[i, :], B[:, j])
return C


def gemm_python_broadcast_numpy(alpha, A, B, beta, C):
return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C


def gemm_python_numpy_dot(alpha, A, B, beta, C):
C *= beta
C += alpha * np.dot(A, B)
return C


benchmarks = (
gemm_python_inner_numpy,
gemm_python_broadcast_numpy,
gemm_python_numpy_dot,
)
45 changes: 45 additions & 0 deletions gemm/gemm_theano.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Authors: James Bergstra
# License: MIT
import theano
import theano.tensor as TT


def gemm_theano_tensor_prepare(dtype):
alpha = TT.scalar(dtype=str(dtype))
beta = TT.scalar(dtype=str(dtype))
A = TT.matrix(dtype=str(dtype))
B = TT.matrix(dtype=str(dtype))
C = TT.matrix(dtype=str(dtype))
Z = alpha * TT.sum(A[:, None, :] * B, axis=2) + beta * C
name = 'gemm_theano_broadcast_' + dtype
Cin = theano.In(C, mutable=True, borrow=True)
rval = theano.function([alpha, A, B, beta, Cin],
theano.Out(Z, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


def gemm_theano_blas_prepare(dtype):
alpha = TT.scalar(dtype=str(dtype))
beta = TT.scalar(dtype=str(dtype))
A = TT.matrix(dtype=str(dtype))
B = TT.matrix(dtype=str(dtype))
C = TT.matrix(dtype=str(dtype))
Z = alpha * TT.dot(A, B) + beta * C
Cin = theano.In(C, mutable=True, borrow=True)
name = 'gemm_theano_blas_' + dtype
rval = theano.function([alpha, A, B, beta, Cin],
theano.Out(Z, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


benchmarks = (
#gemm_theano_tensor_prepare('float32'),
gemm_theano_tensor_prepare('float64'),
#gemm_theano_blas_prepare('float32'),
gemm_theano_blas_prepare('float64'),
)

10 changes: 10 additions & 0 deletions pairwise/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Authors: Olivier Grisel
# License: MIT

"""Computes the Euclidean distance between each pair of rows in a matrix.

In LaTeX:
Y[i, j] = sqrt{ \sum_k (A[i, k] - B[j, k])^2 }

This computation is a core routine of many machine learning algorithms that
rely on neighbourhood computations.

"""

import numpy as np


Expand Down
104 changes: 8 additions & 96 deletions pairwise/pairwise_pyopencl.py
Original file line number Diff line number Diff line change
@@ -1,105 +1,17 @@
# Authors: James Bergstra
# Author: James Bergstra
# License: MIT

import numpy as np
import time
import pyopencl as cl
import numpy

mf = cl.mem_flags
# -- https://github.com/jaberg/python-benchmarks-pyopencl
from pybench_pyopencl import pairwise_pyopencl

PROFILING = 0

ctx = cl.create_some_context()
if PROFILING:
queue = cl.CommandQueue(
ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
else:
queue = cl.CommandQueue(ctx)

_cache = {}

def pairwise_pyopencl_cpu_prepare(shp, dtype):
N, D = shp
ctype = {
'float32': 'float',
'float64': 'double',
}[str(dtype)]

odd_d = "" if 0 == D % 2 else """
__global %(ctype)s * a1 = (__global %(ctype)s*) (a);
%(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1];
buf.s0 += diff * diff;
"""

prg = cl.Program(ctx, """
__kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c)
{
for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
{
for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
{
if (n0 < m0) continue;
__global %(ctype)s2 *an = a + n0 * %(D)s / 2;
__global %(ctype)s2 *am = a + m0 * %(D)s / 2;
%(ctype)s2 buf = 0;
for (int d = 0; d < %(D)s/2; ++d)
{
%(ctype)s2 diff = am[d] - an[d];
buf += diff * diff;
}
%(odd_d)s;
c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1);
}
}
}
__kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c)
{
for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
{
for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
{
if (n0 >= m0) continue;
c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0];
}
}
}
""" % locals()).build()

return prg.lower, prg.upper


comptimes = []
def pairwise_pyopencl_cpu(data):
data = np.asarray(data, order='C')
N, D = data.shape
try:
lower, upper = _cache[(data.shape, data.dtype)]
except:
lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
_cache[(data.shape, data.dtype)] = lower, upper
data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
try:
rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
offset=0, shape=(N, N), dtype=data.dtype)
need_copy = False
except TypeError: #OSX's OCL needs this?
rval = np.empty((N, N), dtype=data.dtype)
need_copy = True
lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
if need_copy:
cl.enqueue_copy(queue, rval, dest_buf)
else:
queue.finish()
if PROFILING:
comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
print 'computation time', min(comptimes)
return rval

M, K = data.shape
out = np.empty((M, M), dtype=data.dtype, order='C')
return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)

benchmarks = (
pairwise_pyopencl_cpu,
pairwise_pyopencl_cpu,
)

7 changes: 0 additions & 7 deletions pairwise/pairwise_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data):
return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2))


def pairwise_python_numpy_dot(data):
X_norm_2 = (data ** 2).sum(axis=1)
dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T))
return dists


benchmarks = (
pairwise_python_nested_for_loops,
pairwise_python_inner_numpy,
pairwise_python_broadcast_numpy,
pairwise_python_numpy_dot,
)
19 changes: 4 additions & 15 deletions pairwise/pairwise_theano.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,10 @@ def pairwise_theano_tensor_prepare(dtype):
return rval


def pairwise_theano_blas_prepare(dtype):
X = TT.matrix(dtype=str(dtype))
X_norm_2 = (X ** 2).sum(axis=1)
dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T))
name = 'pairwise_theano_blas_' + dtype
rval = theano.function([X],
theano.Out(dists, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


benchmarks = (
pairwise_theano_tensor_prepare('float32'),
# -- disabling float32 to match the precision of the other
# implementations (assuming that the benchmark problem is
# to carry out computations in double precision).
# pairwise_theano_tensor_prepare('float32'),
pairwise_theano_tensor_prepare('float64'),
pairwise_theano_blas_prepare('float32'),
pairwise_theano_blas_prepare('float64'),
)
1 change: 1 addition & 0 deletions run_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from collections import OrderedDict
except:
from ordereddict import OrderedDict
import argparse
import json
import os
import traceback
Expand Down