diff --git a/gemm/__init__.py b/gemm/__init__.py
new file mode 100644
index 0000000..de0b9a7
--- /dev/null
+++ b/gemm/__init__.py
@@ -0,0 +1,44 @@
+# Authors: James Bergstra
+# License: MIT
+
+"""Computes the GEMM routine from the BLAS standard.
+
+BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as
+dense matrix multiplication and accumulation as follows:
+
+C <- alpha A x B + beta C
+
+Here (alpha, beta) are scalars and (A, B, C) are matrices.
+
+This operation is one of the most widely used and most studied kernels in
+high-performance computing, and BLAS libraries such as OpenBLAS, ATLAS, and
+MKL provide highly optimized implementations of it. GEMM performance
+therefore gives a realistic measure of the peak throughput attainable on a
+particular platform.
+
+Note that the GEMM interface does not actually describe an algorithm, and the
+standard does not require particular numerical accuracy. There are sub-cubic
+algorithms (e.g. Strassen), and there are also cubic algorithms that are
+"blocked" to be more cache-friendly. I believe that OpenBLAS and ATLAS use
+blocked cubic algorithms; based on the asymptotic GFLOP/s attributed to MKL,
+I would guess that it uses a blocked cubic algorithm as well.
+
+My hope is that this benchmark can be used to develop fast, readable GEMM
+implementations. I'm curious, for example, whether a readable, blocked
+algorithm written in pure Python could be compiled into a reasonably fast
+implementation.
+
+"""
+
+import numpy as np
+
+
+def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64,
+             alpha=1.5,
+             beta=0.5):
+    rng = np.random.RandomState(seed)
+    A = np.asarray(rng.normal(size=(M, K)), dtype=dtype)
+    B = np.asarray(rng.normal(size=(K, N)), dtype=dtype)
+    C = np.asarray(rng.normal(size=(M, N)), dtype=dtype)
+    return (alpha, A, B, beta, C), {}
+
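+
+# -- A rough sanity check: a minimal sketch that assumes the benchmark
+#    harness calls each entry as f(*args, **kwargs) with the pair returned by
+#    make_env(). The helper name and the default sizes are illustrative only;
+#    it is not registered as a benchmark anywhere.
+def check_gemm_against_numpy(gemm, M=64, N=64, K=64):
+    (alpha, A, B, beta, C), kwargs = make_env(M=M, N=N, K=K)
+    expected = alpha * np.dot(A, B) + beta * C
+    result = gemm(alpha, A, B, beta, C.copy(), **kwargs)
+    return np.allclose(result, expected)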
diff --git a/gemm/gemm_cython.pyx b/gemm/gemm_cython.pyx
new file mode 100644
index 0000000..2f597be
--- /dev/null
+++ b/gemm/gemm_cython.pyx
@@ -0,0 +1,32 @@
+
+# Authors: Jake Vanderplas, Olivier Grisel
+# License: MIT
+
+import numpy as np
+cimport cython
+from libc.math cimport sqrt
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def gemm_cython_for_loops(
+        double alpha,
+        double[:, ::1] A,
+        double[:, ::1] B,
+        double beta,
+        double[:, ::1] C,
+        ):
+    cdef int M = C.shape[0]
+    cdef int N = C.shape[1]
+    cdef int K = A.shape[1]
+    cdef double tmp, d
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                d += A[i, k] * B[k, j]
+            C[i, j] = alpha * d + beta * C[i, j]
+
+
+benchmarks = (
+    gemm_cython_for_loops,
+)
diff --git a/gemm/gemm_numba.py b/gemm/gemm_numba.py
new file mode 100644
index 0000000..702ca38
--- /dev/null
+++ b/gemm/gemm_numba.py
@@ -0,0 +1,11 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from numba import autojit
+
+
+benchmarks = (
+    ("gemm_numba_nested_for_loops",
+     autojit(gemm_python.gemm_python_nested_for_loops)),
+)
diff --git a/gemm/gemm_parakeet.py b/gemm/gemm_parakeet.py
new file mode 100644
index 0000000..8c189d1
--- /dev/null
+++ b/gemm/gemm_parakeet.py
@@ -0,0 +1,13 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from parakeet import jit
+
+
+benchmarks = (
+    ("gemm_parakeet_nested_for_loops",
+     jit(gemm_python.gemm_python_nested_for_loops)),
+    ("gemm_parakeet_inner_numpy",
+     jit(gemm_python.gemm_python_inner_numpy)),
+)
diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
new file mode 100644
index 0000000..235dbe9
--- /dev/null
+++ b/gemm/gemm_pyopencl.py
@@ -0,0 +1,10 @@
+# Author: James Bergstra
+# License: MIT
+
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
+from pybench_pyopencl import gemm_pyopencl
+
+benchmarks = (
+    gemm_pyopencl.gemm_pyopencl_cpu,
+)
+
diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py
new file mode 100644
index 0000000..9a7ce52
--- /dev/null
+++ b/gemm/gemm_python.py
@@ -0,0 +1,45 @@
+# Authors: James Bergstra
+# License: MIT
+
+import numpy as np
+
+
+# -- too slow to run directly, but good for JIT by other systems
+def gemm_python_nested_for_loops(alpha, A, B, beta, C):
+    M, N = C.shape
+    K = A.shape[1]
+    #"omp parallel for private(j, d, k, tmp)"
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                tmp = A[i, k] * B[k, j]
+                d += tmp
+            C[i, j] = alpha * d + beta * C[i, j]
+    return C
+
+
+def gemm_python_inner_numpy(alpha, A, B, beta, C):
+    M, N = C.shape
+    for i in xrange(M):
+        for j in xrange(N):
+            C[i, j] *= beta
+            C[i, j] += alpha * np.dot(A[i, :], B[:, j])
+    return C
+
+
+def gemm_python_broadcast_numpy(alpha, A, B, beta, C):
+    return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C
+
+
+def gemm_python_numpy_dot(alpha, A, B, beta, C):
+    C *= beta
+    C += alpha * np.dot(A, B)
+    return C
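+
+
+# -- A minimal sketch of the "blocked" loop nest discussed in gemm/__init__.py:
+#    the same computation as gemm_python_nested_for_loops, but tiled so that
+#    each innermost pass touches a small, cache-sized piece of A, B and C.
+#    The block size of 64 is an arbitrary illustrative choice, and the
+#    function is deliberately not added to `benchmarks` below.
+def gemm_python_blocked_for_loops(alpha, A, B, beta, C, block=64):
+    M, N = C.shape
+    K = A.shape[1]
+    # scale C by beta once, then accumulate alpha * A x B tile by tile
+    for i in range(M):
+        for j in range(N):
+            C[i, j] *= beta
+    for i0 in range(0, M, block):
+        for j0 in range(0, N, block):
+            for k0 in range(0, K, block):
+                for i in range(i0, min(i0 + block, M)):
+                    for j in range(j0, min(j0 + block, N)):
+                        d = 0.0
+                        for k in range(k0, min(k0 + block, K)):
+                            d += A[i, k] * B[k, j]
+                        C[i, j] += alpha * d
+    return C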
+
+
+benchmarks = (
+    gemm_python_inner_numpy,
+    gemm_python_broadcast_numpy,
+    gemm_python_numpy_dot,
+)
diff --git a/gemm/gemm_theano.py b/gemm/gemm_theano.py
new file mode 100644
index 0000000..e6948bd
--- /dev/null
+++ b/gemm/gemm_theano.py
@@ -0,0 +1,45 @@
+# Authors: James Bergstra
+# License: MIT
+import theano
+import theano.tensor as TT
+
+
+def gemm_theano_tensor_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.sum(A[:, None, :] * B.T[None, :, :], axis=2) + beta * C
+    name = 'gemm_theano_broadcast_' + dtype
+    Cin = theano.In(C, mutable=True, borrow=True)
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+def gemm_theano_blas_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.dot(A, B) + beta * C
+    Cin = theano.In(C, mutable=True, borrow=True)
+    name = 'gemm_theano_blas_' + dtype
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+benchmarks = (
+    #gemm_theano_tensor_prepare('float32'),
+    gemm_theano_tensor_prepare('float64'),
+    #gemm_theano_blas_prepare('float32'),
+    gemm_theano_blas_prepare('float64'),
+)
+
diff --git a/pairwise/__init__.py b/pairwise/__init__.py
index ce70b6f..547724b 100644
--- a/pairwise/__init__.py
+++ b/pairwise/__init__.py
@@ -1,6 +1,16 @@
 # Authors: Olivier Grisel
 # License: MIT
 
+"""Computes the Euclidean distance between each pair of rows in a matrix.
+
+In LaTeX:
+    Y[i, j] = \sqrt{ \sum_k (A[i, k] - B[j, k])^2 }
+
+This computation is a core routine of many machine learning algorithms that
+rely on neighbourhood computations.
+
+"""
+
 import numpy as np
diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py
index 6e4e40d..983195c 100644
--- a/pairwise/pairwise_pyopencl.py
+++ b/pairwise/pairwise_pyopencl.py
@@ -1,105 +1,17 @@
-# Authors: James Bergstra
+# Author: James Bergstra
 # License: MIT
 
 import numpy as np
-import time
-import pyopencl as cl
-import numpy
 
-mf = cl.mem_flags
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
+from pybench_pyopencl import pairwise_pyopencl
 
-PROFILING = 0
-
-ctx = cl.create_some_context()
-if PROFILING:
-    queue = cl.CommandQueue(
-        ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-else:
-    queue = cl.CommandQueue(ctx)
-
-_cache = {}
-
-def pairwise_pyopencl_cpu_prepare(shp, dtype):
-    N, D = shp
-    ctype = {
-        'float32': 'float',
-        'float64': 'double',
-    }[str(dtype)]
-
-    odd_d = "" if 0 == D % 2 else """
-            __global %(ctype)s * a1 = (__global %(ctype)s*) (a);
-            %(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1];
-            buf.s0 += diff * diff;
-            """
-
-    prg = cl.Program(ctx, """
-        __kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c)
-        {
-            for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-            {
-                for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-                {
-                    if (n0 < m0) continue;
-                    __global %(ctype)s2 *an = a + n0 * %(D)s / 2;
-                    __global %(ctype)s2 *am = a + m0 * %(D)s / 2;
-                    %(ctype)s2 buf = 0;
-                    for (int d = 0; d < %(D)s/2; ++d)
-                    {
-                        %(ctype)s2 diff = am[d] - an[d];
-                        buf += diff * diff;
-                    }
-                    %(odd_d)s;
-                    c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1);
-                }
-            }
-        }
-        __kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c)
-        {
-            for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-            {
-                for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-                {
-                    if (n0 >= m0) continue;
-                    c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0];
-                }
-            }
-        }
-        """ % locals()).build()
-
-    return prg.lower, prg.upper
-
-
-comptimes = []
 def pairwise_pyopencl_cpu(data):
-    data = np.asarray(data, order='C')
-    N, D = data.shape
-    try:
-        lower, upper = _cache[(data.shape, data.dtype)]
-    except:
-        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
-        _cache[(data.shape, data.dtype)] = lower, upper
-    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
-    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
-    try:
-        rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
-                                        offset=0, shape=(N, N), dtype=data.dtype)
-        need_copy = False
-    except TypeError: #OSX's OCL needs this?
-        rval = np.empty((N, N), dtype=data.dtype)
-        need_copy = True
-    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
-    upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
-    if need_copy:
-        cl.enqueue_copy(queue, rval, dest_buf)
-    else:
-        queue.finish()
-    if PROFILING:
-        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
-        print 'computation time', min(comptimes)
-    return rval
-
+    M, K = data.shape
+    out = np.empty((M, M), dtype=data.dtype, order='C')
+    return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)
 
 benchmarks = (
-    pairwise_pyopencl_cpu,
+    pairwise_pyopencl_cpu,
 )
+
diff --git a/pairwise/pairwise_python.py b/pairwise/pairwise_python.py
index 21698fe..c792783 100644
--- a/pairwise/pairwise_python.py
+++ b/pairwise/pairwise_python.py
@@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data):
     return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2))
 
 
-def pairwise_python_numpy_dot(data):
-    X_norm_2 = (data ** 2).sum(axis=1)
-    dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T))
-    return dists
-
-
 benchmarks = (
     pairwise_python_nested_for_loops,
     pairwise_python_inner_numpy,
     pairwise_python_broadcast_numpy,
-    pairwise_python_numpy_dot,
 )
diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py
index 367f545..431ef0d 100644
--- a/pairwise/pairwise_theano.py
+++ b/pairwise/pairwise_theano.py
@@ -18,21 +18,10 @@ def pairwise_theano_tensor_prepare(dtype):
     return rval
 
 
-def pairwise_theano_blas_prepare(dtype):
-    X = TT.matrix(dtype=str(dtype))
-    X_norm_2 = (X ** 2).sum(axis=1)
-    dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T))
-    name = 'pairwise_theano_blas_' + dtype
-    rval = theano.function([X],
-                           theano.Out(dists, borrow=True),
-                           allow_input_downcast=True, name=name)
-    rval.__name__ = name
-    return rval
-
-
 benchmarks = (
-    pairwise_theano_tensor_prepare('float32'),
+    # -- disabling float32 to match the precision of the other
+    # implementations (assuming that the benchmark problem is
+    # to carry out computations in double precision).
+    # pairwise_theano_tensor_prepare('float32'),
     pairwise_theano_tensor_prepare('float64'),
-    pairwise_theano_blas_prepare('float32'),
-    pairwise_theano_blas_prepare('float64'),
 )
diff --git a/run_benchmarks.py b/run_benchmarks.py
index 4163fbe..d079a97 100644
--- a/run_benchmarks.py
+++ b/run_benchmarks.py
@@ -7,6 +7,7 @@ from collections import OrderedDict
 except:
     from ordereddict import OrderedDict
 
+import argparse
 import json
 import os
 import traceback