diff --git a/gemm/__init__.py b/gemm/__init__.py
new file mode 100644
index 0000000..de0b9a7
--- /dev/null
+++ b/gemm/__init__.py
@@ -0,0 +1,44 @@
+# Authors: James Bergstra
+# License: MIT
+
+"""Computes the GEMM routine from the BLAS standard.
+
+BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as
+dense matrix multiplication and accumulation as follows:
+
+C <- alpha A x B + beta C
+
+Here (alpha, beta) are scalars and (A, B, C) are matrices.
+
+This operation is one of the most widely used and most studied kernels in
+high-performance computing, and BLAS libraries such as OpenBLAS, ATLAS, and
+MKL provide highly optimized implementations of it. GEMM performance
+therefore gives a realistic measure of the peak throughput attainable on a
+particular platform.
+
+Note that the GEMM interface does not actually describe an algorithm, and the
+standard does not require particular numerical accuracy. There are sub-cubic
+algorithms (e.g. Strassen), and there are also cubic algorithms that are
+"blocked" to be more cache-friendly. I believe that OpenBLAS and ATLAS use
+blocked cubic algorithms; based on the asymptotic GFLOP/s attributed to MKL,
+I would guess that it uses a blocked cubic algorithm as well.
+
+My hope is that this benchmark can be used to develop fast, readable GEMM
+implementations. I'm curious, for example, whether a readable, blocked
+algorithm written in pure Python could be compiled into a reasonably fast
+implementation.
+
+"""
+
+import numpy as np
+
+
+def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64,
+             alpha=1.5,
+             beta=0.5):
+    rng = np.random.RandomState(seed)
+    A = np.asarray(rng.normal(size=(M, K)), dtype=dtype)
+    B = np.asarray(rng.normal(size=(K, N)), dtype=dtype)
+    C = np.asarray(rng.normal(size=(M, N)), dtype=dtype)
+    return (alpha, A, B, beta, C), {}
+
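+
+# -- A rough sanity check: a minimal sketch that assumes the benchmark
+#    harness calls each entry as f(*args, **kwargs) with the pair returned by
+#    make_env(). The helper name and the default sizes are illustrative only;
+#    it is not registered as a benchmark anywhere.
+def check_gemm_against_numpy(gemm, M=64, N=64, K=64):
+    (alpha, A, B, beta, C), kwargs = make_env(M=M, N=N, K=K)
+    expected = alpha * np.dot(A, B) + beta * C
+    result = gemm(alpha, A, B, beta, C.copy(), **kwargs)
+    return np.allclose(result, expected)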
diff --git a/gemm/gemm_cython.pyx b/gemm/gemm_cython.pyx
new file mode 100644
index 0000000..2f597be
--- /dev/null
+++ b/gemm/gemm_cython.pyx
@@ -0,0 +1,32 @@
+
+# Authors: Jake Vanderplas, Olivier Grisel
+# License: MIT
+
+import numpy as np
+cimport cython
+from libc.math cimport sqrt
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def gemm_cython_for_loops(
+        double alpha,
+        double[:, ::1] A,
+        double[:, ::1] B,
+        double beta,
+        double[:, ::1] C,
+        ):
+    cdef int M = C.shape[0]
+    cdef int N = C.shape[1]
+    cdef int K = A.shape[1]
+    cdef double tmp, d
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                d += A[i, k] * B[k, j]
+            C[i, j] = alpha * d + beta * C[i, j]
+
+
+benchmarks = (
+    gemm_cython_for_loops,
+)
diff --git a/gemm/gemm_numba.py b/gemm/gemm_numba.py
new file mode 100644
index 0000000..702ca38
--- /dev/null
+++ b/gemm/gemm_numba.py
@@ -0,0 +1,11 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from numba import autojit
+
+
+benchmarks = (
+    ("gemm_numba_nested_for_loops",
+     autojit(gemm_python.gemm_python_nested_for_loops)),
+)
diff --git a/gemm/gemm_parakeet.py b/gemm/gemm_parakeet.py
new file mode 100644
index 0000000..8c189d1
--- /dev/null
+++ b/gemm/gemm_parakeet.py
@@ -0,0 +1,13 @@
+# Authors: Olivier Grisel
+# License: MIT
+
+from gemm import gemm_python
+from parakeet import jit
+
+
+benchmarks = (
+    ("gemm_parakeet_nested_for_loops",
+     jit(gemm_python.gemm_python_nested_for_loops)),
+    ("gemm_parakeet_inner_numpy",
+     jit(gemm_python.gemm_python_inner_numpy)),
+)
diff --git a/gemm/gemm_pyopencl.py b/gemm/gemm_pyopencl.py
new file mode 100644
index 0000000..235dbe9
--- /dev/null
+++ b/gemm/gemm_pyopencl.py
@@ -0,0 +1,10 @@
+# Author: James Bergstra
+# License: MIT
+
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
+from pybench_pyopencl import gemm_pyopencl
+
+benchmarks = (
+    gemm_pyopencl.gemm_pyopencl_cpu,
+)
+
diff --git a/gemm/gemm_python.py b/gemm/gemm_python.py
new file mode 100644
index 0000000..9a7ce52
--- /dev/null
+++ b/gemm/gemm_python.py
@@ -0,0 +1,45 @@
+# Authors: James Bergstra
+# License: MIT
+
+import numpy as np
+
+
+# -- too slow to run directly, but good for JIT by other systems
+def gemm_python_nested_for_loops(alpha, A, B, beta, C):
+    M, N = C.shape
+    K = A.shape[1]
+    #"omp parallel for private(j, d, k, tmp)"
+    for i in range(M):
+        for j in range(N):
+            d = 0.0
+            for k in range(K):
+                tmp = A[i, k] * B[k, j]
+                d += tmp
+            C[i, j] = alpha * d + beta * C[i, j]
+    return C
+
+
+def gemm_python_inner_numpy(alpha, A, B, beta, C):
+    M, N = C.shape
+    for i in xrange(M):
+        for j in xrange(N):
+            C[i, j] *= beta
+            C[i, j] += alpha * np.dot(A[i, :], B[:, j])
+    return C
+
+
+def gemm_python_broadcast_numpy(alpha, A, B, beta, C):
+    return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C
+
+
+def gemm_python_numpy_dot(alpha, A, B, beta, C):
+    C *= beta
+    C += alpha * np.dot(A, B)
+    return C
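+
+
+# -- A minimal sketch of the "blocked" loop nest discussed in gemm/__init__.py:
+#    the same computation as gemm_python_nested_for_loops, but tiled so that
+#    each innermost pass touches a small, cache-sized piece of A, B and C.
+#    The block size of 64 is an arbitrary illustrative choice, and the
+#    function is deliberately not added to `benchmarks` below.
+def gemm_python_blocked_for_loops(alpha, A, B, beta, C, block=64):
+    M, N = C.shape
+    K = A.shape[1]
+    # scale C by beta once, then accumulate alpha * A x B tile by tile
+    for i in range(M):
+        for j in range(N):
+            C[i, j] *= beta
+    for i0 in range(0, M, block):
+        for j0 in range(0, N, block):
+            for k0 in range(0, K, block):
+                for i in range(i0, min(i0 + block, M)):
+                    for j in range(j0, min(j0 + block, N)):
+                        d = 0.0
+                        for k in range(k0, min(k0 + block, K)):
+                            d += A[i, k] * B[k, j]
+                        C[i, j] += alpha * d
+    return C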
+
+
+benchmarks = (
+    gemm_python_inner_numpy,
+    gemm_python_broadcast_numpy,
+    gemm_python_numpy_dot,
+)
diff --git a/gemm/gemm_theano.py b/gemm/gemm_theano.py
new file mode 100644
index 0000000..e6948bd
--- /dev/null
+++ b/gemm/gemm_theano.py
@@ -0,0 +1,45 @@
+# Authors: James Bergstra
+# License: MIT
+import theano
+import theano.tensor as TT
+
+
+def gemm_theano_tensor_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.sum(A[:, None, :] * B.T[None, :, :], axis=2) + beta * C
+    name = 'gemm_theano_broadcast_' + dtype
+    Cin = theano.In(C, mutable=True, borrow=True)
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+def gemm_theano_blas_prepare(dtype):
+    alpha = TT.scalar(dtype=str(dtype))
+    beta = TT.scalar(dtype=str(dtype))
+    A = TT.matrix(dtype=str(dtype))
+    B = TT.matrix(dtype=str(dtype))
+    C = TT.matrix(dtype=str(dtype))
+    Z = alpha * TT.dot(A, B) + beta * C
+    Cin = theano.In(C, mutable=True, borrow=True)
+    name = 'gemm_theano_blas_' + dtype
+    rval = theano.function([alpha, A, B, beta, Cin],
+                           theano.Out(Z, borrow=True),
+                           allow_input_downcast=True, name=name)
+    rval.__name__ = name
+    return rval
+
+
+benchmarks = (
+    #gemm_theano_tensor_prepare('float32'),
+    gemm_theano_tensor_prepare('float64'),
+    #gemm_theano_blas_prepare('float32'),
+    gemm_theano_blas_prepare('float64'),
+)
+
diff --git a/pairwise/__init__.py b/pairwise/__init__.py
index ce70b6f..547724b 100644
--- a/pairwise/__init__.py
+++ b/pairwise/__init__.py
@@ -1,6 +1,16 @@
 # Authors: Olivier Grisel
 # License: MIT
 
+"""Computes the Euclidean distance between each pair of rows in a matrix.
+
+In LaTeX:
+    Y[i, j] = \sqrt{ \sum_k (A[i, k] - B[j, k])^2 }
+
+This computation is a core routine of many machine learning algorithms that
+rely on neighbourhood computations.
+
+"""
+
 import numpy as np
diff --git a/pairwise/pairwise_pyopencl.py b/pairwise/pairwise_pyopencl.py
index 6e4e40d..983195c 100644
--- a/pairwise/pairwise_pyopencl.py
+++ b/pairwise/pairwise_pyopencl.py
@@ -1,105 +1,17 @@
-# Authors: James Bergstra
+# Author: James Bergstra
 # License: MIT
 
 import numpy as np
-import time
-import pyopencl as cl
-import numpy
 
-mf = cl.mem_flags
+# -- https://github.com/jaberg/python-benchmarks-pyopencl
+from pybench_pyopencl import pairwise_pyopencl
 
-PROFILING = 0
-
-ctx = cl.create_some_context()
-if PROFILING:
-    queue = cl.CommandQueue(
-        ctx,
-        properties=cl.command_queue_properties.PROFILING_ENABLE)
-else:
-    queue = cl.CommandQueue(ctx)
-
-_cache = {}
-
-def pairwise_pyopencl_cpu_prepare(shp, dtype):
-    N, D = shp
-    ctype = {
-        'float32': 'float',
-        'float64': 'double',
-    }[str(dtype)]
-
-    odd_d = "" if 0 == D % 2 else """
-            __global %(ctype)s * a1 = (__global %(ctype)s*) (a);
-            %(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1];
-            buf.s0 += diff * diff;
-            """
-
-    prg = cl.Program(ctx, """
-        __kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c)
-        {
-            for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-            {
-                for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-                {
-                    if (n0 < m0) continue;
-                    __global %(ctype)s2 *an = a + n0 * %(D)s / 2;
-                    __global %(ctype)s2 *am = a + m0 * %(D)s / 2;
-                    %(ctype)s2 buf = 0;
-                    for (int d = 0; d < %(D)s/2; ++d)
-                    {
-                        %(ctype)s2 diff = am[d] - an[d];
-                        buf += diff * diff;
-                    }
-                    %(odd_d)s;
-                    c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1);
-                }
-            }
-        }
-        __kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c)
-        {
-            for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
-            {
-                for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
-                {
-                    if (n0 >= m0) continue;
-                    c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0];
-                }
-            }
-        }
-        """ % locals()).build()
-
-    return prg.lower, prg.upper
-
-
-comptimes = []
 def pairwise_pyopencl_cpu(data):
-    data = np.asarray(data, order='C')
-    N, D = data.shape
-    try:
-        lower, upper = _cache[(data.shape, data.dtype)]
-    except:
-        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
-        _cache[(data.shape, data.dtype)] = lower, upper
-    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
-    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
-    try:
-        rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
-                                        offset=0, shape=(N, N), dtype=data.dtype)
-        need_copy = False
-    except TypeError: #OSX's OCL needs this?
-        rval = np.empty((N, N), dtype=data.dtype)
-        need_copy = True
-    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
-    upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
-    if need_copy:
-        cl.enqueue_copy(queue, rval, dest_buf)
-    else:
-        queue.finish()
-    if PROFILING:
-        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
-        print 'computation time', min(comptimes)
-    return rval
-
+    M, K = data.shape
+    out = np.empty((M, M), dtype=data.dtype, order='C')
+    return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)
 
 benchmarks = (
-    pairwise_pyopencl_cpu,
+    pairwise_pyopencl_cpu,
 )
+
diff --git a/pairwise/pairwise_python.py b/pairwise/pairwise_python.py
index 21698fe..c792783 100644
--- a/pairwise/pairwise_python.py
+++ b/pairwise/pairwise_python.py
@@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data):
     return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2))
 
 
-def pairwise_python_numpy_dot(data):
-    X_norm_2 = (data ** 2).sum(axis=1)
-    dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T))
-    return dists
-
-
 benchmarks = (
     pairwise_python_nested_for_loops,
     pairwise_python_inner_numpy,
     pairwise_python_broadcast_numpy,
-    pairwise_python_numpy_dot,
 )
diff --git a/pairwise/pairwise_theano.py b/pairwise/pairwise_theano.py
index 367f545..431ef0d 100644
--- a/pairwise/pairwise_theano.py
+++ b/pairwise/pairwise_theano.py
@@ -18,21 +18,10 @@ def pairwise_theano_tensor_prepare(dtype):
     return rval
 
 
-def pairwise_theano_blas_prepare(dtype):
-    X = TT.matrix(dtype=str(dtype))
-    X_norm_2 = (X ** 2).sum(axis=1)
-    dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T))
-    name = 'pairwise_theano_blas_' + dtype
-    rval = theano.function([X],
-                           theano.Out(dists, borrow=True),
-                           allow_input_downcast=True, name=name)
-    rval.__name__ = name
-    return rval
-
-
 benchmarks = (
-    pairwise_theano_tensor_prepare('float32'),
+    # -- disabling float32 to match the precision of the other
+    # implementations (assuming that the benchmark problem is
+    # to carry out computations in double precision).
+    # pairwise_theano_tensor_prepare('float32'),
     pairwise_theano_tensor_prepare('float64'),
-    pairwise_theano_blas_prepare('float32'),
-    pairwise_theano_blas_prepare('float64'),
 )
diff --git a/run_benchmarks.py b/run_benchmarks.py
index 4163fbe..d079a97 100644
--- a/run_benchmarks.py
+++ b/run_benchmarks.py
@@ -7,6 +7,7 @@ from collections import OrderedDict
 except:
     from ordereddict import OrderedDict
 
+import argparse
 import json
 import os
 import traceback