diff --git a/examples/Makefile b/examples/Makefile
index 362d37b..67f6085 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,6 +1,8 @@
 ESDK=${EPIPHANY_HOME}
 ELDF=${ESDK}/bsps/current/fast.ldf
 
+CFLAGS=-std=c99 -O3 -ffast-math -funroll-loops
+
 INCLUDES = \
 		   -I../include\
 		   -I${ESDK}/tools/host/include
@@ -30,10 +32,10 @@ dirs:
 hello: bin/host_hello bin/e_hello.elf bin/e_hello.srec
 
 bin/host_hello: hello/host_hello.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_hello.elf: hello/e_hello.c
-	e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_hello.srec: bin/e_hello.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -43,10 +45,10 @@ bin/e_hello.srec: bin/e_hello.elf
 e_hello: bin/host_e_hello bin/e_e_hello.elf bin/e_e_hello.srec
 
 bin/host_e_hello: e_hello/host_e_hello.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_e_hello.elf: e_hello/e_e_hello.c
-	e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_e_hello.srec: bin/e_e_hello.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -56,10 +58,10 @@ bin/e_e_hello.srec: bin/e_e_hello.elf
 dot_product: bin/host_dot_product bin/e_dot_product.elf bin/e_dot_product.srec
 
 bin/host_dot_product: dot_product/host_dot_product.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_dot_product.elf: dot_product/e_dot_product.c
-	e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_dot_product.srec: bin/e_dot_product.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -69,10 +71,10 @@ bin/e_dot_product.srec: bin/e_dot_product.elf
 lu_decomposition: bin/host_lu_decomposition bin/e_lu_decomposition.elf bin/e_lu_decomposition.srec
 
 bin/host_lu_decomposition: lu_decomposition/host_lu_decomposition.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_lu_decomposition.elf: lu_decomposition/e_lu_decomposition.c
-	e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -82,10 +84,10 @@ bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf
 memtest: bin/host_memtest bin/e_memtest.elf bin/e_memtest.srec
 
 bin/host_memtest: memtest/host_memtest.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_memtest.elf: memtest/e_memtest.c
-	e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_memtest.srec: bin/e_memtest.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -95,16 +97,15 @@ bin/e_memtest.srec: bin/e_memtest.elf
 bspbench: bin/host_bspbench bin/e_bspbench.elf bin/e_bspbench.srec
 
 bin/host_bspbench: bspbench/host_bspbench.c
-	gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+	gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
 	
 bin/e_bspbench.elf: bspbench/e_bspbench.c
-	e-gcc -O3 -ffast-math -funroll-loops -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+	e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
 
 bin/e_bspbench.srec: bin/e_bspbench.elf
 	e-objcopy --srec-forceS3 --output-target srec $< $@
 
 ########################################################
 
-
 clean:
 	rm bin/*
diff --git a/examples/lu_decomposition/common.h b/examples/lu_decomposition/common.h
index 3dda0ef..358d5ad 100644
--- a/examples/lu_decomposition/common.h
+++ b/examples/lu_decomposition/common.h
@@ -22,17 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see
 <http://www.gnu.org/licenses/>.
 */
 
-#define LOC_M 0x4000
-#define LOC_N 0x4001
-#define LOC_DIM 0x4002
-#define LOC_MATRIX 0x4004
+#define LOC_M 0x4800
+#define LOC_N 0x4804
+#define LOC_DIM 0x4808
+#define LOC_MATRIX 0x480c
 
 #define LOC_RS 0x5800
 #define LOC_ARK (LOC_RS + sizeof(int) * M)
-#define LOC_R (LOC_ARK + sizeof(float))
+#define LOC_R (LOC_ARK + sizeof(float) * M)
 #define LOC_PI (LOC_R + sizeof(int))
-#define LOC_PI_IN (LOC_PI + sizeof(int) * N)
+#define LOC_PI_IN (LOC_PI + sizeof(int) * entries_per_col)
 #define LOC_ROW_IN (LOC_PI_IN + sizeof(int) * 2)
 #define LOC_COL_IN (LOC_ROW_IN + sizeof(float) * dim)
-
-#define LOC_RESULT 0x6000
diff --git a/examples/lu_decomposition/e_lu_decomposition.c b/examples/lu_decomposition/e_lu_decomposition.c
index ba5aa0e..40baaad 100644
--- a/examples/lu_decomposition/e_lu_decomposition.c
+++ b/examples/lu_decomposition/e_lu_decomposition.c
@@ -25,16 +25,17 @@ see the files COPYING and COPYING.LESSER. If not, see
 #include <e_bsp.h>
 #include "e-lib.h"
 
+#include <math.h>
 #include "common.h"
 
 int M = 0;
 int N = 0;
+int dim = 0;
 int s = 0;
 int t = 0;
-int dim = 0;
-int entries_per_row = 0;
+int entries_per_col = 0;
 
-inline int proc_id(int s, int t)
+int proc_id(int s, int t)
 {
     return s * M + t;
 }
@@ -47,32 +48,32 @@ int ltg(int* i, int* j, int l)
 }
 
 // "global to local" index
-inline int gtl(int i, int j)
+int gtl(int i, int j)
 {
     // here we assume correct processor
     return (i / M) * (dim / M) + (j / M);
 }
 
-inline float* a(int i, int j) {
+float* a(int i, int j) {
     return (float*)LOC_MATRIX + gtl(i, j);
 }
 
 int main()
 {
-    int i, j, k;
     bsp_begin();
 
     int n = bsp_nprocs(); 
     int p = bsp_pid();
 
-    M = (*(char*)LOC_M);
-    N = (*(char*)LOC_N);
-    dim = (*(char*)LOC_DIM);
-    entries_per_row = dim / M;
+    M = (*(int*)LOC_M);
+    N = (*(int*)LOC_N);
+    dim = (*(int*)LOC_DIM);
 
     s = p / M;
     t = p % M;
 
+    entries_per_col = dim / N;
+
     // register variable to store r and a_rk
     // need arrays equal to number of procs in our proc column
     bsp_push_reg((void*)LOC_RS, sizeof(int) * N);
@@ -84,32 +85,44 @@ int main()
     bsp_push_reg((void*)LOC_R, sizeof(int));
     bsp_sync();
 
-    // FIXME: PI is actually distributed as well.
     bsp_push_reg((void*)LOC_PI_IN, sizeof(int));
     bsp_sync();
 
+    bsp_push_reg((void*)LOC_ROW_IN, sizeof(int));
+    bsp_sync();
+
+    bsp_push_reg((void*)LOC_COL_IN, sizeof(int));
+    bsp_sync();
+
     // also initialize pi as identity
     if (t == 0)
-        for (i = 0; i < N; ++i)
-            *((int*)LOC_PI + i) = i;
+        for (int i = 0; i < entries_per_col; ++i)
+            *((int*)LOC_PI + i) = s + i * N;
 
-    for (k = 0; k < dim; ++k) {
+    for (int k = 0; k < dim; ++k) {
 
         //----------------------
         // STAGE 1: Pivot search
         //----------------------
-        if (k % M == 0) {
+        if (k % M == t) {
+            // COMPUTE PIVOT IN COLUMN K
             int rs = -1;
-            float a_rk = -1.0;
-            for (i = k; i < dim; ++i) {
-                float a_ik = abs(*a(i, k));
+            float a_rk = -1.0f;
+
+            int start_i = (k / N) * N + s;
+            if (s % N < k % N)
+                start_i += N;
+
+            for (int i = start_i; i < dim; i += N) {
+                float a_ik = fabsf(*a(i, k));
                 if (a_ik > a_rk) {
                     a_rk = a_ik;
                     rs = i;
                 }
             }
 
-            for (j = 0; j < N; ++j) {
+            // HORIZONTAL COMMUNICATION
+            for (int j = 0; j < N; ++j) {
                 // put r_s in P(*,t)
                 bsp_hpput(proc_id(j, t),
                          &rs, (void*)LOC_RS,
@@ -117,15 +130,19 @@ int main()
 
                 // put a_(r_s, k) in P(*,t)
                 bsp_hpput(proc_id(j, t),
-                         &rs, (void*)LOC_ARK,
+                         &a_rk, (void*)LOC_ARK,
                          s * sizeof(float), sizeof(float));
             }
 
             bsp_sync(); // (0) + (1)
 
-            a_rk = -1.0;
-            for (j = 0; j < N; ++j) {
-                float val = abs(*(((float*)LOC_ARK + j)));
+            a_rk = -1.0f;
+            for (int j = 0; j < N; ++j) {
+                if (*((int*)LOC_RS + j) < 0)
+                    continue;
+
+                float val = fabsf(*(((float*)LOC_ARK + j)));
+
                 if (val > a_rk) {
                     a_rk = val;
                     rs = *((int*)LOC_RS + j);
@@ -133,7 +150,7 @@ int main()
             }
 
             // put r in P(s, *)
-            for(j = 0; j < M; ++j) {
+            for(int j = 0; j < M; ++j) {
                 bsp_hpput(proc_id(s, j),
                         &rs, (void*)LOC_R,
                         0, sizeof(int));
@@ -150,35 +167,43 @@ int main()
         // STAGE 2: Index and row swaps
         // ----------------------------
         int r = *((int*)LOC_R);
+
+        ebsp_message("(r, k), (%i, %i)", r, k);
+
         if (k % N == s && t == 0) {
             bsp_hpput(proc_id(r % N, 0),
-                    ((int*)LOC_PI + k), (void*)LOC_PI_IN,
+                    ((int*)LOC_PI + (k / N)), (void*)LOC_PI_IN,
                     0, sizeof(int));
         }
+
         if (r % N == s && t == 0) {
+            // here offset is set to one in case k % N == r % N
             bsp_hpput(proc_id(k % N, 0),
-                    ((int*)LOC_PI + r), (void*)LOC_PI_IN,
+                    ((int*)LOC_PI + (r / N)), (void*)LOC_PI_IN,
                     sizeof(int), sizeof(int));
         }
+
         bsp_sync(); // (4)
 
-        if (k % N == s && t == 0)
-            *((int*)LOC_PI + k) = *((int*)LOC_PI_IN + 1);
+        if (k % N == s && t == 0) {
+            *((int*)LOC_PI + (k / N)) = *((int*)LOC_PI_IN + 1);
+        }
+
         if (r % N == s && t == 0)
-            *((int*)LOC_PI + r) = *((int*)LOC_PI_IN);
+            *((int*)LOC_PI + (r / N)) = *((int*)LOC_PI_IN);
 
         if (k % N == s) { // need to swap rows with row r
-            for (j = t; j < dim; j += M) {
+            for (int j = t; j < dim; j += M) {
                  bsp_hpput(proc_id(r % N, t),
                         a(k, j), (void*)LOC_ROW_IN,
                         sizeof(float) * (j - t) / M, sizeof(float));
             }
         }
 
-        if (r % N == s) { // need to swap rows with row r
-            for (j = t; j < dim; j += M) {
+        if (r % N == s) { // need to swap rows with row k
+            for (int j = t; j < dim; j += M) {
                  bsp_hpput(proc_id(k % N, t),
-                        a(r, j), (void*)LOC_ROW_IN,
+                        a(r, j), (void*)LOC_COL_IN,
                         sizeof(float) * (j - t) / M, sizeof(float));
             }
         }
@@ -186,13 +211,13 @@ int main()
         bsp_sync(); // (5) + (6)
 
         if (k % N == s) {
-            for (j = t; j < dim; j += M) {
-                (*a(k, j)) = *((float*)LOC_ROW_IN + (j - t)/M);
+            for (int j = t; j < dim; j += M) {
+                *a(k, j) = *((float*)LOC_COL_IN + (j - t) / M);
             }
         }
         if (r % N == s) {
-            for (j = t; j < dim; j += M) {
-                (*a(r, j)) = *((float*)LOC_ROW_IN + (j - t)/M);
+            for (int j = t; j < dim; j += M) {
+                *a(r, j) = *((float*)LOC_ROW_IN + (j - t) / M);
             }
         }
 
@@ -203,7 +228,7 @@ int main()
         // ----------------------
         if (k % N == s && k % M == t) {
             // put a_kk in P(*, t)
-            for (j = 0; j < N; j += M) {
+            for (int j = 0; j < N; ++j) {
                  bsp_hpput(proc_id(j, t),
                         a(k, k), (void*)LOC_ROW_IN,
                         0, sizeof(float));
@@ -212,40 +237,51 @@ int main()
 
         bsp_sync(); // (8)
 
+        int start_idx = (k / N) * N + s; // smallest row i > k with i % N == s
+        if (s % N <= k % N)
+            start_idx += N;
+
+        int start_jdx = (k / M) * M + t; // smallest column j > k with j % M == t
+        if (t % M <= k % M)
+            start_jdx += M;
+
         if (k % M == t) {
-            for (i = k; i < n && i % N == s; ++i) {
-                (*a(i, k)) = *a(i,k) / (*((int*)LOC_ROW_IN));
+            for (int i = start_idx; i < dim; i += N) {
+                *a(i, k) = *a(i, k) / (*((float*)LOC_ROW_IN));
             }
         }
 
+        // HORIZONTAL COMMUNICATION
         if (k % M == t) {
             // put a_ik in P(s, *)
-            for (i = k; i < n && i % N == s; ++i) {
-                for (j = 0; j < M; ++j) {
-                    bsp_hpput(proc_id(s, j),
+            for (int i = start_idx; i < dim; i += N) {
+                for (int sj = 0; sj < M; ++sj) {
+                    bsp_hpput(proc_id(s, sj),
                             a(i, k), (void*)LOC_COL_IN,
                             sizeof(float) * i, sizeof(float));
                 }
             }
         }
+
+        // VERTICAL COMMUNICATION
         if (k % N == s) {
             // put a_ki in P(*, t)
-            for (i = k; i < n && i % M == t; ++i) {
-                for (j = 0; j < N; ++j) {
-                    bsp_hpput(proc_id(j, t),
-                            a(k, i), (void*)LOC_ROW_IN,
-                            sizeof(float) * i, sizeof(float));
+            for (int j = start_jdx; j < dim; j += M) {
+                for (int si = 0; si < N; ++si) {
+                    bsp_hpput(proc_id(si, t),
+                            a(k, j), (void*)LOC_ROW_IN,
+                            sizeof(float) * j, sizeof(float));
                 }
             }
         }
 
         bsp_sync(); // (9) + (10)
 
-        for (i = k; i < n && i % N == s; ++i) {
-            for (j = k; j < n && j % M == t; ++j) {
-                int a_ik = *((float*)LOC_COL_IN + i);
-                int a_kj = *((float*)LOC_ROW_IN + j);
-                (*a(i, j)) = *a(i, j) - a_ik * a_kj;
+        for (int i = start_idx; i < dim; i += N) {
+            for (int j = start_jdx; j < dim; j += M) {
+                float a_ik = *((float*)LOC_COL_IN + i);
+                float a_kj = *((float*)LOC_ROW_IN + j);
+                *a(i, j) = *a(i, j) - a_ik * a_kj;
             }
         }
     }
diff --git a/examples/lu_decomposition/host_lu_decomposition.c b/examples/lu_decomposition/host_lu_decomposition.c
index 69a82b0..c5dcd39 100644
--- a/examples/lu_decomposition/host_lu_decomposition.c
+++ b/examples/lu_decomposition/host_lu_decomposition.c
@@ -33,13 +33,13 @@ see the files COPYING and COPYING.LESSER. If not, see
 #define DEBUG
 
 // information on matrix and procs
-char N = -1;
-char M = -1;
+int N = -1;
+int M = -1;
 
 // always choose multiple of 4 such that we dont have to worry
 // about heterogeneous distributions too much,
 // which makes a lot of things much easier
-char dim = 20;
+int dim = 20;
 
 // "local to global" index
 int ltg(int* i, int* j, int l, int s, int t)
@@ -61,20 +61,54 @@ int proc_id(int s, int t)
     return s * M + t;
 }
 
+// multiply AB = C (all n x n)
+// matrices are row-major: element (i, j) is at [n * i + j]; C must not alias A or B
+void mat_mult(float* A, float* B, float* C, int n)
+{
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            C[n * i + j] = 0.0f;
+            for (int k = 0; k < n; ++k) {
+                C[n * i + j] += A[n * i + k] * B[n * k + j];
+            }
+        }
+    }
+}
+
+// permute the rows of A (n x n) according to the vector pi (n x 1):
+// row i of B is row pi[i] of A
+void mat_permute(int* pi, float* A, float* B, int n)
+{
+    // rows are gathered out of place, so B must not alias A; no scratch row is needed
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            B[n * i + j] = A[n * pi[i] + j];
+        }
+    }
+}
+
+// print the n x n matrix A (row stride n) to stdout, one tab-separated row per line
+void mat_pretty_print(float* A, int n)
+{
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            printf("%.2f\t", A[n * i + j]);
+        }
+        printf("\n");
+    }
+}
+
 int main(int argc, char **argv)
 {
+    srand(12345);
+
     // allocate and zero-initialize matrix
-    float* mat = malloc(sizeof(float) * dim * dim);
+    float* A = malloc(sizeof(float) * dim * dim);
 
     // construct the matrix
-    int i = 0; 
-    int j = 0;
-    for(i = 0; i < dim; ++i) {
-        for(j = 0; j < dim; ++j) {
-            if(i > j) 
-                mat[dim*i + j] = (float)i / (j+1);
-            else 
-                mat[dim*i + j] = (float)j / (i+1);
+    for(int i = 0; i < dim; ++i) {
+        for(int j = 0; j < dim; ++j) {
+            A[dim * i + j] = rand() % 5 + 1;
         }
     }
 
@@ -97,40 +131,49 @@ int main(int argc, char **argv)
         default:
             fprintf(stderr, "Unsupported processor count, please add values\
                     for N and M in the host program.");
-            break;
+            return -1;
     }
 
     printf("LUD: Writing info on procs and matrix \n");
     // Write M, N and dim to every processor such that they can figure out 
     // the (s,t) pair, and gtl / ltg functions
-    for(i = 0; i < bsp_nprocs(); ++i) {
-        co_write(i, &M, (off_t)LOC_M, sizeof(char));
-        co_write(i, &N, (off_t)LOC_N, sizeof(char));
-        co_write(i, &dim, (off_t)LOC_DIM, sizeof(char));
+    for (int i = 0; i < bsp_nprocs(); ++i) {
+        ebsp_write(i, &M, (off_t)LOC_M, sizeof(int));
+        ebsp_write(i, &N, (off_t)LOC_N, sizeof(int));
+        ebsp_write(i, &dim, (off_t)LOC_DIM, sizeof(int));
     }
 
-    int s = 0;
-    int t = 0;
-    int l = 0;
-    for (i = 0; i < dim; ++i) {
-        for (j = 0; j < dim; ++j) {
-            gtl(i, j, &l, &s, &t);
-            co_write(proc_id(s, t),
-                    &mat[dim*i + j],
-                    LOC_MATRIX + sizeof(float) * l,
+    int prow = 0;
+    int pcol = 0;
+    int loc = 0;
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) {
+            gtl(i, j, &loc, &prow, &pcol);
+            ebsp_write(proc_id(prow, pcol),
+                    &A[dim*i + j],
+                    LOC_MATRIX + sizeof(float) * loc,
                     sizeof(float));
         }
     }
 
     // test global to local and local to global function for random processor
 #ifdef DEBUG
-    s = 2;
-    t = 3;
-    printf("i.e. (s,t) = (2,3): \n");
-    for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+    int s = 3;
+    int t = 3;
+    printf("e.g. (s,t) = (3,3): \n");
+
+    int _M, _N, _dim;
+    ebsp_read(proc_id(s, t), (off_t)LOC_M, &_M, sizeof(int));
+    ebsp_read(proc_id(s, t), (off_t)LOC_N, &_N, sizeof(int));
+    ebsp_read(proc_id(s, t), (off_t)LOC_DIM, &_dim, sizeof(int));
+
+    printf("M, N, dim: %i, %i, %i\n", _M, _N, _dim);
+
+    for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+            int i, j;
             ltg(&i, &j, l, s, t);
             float val;
-            co_read(proc_id(s, t),
+            ebsp_read(proc_id(s, t),
                     LOC_MATRIX + sizeof(float) * l,
                     &val,
                     sizeof(float));
@@ -141,16 +184,16 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef DEBUG
-    ebsp_inspector_enable();
+    //ebsp_inspector_enable();
 #endif
 
     ebsp_spmd();
 
     printf("----------------------------: \n");
     printf("Matrix: \n");
-    for (i = 0; i < dim; ++i) {
-        for (j = 0; j < dim; ++j) {
-            printf("%.2f ", mat[dim * i + j]);
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) {
+            printf("%.2f\t", A[dim * i + j]);
         }
         printf("\n");
     }
@@ -158,28 +201,102 @@ int main(int argc, char **argv)
     printf("----------------------------: \n");
     printf("LU decomposition: \n");
 
-    for (s = 0; s < N; ++s) {
-        for (t = 0; t < M; ++t) {
-            for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
-                    ltg(&i, &j, l, s, t);
-                    co_read(proc_id(s, t),
-                            LOC_MATRIX + sizeof(float) * l,
-                            &mat[dim*i + j], sizeof(float));
+    float* Y = malloc(sizeof(float) * dim * dim);
+
+    for (int s = 0; s < N; ++s) {
+        for (int t = 0; t < M; ++t) {
+            for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+                int i, j;
+                ltg(&i, &j, l, s, t);
+                ebsp_read(proc_id(s, t),
+                        LOC_MATRIX + sizeof(float) * l,
+                        &Y[dim*i + j], sizeof(float));
             }
         }
     }
 
-    for (i = 0; i < dim; ++i) {
-        for (j = 0; j < dim; ++j) {
-            printf("%.2f ", mat[dim * i + j]);
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) {
+            printf("%.2f\t", Y[dim * i + j]);
         }
         printf("\n");
     }
 
+    int* pi = malloc(sizeof(int) * dim);
+
+    printf("PI: \n");
+    for(int i = 0; i < dim; ++i) {
+        ebsp_read(proc_id(i % N, 0),
+                LOC_PI + sizeof(int) * (i / N),
+                &pi[i], sizeof(int));
+        printf("%i\n", pi[i]);
+    }
+
     printf("----------------------------: \n");
 
+    // we test the results here
+    float* L = malloc(sizeof(float) * dim * dim);
+    float* U = malloc(sizeof(float) * dim * dim);
+    float* B = malloc(sizeof(float) * dim * dim);
+
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) { 
+            L[dim * i + j] = 0.0f;
+            U[dim * i + j] = 0.0f;
+            B[dim * i + j] = 0.0f;
+        }
+    }
+
+    // obtain L, U from Y
+    for (int i = 0; i < dim; ++i) {
+        for (int j = 0; j < dim; ++j) {
+            if (i == j) {
+                U[dim * i + j] = Y[dim * i + j];
+                L[dim * i + j] = 1.0f;
+            }
+            else if (j < i) {
+                L[dim * i + j] = Y[dim * i + j];
+            } else {
+                U[dim * i + j] = Y[dim * i + j];
+            }
+        }
+    }
+
+    printf("A ---------------------------- \n");
+    mat_pretty_print(A, dim);
+
+    printf("Y ---------------------------- \n");
+    mat_pretty_print(Y, dim);
+
+    printf("L ---------------------------- \n");
+    mat_pretty_print(L, dim);
+
+    printf("U ---------------------------- \n");
+    mat_pretty_print(U, dim);
+
+    printf("PA ---------------------------- \n");
+
+    // first see what the permuted A looks like
+    mat_permute(pi, A, B, dim);
+    mat_pretty_print(B, dim);
+
+    printf("LU ---------------------------- \n");
+
+    // obtain LU
+    mat_mult(L, U, B, dim);
+    mat_pretty_print(B, dim);
+
+    printf("FINISHED ---------------------- \n");
+
     // finalize
     bsp_end();
 
+    // free every matrix and vector allocated above, including the read-back result Y
+    free(A);
+    free(Y);
+    free(L);
+    free(U);
+    free(B);
+    free(pi);
     return 0;
 }
diff --git a/src/host_bsp.c b/src/host_bsp.c
index 508ef6b..2bdb6d2 100644
--- a/src/host_bsp.c
+++ b/src/host_bsp.c
@@ -29,7 +29,7 @@ see the files COPYING and COPYING.LESSER. If not, see
 #include <string.h>
 #include <stdlib.h>
 #include <stddef.h>
-// We need to do this in order to use the timers that give wall time
+
 #define __USE_POSIX199309 1
 #include <time.h>
 extern int clock_nanosleep (clockid_t __clock_id, int __flags,
@@ -213,6 +213,7 @@ void ebsp_set_end_callback(void (*cb)())
 
 int ebsp_spmd()
 {   
+   
     // Start the program
     // The program will block on bsp_begin
     // in state STATE_INIT
diff --git a/src/host_bsp_inspector.c b/src/host_bsp_inspector.c
index 40180e2..af40efc 100644
--- a/src/host_bsp_inspector.c
+++ b/src/host_bsp_inspector.c
@@ -22,19 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see
 <http://www.gnu.org/licenses/>.
 */
 
-// # TODO: Features
-// [ ] Red for memory that changed from last run
-// [ ] actually dump memory from epiphany.
-// [ ] Optional: notes on memory regions
-// [ ] Fix maximum cores/memory
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <e-hal.h>
 #include <host_bsp.h>
+
 #include <ncurses.h>
+#include <signal.h>
 
 typedef enum
 {
@@ -271,8 +267,28 @@ void ebsp_inspector_finalize() {
     endwin();
 }
 
+void ebsp_inspector_finish() // NOTE(review): passed to signal(), which expects void (*)(int) - confirm the signature
+{
+    bsp_end();
+
+    // free the inspector's memory buffer
+    free(i_state.buf);
+ 
+    // close the curses window
+    endwin();
+
+    // terminate the process; NOTE(review): when run as the SIGINT handler, free() and exit() are not async-signal-safe - confirm
+    exit(0);
+}
+
 void ebsp_inspector_enable()
 {
+    // FIXME terminate with signals properly
+    signal(SIGINT, ebsp_inspector_finish);
+
+    // TODO redirect stdout
+    // ... (freopen, dup)
+
     memset(&i_state, 0, sizeof(e_h_viewer_state));
 
     i_state.mem_max = 0x8000;