diff --git a/examples/Makefile b/examples/Makefile
index 362d37b..67f6085 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,6 +1,8 @@
ESDK=${EPIPHANY_HOME}
ELDF=${ESDK}/bsps/current/fast.ldf
+# Optimisation flags; may be replaced on the command line (make CFLAGS=-g)
+CFLAGS=-O3 -ffast-math -funroll-loops
+# The example sources use C99 constructs (e.g. declarations inside for-loops),
+# so -std=c99 is always appended, even when CFLAGS is overridden by the user
+override CFLAGS+=-std=c99
+
INCLUDES = \
-I../include\
-I${ESDK}/tools/host/include
@@ -30,10 +32,10 @@ dirs:
hello: bin/host_hello bin/e_hello.elf bin/e_hello.srec
bin/host_hello: hello/host_hello.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_hello.elf: hello/e_hello.c
- e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_hello.srec: bin/e_hello.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -43,10 +45,10 @@ bin/e_hello.srec: bin/e_hello.elf
e_hello: bin/host_e_hello bin/e_e_hello.elf bin/e_e_hello.srec
bin/host_e_hello: e_hello/host_e_hello.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_e_hello.elf: e_hello/e_e_hello.c
- e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_e_hello.srec: bin/e_e_hello.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -56,10 +58,10 @@ bin/e_e_hello.srec: bin/e_e_hello.elf
dot_product: bin/host_dot_product bin/e_dot_product.elf bin/e_dot_product.srec
bin/host_dot_product: dot_product/host_dot_product.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_dot_product.elf: dot_product/e_dot_product.c
- e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_dot_product.srec: bin/e_dot_product.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -69,10 +71,10 @@ bin/e_dot_product.srec: bin/e_dot_product.elf
lu_decomposition: bin/host_lu_decomposition bin/e_lu_decomposition.elf bin/e_lu_decomposition.srec
bin/host_lu_decomposition: lu_decomposition/host_lu_decomposition.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_lu_decomposition.elf: lu_decomposition/e_lu_decomposition.c
- e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -82,10 +84,10 @@ bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf
memtest: bin/host_memtest bin/e_memtest.elf bin/e_memtest.srec
bin/host_memtest: memtest/host_memtest.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_memtest.elf: memtest/e_memtest.c
- e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_memtest.srec: bin/e_memtest.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
@@ -95,16 +97,15 @@ bin/e_memtest.srec: bin/e_memtest.elf
bspbench: bin/host_bspbench bin/e_bspbench.elf bin/e_bspbench.srec
bin/host_bspbench: bspbench/host_bspbench.c
- gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
+ gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES)
bin/e_bspbench.elf: bspbench/e_bspbench.c
- e-gcc -O3 -ffast-math -funroll-loops -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
+ e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES)
bin/e_bspbench.srec: bin/e_bspbench.elf
e-objcopy --srec-forceS3 --output-target srec $< $@
########################################################
-
clean:
rm bin/*
diff --git a/examples/lu_decomposition/common.h b/examples/lu_decomposition/common.h
index 3dda0ef..358d5ad 100644
--- a/examples/lu_decomposition/common.h
+++ b/examples/lu_decomposition/common.h
@@ -22,17 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see
.
*/
-#define LOC_M 0x4000
-#define LOC_N 0x4001
-#define LOC_DIM 0x4002
-#define LOC_MATRIX 0x4004
+#define LOC_M 0x4800
+#define LOC_N 0x4804
+#define LOC_DIM 0x4808
+#define LOC_MATRIX 0x480c
#define LOC_RS 0x5800
-#define LOC_ARK (LOC_RS + sizeof(int) * M)
+// LOC_RS and LOC_ARK each hold one entry per processor row (N entries,
+// indexed by s), so both regions are sized with N
+#define LOC_ARK (LOC_RS + sizeof(int) * N)
-#define LOC_R (LOC_ARK + sizeof(float))
+#define LOC_R (LOC_ARK + sizeof(float) * N)
#define LOC_PI (LOC_R + sizeof(int))
-#define LOC_PI_IN (LOC_PI + sizeof(int) * N)
+#define LOC_PI_IN (LOC_PI + sizeof(int) * entries_per_col)
#define LOC_ROW_IN (LOC_PI_IN + sizeof(int) * 2)
#define LOC_COL_IN (LOC_ROW_IN + sizeof(float) * dim)
-
-#define LOC_RESULT 0x6000
diff --git a/examples/lu_decomposition/e_lu_decomposition.c b/examples/lu_decomposition/e_lu_decomposition.c
index ba5aa0e..40baaad 100644
--- a/examples/lu_decomposition/e_lu_decomposition.c
+++ b/examples/lu_decomposition/e_lu_decomposition.c
@@ -25,16 +25,17 @@ see the files COPYING and COPYING.LESSER. If not, see
#include
#include "e-lib.h"
+#include
#include "common.h"
int M = 0;
int N = 0;
+int dim = 0;
int s = 0;
int t = 0;
-int dim = 0;
-int entries_per_row = 0;
+int entries_per_col = 0;
-inline int proc_id(int s, int t)
+// Linear processor id of the core at grid position (s, t): s * M + t
+int proc_id(int s, int t)
{
return s * M + t;
}
@@ -47,32 +48,32 @@ int ltg(int* i, int* j, int l)
}
// "global to local" index
+// Returns the local index of global element (i, j), computed as
+// (i / M) * (dim / M) + (j / M); the owning processor is not checked.
+// NOTE(review): the row term divides by M although rows are stepped by N
+// elsewhere in this file -- confirm this is only meant for M == N
-inline int gtl(int i, int j)
+int gtl(int i, int j)
{
// here we assume correct processor
return (i / M) * (dim / M) + (j / M);
}
-inline float* a(int i, int j) {
+// Pointer to the local copy of global element (i, j): the float at
+// index gtl(i, j) in the array starting at LOC_MATRIX
+float* a(int i, int j) {
return (float*)LOC_MATRIX + gtl(i, j);
}
int main()
{
- int i, j, k;
bsp_begin();
int n = bsp_nprocs();
int p = bsp_pid();
- M = (*(char*)LOC_M);
- N = (*(char*)LOC_N);
- dim = (*(char*)LOC_DIM);
- entries_per_row = dim / M;
+ M = (*(int*)LOC_M);
+ N = (*(int*)LOC_N);
+ dim = (*(int*)LOC_DIM);
s = p / M;
t = p % M;
+ entries_per_col = dim / N;
+
// register variable to store r and a_rk
// need arrays equal to number of procs in our proc column
bsp_push_reg((void*)LOC_RS, sizeof(int) * N);
@@ -84,32 +85,44 @@ int main()
bsp_push_reg((void*)LOC_R, sizeof(int));
bsp_sync();
- // FIXME: PI is actually distributed as well.
bsp_push_reg((void*)LOC_PI_IN, sizeof(int));
bsp_sync();
+ bsp_push_reg((void*)LOC_ROW_IN, sizeof(int));
+ bsp_sync();
+
+ bsp_push_reg((void*)LOC_COL_IN, sizeof(int));
+ bsp_sync();
+
// also initialize pi as identity
if (t == 0)
- for (i = 0; i < N; ++i)
- *((int*)LOC_PI + i) = i;
+ for (int i = 0; i < entries_per_col; ++i)
+ *((int*)LOC_PI + i) = s + i * N;
- for (k = 0; k < dim; ++k) {
+ for (int k = 0; k < dim; ++k) {
//----------------------
// STAGE 1: Pivot search
//----------------------
- if (k % M == 0) {
+ if (k % M == t) {
+ // COMPUTE PIVOT IN COLUMN K
int rs = -1;
- float a_rk = -1.0;
- for (i = k; i < dim; ++i) {
- float a_ik = abs(*a(i, k));
+ float a_rk = -1.0f;
+
+ int start_i = (k / N) * N + s;
+ if (s % N < k % N)
+ start_i += N;
+
+ for (int i = start_i; i < dim; i += N) {
+ float a_ik = fabsf(*a(i, k));
if (a_ik > a_rk) {
a_rk = a_ik;
rs = i;
}
}
- for (j = 0; j < N; ++j) {
+ // HORIZONTAL COMMUNICATION
+ for (int j = 0; j < N; ++j) {
// put r_s in P(*,t)
bsp_hpput(proc_id(j, t),
&rs, (void*)LOC_RS,
@@ -117,15 +130,19 @@ int main()
// put a_(r_s, k) in P(*,t)
bsp_hpput(proc_id(j, t),
- &rs, (void*)LOC_ARK,
+ &a_rk, (void*)LOC_ARK,
s * sizeof(float), sizeof(float));
}
bsp_sync(); // (0) + (1)
- a_rk = -1.0;
- for (j = 0; j < N; ++j) {
- float val = abs(*(((float*)LOC_ARK + j)));
+ a_rk = -1.0f;
+ for (int j = 0; j < N; ++j) {
+ if (*((int*)LOC_RS + j) < 0)
+ continue;
+
+ float val = fabsf(*(((float*)LOC_ARK + j)));
+
if (val > a_rk) {
a_rk = val;
rs = *((int*)LOC_RS + j);
@@ -133,7 +150,7 @@ int main()
}
// put r in P(s, *)
- for(j = 0; j < M; ++j) {
+ for(int j = 0; j < M; ++j) {
bsp_hpput(proc_id(s, j),
&rs, (void*)LOC_R,
0, sizeof(int));
@@ -150,35 +167,43 @@ int main()
// STAGE 2: Index and row swaps
// ----------------------------
int r = *((int*)LOC_R);
+
+ ebsp_message("(r, k), (%i, %i)", r, k);
+
if (k % N == s && t == 0) {
bsp_hpput(proc_id(r % N, 0),
- ((int*)LOC_PI + k), (void*)LOC_PI_IN,
+ ((int*)LOC_PI + (k / N)), (void*)LOC_PI_IN,
0, sizeof(int));
}
+
if (r % N == s && t == 0) {
+ // here offset is set to one in case k % N == r % N
bsp_hpput(proc_id(k % N, 0),
- ((int*)LOC_PI + r), (void*)LOC_PI_IN,
+ ((int*)LOC_PI + (r / N)), (void*)LOC_PI_IN,
sizeof(int), sizeof(int));
}
+
bsp_sync(); // (4)
- if (k % N == s && t == 0)
- *((int*)LOC_PI + k) = *((int*)LOC_PI_IN + 1);
+ if (k % N == s && t == 0) {
+ *((int*)LOC_PI + (k / N)) = *((int*)LOC_PI_IN + 1);
+ }
+
if (r % N == s && t == 0)
- *((int*)LOC_PI + r) = *((int*)LOC_PI_IN);
+ *((int*)LOC_PI + (r / N)) = *((int*)LOC_PI_IN);
if (k % N == s) { // need to swap rows with row r
- for (j = t; j < dim; j += M) {
+ for (int j = t; j < dim; j += M) {
bsp_hpput(proc_id(r % N, t),
a(k, j), (void*)LOC_ROW_IN,
sizeof(float) * (j - t) / M, sizeof(float));
}
}
- if (r % N == s) { // need to swap rows with row r
- for (j = t; j < dim; j += M) {
+ if (r % N == s) { // need to swap rows with row k
+ for (int j = t; j < dim; j += M) {
bsp_hpput(proc_id(k % N, t),
- a(r, j), (void*)LOC_ROW_IN,
+ a(r, j), (void*)LOC_COL_IN,
sizeof(float) * (j - t) / M, sizeof(float));
}
}
@@ -186,13 +211,13 @@ int main()
bsp_sync(); // (5) + (6)
if (k % N == s) {
- for (j = t; j < dim; j += M) {
- (*a(k, j)) = *((float*)LOC_ROW_IN + (j - t)/M);
+ for (int j = t; j < dim; j += M) {
+ *a(k, j) = *((float*)LOC_COL_IN + (j - t) / M);
}
}
if (r % N == s) {
- for (j = t; j < dim; j += M) {
- (*a(r, j)) = *((float*)LOC_ROW_IN + (j - t)/M);
+ for (int j = t; j < dim; j += M) {
+ *a(r, j) = *((float*)LOC_ROW_IN + (j - t) / M);
}
}
@@ -203,7 +228,7 @@ int main()
// ----------------------
if (k % N == s && k % M == t) {
// put a_kk in P(*, t)
- for (j = 0; j < N; j += M) {
+ for (int j = 0; j < N; ++j) {
bsp_hpput(proc_id(j, t),
a(k, k), (void*)LOC_ROW_IN,
0, sizeof(float));
@@ -212,40 +237,51 @@ int main()
bsp_sync(); // (8)
+    // smallest row index i > k with i % N == s, i.e. the first row below
+    // row k that this processor row steps through (rows advance by N)
+    int start_idx = (k / N) * N + s;
+    if (s % N <= k % N)
+        start_idx += N;
+
+    // smallest column index j > k with j % M == t; columns advance by M,
+    // so the comparison has to use M on both sides
+    int start_jdx = (k / M) * M + t;
+    if (t % M <= k % M)
+        start_jdx += M;
+
if (k % M == t) {
- for (i = k; i < n && i % N == s; ++i) {
- (*a(i, k)) = *a(i,k) / (*((int*)LOC_ROW_IN));
+ for (int i = start_idx; i < dim; i += N) {
+ *a(i, k) = *a(i, k) / (*((float*)LOC_ROW_IN));
}
}
+ // HORIZONTAL COMMUNICATION
if (k % M == t) {
// put a_ik in P(s, *)
- for (i = k; i < n && i % N == s; ++i) {
- for (j = 0; j < M; ++j) {
- bsp_hpput(proc_id(s, j),
+ for (int i = start_idx; i < dim; i += N) {
+ for (int sj = 0; sj < M; ++sj) {
+ bsp_hpput(proc_id(s, sj),
a(i, k), (void*)LOC_COL_IN,
sizeof(float) * i, sizeof(float));
}
}
}
+
+ // VERTICAL COMMUNICATION
if (k % N == s) {
// put a_ki in P(*, t)
- for (i = k; i < n && i % M == t; ++i) {
- for (j = 0; j < N; ++j) {
- bsp_hpput(proc_id(j, t),
- a(k, i), (void*)LOC_ROW_IN,
- sizeof(float) * i, sizeof(float));
+ for (int j = start_jdx; j < dim; j += M) {
+ for (int si = 0; si < N; ++si) {
+ bsp_hpput(proc_id(si, t),
+ a(k, j), (void*)LOC_ROW_IN,
+ sizeof(float) * j, sizeof(float));
}
}
}
bsp_sync(); // (9) + (10)
- for (i = k; i < n && i % N == s; ++i) {
- for (j = k; j < n && j % M == t; ++j) {
- int a_ik = *((float*)LOC_COL_IN + i);
- int a_kj = *((float*)LOC_ROW_IN + j);
- (*a(i, j)) = *a(i, j) - a_ik * a_kj;
+ for (int i = start_idx; i < dim; i += N) {
+ for (int j = start_jdx; j < dim; j += M) {
+ float a_ik = *((float*)LOC_COL_IN + i);
+ float a_kj = *((float*)LOC_ROW_IN + j);
+ *a(i, j) = *a(i, j) - a_ik * a_kj;
}
}
}
diff --git a/examples/lu_decomposition/host_lu_decomposition.c b/examples/lu_decomposition/host_lu_decomposition.c
index 69a82b0..c5dcd39 100644
--- a/examples/lu_decomposition/host_lu_decomposition.c
+++ b/examples/lu_decomposition/host_lu_decomposition.c
@@ -33,13 +33,13 @@ see the files COPYING and COPYING.LESSER. If not, see
#define DEBUG
// information on matrix and procs
-char N = -1;
-char M = -1;
+int N = -1;
+int M = -1;
// always choose multiple of 4 such that we dont have to worry
// about heterogeneous distributions too much,
// which makes a lot of things much easier
-char dim = 20;
+int dim = 20;
// "local to global" index
int ltg(int* i, int* j, int l, int s, int t)
@@ -61,20 +61,54 @@ int proc_id(int s, int t)
return s * M + t;
}
+// multiply AB = C (all n x n)
+// assume matrices are stored row-major: element (i, j) is at index n * i + j
+// C is overwritten while A and B are read, so C must not alias A or B
+void mat_mult(float* A, float* B, float* C, int n)
+{
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ C[n * i + j] = 0.0f;
+ for (int k = 0; k < n; ++k) {
+ C[n * i + j] += A[n * i + k] * B[n * k + j];
+ }
+ }
+ }
+}
+
+// permute the rows of matrix A (n x n) according to the vector pi (n x 1):
+// row i of B becomes row pi[i] of A (element (i, j) is at index n * i + j)
+// B is written while A is read, so B must not alias A
+void mat_permute(int* pi, float* A, float* B, int n)
+{
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            B[n * i + j] = A[n * pi[i] + j];
+        }
+    }
+}
+
+// print the n x n matrix A to stdout, one row per line, entries separated
+// by tabs and shown with two decimals
+void mat_pretty_print(float* A, int n)
+{
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            // stride by the parameter n, not the global dim
+            printf("%.2f\t", A[n * i + j]);
+        }
+        printf("\n");
+    }
+}
+
int main(int argc, char **argv)
{
+ srand(12345);
+
// allocate and zero-initialize matrix
- float* mat = malloc(sizeof(float) * dim * dim);
+ float* A = malloc(sizeof(float) * dim * dim);
// construct the matrix
- int i = 0;
- int j = 0;
- for(i = 0; i < dim; ++i) {
- for(j = 0; j < dim; ++j) {
- if(i > j)
- mat[dim*i + j] = (float)i / (j+1);
- else
- mat[dim*i + j] = (float)j / (i+1);
+ for(int i = 0; i < dim; ++i) {
+ for(int j = 0; j < dim; ++j) {
+ A[dim * i + j] = rand() % 5 + 1;
}
}
@@ -97,40 +131,49 @@ int main(int argc, char **argv)
default:
fprintf(stderr, "Unsupported processor count, please add values\
for N and M in the host program.");
- break;
+ return -1;
}
printf("LUD: Writing info on procs and matrix \n");
// Write M, N and dim to every processor such that they can figure out
// the (s,t) pair, and gtl / ltg functions
- for(i = 0; i < bsp_nprocs(); ++i) {
- co_write(i, &M, (off_t)LOC_M, sizeof(char));
- co_write(i, &N, (off_t)LOC_N, sizeof(char));
- co_write(i, &dim, (off_t)LOC_DIM, sizeof(char));
+ for (int i = 0; i < bsp_nprocs(); ++i) {
+ ebsp_write(i, &M, (off_t)LOC_M, sizeof(int));
+ ebsp_write(i, &N, (off_t)LOC_N, sizeof(int));
+ ebsp_write(i, &dim, (off_t)LOC_DIM, sizeof(int));
}
- int s = 0;
- int t = 0;
- int l = 0;
- for (i = 0; i < dim; ++i) {
- for (j = 0; j < dim; ++j) {
- gtl(i, j, &l, &s, &t);
- co_write(proc_id(s, t),
- &mat[dim*i + j],
- LOC_MATRIX + sizeof(float) * l,
+ int prow = 0;
+ int pcol = 0;
+ int loc = 0;
+ for (int i = 0; i < dim; ++i) {
+ for (int j = 0; j < dim; ++j) {
+ gtl(i, j, &loc, &prow, &pcol);
+ ebsp_write(proc_id(prow, pcol),
+ &A[dim*i + j],
+ LOC_MATRIX + sizeof(float) * loc,
sizeof(float));
}
}
// test global to local and local to global function for random processor
#ifdef DEBUG
- s = 2;
- t = 3;
- printf("i.e. (s,t) = (2,3): \n");
- for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+ int s = 3;
+ int t = 3;
+ printf("e.g. (s,t) = (3,3): \n");
+
+ int _M, _N, _dim;
+ ebsp_read(proc_id(s, t), (off_t)LOC_M, &_M, sizeof(int));
+ ebsp_read(proc_id(s, t), (off_t)LOC_N, &_N, sizeof(int));
+ ebsp_read(proc_id(s, t), (off_t)LOC_DIM, &_dim, sizeof(int));
+
+ printf("M, N, dim: %i, %i, %i\n", _M, _N, _dim);
+
+ for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+ int i, j;
ltg(&i, &j, l, s, t);
float val;
- co_read(proc_id(s, t),
+ ebsp_read(proc_id(s, t),
LOC_MATRIX + sizeof(float) * l,
&val,
sizeof(float));
@@ -141,16 +184,16 @@ int main(int argc, char **argv)
#endif
#ifdef DEBUG
- ebsp_inspector_enable();
+ //ebsp_inspector_enable();
#endif
ebsp_spmd();
printf("----------------------------: \n");
printf("Matrix: \n");
- for (i = 0; i < dim; ++i) {
- for (j = 0; j < dim; ++j) {
- printf("%.2f ", mat[dim * i + j]);
+ for (int i = 0; i < dim; ++i) {
+ for (int j = 0; j < dim; ++j) {
+ printf("%.2f\t", A[dim * i + j]);
}
printf("\n");
}
@@ -158,28 +201,102 @@ int main(int argc, char **argv)
printf("----------------------------: \n");
printf("LU decomposition: \n");
- for (s = 0; s < N; ++s) {
- for (t = 0; t < M; ++t) {
- for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
- ltg(&i, &j, l, s, t);
- co_read(proc_id(s, t),
- LOC_MATRIX + sizeof(float) * l,
- &mat[dim*i + j], sizeof(float));
+ float* Y = malloc(sizeof(float) * dim * dim);
+
+ for (int s = 0; s < N; ++s) {
+ for (int t = 0; t < M; ++t) {
+ for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) {
+ int i, j;
+ ltg(&i, &j, l, s, t);
+ ebsp_read(proc_id(s, t),
+ LOC_MATRIX + sizeof(float) * l,
+ &Y[dim*i + j], sizeof(float));
}
}
}
- for (i = 0; i < dim; ++i) {
- for (j = 0; j < dim; ++j) {
- printf("%.2f ", mat[dim * i + j]);
+ for (int i = 0; i < dim; ++i) {
+ for (int j = 0; j < dim; ++j) {
+ printf("%.2f\t", Y[dim * i + j]);
}
printf("\n");
}
+ int* pi = malloc(sizeof(int) * dim);
+
+ printf("PI: \n");
+ for(int i = 0; i < dim; ++i) {
+ ebsp_read(proc_id(i % N, 0),
+ LOC_PI + sizeof(int) * (i / N),
+ &pi[i], sizeof(int));
+ printf("%i\n", pi[i]);
+ }
+
printf("----------------------------: \n");
+ // we test the results here
+ float* L = malloc(sizeof(float) * dim * dim);
+ float* U = malloc(sizeof(float) * dim * dim);
+ float* B = malloc(sizeof(float) * dim * dim);
+
+ for (int i = 0; i < dim; ++i) {
+ for (int j = 0; j < dim; ++j) {
+ L[dim * i + j] = 0.0f;
+ U[dim * i + j] = 0.0f;
+ B[dim * i + j] = 0.0f;
+ }
+ }
+
+ // obtain L, U from Y
+ for (int i = 0; i < dim; ++i) {
+ for (int j = 0; j < dim; ++j) {
+ if (i == j) {
+ U[dim * i + j] = Y[dim * i + j];
+ L[dim * i + j] = 1.0f;
+ }
+ else if (j < i) {
+ L[dim * i + j] = Y[dim * i + j];
+ } else {
+ U[dim * i + j] = Y[dim * i + j];
+ }
+ }
+ }
+
+ printf("A ---------------------------- \n");
+ mat_pretty_print(A, dim);
+
+ printf("Y ---------------------------- \n");
+ mat_pretty_print(Y, dim);
+
+ printf("L ---------------------------- \n");
+ mat_pretty_print(L, dim);
+
+ printf("U ---------------------------- \n");
+ mat_pretty_print(U, dim);
+
+ printf("PA ---------------------------- \n");
+
+ // first see what the permuted A looks like
+ mat_permute(pi, A, B, dim);
+ mat_pretty_print(B, dim);
+
+ printf("LU ---------------------------- \n");
+
+ // obtain LU
+ mat_mult(L, U, B, dim);
+ mat_pretty_print(B, dim);
+
+ printf("FINISHED ---------------------- \n");
+
// finalize
bsp_end();
+    // free matrices and vectors (Y holds the result read back from the cores)
+    free(A);
+    free(Y);
+    free(L);
+    free(U);
+    free(B);
+    free(pi);
+
return 0;
}
diff --git a/src/host_bsp.c b/src/host_bsp.c
index 508ef6b..2bdb6d2 100644
--- a/src/host_bsp.c
+++ b/src/host_bsp.c
@@ -29,7 +29,7 @@ see the files COPYING and COPYING.LESSER. If not, see
#include
#include
#include
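-// We need to do this in order to use the timers that give wall time
+
+// This define is needed in order to use the timers that give wall time
+// (see the clock_nanosleep declaration below)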
#define __USE_POSIX199309 1
#include
extern int clock_nanosleep (clockid_t __clock_id, int __flags,
@@ -213,6 +213,7 @@ void ebsp_set_end_callback(void (*cb)())
int ebsp_spmd()
{
+
// Start the program
// The program will block on bsp_begin
// in state STATE_INIT
diff --git a/src/host_bsp_inspector.c b/src/host_bsp_inspector.c
index 40180e2..af40efc 100644
--- a/src/host_bsp_inspector.c
+++ b/src/host_bsp_inspector.c
@@ -22,19 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see
.
*/
-// # TODO: Features
-// [ ] Red for memory that changed from last run
-// [ ] actually dump memory from epiphany.
-// [ ] Optional: notes on memory regions
-// [ ] Fix maximum cores/memory
-
#include
#include
#include
#include
#include
+
#include
+#include
typedef enum
{
@@ -271,8 +267,28 @@ void ebsp_inspector_finalize() {
endwin();
}
+// Shut the inspector down and terminate the program: calls bsp_end(),
+// frees the inspector memory buffer, closes the curses window and exits
+// with status 0.
+void ebsp_inspector_finish()
+{
+    bsp_end();
+
+    // free memory buffer
+    free(i_state.buf);
+
+    // close the curses window
+    endwin();
+
+    // exit
+    exit(0);
+}
+
+// SIGINT handler. signal() requires a handler that takes the signal number,
+// so this adapts ebsp_inspector_finish(), which takes no arguments.
+static void ebsp_inspector_on_sigint(int sig)
+{
+    (void)sig;
+    ebsp_inspector_finish();
+}
+
void ebsp_inspector_enable()
{
+    // FIXME terminate with signals properly
+    // (the teardown path is not restricted to async-signal-safe calls)
+    signal(SIGINT, ebsp_inspector_on_sigint);
+
+ // TODO redirect stdout
+ // ... (freopen, dup)
+
memset(&i_state, 0, sizeof(e_h_viewer_state));
i_state.mem_max = 0x8000;