diff --git a/examples/Makefile b/examples/Makefile index 362d37b..67f6085 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,6 +1,8 @@ ESDK=${EPIPHANY_HOME} ELDF=${ESDK}/bsps/current/fast.ldf +CFLAGS=-std=c99 -O3 -ffast-math -funroll-loops + INCLUDES = \ -I../include\ -I${ESDK}/tools/host/include @@ -30,10 +32,10 @@ dirs: hello: bin/host_hello bin/e_hello.elf bin/e_hello.srec bin/host_hello: hello/host_hello.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_hello.elf: hello/e_hello.c - e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_hello.srec: bin/e_hello.elf e-objcopy --srec-forceS3 --output-target srec $< $@ @@ -43,10 +45,10 @@ bin/e_hello.srec: bin/e_hello.elf e_hello: bin/host_e_hello bin/e_e_hello.elf bin/e_e_hello.srec bin/host_e_hello: e_hello/host_e_hello.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_e_hello.elf: e_hello/e_e_hello.c - e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_e_hello.srec: bin/e_e_hello.elf e-objcopy --srec-forceS3 --output-target srec $< $@ @@ -56,10 +58,10 @@ bin/e_e_hello.srec: bin/e_e_hello.elf dot_product: bin/host_dot_product bin/e_dot_product.elf bin/e_dot_product.srec bin/host_dot_product: dot_product/host_dot_product.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_dot_product.elf: dot_product/e_dot_product.c - e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_dot_product.srec: 
bin/e_dot_product.elf e-objcopy --srec-forceS3 --output-target srec $< $@ @@ -69,10 +71,10 @@ bin/e_dot_product.srec: bin/e_dot_product.elf lu_decomposition: bin/host_lu_decomposition bin/e_lu_decomposition.elf bin/e_lu_decomposition.srec bin/host_lu_decomposition: lu_decomposition/host_lu_decomposition.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_lu_decomposition.elf: lu_decomposition/e_lu_decomposition.c - e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf e-objcopy --srec-forceS3 --output-target srec $< $@ @@ -82,10 +84,10 @@ bin/e_lu_decomposition.srec: bin/e_lu_decomposition.elf memtest: bin/host_memtest bin/e_memtest.elf bin/e_memtest.srec bin/host_memtest: memtest/host_memtest.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_memtest.elf: memtest/e_memtest.c - e-gcc -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_memtest.srec: bin/e_memtest.elf e-objcopy --srec-forceS3 --output-target srec $< $@ @@ -95,16 +97,15 @@ bin/e_memtest.srec: bin/e_memtest.elf bspbench: bin/host_bspbench bin/e_bspbench.elf bin/e_bspbench.srec bin/host_bspbench: bspbench/host_bspbench.c - gcc $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) + gcc $(CFLAGS) $(INCLUDES) -o $@ $< $(LIBS) $(HOST_LIBS) $(HOST_LIB_NAMES) bin/e_bspbench.elf: bspbench/e_bspbench.c - e-gcc -O3 -ffast-math -funroll-loops -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) + e-gcc $(CFLAGS) -T ${ELDF} $(INCLUDES) -o $@ $< $(LIBS) $(E_LIBS) $(E_LIB_NAMES) bin/e_bspbench.srec: bin/e_bspbench.elf e-objcopy --srec-forceS3 
--output-target srec $< $@ ######################################################## - clean: rm bin/* diff --git a/examples/lu_decomposition/common.h b/examples/lu_decomposition/common.h index 3dda0ef..358d5ad 100644 --- a/examples/lu_decomposition/common.h +++ b/examples/lu_decomposition/common.h @@ -22,17 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see . */ -#define LOC_M 0x4000 -#define LOC_N 0x4001 -#define LOC_DIM 0x4002 -#define LOC_MATRIX 0x4004 +#define LOC_M 0x4800 +#define LOC_N 0x4804 +#define LOC_DIM 0x4808 +#define LOC_MATRIX 0x480c #define LOC_RS 0x5800 #define LOC_ARK (LOC_RS + sizeof(int) * M) -#define LOC_R (LOC_ARK + sizeof(float)) +#define LOC_R (LOC_ARK + sizeof(float) * M) #define LOC_PI (LOC_R + sizeof(int)) -#define LOC_PI_IN (LOC_PI + sizeof(int) * N) +#define LOC_PI_IN (LOC_PI + sizeof(int) * entries_per_col) #define LOC_ROW_IN (LOC_PI_IN + sizeof(int) * 2) #define LOC_COL_IN (LOC_ROW_IN + sizeof(float) * dim) - -#define LOC_RESULT 0x6000 diff --git a/examples/lu_decomposition/e_lu_decomposition.c b/examples/lu_decomposition/e_lu_decomposition.c index ba5aa0e..40baaad 100644 --- a/examples/lu_decomposition/e_lu_decomposition.c +++ b/examples/lu_decomposition/e_lu_decomposition.c @@ -25,16 +25,17 @@ see the files COPYING and COPYING.LESSER. 
If not, see #include #include "e-lib.h" +#include #include "common.h" int M = 0; int N = 0; +int dim = 0; int s = 0; int t = 0; -int dim = 0; -int entries_per_row = 0; +int entries_per_col = 0; -inline int proc_id(int s, int t) +int proc_id(int s, int t) { return s * M + t; } @@ -47,32 +48,32 @@ int ltg(int* i, int* j, int l) } // "global to local" index -inline int gtl(int i, int j) +int gtl(int i, int j) { // here we assume correct processor return (i / M) * (dim / M) + (j / M); } -inline float* a(int i, int j) { +float* a(int i, int j) { return (float*)LOC_MATRIX + gtl(i, j); } int main() { - int i, j, k; bsp_begin(); int n = bsp_nprocs(); int p = bsp_pid(); - M = (*(char*)LOC_M); - N = (*(char*)LOC_N); - dim = (*(char*)LOC_DIM); - entries_per_row = dim / M; + M = (*(int*)LOC_M); + N = (*(int*)LOC_N); + dim = (*(int*)LOC_DIM); s = p / M; t = p % M; + entries_per_col = dim / N; + // register variable to store r and a_rk // need arrays equal to number of procs in our proc column bsp_push_reg((void*)LOC_RS, sizeof(int) * N); @@ -84,32 +85,44 @@ int main() bsp_push_reg((void*)LOC_R, sizeof(int)); bsp_sync(); - // FIXME: PI is actually distributed as well. 
bsp_push_reg((void*)LOC_PI_IN, sizeof(int)); bsp_sync(); + bsp_push_reg((void*)LOC_ROW_IN, sizeof(int)); + bsp_sync(); + + bsp_push_reg((void*)LOC_COL_IN, sizeof(int)); + bsp_sync(); + // also initialize pi as identity if (t == 0) - for (i = 0; i < N; ++i) - *((int*)LOC_PI + i) = i; + for (int i = 0; i < entries_per_col; ++i) + *((int*)LOC_PI + i) = s + i * N; - for (k = 0; k < dim; ++k) { + for (int k = 0; k < dim; ++k) { //---------------------- // STAGE 1: Pivot search //---------------------- - if (k % M == 0) { + if (k % M == t) { + // COMPUTE PIVOT IN COLUMN K int rs = -1; - float a_rk = -1.0; - for (i = k; i < dim; ++i) { - float a_ik = abs(*a(i, k)); + float a_rk = -1.0f; + + int start_i = (k / N) * N + s; + if (s % N < k % N) + start_i += N; + + for (int i = start_i; i < dim; i += N) { + float a_ik = fabsf(*a(i, k)); if (a_ik > a_rk) { a_rk = a_ik; rs = i; } } - for (j = 0; j < N; ++j) { + // HORIZONTAL COMMUNICATION + for (int j = 0; j < N; ++j) { // put r_s in P(*,t) bsp_hpput(proc_id(j, t), &rs, (void*)LOC_RS, @@ -117,15 +130,19 @@ int main() // put a_(r_s, k) in P(*,t) bsp_hpput(proc_id(j, t), - &rs, (void*)LOC_ARK, + &a_rk, (void*)LOC_ARK, s * sizeof(float), sizeof(float)); } bsp_sync(); // (0) + (1) - a_rk = -1.0; - for (j = 0; j < N; ++j) { - float val = abs(*(((float*)LOC_ARK + j))); + a_rk = -1.0f; + for (int j = 0; j < N; ++j) { + if (*((int*)LOC_RS + j) < 0) + continue; + + float val = fabsf(*(((float*)LOC_ARK + j))); + if (val > a_rk) { a_rk = val; rs = *((int*)LOC_RS + j); @@ -133,7 +150,7 @@ int main() } // put r in P(s, *) - for(j = 0; j < M; ++j) { + for(int j = 0; j < M; ++j) { bsp_hpput(proc_id(s, j), &rs, (void*)LOC_R, 0, sizeof(int)); @@ -150,35 +167,43 @@ int main() // STAGE 2: Index and row swaps // ---------------------------- int r = *((int*)LOC_R); + + ebsp_message("(r, k), (%i, %i)", r, k); + if (k % N == s && t == 0) { bsp_hpput(proc_id(r % N, 0), - ((int*)LOC_PI + k), (void*)LOC_PI_IN, + ((int*)LOC_PI + (k / N)), 
(void*)LOC_PI_IN, 0, sizeof(int)); } + if (r % N == s && t == 0) { + // here offset is set to one in case k % N == r % N bsp_hpput(proc_id(k % N, 0), - ((int*)LOC_PI + r), (void*)LOC_PI_IN, + ((int*)LOC_PI + (r / N)), (void*)LOC_PI_IN, sizeof(int), sizeof(int)); } + bsp_sync(); // (4) - if (k % N == s && t == 0) - *((int*)LOC_PI + k) = *((int*)LOC_PI_IN + 1); + if (k % N == s && t == 0) { + *((int*)LOC_PI + (k / N)) = *((int*)LOC_PI_IN + 1); + } + if (r % N == s && t == 0) - *((int*)LOC_PI + r) = *((int*)LOC_PI_IN); + *((int*)LOC_PI + (r / N)) = *((int*)LOC_PI_IN); if (k % N == s) { // need to swap rows with row r - for (j = t; j < dim; j += M) { + for (int j = t; j < dim; j += M) { bsp_hpput(proc_id(r % N, t), a(k, j), (void*)LOC_ROW_IN, sizeof(float) * (j - t) / M, sizeof(float)); } } - if (r % N == s) { // need to swap rows with row r - for (j = t; j < dim; j += M) { + if (r % N == s) { // need to swap rows with row k + for (int j = t; j < dim; j += M) { bsp_hpput(proc_id(k % N, t), - a(r, j), (void*)LOC_ROW_IN, + a(r, j), (void*)LOC_COL_IN, sizeof(float) * (j - t) / M, sizeof(float)); } } @@ -186,13 +211,13 @@ int main() bsp_sync(); // (5) + (6) if (k % N == s) { - for (j = t; j < dim; j += M) { - (*a(k, j)) = *((float*)LOC_ROW_IN + (j - t)/M); + for (int j = t; j < dim; j += M) { + *a(k, j) = *((float*)LOC_COL_IN + (j - t) / M); } } if (r % N == s) { - for (j = t; j < dim; j += M) { - (*a(r, j)) = *((float*)LOC_ROW_IN + (j - t)/M); + for (int j = t; j < dim; j += M) { + *a(r, j) = *((float*)LOC_ROW_IN + (j - t) / M); } } @@ -203,7 +228,7 @@ int main() // ---------------------- if (k % N == s && k % M == t) { // put a_kk in P(*, t) - for (j = 0; j < N; j += M) { + for (int j = 0; j < N; ++j) { bsp_hpput(proc_id(j, t), a(k, k), (void*)LOC_ROW_IN, 0, sizeof(float)); @@ -212,40 +237,51 @@ int main() bsp_sync(); // (8) + int start_idx = (k / N) * N + s; + if (s % N <= k % N) + start_idx += N; + + int start_jdx = (k / M) * M + t; + if (t % M <= k % M) + start_jdx += 
M; + if (k % M == t) { - for (i = k; i < n && i % N == s; ++i) { - (*a(i, k)) = *a(i,k) / (*((int*)LOC_ROW_IN)); + for (int i = start_idx; i < dim; i += N) { + *a(i, k) = *a(i, k) / (*((float*)LOC_ROW_IN)); } } + // HORIZONTAL COMMUNICATION if (k % M == t) { // put a_ik in P(s, *) - for (i = k; i < n && i % N == s; ++i) { - for (j = 0; j < M; ++j) { - bsp_hpput(proc_id(s, j), + for (int i = start_idx; i < dim; i += N) { + for (int sj = 0; sj < M; ++sj) { + bsp_hpput(proc_id(s, sj), a(i, k), (void*)LOC_COL_IN, sizeof(float) * i, sizeof(float)); } } } + + // VERTICAL COMMUNICATION if (k % N == s) { // put a_ki in P(*, t) - for (i = k; i < n && i % M == t; ++i) { - for (j = 0; j < N; ++j) { - bsp_hpput(proc_id(j, t), - a(k, i), (void*)LOC_ROW_IN, - sizeof(float) * i, sizeof(float)); + for (int j = start_jdx; j < dim; j += M) { + for (int si = 0; si < N; ++si) { + bsp_hpput(proc_id(si, t), + a(k, j), (void*)LOC_ROW_IN, + sizeof(float) * j, sizeof(float)); } } } bsp_sync(); // (9) + (10) - for (i = k; i < n && i % N == s; ++i) { - for (j = k; j < n && j % M == t; ++j) { - int a_ik = *((float*)LOC_COL_IN + i); - int a_kj = *((float*)LOC_ROW_IN + j); - (*a(i, j)) = *a(i, j) - a_ik * a_kj; + for (int i = start_idx; i < dim; i += N) { + for (int j = start_jdx; j < dim; j += M) { + float a_ik = *((float*)LOC_COL_IN + i); + float a_kj = *((float*)LOC_ROW_IN + j); + *a(i, j) = *a(i, j) - a_ik * a_kj; } } } diff --git a/examples/lu_decomposition/host_lu_decomposition.c b/examples/lu_decomposition/host_lu_decomposition.c index 69a82b0..c5dcd39 100644 --- a/examples/lu_decomposition/host_lu_decomposition.c +++ b/examples/lu_decomposition/host_lu_decomposition.c @@ -33,13 +33,13 @@ see the files COPYING and COPYING.LESSER. 
If not, see #define DEBUG // information on matrix and procs -char N = -1; -char M = -1; +int N = -1; +int M = -1; // always choose multiple of 4 such that we dont have to worry // about heterogeneous distributions too much, // which makes a lot of things much easier -char dim = 20; +int dim = 20; // "local to global" index int ltg(int* i, int* j, int l, int s, int t) @@ -61,20 +61,54 @@ int proc_id(int s, int t) return s * M + t; } +// multiply AB = C (all n x n) +// assume matrices are stored row-major (element (i, j) at index n * i + j) +void mat_mult(float* A, float* B, float* C, int n) +{ + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + C[n * i + j] = 0.0f; + for (int k = 0; k < n; ++k) { + C[n * i + j] += A[n * i + k] * B[n * k + j]; + } + } + } +} + +// permute matrix A (n x n) according to the vector pi (n x 1) +// B = P^T(pi) A, i.e. row i of B is row pi[i] of A +void mat_permute(int* pi, float* A, float* B, int n) +{ + // B must not alias A: rows of A are still read after rows of B are written + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + B[n * i + j] = A[n * pi[i] + j]; + } + } +} + +// print a matrix to stdout +void mat_pretty_print(float* A, int n) +{ + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + printf("%.2f\t", A[n * i + j]); + } + printf("\n"); + } +} + int main(int argc, char **argv) { + srand(12345); + // allocate and zero-initialize matrix - float* mat = malloc(sizeof(float) * dim * dim); + float* A = malloc(sizeof(float) * dim * dim); // construct the matrix - int i = 0; - int j = 0; - for(i = 0; i < dim; ++i) { - for(j = 0; j < dim; ++j) { - if(i > j) - mat[dim*i + j] = (float)i / (j+1); - else - mat[dim*i + j] = (float)j / (i+1); + for(int i = 0; i < dim; ++i) { + for(int j = 0; j < dim; ++j) { + A[dim * i + j] = rand() % 5 + 1; } } @@ -97,40 +131,49 @@ int main(int argc, char **argv) default: fprintf(stderr, "Unsupported processor count, please add values\ for N and M in the host program."); - break; + return -1; } printf("LUD: Writing info on procs and matrix \n"); // Write M, N 
and dim to every processor such that they can figure out // the (s,t) pair, and gtl / ltg functions - for(i = 0; i < bsp_nprocs(); ++i) { - co_write(i, &M, (off_t)LOC_M, sizeof(char)); - co_write(i, &N, (off_t)LOC_N, sizeof(char)); - co_write(i, &dim, (off_t)LOC_DIM, sizeof(char)); + for (int i = 0; i < bsp_nprocs(); ++i) { + ebsp_write(i, &M, (off_t)LOC_M, sizeof(int)); + ebsp_write(i, &N, (off_t)LOC_N, sizeof(int)); + ebsp_write(i, &dim, (off_t)LOC_DIM, sizeof(int)); } - int s = 0; - int t = 0; - int l = 0; - for (i = 0; i < dim; ++i) { - for (j = 0; j < dim; ++j) { - gtl(i, j, &l, &s, &t); - co_write(proc_id(s, t), - &mat[dim*i + j], - LOC_MATRIX + sizeof(float) * l, + int prow = 0; + int pcol = 0; + int loc = 0; + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + gtl(i, j, &loc, &prow, &pcol); + ebsp_write(proc_id(prow, pcol), + &A[dim*i + j], + LOC_MATRIX + sizeof(float) * loc, sizeof(float)); } } // test global to local and local to global function for random processor #ifdef DEBUG - s = 2; - t = 3; - printf("i.e. (s,t) = (2,3): \n"); - for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) { + int s = 3; + int t = 3; + printf("e.g. 
(s,t) = (3,3): \n"); + + int read_M, read_N, read_dim; + ebsp_read(proc_id(s, t), (off_t)LOC_M, &read_M, sizeof(int)); + ebsp_read(proc_id(s, t), (off_t)LOC_N, &read_N, sizeof(int)); + ebsp_read(proc_id(s, t), (off_t)LOC_DIM, &read_dim, sizeof(int)); + + printf("M, N, dim: %i, %i, %i\n", read_M, read_N, read_dim); + + for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) { + int i, j; ltg(&i, &j, l, s, t); float val; - co_read(proc_id(s, t), + ebsp_read(proc_id(s, t), LOC_MATRIX + sizeof(float) * l, &val, sizeof(float)); @@ -141,16 +184,16 @@ int main(int argc, char **argv) #endif #ifdef DEBUG - ebsp_inspector_enable(); + //ebsp_inspector_enable(); #endif ebsp_spmd(); printf("----------------------------: \n"); printf("Matrix: \n"); - for (i = 0; i < dim; ++i) { - for (j = 0; j < dim; ++j) { - printf("%.2f ", mat[dim * i + j]); + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + printf("%.2f\t", A[dim * i + j]); } printf("\n"); } @@ -158,28 +201,103 @@ int main(int argc, char **argv) printf("----------------------------: \n"); printf("LU decomposition: \n"); - for (s = 0; s < N; ++s) { - for (t = 0; t < M; ++t) { - for (l = 0; l < (dim * dim) / bsp_nprocs(); ++l) { - ltg(&i, &j, l, s, t); - co_read(proc_id(s, t), - LOC_MATRIX + sizeof(float) * l, - &mat[dim*i + j], sizeof(float)); + float* Y = malloc(sizeof(float) * dim * dim); + + for (int s = 0; s < N; ++s) { + for (int t = 0; t < M; ++t) { + for (int l = 0; l < (dim * dim) / bsp_nprocs(); ++l) { + int i, j; + ltg(&i, &j, l, s, t); + ebsp_read(proc_id(s, t), + LOC_MATRIX + sizeof(float) * l, + &Y[dim*i + j], sizeof(float)); } } } - for (i = 0; i < dim; ++i) { - for (j = 0; j < dim; ++j) { - printf("%.2f ", mat[dim * i + j]); + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + printf("%.2f\t", Y[dim * i + j]); } printf("\n"); } + int* pi = malloc(sizeof(int) * dim); + + printf("PI: \n"); + for(int i = 0; i < dim; ++i) { + ebsp_read(proc_id(i % N, 0), + LOC_PI + sizeof(int) * (i / N), + &pi[i], sizeof(int)); + 
printf("%i\n", pi[i]); + } + printf("----------------------------: \n"); + // we test the results here + float* L = malloc(sizeof(float) * dim * dim); + float* U = malloc(sizeof(float) * dim * dim); + float* B = malloc(sizeof(float) * dim * dim); + + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + L[dim * i + j] = 0.0f; + U[dim * i + j] = 0.0f; + B[dim * i + j] = 0.0f; + } + } + + // obtain L, U from Y + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + if (i == j) { + U[dim * i + j] = Y[dim * i + j]; + L[dim * i + j] = 1.0f; + } + else if (j < i) { + L[dim * i + j] = Y[dim * i + j]; + } else { + U[dim * i + j] = Y[dim * i + j]; + } + } + } + + printf("A ---------------------------- \n"); + mat_pretty_print(A, dim); + + printf("Y ---------------------------- \n"); + mat_pretty_print(Y, dim); + + printf("L ---------------------------- \n"); + mat_pretty_print(L, dim); + + printf("U ---------------------------- \n"); + mat_pretty_print(U, dim); + + printf("PA ---------------------------- \n"); + + // first see what the permuted A looks like + mat_permute(pi, A, B, dim); + mat_pretty_print(B, dim); + + printf("LU ---------------------------- \n"); + + // obtain LU + mat_mult(L, U, B, dim); + mat_pretty_print(B, dim); + + printf("FINISHED ---------------------- \n"); + // finalize bsp_end(); + // free matrices and vectors + free(A); + free(Y); + free(L); + free(U); + free(B); + free(pi); + return 0; } diff --git a/src/host_bsp.c b/src/host_bsp.c index 508ef6b..2bdb6d2 100644 --- a/src/host_bsp.c +++ b/src/host_bsp.c @@ -29,7 +29,7 @@ see the files COPYING and COPYING.LESSER. 
If not, see #include #include #include -// We need to do this in order to use the timers that give wall time + #define __USE_POSIX199309 1 #include extern int clock_nanosleep (clockid_t __clock_id, int __flags, @@ -213,6 +213,7 @@ void ebsp_set_end_callback(void (*cb)()) int ebsp_spmd() { + // Start the program // The program will block on bsp_begin // in state STATE_INIT diff --git a/src/host_bsp_inspector.c b/src/host_bsp_inspector.c index 40180e2..af40efc 100644 --- a/src/host_bsp_inspector.c +++ b/src/host_bsp_inspector.c @@ -22,19 +22,15 @@ see the files COPYING and COPYING.LESSER. If not, see . */ -// # TODO: Features -// [ ] Red for memory that changed from last run -// [ ] actually dump memory from epiphany. -// [ ] Optional: notes on memory regions -// [ ] Fix maximum cores/memory - #include #include #include #include #include + #include +#include typedef enum { @@ -271,8 +267,28 @@ void ebsp_inspector_finalize() { endwin(); } +void ebsp_inspector_finish() +{ + bsp_end(); + + // free memory buffer + free(i_state.buf); + + // close the curses window + endwin(); + + // exit + exit(0); +} + void ebsp_inspector_enable() { + // FIXME terminate with signals properly + signal(SIGINT, ebsp_inspector_finish); + + // TODO redirect stdout + // ... (freopen, dup) + memset(&i_state, 0, sizeof(e_h_viewer_state)); i_state.mem_max = 0x8000;