diff --git a/configure.ac b/configure.ac index e05fb6b6c..8a911a4fd 100644 --- a/configure.ac +++ b/configure.ac @@ -245,8 +245,6 @@ AC_ARG_WITH(dso, [ --with-dso use DSO linking], with_dso=$withval, with_dso=no) AC_ARG_WITH(opt, [ --with-opt use opt for essentials], with_opt=$withval, with_opt=no) -AC_ARG_WITH(openmp, - [ --with-openmp use OMP directives], with_openmp=$withval, with_openmp=no) AC_ARG_WITH(std, [ --with-std use this -std=], with_std=$withval, with_std=c99) diff --git a/src/kernel/misc/mdbench.c b/src/kernel/misc/mdbench.c index 072c4cbe3..a7e9824ae 100644 --- a/src/kernel/misc/mdbench.c +++ b/src/kernel/misc/mdbench.c @@ -7,13 +7,8 @@ */ -//#define USE_OMP - #include #include -#ifdef USE_OMP -#include -#endif string defv[] = { "dim=10,20,30,40\n Dimensions of array A[dim1][dim2][dim3][dim4]....", @@ -22,12 +17,8 @@ string defv[] = { "flip=f\n Reverse traversal through array, for benchmarking", "iter=1\n Number of times to do the work, for benchmarking", - "free=f\n Free things we don't need anymore", -#ifdef USE_OMP - "nprocs=1\n Number of processors", -#else + "free=f\n Free things we don't need anymore" "nprocs=-1\n No OMP enabled", -#endif "VERSION=1.2\n 11-feb-2024 PJT", NULL, }; @@ -294,13 +285,8 @@ void nemo_main() mdarray6 x6; mdarray7 x7; -#ifdef USE_OMP - if (nprocs < 0) nprocs = omp_get_max_threads(); - dprintf(0,"Using OMP with nprocs=%d (or use OMP_NUM_THREADS)\n",nprocs); -#else dprintf(0,"Using single CPU, no OMP enables\n"); if (nprocs>1) warning("No OMP was enabled"); -#endif /* C99 now does it the way I wanted it to work */ dprintf(1,"pointer test3: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",test3,test3[0],&test3[0][0],&test3[0][1],test3[1],&test3[1][0]); @@ -339,10 +325,6 @@ void nemo_main() // 80^4 * 10 -> 3.832" // 90^4 * 1 -> 0.63 // 100^4 * 1 -> 1.00 -#ifdef USE_OMP - #pragma omp parallel shared(x4,ntest,dim) private(i,i4,i3,i2,i1) - #pragma omp for -#endif for (i=0; i1) { y3 = allocate_mdarray3(dim[3],dim[1],dim[0]); -#ifdef USE_OMP - #pragma omp parallel shared(x4,y3,ntest,dim) private(i,i4,i3,i2,i1,sum) - #pragma omp for -#endif for (i=0; i bench9.log 2>&1 - grep cputime bench9.log | sed s/###// | tabplot - 6 8 line=1,1 ycoord=0 ymin=0 - @echo "Jansky 0.077 k2 0.090 0.098" diff --git a/src/tutor/mp/README b/src/tutor/mp/README deleted file mode 100644 index 3cd2f995f..000000000 --- a/src/tutor/mp/README +++ /dev/null @@ -1,34 +0,0 @@ - -Here are some simple OPENMP examples. -If compiled with openmp, NEMO will show this when debug=1 is used: - ### nemo Debug Info: omp_get_max_threads() -> 8 [OMP_NUM_THREADS] -otherwise it would not show this. - -Some examples of usage: - -sections --------- - This program uses the "#pragma sections" as well as for - It also seems to shows that multiple sections cannot split up the for loop, - - -OMP_NUM_THREADS=4 sections iter=10000 -27.81user 0.00system 0:13.89elapsed 200%CPU (0avgtext+0avgdata 18172maxresident)k - -No openmp: -25.02user 0.00system 0:25.02elapsed 99%CPU (0avgtext+0avgdata 17824maxresident)k - -cool pages: - -https://zingale.github.io/phy504/openmp-relax.html - - -To measure the performance an example of the output from /usr/bin/time on program with N threads -can be processed with the amdahl.py script. Example is the heated plate, which is pretty good: - -# on an i5-1135G7 ./heated_plate_openmp -1 9.21user 0.00system 0:09.21elapsed 99%CPU 1.00 1.00 1.00 -2 9.20user 0.00system 0:04.60elapsed 199%CPU 1.00 1.00 1.00 -4 10.49user 0.00system 0:02.62elapsed 399%CPU 1.14 1.14 0.95 -8 15.05user 0.02system 0:01.89elapsed 797%CPU 1.63 1.64 0.91 -16 23.20user 5.29system 0:05.82elapsed 489%CPU 2.52 10.11 0.39 diff --git a/src/tutor/mp/heated_plate_openmp.c b/src/tutor/mp/heated_plate_openmp.c deleted file mode 100644 index 7d0026d0d..000000000 --- a/src/tutor/mp/heated_plate_openmp.c +++ /dev/null @@ -1,500 +0,0 @@ -/* - * This is an example from Burkardt's OPENMP examples (great resource!) - * with the original "main" as well as the NEMO-fied nemo_main() - * - * https://people.sc.fsu.edu/~jburkardt/c_src/heated_plate_openmp/heated_plate_openmp.c - * - */ - - -#include -#include - -# include -# include -# include -# include - - -string defv[] = { - "n=500\n Number of pixels in X", - "m=500\n Number of pixels in Y", - "eps=0.001\n Accuracy", - "nprocs=-1\n Override number of procs used", - "old=f\n Use the old executable", - "VERSION=1\n 21-dec-2019 PJT", - NULL, -}; - - -int old_main (void); -int new_main (int n, int m, real eps, int nprocs); - -void nemo_main() -{ - bool Qold = getbparam("old"); - int n = getiparam("n"); - int m = getiparam("m"); - real eps = getrparam("eps"); - int nprocs = getiparam("nprocs"); - - if (Qold) { - old_main(); - } else { - new_main(n,m,eps,nprocs); - } -} - - -/******************************************************************************/ - -int old_main (void) - -/******************************************************************************/ -/* - Purpose: - - MAIN is the main program for HEATED_PLATE_OPENMP. - - Discussion: - - This code solves the steady state heat equation on a rectangular region. - - The sequential version of this program needs approximately - 18/epsilon iterations to complete. - - - The physical region, and the boundary conditions, are suggested - by this diagram; - - W = 0 - +------------------+ - | | - W = 100 | | W = 100 - | | - +------------------+ - W = 100 - - The region is covered with a grid of M by N nodes, and an N by N - array W is used to record the temperature. The correspondence between - array indices and locations in the region is suggested by giving the - indices of the four corners: - - I = 0 - [0][0]-------------[0][N-1] - | | - J = 0 | | J = N-1 - | | - [M-1][0]-----------[M-1][N-1] - I = M-1 - - The steady state solution to the discrete heat equation satisfies the - following condition at an interior grid point: - - W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] ) - - where "Central" is the index of the grid point, "North" is the index - of its immediate neighbor to the "north", and so on. - - Given an approximate solution of the steady state heat equation, a - "better" solution is given by replacing each interior point by the - average of its 4 neighbors - in other words, by using the condition - as an ASSIGNMENT statement: - - W[Central] <= (1/4) * ( W[North] + W[South] + W[East] + W[West] ) - - If this process is repeated often enough, the difference between successive - estimates of the solution will go to zero. - - This program carries out such an iteration, using a tolerance specified by - the user, and writes the final estimate of the solution to a file that can - be used for graphic processing. - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 18 October 2011 - - Author: - - Original C version by Michael Quinn. - This C version by John Burkardt. - - Reference: - - Michael Quinn, - Parallel Programming in C with MPI and OpenMP, - McGraw-Hill, 2004, - ISBN13: 978-0071232654, - LC: QA76.73.C15.Q55. - - Local parameters: - - Local, double DIFF, the norm of the change in the solution from one iteration - to the next. - - Local, double MEAN, the average of the boundary values, used to initialize - the values of the solution in the interior. - - Local, double U[M][N], the solution at the previous iteration. - - Local, double W[M][N], the solution computed at the latest iteration. -*/ -{ -# define M 500 -# define N 500 - - double diff; - double epsilon = 0.001; - int i; - int iterations; - int iterations_print; - int j; - double mean; - double my_diff; - double u[M][N]; - double w[M][N]; - double wtime; - - printf ( "\n" ); - printf ( "HEATED_PLATE_OPENMP\n" ); - printf ( " C/OpenMP version\n" ); - printf ( " A program to solve for the steady state temperature distribution\n" ); - printf ( " over a rectangular plate.\n" ); - printf ( "\n" ); - printf ( " Spatial grid of %d by %d points.\n", M, N ); - printf ( " The iteration will be repeated until the change is <= %e\n", epsilon ); - printf ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); - printf ( " Number of threads = %d\n", omp_get_max_threads ( ) ); -/* - Set the boundary values, which don't change. -*/ - mean = 0.0; - -#pragma omp parallel shared ( w ) private ( i, j ) - { -#pragma omp for - for ( i = 1; i < M - 1; i++ ) - { - w[i][0] = 100.0; - } -#pragma omp for - for ( i = 1; i < M - 1; i++ ) - { - w[i][N-1] = 100.0; - } -#pragma omp for - for ( j = 0; j < N; j++ ) - { - w[M-1][j] = 100.0; - } -#pragma omp for - for ( j = 0; j < N; j++ ) - { - w[0][j] = 0.0; - } -/* - Average the boundary values, to come up with a reasonable - initial value for the interior. -*/ -#pragma omp for reduction ( + : mean ) - for ( i = 1; i < M - 1; i++ ) - { - mean = mean + w[i][0] + w[i][N-1]; - } -#pragma omp for reduction ( + : mean ) - for ( j = 0; j < N; j++ ) - { - mean = mean + w[M-1][j] + w[0][j]; - } - } -/* - OpenMP note: - You cannot normalize MEAN inside the parallel region. It - only gets its correct value once you leave the parallel region. - So we interrupt the parallel region, set MEAN, and go back in. -*/ - mean = mean / ( double ) ( 2 * M + 2 * N - 4 ); - printf ( "\n" ); - printf ( " MEAN = %f\n", mean ); -/* - Initialize the interior solution to the mean value. -*/ -#pragma omp parallel shared ( mean, w ) private ( i, j ) - { -#pragma omp for - for ( i = 1; i < M - 1; i++ ) - { - for ( j = 1; j < N - 1; j++ ) - { - w[i][j] = mean; - } - } - } -/* - iterate until the new solution W differs from the old solution U - by no more than EPSILON. -*/ - iterations = 0; - iterations_print = 1; - printf ( "\n" ); - printf ( " Iteration Change\n" ); - printf ( "\n" ); - wtime = omp_get_wtime ( ); - - diff = epsilon; - - while ( epsilon <= diff ) - { -# pragma omp parallel shared ( u, w ) private ( i, j ) - { -/* - Save the old solution in U. -*/ -# pragma omp for - for ( i = 0; i < M; i++ ) - { - for ( j = 0; j < N; j++ ) - { - u[i][j] = w[i][j]; - } - } -/* - Determine the new estimate of the solution at the interior points. - The new solution W is the average of north, south, east and west neighbors. -*/ -# pragma omp for - for ( i = 1; i < M - 1; i++ ) - { - for ( j = 1; j < N - 1; j++ ) - { - w[i][j] = ( u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1] ) / 4.0; - } - } - } -/* - C and C++ cannot compute a maximum as a reduction operation. - - Therefore, we define a private variable MY_DIFF for each thread. - Once they have all computed their values, we use a CRITICAL section - to update DIFF. -*/ - diff = 0.0; -# pragma omp parallel shared ( diff, u, w ) private ( i, j, my_diff ) - { - my_diff = 0.0; -# pragma omp for - for ( i = 1; i < M - 1; i++ ) - { - for ( j = 1; j < N - 1; j++ ) - { - if ( my_diff < fabs ( w[i][j] - u[i][j] ) ) - { - my_diff = fabs ( w[i][j] - u[i][j] ); - } - } - } -# pragma omp critical - { - if ( diff < my_diff ) - { - diff = my_diff; - } - } - } - - iterations++; - if ( iterations == iterations_print ) - { - printf ( " %8d %f\n", iterations, diff ); - iterations_print = 2 * iterations_print; - } - } - wtime = omp_get_wtime ( ) - wtime; - - printf ( "\n" ); - printf ( " %8d %f\n", iterations, diff ); - printf ( "\n" ); - printf ( " Error tolerance achieved.\n" ); - printf ( " Wallclock time = %f\n", wtime ); -/* - Terminate. -*/ - printf ( "\n" ); - printf ( "HEATED_PLATE_OPENMP:\n" ); - printf ( " Normal end of execution.\n" ); - - return 0; - -# undef M -# undef N -} - - - - - - -int new_main (int N, int M, real eps, int nprocs) -{ - double diff; - double epsilon = eps; - int i; - int iterations; - int iterations_print; - int j; - double mean; - double my_diff; - double wtime; - mdarray2 u = allocate_mdarray2(M,N); //double u[M][N]; - mdarray2 w = allocate_mdarray2(M,N); //double w[M][N]; - - if (nprocs > 0) omp_set_num_threads(nprocs); - - printf ( "\n" ); - printf ( "HEATED_PLATE_OPENMP\n" ); - printf ( " C/OpenMP version\n" ); - printf ( " A program to solve for the steady state temperature distribution\n" ); - printf ( " over a rectangular plate.\n" ); - printf ( "\n" ); - printf ( " Spatial grid of %d by %d points.\n", M, N ); - printf ( " The iteration will be repeated until the change is <= %e\n", epsilon ); - printf ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); - printf ( " Number of threads = %d\n", omp_get_max_threads ( ) ); - - /* - Set the boundary values, which don't change. - */ - mean = 0.0; - - #pragma omp parallel shared ( w ) private ( i, j ) - { - #pragma omp for - for ( i = 1; i < M - 1; i++ ) - w[i][0] = 100.0; - - #pragma omp for - for ( i = 1; i < M - 1; i++ ) - w[i][N-1] = 100.0; - - #pragma omp for - for ( j = 0; j < N; j++ ) - w[M-1][j] = 100.0; - - #pragma omp for - for ( j = 0; j < N; j++ ) - w[0][j] = 0.0; - /* - Average the boundary values, to come up with a reasonable - initial value for the interior. - */ - #pragma omp for reduction ( + : mean ) - for ( i = 1; i < M - 1; i++ ) - mean = mean + w[i][0] + w[i][N-1]; - - #pragma omp for reduction ( + : mean ) - for ( j = 0; j < N; j++ ) - mean = mean + w[M-1][j] + w[0][j]; - } // pragma omp parallel shared ( w ) private ( i, j ) - - /* - OpenMP note: - You cannot normalize MEAN inside the parallel region. It - only gets its correct value once you leave the parallel region. - So we interrupt the parallel region, set MEAN, and go back in. - */ - mean = mean / ( double ) ( 2 * M + 2 * N - 4 ); - printf ( "\n" ); - printf ( " MEAN = %f\n", mean ); - /* - Initialize the interior solution to the mean value. - */ - #pragma omp parallel shared ( mean, w ) private ( i, j ) - { - #pragma omp for - for ( i = 1; i < M - 1; i++ ) - for ( j = 1; j < N - 1; j++ ) - w[i][j] = mean; - } - /* - iterate until the new solution W differs from the old solution U - by no more than EPSILON. - */ - iterations = 0; - iterations_print = 1; - printf ( "\n" ); - printf ( " Iteration Change\n" ); - printf ( "\n" ); - wtime = omp_get_wtime ( ); - - diff = epsilon; - - while ( epsilon <= diff ) { - # pragma omp parallel shared ( u, w ) private ( i, j ) - { - /* - Save the old solution in U. - */ - # pragma omp for - for ( i = 0; i < M; i++ ) - for ( j = 0; j < N; j++ ) - u[i][j] = w[i][j]; - /* - Determine the new estimate of the solution at the interior points. - The new solution W is the average of north, south, east and west neighbors. - */ - # pragma omp for - for ( i = 1; i < M - 1; i++ ) - for ( j = 1; j < N - 1; j++ ) - w[i][j] = ( u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1] ) / 4.0; - } // pragma - - /* - C and C++ cannot compute a maximum as a reduction operation. - - Therefore, we define a private variable MY_DIFF for each thread. - Once they have all computed their values, we use a CRITICAL section - to update DIFF. - */ - diff = 0.0; - # pragma omp parallel shared ( diff, u, w ) private ( i, j, my_diff ) - { - my_diff = 0.0; - # pragma omp for - for ( i = 1; i < M - 1; i++ ) - for ( j = 1; j < N - 1; j++ ) - if ( my_diff < fabs ( w[i][j] - u[i][j] ) ) - my_diff = fabs ( w[i][j] - u[i][j] ); - - # pragma omp critical - { - if ( diff < my_diff ) - diff = my_diff; - } - } // pragma - - iterations++; - if ( iterations == iterations_print ) { - printf ( " %8d %f\n", iterations, diff ); - iterations_print = 2 * iterations_print; - } - } // while - wtime = omp_get_wtime ( ) - wtime; - - printf ( "\n" ); - printf ( " %8d %f\n", iterations, diff ); - printf ( "\n" ); - printf ( " Error tolerance achieved.\n" ); - printf ( " Wallclock time = %f\n", wtime ); - - - printf ( "\n" ); - printf ( "HEATED_PLATE_OPENMP:\n" ); - printf ( " Normal end of execution.\n" ); - - return 0; -} - diff --git a/src/tutor/mp/hellomp.c b/src/tutor/mp/hellomp.c deleted file mode 100644 index 66d6ec1c4..000000000 --- a/src/tutor/mp/hellomp.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Simple Example OMP usage - */ - -#include - -#include -#include -#include - -int main (void) -{ - int nthreads, my_id; - int i, imax = 1e8,j,jmax; - double x=0; - -#ifdef _OPENMP - printf("Compiled by an OpenMP-compliant implementation.\n"); - - #pragma omp parallel shared(x) - { - nthreads = omp_get_num_threads(); - my_id = omp_get_thread_num(); - for (i=0;i -# include -# include -# include -# include -# include - -double L = 1.0; /* linear size of square region */ -int N = 32; /* number of interior points per dim */ - -double *u, *u_new; /* linear arrays to hold solution */ - -/* macro to index into a 2-D (N+2)x(N+2) array */ -#define INDEX(i,j) ((N+2)*(i)+(j)) - -int my_rank; /* rank of this process */ - -int *proc; /* process indexed by vertex */ -int *i_min, *i_max; /* min, max vertex indices of processes */ -int *left_proc, *right_proc; /* processes to left and right */ - -/* - Functions: -*/ -int main ( int argc, char *argv[] ); -void allocate_arrays ( ); -void jacobi ( int num_procs, double f[] ); -void make_domains ( int num_procs ); -double *make_source ( ); -void timestamp ( ); - -/******************************************************************************/ - -int main ( int argc, char *argv[] ) - -/******************************************************************************/ -/* - Purpose: - - MAIN is the main program for POISSON_MPI. - - Discussion: - - This program solves Poisson's equation in a 2D region. - - The Jacobi iterative method is used to solve the linear system. - - MPI is used for parallel execution, with the domain divided - into strips. - - Modified: - - 22 September 2013 - - Local parameters: - - Local, double F[(N+2)x(N+2)], the source term. - - Local, int N, the number of interior vertices in one dimension. - - Local, int NUM_PROCS, the number of MPI processes. - - Local, double U[(N+2)*(N+2)], a solution estimate. - - Local, double U_NEW[(N+2)*(N+2)], a solution estimate. -*/ -{ - double change; - double epsilon = 1.0E-03; - double *f; - char file_name[100]; - int i; - int j; - double my_change; - int my_n; - int n; - int num_procs; - int step; - double *swap; - double wall_time; -/* - MPI initialization. -*/ - MPI_Init ( &argc, &argv ); - - MPI_Comm_size ( MPI_COMM_WORLD, &num_procs ); - - MPI_Comm_rank ( MPI_COMM_WORLD, &my_rank ); -/* - Read commandline arguments, if present. -*/ - if ( 1 < argc ) - { - sscanf ( argv[1], "%d", &N ); - } - else - { - N = 32; - } - - if ( 2 < argc ) - { - sscanf ( argv[2], "%lf", &epsilon ); - } - else - { - epsilon = 1.0E-03; - } - if ( 3 < argc ) - { - strcpy ( file_name, argv[3] ); - } - else - { - strcpy ( file_name, "poisson_mpi.out" ); - } -/* - Print out initial information. -*/ - if ( my_rank == 0 ) - { - timestamp ( ); - printf ( "\n" ); - printf ( "POISSON_MPI:\n" ); - printf ( " C version (NEMO)\n" ); - printf ( " 2-D Poisson equation using Jacobi algorithm\n" ); - printf ( " ===========================================\n" ); - printf ( " MPI version: 1-D domains, non-blocking send/receive\n" ); - printf ( " Number of processes = %d\n", num_procs ); - printf ( " Number of interior vertices = %d\n", N ); - printf ( " Desired fractional accuracy = %f\n", epsilon ); - printf ( "\n" ); - } - - allocate_arrays ( ); - f = make_source ( ); - make_domains ( num_procs ); - - step = 0; -/* - Begin timing. -*/ - wall_time = MPI_Wtime ( ); -/* - Begin iteration. -*/ - do - { - jacobi ( num_procs, f ); - ++step; -/* - Estimate the error -*/ - change = 0.0; - n = 0; - - my_change = 0.0; - my_n = 0; - - for ( i = i_min[my_rank]; i <= i_max[my_rank]; i++ ) - { - for ( j = 1; j <= N; j++ ) - { - if ( u_new[INDEX(i,j)] != 0.0 ) - { - my_change = my_change - + fabs ( 1.0 - u[INDEX(i,j)] / u_new[INDEX(i,j)] ); - - my_n = my_n + 1; - } - } - } - MPI_Allreduce ( &my_change, &change, 1, MPI_DOUBLE, MPI_SUM, - MPI_COMM_WORLD ); - - MPI_Allreduce ( &my_n, &n, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); - - if ( n != 0 ) - { - change = change / n; - } - if ( my_rank == 0 && ( step % 10 ) == 0 ) - { - printf ( " N = %d, n = %d, my_n = %d, Step %4d Error = %g\n", - N, n, my_n, step, change ); - } -/* - Interchange U and U_NEW. -*/ - swap = u; - u = u_new; - u_new = swap; - } while ( epsilon < change ); - -/* - Here is where you can copy the solution to process 0 - and print to a file. -*/ - -/* - Report on wallclock time. -*/ - wall_time = MPI_Wtime() - wall_time; - if ( my_rank == 0 ) - { - printf ( "\n" ); - printf ( " Wall clock time = %f secs\n", wall_time ); - } -/* - Terminate MPI. -*/ - MPI_Finalize ( ); -/* - Free memory. -*/ - free ( f ); -/* - Terminate. -*/ - if ( my_rank == 0 ) - { - printf ( "\n" ); - printf ( "POISSON_MPI:\n" ); - printf ( " Number of processes = %d\n", num_procs ); - printf ( " Number of interior vertices = %d\n", N ); - printf ( " Normal end of execution.\n" ); - printf ( "\n" ); - timestamp ( ); - } - - return 0; -} -/******************************************************************************/ - -void allocate_arrays ( ) - -/******************************************************************************/ -/* - Purpose: - - ALLOCATE_ARRAYS creates and zeros out the arrays U and U_NEW. - - Modified: - - 10 September 2013 -*/ -{ - int i; - int ndof; - - ndof = ( N + 2 ) * ( N + 2 ); - - u = ( double * ) malloc ( ndof * sizeof ( double ) ); - for ( i = 0; i < ndof; i++) - { - u[i] = 0.0; - } - - u_new = ( double * ) malloc ( ndof * sizeof ( double ) ); - for ( i = 0; i < ndof; i++ ) - { - u_new[i] = 0.0; - } - - return; -} -/******************************************************************************/ - -void jacobi ( int num_procs, double f[] ) - -/******************************************************************************/ -/* - Purpose: - - JACOBI carries out the Jacobi iteration for the linear system. - - Modified: - - 16 September 2013 - - Parameters: - - Input, int NUM_PROCS, the number of processes. - - Input, double F[(N+2)*(N+2)], the right hand side of the linear system. -*/ -{ - double h; - int i; - int j; - MPI_Request request[4]; - int requests; - MPI_Status status[4]; -/* - H is the lattice spacing. -*/ - h = L / ( double ) ( N + 1 ); -/* - Update ghost layers using non-blocking send/receive -*/ - requests = 0; - - if ( left_proc[my_rank] >= 0 && left_proc[my_rank] < num_procs ) - { - MPI_Irecv ( u + INDEX(i_min[my_rank] - 1, 1), N, MPI_DOUBLE, - left_proc[my_rank], 0, MPI_COMM_WORLD, - request + requests++ ); - - MPI_Isend ( u + INDEX(i_min[my_rank], 1), N, MPI_DOUBLE, - left_proc[my_rank], 1, MPI_COMM_WORLD, - request + requests++ ); - } - - if ( right_proc[my_rank] >= 0 && right_proc[my_rank] < num_procs ) - { - MPI_Irecv ( u + INDEX(i_max[my_rank] + 1, 1), N, MPI_DOUBLE, - right_proc[my_rank], 1, MPI_COMM_WORLD, - request + requests++ ); - - MPI_Isend ( u + INDEX(i_max[my_rank], 1), N, MPI_DOUBLE, - right_proc[my_rank], 0, MPI_COMM_WORLD, - request + requests++ ); - } -/* - Jacobi update for internal vertices in my domain. -*/ - for ( i = i_min[my_rank] + 1; i <= i_max[my_rank] - 1; i++ ) - { - for ( j = 1; j <= N; j++ ) - { - u_new[INDEX(i,j)] = - 0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] + - u[INDEX(i,j-1)] + u[INDEX(i,j+1)] + - h * h * f[INDEX(i,j)] ); - } - } -/* - Wait for all non-blocking communications to complete. -*/ - MPI_Waitall ( requests, request, status ); -/* - Jacobi update for boundary vertices in my domain. -*/ - i = i_min[my_rank]; - for ( j = 1; j <= N; j++ ) - { - u_new[INDEX(i,j)] = - 0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] + - u[INDEX(i,j-1)] + u[INDEX(i,j+1)] + - h * h * f[INDEX(i,j)] ); - } - - i = i_max[my_rank]; - if (i != i_min[my_rank]) - { - for (j = 1; j <= N; j++) - { - u_new[INDEX(i,j)] = - 0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] + - u[INDEX(i,j-1)] + u[INDEX(i,j+1)] + - h * h * f[INDEX(i,j)] ); - } - } - - return; -} -/******************************************************************************/ - -void make_domains ( int num_procs ) - -/******************************************************************************/ -/* - Purpose: - - MAKE_DOMAINS sets up the information defining the process domains. - - Modified: - - 10 September 2013 - - Parameters: - - Input, int NUM_PROCS, the number of processes. -*/ -{ - double d; - double eps; - int i; - int p; - double x_max; - double x_min; -/* - Allocate arrays for process information. -*/ - proc = ( int * ) malloc ( ( N + 2 ) * sizeof ( int ) ); - i_min = ( int * ) malloc ( num_procs * sizeof ( int ) ); - i_max = ( int * ) malloc ( num_procs * sizeof ( int ) ); - left_proc = ( int * ) malloc ( num_procs * sizeof ( int ) ); - right_proc = ( int * ) malloc ( num_procs * sizeof ( int ) ); -/* - Divide the range [(1-eps)..(N+eps)] evenly among the processes. -*/ - eps = 0.0001; - d = ( N - 1.0 + 2.0 * eps ) / ( double ) num_procs; - - for ( p = 0; p < num_procs; p++ ) - { -/* - The I indices assigned to domain P will satisfy X_MIN <= I <= X_MAX. -*/ - x_min = - eps + 1.0 + ( double ) ( p * d ); - x_max = x_min + d; -/* - For the node with index I, store in PROC[I] the process P it belongs to. -*/ - for ( i = 1; i <= N; i++ ) - { - if ( x_min <= i && i < x_max ) - { - proc[i] = p; - } - } - } -/* - Now find the lowest index I associated with each process P. -*/ - for ( p = 0; p < num_procs; p++ ) - { - for ( i = 1; i <= N; i++ ) - { - if ( proc[i] == p ) - { - break; - } - } - i_min[p] = i; -/* - Find the largest index associated with each process P. -*/ - for ( i = N; 1 <= i; i-- ) - { - if ( proc[i] == p ) - { - break; - } - } - i_max[p] = i; -/* - Find the processes to left and right. -*/ - left_proc[p] = -1; - right_proc[p] = -1; - - if ( proc[p] != -1 ) - { - if ( 1 < i_min[p] && i_min[p] <= N ) - { - left_proc[p] = proc[i_min[p] - 1]; - } - if ( 0 < i_max[p] && i_max[p] < N ) - { - right_proc[p] = proc[i_max[p] + 1]; - } - } - } - - return; -} -/******************************************************************************/ - -double *make_source ( ) - -/******************************************************************************/ -/* - Purpose: - - MAKE_SOURCE sets up the source term for the Poisson equation. - - Modified: - - 16 September 2013 - - Parameters: - - Output, double *MAKE_SOURCE, a pointer to the (N+2)*(N+2) source term - array. -*/ -{ - double *f; - int i; - int j; - int k; - double q; - - f = ( double * ) malloc ( ( N + 2 ) * ( N + 2 ) * sizeof ( double ) ); - - for ( i = 0; i < ( N + 2 ) * ( N + 2 ); i++ ) - { - f[i] = 0.0; - } -/* - Make a dipole. -*/ - q = 10.0; - - i = 1 + N / 4; - j = i; - k = INDEX ( i, j ); - f[k] = q; - - i = 1 + 3 * N / 4; - j = i; - k = INDEX ( i, j ); - f[k] = -q; - - return f; -} -/******************************************************************************/ - -void timestamp ( ) - -/******************************************************************************/ -/* - Purpose: - - TIMESTAMP prints the current YMDHMS date as a time stamp. - - Example: - - 31 May 2001 09:45:54 AM - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 24 September 2003 - - Author: - - John Burkardt - - Parameters: - - None -*/ -{ -# define TIME_SIZE 40 - - static char time_buffer[TIME_SIZE]; - const struct tm *tm; - time_t now; - - now = time ( NULL ); - tm = localtime ( &now ); - - strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); - - printf ( "%s\n", time_buffer ); - - return; -# undef TIME_SIZE -} diff --git a/src/tutor/mp/python_openmp.py b/src/tutor/mp/python_openmp.py deleted file mode 100755 index f4fc8fdd3..000000000 --- a/src/tutor/mp/python_openmp.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# -# Taken from: https://scicomp.aalto.fi/triton/examples/python/python_openmp/python_openmp/ -# E.g. -# export OMP_PROC_BIND=true -# export OMP_NUM_THREADS=4 - -import os -from time import time -import numpy as np - -print('SLURM_CPUS_PER_TASK: Using %d processors' % int(os.getenv('SLURM_CPUS_PER_TASK',1))) - -nrounds = 5 -n = 2000 - -t_start = time() - -for i in range(nrounds): - a = np.random.random([n,n]) - a = a + a.T - b = np.linalg.pinv(a) - -t_delta = time() - t_start - -print('Seconds taken to invert %d symmetric %dx%d matrices: %f' % (nrounds, n, n, t_delta)) diff --git a/src/tutor/mp/quad_openmp.c b/src/tutor/mp/quad_openmp.c deleted file mode 100644 index 523aa067d..000000000 --- a/src/tutor/mp/quad_openmp.c +++ /dev/null @@ -1,208 +0,0 @@ -# include -# include -# include -# include -# include - -int main ( int argc, char *argv[] ); -double f ( double x ); -double cpu_time ( ); -void timestamp ( ); - -/******************************************************************************/ - -int main ( int argc, char *argv[] ) - -/******************************************************************************/ -/* - Purpose: - - MAIN is the main program for QUAD_OPENMP. - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 14 December 2011 - - Author: - - John Burkardt -*/ -{ - double a = 0.0; - double b = 10.0; - double error; - double exact = 0.49936338107645674464; - int i; -// int n = 10000000; - int n = 1000000000; - double total; - double wtime; - double x; - - timestamp ( ); - printf ( "\n" ); - printf ( "QUAD_OPENMP:\n" ); - printf ( " C version\n" ); - printf ( " Use OpenMP for parallel execution.\n" ); - printf ( " Estimate the integral of f(x) from A to B.\n" ); - printf ( " f(x) = 50 / ( pi * ( 2500 * x * x + 1 ) ).\n" ); - printf ( "\n" ); - printf ( " A = %f\n", a ); - printf ( " B = %f\n", b ); - printf ( " N = %d\n", n ); - printf ( " Exact = %24.16f\n", exact ); - - wtime = omp_get_wtime ( ); - - total = 0.0; - -# pragma omp parallel shared ( a, b, n ) private ( i, x ) - -# pragma omp for reduction ( + : total ) - - for ( i = 0; i < n; i++ ) - { - x = ( ( double ) ( n - i - 1 ) * a + ( double ) ( i ) * b ) / ( double ) ( n - 1 ); - total = total + f ( x ); - } - - wtime = omp_get_wtime ( ) - wtime; - - total = ( b - a ) * total / ( double ) n; - error = fabs ( total - exact ); - - printf ( "\n" ); - printf ( " Estimate = %24.16f\n", total ); - printf ( " Error = %e\n", error ); - printf ( " Time = %f\n", wtime ); -/* - Terminate. -*/ - printf ( "\n" ); - printf ( "QUAD_OPENMP:\n" ); - printf ( " Normal end of execution.\n" ); - printf ( "\n" ); - timestamp ( ); - - return 0; -} -/*******************************************************************************/ - -double f ( double x ) - -/*******************************************************************************/ -/* - Purpose: - - F evaluates the function. - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 18 July 2010 - - Author: - - John Burkardt - - Parameters: - - Input, double X, the argument. - - Output, double F, the value of the function. -*/ -{ - double r8_pi = 3.141592653589793; - double value; - - value = 50.0 / ( r8_pi * ( 2500.0 * x * x + 1.0 ) ); - - return value; -} -/*******************************************************************************/ - -double cpu_time ( ) - -/*******************************************************************************/ -/* - Purpose: - - CPU_TIME reports the total CPU time for a program. - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 27 September 2005 - - Author: - - John Burkardt - - Parameters: - - Output, double CPU_TIME, the current total elapsed CPU time in second. -*/ -{ - double value; - - value = ( double ) clock ( ) / ( double ) CLOCKS_PER_SEC; - - return value; -} -/******************************************************************************/ - -void timestamp ( ) - -/******************************************************************************/ -/* - Purpose: - - TIMESTAMP prints the current YMDHMS date as a time stamp. - - Example: - - 31 May 2001 09:45:54 AM - - Licensing: - - This code is distributed under the GNU LGPL license. - - Modified: - - 24 September 2003 - - Author: - - John Burkardt - - Parameters: - - None -*/ -{ -# define TIME_SIZE 40 - - static char time_buffer[TIME_SIZE]; - const struct tm *tm; - time_t now; - - now = time ( NULL ); - tm = localtime ( &now ); - - strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); - - printf ( "%s\n", time_buffer ); - - return; -# undef TIME_SIZE -} diff --git a/src/tutor/mp/scaling.c b/src/tutor/mp/scaling.c deleted file mode 100644 index afbd2b655..000000000 --- a/src/tutor/mp/scaling.c +++ /dev/null @@ -1,24 +0,0 @@ -// Compile with: gcc scaling.c -std=c99 -fopenmp -O3 -// See also discussion on: -// https://stackoverflow.com/questions/19780554/what-limits-scaling-in-this-simple-openmp-program - -#include -#include -#include - -int main(){ - - const uint64_t umin=1; - const uint64_t umax=4000000000LL; // 4->5 already causes overflow - double sum=0.; -#pragma omp parallel for reduction(+:sum) - for(uint64_t u=umin; u -#include - -string defv[] = { - "umin=1\n Starting value", - "umax=10000\n sqrt of Ending value", - "umax2=0\n Non-parallel loop, again sqrt of Ending value", - "iter=1\n How many times to iterate and report timing", - "VERSION=1.4\n 17-sep-2023 PJT", - NULL, -}; - -string usage="NEMO version of the well scaled OpenMP scaling program"; - - -void nemo_main(void) -{ - int umin4 = getiparam("umin"); // NEMO doesn't have a reliable longlong - int umax4 = getiparam("umax"); - int umax2 = getiparam("umax2"); - int niter = getiparam("iter"); - uint64_t umin = (uint64_t) umin4 * (uint64_t) umin4; - uint64_t umax = (uint64_t) umax4 * (uint64_t) umax4; - double t0, t1, t2 = 0.0; - bool Qshow = niter == 1; - extern int np_openmp; // this is a cheat; see getparam.c - - dprintf(0,"omp_get_num_procs() -> %d\n",np_openmp); - dprintf(0,"scaling2: umin=%ld umax=%ld\n",umin,umax); - - while (niter--) { - double sum=0.0; -#pragma omp parallel for reduction(+:sum) - for(uint64_t u=umin; u 0) { - sum=0.0; - umax = (uint64_t) umax2 * (uint64_t) umax2; - for(uint64_t u=umin; u - -string defv[] = { - "n=1000000\n array size", - "iter=10\n how many times to iterate each block", - "seed=123\n seed for xrandom", - "mode=0\n Different openmp experiments", - "VERSION=0.3\n 14-feb-2021 PJT", - NULL, -}; - -string usage="benchmark openmp overhead of starting a test via sections"; - -string cvsid="$Id:$"; - -/* - benchmark program: - /usr/bin/time sections mode=3 iter=0 n=100000000 - -n * iter scales the CPU, but for small n you can see the caching effect. -Small: 0.40 Gop -Large: 1.20 -Elarge 2.53 (memory starts to fill up? 1 Gx=28" - -lma: 3.40 n=100M iter=10 (1G-xrandom = 21") - 1.00 n=1000 iter=1M -*/ - - -void work1(int n, double *x, int iter); -void work2(int n, double *x, int iter); -void work1r(int n, double *x, int iter); -void work2r(int n, double *x, int iter); -void work3(int n, double *x, double *y, double *z, int iter); - -void nemo_main() -{ - int n = getiparam("n"); - int seed = init_xrandom(getparam("seed")); - int iter1 = getiparam("iter"); - int iter2 = iter1; - int i; - int n1 = n; - int n2 = n; - int n3 = n; - int mode = getiparam("mode"); - real t0, t1; - real *x1 = (real *) allocate(n1*sizeof(double)); - real *x2 = (real *) allocate(n2*sizeof(double)); - real *x3 = (real *) allocate(n3*sizeof(double)); - - dprintf(0,"n=%d iter=%d\n",n,iter1); - // cannot omp this for-loop: xrandom has no mutex - for (i=0; i -# include -# include -# include -# include - -int main ( void ); -void test01 ( int n ); -void test02 ( int n ); -void test03 ( int n ); - -int isamax ( int n, float x[], int incx ); -void matgen ( int lda, int n, float a[], float x[], float b[] ); -void msaxpy ( int nr, int nc, float a[], int n, float x[], float y[] ); -void msaxpy2 ( int nr, int nc, float a[], int n, float x[], float y[] ); -int msgefa ( float a[], int lda, int n, int ipvt[] ); -int msgefa2 ( float a[], int lda, int n, int ipvt[] ); -void saxpy ( int n, float a, float x[], int incx, float y[], int incy ); -float sdot ( int n, float x[], int incx, float y[], int incy ); -int sgefa ( float a[], int lda, int n, int ipvt[] ); -void sgesl ( float a[], int lda, int n, int ipvt[], float b[], int job ); -void sscal ( int n, float a, float x[], int incx ); -void sswap ( int n, float x[], int incx, float y[], int incy ); -void timestamp ( ); - -/******************************************************************************/ - -int main ( void ) - -/******************************************************************************/ -/* - Purpose: - - MAIN is the main program for the SGEFA_OPENMP test program. - - Discussion: - - We want to compare methods of solving the linear system A*x=b. - - The first way uses the standard sequential algorithm "SGEFA". - - The second way uses a variant of SGEFA that has been modified to - take advantage of OpenMP. - - The third way reruns the variant code, but with OpenMP turned off. - - Modified: - - 17 April 2009 - - Author: - - John Burkardt -*/ -{ - int n; - - timestamp ( ); - - printf ( "\n" ); - printf ( "SGEFA_OPENMP\n" ); - printf ( " C + OpenMP version\n" ); - - printf ( "\n" ); - printf ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); - printf ( " Number of threads = %d\n", omp_get_max_threads ( ) ); - - printf ( "\n" ); - printf ( " Algorithm Mode N Error Time\n" ); - - printf ( "\n" ); - n = 10; - test01 ( n ); - test02 ( n ); - test03 ( n ); - - printf ( "\n" ); - n = 100; - test01 ( n ); - test02 ( n ); - test03 ( n ); - - printf ( "\n" ); - n = 1000; - test01 ( n ); - test02 ( n ); - test03 ( n ); - - printf ( "\n" ); - printf ( "SGEFA_OPENMP\n" ); - printf ( " Normal end of execution.\n" ); - - printf ( "\n" ); - timestamp ( ); - - return 0; -} -/******************************************************************************/ - -void test01 ( int n ) - -/******************************************************************************/ -/* - Purpose: - - TEST01 runs the sequential version of SGEFA. - - Modified: - - 07 April 2008 - - Author: - - John Burkardt -*/ -{ - float *a; - float *b; - float err; - int i; - int info; - int *ipvt; - int job; - int lda; - double wtime; - float *x; -/* - Generate the linear system A * x = b. -*/ - lda = n; - a = ( float * ) malloc ( lda * n * sizeof ( float ) ); - b = ( float * ) malloc ( n * sizeof ( float ) ); - x = ( float * ) malloc ( n * sizeof ( float ) ); - - matgen ( lda, n, a, x, b ); -/* - Factor the linear system. -*/ - ipvt = ( int * ) malloc ( n * sizeof ( int ) ); - - wtime = omp_get_wtime ( ); - info = sgefa ( a, lda, n, ipvt ); - wtime = omp_get_wtime ( ) - wtime; - - if ( info != 0 ) - { - printf ( "\n" ); - printf ( "TEST01 - Fatal error!\n" ); - printf ( " SGEFA reports the matrix is singular.\n" ); - exit ( 1 ); - } -/* - Solve the linear system. -*/ - job = 0; - sgesl ( a, lda, n, ipvt, b, job ); - - err = 0.0; - for ( i = 0; i < n; i++ ) - { - err = err + fabs ( x[i] - b[i] ); - } - printf ( " Original Sequential %8d %10.4e %10.4e\n", n, err, wtime ); - - free ( a ); - free ( b ); - free ( ipvt ); - free ( x ); - - return; -} -/******************************************************************************/ - -void test02 ( int n ) - -/******************************************************************************/ -/* - Purpose: - - TEST02 runs the revised version of SGEFA in parallel. - - Modified: - - 07 April 2008 - - Author: - - John Burkardt -*/ -{ - float *a; - float *b; - float err; - int i; - int info; - int *ipvt; - int job; - int lda; - double wtime; - float *x; -/* - Generate the linear system A * x = b. -*/ - lda = n; - a = ( float * ) malloc ( lda * n * sizeof ( float ) ); - b = ( float * ) malloc ( n * sizeof ( float ) ); - x = ( float * ) malloc ( n * sizeof ( float ) ); - - matgen ( lda, n, a, x, b ); -/* - Factor the linear system. -*/ - ipvt = ( int * ) malloc ( n * sizeof ( int ) ); - - wtime = omp_get_wtime ( ); - info = msgefa ( a, lda, n, ipvt ); - wtime = omp_get_wtime ( ) - wtime; - - if ( info != 0 ) - { - printf ( "\n" ); - printf ( "TEST02 - Fatal error!\n" ); - printf ( " MSGEFA reports the matrix is singular.\n" ); - exit ( 1 ); - } -/* - Solve the linear system. -*/ - job = 0; - sgesl ( a, lda, n, ipvt, b, job ); - - err = 0.0; - for ( i = 0; i < n; i++ ) - { - err = err + fabs ( x[i] - b[i] ); - } - - printf ( " Revised Parallel %8d %10.4e %10.4e\n", n, err, wtime ); - - free ( a ); - free ( b ); - free ( ipvt ); - free ( x ); - - return; -} -/******************************************************************************/ - -void test03 ( int n ) - -/******************************************************************************/ -/* - Purpose: - - TEST03 runs the revised version of SGEFA in sequential mode. - - Modified: - - 07 April 2008 - - Author: - - John Burkardt -*/ -{ - float *a; - float *b; - float err; - int i; - int info; - int *ipvt; - int job; - int lda; - double wtime; - float *x; -/* - Generate the linear system A * x = b. -*/ - lda = n; - a = ( float * ) malloc ( lda * n * sizeof ( float ) ); - b = ( float * ) malloc ( n * sizeof ( float ) ); - x = ( float * ) malloc ( n * sizeof ( float ) ); - - matgen ( lda, n, a, x, b ); -/* - Factor the linear system. -*/ - ipvt = ( int * ) malloc ( n * sizeof ( int ) ); - - wtime = omp_get_wtime ( ); - info = msgefa2 ( a, lda, n, ipvt ); - wtime = omp_get_wtime ( ) - wtime; - - if ( info != 0 ) - { - printf ( "\n" ); - printf ( "TEST03 - Fatal error!\n" ); - printf ( " MSGEFA2 reports the matrix is singular.\n" ); - exit ( 1 ); - } -/* - Solve the linear system. -*/ - job = 0; - sgesl ( a, lda, n, ipvt, b, job ); - - err = 0.0; - for ( i = 0; i < n; i++ ) - { - err = err + fabs ( x[i] - b[i] ); - } - - printf ( " Revised Sequential %8d %10.4e %10.4e\n", n, err, wtime ); - - free ( a ); - free ( b ); - free ( ipvt ); - free ( x ); - - return; -} -/******************************************************************************/ - -int isamax ( int n, float x[], int incx ) - -/******************************************************************************/ -/* - Purpose: - - ISAMAX finds the index of the vector element of maximum absolute value. - - Discussion: - - WARNING: This index is a 1-based index, not a 0-based index! - - Modified: - - 07 April 2008 - - Author: - - FORTRAN77 original version by Lawson, Hanson, Kincaid, Krogh. - C version by John Burkardt - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh, - Algorithm 539: - Basic Linear Algebra Subprograms for Fortran Usage, - ACM Transactions on Mathematical Software, - Volume 5, Number 3, September 1979, pages 308-323. - - Parameters: - - Input, int N, the number of entries in the vector. - - Input, float X[*], the vector to be examined. - - Input, int INCX, the increment between successive entries of SX. - - Output, int ISAMAX, the index of the element of maximum - absolute value. -*/ -{ - float xmax; - int i; - int ix; - int value; - - value = 0; - - if ( n < 1 || incx <= 0 ) - { - return value; - } - - value = 1; - - if ( n == 1 ) - { - return value; - } - - if ( incx == 1 ) - { - xmax = fabs ( x[0] ); - - for ( i = 1; i < n; i++ ) - { - if ( xmax < fabs ( x[i] ) ) - { - value = i + 1; - xmax = fabs ( x[i] ); - } - } - } - else - { - ix = 0; - xmax = fabs ( x[0] ); - ix = ix + incx; - - for ( i = 1; i < n; i++ ) - { - if ( xmax < fabs ( x[ix] ) ) - { - value = i + 1; - xmax = fabs ( x[ix] ); - } - ix = ix + incx; - } - } - - return value; -} -/*******************************************************************************/ - -void matgen ( int lda, int n, float a[], float x[], float b[] ) - -/*******************************************************************************/ -/* - Purpose: - - MATGEN generates a "random" matrix for testing. - - Modified: - - 27 April 2008 - - Author: - - John Burkardt - - Parameters: - - Input, int LDA, the leading dimension of the matrix. - - Input, int N, the order of the matrix, and the length of the vector. - - Output, float A[LDA*N], the matrix. - - Output, float X[N], the solution vector. - - Output, float B[N], the right hand side vector. -*/ -{ - int i; - int j; - int seed; - float value; - - seed = 1325; -/* - Set the matrix A. -*/ - for ( j = 0; j < n; j++ ) - { - for ( i = 0; i < n; i++ ) - { - seed = ( 3125 * seed ) % 65536; - value = ( ( float ) seed - 32768.0 ) / 16384.0; - a[i+j*lda] = value; - } - } -/* - Set x. -*/ - for ( i = 0; i < n; i++ ) - { - x[i] = ( float ) ( i + 1 ) / ( ( float ) n ); - } -/* - Set b = A * x. -*/ - for ( i = 0; i < n; i++ ) - { - b[i] = 0.0; - for ( j = 0; j < n; j++ ) - { - b[i] = b[i] + a[i+j*lda] * x[j]; - } - } - return; -} -/******************************************************************************/ - -void msaxpy ( int nr, int nc, float a[], int n, float x[], float y[] ) - -/******************************************************************************/ -/* - Purpose: - - MSAXPY carries out multiple "SAXPY" operations. - - Discussion: - - This routine carries out the step of Gaussian elimination where multiples - of the pivot row are added to the rows below the pivot row. - - A single call to MSAXPY replaces multiple calls to SAXPY. - - Modified: - - 07 April 2008 - - Author: - - Wesley Petersen - - Parameters: - - Input, int NR, NC, the number of rows and columns in the matrix. - - Input, float A[*], ... - - Input, int N, ... - - Input, float X[*], ... - - Output, float Y[*], ... -*/ -{ - int i,j; - -# pragma omp parallel \ - shared ( a, nc, nr, x, y ) \ - private ( i, j ) - -# pragma omp for - for ( j = 0; j < nc; j++) - { - for ( i = 0; i < nr; i++ ) - { - y[i+j*n] += a[j*n] * x[i]; - } - } - return; -} -/******************************************************************************/ - -void msaxpy2 ( int nr, int nc, float a[], int n, float x[], float y[] ) - -/******************************************************************************/ -/* - Purpose: - - MSAXPY2 carries out multiple "SAXPY" operations. - - Discussion: - - This routine carries out the step of Gaussian elimination where multiples - of the pivot row are added to the rows below the pivot row. - - A single call to MSAXPY replaces multiple calls to SAXPY. - - Modified: - - 07 April 2008 - - Author: - - Wesley Petersen - - Parameters: - - Input, int NR, NC, the number of rows and columns in the matrix. - - Input, float A[*], ... - - Input, int N, ... - - Input, float X[*], ... - - Output, float Y[*], ... -*/ -{ - int i,j; - - for ( j = 0; j < nc; j++) - { - for ( i = 0; i < nr; i++ ) - { - y[i+j*n] += a[j*n] * x[i]; - } - } - return; -} -/******************************************************************************/ - -int msgefa ( float a[], int lda, int n, int ipvt[] ) - -/******************************************************************************/ -/* - Purpose: - - MSGEFA factors a matrix by gaussian elimination. - - Discussion: - - Matrix references which would, mathematically, be written A(I,J) - must be written here as: - * A[I+J*LDA], when the value is needed, or - * A+I+J*LDA, when the address is needed. - - This variant of SGEFA uses OpenMP for improved parallel execution. - The step in which multiples of the pivot row are added to individual - rows has been replaced by a single call which updates the entire - matrix sub-block. - - Modified: - - 07 March 2008 - - Author: - - FORTRAN77 original version by Cleve Moler. - C version by Wesley Petersen. - - Reference: - - Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Parameters: - - Input/output, float A[LDA*N]. On input, the matrix to be factored. - On output, an upper triangular matrix and the multipliers which were - used to obtain it. The factorization can be written A = L * U where - L is a product of permutation and unit lower triangular matrices and - U is upper triangular. - - Input, int LDA, the leading dimension of the matrix. - - Input, int N, the order of the matrix. - - Output, int IPVT[N], the pivot indices. - - Output, int MSGEFA, indicates singularity. - If 0, this is the normal value, and the algorithm succeeded. - If K, then on the K-th elimination step, a zero pivot was encountered. - The matrix is numerically not invertible. -*/ -{ - int info; - int k,kp1,l,nm1; - float t; - - info = 0; - nm1 = n - 1; - for ( k = 0; k < nm1; k++ ) - { - kp1 = k + 1; - l = isamax ( n-k, a+k+k*lda, 1 ) + k - 1; - ipvt[k] = l + 1; - - if ( a[l+k*lda] == 0.0 ) - { - info = k + 1; - return info; - } - - if ( l != k ) - { - t = a[l+k*lda]; - a[l+k*lda] = a[k+k*lda]; - a[k+k*lda] = t; - } - t = -1.0 / a[k+k*lda]; - sscal ( n-k-1, t, a+kp1+k*lda, 1 ); -/* - Interchange the pivot row and the K-th row. -*/ - if ( l != k ) - { - sswap ( n-k-1, a+l+kp1*lda, lda, a+k+kp1*lda, lda ); - } -/* - Add multiples of the K-th row to rows K+1 through N. -*/ - msaxpy ( n-k-1, n-k-1, a+k+kp1*lda, n, a+kp1+k*lda, a+kp1+kp1*lda ); - } - - ipvt[n-1] = n; - - if ( a[n-1+(n-1)*lda] == 0.0 ) - { - info = n; - } - - return info; -} -/******************************************************************************/ - -int msgefa2 ( float a[], int lda, int n, int ipvt[] ) - -/******************************************************************************/ -/* - Purpose: - - MSGEFA2 factors a matrix by gaussian elimination. - - Discussion: - - Matrix references which would, mathematically, be written A(I,J) - must be written here as: - * A[I+J*LDA], when the value is needed, or - * A+I+J*LDA, when the address is needed. - - This variant of SGEFA uses OpenMP for improved parallel execution. - The step in which multiples of the pivot row are added to individual - rows has been replaced by a single call which updates the entire - matrix sub-block. - - Modified: - - 07 March 2008 - - Author: - - FORTRAN77 original version by Cleve Moler. - C version by Wesley Petersen. - - Reference: - - Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Parameters: - - Input/output, float A[LDA*N]. On input, the matrix to be factored. - On output, an upper triangular matrix and the multipliers which were - used to obtain it. The factorization can be written A = L * U where - L is a product of permutation and unit lower triangular matrices and - U is upper triangular. - - Input, int LDA, the leading dimension of the matrix. - - Input, int N, the order of the matrix. - - Output, int IPVT[N], the pivot indices. - - Output, int MSGEFA, indicates singularity. - If 0, this is the normal value, and the algorithm succeeded. - If K, then on the K-th elimination step, a zero pivot was encountered. - The matrix is numerically not invertible. -*/ -{ - int info; - int k,kp1,l,nm1; - float t; - - info = 0; - nm1 = n - 1; - for ( k = 0; k < nm1; k++ ) - { - kp1 = k + 1; - l = isamax ( n-k, a+k+k*lda, 1 ) + k - 1; - ipvt[k] = l + 1; - - if ( a[l+k*lda] == 0.0 ) - { - info = k + 1; - return info; - } - - if ( l != k ) - { - t = a[l+k*lda]; - a[l+k*lda] = a[k+k*lda]; - a[k+k*lda] = t; - } - t = -1.0 / a[k+k*lda]; - sscal ( n-k-1, t, a+kp1+k*lda, 1 ); -/* - Interchange the pivot row and the K-th row. -*/ - if ( l != k ) - { - sswap ( n-k-1, a+l+kp1*lda, lda, a+k+kp1*lda, lda ); - } -/* - Add multiples of the K-th row to rows K+1 through N. -*/ - msaxpy2 ( n-k-1, n-k-1, a+k+kp1*lda, n, a+kp1+k*lda, a+kp1+kp1*lda ); - } - - ipvt[n-1] = n; - - if ( a[n-1+(n-1)*lda] == 0.0 ) - { - info = n; - } - - return info; -} -/******************************************************************************/ - -void saxpy ( int n, float a, float x[], int incx, float y[], int incy ) - -/******************************************************************************/ -/* - Purpose: - - SAXPY computes float constant times a vector plus a vector. - - Discussion: - - This routine uses unrolled loops for increments equal to one. - - Modified: - - 23 February 2006 - - Author: - - FORTRAN77 original version by Dongarra, Moler, Bunch, Stewart. - C version by John Burkardt - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh, - Basic Linear Algebra Subprograms for Fortran Usage, - Algorithm 539, - ACM Transactions on Mathematical Software, - Volume 5, Number 3, September 1979, pages 308-323. - - Parameters: - - Input, int N, the number of elements in X and Y. - - Input, float A, the multiplier of X. - - Input, float X[*], the first vector. - - Input, int INCX, the increment between successive entries of X. - - Input/output, float Y[*], the second vector. - On output, Y[*] has been replaced by Y[*] + A * X[*]. - - Input, int INCY, the increment between successive entries of Y. -*/ -{ - int i; - int ix; - int iy; - int m; - - if ( n <= 0 ) - { - return; - } - - if ( a == 0.0 ) - { - return; - } -/* - Code for unequal increments or equal increments - not equal to 1. -*/ - if ( incx != 1 || incy != 1 ) - { - if ( 0 <= incx ) - { - ix = 0; - } - else - { - ix = ( - n + 1 ) * incx; - } - - if ( 0 <= incy ) - { - iy = 0; - } - else - { - iy = ( - n + 1 ) * incy; - } - - for ( i = 0; i < n; i++ ) - { - y[iy] = y[iy] + a * x[ix]; - ix = ix + incx; - iy = iy + incy; - } - } -/* - Code for both increments equal to 1. -*/ - else - { - m = n % 4; - - for ( i = 0; i < m; i++ ) - { - y[i] = y[i] + a * x[i]; - } - - for ( i = m; i < n; i = i + 4 ) - { - y[i ] = y[i ] + a * x[i ]; - y[i+1] = y[i+1] + a * x[i+1]; - y[i+2] = y[i+2] + a * x[i+2]; - y[i+3] = y[i+3] + a * x[i+3]; - } - } - - return; -} -/******************************************************************************/ - -float sdot ( int n, float x[], int incx, float y[], int incy ) - -/******************************************************************************/ -/* - Purpose: - - SDOT forms the dot product of two vectors. - - Discussion: - - This routine uses unrolled loops for increments equal to one. - - Modified: - - 23 February 2006 - - Author: - - FORTRAN77 original version by Dongarra, Moler, Bunch, Stewart - C version by John Burkardt - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979. - - Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh, - Basic Linear Algebra Subprograms for Fortran Usage, - Algorithm 539, - ACM Transactions on Mathematical Software, - Volume 5, Number 3, September 1979, pages 308-323. - - Parameters: - - Input, int N, the number of entries in the vectors. - - Input, float X[*], the first vector. - - Input, int INCX, the increment between successive entries in X. - - Input, float Y[*], the second vector. - - Input, int INCY, the increment between successive entries in Y. - - Output, float SDOT, the sum of the product of the corresponding - entries of X and Y. -*/ -{ - int i; - int ix; - int iy; - int m; - float temp; - - temp = 0.0; - - if ( n <= 0 ) - { - return temp; - } -/* - Code for unequal increments or equal increments - not equal to 1. -*/ - if ( incx != 1 || incy != 1 ) - { - if ( 0 <= incx ) - { - ix = 0; - } - else - { - ix = ( - n + 1 ) * incx; - } - - if ( 0 <= incy ) - { - iy = 0; - } - else - { - iy = ( - n + 1 ) * incy; - } - - for ( i = 0; i < n; i++ ) - { - temp = temp + x[ix] * y[iy]; - ix = ix + incx; - iy = iy + incy; - } - } -/* - Code for both increments equal to 1. -*/ - else - { - m = n % 5; - - for ( i = 0; i < m; i++ ) - { - temp = temp + x[i] * y[i]; - } - - for ( i = m; i < n; i = i + 5 ) - { - temp = temp + x[i ] * y[i ] - + x[i+1] * y[i+1] - + x[i+2] * y[i+2] - + x[i+3] * y[i+3] - + x[i+4] * y[i+4]; - } - } - - return temp; -} -/*******************************************************************************/ - -int sgefa ( float a[], int lda, int n, int ipvt[] ) - -/*******************************************************************************/ -/* - Purpose: - - SGEFA factors a matrix by gaussian elimination. - - Discussion: - - Matrix references which would, mathematically, be written A(I,J) - must be written here as: - * A[I+J*LDA], when the value is needed, or - * A+I+J*LDA, when the address is needed. - - Modified: - - 07 March 2008 - - Author: - - FORTRAN77 original version by Cleve Moler. - C version by John Burkardt. - - Reference: - - Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Parameters: - - Input/output, float A[LDA*N]. On input, the matrix to be factored. - On output, an upper triangular matrix and the multipliers which were - used to obtain it. The factorization can be written A = L * U where - L is a product of permutation and unit lower triangular matrices and - U is upper triangular. - - Input, int LDA, the leading dimension of the matrix. - - Input, int N, the order of the matrix. - - Output, int IPVT[N], the pivot indices. - - Output, int SGEFA, indicates singularity. - If 0, this is the normal value, and the algorithm succeeded. - If K, then on the K-th elimination step, a zero pivot was encountered. - The matrix is numerically not invertible. -*/ -{ - int j; - int info; - int k; - int l; - float t; - - info = 0; - - for ( k = 1; k <= n - 1; k++ ) - { -/* - Find l = pivot index. -*/ - l = isamax ( n-k+1, &a[k-1+(k-1)*lda], 1 ) + k - 1; - ipvt[k-1] = l; -/* - Zero pivot implies this column already triangularized. -*/ - if ( a[l-1+(k-1)*lda] != 0.0 ) - { -/* - Interchange if necessary. -*/ - if ( l != k ) - { - t = a[l-1+(k-1)*lda]; - a[l-1+(k-1)*lda] = a[k-1+(k-1)*lda]; - a[k-1+(k-1)*lda] = t; - } -/* - Compute multipliers. -*/ - t = - 1.0 / a[k-1+(k-1)*lda]; - sscal ( n-k, t, &a[k+(k-1)*lda], 1 ); -/* - Row elimination with column indexing. -*/ - for ( j = k + 1; j <= n; j++ ) - { - t = a[l-1+(j-1)*lda]; - if (l != k) - { - a[l-1+(j-1)*lda] = a[k-1+(j-1)*lda]; - a[k-1+(j-1)*lda] = t; - } - saxpy ( n-k, t, &a[k+(k-1)*lda], 1, &a[k+(j-1)*lda], 1 ); - } - } - else - { - info = k; - } - } - ipvt[n-1] = n; - - if (a[n-1+(n-1)*lda] == 0.0 ) - { - info = n - 1; - } - return info; -} -/******************************************************************************/ - -void sgesl ( float a[], int lda, int n, int ipvt[], float b[], int job ) - -/******************************************************************************/ -/* - Purpose: - - SGESL solves a real general linear system A * X = B. - - Discussion: - - SGESL can solve either of the systems A * X = B or A' * X = B. - - The system matrix must have been factored by SGECO or SGEFA. - - A division by zero will occur if the input factor contains a - zero on the diagonal. Technically this indicates singularity - but it is often caused by improper arguments or improper - setting of LDA. It will not occur if the subroutines are - called correctly and if SGECO has set 0.0 < RCOND - or SGEFA has set INFO == 0. - - Modified: - - 04 April 2006 - - Author: - - FORTRAN77 original by Dongarra, Moler, Bunch and Stewart. - C translation by John Burkardt. - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, (Society for Industrial and Applied Mathematics), - 3600 University City Science Center, - Philadelphia, PA, 19104-2688. - ISBN: 0-89871-172-X - - Parameters: - - Input, float A[LDA*N], the output from SGECO or SGEFA. - - Input, int LDA, the leading dimension of A. - - Input, int N, the order of the matrix A. - - Input, int IPVT[N], the pivot vector from SGECO or SGEFA. - - Input/output, float B[N]. - On input, the right hand side vector. - On output, the solution vector. - - Input, int JOB. - 0, solve A * X = B; - nonzero, solve A' * X = B. -*/ -{ - int k; - int l; - float t; -/* - Solve A * X = B. -*/ - if ( job == 0 ) - { - for ( k = 1; k <= n-1; k++ ) - { - l = ipvt[k-1]; - t = b[l-1]; - - if ( l != k ) - { - b[l-1] = b[k-1]; - b[k-1] = t; - } - saxpy ( n-k, t, a+k+(k-1)*lda, 1, b+k, 1 ); - } - - for ( k = n; 1 <= k; k-- ) - { - b[k-1] = b[k-1] / a[k-1+(k-1)*lda]; - t = -b[k-1]; - saxpy ( k-1, t, a+0+(k-1)*lda, 1, b, 1 ); - } - } -/* - Solve A' * X = B. -*/ - else - { - for ( k = 1; k <= n; k++ ) - { - t = sdot ( k-1, a+0+(k-1)*lda, 1, b, 1 ); - b[k-1] = ( b[k-1] - t ) / a[k-1+(k-1)*lda]; - } - - for ( k = n-1; 1 <= k; k-- ) - { - b[k-1] = b[k-1] + sdot ( n-k, a+k+(k-1)*lda, 1, b+k, 1 ); - l = ipvt[k-1]; - - if ( l != k ) - { - t = b[l-1]; - b[l-1] = b[k-1]; - b[k-1] = t; - } - } - } - return; -} -/******************************************************************************/ - -void sscal ( int n, float sa, float x[], int incx ) - -/******************************************************************************/ -/* - Purpose: - - SSCAL scales a float vector by a constant. - - Modified: - - 23 February 2006 - - Author: - - Jack Dongarra - C version by John Burkardt - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh, - Basic Linear Algebra Subprograms for Fortran Usage, - Algorithm 539, - ACM Transactions on Mathematical Software, - Volume 5, Number 3, September 1979, pages 308-323. - - Parameters: - - Input, int N, the number of entries in the vector. - - Input, float SA, the multiplier. - - Input/output, float X[*], the vector to be scaled. - - Input, int INCX, the increment between successive entries of X. -*/ -{ - int i; - int ix; - int m; - - if ( n <= 0 ) - { - } - else if ( incx == 1 ) - { - m = n % 5; - - for ( i = 0; i < m; i++ ) - { - x[i] = sa * x[i]; - } - - for ( i = m; i < n; i = i + 5 ) - { - x[i] = sa * x[i]; - x[i+1] = sa * x[i+1]; - x[i+2] = sa * x[i+2]; - x[i+3] = sa * x[i+3]; - x[i+4] = sa * x[i+4]; - } - } - else - { - if ( 0 <= incx ) - { - ix = 0; - } - else - { - ix = ( - n + 1 ) * incx; - } - - for ( i = 0; i < n; i++ ) - { - x[ix] = sa * x[ix]; - ix = ix + incx; - } - - } - - return; -} -/******************************************************************************/ - -void sswap ( int n, float x[], int incx, float y[], int incy ) - -/******************************************************************************/ -/* - Purpose: - - SSWAP interchanges two float vectors. - - Modified: - - 23 February 2006 - - Author: - - C version by John Burkardt - - Reference: - - Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart, - LINPACK User's Guide, - SIAM, 1979, - ISBN13: 978-0-898711-72-1, - LC: QA214.L56. - - Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh, - Basic Linear Algebra Subprograms for Fortran Usage, - Algorithm 539, - ACM Transactions on Mathematical Software, - Volume 5, Number 3, September 1979, pages 308-323. - - Parameters: - - Input, int N, the number of entries in the vectors. - - Input/output, float X[*], one of the vectors to swap. - - Input, int INCX, the increment between successive entries of X. - - Input/output, float Y[*], one of the vectors to swap. - - Input, int INCY, the increment between successive elements of Y. -*/ -{ - int i; - int ix; - int iy; - int m; - float temp; - - if ( n <= 0 ) - { - } - else if ( incx == 1 && incy == 1 ) - { - m = n % 3; - - for ( i = 0; i < m; i++ ) - { - temp = x[i]; - x[i] = y[i]; - y[i] = temp; - } - - for ( i = m; i < n; i = i + 3 ) - { - temp = x[i]; - x[i] = y[i]; - y[i] = temp; - - temp = x[i+1]; - x[i+1] = y[i+1]; - y[i+1] = temp; - - temp = x[i+2]; - x[i+2] = y[i+2]; - y[i+2] = temp; - } - } - else - { - if ( 0 <= incx ) - { - ix = 0; - } - else - { - ix = ( - n + 1 ) * incx; - } - - if ( 0 <= incy ) - { - iy = 0; - } - else - { - iy = ( - n + 1 ) * incy; - } - - for ( i = 0; i < n; i++ ) - { - temp = x[ix]; - x[ix] = y[iy]; - y[iy] = temp; - ix = ix + incx; - iy = iy + incy; - } - } - return; -} -/******************************************************************************/ - -void timestamp ( ) - -/******************************************************************************/ -/* - Purpose: - - TIMESTAMP prints the current YMDHMS date as a time stamp. - - Example: - - 31 May 2001 09:45:54 AM - - Modified: - - 24 September 2003 - - Author: - - John Burkardt - - Parameters: - - None -*/ -{ -# define TIME_SIZE 40 - - static char time_buffer[TIME_SIZE]; - const struct tm *tm; - time_t now; - - now = time ( NULL ); - tm = localtime ( &now ); - - strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); - - printf ( "%s\n", time_buffer ); - - return; -# undef TIME_SIZE -} diff --git a/src/tutor/mp/sum_tbb.cc b/src/tutor/mp/sum_tbb.cc deleted file mode 100644 index a43ce8a95..000000000 --- a/src/tutor/mp/sum_tbb.cc +++ /dev/null @@ -1,22 +0,0 @@ -// g++ sum_tbb.cc -o sum_tbb -ltbb - -#include - -int main(){ - int n = 1001; - int sum = oneapi::tbb::parallel_reduce( - oneapi::tbb::blocked_range(1,n), 0, - [](oneapi::tbb::blocked_range const& r, int init) -> int { - for (int v = r.begin(); v != r.end(); v++) { - init += v; - } - return init; - }, - [](int lhs, int rhs) -> int { - return lhs + rhs; - } - ); - - printf("N=%d Sum: %d\n", n, sum); - return 0; -} diff --git a/src/tutor/mp/wait3.sh b/src/tutor/mp/wait3.sh deleted file mode 100755 index 20c923315..000000000 --- a/src/tutor/mp/wait3.sh +++ /dev/null @@ -1,21 +0,0 @@ -#! /usr/bin/bash -# -# If you have "about equal CPU" tasks that are normally single CPU programs, -# running them with this simply shell construct can be useful to speed up -# a series -# - - -echo Sleeping 2 -sleep 2 & - -echo Sleeping 4 -sleep 4 & - -echo Sleeping 6 -sleep 6 & - -echo Waiting until all are done.... -wait - -echo All done diff --git a/usr/dehnen/falcON/src/public/acc/Monopole.cc b/usr/dehnen/falcON/src/public/acc/Monopole.cc index 4400f34f8..29dc7e9a9 100644 --- a/usr/dehnen/falcON/src/public/acc/Monopole.cc +++ b/usr/dehnen/falcON/src/public/acc/Monopole.cc @@ -33,8 +33,10 @@ #include #include #define __NO_AUX_DEFACC -#include // $NEMOINC/defacc.h +// issue105 #include +#include // $NEMOINC/defacc.h + //////////////////////////////////////////////////////////////////////////////// namespace { using namespace WDutils; diff --git a/usr/dehnen/falcON/src/public/acc/PotExp.cc b/usr/dehnen/falcON/src/public/acc/PotExp.cc index 2572e08d6..017cf0143 100644 --- a/usr/dehnen/falcON/src/public/acc/PotExp.cc +++ b/usr/dehnen/falcON/src/public/acc/PotExp.cc @@ -19,6 +19,8 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | // | //-----------------------------------------------------------------------------+ +// issue105 +#include #include #include #ifndef falcON_NEMO diff --git a/usr/dehnen/falcON/src/public/lib/bodyfunc.cc b/usr/dehnen/falcON/src/public/lib/bodyfunc.cc index 7cff22fb4..b669e0ba7 100644 --- a/usr/dehnen/falcON/src/public/lib/bodyfunc.cc +++ b/usr/dehnen/falcON/src/public/lib/bodyfunc.cc @@ -154,8 +154,9 @@ namespace { // compiles a falcON C++ program in fname using compiler flags const char* falcON_path = falcON::directory(); if(falcON_path == 0) throw BfErr("cannot locate falcON directory"); - char cmmd[512]; - SNprintf(cmmd,512,"cd /tmp; %s %s.cc -o %s.so" + // issue105 + char cmmd[1024]; + SNprintf(cmmd,1024,"cd /tmp; %s %s.cc -o %s.so" " %s -shared -fPIC -I%s/inc -I%s/inc/utils -O2" #if __cplusplus >= 201103L " -std=c++0x" @@ -176,10 +177,14 @@ namespace { // " -march=native -mfpmath=sse -mpreferred-stack-boundary=4 -ggdb3" #elif defined(__GNUC__) - " -mfpmath=sse -mpreferred-stack-boundary=4 -ggdb3" + " -mfpmath=sse -ggdb3" " -Wall -Wextra -Winit-self -Wshadow -Woverloaded-virtual -fPIC" " -std=c++11" - " -fopenmp -funroll-loops -fforce-addr" + " -funroll-loops -fforce-addr" +// issue105 +# ifndef __clang__ + " -mpreferred-stack-boundary=4" +# endif #else " -fpic -openmp -g" #endif @@ -191,7 +196,7 @@ namespace { fname,fname,(flags? flags : " "),falcON_path,falcON_path,fname); DebugInfo(2,"now compiling using the following command\n %s\n",cmmd); if(system(cmmd)) { - if(debug(debug_depth)) { + //issue105 if(debug(debug_depth)) { std::cerr<<"could not compile temporary file /tmp/"< /dev/stderr",fname); @@ -202,7 +207,7 @@ namespace { SNprintf(show,512,"more /tmp/%s.log > /dev/stderr",fname); std::cerr<<'\n'; rr=system(show); - } + // } throw BfErr(message("could not compile expression; " "perhaps it contains a syntax error")); } @@ -510,7 +515,8 @@ namespace { "#define BD_TEST\n" "#define body_func\n" "#include \n\n" - "real _P[10]={RNG()};\n\n" +// issue105 + "real _P[10]={static_cast(RNG())};\n\n" "extern \"C\" {\n" " fieldset "< # endif #endif -#if __cplusplus >= 201103L && defined(_OPENMP) && \ - !defined(WDutils_included_omp_h) -# include -# define WDutils_included_omp_h -#endif // // WDutils @@ -141,27 +136,7 @@ namespace WDutils { Info._m_mpi_size=s; } - /// \name openMP stuff - //@{ - /// set \# openMP threads - /// \note If @a arg[0] == 't', we set \# threads to \# processors. - /// If @a arg[0] == 'f', we set \# threads to 1 (no openMP). - /// Otherwise, we try to convert @a arg to an integer number and - /// take that. This may exceed the \# processors. - static void set_omp(const char*arg); - /// set number of openMP threads - static void set_omp(int n); - /// maximum \# processors available for openMP - static int max_omp_proc() - { return Info._m_omp_proc; } - /// number of openMP threads to be used, may exceed @a max_omp_proc() - /// \note defaults to max_omp_proc, implying openMP is used if available - static int omp_threads() - { return Info._m_omp_size; } - /// shall openMP parallelism be used? - static bool use_omp() - { return Info._m_omp_size > 1; } - //@} + /// \name TBB stuff //@{ diff --git a/usr/dehnen/utils/src/exception.cc b/usr/dehnen/utils/src/exception.cc index 250488bb0..1c5f01fd5 100644 --- a/usr/dehnen/utils/src/exception.cc +++ b/usr/dehnen/utils/src/exception.cc @@ -171,17 +171,8 @@ WDutils::RunInfo::RunInfo() SNprintf(_m_name,104,"unknown.name"); } #endif - // set # proc available for openMP - { -#ifdef _OPENMP - if(omp_in_parallel()) - WDutils_ErrorF("called inside OMP parallel region\n"); - _m_omp_proc = omp_get_num_procs(); -#else - _m_omp_proc = 1; -#endif - _m_omp_size = _m_omp_proc; - } + + // set # threads used by TBB { #ifdef WDutilsTBB @@ -195,52 +186,7 @@ WDutils::RunInfo::RunInfo() catch(WDutils::exception& ex) { WDutils_RETHROW(ex); } } // -void WDutils::RunInfo::set_omp(int -#ifdef _OPENMP - n -#endif - ) -{ -#ifdef _OPENMP - Info._m_omp_size = n; - if(Info._m_omp_size < 1) { - Info._m_omp_size = 1; - WDutils_WarningN("RunInfo::set_omp('%d') assume '1'\n",n); - } - omp_set_num_threads(Info._m_omp_size); -#else - Info._m_omp_size = 1; -#endif -} -// -void WDutils::RunInfo::set_omp(const char* -#ifdef _OPENMP - arg -#endif - ) -{ -#ifdef _OPENMP - if(arg==0 || arg[0]==0 || arg[0]=='t') - Info._m_omp_size = Info._m_omp_proc; - else if(arg[0] == 'f') - Info._m_omp_size = 1; - else if(arg && arg[0]) { - Info._m_omp_size = strtol(arg,0,10); - if(errno == EINVAL) - WDutils_THROWN("RunInfo::set_omp('%s') (errno=EINVAL)\n",arg,errno); - if(errno == ERANGE) - WDutils_THROWN("RunInfo::set_omp('%s') (errno=ERANGE)\n",arg,errno); - if(Info._m_omp_size < 1) { - Info._m_omp_size = 1; - WDutils_WarningN("RunInfo::set_omp('%s') assume '1'\n",arg); - } - } - omp_set_num_threads(Info._m_omp_size); -#else - Info._m_omp_size = 1; -#endif -} -// + WDutils::RunInfo::~RunInfo() { #ifdef WDutilsTBB