diff --git a/configure.ac b/configure.ac
index e05fb6b6c..8a911a4fd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -245,8 +245,6 @@ AC_ARG_WITH(dso,
 		  [  --with-dso              use DSO linking], with_dso=$withval, with_dso=no)
 AC_ARG_WITH(opt, 
 		  [  --with-opt              use opt for essentials], with_opt=$withval, with_opt=no)
-AC_ARG_WITH(openmp,
-		  [  --with-openmp           use OMP directives], with_openmp=$withval, with_openmp=no)
 
 AC_ARG_WITH(std,
 		  [  --with-std              use this -std=], with_std=$withval, with_std=c99)
diff --git a/src/kernel/misc/mdbench.c b/src/kernel/misc/mdbench.c
index 072c4cbe3..a7e9824ae 100644
--- a/src/kernel/misc/mdbench.c
+++ b/src/kernel/misc/mdbench.c
@@ -7,13 +7,8 @@
  */
 
 
-//#define USE_OMP
-
 #include <nemo.h>
 #include <mdarray.h>
-#ifdef USE_OMP
-#include <omp.h>
-#endif
 
 string defv[] = {
   "dim=10,20,30,40\n Dimensions of array A[dim1][dim2][dim3][dim4]....",
@@ -22,12 +17,8 @@ string defv[] = {
 
   "flip=f\n          Reverse traversal through array, for benchmarking",
   "iter=1\n          Number of times to do the work, for benchmarking",
-  "free=f\n          Free things we don't need anymore",
-#ifdef USE_OMP
-  "nprocs=1\n        Number of processors",
-#else
+  "free=f\n          Free things we don't need anymore",
   "nprocs=-1\n       No OMP enabled",
-#endif
   "VERSION=1.2\n     11-feb-2024 PJT",
   NULL,
 };
@@ -294,13 +285,8 @@ void nemo_main()
   mdarray6 x6;
   mdarray7 x7;
 
-#ifdef USE_OMP
-  if (nprocs < 0) nprocs = omp_get_max_threads();
-  dprintf(0,"Using OMP with nprocs=%d (or use OMP_NUM_THREADS)\n",nprocs);
-#else  
   dprintf(0,"Using single CPU, no OMP enables\n");
   if (nprocs>1) warning("No OMP was enabled");
-#endif
 
   /* C99 now does it the way I wanted it to work */
   dprintf(1,"pointer test3: 0x%x 0x%x 0x%x    0x%x   0x%x 0x%x\n",test3,test3[0],&test3[0][0],&test3[0][1],test3[1],&test3[1][0]);
@@ -339,10 +325,6 @@ void nemo_main()
   //  80^4 * 10 -> 3.832"
   //  90^4 * 1  ->  0.63
   // 100^4 * 1  ->  1.00
-#ifdef USE_OMP
-  #pragma omp parallel shared(x4,ntest,dim) private(i,i4,i3,i2,i1)
-  #pragma omp for
-#endif  
   for (i=0; i<ntest; i++) {
     for (i4=0; i4<dim[3]; i4++) 
       for (i3=0; i3<dim[2]; i3++)
@@ -360,10 +342,6 @@ void nemo_main()
     // 90^4:  0.62   1.60
     // 95^4:  0.74   2.00 
     y4 = allocate_mdarray4(dim[3],dim[1],dim[0],dim[2]);
-#ifdef USE_OMP
-    #pragma omp parallel shared(x4,y4,ntest,dim) private(i,i4,i3,i2,i1)
-    #pragma omp for
-#endif    
     for (i=0; i<ntest; i++) {
       for (i4=0; i4<dim[3]; i4++) 
 	for (i3=0; i3<dim[2]; i3++)
@@ -375,10 +353,6 @@ void nemo_main()
     if (free) free_mdarray4(x4,dim[3],dim[2],dim[1],dim[0]);
     if (iwork>1) {
       y3 = allocate_mdarray3(dim[3],dim[1],dim[0]);
-#ifdef USE_OMP
-      #pragma omp parallel shared(x4,y3,ntest,dim) private(i,i4,i3,i2,i1,sum)
-      #pragma omp for
-#endif    
       for (i=0; i<ntest; i++) {
 	for (i4=0; i4<dim[3]; i4++) 
 	  for (i2=0; i2<dim[1]; i2++)
diff --git a/src/nbody/io_nemo/Makefile b/src/nbody/io_nemo/Makefile
index f8ae9a3a3..a5244ea6f 100644
--- a/src/nbody/io_nemo/Makefile
+++ b/src/nbody/io_nemo/Makefile
@@ -181,7 +181,7 @@ ${IONO}/check_file.o       : ${IONS}/check_file.h ${IONS}/check_file.c
 
 # main program used with g77 compiler
 ${IONO}/nemo_g77.o         : ${IONS}/nemo_g77.c
-	$(CC) $(CFLAGS) -D$(DFC) $(OPT) $(INC) -c ${IONS}/nemo_g77.c -o ${IONO}/nemo_g77.o 
+	$(CC) $(CFLAGS) -D$(DFC) $(OPT) $(INC) -c ${IONS}/nemo_g77.c -o ${IONO}/nemo_g77.o
 
 #
 # io_nemo test programs
diff --git a/src/tutor/bench/Makefile b/src/tutor/bench/Makefile
index 0828f979e..0b01680f9 100644
--- a/src/tutor/bench/Makefile
+++ b/src/tutor/bench/Makefile
@@ -60,8 +60,3 @@ offt:	offt.c
 
 offt8:	offt.c
 	$(CC) -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -o offt8 offt.c
-
-
-array_add_omp:  array_add.c
-	$(CC) $(CFLAGS)  -fopenmp array_add.c -o array_add_omp $(NEMO_LIBS)
-#	$(CC) $(CFLAGS) -std=c99 -fopenmp array_add.c -o array_add_omp $(NEMO_LIBS)
diff --git a/src/tutor/bench/array_add.c b/src/tutor/bench/array_add.c
index 09c566f91..56150a76f 100644
--- a/src/tutor/bench/array_add.c
+++ b/src/tutor/bench/array_add.c
@@ -1,5 +1,6 @@
 /*
  * benchmark openmp vs. pthreads
+ *  issue 105 - no longer relevant
  * 
  * example taken from
  * http://www.futurechips.org/tips-for-power-coders/open-mp-pthreads.html
@@ -24,7 +25,6 @@ string usage = "bench openmp";
 void sum_st(int *A, int *B, int *C)
 {
   int i;
-  #pragma omp parallel for
   for(i = 0; i < M; i++)
     A[i] = B[i] + C[i];
 }
diff --git a/src/tutor/mp/Makefile b/src/tutor/mp/Makefile
deleted file mode 100644
index 14ce94f3d..000000000
--- a/src/tutor/mp/Makefile
+++ /dev/null
@@ -1,84 +0,0 @@
-#
-
-#   set defaults in case the makedefs does not
-CC    = gcc
-MPICC = mpicc
-
-TIME = /usr/bin/time -f "%U %S %e %P"
-
-include $(NEMOLIB)/makedefs
-
-BINS = hello1 hello2 heated_plate_openmp poisson_mpi scaling scaling2
-
-all:  $(BINS)
-
-clean:
-	rm -f $(BINS)
-
-hello1:
-	$(CC) -fopenmp -O3  -o hello1 hellomp.c  -lm
-
-hello2:
-	$(CC)          -O3  -o hello2 hellomp.c  -lm
-
-bench: $(BINS)
-	$(TIME) ./hello1
-	$(TIME) ./hello2
-
-
-heated_plate_openmp:
-	$(CC) $(CFLAGS) -fopenmp -o heated_plate_openmp heated_plate_openmp.c $(NEMO_LIBS)
-
-bench2: heated_plate_openmp
-	$(TIME) ./heated_plate_openmp nprocs=4
-
-
-# https://people.sc.fsu.edu/~jburkardt/c_src/poisson_mpi/poisson_mpi.c
-
-poisson_mpi:
-	$(MPICC) poisson_mpi.c -o poisson_mpi -lm
-
-bench3:
-	$(TIME) mpirun -np 4 ./poisson_mpi 512
-
-# takes about 25" in single mode
-bench4:
-	$(TIME) ./sections iter=10000
-
-# scaling from https://stackoverflow.com/questions/19780554/what-limits-scaling-in-this-simple-openmp-program
-
-scaling: scaling.c
-	$(CC) scaling.c -std=c99 -fopenmp -O3    -o scaling -lm
-
-ONT=1 2 4 8 16
-bench5: scaling
-	@echo OMP_NUM_THREADS ONT=$(ONT)
-	-@for i in $(ONT); do\
-	   (echo $$i;OMP_NUM_THREADS=$$i $(TIME) ./scaling); done
-
-
-quad_openmp: quad_openmp.c
-	gcc -fopenmp quad_openmp.c -o quad_openmp
-
-bench6:  quad_openmp
-	@echo OMP_NUM_THREADS ONT=$(ONT)
-	-@for i in $(ONT); do\
-	   (echo $$i;OMP_NUM_THREADS=$$i $(TIME) ./quad_openmp); done
-
-
-bench7: python_openmp.py
-	@echo OMP_NUM_THREADS ONT=$(ONT)
-	-@for i in $(ONT); do\
-	   (echo $$i;OMP_NUM_THREADS=$$i $(TIME) ipython python_openmp.py); done
-
-bench8: scaling2
-	@echo "# OMP_NUM_THREADS ONT=$(ONT)"
-	-@for i in $(ONT); do\
-	   (echo -n "$$i ";$(TIME) ./scaling2 umax=20000 np=$$i iter=20*$$i debug=-1); done
-
-
-bench9:	scaling2
-	@echo Long integration, see laptop performance drop, see core swaps
-	$(TIME) ./scaling2  iter=500 > bench9.log 2>&1
-	grep cputime bench9.log | sed s/###// | tabplot - 6 8 line=1,1 ycoord=0  ymin=0
-	@echo "Jansky 0.077  k2 0.090 0.098"
diff --git a/src/tutor/mp/README b/src/tutor/mp/README
deleted file mode 100644
index 3cd2f995f..000000000
--- a/src/tutor/mp/README
+++ /dev/null
@@ -1,34 +0,0 @@
-
-Here are some simple OPENMP examples.
-If compiled with openmp, NEMO will show this when debug=1 is used:
-   ### nemo Debug Info: omp_get_max_threads() -> 8  [OMP_NUM_THREADS]
-otherwise it would not show this.
-
-Some examples of usage:
-
-sections
---------
-   This program uses the "#pragma sections" as well as for
-   It also seems to shows that multiple sections cannot split up the for loop,
-
-
-OMP_NUM_THREADS=4 sections iter=10000
-27.81user 0.00system 0:13.89elapsed 200%CPU (0avgtext+0avgdata 18172maxresident)k
-
-No openmp:
-25.02user 0.00system 0:25.02elapsed 99%CPU (0avgtext+0avgdata 17824maxresident)k
-
-cool pages:
-
-https://zingale.github.io/phy504/openmp-relax.html
-
-
-To measure the performance an example of the output from /usr/bin/time on program with N threads
-can be processed with the amdahl.py script. Example is the heated plate, which is pretty good:
-
-# on an i5-1135G7 ./heated_plate_openmp
-1       9.21user 0.00system 0:09.21elapsed  99%CPU  1.00 1.00 1.00
-2       9.20user 0.00system 0:04.60elapsed 199%CPU  1.00 1.00 1.00
-4      10.49user 0.00system 0:02.62elapsed 399%CPU  1.14 1.14 0.95
-8      15.05user 0.02system 0:01.89elapsed 797%CPU  1.63 1.64 0.91
-16     23.20user 5.29system 0:05.82elapsed 489%CPU  2.52 10.11 0.39
diff --git a/src/tutor/mp/heated_plate_openmp.c b/src/tutor/mp/heated_plate_openmp.c
deleted file mode 100644
index 7d0026d0d..000000000
--- a/src/tutor/mp/heated_plate_openmp.c
+++ /dev/null
@@ -1,500 +0,0 @@
-/*
- * This is an example from Burkardt's OPENMP examples (great resource!)
- * with the original "main" as well as the NEMO-fied nemo_main()
- *
- * https://people.sc.fsu.edu/~jburkardt/c_src/heated_plate_openmp/heated_plate_openmp.c
- *
- */
-
-
-#include <nemo.h>
-#include <mdarray.h>
-
-# include <stdlib.h>
-# include <stdio.h>
-# include <math.h>
-# include <omp.h>
-
-
-string defv[] = {
-  "n=500\n       Number of pixels in X",
-  "m=500\n       Number of pixels in Y",
-  "eps=0.001\n   Accuracy",
-  "nprocs=-1\n   Override number of procs used",
-  "old=f\n       Use the old executable",
-  "VERSION=1\n   21-dec-2019 PJT",
-  NULL,
-};
-
-
-int old_main (void);
-int new_main (int n, int m, real eps, int nprocs);
-  
-void nemo_main()
-{
-  bool Qold = getbparam("old");
-  int n = getiparam("n");
-  int m = getiparam("m");
-  real eps = getrparam("eps");
-  int nprocs = getiparam("nprocs");
-
-  if (Qold)  {
-    old_main();
-  } else {
-    new_main(n,m,eps,nprocs);
-  }
-}
-
-
-/******************************************************************************/
-
-int old_main (void)
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAIN is the main program for HEATED_PLATE_OPENMP.
-
-  Discussion:
-
-    This code solves the steady state heat equation on a rectangular region.
-
-    The sequential version of this program needs approximately
-    18/epsilon iterations to complete. 
-
-
-    The physical region, and the boundary conditions, are suggested
-    by this diagram;
-
-                   W = 0
-             +------------------+
-             |                  |
-    W = 100  |                  | W = 100
-             |                  |
-             +------------------+
-                   W = 100
-
-    The region is covered with a grid of M by N nodes, and an N by N
-    array W is used to record the temperature.  The correspondence between
-    array indices and locations in the region is suggested by giving the
-    indices of the four corners:
-
-                  I = 0
-          [0][0]-------------[0][N-1]
-             |                  |
-      J = 0  |                  |  J = N-1
-             |                  |
-        [M-1][0]-----------[M-1][N-1]
-                  I = M-1
-
-    The steady state solution to the discrete heat equation satisfies the
-    following condition at an interior grid point:
-
-      W[Central] = (1/4) * ( W[North] + W[South] + W[East] + W[West] )
-
-    where "Central" is the index of the grid point, "North" is the index
-    of its immediate neighbor to the "north", and so on.
-   
-    Given an approximate solution of the steady state heat equation, a
-    "better" solution is given by replacing each interior point by the
-    average of its 4 neighbors - in other words, by using the condition
-    as an ASSIGNMENT statement:
-
-      W[Central]  <=  (1/4) * ( W[North] + W[South] + W[East] + W[West] )
-
-    If this process is repeated often enough, the difference between successive 
-    estimates of the solution will go to zero.
-
-    This program carries out such an iteration, using a tolerance specified by
-    the user, and writes the final estimate of the solution to a file that can
-    be used for graphic processing.
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    18 October 2011
-
-  Author:
-
-    Original C version by Michael Quinn.
-    This C version by John Burkardt.
-
-  Reference:
-
-    Michael Quinn,
-    Parallel Programming in C with MPI and OpenMP,
-    McGraw-Hill, 2004,
-    ISBN13: 978-0071232654,
-    LC: QA76.73.C15.Q55.
-
-  Local parameters:
-
-    Local, double DIFF, the norm of the change in the solution from one iteration
-    to the next.
-
-    Local, double MEAN, the average of the boundary values, used to initialize
-    the values of the solution in the interior.
-
-    Local, double U[M][N], the solution at the previous iteration.
-
-    Local, double W[M][N], the solution computed at the latest iteration.
-*/
-{
-# define M 500
-# define N 500
-
-  double diff;
-  double epsilon = 0.001;
-  int i;
-  int iterations;
-  int iterations_print;
-  int j;
-  double mean;
-  double my_diff;
-  double u[M][N];
-  double w[M][N];
-  double wtime;
-
-  printf ( "\n" );
-  printf ( "HEATED_PLATE_OPENMP\n" );
-  printf ( "  C/OpenMP version\n" );
-  printf ( "  A program to solve for the steady state temperature distribution\n" );
-  printf ( "  over a rectangular plate.\n" );
-  printf ( "\n" );
-  printf ( "  Spatial grid of %d by %d points.\n", M, N );
-  printf ( "  The iteration will be repeated until the change is <= %e\n", epsilon ); 
-  printf ( "  Number of processors available = %d\n", omp_get_num_procs ( ) );
-  printf ( "  Number of threads =              %d\n", omp_get_max_threads ( ) );
-/*
-  Set the boundary values, which don't change. 
-*/
-  mean = 0.0;
-
-#pragma omp parallel shared ( w ) private ( i, j )
-  {
-#pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-    {
-      w[i][0] = 100.0;
-    }
-#pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-    {
-      w[i][N-1] = 100.0;
-    }
-#pragma omp for
-    for ( j = 0; j < N; j++ )
-    {
-      w[M-1][j] = 100.0;
-    }
-#pragma omp for
-    for ( j = 0; j < N; j++ )
-    {
-      w[0][j] = 0.0;
-    }
-/*
-  Average the boundary values, to come up with a reasonable
-  initial value for the interior.
-*/
-#pragma omp for reduction ( + : mean )
-    for ( i = 1; i < M - 1; i++ )
-    {
-      mean = mean + w[i][0] + w[i][N-1];
-    }
-#pragma omp for reduction ( + : mean )
-    for ( j = 0; j < N; j++ )
-    {
-      mean = mean + w[M-1][j] + w[0][j];
-    }
-  }
-/*
-  OpenMP note:
-  You cannot normalize MEAN inside the parallel region.  It
-  only gets its correct value once you leave the parallel region.
-  So we interrupt the parallel region, set MEAN, and go back in.
-*/
-  mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
-  printf ( "\n" );
-  printf ( "  MEAN = %f\n", mean );
-/* 
-  Initialize the interior solution to the mean value.
-*/
-#pragma omp parallel shared ( mean, w ) private ( i, j )
-  {
-#pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-    {
-      for ( j = 1; j < N - 1; j++ )
-      {
-        w[i][j] = mean;
-      }
-    }
-  }
-/*
-  iterate until the  new solution W differs from the old solution U
-  by no more than EPSILON.
-*/
-  iterations = 0;
-  iterations_print = 1;
-  printf ( "\n" );
-  printf ( " Iteration  Change\n" );
-  printf ( "\n" );
-  wtime = omp_get_wtime ( );
-
-  diff = epsilon;
-
-  while ( epsilon <= diff )
-  {
-# pragma omp parallel shared ( u, w ) private ( i, j )
-    {
-/*
-  Save the old solution in U.
-*/
-# pragma omp for
-      for ( i = 0; i < M; i++ ) 
-      {
-        for ( j = 0; j < N; j++ )
-        {
-          u[i][j] = w[i][j];
-        }
-      }
-/*
-  Determine the new estimate of the solution at the interior points.
-  The new solution W is the average of north, south, east and west neighbors.
-*/
-# pragma omp for
-      for ( i = 1; i < M - 1; i++ )
-      {
-        for ( j = 1; j < N - 1; j++ )
-        {
-          w[i][j] = ( u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1] ) / 4.0;
-        }
-      }
-    }
-/*
-  C and C++ cannot compute a maximum as a reduction operation.
-
-  Therefore, we define a private variable MY_DIFF for each thread.
-  Once they have all computed their values, we use a CRITICAL section
-  to update DIFF.
-*/
-    diff = 0.0;
-# pragma omp parallel shared ( diff, u, w ) private ( i, j, my_diff )
-    {
-      my_diff = 0.0;
-# pragma omp for
-      for ( i = 1; i < M - 1; i++ )
-      {
-        for ( j = 1; j < N - 1; j++ )
-        {
-          if ( my_diff < fabs ( w[i][j] - u[i][j] ) )
-          {
-            my_diff = fabs ( w[i][j] - u[i][j] );
-          }
-        }
-      }
-# pragma omp critical
-      {
-        if ( diff < my_diff )
-        {
-          diff = my_diff;
-        }
-      }
-    }
-
-    iterations++;
-    if ( iterations == iterations_print )
-    {
-      printf ( "  %8d  %f\n", iterations, diff );
-      iterations_print = 2 * iterations_print;
-    }
-  } 
-  wtime = omp_get_wtime ( ) - wtime;
-
-  printf ( "\n" );
-  printf ( "  %8d  %f\n", iterations, diff );
-  printf ( "\n" );
-  printf ( "  Error tolerance achieved.\n" );
-  printf ( "  Wallclock time = %f\n", wtime );
-/*
-  Terminate.
-*/
-  printf ( "\n" );
-  printf ( "HEATED_PLATE_OPENMP:\n" );
-  printf ( "  Normal end of execution.\n" );
-
-  return 0;
-
-# undef M
-# undef N
-}
-
-
-
-
-
-
-int new_main (int N, int M, real eps, int nprocs)
-{
-  double diff;
-  double epsilon = eps;
-  int i;
-  int iterations;
-  int iterations_print;
-  int j;
-  double mean;
-  double my_diff;
-  double wtime;
-  mdarray2 u = allocate_mdarray2(M,N);      //double u[M][N];
-  mdarray2 w = allocate_mdarray2(M,N);      //double w[M][N];
-
-  if (nprocs > 0)  omp_set_num_threads(nprocs);
-
-  printf ( "\n" );
-  printf ( "HEATED_PLATE_OPENMP\n" );
-  printf ( "  C/OpenMP version\n" );
-  printf ( "  A program to solve for the steady state temperature distribution\n" );
-  printf ( "  over a rectangular plate.\n" );
-  printf ( "\n" );
-  printf ( "  Spatial grid of %d by %d points.\n", M, N );
-  printf ( "  The iteration will be repeated until the change is <= %e\n", epsilon ); 
-  printf ( "  Number of processors available = %d\n", omp_get_num_procs ( ) );
-  printf ( "  Number of threads =              %d\n", omp_get_max_threads ( ) );
-  
-  /*
-    Set the boundary values, which don't change. 
-  */
-  mean = 0.0;
-
-  #pragma omp parallel shared ( w ) private ( i, j )
-  {
-    #pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-      w[i][0] = 100.0;
-
-    #pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-      w[i][N-1] = 100.0;
-    
-    #pragma omp for
-    for ( j = 0; j < N; j++ )
-      w[M-1][j] = 100.0;
-    
-    #pragma omp for
-    for ( j = 0; j < N; j++ )
-      w[0][j] = 0.0;
-    /*
-      Average the boundary values, to come up with a reasonable
-      initial value for the interior.
-    */
-    #pragma omp for reduction ( + : mean )
-    for ( i = 1; i < M - 1; i++ )
-      mean = mean + w[i][0] + w[i][N-1];
-    
-    #pragma omp for reduction ( + : mean )
-    for ( j = 0; j < N; j++ )
-      mean = mean + w[M-1][j] + w[0][j];
-  } // pragma omp parallel shared ( w ) private ( i, j )
-  
-  /*
-    OpenMP note:
-    You cannot normalize MEAN inside the parallel region.  It
-    only gets its correct value once you leave the parallel region.
-    So we interrupt the parallel region, set MEAN, and go back in.
-  */
-  mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
-  printf ( "\n" );
-  printf ( "  MEAN = %f\n", mean );
-  /* 
-     Initialize the interior solution to the mean value.
-  */
-  #pragma omp parallel shared ( mean, w ) private ( i, j )
-  {
-    #pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-      for ( j = 1; j < N - 1; j++ )
-        w[i][j] = mean;
-  }
-  /*
-    iterate until the  new solution W differs from the old solution U
-    by no more than EPSILON.
-  */
-  iterations = 0;
-  iterations_print = 1;
-  printf ( "\n" );
-  printf ( " Iteration  Change\n" );
-  printf ( "\n" );
-  wtime = omp_get_wtime ( );
-
-  diff = epsilon;
-
-  while ( epsilon <= diff ) {
-    # pragma omp parallel shared ( u, w ) private ( i, j )
-    {
-    /*
-      Save the old solution in U.
-    */
-    # pragma omp for
-    for ( i = 0; i < M; i++ ) 
-      for ( j = 0; j < N; j++ )
-	u[i][j] = w[i][j];
-    /*
-      Determine the new estimate of the solution at the interior points.
-      The new solution W is the average of north, south, east and west neighbors.
-    */
-    # pragma omp for
-    for ( i = 1; i < M - 1; i++ )
-      for ( j = 1; j < N - 1; j++ )
-	w[i][j] = ( u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1] ) / 4.0;
-    } // pragma
-    
-    /*
-      C and C++ cannot compute a maximum as a reduction operation.
-      
-      Therefore, we define a private variable MY_DIFF for each thread.
-      Once they have all computed their values, we use a CRITICAL section
-      to update DIFF.
-    */
-    diff = 0.0;
-    # pragma omp parallel shared ( diff, u, w ) private ( i, j, my_diff )
-    {
-      my_diff = 0.0;
-      # pragma omp for
-      for ( i = 1; i < M - 1; i++ )
-        for ( j = 1; j < N - 1; j++ )
-          if ( my_diff < fabs ( w[i][j] - u[i][j] ) )
-            my_diff = fabs ( w[i][j] - u[i][j] );
-      
-      # pragma omp critical
-      {
-        if ( diff < my_diff )
-          diff = my_diff;
-      }
-    } // pragma
-
-    iterations++;
-    if ( iterations == iterations_print ) {
-      printf ( "  %8d  %f\n", iterations, diff );
-      iterations_print = 2 * iterations_print;
-    }
-  } // while
-  wtime = omp_get_wtime ( ) - wtime;
-
-  printf ( "\n" );
-  printf ( "  %8d  %f\n", iterations, diff );
-  printf ( "\n" );
-  printf ( "  Error tolerance achieved.\n" );
-  printf ( "  Wallclock time = %f\n", wtime );
-
-  
-  printf ( "\n" );
-  printf ( "HEATED_PLATE_OPENMP:\n" );
-  printf ( "  Normal end of execution.\n" );
-
-  return 0;
-}
-
diff --git a/src/tutor/mp/hellomp.c b/src/tutor/mp/hellomp.c
deleted file mode 100644
index 66d6ec1c4..000000000
--- a/src/tutor/mp/hellomp.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- *   Simple Example OMP usage
- */
-
-#include <omp.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-
-int main (void)
-{
-  int nthreads, my_id;
-  int i, imax = 1e8,j,jmax;
-  double x=0;
-
-#ifdef _OPENMP
-  printf("Compiled by an OpenMP-compliant implementation.\n");
-
-  #pragma omp parallel  shared(x)
-  { 
-    nthreads = omp_get_num_threads();
-    my_id = omp_get_thread_num();
-    for (i=0;i<imax/nthreads;i++)
-      x += cos(i*x);
-  }
-  printf("omp: making %d cosines to calculate x= %3.3e took.. \n",imax,x);
-
-#else
-  printf("Normal compilation\n");
-    
-  for (i=0;i<imax;i++)
-    x += cos(i*x);
-  printf("cpu1: making %d cosines to calculate x= %3.3e took.. \n",imax,x);
-#endif
-    
- 
-}
diff --git a/src/tutor/mp/poisson_mpi.c b/src/tutor/mp/poisson_mpi.c
deleted file mode 100644
index 4cd5b6b3a..000000000
--- a/src/tutor/mp/poisson_mpi.c
+++ /dev/null
@@ -1,564 +0,0 @@
-# include <math.h>
-# include <mpi.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-# include <time.h>
-
-double L = 1.0;			/* linear size of square region */
-int N = 32;			/* number of interior points per dim */
-
-double *u, *u_new;		/* linear arrays to hold solution */
-
-/* macro to index into a 2-D (N+2)x(N+2) array */
-#define INDEX(i,j) ((N+2)*(i)+(j))
-
-int my_rank;			/* rank of this process */
-
-int *proc;			/* process indexed by vertex */
-int *i_min, *i_max;		/* min, max vertex indices of processes */
-int *left_proc, *right_proc;	/* processes to left and right */
-
-/*
-  Functions:
-*/
-int main ( int argc, char *argv[] );
-void allocate_arrays ( );
-void jacobi ( int num_procs, double f[] );
-void make_domains ( int num_procs );
-double *make_source ( );
-void timestamp ( );
-
-/******************************************************************************/
-
-int main ( int argc, char *argv[] ) 
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAIN is the main program for POISSON_MPI.
-
-  Discussion:
-
-    This program solves Poisson's equation in a 2D region.
-
-    The Jacobi iterative method is used to solve the linear system.
-
-    MPI is used for parallel execution, with the domain divided
-    into strips.
-
-  Modified:
-
-    22 September 2013
-
-  Local parameters:
-
-    Local, double F[(N+2)x(N+2)], the source term.
-
-    Local, int N, the number of interior vertices in one dimension.
-
-    Local, int NUM_PROCS, the number of MPI processes.
-
-    Local, double U[(N+2)*(N+2)], a solution estimate.
-
-    Local, double U_NEW[(N+2)*(N+2)], a solution estimate.
-*/
-{
-  double change;
-  double epsilon = 1.0E-03;
-  double *f;
-  char file_name[100];
-  int i;
-  int j;
-  double my_change;
-  int my_n;
-  int n;
-  int num_procs;
-  int step;
-  double *swap;
-  double wall_time;
-/*
-  MPI initialization.
-*/
-  MPI_Init ( &argc, &argv );
-
-  MPI_Comm_size ( MPI_COMM_WORLD, &num_procs );
-
-  MPI_Comm_rank ( MPI_COMM_WORLD, &my_rank );
-/*
-  Read commandline arguments, if present.
-*/
-  if ( 1 < argc )
-  {
-    sscanf ( argv[1], "%d", &N );
-  }
-  else
-  {
-    N = 32;
-  }
-
-  if ( 2 < argc )
-  {
-    sscanf ( argv[2], "%lf", &epsilon );
-  }
-  else
-  {
-    epsilon = 1.0E-03;
-  }
-  if ( 3 < argc )
-  {
-    strcpy ( file_name, argv[3] );
-  }
-  else
-  {
-    strcpy ( file_name, "poisson_mpi.out" );
-  }
-/*
-  Print out initial information.
-*/
-  if ( my_rank == 0 ) 
-  {
-    timestamp ( );
-    printf ( "\n" );
-    printf ( "POISSON_MPI:\n" );
-    printf ( "  C version (NEMO)\n" );
-    printf ( "  2-D Poisson equation using Jacobi algorithm\n" );
-    printf ( "  ===========================================\n" );
-    printf ( "  MPI version: 1-D domains, non-blocking send/receive\n" );
-    printf ( "  Number of processes         = %d\n", num_procs );
-    printf ( "  Number of interior vertices = %d\n", N );
-    printf ( "  Desired fractional accuracy = %f\n", epsilon );
-    printf ( "\n" );
-  }
-
-  allocate_arrays ( );
-  f = make_source ( );
-  make_domains ( num_procs );
-
-  step = 0;
-/*
-  Begin timing.
-*/
-  wall_time = MPI_Wtime ( );
-/*
-  Begin iteration.
-*/
-  do 
-  {
-    jacobi ( num_procs, f );
-    ++step;
-/* 
-  Estimate the error 
-*/
-    change = 0.0;
-    n = 0;
-
-    my_change = 0.0;
-    my_n = 0;
-
-    for ( i = i_min[my_rank]; i <= i_max[my_rank]; i++ )
-    {
-      for ( j = 1; j <= N; j++ )
-      {
-        if ( u_new[INDEX(i,j)] != 0.0 ) 
-        {
-          my_change = my_change 
-            + fabs ( 1.0 - u[INDEX(i,j)] / u_new[INDEX(i,j)] );
-
-          my_n = my_n + 1;
-        }
-      }
-    }
-    MPI_Allreduce ( &my_change, &change, 1, MPI_DOUBLE, MPI_SUM,
-      MPI_COMM_WORLD );
-
-    MPI_Allreduce ( &my_n, &n, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
-
-    if ( n != 0 )
-    {
-      change = change / n;
-    }
-    if ( my_rank == 0 && ( step % 10 ) == 0 ) 
-    {
-      printf ( "  N = %d, n = %d, my_n = %d, Step %4d  Error = %g\n", 
-        N, n, my_n, step, change );
-    }
-/* 
-  Interchange U and U_NEW.
-*/
-    swap = u;
-    u = u_new;
-    u_new = swap;
-  } while ( epsilon < change );
-
-/* 
-  Here is where you can copy the solution to process 0 
-  and print to a file.
-*/
-
-/*
-  Report on wallclock time.
-*/
-  wall_time = MPI_Wtime() - wall_time;
-  if ( my_rank == 0 )
-  {
-    printf ( "\n" );
-    printf ( "  Wall clock time = %f secs\n", wall_time );
-  }
-/*
-  Terminate MPI.
-*/
-  MPI_Finalize ( );
-/*
-  Free memory.
-*/
-  free ( f );
-/*
-  Terminate.
-*/
-  if ( my_rank == 0 )
-  {
-    printf ( "\n" );
-    printf ( "POISSON_MPI:\n" );
-    printf ( "  Number of processes         = %d\n", num_procs );
-    printf ( "  Number of interior vertices = %d\n", N );
-    printf ( "  Normal end of execution.\n" );
-    printf ( "\n" );
-    timestamp ( );
-  }
- 
-  return 0;
-}
-/******************************************************************************/
-
-void allocate_arrays ( ) 
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    ALLOCATE_ARRAYS creates and zeros out the arrays U and U_NEW.
-
-  Modified:
-
-    10 September 2013
-*/
-{
-  int i;
-  int ndof;
-
-  ndof = ( N + 2 ) * ( N + 2 );
-
-  u = ( double * ) malloc ( ndof * sizeof ( double ) );
-  for ( i = 0; i < ndof; i++)
-  {
-    u[i] = 0.0;
-  }
-
-  u_new = ( double * ) malloc ( ndof * sizeof ( double ) );
-  for ( i = 0; i < ndof; i++ )
-  {
-    u_new[i] = 0.0;
-  }
-
-  return;
-}
-/******************************************************************************/
-
-void jacobi ( int num_procs, double f[] ) 
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    JACOBI carries out the Jacobi iteration for the linear system.
-
-  Modified:
-
-    16 September 2013
-
-  Parameters:
-
-    Input, int NUM_PROCS, the number of processes.
-
-    Input, double F[(N+2)*(N+2)], the right hand side of the linear system.
-*/
-{
-  double h;
-  int i;
-  int j;
-  MPI_Request request[4];
-  int requests;
-  MPI_Status status[4];
-/*
-  H is the lattice spacing.
-*/
-  h = L / ( double ) ( N + 1 );
-/* 
-  Update ghost layers using non-blocking send/receive 
-*/
-  requests = 0;
-
-  if ( left_proc[my_rank] >= 0 && left_proc[my_rank] < num_procs ) 
-  {
-    MPI_Irecv ( u + INDEX(i_min[my_rank] - 1, 1), N, MPI_DOUBLE,
-      left_proc[my_rank], 0, MPI_COMM_WORLD,
-      request + requests++ );
-
-    MPI_Isend ( u + INDEX(i_min[my_rank], 1), N, MPI_DOUBLE,
-      left_proc[my_rank], 1, MPI_COMM_WORLD,
-      request + requests++ );
-  }
-
-  if ( right_proc[my_rank] >= 0 && right_proc[my_rank] < num_procs ) 
-  {
-    MPI_Irecv ( u + INDEX(i_max[my_rank] + 1, 1), N, MPI_DOUBLE,
-      right_proc[my_rank], 1, MPI_COMM_WORLD,
-      request + requests++ );
-
-    MPI_Isend ( u + INDEX(i_max[my_rank], 1), N, MPI_DOUBLE,
-      right_proc[my_rank], 0, MPI_COMM_WORLD,
-      request + requests++ );
-  }
-/* 
-  Jacobi update for internal vertices in my domain.
-*/
-  for ( i = i_min[my_rank] + 1; i <= i_max[my_rank] - 1; i++ )
-  {
-    for ( j = 1; j <= N; j++ )
-    {
-      u_new[INDEX(i,j)] =
-        0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] +
-                 u[INDEX(i,j-1)] + u[INDEX(i,j+1)] +
-                 h * h * f[INDEX(i,j)] );
-    }
-  }
-/* 
-  Wait for all non-blocking communications to complete.
-*/
-  MPI_Waitall ( requests, request, status );
-/* 
-  Jacobi update for boundary vertices in my domain.
-*/
-  i = i_min[my_rank];
-  for ( j = 1; j <= N; j++ )
-  {
-    u_new[INDEX(i,j)] =
-      0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] +
-               u[INDEX(i,j-1)] + u[INDEX(i,j+1)] +
-               h * h * f[INDEX(i,j)] );
-  }
-
-  i = i_max[my_rank];
-  if (i != i_min[my_rank])
-  {
-    for (j = 1; j <= N; j++)
-    {
-      u_new[INDEX(i,j)] =
-        0.25 * ( u[INDEX(i-1,j)] + u[INDEX(i+1,j)] +
-                 u[INDEX(i,j-1)] + u[INDEX(i,j+1)] +
-                 h * h * f[INDEX(i,j)] );
-    }
-  }
-
-  return;
-}
-/******************************************************************************/
-
-void make_domains ( int num_procs ) 
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAKE_DOMAINS sets up the information defining the process domains.
-
-  Modified:
-
-    10 September 2013
-
-  Parameters:
-
-    Input, int NUM_PROCS, the number of processes.
-*/
-{
-  double d;
-  double eps;
-  int i;
-  int p;
-  double x_max;
-  double x_min;
-/* 
-  Allocate arrays for process information.
-*/
-  proc = ( int * ) malloc ( ( N + 2 ) * sizeof ( int ) );
-  i_min = ( int * ) malloc ( num_procs * sizeof ( int ) );
-  i_max = ( int * ) malloc ( num_procs * sizeof ( int ) );
-  left_proc = ( int * ) malloc ( num_procs * sizeof ( int ) );
-  right_proc = ( int * ) malloc ( num_procs * sizeof ( int ) );
-/* 
-  Divide the range [(1-eps)..(N+eps)] evenly among the processes.
-*/
-  eps = 0.0001;
-  d = ( N - 1.0 + 2.0 * eps ) / ( double ) num_procs;
-
-  for ( p = 0; p < num_procs; p++ )
-  {
-/* 
-  The I indices assigned to domain P will satisfy X_MIN <= I <= X_MAX.
-*/
-    x_min = - eps + 1.0 + ( double ) ( p * d );
-    x_max = x_min + d;
-/* 
-  For the node with index I, store in PROC[I] the process P it belongs to.
-*/
-    for ( i = 1; i <= N; i++ )
-    {
-      if ( x_min <= i && i < x_max )
-      {
-        proc[i] = p;
-      }
-    }
-  }
-/* 
-  Now find the lowest index I associated with each process P.
-*/
-  for ( p = 0; p < num_procs; p++ )
-  {
-    for ( i = 1; i <= N; i++ )
-    {
-      if ( proc[i] == p )
-      {
-        break;
-      }
-    }
-    i_min[p] = i;
-/* 
-  Find the largest index associated with each process P.
-*/
-    for ( i = N; 1 <= i; i-- )
-    {
-      if ( proc[i] == p )
-      {
-        break;
-      }
-    }
-    i_max[p] = i;
-/* 
-  Find the processes to left and right. 
-*/
-    left_proc[p] = -1;
-    right_proc[p] = -1;
-
-    if ( proc[p] != -1 ) 
-    {
-      if ( 1 < i_min[p] && i_min[p] <= N )
-      {
-        left_proc[p] = proc[i_min[p] - 1];
-      }
-      if ( 0 < i_max[p] && i_max[p] < N )
-      {
-        right_proc[p] = proc[i_max[p] + 1];
-      }
-    }
-  }
-
-  return;
-}
-/******************************************************************************/
-
-double *make_source ( ) 
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAKE_SOURCE sets up the source term for the Poisson equation.
-
-  Modified:
-
-    16 September 2013
-
-  Parameters:
-
-    Output, double *MAKE_SOURCE, a pointer to the (N+2)*(N+2) source term
-    array.
-*/
-{
-  double *f;
-  int i;
-  int j;
-  int k;
-  double q;
-
-  f = ( double * ) malloc ( ( N + 2 ) * ( N + 2 ) * sizeof ( double ) );
-
-  for ( i = 0; i < ( N + 2 ) * ( N + 2 ); i++ )
-  {
-    f[i] = 0.0;
-  }
-/* 
-  Make a dipole.
-*/
-  q = 10.0;
-
-  i = 1 + N / 4;
-  j = i;
-  k = INDEX ( i, j );
-  f[k] = q;
-
-  i = 1 + 3 * N / 4;
-  j = i;
-  k = INDEX ( i, j );
-  f[k] = -q;
-
-  return f;
-}
-/******************************************************************************/
-
-void timestamp ( )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TIMESTAMP prints the current YMDHMS date as a time stamp.
-
-  Example:
-
-    31 May 2001 09:45:54 AM
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    24 September 2003
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    None
-*/
-{
-# define TIME_SIZE 40
-
-  static char time_buffer[TIME_SIZE];
-  const struct tm *tm;
-  time_t now;
-
-  now = time ( NULL );
-  tm = localtime ( &now );
-
-  strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm );
-
-  printf ( "%s\n", time_buffer );
-
-  return;
-# undef TIME_SIZE
-}
diff --git a/src/tutor/mp/python_openmp.py b/src/tutor/mp/python_openmp.py
deleted file mode 100755
index f4fc8fdd3..000000000
--- a/src/tutor/mp/python_openmp.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env python
-#
-#  Taken from:   https://scicomp.aalto.fi/triton/examples/python/python_openmp/python_openmp/
-#  E.g.
-#  export OMP_PROC_BIND=true
-#  export OMP_NUM_THREADS=4
-
-import os
-from time import time
-import numpy as np
-
-print('SLURM_CPUS_PER_TASK: Using %d processors' % int(os.getenv('SLURM_CPUS_PER_TASK',1)))
-
-nrounds = 5
-n = 2000
-
-t_start = time()
-
-for i in range(nrounds):
-    a = np.random.random([n,n])
-    a = a + a.T
-    b = np.linalg.pinv(a)
-
-t_delta = time() - t_start
-
-print('Seconds taken to invert %d symmetric %dx%d matrices: %f' % (nrounds, n, n, t_delta))
diff --git a/src/tutor/mp/quad_openmp.c b/src/tutor/mp/quad_openmp.c
deleted file mode 100644
index 523aa067d..000000000
--- a/src/tutor/mp/quad_openmp.c
+++ /dev/null
@@ -1,208 +0,0 @@
-# include <math.h>
-# include <stdlib.h>
-# include <stdio.h>
-# include <time.h>
-# include <omp.h>
-
-int main ( int argc, char *argv[] );
-double f ( double x );
-double cpu_time ( );
-void timestamp ( );
-
-/******************************************************************************/
-
-int main ( int argc, char *argv[] )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAIN is the main program for QUAD_OPENMP.
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    14 December 2011
-
-  Author:
-
-    John Burkardt
-*/
-{
-  double a = 0.0;
-  double b = 10.0;
-  double error;
-  double exact = 0.49936338107645674464;
-  int i;
-//  int n = 10000000;
-  int n = 1000000000;
-  double total;
-  double wtime;
-  double x;
-
-  timestamp ( );
-  printf ( "\n" );
-  printf ( "QUAD_OPENMP:\n" );
-  printf ( "  C version\n" );
-  printf ( "  Use OpenMP for parallel execution.\n" );
-  printf ( "  Estimate the integral of f(x) from A to B.\n" );
-  printf ( "  f(x) = 50 / ( pi * ( 2500 * x * x + 1 ) ).\n" );
-  printf ( "\n" );
-  printf ( "  A        = %f\n", a );
-  printf ( "  B        = %f\n", b );
-  printf ( "  N        = %d\n", n );
-  printf ( "  Exact    = %24.16f\n", exact );
-
-  wtime = omp_get_wtime ( );
-
-  total = 0.0;
-
-# pragma omp parallel shared ( a, b, n ) private ( i, x ) 
-
-# pragma omp for reduction ( + : total )
-
-  for ( i = 0; i < n; i++ )
-  {
-    x = ( ( double ) ( n - i - 1 ) * a + ( double ) ( i ) * b ) / ( double ) ( n - 1 );
-    total = total + f ( x );
-  }
-
-  wtime = omp_get_wtime ( ) - wtime;
-
-  total = ( b - a ) * total / ( double ) n;
-  error = fabs ( total - exact );
-
-  printf ( "\n" );
-  printf ( "  Estimate = %24.16f\n", total );
-  printf ( "  Error    = %e\n", error );
-  printf ( "  Time     = %f\n", wtime );
-/*
-  Terminate.
-*/
-  printf ( "\n" );
-  printf ( "QUAD_OPENMP:\n" );
-  printf ( "  Normal end of execution.\n" );
-  printf ( "\n" );
-  timestamp ( );
-
-  return 0;
-}
-/*******************************************************************************/
-
-double f ( double x )
-
-/*******************************************************************************/
-/*
-  Purpose:
- 
-    F evaluates the function.
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    18 July 2010
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    Input, double X, the argument.
-
-    Output, double F, the value of the function.
-*/
-{
-  double r8_pi = 3.141592653589793;
-  double value;
-
-  value = 50.0 / ( r8_pi * ( 2500.0 * x * x + 1.0 ) );
-
-  return value;
-}
-/*******************************************************************************/
-
-double cpu_time ( )
-
-/*******************************************************************************/
-/*
-  Purpose:
- 
-    CPU_TIME reports the total CPU time for a program.
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    27 September 2005
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    Output, double CPU_TIME, the current total elapsed CPU time in second.
-*/
-{
-  double value;
-
-  value = ( double ) clock ( ) / ( double ) CLOCKS_PER_SEC;
-
-  return value;
-}
-/******************************************************************************/
-
-void timestamp ( )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TIMESTAMP prints the current YMDHMS date as a time stamp.
-
-  Example:
-
-    31 May 2001 09:45:54 AM
-
-  Licensing:
-
-    This code is distributed under the GNU LGPL license. 
-
-  Modified:
-
-    24 September 2003
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    None
-*/
-{
-# define TIME_SIZE 40
-
-  static char time_buffer[TIME_SIZE];
-  const struct tm *tm;
-  time_t now;
-
-  now = time ( NULL );
-  tm = localtime ( &now );
-
-  strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm );
-
-  printf ( "%s\n", time_buffer );
-
-  return;
-# undef TIME_SIZE
-}
diff --git a/src/tutor/mp/scaling.c b/src/tutor/mp/scaling.c
deleted file mode 100644
index afbd2b655..000000000
--- a/src/tutor/mp/scaling.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// Compile with: gcc scaling.c -std=c99 -fopenmp -O3                                                                                               
-// See also discussion on:
-//    https://stackoverflow.com/questions/19780554/what-limits-scaling-in-this-simple-openmp-program
-
-#include <stdio.h>
-#include <stdint.h>
-#include <math.h>
-
-int main(){
-
-  const uint64_t umin=1;
-  const uint64_t umax=4000000000LL;    //   4->5 already causes overflow
-  double sum=0.;
-#pragma omp parallel for reduction(+:sum)
-  for(uint64_t u=umin; u<umax; u++) {
-    sum+=1./(u*u);       //    10.0"  1./u/u takes about 2x longer on intel
-    //sum+=1./u/u;         //  15.6"  
-    //sum+=1/pow(u,2.0);   // 8.6"  compiler optimizes this out!!!
-    //sum+=1/pow(u,1.33);  // 15.3"
-    // sum+=(1/u)*(1/u);   //  18.5"
-  }
-  printf("%e\n", sum);
-
-}
diff --git a/src/tutor/mp/scaling2.c b/src/tutor/mp/scaling2.c
deleted file mode 100644
index 90af6ba02..000000000
--- a/src/tutor/mp/scaling2.c
+++ /dev/null
@@ -1,68 +0,0 @@
-
-// See scaling.c where the original program is kept
-
-//  @todo   negotiate between using OMP_NUM_THREADS and np=
-//
-//   Processor affinity can affect performance?
-//   
-//   export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7"
-//   /usr/bin/time scaling2 1 200000 np=4
-//   75.01user 0.00system 0:18.81elapsed 398%CPU
-//   75.03user 0.00system 0:18.81elapsed 398%CPU
-//   75.03user 0.00system 0:18.81elapsed 398%CPU 
-//   unset GOMP_CPU_AFFINITY
-//   75.69user 0.02system 0:19.04elapsed 397%CPU
-//   75.51user 0.00system 0:18.95elapsed 398%CPU
-//   75.65user 0.03system 0:19.01elapsed 397%CPU 
-
-
-#include <nemo.h>
-#include <stdint.h>
-
-string defv[] = {
-    "umin=1\n             Starting value",
-    "umax=10000\n         sqrt of Ending value",   
-    "umax2=0\n            Non-parallel loop, again sqrt of Ending value",
-    "iter=1\n             How many times to iterate and report timing",
-    "VERSION=1.4\n        17-sep-2023 PJT",
-     NULL,
-};
-
-string usage="NEMO version of the well scaled OpenMP scaling program";
-		 
-
-void nemo_main(void)
-{
-  int umin4 = getiparam("umin");  // NEMO doesn't have a reliable longlong
-  int umax4 = getiparam("umax");
-  int umax2 = getiparam("umax2");
-  int niter = getiparam("iter");
-  uint64_t umin = (uint64_t) umin4 * (uint64_t) umin4;
-  uint64_t umax = (uint64_t) umax4 * (uint64_t) umax4;
-  double t0, t1, t2 = 0.0;
-  bool Qshow = niter == 1;
-  extern int np_openmp;  // this is a cheat; see getparam.c
-
-  dprintf(0,"omp_get_num_procs() -> %d\n",np_openmp);
-  dprintf(0,"scaling2: umin=%ld umax=%ld\n",umin,umax);
-  
-  while (niter--) {
-    double sum=0.0;
-#pragma omp parallel for reduction(+:sum)
-    for(uint64_t u=umin; u<umax; u++) 
-      sum+=1./(u*u);   //  1./u/u takes about 2x longer on intel
-      //sum+=1./u/u;       //  1./u/u takes about 2x longer on intel but doesn't underflow
-    if (Qshow) printf("sum=%g\n", sum);
-    if (umax2 > 0) {
-      sum=0.0;
-      umax = (uint64_t) umax2 * (uint64_t) umax2;
-      for(uint64_t u=umin; u<umax; u++)
-	sum+=1./(u*u); 
-      if (Qshow) printf("sum2=%g\n", sum);	
-    }
-    t0 = 60*cputime2(0);
-    t1 = 60*cputime2(2);
-    dprintf(0,"cputime: %d %g %g %g sec\n", niter+1, t0, t1, t1-t2);
-    t2 = t1;
-  } // niter
-}
diff --git a/src/tutor/mp/sections.c b/src/tutor/mp/sections.c
deleted file mode 100644
index d902a2d46..000000000
--- a/src/tutor/mp/sections.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * sections:    possibly the easiest to OMP ?   careful about
-                the work call. they might need to mutex
- */
-
-#include <nemo.h>
-
-string defv[] = {
-  "n=1000000\n      array size",
-  "iter=10\n        how many times to iterate each block",
-  "seed=123\n       seed for xrandom",
-  "mode=0\n         Different openmp experiments",
-  "VERSION=0.3\n    14-feb-2021 PJT",
-  NULL,
-};
-
-string usage="benchmark openmp overhead of starting a test via sections";
-
-string cvsid="$Id:$";
-
-/*
- benchmark program:
-     /usr/bin/time sections mode=3 iter=0 n=100000000
-
-n * iter scales the CPU, but for small n you can see the caching effect.
-Small:   0.40 Gop
-Large:   1.20
-Elarge   2.53   (memory starts to fill up?    1 Gx=28"
-
-lma:    3.40 n=100M iter=10     (1G-xrandom = 21")
-        1.00 n=1000 iter=1M
-*/
-  
-
-void work1(int n, double *x, int iter);
-void work2(int n, double *x, int iter);
-void work1r(int n, double *x, int iter);
-void work2r(int n, double *x, int iter);
-void work3(int n, double *x, double *y, double *z, int iter);
-
-void nemo_main()
-{
-  int n = getiparam("n");
-  int seed = init_xrandom(getparam("seed"));
-  int iter1 = getiparam("iter");
-  int iter2 = iter1;
-  int i;
-  int n1 = n;
-  int n2 = n;
-  int n3 = n;
-  int mode = getiparam("mode");
-  real t0, t1;
-  real *x1 = (real *) allocate(n1*sizeof(double));
-  real *x2 = (real *) allocate(n2*sizeof(double));
-  real *x3 = (real *) allocate(n3*sizeof(double));
-
-  dprintf(0,"n=%d iter=%d\n",n,iter1);
-  // cannot omp this for-loop: xrandom has no mutex
-  for (i=0; i<n1; ++i) {
-    x1[i] = xrandom(0.0,1.0);
-    x2[i] = xrandom(0.0,1.0);
-  }
-
-  
-  t0 = cputime();
-  if (mode==0) {
-    dprintf(0,"mode=0: vanilla sections\n");    
-    #pragma omp parallel sections
-    {
-      work1(n1,x1,iter1);
-      #pragma omp section
-      work2(n2,x2,iter2);
-    }
-  } else if (mode == 1) {
-    dprintf(0,"mode=1: just work1r\n");
-    work1r(n1,x1,iter1);
-  } else if (mode == 2) {
-    dprintf(0,"mode=2: reduc on work1r and work2r\n");
-    #pragma omp parallel sections
-    {
-      work1r(n1,x1,iter1);
-      #pragma omp section
-      work2r(n2,x2,iter2);
-    }
-  } else if (mode == 3) {
-    work3(n1,x1,x2,x3,iter1);    
-  } else
-    error("mode=%d not implemented yet",mode);
-  t1 = cputime();
-  dprintf(0,"CPU time=%f  %f\n",(t1-t0)*60, t0*60);
-}
-
-void work3(int n, real *x, real *y, real *z, int iter)
-{
-  printf("work3 %d\n",iter);
-  int i;
-  real sum = 0.0;
-
-  while (iter--)
-    for (i=0; i<n; ++i) {
-      z[i] = x[i] * y[i];
-      sum += z[i];
-    }
-  printf("sum=%g\n",sum);
-}
-
-void work1(int n, real *x, int iter)
-{
-  printf("work1 %d\n",iter);
-  int i;
-  real sum = 0.0;
-
-  while (iter--)
-    for (i=0; i<n; ++i)
-      sum += x[i];
-  printf("sum1=%g\n",sum);
-}
-
-void work2(int n, real *x, int iter)
-{
-  printf("work2 %d\n",iter);
-  int i;
-  real sum = 0.0;
-
-  while (iter--)
-    for (i=0; i<n; ++i)
-      sum += x[i];      
-  printf("sum2=%g\n",sum);
-}
-
-void work1r(int n, real *x, int iter)
-{
-  printf("work1 %d\n",iter);
-  int i;
-  real sum = 0.0;
-
-  while (iter--)
-    #pragma omp parallel shared(n, x, sum) private(i)
-    #pragma omp for reduction(+:sum)   
-    for (i=0; i<n; ++i)
-      sum += x[i];
-  printf("sum1=%g\n",sum);
-}
-
-void work2r(int n, real *x, int iter)
-{
-  printf("work2 %d\n",iter);
-  int i;
-  real sum = 0.0;
-
-  while (iter--)
-    #pragma omp parallel shared(n, x, sum) private(i)
-    #pragma omp for reduction(+:sum)   
-    for (i=0; i<n; ++i)
-      sum += x[i];      
-  printf("sum2=%g\n",sum);
-}
diff --git a/src/tutor/mp/sgefa_openmp.c b/src/tutor/mp/sgefa_openmp.c
deleted file mode 100644
index ccd9c05ee..000000000
--- a/src/tutor/mp/sgefa_openmp.c
+++ /dev/null
@@ -1,1519 +0,0 @@
-# include <stdlib.h>
-# include <stdio.h>
-# include <math.h>
-# include <omp.h>
-# include <time.h>
-
-int main ( void );
-void test01 ( int n );
-void test02 ( int n );
-void test03 ( int n );
-
-int isamax ( int n, float x[], int incx );
-void matgen ( int lda, int n, float a[], float x[], float b[] );
-void msaxpy ( int nr, int nc, float a[], int n, float x[], float y[] );
-void msaxpy2 ( int nr, int nc, float a[], int n, float x[], float y[] );
-int msgefa ( float a[], int lda, int n, int ipvt[] );
-int msgefa2 ( float a[], int lda, int n, int ipvt[] );
-void saxpy ( int n, float a, float x[], int incx, float y[], int incy );
-float sdot ( int n, float x[], int incx, float y[], int incy );
-int sgefa ( float a[], int lda, int n, int ipvt[] );
-void sgesl ( float a[], int lda, int n, int ipvt[], float b[], int job );
-void sscal ( int n, float a, float x[], int incx );
-void sswap ( int n, float x[], int incx, float y[], int incy );
-void timestamp ( );
-
-/******************************************************************************/
-
-int main ( void )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MAIN is the main program for the SGEFA_OPENMP test program.
-
-  Discussion:
-
-    We want to compare methods of solving the linear system A*x=b.
-
-    The first way uses the standard sequential algorithm "SGEFA".
-
-    The second way uses a variant of SGEFA that has been modified to
-    take advantage of OpenMP.
-
-    The third way reruns the variant code, but with OpenMP turned off.
-
-  Modified:
-
-    17 April 2009
-
-  Author:
-
-    John Burkardt
-*/
-{
-  int n;
-
-  timestamp ( );
-
-  printf ( "\n" );
-  printf ( "SGEFA_OPENMP\n" );
-  printf ( "  C + OpenMP version\n" );
-
-  printf ( "\n" );
-  printf ( "  Number of processors available = %d\n", omp_get_num_procs ( ) );
-  printf ( "  Number of threads =              %d\n", omp_get_max_threads ( ) );
-
-  printf ( "\n" );
-  printf ( " Algorithm        Mode          N    Error       Time\n" );
-
-  printf ( "\n" );
-  n = 10;
-  test01 ( n );
-  test02 ( n );
-  test03 ( n );
-
-  printf ( "\n" );
-  n = 100;
-  test01 ( n );
-  test02 ( n );
-  test03 ( n );
-
-  printf ( "\n" );
-  n = 1000;
-  test01 ( n );
-  test02 ( n );
-  test03 ( n );
-
-  printf ( "\n" );
-  printf ( "SGEFA_OPENMP\n" );
-  printf ( "  Normal end of execution.\n" );
-
-  printf ( "\n" );
-  timestamp ( );
-
-  return 0;
-}
-/******************************************************************************/
-
-void test01 ( int n )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TEST01 runs the sequential version of SGEFA.
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    John Burkardt
-*/
-{
-  float *a;
-  float *b;
-  float err;
-  int i;
-  int info;
-  int *ipvt;
-  int job;
-  int lda;
-  double wtime;
-  float *x;
-/*
-  Generate the linear system A * x = b.
-*/
-  lda = n;
-  a = ( float * ) malloc ( lda * n * sizeof ( float ) );
-  b = ( float * ) malloc ( n * sizeof ( float ) );
-  x = ( float * ) malloc ( n * sizeof ( float ) );
-
-  matgen ( lda, n, a, x, b );
-/*
-  Factor the linear system.
-*/
-  ipvt = ( int * ) malloc ( n * sizeof ( int ) );
-
-  wtime = omp_get_wtime ( );
-  info = sgefa ( a, lda, n, ipvt );
-  wtime = omp_get_wtime ( ) - wtime;
-
-  if ( info != 0 )
-  {
-    printf ( "\n" );
-    printf ( "TEST01 - Fatal error!\n" );
-    printf ( "  SGEFA reports the matrix is singular.\n" );
-    exit ( 1 );
-  }
-/*
-  Solve the linear system.
-*/
-  job = 0;
-  sgesl ( a, lda, n, ipvt, b, job );
-
-  err = 0.0;
-  for ( i = 0; i < n; i++ )
-  {
-    err = err + fabs ( x[i] - b[i] );
-  }
-  printf ( "  Original  Sequential   %8d  %10.4e  %10.4e\n", n, err, wtime );
-
-  free ( a );
-  free ( b );
-  free ( ipvt );
-  free ( x );
-
-  return;
-}
-/******************************************************************************/
-
-void test02 ( int n )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TEST02 runs the revised version of SGEFA in parallel.
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    John Burkardt
-*/
-{
-  float *a;
-  float *b;
-  float err;
-  int i;
-  int info;
-  int *ipvt;
-  int job;
-  int lda;
-  double wtime;
-  float *x;
-/*
-  Generate the linear system A * x = b.
-*/
-  lda = n;
-  a = ( float * ) malloc ( lda * n * sizeof ( float ) );
-  b = ( float * ) malloc ( n * sizeof ( float ) );
-  x = ( float * ) malloc ( n * sizeof ( float ) );
-
-  matgen ( lda, n, a, x, b );
-/*
-  Factor the linear system.
-*/
-  ipvt = ( int * ) malloc ( n * sizeof ( int ) );
-
-  wtime = omp_get_wtime ( );
-  info = msgefa ( a, lda, n, ipvt );
-  wtime = omp_get_wtime ( ) - wtime;
-
-  if ( info != 0 )
-  {
-    printf ( "\n" );
-    printf ( "TEST02 - Fatal error!\n" );
-    printf ( "  MSGEFA reports the matrix is singular.\n" );
-    exit ( 1 );
-  }
-/*
-  Solve the linear system.
-*/
-  job = 0;
-  sgesl ( a, lda, n, ipvt, b, job );
-
-  err = 0.0;
-  for ( i = 0; i < n; i++ )
-  {
-    err = err + fabs ( x[i] - b[i] );
-  }
-
-  printf ( "  Revised     Parallel   %8d  %10.4e  %10.4e\n", n, err, wtime );
-
-  free ( a );
-  free ( b );
-  free ( ipvt );
-  free ( x );
-
-  return;
-}
-/******************************************************************************/
-
-void test03 ( int n )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TEST03 runs the revised version of SGEFA in sequential mode.
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    John Burkardt
-*/
-{
-  float *a;
-  float *b;
-  float err;
-  int i;
-  int info;
-  int *ipvt;
-  int job;
-  int lda;
-  double wtime;
-  float *x;
-/*
-  Generate the linear system A * x = b.
-*/
-  lda = n;
-  a = ( float * ) malloc ( lda * n * sizeof ( float ) );
-  b = ( float * ) malloc ( n * sizeof ( float ) );
-  x = ( float * ) malloc ( n * sizeof ( float ) );
-
-  matgen ( lda, n, a, x, b );
-/*
-  Factor the linear system.
-*/
-  ipvt = ( int * ) malloc ( n * sizeof ( int ) );
-
-  wtime = omp_get_wtime ( );
-  info = msgefa2 ( a, lda, n, ipvt );
-  wtime = omp_get_wtime ( ) - wtime;
-
-  if ( info != 0 )
-  {
-    printf ( "\n" );
-    printf ( "TEST03 - Fatal error!\n" );
-    printf ( "  MSGEFA2 reports the matrix is singular.\n" );
-    exit ( 1 );
-  }
-/*
-  Solve the linear system.
-*/
-  job = 0;
-  sgesl ( a, lda, n, ipvt, b, job );
-
-  err = 0.0;
-  for ( i = 0; i < n; i++ )
-  {
-    err = err + fabs ( x[i] - b[i] );
-  }
-
-  printf ( "  Revised   Sequential   %8d  %10.4e  %10.4e\n", n, err, wtime );
-
-  free ( a );
-  free ( b );
-  free ( ipvt );
-  free ( x );
-
-  return;
-}
-/******************************************************************************/
-
-int isamax ( int n, float x[], int incx )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    ISAMAX finds the index of the vector element of maximum absolute value.
-
-  Discussion:
-
-    WARNING: This index is a 1-based index, not a 0-based index!
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    FORTRAN77 original version by Lawson, Hanson, Kincaid, Krogh.
-    C version by John Burkardt
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-    Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-    Algorithm 539: 
-    Basic Linear Algebra Subprograms for Fortran Usage,
-    ACM Transactions on Mathematical Software,
-    Volume 5, Number 3, September 1979, pages 308-323.
-
-  Parameters:
-
-    Input, int N, the number of entries in the vector.
-
-    Input, float X[*], the vector to be examined.
-
-    Input, int INCX, the increment between successive entries of SX.
-
-    Output, int ISAMAX, the index of the element of maximum
-    absolute value.
-*/
-{
-  float xmax;
-  int i;
-  int ix;
-  int value;
-
-  value = 0;
-
-  if ( n < 1 || incx <= 0 )
-  {
-    return value;
-  }
-
-  value = 1;
-
-  if ( n == 1 )
-  {
-    return value;
-  }
-
-  if ( incx == 1 )
-  {
-    xmax = fabs ( x[0] );
-
-    for ( i = 1; i < n; i++ )
-    {
-      if ( xmax < fabs ( x[i] ) )
-      {
-        value = i + 1;
-        xmax = fabs ( x[i] );
-      }
-    }
-  }
-  else
-  {
-    ix = 0;
-    xmax = fabs ( x[0] );
-    ix = ix + incx;
-
-    for ( i = 1; i < n; i++ )
-    {
-      if ( xmax < fabs ( x[ix] ) )
-      {
-        value = i + 1;
-        xmax = fabs ( x[ix] );
-      }
-      ix = ix + incx;
-    }
-  }
-
-  return value;
-}
-/*******************************************************************************/
-
-void matgen ( int lda, int n, float a[], float x[], float b[] )
-
-/*******************************************************************************/
-/* 
-  Purpose:
-
-    MATGEN generates a "random" matrix for testing.
-
-  Modified:
-
-    27 April 2008
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    Input, int LDA, the leading dimension of the matrix.
-
-    Input, int N, the order of the matrix, and the length of the vector.
-
-    Output, float A[LDA*N], the matrix.
-
-    Output, float X[N], the solution vector.
-
-    Output, float B[N], the right hand side vector.
-*/
-{
-  int i;
-  int j;
-  int seed;
-  float value;
-
-  seed = 1325;
-/*
-  Set the matrix A.
-*/
-  for ( j = 0; j < n; j++ )
-  {
-    for ( i = 0; i < n; i++ )
-    {
-      seed = ( 3125 * seed ) % 65536;
-      value = ( ( float ) seed - 32768.0 ) / 16384.0;
-      a[i+j*lda] = value;
-    }
-  }
-/*
-  Set x.
-*/
-  for ( i = 0; i < n; i++ )
-  {
-    x[i] = ( float ) ( i + 1 ) / ( ( float ) n );
-  }
-/*
-  Set b = A * x.
-*/
-  for ( i = 0; i < n; i++ ) 
-  {
-    b[i] = 0.0;
-    for ( j = 0; j < n; j++ )
-    {
-      b[i] = b[i] + a[i+j*lda] * x[j];
-    }
-  }
-  return;
-}
-/******************************************************************************/
-
-void msaxpy ( int nr, int nc, float a[], int n, float x[], float y[] )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MSAXPY carries out multiple "SAXPY" operations.
-
-  Discussion:
-
-    This routine carries out the step of Gaussian elimination where multiples
-    of the pivot row are added to the rows below the pivot row.
-
-    A single call to MSAXPY replaces multiple calls to SAXPY.
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    Wesley Petersen
-
-  Parameters:
-
-    Input, int NR, NC, the number of rows and columns in the matrix.
-
-    Input, float A[*], ...
-
-    Input, int N, ...
-
-    Input, float X[*], ...
-
-    Output, float Y[*], ...
-*/
-{
-  int i,j;
-
-# pragma omp parallel \
-  shared ( a, nc, nr, x, y ) \
-  private ( i, j )
-
-# pragma omp for
-  for ( j = 0; j < nc; j++)
-  {
-    for ( i = 0; i < nr; i++ )
-    {
-      y[i+j*n] += a[j*n] * x[i];
-    }
-  }
-  return;
-}
-/******************************************************************************/
-
-void msaxpy2 ( int nr, int nc, float a[], int n, float x[], float y[] )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    MSAXPY2 carries out multiple "SAXPY" operations.
-
-  Discussion:
-
-    This routine carries out the step of Gaussian elimination where multiples
-    of the pivot row are added to the rows below the pivot row.
-
-    A single call to MSAXPY replaces multiple calls to SAXPY.
-
-  Modified:
-
-    07 April 2008
-
-  Author:
-
-    Wesley Petersen
-
-  Parameters:
-
-    Input, int NR, NC, the number of rows and columns in the matrix.
-
-    Input, float A[*], ...
-
-    Input, int N, ...
-
-    Input, float X[*], ...
-
-    Output, float Y[*], ...
-*/
-{
-  int i,j;
-
-  for ( j = 0; j < nc; j++)
-  {
-    for ( i = 0; i < nr; i++ )
-    {
-      y[i+j*n] += a[j*n] * x[i];
-    }
-  }
-  return;
-}
-/******************************************************************************/
-
-int msgefa ( float a[], int lda, int n, int ipvt[] )
-
-/******************************************************************************/
-/* 
-  Purpose:
-
-    MSGEFA factors a matrix by gaussian elimination.
-
-  Discussion:
-
-    Matrix references which would, mathematically, be written A(I,J)
-    must be written here as:
-    * A[I+J*LDA], when the value is needed, or
-    * A+I+J*LDA, when the address is needed.
-
-    This variant of SGEFA uses OpenMP for improved parallel execution.
-    The step in which multiples of the pivot row are added to individual
-    rows has been replaced by a single call which updates the entire
-    matrix sub-block.
-
-  Modified:
-
-    07 March 2008
-
-  Author:
-
-    FORTRAN77 original version by Cleve Moler.
-    C version by Wesley Petersen.
-
-  Reference:
-
-    Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-  Parameters:
-
-    Input/output, float A[LDA*N].  On input, the matrix to be factored.
-    On output, an upper triangular matrix and the multipliers which were 
-    used to obtain it.  The factorization can be written A = L * U where
-    L is a product of permutation and unit lower triangular matrices and
-    U is upper triangular.
-
-    Input, int LDA, the leading dimension of the matrix.
-
-    Input, int N, the order of the matrix.
-
-    Output, int IPVT[N], the pivot indices.
-
-    Output, int MSGEFA, indicates singularity.
-    If 0, this is the normal value, and the algorithm succeeded.
-    If K, then on the K-th elimination step, a zero pivot was encountered.
-    The matrix is numerically not invertible.
-*/
-{
-  int info;
-  int k,kp1,l,nm1;
-  float t;
-
-  info = 0;
-  nm1 = n - 1;
-  for ( k = 0; k < nm1; k++ )
-  {
-    kp1 = k + 1;
-    l = isamax ( n-k, a+k+k*lda, 1 ) + k - 1;
-    ipvt[k] = l + 1;
-
-    if ( a[l+k*lda] == 0.0 )
-    {
-      info = k + 1;
-      return info;
-    }
-
-    if ( l != k )
-    {
-      t          = a[l+k*lda];
-      a[l+k*lda] = a[k+k*lda];
-      a[k+k*lda] = t;
-    }
-    t = -1.0 / a[k+k*lda]; 
-    sscal ( n-k-1, t, a+kp1+k*lda, 1 );
-/*
-  Interchange the pivot row and the K-th row.
-*/
-    if ( l != k )
-    {
-      sswap ( n-k-1, a+l+kp1*lda, lda, a+k+kp1*lda, lda );
-    }
-/*
-  Add multiples of the K-th row to rows K+1 through N.
-*/
-    msaxpy ( n-k-1, n-k-1, a+k+kp1*lda, n, a+kp1+k*lda, a+kp1+kp1*lda );
-  }
-
-  ipvt[n-1] = n;
-
-  if ( a[n-1+(n-1)*lda] == 0.0 )
-  {
-    info = n;
-  }
-
-  return info;
-}
-/******************************************************************************/
-
-int msgefa2 ( float a[], int lda, int n, int ipvt[] )
-
-/******************************************************************************/
-/* 
-  Purpose:
-
-    MSGEFA2 factors a matrix by gaussian elimination.
-
-  Discussion:
-
-    Matrix references which would, mathematically, be written A(I,J)
-    must be written here as:
-    * A[I+J*LDA], when the value is needed, or
-    * A+I+J*LDA, when the address is needed.
-
-    This variant of SGEFA uses OpenMP for improved parallel execution.
-    The step in which multiples of the pivot row are added to individual
-    rows has been replaced by a single call which updates the entire
-    matrix sub-block.
-
-  Modified:
-
-    07 March 2008
-
-  Author:
-
-    FORTRAN77 original version by Cleve Moler.
-    C version by Wesley Petersen.
-
-  Reference:
-
-    Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-  Parameters:
-
-    Input/output, float A[LDA*N].  On input, the matrix to be factored.
-    On output, an upper triangular matrix and the multipliers which were 
-    used to obtain it.  The factorization can be written A = L * U where
-    L is a product of permutation and unit lower triangular matrices and
-    U is upper triangular.
-
-    Input, int LDA, the leading dimension of the matrix.
-
-    Input, int N, the order of the matrix.
-
-    Output, int IPVT[N], the pivot indices.
-
-    Output, int MSGEFA, indicates singularity.
-    If 0, this is the normal value, and the algorithm succeeded.
-    If K, then on the K-th elimination step, a zero pivot was encountered.
-    The matrix is numerically not invertible.
-*/
-{
-  int info;
-  int k,kp1,l,nm1;
-  float t;
-
-  info = 0;
-  nm1 = n - 1;
-  for ( k = 0; k < nm1; k++ )
-  {
-    kp1 = k + 1;
-    l = isamax ( n-k, a+k+k*lda, 1 ) + k - 1;
-    ipvt[k] = l + 1;
-
-    if ( a[l+k*lda] == 0.0 )
-    {
-      info = k + 1;
-      return info;
-    }
-
-    if ( l != k )
-    {
-      t          = a[l+k*lda];
-      a[l+k*lda] = a[k+k*lda];
-      a[k+k*lda] = t;
-    }
-    t = -1.0 / a[k+k*lda]; 
-    sscal ( n-k-1, t, a+kp1+k*lda, 1 );
-/*
-  Interchange the pivot row and the K-th row.
-*/
-    if ( l != k )
-    {
-      sswap ( n-k-1, a+l+kp1*lda, lda, a+k+kp1*lda, lda );
-    }
-/*
-  Add multiples of the K-th row to rows K+1 through N.
-*/
-    msaxpy2 ( n-k-1, n-k-1, a+k+kp1*lda, n, a+kp1+k*lda, a+kp1+kp1*lda );
-  }
-
-  ipvt[n-1] = n;
-
-  if ( a[n-1+(n-1)*lda] == 0.0 )
-  {
-    info = n;
-  }
-
-  return info;
-}
-/******************************************************************************/
-
-void saxpy ( int n, float a, float x[], int incx, float y[], int incy )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    SAXPY computes float constant times a vector plus a vector.
-
-  Discussion:
-
-    This routine uses unrolled loops for increments equal to one.
-
-  Modified:
-
-    23 February 2006
-
-  Author:
-
-    FORTRAN77 original version by Dongarra, Moler, Bunch, Stewart.
-    C version by John Burkardt
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-    Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-    Basic Linear Algebra Subprograms for Fortran Usage,
-    Algorithm 539, 
-    ACM Transactions on Mathematical Software, 
-    Volume 5, Number 3, September 1979, pages 308-323.
-
-  Parameters:
-
-    Input, int N, the number of elements in X and Y.
-
-    Input, float A, the multiplier of X.
-
-    Input, float X[*], the first vector.
-
-    Input, int INCX, the increment between successive entries of X.
-
-    Input/output, float Y[*], the second vector.
-    On output, Y[*] has been replaced by Y[*] + A * X[*].
-
-    Input, int INCY, the increment between successive entries of Y.
-*/
-{
-  int i;
-  int ix;
-  int iy;
-  int m;
-
-  if ( n <= 0 )
-  {
-    return;
-  }
-
-  if ( a == 0.0 )
-  {
-    return;
-  }
-/*
-  Code for unequal increments or equal increments
-  not equal to 1.
-*/
-  if ( incx != 1 || incy != 1 )
-  {
-    if ( 0 <= incx )
-    {
-      ix = 0;
-    }
-    else
-    {
-      ix = ( - n + 1 ) * incx;
-    }
-
-    if ( 0 <= incy )
-    {
-      iy = 0;
-    }
-    else
-    {
-      iy = ( - n + 1 ) * incy;
-    }
-
-    for ( i = 0; i < n; i++ )
-    {
-      y[iy] = y[iy] + a * x[ix];
-      ix = ix + incx;
-      iy = iy + incy;
-    }
-  }
-/*
-  Code for both increments equal to 1.
-*/
-  else
-  {
-    m = n % 4;
-
-    for ( i = 0; i < m; i++ )
-    {
-      y[i] = y[i] + a * x[i];
-    }
-
-    for ( i = m; i < n; i = i + 4 )
-    {
-      y[i  ] = y[i  ] + a * x[i  ];
-      y[i+1] = y[i+1] + a * x[i+1];
-      y[i+2] = y[i+2] + a * x[i+2];
-      y[i+3] = y[i+3] + a * x[i+3];
-    }
-  }
-
-  return;
-}
-/******************************************************************************/
-
-float sdot ( int n, float x[], int incx, float y[], int incy )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    SDOT forms the dot product of two vectors.
-
-  Discussion:
-
-    This routine uses unrolled loops for increments equal to one.
-
-  Modified:
-
-    23 February 2006
-
-  Author:
-
-    FORTRAN77 original version by Dongarra, Moler, Bunch, Stewart
-    C version by John Burkardt
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979.
-
-    Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-    Basic Linear Algebra Subprograms for Fortran Usage,
-    Algorithm 539, 
-    ACM Transactions on Mathematical Software, 
-    Volume 5, Number 3, September 1979, pages 308-323.
-
-  Parameters:
-
-    Input, int N, the number of entries in the vectors.
-
-    Input, float X[*], the first vector.
-
-    Input, int INCX, the increment between successive entries in X.
-
-    Input, float Y[*], the second vector.
-
-    Input, int INCY, the increment between successive entries in Y.
-
-    Output, float SDOT, the sum of the product of the corresponding
-    entries of X and Y.
-*/
-{
-  int i;
-  int ix;
-  int iy;
-  int m;
-  float temp;
-
-  temp = 0.0;
-
-  if ( n <= 0 )
-  {
-    return temp;
-  }
-/*
-  Code for unequal increments or equal increments
-  not equal to 1.
-*/
-  if ( incx != 1 || incy != 1 )
-  {
-    if ( 0 <= incx )
-    {
-      ix = 0;
-    }
-    else
-    {
-      ix = ( - n + 1 ) * incx;
-    }
-
-    if ( 0 <= incy )
-    {
-      iy = 0;
-    }
-    else
-    {
-      iy = ( - n + 1 ) * incy;
-    }
-
-    for ( i = 0; i < n; i++ )
-    {
-      temp = temp + x[ix] * y[iy];
-      ix = ix + incx;
-      iy = iy + incy;
-    }
-  }
-/*
-  Code for both increments equal to 1.
-*/
-  else
-  {
-    m = n % 5;
-
-    for ( i = 0; i < m; i++ )
-    {
-      temp = temp + x[i] * y[i];
-    }
-
-    for ( i = m; i < n; i = i + 5 )
-    {
-      temp = temp + x[i  ] * y[i  ] 
-                  + x[i+1] * y[i+1] 
-                  + x[i+2] * y[i+2] 
-                  + x[i+3] * y[i+3] 
-                  + x[i+4] * y[i+4];
-    }
-  }
-
-  return temp;
-}
-/*******************************************************************************/
-
-int sgefa ( float a[], int lda, int n, int ipvt[] )
-
-/*******************************************************************************/
-/*
-  Purpose:
-
-    SGEFA factors a matrix by gaussian elimination.
-
-  Discussion:
-
-    Matrix references which would, mathematically, be written A(I,J)
-    must be written here as:
-    * A[I+J*LDA], when the value is needed, or
-    * A+I+J*LDA, when the address is needed.
-
-  Modified:
-
-    07 March 2008
-
-  Author:
-
-    FORTRAN77 original version by Cleve Moler.
-    C version by John Burkardt.
-
-  Reference:
-
-    Jack Dongarra, Jim Bunch, Cleve Moler, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-  Parameters:
-
-    Input/output, float A[LDA*N].  On input, the matrix to be factored.
-    On output, an upper triangular matrix and the multipliers which were 
-    used to obtain it.  The factorization can be written A = L * U where
-    L is a product of permutation and unit lower triangular matrices and
-    U is upper triangular.
-
-    Input, int LDA, the leading dimension of the matrix.
-
-    Input, int N, the order of the matrix.
-
-    Output, int IPVT[N], the pivot indices.
-
-    Output, int SGEFA, indicates singularity.
-    If 0, this is the normal value, and the algorithm succeeded.
-    If K, then on the K-th elimination step, a zero pivot was encountered.
-    The matrix is numerically not invertible.
-*/
-{
-  int j;
-  int info;
-  int k;
-  int l;
-  float t;
-
-  info = 0;
-
-  for ( k = 1; k <= n - 1; k++ ) 
-  {
-/* 
-  Find l = pivot index.
-*/
-    l = isamax ( n-k+1, &a[k-1+(k-1)*lda], 1 ) + k - 1;
-    ipvt[k-1] = l;
-/* 
-  Zero pivot implies this column already triangularized.
-*/
-    if ( a[l-1+(k-1)*lda] != 0.0 ) 
-    {
-/* 
-  Interchange if necessary.
-*/
-      if ( l != k ) 
-      {
-        t                = a[l-1+(k-1)*lda];
-        a[l-1+(k-1)*lda] = a[k-1+(k-1)*lda];
-        a[k-1+(k-1)*lda] = t; 
-      }
-/* 
-  Compute multipliers.
-*/
-      t = - 1.0 / a[k-1+(k-1)*lda];
-      sscal ( n-k, t, &a[k+(k-1)*lda], 1 );
-/* 
-  Row elimination with column indexing.
-*/
-      for ( j = k + 1; j <= n; j++ ) 
-      {
-        t = a[l-1+(j-1)*lda];
-        if (l != k) 
-        {
-          a[l-1+(j-1)*lda] = a[k-1+(j-1)*lda];
-          a[k-1+(j-1)*lda] = t;
-        }
-        saxpy ( n-k, t, &a[k+(k-1)*lda], 1, &a[k+(j-1)*lda], 1 );
-      } 
-    }
-    else
-    { 
-      info = k;
-    }
-  } 
-  ipvt[n-1] = n;
-
-  if (a[n-1+(n-1)*lda] == 0.0 ) 
-  {
-    info = n - 1;
-  }
-  return info;
-}
-/******************************************************************************/
-
-void sgesl ( float a[], int lda, int n, int ipvt[], float b[], int job )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    SGESL solves a real general linear system A * X = B.
-
-  Discussion:
-
-    SGESL can solve either of the systems A * X = B or A' * X = B.
-
-    The system matrix must have been factored by SGECO or SGEFA.
-
-    A division by zero will occur if the input factor contains a
-    zero on the diagonal.  Technically this indicates singularity
-    but it is often caused by improper arguments or improper
-    setting of LDA.  It will not occur if the subroutines are
-    called correctly and if SGECO has set 0.0 < RCOND
-    or SGEFA has set INFO == 0.
-
-  Modified:
-
-    04 April 2006
-
-  Author:
-
-    FORTRAN77 original by Dongarra, Moler, Bunch and Stewart.
-    C translation by John Burkardt.
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, (Society for Industrial and Applied Mathematics),
-    3600 University City Science Center,
-    Philadelphia, PA, 19104-2688.
-    ISBN: 0-89871-172-X
-
-  Parameters:
-
-    Input, float A[LDA*N], the output from SGECO or SGEFA.
-
-    Input, int LDA, the leading dimension of A.
-
-    Input, int N, the order of the matrix A.
-
-    Input, int IPVT[N], the pivot vector from SGECO or SGEFA.
-
-    Input/output, float B[N].
-    On input, the right hand side vector.
-    On output, the solution vector.
-
-    Input, int JOB.
-    0, solve A * X = B;
-    nonzero, solve A' * X = B.
-*/
-{
-  int k;
-  int l;
-  float t;
-/*
-  Solve A * X = B.
-*/
-  if ( job == 0 )
-  {
-    for ( k = 1; k <= n-1; k++ )
-    {
-      l = ipvt[k-1];
-      t = b[l-1];
-
-      if ( l != k )
-      {
-        b[l-1] = b[k-1];
-        b[k-1] = t;
-      }
-      saxpy ( n-k, t, a+k+(k-1)*lda, 1, b+k, 1 );
-    }
-
-    for ( k = n; 1 <= k; k-- )
-    {
-      b[k-1] = b[k-1] / a[k-1+(k-1)*lda];
-      t = -b[k-1];
-      saxpy ( k-1, t, a+0+(k-1)*lda, 1, b, 1 );
-    }
-  }
-/*
-  Solve A' * X = B.
-*/
-  else
-  {
-    for ( k = 1; k <= n; k++ )
-    {
-      t = sdot ( k-1, a+0+(k-1)*lda, 1, b, 1 );
-      b[k-1] = ( b[k-1] - t ) / a[k-1+(k-1)*lda];
-    }
-
-    for ( k = n-1; 1 <= k; k-- )
-    {
-      b[k-1] = b[k-1] + sdot ( n-k, a+k+(k-1)*lda, 1, b+k, 1 );
-      l = ipvt[k-1];
-
-      if ( l != k )
-      {
-        t = b[l-1];
-        b[l-1] = b[k-1];
-        b[k-1] = t;
-      }
-    }
-  }
-  return;
-}
-/******************************************************************************/
-
-void sscal ( int n, float sa, float x[], int incx )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    SSCAL scales a float vector by a constant.
-
-  Modified:
-
-    23 February 2006
-
-  Author:
-
-    Jack Dongarra
-    C version by John Burkardt
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-    Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-    Basic Linear Algebra Subprograms for Fortran Usage,
-    Algorithm 539,
-    ACM Transactions on Mathematical Software,
-    Volume 5, Number 3, September 1979, pages 308-323.
-
-  Parameters:
-
-    Input, int N, the number of entries in the vector.
-
-    Input, float SA, the multiplier.
-
-    Input/output, float X[*], the vector to be scaled.
-
-    Input, int INCX, the increment between successive entries of X.
-*/
-{
-  int i;
-  int ix;
-  int m;
-
-  if ( n <= 0 )
-  {
-  }
-  else if ( incx == 1 )
-  {
-    m = n % 5;
-
-    for ( i = 0; i < m; i++ )
-    {
-      x[i] = sa * x[i];
-    }
-
-    for ( i = m; i < n; i = i + 5 )
-    {
-      x[i]   = sa * x[i];
-      x[i+1] = sa * x[i+1];
-      x[i+2] = sa * x[i+2];
-      x[i+3] = sa * x[i+3];
-      x[i+4] = sa * x[i+4];
-    }
-  }
-  else
-  {
-    if ( 0 <= incx )
-    {
-      ix = 0;
-    }
-    else
-    {
-      ix = ( - n + 1 ) * incx;
-    }
-
-    for ( i = 0; i < n; i++ )
-    {
-      x[ix] = sa * x[ix];
-      ix = ix + incx;
-    }
-
-  }
-
-  return;
-}
-/******************************************************************************/
-
-void sswap ( int n, float x[], int incx, float y[], int incy )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    SSWAP interchanges two float vectors.
-
-  Modified:
-
-    23 February 2006
-
-  Author:
-
-    C version by John Burkardt
-
-  Reference:
-
-    Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-    LINPACK User's Guide,
-    SIAM, 1979,
-    ISBN13: 978-0-898711-72-1,
-    LC: QA214.L56.
-
-    Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-    Basic Linear Algebra Subprograms for Fortran Usage,
-    Algorithm 539, 
-    ACM Transactions on Mathematical Software, 
-    Volume 5, Number 3, September 1979, pages 308-323.
-
-  Parameters:
-
-    Input, int N, the number of entries in the vectors.
-
-    Input/output, float X[*], one of the vectors to swap.
-
-    Input, int INCX, the increment between successive entries of X.
-
-    Input/output, float Y[*], one of the vectors to swap.
-
-    Input, int INCY, the increment between successive elements of Y.
-*/
-{
-  int i;
-  int ix;
-  int iy;
-  int m;
-  float temp;
-
-  if ( n <= 0 )
-  {
-  }
-  else if ( incx == 1 && incy == 1 )
-  {
-    m = n % 3;
-
-    for ( i = 0; i < m; i++ )
-    {
-      temp = x[i];
-      x[i] = y[i];
-      y[i] = temp;
-    }
-
-    for ( i = m; i < n; i = i + 3 )
-    {
-      temp = x[i];
-      x[i] = y[i];
-      y[i] = temp;
-
-      temp = x[i+1];
-      x[i+1] = y[i+1];
-      y[i+1] = temp;
-
-      temp = x[i+2];
-      x[i+2] = y[i+2];
-      y[i+2] = temp;
-    }
-  }
-  else
-  {
-    if ( 0 <= incx )
-    {
-      ix = 0;
-    }
-    else
-    {
-      ix = ( - n + 1 ) * incx;
-    }
-
-    if ( 0 <= incy )
-    {
-      iy = 0;
-    }
-    else
-    {
-      iy = ( - n + 1 ) * incy;
-    }
-
-    for ( i = 0; i < n; i++ )
-    {
-      temp = x[ix];
-      x[ix] = y[iy];
-      y[iy] = temp;
-      ix = ix + incx;
-      iy = iy + incy;
-    }
-  }
-  return;
-}
-/******************************************************************************/
-
-void timestamp ( )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    TIMESTAMP prints the current YMDHMS date as a time stamp.
-
-  Example:
-
-    31 May 2001 09:45:54 AM
-
-  Modified:
-
-    24 September 2003
-
-  Author:
-
-    John Burkardt
-
-  Parameters:
-
-    None
-*/
-{
-# define TIME_SIZE 40
-
-  static char time_buffer[TIME_SIZE];
-  const struct tm *tm;
-  time_t now;
-
-  now = time ( NULL );
-  tm = localtime ( &now );
-
-  strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm );
-
-  printf ( "%s\n", time_buffer );
-
-  return;
-# undef TIME_SIZE
-}
diff --git a/src/tutor/mp/sum_tbb.cc b/src/tutor/mp/sum_tbb.cc
deleted file mode 100644
index a43ce8a95..000000000
--- a/src/tutor/mp/sum_tbb.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-//    g++ sum_tbb.cc -o sum_tbb -ltbb
-
-#include <oneapi/tbb.h>   
-
-int main(){
-  int n = 1001;
-  int sum = oneapi::tbb::parallel_reduce(
-					 oneapi::tbb::blocked_range<int>(1,n), 0,
-					 [](oneapi::tbb::blocked_range<int> const& r, int init) -> int {
-					   for (int v = r.begin(); v != r.end(); v++) {
-					     init += v;
-					   }
-					   return init;
-					 },
-					 [](int lhs, int rhs) -> int {
-					   return lhs + rhs;
-					 }
-					 );
-
-    printf("N=%d Sum: %d\n", n, sum);
-    return 0;
-}
diff --git a/src/tutor/mp/wait3.sh b/src/tutor/mp/wait3.sh
deleted file mode 100755
index 20c923315..000000000
--- a/src/tutor/mp/wait3.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#! /usr/bin/bash
-#
-#  If you have "about equal CPU" tasks that are normally single CPU programs,
-#  running them with this simply shell construct can be useful to speed up
-#  a series
-#
-
-
-echo Sleeping 2
-sleep 2 &
-
-echo Sleeping 4
-sleep 4 &
-
-echo Sleeping 6
-sleep 6 &
-
-echo Waiting until all are done....
-wait
-
-echo All done
diff --git a/usr/dehnen/falcON/src/public/acc/Monopole.cc b/usr/dehnen/falcON/src/public/acc/Monopole.cc
index 4400f34f8..29dc7e9a9 100644
--- a/usr/dehnen/falcON/src/public/acc/Monopole.cc
+++ b/usr/dehnen/falcON/src/public/acc/Monopole.cc
@@ -33,8 +33,10 @@
 #include <inline.h>
 #include <acc/timer.h>
 #define __NO_AUX_DEFACC
-#include <defacc.h> // $NEMOINC/defacc.h
+// issue105
 #include <stdinc.h>
+#include <defacc.h> // $NEMOINC/defacc.h
+
 ////////////////////////////////////////////////////////////////////////////////
 namespace {
   using namespace WDutils;
diff --git a/usr/dehnen/falcON/src/public/acc/PotExp.cc b/usr/dehnen/falcON/src/public/acc/PotExp.cc
index 2572e08d6..017cf0143 100644
--- a/usr/dehnen/falcON/src/public/acc/PotExp.cc
+++ b/usr/dehnen/falcON/src/public/acc/PotExp.cc
@@ -19,6 +19,8 @@
 // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.                   |
 //                                                                             |
 //-----------------------------------------------------------------------------+
+// issue105
+#include <stdinc.h>
 #include <defacc.h>
 #include <ctime>
 #ifndef falcON_NEMO
diff --git a/usr/dehnen/falcON/src/public/lib/bodyfunc.cc b/usr/dehnen/falcON/src/public/lib/bodyfunc.cc
index 7cff22fb4..b669e0ba7 100644
--- a/usr/dehnen/falcON/src/public/lib/bodyfunc.cc
+++ b/usr/dehnen/falcON/src/public/lib/bodyfunc.cc
@@ -154,8 +154,9 @@ namespace {
     // compiles a falcON C++ program in fname using compiler flags              
     const char* falcON_path = falcON::directory();
     if(falcON_path == 0) throw BfErr("cannot locate falcON directory");
-    char cmmd[512];
-    SNprintf(cmmd,512,"cd /tmp; %s %s.cc -o %s.so"
+    // issue105
+    char cmmd[1024];
+    SNprintf(cmmd,1024,"cd /tmp; %s %s.cc -o %s.so"
 	     " %s -shared -fPIC -I%s/inc -I%s/inc/utils -O2"
 #if __cplusplus >= 201103L
 	     " -std=c++0x"
@@ -176,10 +177,14 @@ namespace {
 
 	     // 	     	     " -march=native -mfpmath=sse -mpreferred-stack-boundary=4 -ggdb3"
 #elif defined(__GNUC__)
-	     " -mfpmath=sse -mpreferred-stack-boundary=4 -ggdb3"
+	     " -mfpmath=sse -ggdb3"
 	     " -Wall -Wextra -Winit-self -Wshadow -Woverloaded-virtual -fPIC"
 	     " -std=c++11"
-	     " -fopenmp -funroll-loops -fforce-addr"
+	     " -funroll-loops -fforce-addr"
+// issue105
+#   ifndef __clang__
+	     " -mpreferred-stack-boundary=4"
+#   endif
 #else
 	     " -fpic -openmp -g"
 #endif
@@ -191,7 +196,7 @@ namespace {
 	     fname,fname,(flags? flags : " "),falcON_path,falcON_path,fname);
     DebugInfo(2,"now compiling using the following command\n   %s\n",cmmd);
     if(system(cmmd)) {
-      if(debug(debug_depth)) {
+      //issue105 if(debug(debug_depth)) {
 	std::cerr<<"could not compile temporary file /tmp/"<<fname<<".cc:\n";
 	char show[512];
 	SNprintf(show,512,"more /tmp/%s.cc > /dev/stderr",fname);
@@ -202,7 +207,7 @@ namespace {
 	SNprintf(show,512,"more /tmp/%s.log > /dev/stderr",fname);
 	std::cerr<<'\n';
 	rr=system(show);
-      }
+     // }
       throw BfErr(message("could not compile expression; "
 			  "perhaps it contains a syntax error"));
     }
@@ -510,7 +515,8 @@ namespace {
       "#define BD_TEST\n"
       "#define body_func\n"
       "#include <public/bodyfuncdefs.h>\n\n"
-      "real   _P[10]={RNG()};\n\n"
+// issue105
+      "real   _P[10]={static_cast<real>(RNG())};\n\n"
       "extern \"C\" {\n"
       "  fieldset "<<ftype<<"(char&_type)\n"
       "  {\n"
diff --git a/usr/dehnen/utils/inc/exception.h b/usr/dehnen/utils/inc/exception.h
index f1a133380..a51a8ebd0 100644
--- a/usr/dehnen/utils/inc/exception.h
+++ b/usr/dehnen/utils/inc/exception.h
@@ -70,11 +70,6 @@
 #    include <type_traits>
 #  endif
 #endif
-#if __cplusplus >= 201103L && defined(_OPENMP) && \
-  !defined(WDutils_included_omp_h)
-#  include <omp.h>
-#  define WDutils_included_omp_h
-#endif
 
 //                                                                              
 //  WDutils                                                                     
@@ -141,27 +136,7 @@ namespace WDutils {
       Info._m_mpi_size=s;
     }
 
-    /// \name openMP stuff
-    //@{
-    /// set \# openMP threads
-    /// \note If @a arg[0] == 't', we set \# threads to \# processors.
-    ///       If @a arg[0] == 'f', we set \# threads to 1 (no openMP).
-    ///       Otherwise, we try to convert @a arg to an integer number and
-    ///       take that. This may exceed the \# processors.
-    static void set_omp(const char*arg);
-    /// set number of openMP threads
-    static void set_omp(int n);
-    /// maximum \# processors available for openMP
-    static int max_omp_proc()
-    { return Info._m_omp_proc; }
-    /// number of openMP threads to be used, may exceed @a max_omp_proc()
-    /// \note defaults to max_omp_proc, implying openMP is used if available
-    static int omp_threads()
-    { return Info._m_omp_size; }
-    /// shall openMP parallelism be used?
-    static bool use_omp()
-    { return Info._m_omp_size > 1; }
-    //@}
+
 
     /// \name TBB stuff
     //@{
diff --git a/usr/dehnen/utils/src/exception.cc b/usr/dehnen/utils/src/exception.cc
index 250488bb0..1c5f01fd5 100644
--- a/usr/dehnen/utils/src/exception.cc
+++ b/usr/dehnen/utils/src/exception.cc
@@ -171,17 +171,8 @@ WDutils::RunInfo::RunInfo()
       SNprintf(_m_name,104,"unknown.name");
     }
 #endif
-    // set # proc available for openMP
-    {
-#ifdef _OPENMP
-      if(omp_in_parallel())
-	WDutils_ErrorF("called inside OMP parallel region\n");
-      _m_omp_proc = omp_get_num_procs();
-#else
-      _m_omp_proc = 1;
-#endif
-      _m_omp_size = _m_omp_proc;
-    }
+
+
     // set # threads used by TBB
     {
 #ifdef WDutilsTBB
@@ -195,52 +186,7 @@ WDutils::RunInfo::RunInfo()
   catch(WDutils::exception& ex) { WDutils_RETHROW(ex); }
 }
 //
-void WDutils::RunInfo::set_omp(int 
-#ifdef _OPENMP
-			       n
-#endif
-			       )
-{
-#ifdef _OPENMP
-  Info._m_omp_size = n;
-  if(Info._m_omp_size < 1) {
-    Info._m_omp_size = 1;
-    WDutils_WarningN("RunInfo::set_omp('%d') assume '1'\n",n);
-  }
-  omp_set_num_threads(Info._m_omp_size);
-#else
-  Info._m_omp_size = 1;
-#endif
-}
-//
-void WDutils::RunInfo::set_omp(const char*
-#ifdef _OPENMP
-			       arg
-#endif
-			       )
-{
-#ifdef _OPENMP
-  if(arg==0 || arg[0]==0 || arg[0]=='t')
-    Info._m_omp_size = Info._m_omp_proc;
-  else if(arg[0] == 'f')
-    Info._m_omp_size = 1;
-  else if(arg && arg[0]) {
-    Info._m_omp_size = strtol(arg,0,10);
-    if(errno == EINVAL)
-      WDutils_THROWN("RunInfo::set_omp('%s') (errno=EINVAL)\n",arg,errno);
-    if(errno == ERANGE)
-      WDutils_THROWN("RunInfo::set_omp('%s') (errno=ERANGE)\n",arg,errno);
-    if(Info._m_omp_size < 1) {
-      Info._m_omp_size = 1;
-      WDutils_WarningN("RunInfo::set_omp('%s') assume '1'\n",arg);
-    }
-  }
-  omp_set_num_threads(Info._m_omp_size);
-#else
-  Info._m_omp_size = 1;
-#endif
-}
-//
+
 WDutils::RunInfo::~RunInfo()
 {
 #ifdef WDutilsTBB