OpenMP parallelization in update_gauge

expo functions with pointers and no return value
Marcogarofalo · Sep 26, 2012 · 9f495fe · 9f495fe
1 parent 441615c
commit 9f495fe
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 53 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,5 @@ read_input.c
 *~
 Makefile
 config.log
+*.mod.c
+*.opari.inc
diff --git a/expo.c b/expo.c
@@ -1,5 +1,6 @@
 /***********************************************************************
  * Copyright (C) 2001 Martin Hasenbusch
+ * Copyright (C) 2012 Bartosz Kostrzewa (void versions)
  *
  * This file is part of tmLQCD.
  *
@@ -20,11 +21,12 @@
  *
  *
  * The externally accessible functions are
- *
- *   su3 exposu3(su3adj p)
- *   su3 exposu3_check(su3adj p)
- *   su3 restoresu3(su3 u)
- *   Returns an element of su3
+ * 
+ * void exposu3(su3* const vr, const su3adj* const p);
+ * extern void exposu3_check(su3* const vr, const su3adj* const p, int im);
+ * extern void restoresu3(su3* const vr,const su3* const u);
+ * extern void restoresu3_in_place(su3* const u);
+ * extern void exposu3_in_place(su3* const u);
  *
  * Author: Martin Hasenbusch <[email protected]>
  * Tue Aug 28 10:06:56 MEST 2001
@@ -51,15 +53,15 @@
 #include "su3adj.h"
 #include "expo.h"
 
-su3 exposu3(su3adj p) {
+void exposu3(su3* const vr, const su3adj* const p) {
   int i;
-  su3 ALIGN v,v2,vr;
+  su3 ALIGN v,v2;
   double ALIGN fac,r;
   double ALIGN a,b;
   _Complex double ALIGN a0,a1,a2,a1p;
 
   /* it writes 'p=vec(h_{j,mu})' in matrix form 'v' */  
-  _make_su3(v,p);
+  _make_su3(v,*p);
   /* calculates v^2 */
   _su3_times_su3(v2,v,v);
   /* */
@@ -83,73 +85,87 @@ su3 exposu3(su3adj p) {
     r -= 1.0;
   }
   /* vr = a0 + a1*v + a2*v2 */
-  vr.c00 = a0 + a1 * v.c00 + a2 * v2.c00;
-  vr.c01 =      a1 * v.c01 + a2 * v2.c01;
-  vr.c02 =      a1 * v.c02 + a2 * v2.c02;
-  vr.c10 =      a1 * v.c10 + a2 * v2.c10;
-  vr.c11 = a0 + a1 * v.c11 + a2 * v2.c11;
-  vr.c12 =      a1 * v.c12 + a2 * v2.c12;
-  vr.c20 =      a1 * v.c20 + a2 * v2.c20;
-  vr.c21 =      a1 * v.c21 + a2 * v2.c21;
-  vr.c22 = a0 + a1 * v.c22 + a2 * v2.c22;
-  return vr;
+  vr->c00 = a0 + a1 * v.c00 + a2 * v2.c00;
+  vr->c01 =      a1 * v.c01 + a2 * v2.c01;
+  vr->c02 =      a1 * v.c02 + a2 * v2.c02;
+  vr->c10 =      a1 * v.c10 + a2 * v2.c10;
+  vr->c11 = a0 + a1 * v.c11 + a2 * v2.c11;
+  vr->c12 =      a1 * v.c12 + a2 * v2.c12;
+  vr->c20 =      a1 * v.c20 + a2 * v2.c20;
+  vr->c21 =      a1 * v.c21 + a2 * v2.c21;
+  vr->c22 = a0 + a1 * v.c22 + a2 * v2.c22;
 }
 
-su3 exposu3_check(su3adj p, int im) {
-
+void exposu3_check(su3* const vr, const su3adj* const p, int im) {
   /* compute the result by taylor series */
-  su3 ALIGN v,v2,v3,vr;
+  su3 ALIGN v,v2,v3;
   double ALIGN fac;
   int i;
 
-  _make_su3(v, p);
-  _su3_one(vr);
-  _su3_acc(vr, v); 
+  _make_su3(v, *p);
+  _su3_one(*vr);
+  _su3_acc(*vr, v); 
   _su3_times_su3(v2, v, v);
-  _su3_refac_acc(vr, 0.5, v2);
+  _su3_refac_acc(*vr, 0.5, v2);
   fac = 0.5;
   for(i = 3; i <= im; i++) {
     fac = fac/i;
     _su3_times_su3(v3, v2, v);
-    _su3_refac_acc(vr, fac, v3); 
+    _su3_refac_acc(*vr, fac, v3); 
     _su3_assign(v2, v3); 
   }
-  return vr;
 }
 
-
-su3 restoresu3(su3 u) {
-  su3 ALIGN vr;
+void restoresu3(su3* const vr, const su3* const u) {
   double ALIGN n0,n1;
 
   /* normalize rows 1 and 2 */
-  n0 = 1.0 / sqrt(conj(u.c00) * u.c00 + conj(u.c01) * u.c01 + conj(u.c02) * u.c02);
-  n1 = 1.0 / sqrt(conj(u.c10) * u.c10 + conj(u.c11) * u.c11 + conj(u.c12) * u.c12);
+  n0 = 1.0 / sqrt(conj(u->c00) * u->c00 + conj(u->c01) * u->c01 + conj(u->c02) * u->c02);
+  n1 = 1.0 / sqrt(conj(u->c10) * u->c10 + conj(u->c11) * u->c11 + conj(u->c12) * u->c12);
 
-  vr.c00 = n0 * u.c00;
-  vr.c01 = n0 * u.c01;
-  vr.c02 = n0 * u.c02;
+  vr->c00 = n0 * u->c00;
+  vr->c01 = n0 * u->c01;
+  vr->c02 = n0 * u->c02;
 
-  vr.c10 = n1 * u.c10;
-  vr.c11 = n1 * u.c11;
-  vr.c12 = n1 * u.c12;
+  vr->c10 = n1 * u->c10;
+  vr->c11 = n1 * u->c11;
+  vr->c12 = n1 * u->c12;
 
   /* compute  row 3 as the conjugate of the cross-product of 1 and 2 */ 
-  vr.c20 = conj(vr.c01 * vr.c12 - vr.c02 * vr.c11);
-  vr.c21 = conj(vr.c02 * vr.c10 - vr.c00 * vr.c12);
-  vr.c22 = conj(vr.c00 * vr.c11 - vr.c01 * vr.c10);
-
-  return vr;
+  vr->c20 = conj(vr->c01 * vr->c12 - vr->c02 * vr->c11);
+  vr->c21 = conj(vr->c02 * vr->c10 - vr->c00 * vr->c12);
+  vr->c22 = conj(vr->c00 * vr->c11 - vr->c01 * vr->c10);
 }
 
+void restoresu3_in_place(su3* const u) {
+  double ALIGN n0,n1;
+
+  /* normalize rows 1 and 2 */
+  n0 = 1.0 / sqrt(conj(u->c00) * u->c00 + conj(u->c01) * u->c01 + conj(u->c02) * u->c02);
+  n1 = 1.0 / sqrt(conj(u->c10) * u->c10 + conj(u->c11) * u->c11 + conj(u->c12) * u->c12);
+
+  u->c00 = n0 * u->c00;
+  u->c01 = n0 * u->c01;
+  u->c02 = n0 * u->c02;
+
+  u->c10 = n1 * u->c10;
+  u->c11 = n1 * u->c11;
+  u->c12 = n1 * u->c12;
+
+  /* compute  row 3 as the conjugate of the cross-product of 1 and 2 */
+  u->c20 = conj(u->c01 * u->c12 - u->c02 * u->c11);
+  u->c21 = conj(u->c02 * u->c10 - u->c00 * u->c12);
+  u->c22 = conj(u->c00 * u->c11 - u->c01 * u->c10);
+}
+
 /* Exponentiates a hermitian 3x3 matrix Q */
 /* Convenience function -- wrapper around Hasenbusch's implementation */
-void exposu3_in_place(su3 *u) {
+void exposu3_in_place(su3* const u) {
   su3adj ALIGN p;
 
   _trace_lambda(p, *u); /* Projects onto the Gell-Mann matrices */
   /* -2.0 to get su3 to su3adjoint consistency ****/
   p.d1 *= -0.5; p.d2 *= -0.5; p.d3 *= -0.5; p.d4 *= -0.5;
   p.d5 *= -0.5; p.d6 *= -0.5; p.d7 *= -0.5; p.d8 *= -0.5;
-  *u = exposu3(p);
+  exposu3(u,&p);
 }
diff --git a/expo.h b/expo.h
@@ -19,9 +19,10 @@
 #ifndef _EXPO_H
 #define _EXPO_H
 
-extern su3 exposu3(su3adj p);
-extern su3 exposu3_check(su3adj p,int im);
-extern su3 restoresu3(su3 u);
-extern void exposu3_in_place(su3 *u);
+extern void exposu3(su3* const vr, const su3adj* const p);
+extern void exposu3_check(su3* const vr, const su3adj* const p, int im);
+extern void restoresu3(su3* const vr, const su3* const u);
+extern void restoresu3_in_place(su3* const u);
+extern void exposu3_in_place(su3* const u);
 
 #endif
diff --git a/tests/test_su3_algebra.c b/tests/test_su3_algebra.c
@@ -251,7 +251,7 @@ TEST(su3_expo_positivedet) {
   U.c22 = -0.5279 -0.0022*I;
 
   _trace_lambda(T,Q);
-  Q = exposu3(T);
+  exposu3(&Q,&T);
 
   if( creal(Q.c00 - U.c00) > EPS && creal(Q.c01 - U.c01) > EPS && creal(Q.c02 - U.c02) > EPS &&
       creal(Q.c10 - U.c10) > EPS && creal(Q.c11 - U.c11) > EPS && creal(Q.c12 - U.c12) > EPS &&   

diff --git a/update_gauge.c b/update_gauge.c
@@ -47,7 +47,11 @@
 
 
 void update_gauge(const double step, hamiltonian_field_t * const hf) {
-
+#ifdef OMP
+#define static
+#pragma omp parallel
+  {
+#endif
   int i,mu;
   static su3 v,w;
   su3 *z;
@@ -57,17 +61,29 @@ void update_gauge(const double step, hamiltonian_field_t * const hf) {
 #pragma pomp inst begin(updategauge)
 #endif
 
+#ifdef OMP
+#undef static
+#endif
+
+#ifdef OMP
+#pragma omp for
+#endif
   for(i = 0; i < VOLUME; i++) { 
     for(mu = 0; mu < 4; mu++){
       /* moment[i][mu] = h_{i,mu}^{alpha} */
       xm = &hf->momenta[i][mu];
       z = &hf->gaugefield[i][mu];
       _su3adj_assign_const_times_su3adj(deriv, step, *xm);
-      v = restoresu3( exposu3(deriv) );
+      exposu3(&w,&deriv);
+      restoresu3(&v,&w);
       _su3_times_su3(w, v, *z);
       _su3_assign(*z, w);
     }
   }
+
+#ifdef OMP
+  } /* OpenMP parallel closing brace */
+#endif
 
 #ifdef MPI
   /* for parallelization */

diff --git a/update_tm.c b/update_tm.c
@@ -332,7 +332,7 @@ int update_tm(double *plaquette_energy, double *rectangle_energy,
       for(int ix=0;ix<VOLUME;ix++) { 
         for(int mu=0;mu<4;mu++) { 
           v=&hf.gaugefield[ix][mu];
-          *v=restoresu3(*v); 
+          restoresu3_in_place(v); 
         }
       }
     }