diff --git a/Makefile b/Makefile index 2e472be2..29308854 100644 --- a/Makefile +++ b/Makefile @@ -11,13 +11,19 @@ HPCG_DEPS = src/ComputeResidual.o \ src/CheckProblem.o \ src/OptimizeProblem.o src/ReadHpcgDat.o src/ReportResults.o \ src/SetupHalo.o src/SetupHalo_ref.o src/TestSymmetry.o src/TestNorms.o src/WriteProblem.o \ - src/YAML_Doc.o src/YAML_Element.o src/ComputeDotProduct.o \ - src/ComputeDotProduct_ref.o src/finalize.o src/init.o src/mytimer.o src/ComputeSPMV.o \ - src/ComputeSPMV_ref.o src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \ - src/ComputeMG_ref.o src/ComputeMG.o src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o \ + src/YAML_Doc.o src/YAML_Element.o \ + src/ComputeDotProduct.o src/ComputeDotProduct_ref.o \ + src/finalize.o src/init.o src/mytimer.o \ + src/ComputeSPMV.o src/ComputeSPMV_ref.o \ + src/ComputeSYMGS.o src/ComputeSYMGS_ref.o \ + src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \ + src/ComputeMG_ref.o src/ComputeMG.o \ + src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o \ src/ComputeOptimalShapeXYZ.o src/MixedBaseCounter.o src/CheckAspectRatio.o src/OutputFile.o \ \ - src/TestGMRES.o src/ComputeTRSM.o src/ComputeGEMV.o \ + src/TestGMRES.o src/ComputeTRSM.o src/ComputeGEMV.o src/ComputeGEMVT.o \ + src/ComputeGEMV.o src/ComputeGEMV_ref.o \ + src/ComputeGEMVT.o src/ComputeGEMVT_ref.o \ src/GMRES.o src/GMRES_IR.o \ src/ComputeGS_Forward.o src/ComputeGS_Forward_ref.o \ src/SetupProblem.o \ @@ -27,9 +33,13 @@ HPCG_DEPS = src/ComputeResidual.o \ bin/xhpgmp: src/main_hpgmp.o $(HPCG_DEPS) $(LINKER) $(LINKFLAGS) src/main_hpgmp.o $(HPCG_DEPS) -o bin/xhpgmp $(HPCG_LIBS) +bin/xhpgmp_time: src/main_time.o $(HPCG_DEPS) + $(LINKER) $(LINKFLAGS) src/main_time.o $(HPCG_DEPS) -o bin/xhpgmp_time $(HPCG_LIBS) + clean: rm -f $(HPCG_DEPS) \ - bin/xhpgmp src/main_hpgmp.o + bin/xhpgmp src/main_hpgmp.o \ + bin/xhpgmp_time src/main_time.o .PHONY: clean diff --git a/Makefile.ext b/Makefile.ext index 3cee767d..7c3b5a04 100644 --- a/Makefile.ext +++ 
b/Makefile.ext @@ -27,6 +27,8 @@ HPCG_DEPS = src/ComputeResidual.o \ src/ComputeOptimalShapeXYZ.o \ src/ComputeSPMV.o \ src/ComputeSPMV_ref.o \ + src/ComputeSYMGS.o \ + src/ComputeSYMGS_ref.o \ src/ComputeWAXPBY.o \ src/ComputeWAXPBY_ref.o \ src/ComputeMG_ref.o \ @@ -45,28 +47,37 @@ HPCG_DEPS = src/ComputeResidual.o \ src/ComputeGS_Forward_ref.o \ src/ComputeTRSM.o \ src/ComputeGEMV.o \ - src/SetupProblem.o \ + src/ComputeGEMV_ref.o \ + src/ComputeGEMVT.o \ + src/ComputeGEMVT_ref.o \ + src/SetupProblem.o \ src/GenerateNonsymProblem.o \ src/GenerateNonsymProblem_v1_ref.o \ src/GenerateNonsymCoarseProblem.o \ # These header files are included in many source files, so we recompile every file if one or more of these header is modified. -PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/CGData.hpp \ - HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/hpcg.hpp +PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/MultiVector.hpp \ + HPCG_SRC_PATH/src/CGData.hpp HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/Hpgmp_Params.hpp -all: bin/xhpgmp +all: bin/xhpgmp bin/xhpgmp_time bin/xhpgmp: src/main_hpgmp.o $(HPCG_DEPS) $(LINKER) $(LINKFLAGS) src/main_hpgmp.o $(HPCG_DEPS) $(HPCG_LIBS) -o bin/xhpgmp +bin/xhpgmp_time: src/main_time.o $(HPCG_DEPS) + $(LINKER) $(LINKFLAGS) src/main_time.o $(HPCG_DEPS) $(HPCG_LIBS) -o bin/xhpgmp_time + clean: - rm -f src/*.o bin/xhpgmp + rm -f src/*.o bin/xhpgmp bin/xhpgmp_time .PHONY: all clean src/main_hpgmp.o: HPCG_SRC_PATH/src/main_hpgmp.cpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/main_time.o: HPCG_SRC_PATH/src/main_time.cpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/ComputeResidual.o: HPCG_SRC_PATH/src/ComputeResidual.cpp HPCG_SRC_PATH/src/ComputeResidual.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< 
-o $@ @@ -139,6 +150,9 @@ src/ComputeSPMV_ref.o: HPCG_SRC_PATH/src/ComputeSPMV_ref.cpp HPCG_SRC_PATH/src/C src/ComputeSYMGS.o: HPCG_SRC_PATH/src/ComputeSYMGS.cpp HPCG_SRC_PATH/src/ComputeSYMGS.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/ComputeSYMGS_ref.o: HPCG_SRC_PATH/src/ComputeSYMGS_ref.cpp HPCG_SRC_PATH/src/ComputeSYMGS_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/ComputeWAXPBY.o: HPCG_SRC_PATH/src/ComputeWAXPBY.cpp HPCG_SRC_PATH/src/ComputeWAXPBY.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ @@ -185,6 +199,15 @@ src/ComputeTRSM.o: HPCG_SRC_PATH/src/ComputeTRSM.cpp HPCG_SRC_PATH/src/ComputeTR src/ComputeGEMV.o: HPCG_SRC_PATH/src/ComputeGEMV.cpp HPCG_SRC_PATH/src/ComputeGEMV.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/ComputeGEMV_ref.o: HPCG_SRC_PATH/src/ComputeGEMV_ref.cpp HPCG_SRC_PATH/src/ComputeGEMV_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + +src/ComputeGEMVT.o: HPCG_SRC_PATH/src/ComputeGEMVT.cpp HPCG_SRC_PATH/src/ComputeGEMVT.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + +src/ComputeGEMVT_ref.o: HPCG_SRC_PATH/src/ComputeGEMVT_ref.cpp HPCG_SRC_PATH/src/ComputeGEMVT_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/SetupProblem.o: HPCG_SRC_PATH/src/SetupProblem.cpp HPCG_SRC_PATH/src/SetupProblem.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ diff --git a/QUICKSTART b/QUICKSTART index 466ffff1..f65db16f 100644 --- a/QUICKSTART +++ b/QUICKSTART @@ -37,6 +37,11 @@ NOTE: The instructions in this file assume you are working with a version export OMP_NUM_THREADS 4 mpiexec -np 64 ./xhpcg +5) To set parameters, hpcg.dat ... First two lines ignored. + Line 3: nx ny nz + Line 4: Time to run the bechmark (seconds) + Line 5: + 5) The benchmark has completed execution. 
This should take a few minutes when running in evaluation mode, and take about 30 minutes in official benchmark mode. If you are running on a production system, you may be able diff --git a/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt b/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt new file mode 100644 index 00000000..2c10174f --- /dev/null +++ b/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt @@ -0,0 +1,112 @@ +HPGMP-Benchmark +version=1.1 +Release date=March 28, 2019 +Machine Summary= +Machine Summary::Distributed Processes=1 +Machine Summary::Threads per processes=1 +Global Problem Dimensions= +Global Problem Dimensions::Global nx=16 +Global Problem Dimensions::Global ny=16 +Global Problem Dimensions::Global nz=16 +Processor Dimensions= +Processor Dimensions::npx=1 +Processor Dimensions::npy=1 +Processor Dimensions::npz=1 +Local Domain Dimensions= +Local Domain Dimensions::nx=16 +Local Domain Dimensions::ny=16 +Local Domain Dimensions::Lower ipz=0 +Local Domain Dimensions::Upper ipz=0 +Local Domain Dimensions::nz=16 +########## Problem Summary ##########= +Setup Information= +Setup Information::Setup Time=0.002537 +Linear System Information= +Linear System Information::Number of Equations=4096 +Linear System Information::Number of Nonzero Terms=97336 +Multigrid Information= +Multigrid Information::Number of coarse grid levels=3 +Multigrid Information::Coarse Grids= +Multigrid Information::Coarse Grids::Grid Level=1 +Multigrid Information::Coarse Grids::Number of Equations=512 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=10648 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse Grids::Number of Postsmoother Steps=1 +Multigrid Information::Coarse Grids::Grid Level=2 +Multigrid Information::Coarse Grids::Number of Equations=64 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=1000 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse 
Grids::Number of Postsmoother Steps=1 +Multigrid Information::Coarse Grids::Grid Level=3 +Multigrid Information::Coarse Grids::Number of Equations=8 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=64 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse Grids::Number of Postsmoother Steps=1 +########## Memory Use Summary ##########= +Memory Use Information= +Memory Use Information::Total memory used for data (Gbytes)=0.00292882 +Memory Use Information::Memory used for OptimizeProblem data (Gbytes)=0 +Memory Use Information::Bytes per equation (Total memory / Number of Equations)=715.045 +Memory Use Information::Memory used for linear system and CG (Gbytes)=0.00257652 +Memory Use Information::Coarse Grids= +Memory Use Information::Coarse Grids::Grid Level=1 +Memory Use Information::Coarse Grids::Memory used=0.000308152 +Memory Use Information::Coarse Grids::Grid Level=2 +Memory Use Information::Coarse Grids::Memory used=3.8904e-05 +Memory Use Information::Coarse Grids::Grid Level=3 +Memory Use Information::Coarse Grids::Memory used=5.248e-06 +########## V&V Testing Summary ##########= +Spectral Convergence Tests= +Spectral Convergence Tests::Result=FAILED +Spectral Convergence Tests::Unpreconditioned= +Spectral Convergence Tests::Unpreconditioned::Maximum iteration count=21 +Spectral Convergence Tests::Unpreconditioned::Expected iteration count=12 +Spectral Convergence Tests::Preconditioned= +Spectral Convergence Tests::Preconditioned::Maximum iteration count=3 +Spectral Convergence Tests::Preconditioned::Expected iteration count=2 +########## Iterations Summary ##########= +Iteration Count Information= +Iteration Count Information::Result=PASSED +Iteration Count Information::Reference CG iterations per set=50 +Iteration Count Information::Optimized CG iterations per set=500 +Iteration Count Information::Total number of reference iterations=50 +Iteration Count Information::Total number of optimized 
iterations=500 +########## Reproducibility Summary ##########= +Reproducibility Information= +Reproducibility Information::Result=FAILED +Reproducibility Information::Scaled residual mean=2.122e-314 +Reproducibility Information::Scaled residual variance=2.122e-314 +########## Performance Summary (times in sec) ##########= +Benchmark Time Summary= +Benchmark Time Summary::Optimization phase=0 +Benchmark Time Summary::DDOT=0 +Benchmark Time Summary::WAXPBY=0 +Benchmark Time Summary::SpMV=0 +Benchmark Time Summary::MG=0 +Benchmark Time Summary::Total=0 +Floating Point Operations Summary= +Floating Point Operations Summary::Raw DDOT=1.22962e+07 +Floating Point Operations Summary::Raw WAXPBY=1.22962e+07 +Floating Point Operations Summary::Raw SpMV=9.75307e+07 +Floating Point Operations Summary::Raw MG=5.45048e+08 +Floating Point Operations Summary::Total=6.67171e+08 +Floating Point Operations Summary::Total with convergence overhead=6.67171e+07 +GB/s Summary= +GB/s Summary::Raw Read B/W=inf +GB/s Summary::Raw Write B/W=inf +GB/s Summary::Raw Total B/W=inf +GB/s Summary::Total with convergence and optimization phase overhead=2002.73 +GFLOP/s Summary= +GFLOP/s Summary::Raw DDOT=inf +GFLOP/s Summary::Raw WAXPBY=inf +GFLOP/s Summary::Raw SpMV=inf +GFLOP/s Summary::Raw MG=inf +GFLOP/s Summary::Raw Total=inf +GFLOP/s Summary::Total with convergence overhead=inf +GFLOP/s Summary::Total with convergence and optimization phase overhead=262.976 +User Optimization Overheads= +User Optimization Overheads::Optimization phase time (sec)=0 +User Optimization Overheads::Optimization phase time vs reference SpMV+MG time=nan +Final Summary= +Final Summary::HPCG result is=INVALID. +Final Summary::Please review the YAML file contents=You may NOT submit these results for consideration. 
diff --git a/bin/hpcg.dat b/bin/hpcg.dat deleted file mode 100644 index 0e4508e1..00000000 --- a/bin/hpcg.dat +++ /dev/null @@ -1,4 +0,0 @@ -HPCG benchmark input file -Sandia National Laboratories; University of Tennessee, Knoxville -104 104 104 -60 diff --git a/bin/hpgmp20220307T160457.txt b/bin/hpgmp20220307T160457.txt new file mode 100644 index 00000000..dbc9c791 --- /dev/null +++ b/bin/hpgmp20220307T160457.txt @@ -0,0 +1,287 @@ + Setup Time 0.002537 seconds. + Optimize Time 0 seconds. + + Running GMRES(50) with max-iters = 50 and tol = 0 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 368.706, 1 +GMRES Iteration = 1 (0) Scaled Residual = 80.0312 / 368.706 = 0.21706 +GMRES Iteration = 2 (1) Scaled Residual = 39.0951 / 368.706 = 0.106033 +GMRES Iteration = 3 (2) Scaled Residual = 24.0276 / 368.706 = 0.0651674 +GMRES Iteration = 4 (3) Scaled Residual = 15.5707 / 368.706 = 0.0422307 +GMRES Iteration = 5 (4) Scaled Residual = 8.56239 / 368.706 = 0.0232228 +GMRES Iteration = 6 (5) Scaled Residual = 3.9145 / 368.706 = 0.0106169 +GMRES Iteration = 7 (6) Scaled Residual = 1.88736 / 368.706 = 0.00511888 +GMRES Iteration = 8 (7) Scaled Residual = 1.00341 / 368.706 = 0.00272143 +GMRES Iteration = 9 (8) Scaled Residual = 0.526511 / 368.706 = 0.001428 +GMRES Iteration = 10 (9) Scaled Residual = 0.181835 / 368.706 = 0.00049317 +GMRES Iteration = 11 (10) Scaled Residual = 0.0469161 / 368.706 = 0.000127245 +GMRES Iteration = 12 (11) Scaled Residual = 0.0142933 / 368.706 = 3.87662e-05 +GMRES Iteration = 13 (12) Scaled Residual = 0.0029118 / 368.706 = 7.89736e-06 +GMRES Iteration = 14 (13) Scaled Residual = 0.000913253 / 368.706 = 2.47691e-06 +GMRES Iteration = 15 (14) Scaled Residual = 0.000297245 / 368.706 = 8.06186e-07 +GMRES Iteration = 16 (15) Scaled Residual = 6.74022e-05 / 368.706 = 1.82807e-07 +GMRES Iteration = 17 (16) Scaled Residual = 1.14551e-05 / 368.706 = 3.10683e-08 +GMRES Iteration = 18 (17) Scaled Residual = 
2.53274e-06 / 368.706 = 6.86927e-09 +GMRES Iteration = 19 (18) Scaled Residual = 1.26028e-06 / 368.706 = 3.41811e-09 +GMRES Iteration = 20 (19) Scaled Residual = 3.7527e-07 / 368.706 = 1.0178e-09 +GMRES Iteration = 21 (20) Scaled Residual = 7.73786e-08 / 368.706 = 2.09866e-10 +GMRES Iteration = 22 (21) Scaled Residual = 2.13376e-08 / 368.706 = 5.78715e-11 +GMRES Iteration = 23 (22) Scaled Residual = 4.23387e-09 / 368.706 = 1.14831e-11 +GMRES Iteration = 24 (23) Scaled Residual = 9.2695e-10 / 368.706 = 2.51406e-12 +GMRES Iteration = 25 (24) Scaled Residual = 1.93412e-10 / 368.706 = 5.24569e-13 +GMRES Iteration = 26 (25) Scaled Residual = 2.89247e-11 / 368.706 = 7.84491e-14 +GMRES Iteration = 27 (26) Scaled Residual = 9.95438e-12 / 368.706 = 2.69982e-14 +GMRES Iteration = 28 (27) Scaled Residual = 2.41354e-12 / 368.706 = 6.54599e-15 +GMRES Iteration = 29 (28) Scaled Residual = 1.11778e-12 / 368.706 = 3.03162e-15 +GMRES Iteration = 30 (29) Scaled Residual = 4.0636e-13 / 368.706 = 1.10212e-15 +GMRES Iteration = 31 (30) Scaled Residual = 1.10214e-13 / 368.706 = 2.98921e-16 +GMRES Iteration = 32 (31) Scaled Residual = 4.95379e-14 / 368.706 = 1.34356e-16 +GMRES Iteration = 33 (32) Scaled Residual = 1.03209e-14 / 368.706 = 2.79921e-17 +GMRES Iteration = 34 (33) Scaled Residual = 2.6522e-15 / 368.706 = 7.19326e-18 +GMRES Iteration = 35 (34) Scaled Residual = 5.91396e-16 / 368.706 = 1.60398e-18 +GMRES Iteration = 36 (35) Scaled Residual = 6.94601e-17 / 368.706 = 1.88389e-19 +GMRES Iteration = 37 (36) Scaled Residual = 8.83338e-18 / 368.706 = 2.39578e-20 +GMRES Iteration = 38 (37) Scaled Residual = 1.02085e-18 / 368.706 = 2.76873e-21 +GMRES Iteration = 39 (38) Scaled Residual = 1.93548e-19 / 368.706 = 5.24938e-22 +GMRES Iteration = 40 (39) Scaled Residual = 7.02131e-20 / 368.706 = 1.90431e-22 +GMRES Iteration = 41 (40) Scaled Residual = 1.56706e-20 / 368.706 = 4.25017e-23 +GMRES Iteration = 42 (41) Scaled Residual = 2.64481e-21 / 368.706 = 7.17322e-24 +GMRES Iteration = 43 
(42) Scaled Residual = 4.96943e-22 / 368.706 = 1.3478e-24 +GMRES Iteration = 44 (43) Scaled Residual = 1.01574e-22 / 368.706 = 2.75487e-25 +GMRES Iteration = 45 (44) Scaled Residual = 3.17017e-23 / 368.706 = 8.5981e-26 +GMRES Iteration = 46 (45) Scaled Residual = 4.14824e-24 / 368.706 = 1.12508e-26 +GMRES Iteration = 47 (46) Scaled Residual = 1.0295e-24 / 368.706 = 2.79219e-27 +GMRES Iteration = 48 (47) Scaled Residual = 1.77179e-25 / 368.706 = 4.80544e-28 +GMRES Iteration = 49 (48) Scaled Residual = 3.76859e-26 / 368.706 = 1.02211e-28 +GMRES Iteration = 50 (49) Scaled Residual = 6.42962e-27 / 368.706 = 1.74383e-29 +GMRES restart: k = 51 (50) +GMRES Residual at the start of restart cycle = 4.01002e-13, 1.08759e-15 +GMRES Iteration = 1 (50) Scaled Residual = 1.69505e-13 / 368.706 = 4.59729e-16 +GMRES Iteration = 2 (51) Scaled Residual = 1.02679e-13 / 368.706 = 2.78485e-16 +GMRES Iteration = 3 (52) Scaled Residual = 4.90765e-14 / 368.706 = 1.33105e-16 +GMRES Iteration = 4 (53) Scaled Residual = 2.11347e-14 / 368.706 = 5.73214e-17 +GMRES Iteration = 5 (54) Scaled Residual = 8.10829e-15 / 368.706 = 2.19912e-17 +GMRES Iteration = 6 (55) Scaled Residual = 3.07713e-15 / 368.706 = 8.34575e-18 +GMRES Iteration = 7 (56) Scaled Residual = 1.41193e-15 / 368.706 = 3.82943e-18 +GMRES Iteration = 8 (57) Scaled Residual = 6.96752e-16 / 368.706 = 1.88972e-18 +GMRES Iteration = 9 (58) Scaled Residual = 3.4625e-16 / 368.706 = 9.39097e-19 +GMRES Iteration = 10 (59) Scaled Residual = 9.42334e-17 / 368.706 = 2.55579e-19 +GMRES Iteration = 11 (60) Scaled Residual = 3.61901e-17 / 368.706 = 9.81543e-20 +GMRES Iteration = 12 (61) Scaled Residual = 9.13127e-18 / 368.706 = 2.47657e-20 +GMRES Iteration = 13 (62) Scaled Residual = 2.59868e-18 / 368.706 = 7.04812e-21 +GMRES Iteration = 14 (63) Scaled Residual = 8.61639e-19 / 368.706 = 2.33693e-21 +GMRES Iteration = 15 (64) Scaled Residual = 1.84923e-19 / 368.706 = 5.01546e-22 +GMRES Iteration = 16 (65) Scaled Residual = 3.68866e-20 / 368.706 = 
1.00043e-22 +GMRES Iteration = 17 (66) Scaled Residual = 6.63075e-21 / 368.706 = 1.79839e-23 +GMRES Iteration = 18 (67) Scaled Residual = 1.22725e-21 / 368.706 = 3.32854e-24 +GMRES Iteration = 19 (68) Scaled Residual = 5.79654e-22 / 368.706 = 1.57213e-24 +GMRES Iteration = 20 (69) Scaled Residual = 2.38304e-22 / 368.706 = 6.46324e-25 +GMRES Iteration = 21 (70) Scaled Residual = 4.59033e-23 / 368.706 = 1.24499e-25 +GMRES Iteration = 22 (71) Scaled Residual = 1.1072e-23 / 368.706 = 3.00293e-26 +GMRES Iteration = 23 (72) Scaled Residual = 3.17896e-24 / 368.706 = 8.62193e-27 +GMRES Iteration = 24 (73) Scaled Residual = 6.01624e-25 / 368.706 = 1.63172e-27 +GMRES Iteration = 25 (74) Scaled Residual = 1.27563e-25 / 368.706 = 3.45974e-28 +GMRES Iteration = 26 (75) Scaled Residual = 3.54711e-26 / 368.706 = 9.62044e-29 +GMRES Iteration = 27 (76) Scaled Residual = 1.0024e-26 / 368.706 = 2.71871e-29 +GMRES Iteration = 28 (77) Scaled Residual = 1.95945e-27 / 368.706 = 5.3144e-30 +GMRES Iteration = 29 (78) Scaled Residual = 6.72153e-28 / 368.706 = 1.82301e-30 +GMRES Iteration = 30 (79) Scaled Residual = 1.96036e-28 / 368.706 = 5.31687e-31 +GMRES Iteration = 31 (80) Scaled Residual = 4.87233e-29 / 368.706 = 1.32147e-31 +GMRES Iteration = 32 (81) Scaled Residual = 2.54153e-29 / 368.706 = 6.89312e-32 +GMRES Iteration = 33 (82) Scaled Residual = 6.76297e-30 / 368.706 = 1.83424e-32 +GMRES Iteration = 34 (83) Scaled Residual = 1.25642e-30 / 368.706 = 3.40766e-33 +GMRES Iteration = 35 (84) Scaled Residual = 3.06865e-31 / 368.706 = 8.32275e-34 +GMRES Iteration = 36 (85) Scaled Residual = 3.41268e-32 / 368.706 = 9.25582e-35 +GMRES Iteration = 37 (86) Scaled Residual = 4.67277e-33 / 368.706 = 1.26734e-35 +GMRES Iteration = 38 (87) Scaled Residual = 7.10258e-34 / 368.706 = 1.92635e-36 +GMRES Iteration = 39 (88) Scaled Residual = 2.18752e-34 / 368.706 = 5.93297e-37 +GMRES Iteration = 40 (89) Scaled Residual = 6.13372e-35 / 368.706 = 1.66358e-37 +GMRES Iteration = 41 (90) Scaled Residual = 
9.78421e-36 / 368.706 = 2.65366e-38 +GMRES Iteration = 42 (91) Scaled Residual = 2.1363e-36 / 368.706 = 5.79405e-39 +GMRES Iteration = 43 (92) Scaled Residual = 4.43795e-37 / 368.706 = 1.20366e-39 +GMRES Iteration = 44 (93) Scaled Residual = 1.31239e-37 / 368.706 = 3.55946e-40 +GMRES Iteration = 45 (94) Scaled Residual = 2.68152e-38 / 368.706 = 7.2728e-41 +GMRES Iteration = 46 (95) Scaled Residual = 2.66743e-39 / 368.706 = 7.23457e-42 +GMRES Iteration = 47 (96) Scaled Residual = 6.82831e-40 / 368.706 = 1.85197e-42 +GMRES Iteration = 48 (97) Scaled Residual = 1.30438e-40 / 368.706 = 3.53773e-43 +GMRES Iteration = 49 (98) Scaled Residual = 3.0921e-41 / 368.706 = 8.38636e-44 +GMRES Iteration = 50 (99) Scaled Residual = 3.49219e-42 / 368.706 = 9.47148e-45 +GMRES restart: k = 51 (100) + Setup Time 0.08111 seconds. + Optimize Time 0 seconds. + + ** applying diagonal exaggeration ** + +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 3.26848e+08 / 4.69816e+08 = 0.695694 +GMRES Iteration = 2 (1) Scaled Residual = 2.07945e+08 / 4.69816e+08 = 0.44261 +GMRES Iteration = 3 (2) Scaled Residual = 7.8398e+07 / 4.69816e+08 = 0.16687 +GMRES Iteration = 4 (3) Scaled Residual = 2.80854e+07 / 4.69816e+08 = 0.0597796 +GMRES Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238609 +GMRES Iteration = 6 (5) Scaled Residual = 4.66992e+06 / 4.69816e+08 = 0.00993989 +GMRES Iteration = 7 (6) Scaled Residual = 1.84189e+06 / 4.69816e+08 = 0.00392045 +GMRES Iteration = 8 (7) Scaled Residual = 630885 / 4.69816e+08 = 0.00134283 +GMRES Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES Iteration = 10 (9) Scaled Residual = 12.3098 / 4.69816e+08 = 2.62013e-08 +GMRES Iteration = 11 (10) Scaled Residual = 4.18333e-06 / 4.69816e+08 = 
8.90418e-15 +GMRES restart: k = 12 (11) +GMRES Residual at the start of restart cycle = 4.28567e-06, 9.12202e-15 + > GMRES converged +GMRES restart: k = 1 (11) +Calling GMRES (all double) for testing: +Call [0] Number of GMRES Iterations [11] Scaled Residual [9.12202e-15] + Expected 12 iterations. Performed 11. + Time 0.002079 seconds. + Gflop/s 0.00334584/0.002079 = 1.60935 (n = 4096) +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 3.26848e+08 / 4.69816e+08 = 0.695694 +GMRES Iteration = 2 (1) Scaled Residual = 2.07945e+08 / 4.69816e+08 = 0.44261 +GMRES Iteration = 3 (2) Scaled Residual = 7.8398e+07 / 4.69816e+08 = 0.16687 +GMRES Iteration = 4 (3) Scaled Residual = 2.80854e+07 / 4.69816e+08 = 0.0597796 +GMRES Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238609 +GMRES Iteration = 6 (5) Scaled Residual = 4.66992e+06 / 4.69816e+08 = 0.00993989 +GMRES Iteration = 7 (6) Scaled Residual = 1.84189e+06 / 4.69816e+08 = 0.00392045 +GMRES Iteration = 8 (7) Scaled Residual = 630885 / 4.69816e+08 = 0.00134283 +GMRES Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES Iteration = 10 (9) Scaled Residual = 12.3098 / 4.69816e+08 = 2.62013e-08 +GMRES Iteration = 11 (10) Scaled Residual = 4.18333e-06 / 4.69816e+08 = 8.90418e-15 +GMRES restart: k = 12 (11) +GMRES Residual at the start of restart cycle = 4.28567e-06, 9.12202e-15 + > GMRES converged +GMRES restart: k = 1 (11) +Calling GMRES (all double) for testing: +Call [1] Number of GMRES Iterations [11] Scaled Residual [9.12202e-15] + Expected 12 iterations. Performed 11. + Time 0.002014 seconds. 
+ Gflop/s 0.00334584/0.002014 = 1.66129 (n = 4096) + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 12.7853 / 4.69816e+08 = 2.72134e-08 +GMRES Iteration = 2 (1) Scaled Residual = 7.74604e-07 / 4.69816e+08 = 1.64874e-15 +GMRES restart: k = 3 (2) +GMRES Residual at the start of restart cycle = 7.86762e-07, 1.67462e-15 + > GMRES converged +GMRES restart: k = 1 (2) +Calling GMRES (all double) for testing: +Call [0] Number of GMRES Iterations [2] Scaled Residual [1.67462e-15] + Expected 2 iterations. Performed 2. + Time 0.001602 seconds. + Gflop/s 0.00256032/0.001602 = 1.5982 (n = 4096) + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 12.7853 / 4.69816e+08 = 2.72134e-08 +GMRES Iteration = 2 (1) Scaled Residual = 7.74604e-07 / 4.69816e+08 = 1.64874e-15 +GMRES restart: k = 3 (2) +GMRES Residual at the start of restart cycle = 7.86762e-07, 1.67462e-15 + > GMRES converged +GMRES restart: k = 1 (2) +Calling GMRES (all double) for testing: +Call [1] Number of GMRES Iterations [2] Scaled Residual [1.67462e-15] + Expected 2 iterations. Performed 2. + Time 0.001724 seconds. 
+ Gflop/s 0.00256032/0.001724 = 1.4851 (n = 4096) +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 3.26845e+08 / 4.69816e+08 = 0.695688 +GMRES_IR Iteration = 2 (1) Scaled Residual = 2.07943e+08 / 4.69816e+08 = 0.442606 +GMRES_IR Iteration = 3 (2) Scaled Residual = 7.83975e+07 / 4.69816e+08 = 0.166868 +GMRES_IR Iteration = 4 (3) Scaled Residual = 2.80855e+07 / 4.69816e+08 = 0.0597798 +GMRES_IR Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238608 +GMRES_IR Iteration = 6 (5) Scaled Residual = 4.66999e+06 / 4.69816e+08 = 0.00994004 +GMRES_IR Iteration = 7 (6) Scaled Residual = 1.84191e+06 / 4.69816e+08 = 0.0039205 +GMRES_IR Iteration = 8 (7) Scaled Residual = 630883 / 4.69816e+08 = 0.00134283 +GMRES_IR Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES_IR Iteration = 10 (9) Scaled Residual = 12.5592 / 4.69816e+08 = 2.67322e-08 +GMRES_IR Iteration = 11 (10) Scaled Residual = 4.3181e-06 / 4.69816e+08 = 9.19105e-15 +GMRES_IR restart: k = 12 (11) +GMRES_IR Residual at the start of restart cycle = 183.723, 3.91054e-07 +GMRES_IR Iteration = 1 (11) Scaled Residual = 66.9452 / 4.69816e+08 = 1.42492e-07 +GMRES_IR Iteration = 2 (12) Scaled Residual = 47.8823 / 4.69816e+08 = 1.01917e-07 +GMRES_IR Iteration = 3 (13) Scaled Residual = 21.8527 / 4.69816e+08 = 4.65133e-08 +GMRES_IR Iteration = 4 (14) Scaled Residual = 8.31923 / 4.69816e+08 = 1.77074e-08 +GMRES_IR Iteration = 5 (15) Scaled Residual = 3.51961 / 4.69816e+08 = 7.49146e-09 +GMRES_IR Iteration = 6 (16) Scaled Residual = 1.49216 / 4.69816e+08 = 3.17605e-09 +GMRES_IR Iteration = 7 (17) Scaled Residual = 0.398246 / 4.69816e+08 = 8.47663e-10 +GMRES_IR Iteration = 8 (18) Scaled Residual = 0.127894 / 4.69816e+08 = 2.72222e-10 +GMRES_IR Iteration = 9 (19) Scaled Residual = 
0.0255666 / 4.69816e+08 = 5.44183e-11 +GMRES_IR Iteration = 10 (20) Scaled Residual = 2.47246e-06 / 4.69816e+08 = 5.26262e-15 +GMRES_IR restart: k = 11 (21) +GMRES_IR Residual at the start of restart cycle = 2.2783e-05, 4.84934e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (21) +Call [0] Number of GMRES-IR Iterations [21] Scaled Residual [4.84934e-14] + Expected 12 iterations. Performed 21. + Time 0.005603 seconds. +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 3.26845e+08 / 4.69816e+08 = 0.695688 +GMRES_IR Iteration = 2 (1) Scaled Residual = 2.07943e+08 / 4.69816e+08 = 0.442606 +GMRES_IR Iteration = 3 (2) Scaled Residual = 7.83975e+07 / 4.69816e+08 = 0.166868 +GMRES_IR Iteration = 4 (3) Scaled Residual = 2.80855e+07 / 4.69816e+08 = 0.0597798 +GMRES_IR Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238608 +GMRES_IR Iteration = 6 (5) Scaled Residual = 4.66999e+06 / 4.69816e+08 = 0.00994004 +GMRES_IR Iteration = 7 (6) Scaled Residual = 1.84191e+06 / 4.69816e+08 = 0.0039205 +GMRES_IR Iteration = 8 (7) Scaled Residual = 630883 / 4.69816e+08 = 0.00134283 +GMRES_IR Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES_IR Iteration = 10 (9) Scaled Residual = 12.5592 / 4.69816e+08 = 2.67322e-08 +GMRES_IR Iteration = 11 (10) Scaled Residual = 4.3181e-06 / 4.69816e+08 = 9.19105e-15 +GMRES_IR restart: k = 12 (11) +GMRES_IR Residual at the start of restart cycle = 183.723, 3.91054e-07 +GMRES_IR Iteration = 1 (11) Scaled Residual = 66.9452 / 4.69816e+08 = 1.42492e-07 +GMRES_IR Iteration = 2 (12) Scaled Residual = 47.8823 / 4.69816e+08 = 1.01917e-07 +GMRES_IR Iteration = 3 (13) Scaled Residual = 21.8527 / 4.69816e+08 = 4.65133e-08 +GMRES_IR Iteration = 4 (14) Scaled Residual = 8.31923 / 4.69816e+08 = 1.77074e-08 +GMRES_IR 
Iteration = 5 (15) Scaled Residual = 3.51961 / 4.69816e+08 = 7.49146e-09 +GMRES_IR Iteration = 6 (16) Scaled Residual = 1.49216 / 4.69816e+08 = 3.17605e-09 +GMRES_IR Iteration = 7 (17) Scaled Residual = 0.398246 / 4.69816e+08 = 8.47663e-10 +GMRES_IR Iteration = 8 (18) Scaled Residual = 0.127894 / 4.69816e+08 = 2.72222e-10 +GMRES_IR Iteration = 9 (19) Scaled Residual = 0.0255666 / 4.69816e+08 = 5.44183e-11 +GMRES_IR Iteration = 10 (20) Scaled Residual = 2.47246e-06 / 4.69816e+08 = 5.26262e-15 +GMRES_IR restart: k = 11 (21) +GMRES_IR Residual at the start of restart cycle = 2.2783e-05, 4.84934e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (21) +Call [1] Number of GMRES-IR Iterations [21] Scaled Residual [4.84934e-14] + Expected 12 iterations. Performed 21. + Time 0.005244 seconds. + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 35.18 / 4.69816e+08 = 7.48803e-08 +GMRES_IR Iteration = 2 (1) Scaled Residual = 0.00035599 / 4.69816e+08 = 7.57722e-13 +GMRES_IR restart: k = 3 (2) +GMRES_IR Residual at the start of restart cycle = 61.156, 1.3017e-07 +GMRES_IR Iteration = 1 (2) Scaled Residual = 4.60599e-06 / 4.69816e+08 = 9.80382e-15 +GMRES_IR restart: k = 2 (3) +GMRES_IR Residual at the start of restart cycle = 6.01636e-06, 1.28058e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (3) +Call [0] Number of GMRES-IR Iterations [3] Scaled Residual [1.28058e-14] + Expected 2 iterations. Performed 3. + Time 0.002948 seconds. 
+ + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 35.18 / 4.69816e+08 = 7.48803e-08 +GMRES_IR Iteration = 2 (1) Scaled Residual = 0.00035599 / 4.69816e+08 = 7.57722e-13 +GMRES_IR restart: k = 3 (2) +GMRES_IR Residual at the start of restart cycle = 61.156, 1.3017e-07 +GMRES_IR Iteration = 1 (2) Scaled Residual = 4.60599e-06 / 4.69816e+08 = 9.80382e-15 +GMRES_IR restart: k = 2 (3) +GMRES_IR Residual at the start of restart cycle = 6.01636e-06, 1.28058e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (3) +Call [1] Number of GMRES-IR Iterations [3] Scaled Residual [1.28058e-14] + Expected 2 iterations. Performed 3. + Time 0.002821 seconds. diff --git a/bin/xhpgmp b/bin/xhpgmp new file mode 100755 index 00000000..6723845f Binary files /dev/null and b/bin/xhpgmp differ diff --git a/src/CGData.hpp b/src/CGData.hpp index 98841c47..aa5357e1 100644 --- a/src/CGData.hpp +++ b/src/CGData.hpp @@ -63,6 +63,7 @@ inline void DeleteCGData(CGData_type & data) { DeleteVector (data.r); DeleteVector (data.z); DeleteVector (data.p); + DeleteVector (data.w); DeleteVector (data.Ap); return; } diff --git a/src/CheckAspectRatio.cpp b/src/CheckAspectRatio.cpp index d625148d..e454e53f 100644 --- a/src/CheckAspectRatio.cpp +++ b/src/CheckAspectRatio.cpp @@ -23,7 +23,7 @@ #include #endif -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "CheckAspectRatio.hpp" diff --git a/src/CheckProblem.cpp b/src/CheckProblem.cpp index 19419a0c..f153d881 100644 --- a/src/CheckProblem.cpp +++ b/src/CheckProblem.cpp @@ -29,7 +29,7 @@ #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include diff --git a/src/ComputeDotProduct_ref.cpp b/src/ComputeDotProduct_ref.cpp index 7daf8a5c..922223de 100644 --- a/src/ComputeDotProduct_ref.cpp +++ 
b/src/ComputeDotProduct_ref.cpp @@ -24,11 +24,23 @@ #include "Utils_MPI.hpp" #endif #ifndef HPCG_NO_OPENMP -#include + #include #endif +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include #include "ComputeDotProduct_ref.hpp" +#ifdef HPCG_DEBUG +#include "Hpgmp_Params.hpp" +#endif + /*! Routine to compute the dot product of two vectors where: @@ -51,33 +63,86 @@ int ComputeDotProduct_ref(const local_int_t n, const Vector_type & x, const Vect assert(y.localLength>=n); typedef typename Vector_type::scalar_type scalar_type; -#ifndef HPCG_NO_MPI - MPI_Datatype MPI_SCALAR_TYPE = MpiTypeTraits::getType (); -#endif - scalar_type local_result (0.0); + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) scalar_type * xv = x.values; scalar_type * yv = y.values; if (yv==xv) { -#ifndef HPCG_NO_OPENMP + #ifndef HPCG_NO_OPENMP #pragma omp parallel for reduction (+:local_result) -#endif + #endif for (local_int_t i=0; i::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDdot (handle, n, (double*)d_x, 1, (double*)d_y, 1, (double*)&local_result)) { + printf( " Failed cublasDdot\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSdot (handle, n, (float*)d_x, 1, (float*)d_y, 1, (float*)&local_result)) { + printf( " Failed cublasSdot\n" ); + } + } + #elif defined(HPCG_WITH_HIP) + // Compute dot on AMD GPU + rocblas_handle handle = x.handle; + #if 1 // TODO remove this + if (hipSuccess != hipMemcpy(d_x, xv, sizeof(scalar_type) * n, hipMemcpyHostToDevice)) { + printf( " Failed hipMemcpy d_x\n" ); + } + if (hipSuccess != hipMemcpy(d_y, yv, sizeof(scalar_type) * n, hipMemcpyHostToDevice)) { + printf( " Failed hipMemcpy d_y\n" ); + } + #endif + if (std::is_same::value) { + if (rocblas_status_success != rocblas_ddot (handle, n, (double*)d_x, 1, (double*)d_y, 1, (double*)&local_result)) { + printf( " Failed rocblas_ddot\n" ); + } + } else if (std::is_same::value) { + if (rocblas_status_success != 
rocblas_sdot (handle, n, (float*)d_x, 1, (float*)d_y, 1, (float*)&local_result)) { + printf( " Failed rocblas_sdot\n" ); + } + } + #endif +#endif #ifndef HPCG_NO_MPI // Use MPI's reduce function to collect all partial sums + MPI_Datatype MPI_SCALAR_TYPE = MpiTypeTraits::getType (); double t0 = mytimer(); scalar_type global_result (0.0); MPI_Allreduce(&local_result, &global_result, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); result = global_result; time_allreduce += mytimer() - t0; + + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + scalar_type global_tmp (0.0); + MPI_Allreduce(&local_tmp, &global_tmp, 1, MPI_SCALAR_TYPE, MPI_SUM, + MPI_COMM_WORLD); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : DotProduct(" << n << "): error = " << global_tmp-global_result << " (dot=" << global_result << ")" << std::endl; + } + #endif #else time_allreduce += 0.0; result = local_result; diff --git a/src/ComputeGEMV.cpp b/src/ComputeGEMV.cpp index 6cfaaca1..b7a190e3 100644 --- a/src/ComputeGEMV.cpp +++ b/src/ComputeGEMV.cpp @@ -19,44 +19,17 @@ */ #include "ComputeGEMV.hpp" +#include "ComputeGEMV_ref.hpp" template int ComputeGEMV(const local_int_t m, const local_int_t n, const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, - const typename Vector_type::scalar_type beta, const Vector_type & y) { + const typename Vector_type::scalar_type beta, const Vector_type & y, + bool & isOptimized) { - typedef typename MultiVector_type::scalar_type scalarA_type; - typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; - typedef typename Vector_type::scalar_type scalarY_type; - - const scalarA_type one (1.0); - const scalarA_type zero (0.0); - - assert(x.m >= n); // Test vector lengths - assert(x.n == 1); - assert(y.localLength >= m); - - const scalarA_type * const Av = A.values; - const scalarX_type * const xv = x.values; - scalarY_type * yv = y.values; - if (beta 
== zero) { - for (local_int_t i = 0; i < m; i++) yv[i] = zero; - } else if (beta != one) { - for (local_int_t i = 0; i < m; i++) yv[i] *= beta; - } - - if (alpha == one) { - for (local_int_t i=0; i, Vector, SerialDenseMatrix > - (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&, bool&); template int ComputeGEMV< MultiVector, Vector, SerialDenseMatrix > - (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&); + (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&, bool&); // mixed template int ComputeGEMV< MultiVector, Vector, SerialDenseMatrix > - (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&, bool&); diff --git a/src/ComputeGEMV.hpp b/src/ComputeGEMV.hpp index 0e972771..393219fc 100644 --- a/src/ComputeGEMV.hpp +++ b/src/ComputeGEMV.hpp @@ -29,6 +29,7 @@ template int ComputeGEMV(const local_int_t m, const local_int_t n, const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, - const typename Vector_type::scalar_type beta, const Vector_type & y); + const typename Vector_type::scalar_type beta, const Vector_type & y, + bool & isOptimized); #endif // COMPUTE_GEMV diff --git a/src/ComputeGEMVT.cpp b/src/ComputeGEMVT.cpp new file mode 100644 index 00000000..6f456545 --- /dev/null +++ b/src/ComputeGEMVT.cpp @@ -0,0 +1,48 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT.cpp + + Routine to compute the GEMV of transpose of a matrix and a vector. + */ +#include "ComputeGEMVT.hpp" +#include "ComputeGEMVT_ref.hpp" + +template +int ComputeGEMVT(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y, + bool & isOptimized) { + + // This line and the next two lines should be removed and your version of ComputeGEMV should be used. + isOptimized = false; + return ComputeGEMVT_ref(m, n, alpha, A, x, beta, y); +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +int ComputeGEMVT< MultiVector, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, Vector const&, double, SerialDenseMatrix const&, bool&); + +template +int ComputeGEMVT< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, Vector const&, float, SerialDenseMatrix const&, bool&); + + diff --git a/src/ComputeGEMVT.hpp b/src/ComputeGEMVT.hpp new file mode 100644 index 00000000..e718942b --- /dev/null +++ b/src/ComputeGEMVT.hpp @@ -0,0 +1,35 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMVT_HPP +#define COMPUTE_GEMVT_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMVT(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y, + bool & isOptimized); + +#endif // COMPUTE_GEMVT diff --git a/src/ComputeGEMVT_ref.cpp b/src/ComputeGEMVT_ref.cpp new file mode 100644 index 00000000..153d9264 --- /dev/null +++ b/src/ComputeGEMVT_ref.cpp @@ -0,0 +1,153 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ +#ifndef HPCG_NO_MPI + #include "Utils_MPI.hpp" +#endif +#include "ComputeGEMVT_ref.hpp" + +template +int ComputeGEMVT_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y) { + + typedef typename MultiVector_type::scalar_type scalarA_type; + typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; + typedef typename Vector_type::scalar_type scalarY_type; + + const scalarA_type one (1.0); + const scalarA_type zero (0.0); + + assert(x.localLength >= m); // Test vector lengths + assert(y.m >= n); + assert(y.n == 1); + + // Input serial dense vector + scalarA_type * const Av = A.values; + scalarX_type * const xv = x.values; + scalarY_type * const yv = y.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < n; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < n; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Perform GEMV on device + if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDgemv(x.handle, CUBLAS_OP_T, + m, n, + (double*)&alpha, (double*)d_Av, m, + (double*)d_xv, 1, + (double*)&beta, (double*)d_yv, 1)){ + printf( " Failed cublasDgemv\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSgemv(x.handle, CUBLAS_OP_T, + m, n, + (float*)&alpha, (float*)d_Av, m, + (float*)d_xv, 1, + (float*)&beta, (float*)d_yv, 1)){ + printf( " Failed cublasSgemv\n" ); + } + } + + // Copy input serial dense vector to host + if (cudaSuccess != cudaMemcpy(yv, d_yv, n*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( 
" Failed to memcpy d_x\n" ); + } + } else { + HPCG_fout << " Mixed-precision GEMV not supported" << std::endl; + + // Copy input matrix A from HOST CPU + if (cudaSuccess != cudaMemcpy(Av, d_Av, m*n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + if (cudaSuccess != cudaMemcpy(xv, d_xv, m*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < n; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < n; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::getType (); + MPI_Allreduce(MPI_IN_PLACE, yv, n, MPI_SCALAR_TYPE, MPI_SUM, + MPI_COMM_WORLD); +#endif + + return 0; +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +int ComputeGEMVT_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, Vector const&, double, SerialDenseMatrix const&); + +template +int ComputeGEMVT_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, Vector const&, float, SerialDenseMatrix const&); + + diff --git a/src/ComputeGEMVT_ref.hpp b/src/ComputeGEMVT_ref.hpp new file mode 100644 index 00000000..f9a49256 --- /dev/null +++ b/src/ComputeGEMVT_ref.hpp @@ -0,0 +1,34 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT_re.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMVT_REF_HPP +#define COMPUTE_GEMVT_REF_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMVT_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y); + +#endif // COMPUTE_GEMVT diff --git a/src/ComputeGEMV_ref.cpp b/src/ComputeGEMV_ref.cpp new file mode 100644 index 00000000..c67408b0 --- /dev/null +++ b/src/ComputeGEMV_ref.cpp @@ -0,0 +1,152 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ + +#include "ComputeGEMV_ref.hpp" + +template +int ComputeGEMV_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, + const typename Vector_type::scalar_type beta, const Vector_type & y) { + + typedef typename MultiVector_type::scalar_type scalarA_type; + typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; + typedef typename Vector_type::scalar_type scalarY_type; + + const scalarA_type one (1.0); + const scalarA_type zero (0.0); + + assert(x.m >= n); // Test vector lengths + assert(x.n == 1); + assert(y.localLength >= m); + + // Input serial dense vector + const scalarX_type * const xv = x.values; + + scalarA_type * const Av = A.values; + scalarY_type * const yv = y.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < m; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < m; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Copy input serial dense vector to device + if (cudaSuccess != cudaMemcpy(d_xv, xv, n*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + + // Perform GEMV on device + if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDgemv(y.handle, CUBLAS_OP_N, + m, n, + (double*)&alpha, (double*)d_Av, m, + (double*)d_xv, 1, + (double*)&beta, (double*)d_yv, 1)){ + printf( " Failed cublasDgemv\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSgemv(y.handle, CUBLAS_OP_N, + m, n, + (float*)&alpha, (float*)d_Av, m, + (float*)d_xv, 1, + (float*)&beta, (float*)d_yv, 1)){ + printf( " Failed cublasSgemv\n" ); + } + } + } else { + 
HPCG_fout << " Mixed-precision GEMV not supported" << std::endl; + + // Copy input matrix A from HOST CPU + if (cudaSuccess != cudaMemcpy(Av, d_Av, m*n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < m; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < m; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t i=0; i, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + +template +int ComputeGEMV_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&); + + +// mixed +template +int ComputeGEMV_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + diff --git a/src/ComputeGEMV_ref.hpp b/src/ComputeGEMV_ref.hpp new file mode 100644 index 00000000..197c53cd --- /dev/null +++ b/src/ComputeGEMV_ref.hpp @@ -0,0 +1,34 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMV_REF_HPP +#define COMPUTE_GEMV_REF_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMV_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, + const typename Vector_type::scalar_type beta, const Vector_type & y); + +#endif // COMPUTE_GEMV_REF diff --git a/src/ComputeGS_Forward_ref.cpp b/src/ComputeGS_Forward_ref.cpp index 2e99f2d5..7a088598 100644 --- a/src/ComputeGS_Forward_ref.cpp +++ b/src/ComputeGS_Forward_ref.cpp @@ -13,13 +13,24 @@ //@HEADER /*! - @file ComputeSYMGS_ref.cpp + @file ComputeGS_Forward_ref.cpp HPCG routine */ #ifndef HPCG_NO_MPI -#include "ExchangeHalo.hpp" + #include "ExchangeHalo.hpp" +#endif +#ifdef HPCG_WITH_CUDA + #include + #include + #include "ComputeSPMV.hpp" + #include "ComputeWAXPBY.hpp" + #ifdef HPCG_DEBUG + #include + #include "Utils_MPI.hpp" + #include "Hpgmp_Params.hpp" + #endif #endif #include "ComputeGS_Forward_ref.hpp" #include @@ -53,14 +64,53 @@ int ComputeGS_Forward_ref(const SparseMatrix_type & A, const Vector_type & r, Ve assert(x.localLength==A.localNumberOfColumns); // Make sure x contain space for halo values typedef typename SparseMatrix_type::scalar_type scalar_type; + const local_int_t nrow = A.localNumberOfRows; + const local_int_t ncol = A.localNumberOfColumns; + + const scalar_type * const rv = r.values; + scalar_type * const xv = x.values; + #ifndef HPCG_NO_MPI - ExchangeHalo(A,x); + #ifdef HPCG_WITH_CUDA + // workspace + Vector_type b = A.x; // nrow + scalar_type * const d_bv = b.d_values; + scalar_type * const d_xv = x.d_values; + + // Copy local part of X to HOST CPU + if (A.geom->rank==0) printf( " HaloExchange on Host for GS_Forward\n" ); + if (cudaSuccess != cudaMemcpy(xv, d_xv, nrow*sizeof(scalar_type), 
cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + #endif + + // Exchange Halo on HOST CPU + ExchangeHalo(A, x); + + #ifdef HPCG_WITH_CUDA + // Copy X (after Halo Exchange on host) to device + #define HPCG_COMPACT_GS + #ifdef HPCG_COMPACT_GS + // Copy non-local part of X (after Halo Exchange) into x0 on device + //if (cudaSuccess != cudaMemcpy(&d_xv[nrow], &xv[nrow], (ncol-nrow)*sizeof(scalar_type), cudaMemcpyHostToDevice)) { + // printf( " Failed to memcpy d_y\n" ); + //} + #else + Vector_type x0 = A.y; // ncol + scalar_type * const x0v = x0.values; + CopyVector(x, x0); // this also copy on CPU, which is needed only for debug + #endif + + #ifdef HPCG_DEBUG + if (A.geom->rank==0) { + HPCG_fout << A.geom->rank << " : ComputeGS(" << nrow << " x " << ncol << ") start" << std::endl; + } + #endif + #endif #endif - const local_int_t nrow = A.localNumberOfRows; +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) scalar_type ** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues - const scalar_type * const rv = r.values; - scalar_type * const xv = x.values; for (local_int_t i=0; i < nrow; i++) { const scalar_type * const currentValues = A.matrixValues[i]; @@ -76,8 +126,113 @@ int ComputeGS_Forward_ref(const SparseMatrix_type & A, const Vector_type & r, Ve sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop xv[i] = sum/currentDiagonal; + } +#endif + +#ifdef HPCG_WITH_CUDA + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + #ifdef HPCG_COMPACT_GS + // b = r - Ux + if (cudaSuccess != cudaMemcpy(d_bv, r.d_values, nrow*sizeof(scalar_type), cudaMemcpyDeviceToDevice)) { + printf( " Failed to memcpy d_r\n" ); } + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, A.nnzU, + (const double*)&mone, A.descrU, + (double*)A.d_Unzvals, A.d_Urow_ptr, A.d_Ucol_idx, + (double*)d_xv, + (const 
double*)&one, (double*)d_bv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, A.nnzU, + (const float*)&mone, A.descrA, + (float*)A.d_Unzvals, A.d_Urow_ptr, A.d_Ucol_idx, + (float*)d_xv, + (const float*)&one, (float*)d_bv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // b = r - Ax0 + ComputeSPMV(A, x0, b); + ComputeWAXPBY(nrow, -one, b, one, r, b, A.isWaxpbyOptimized); + #endif + + // x = L^{-1}b + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrsv_solve(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, + (const double*)&one, A.descrL, + (double*)A.d_Lnzvals, A.d_Lrow_ptr, A.d_Lcol_idx, + A.infoL, + (double*)d_bv, (double*)d_xv)) { + printf( " Failed cusparseDcsrv_solve\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrsv_solve(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, + (const float*)&one, A.descrL, + (float*)A.d_Lnzvals, A.d_Lrow_ptr, A.d_Lcol_idx, + A.infoL, + (float*)d_bv, (float*)d_xv)) { + printf( " Failed cusparseScsrv_solve\n" ); + } + } + + #ifdef HPCG_DEBUG + scalar_type * tv = (scalar_type *)malloc(nrow * sizeof(scalar_type)); + for (int i=0; i::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_rnorm, &rnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + xnorm = l_xnorm; + rnorm = l_rnorm; + #endif + enorm = sqrt(enorm); + xnorm = sqrt(xnorm); + rnorm = sqrt(rnorm); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : GS_forward(" << nrow << " x " << ncol << "): error = " << enorm << " (x=" << xnorm << ", r=" << rnorm << ")" << std::endl; + } + free(tv); + #endif +#endif return 0; } diff 
--git a/src/ComputeMG_ref.cpp b/src/ComputeMG_ref.cpp index f5c85530..ac40b24b 100644 --- a/src/ComputeMG_ref.cpp +++ b/src/ComputeMG_ref.cpp @@ -19,10 +19,14 @@ */ #include "ComputeMG_ref.hpp" +#include "ComputeSYMGS_ref.hpp" #include "ComputeGS_Forward_ref.hpp" #include "ComputeSPMV_ref.hpp" #include "ComputeRestriction_ref.hpp" #include "ComputeProlongation_ref.hpp" +#ifdef HPCG_DEBUG +#include "Hpgmp_Params.hpp" +#endif #include #include @@ -45,20 +49,42 @@ int ComputeMG_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_typ int ierr = 0; if (A.mgData!=0) { // Go to next coarse level if defined int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps; - for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << std::endl << " > PreSmooth( " << numberOfPresmootherSteps << " ) " << std::endl; + #endif + if (symmetric) { + for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeSYMGS_ref(A, r, x); + } else { + for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf); if (ierr!=0) return ierr; + // Perform restriction operation using simple injection ierr = ComputeRestriction_ref(A, r); if (ierr!=0) return ierr; ierr = ComputeMG_ref(*A.Ac,*A.mgData->rc, *A.mgData->xc, symmetric); if (ierr!=0) return ierr; ierr = ComputeProlongation_ref(A, x); if (ierr!=0) return ierr; int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps; - for (int i=0; i< numberOfPostsmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << " > PostSmooth( " << numberOfPostsmootherSteps << " ) " << std::endl; + #endif + if (symmetric) { + for (int i=0; i< numberOfPostsmootherSteps; ++i) ierr += ComputeSYMGS_ref(A, r, x); + } else { + for (int i=0; 
i< numberOfPostsmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; } else { - ierr = ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << std::endl << " > CoarseSolve( " << 1 << " ) " << std::endl; + #endif + if (symmetric) { + ierr = ComputeSYMGS_ref(A, r, x); + } else { + ierr = ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; } return 0; diff --git a/src/ComputeProlongation_ref.cpp b/src/ComputeProlongation_ref.cpp index 096c9073..5df9b451 100644 --- a/src/ComputeProlongation_ref.cpp +++ b/src/ComputeProlongation_ref.cpp @@ -45,11 +45,58 @@ int ComputeProlongation_ref(const SparseMatrix_type & Af, Vector_type & xf) { local_int_t * f2c = Af.mgData->f2cOperator; local_int_t nc = Af.mgData->rc->localLength; -#ifndef HPCG_NO_OPENMP -#pragma omp parallel for -#endif -// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices - for (local_int_t i=0; ixc->d_values; + #if 1 + const scalar_type zero ( 0.0); + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(Af.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + nc, n, nc, + (const double*)&one, Af.mgData->descrA, + (double*)Af.mgData->d_nzvals, Af.mgData->d_row_ptr, Af.mgData->d_col_idx, + (double*)d_xcv, + (const double*)&one, (double*)d_xfv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(Af.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + nc, n, nc, + (const float*)&one, Af.mgData->descrA, + (float*)Af.mgData->d_nzvals, Af.mgData->d_row_ptr, Af.mgData->d_col_idx, + (float*)d_xcv, + (const float*)&one, (float*)d_xfv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // Copy the whole compressed vector from Device to Host.. 
+ if (Af.geom->rank==0) printf( " Prologation on CPU ..\n" ); + if (cudaSuccess != cudaMemcpy(xfv, d_xfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + if (cudaSuccess != cudaMemcpy(xcv, d_xcv, nc*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + + // Prologation on host + for (local_int_t i=0; i -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include // needed for fabs diff --git a/src/ComputeRestriction_ref.cpp b/src/ComputeRestriction_ref.cpp index 10b363ab..025172c2 100644 --- a/src/ComputeRestriction_ref.cpp +++ b/src/ComputeRestriction_ref.cpp @@ -48,10 +48,75 @@ int ComputeRestriction_ref(const SparseMatrix_type & A, const Vector_type & rf) local_int_t * f2c = A.mgData->f2cOperator; local_int_t nc = A.mgData->rc->localLength; -#ifndef HPCG_NO_OPENMP -#pragma omp parallel for -#endif - for (local_int_t i=0; iAxf->d_values; + scalar_type * d_rfv = rf.d_values; + scalar_type * d_rcv = A.mgData->rc->d_values; + #if 1 + const scalar_type zero ( 0.0); + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const double*)&one, A.mgData->descrA, + (double*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (double*)d_rfv, + (const double*)&zero, (double*)d_rcv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const double*)&mone, A.mgData->descrA, + (double*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (double*)d_Axfv, + (const double*)&one, (double*)d_rcv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const float*)&one, 
A.mgData->descrA, + (float*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (float*)d_rfv, + (const float*)&zero, (float*)d_rcv)) { + printf( " Failed cusparseScsrmv\n" ); + } + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const float*)&mone, A.mgData->descrA, + (float*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (float*)d_Axfv, + (const float*)&one, (float*)d_rcv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // Copy the whole prologated vector from Device to Host + if (cudaSuccess != cudaMemcpy(rfv, d_rfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_rfv\n" ); + } + if (cudaSuccess != cudaMemcpy(Axfv, d_Axfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_Axfv\n" ); + } + + // Restriction on CPU + if (A.geom->rank==0) printf( " Restriction on CPU\n" ); + for (local_int_t i=0; i + #include +#endif +#ifdef HPCG_WITH_CUDA + #include + #include + + #if defined(HPCG_DEBUG) & !defined(HPCG_NO_MPI) + #include + #include "Utils_MPI.hpp" + #include "Hpgmp_Params.hpp" + #endif #endif #include @@ -51,15 +61,28 @@ int ComputeSPMV_ref(const SparseMatrix_type & A, Vector_type & x, Vector_type & assert(y.localLength>=A.localNumberOfRows); typedef typename SparseMatrix_type::scalar_type scalar_type; + const local_int_t nrow = A.localNumberOfRows; + scalar_type * const xv = x.values; + scalar_type * const yv = y.values; + #ifndef HPCG_NO_MPI - ExchangeHalo(A,x); + if (A.geom->size > 1) { + #ifdef HPCG_WITH_CUDA + // Copy local part of X to HOST CPU + if (A.geom->rank==0) printf( " HaloExchange on Host for SpMV\n" ); + if (cudaSuccess != cudaMemcpy(xv, x.d_values, nrow*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + #endif + + ExchangeHalo(A, x); + } #endif - const scalar_type * const xv = x.values; - scalar_type * const yv = y.values; - const 
local_int_t nrow = A.localNumberOfRows; -#ifndef HPCG_NO_OPENMP + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + #ifndef HPCG_NO_OPENMP #pragma omp parallel for -#endif + #endif for (local_int_t i=0; i< nrow; i++) { scalar_type sum = 0.0; const scalar_type * const cur_vals = A.matrixValues[i]; @@ -70,6 +93,77 @@ int ComputeSPMV_ref(const SparseMatrix_type & A, Vector_type & x, Vector_type & sum += cur_vals[j]*xv[cur_inds[j]]; yv[i] = sum; } +#endif + +#ifdef HPCG_WITH_CUDA + const scalar_type one (1.0); + const scalar_type zero (0.0); + const local_int_t ncol = A.localNumberOfColumns; + const global_int_t nnz = A.localNumberOfNonzeros; + + scalar_type * const d_xv = x.d_values; + scalar_type * const d_yv = y.d_values; + if (A.geom->size > 1) { + // copy non-local part of X to device (after Halo exchange) + if (cudaSuccess != cudaMemcpy(&d_xv[nrow], &xv[nrow], (ncol-nrow)*sizeof(scalar_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + } + + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, + (const double*)&one, A.descrA, + (double*)A.d_nzvals, A.d_row_ptr, A.d_col_idx, + (double*)d_xv, + (const double*)&zero, (double*)d_yv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, + (const float*)&one, A.descrA, + (float*)A.d_nzvals, A.d_row_ptr, A.d_col_idx, + (float*)d_xv, + (const float*)&zero, (float*)d_yv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #ifdef HPCG_DEBUG + scalar_type * tv = (scalar_type *)malloc(nrow * sizeof(scalar_type)); + if (cudaSuccess != cudaMemcpy(tv, d_yv, nrow*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + scalar_type l_enorm = 0.0; + scalar_type l_xnorm = 0.0; + scalar_type l_ynorm = 0.0; + for (int j=0; 
j::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_ynorm, &ynorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + xnorm = l_xnorm; + ynorm = l_ynorm; + #endif + enorm = sqrt(enorm); + xnorm = sqrt(xnorm); + ynorm = sqrt(ynorm); + if (A.geom->rank == 0) { + HPCG_fout << A.geom->rank << " : SpMV(" << nrow << " x " << ncol << "): error = " << enorm << "(x=" << xnorm << ", y=" << ynorm << ")" << std::endl; + } + free(tv); + #endif +#endif return 0; } diff --git a/src/ComputeSYMGS.cpp b/src/ComputeSYMGS.cpp new file mode 100644 index 00000000..f854dfd1 --- /dev/null +++ b/src/ComputeSYMGS.cpp @@ -0,0 +1,56 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file ComputeSYMGS.cpp + + HPCG routine + */ + +#include "ComputeSYMGS.hpp" +#include "ComputeSYMGS_ref.hpp" + +/*! + Routine to compute one step of symmetric Gauss-Seidel: + + Assumption about the structure of matrix A: + - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i] + - Entries in row 'i' are ordered such that: + - lower triangular terms are stored before the diagonal element. + - upper triangular terms are stored after the diagonal element. + - No other assumptions are made about entry ordering. + + Symmetric Gauss-Seidel notes: + - We use the input vector x as the RHS and start with an initial guess for y of all zeros. + - We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A. + - We then perform one back sweep. 
+ - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after + + @param[in] A the known system matrix + @param[in] r the input vector + @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep with r as the RHS. + + @return returns 0 upon success and non-zero otherwise + + @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out of sync with other kernels. + + @see ComputeSYMGS_ref +*/ +template +int ComputeSYMGS(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x) { + + // This line and the next two lines should be removed and your version of ComputeSYMGS should be used. + return ComputeSYMGS_ref(A, r, x); + +} diff --git a/src/ComputeSYMGS.hpp b/src/ComputeSYMGS.hpp new file mode 100644 index 00000000..d203873a --- /dev/null +++ b/src/ComputeSYMGS.hpp @@ -0,0 +1,23 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +#ifndef COMPUTESYMGS_HPP +#define COMPUTESYMGS_HPP +#include "SparseMatrix.hpp" +#include "Vector.hpp" + +template +int ComputeSYMGS(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x); + +#endif // COMPUTESYMGS_HPP diff --git a/src/ComputeSYMGS_ref.cpp b/src/ComputeSYMGS_ref.cpp new file mode 100644 index 00000000..13e85603 --- /dev/null +++ b/src/ComputeSYMGS_ref.cpp @@ -0,0 +1,117 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file ComputeSYMGS_ref.cpp + + HPCG routine + */ + +#ifndef HPCG_NO_MPI +#include "ExchangeHalo.hpp" +#endif +#include "ComputeSYMGS_ref.hpp" +#include + +/*! + Computes one step of symmetric Gauss-Seidel: + + Assumption about the structure of matrix A: + - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i] + - Entries in row 'i' are ordered such that: + - lower triangular terms are stored before the diagonal element. + - upper triangular terms are stored after the diagonal element. + - No other assumptions are made about entry ordering. + + Symmetric Gauss-Seidel notes: + - We use the input vector x as the RHS and start with an initial guess for y of all zeros. + - We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit this fact. + - We then perform one back sweep. + - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after + + @param[in] A the known system matrix + @param[in] r the input vector + @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep with r as the RHS. + + + @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out of sync with other kernels. 
+ + @return returns 0 upon success and non-zero otherwise + + @see ComputeSYMGS +*/ +template +int ComputeSYMGS_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x) { + + assert(x.localLength==A.localNumberOfColumns); // Make sure x contain space for halo values + + typedef typename SparseMatrix_type::scalar_type scalar_type; +#ifndef HPCG_NO_MPI + ExchangeHalo(A,x); +#endif + + const local_int_t nrow = A.localNumberOfRows; + scalar_type ** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues + const scalar_type * const rv = r.values; + scalar_type * const xv = x.values; + + for (local_int_t i=0; i< nrow; i++) { + const scalar_type * const currentValues = A.matrixValues[i]; + const local_int_t * const currentColIndices = A.mtxIndL[i]; + const int currentNumberOfNonzeros = A.nonzerosInRow[i]; + const scalar_type currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value + scalar_type sum = rv[i]; // RHS value + + for (int j=0; j< currentNumberOfNonzeros; j++) { + local_int_t curCol = currentColIndices[j]; + sum -= currentValues[j] * xv[curCol]; + } + sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop + + xv[i] = sum/currentDiagonal; + + } + + // Now the back sweep. 
+ + for (local_int_t i=nrow-1; i>=0; i--) { + const scalar_type * const currentValues = A.matrixValues[i]; + const local_int_t * const currentColIndices = A.mtxIndL[i]; + const int currentNumberOfNonzeros = A.nonzerosInRow[i]; + const scalar_type currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value + scalar_type sum = rv[i]; // RHS value + + for (int j = 0; j< currentNumberOfNonzeros; j++) { + local_int_t curCol = currentColIndices[j]; + sum -= currentValues[j]*xv[curCol]; + } + sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop + + xv[i] = sum/currentDiagonal; + } + + return 0; +} + + +/* --------------- * + * specializations * + * --------------- */ + +template +int ComputeSYMGS_ref< SparseMatrix, Vector >(SparseMatrix const&, Vector const&, Vector&); + +template +int ComputeSYMGS_ref< SparseMatrix, Vector >(SparseMatrix const&, Vector const&, Vector&); + diff --git a/src/ComputeSYMGS_ref.hpp b/src/ComputeSYMGS_ref.hpp new file mode 100644 index 00000000..0487ebef --- /dev/null +++ b/src/ComputeSYMGS_ref.hpp @@ -0,0 +1,23 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +#ifndef COMPUTESYMGS_REF_HPP +#define COMPUTESYMGS_REF_HPP +#include "SparseMatrix.hpp" +#include "Vector.hpp" + +template +int ComputeSYMGS_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x); + +#endif // COMPUTESYMGS_REF_HPP diff --git a/src/ComputeWAXPBY_ref.cpp b/src/ComputeWAXPBY_ref.cpp index ed7487f4..396b59ca 100644 --- a/src/ComputeWAXPBY_ref.cpp +++ b/src/ComputeWAXPBY_ref.cpp @@ -20,7 +20,17 @@ #include "ComputeWAXPBY_ref.hpp" #ifndef HPCG_NO_OPENMP -#include + #include +#endif +#ifdef HPCG_WITH_CUDA + #include + #include "cublas_v2.h" + + #if defined(HPCG_DEBUG) & !defined(HPCG_NO_MPI) + #include + #include "Hpgmp_Params.hpp" + #include "Utils_MPI.hpp" + #endif #endif #include /*! @@ -49,30 +59,134 @@ int ComputeWAXPBY_ref(const local_int_t n, assert(x.localLength>=n); // Test vector lengths assert(y.localLength>=n); + // quick return + if (n <= 0) return 0; + typedef typename VectorX_type::scalar_type scalarX_type; typedef typename VectorY_type::scalar_type scalarY_type; typedef typename VectorW_type::scalar_type scalarW_type; - const scalarX_type * const xv = x.values; - const scalarY_type * const yv = y.values; - scalarW_type * const wv = w.values; + scalarX_type * const xv = x.values; + scalarY_type * const yv = y.values; + scalarW_type * const wv = w.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) if (alpha==1.0) { -#ifndef HPCG_NO_OPENMP + #ifndef HPCG_NO_OPENMP #pragma omp parallel for -#endif + #endif for (local_int_t i=0; i::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Compute axpy on Nvidia GPU + // w = x (assuming y is not w) + if (cudaSuccess != cudaMemcpy(d_wv, d_xv, n*sizeof(scalarW_type), cudaMemcpyDeviceToDevice)) { + printf( " 
Failed to memcpy d_w\n" ); + } + if (std::is_same::value) { + // w = alpha*w + if (CUBLAS_STATUS_SUCCESS != cublasDscal (w.handle, n, (const double*)&alpha, (double*)d_wv, 1)) { + printf( " Failed cublasDscal\n" ); + } + // w += alpha*x + if (CUBLAS_STATUS_SUCCESS != cublasDaxpy (w.handle, n, (const double*)&beta, (double*)d_yv, 1, (double*)d_wv, 1)) { + printf( " Failed cublasDdot\n" ); + } + } else if (std::is_same::value) { + // w = beta*y + if (CUBLAS_STATUS_SUCCESS != cublasSscal (w.handle, n, (const float*)&alpha, (float*)d_wv, 1)) { + printf( " Failed cublasSscal\n" ); + } + // w += alpha*x + if (CUBLAS_STATUS_SUCCESS != cublasSaxpy (w.handle, n, (const float*)&beta, (float*) d_yv, 1, (float*) d_wv, 1)) { + printf( " Failed cublasDdot\n" ); + } + } + + #ifdef HPCG_DEBUG + scalarW_type * tv = (scalarW_type *)malloc(n * sizeof(scalarW_type)); + if (cudaSuccess != cudaMemcpy(tv, d_wv, n*sizeof(scalarW_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_w\n" ); + } + scalarW_type l_enorm = 0.0; + scalarW_type l_wnorm = 0.0; + scalarW_type l_xnorm = 0.0; + scalarW_type l_ynorm = 0.0; + for (int j=0; j::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_wnorm, &wnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_ynorm, &ynorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + wnorm = l_wnorm; + xnorm = l_xnorm; + ynorm = l_ynorm; + #endif + enorm = sqrt(enorm); + wnorm = sqrt(wnorm); + xnorm = sqrt(xnorm); + ynorm = sqrt(ynorm); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : WAXPBY(" << n << "): error = " << enorm << " (alpha=" << alpha << ", beta=" << beta + << ", x=" << xnorm << ", y=" << ynorm << ", w=" << wnorm << ")" << std::endl; + } + free(tv); + #endif + } else { + HPCG_fout << " Mixed-precision WAXPBY not 
supported" << std::endl; + + // copy Input vectors to Host + if (cudaSuccess != cudaMemcpy(xv, d_xv, n*sizeof(scalarX_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + if (cudaSuccess != cudaMemcpy(yv, d_yv, n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_w\n" ); + } + + // WAXPBY on Host for (local_int_t i=0; i #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "GMRES.hpp" #include "mytimer.hpp" @@ -32,6 +32,7 @@ #include "ComputeWAXPBY.hpp" #include "ComputeTRSM.hpp" #include "ComputeGEMV.hpp" +#include "ComputeGEMVT.hpp" // Use TICK and TOCK to time a code section in MATLAB-like fashion @@ -56,14 +57,14 @@ @return Returns zero on success and a non-zero value otherwise. - @see CG_ref() + @see GMRES_ref() */ template int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b, Vector_type & x, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * times, bool doPreconditioning) { - + double * times, double * flops, bool doPreconditioning) { + typedef typename SparseMatrix_type::scalar_type scalar_type; typedef MultiVector MultiVector_type; typedef SerialDenseMatrix SerialDenseMatrix_type; @@ -71,7 +72,7 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b const scalar_type one (1.0); const scalar_type zero (0.0); double t_begin = mytimer(); // Start timing right away - double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0; + double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0, t6 = 0.0; normr = 0.0; scalar_type rtz = zero, oldrtz = zero, alpha = zero, beta = zero, pAp = zero; @@ -80,20 +81,24 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b // double t6 = 0.0; //#endif local_int_t nrow = 
A.localNumberOfRows; + local_int_t Nrow = A.totalNumberOfRows; Vector_type & r = data.r; // Residual vector Vector_type & z = data.z; // Preconditioned residual vector Vector_type & p = data.p; // Direction vector (in MPI mode ncol>=nrow) Vector_type & Ap = data.Ap; SerialDenseMatrix_type H; + SerialDenseMatrix_type h; + SerialDenseMatrix_type t; SerialDenseMatrix_type cs; SerialDenseMatrix_type ss; - SerialDenseMatrix_type t; MultiVector_type Q; + MultiVector_type P; Vector_type Qkm1; Vector_type Qk; Vector_type Qj; InitializeMatrix(H, restart_length+1, restart_length); + InitializeMatrix(h, restart_length+1, 1); InitializeMatrix(t, restart_length+1, 1); InitializeMatrix(cs, restart_length+1, 1); InitializeMatrix(ss, restart_length+1, 1); @@ -101,47 +106,47 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b if (!doPreconditioning && A.geom->rank==0) HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl; -#ifdef HPCG_DEBUG + bool verbose = true; int print_freq = 1; - if (print_freq>50) print_freq=50; - if (print_freq<1) print_freq=1; - if (A.geom->rank==0) HPCG_fout << std::endl << " Running GMRES(" << restart_length - << ") with max-iters = " << max_iter - << " and tol = " << tolerance - << (doPreconditioning ? " with precond " : " without precond ") - << ", nrow = " << nrow << std::endl; -#endif + if (verbose && A.geom->rank==0) { + HPCG_fout << std::endl << " Running GMRES(" << restart_length + << ") with max-iters = " << max_iter + << " and tol = " << tolerance + << (doPreconditioning ? 
" with precond " : " without precond ") + << ", nrow = " << nrow + << " on ( " << A.geom->npx << " x " << A.geom->npy << " x " << A.geom->npz + << " ) MPI grid " + << std::endl; + } + *flops = 0.0; + double flops_gmg = 0.0; + double flops_spmv = 0.0; + double flops_orth = 0.0; + global_int_t numSpMVs_MG = 1+(A.mgData->numberOfPresmootherSteps + A.mgData->numberOfPostsmootherSteps); niters = 0; bool converged = false; while (niters <= max_iter && !converged) { // p is of length ncols, copy x to p for sparse MV operation CopyVector(x, p); - TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p - TICK(); ComputeWAXPBY(nrow, one, b, -one, Ap, r, A.isWaxpbyOptimized); TOCK(t2); // r = b - Ax (x stored in p) - TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); flops_spmv += (2*A.totalNumberOfNonzeros); // Ap = A*p + TICK(); ComputeWAXPBY(nrow, one, b, -one, Ap, r, A.isWaxpbyOptimized); TOCK(t2); *flops += (2*Nrow); // r = b - Ax (x stored in p) + TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); normr = sqrt(normr); GetVector(Q, 0, Qj); CopyVector(r, Qj); - TICK(); ComputeWAXPBY(nrow, zero, Qj, one/normr, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qj, one/normr, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qj, one/normr); TOCK(t2); *flops += Nrow; // Record initial residual for convergence testing if (niters == 0) normr0 = normr; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << "GMRES Residual at the start of restart cycle = "<< normr - << ", " << normr/normr0 << std::endl; - #endif - + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES Residual at the start of restart cycle = "<< normr + << ", " << normr/normr0 << std::endl; + } if (normr/normr0 <= tolerance) { converged = true; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << " > GMRES converged " << std::endl; - 
#endif + if (verbose && A.geom->rank==0) HPCG_fout << " > GMRES converged " << std::endl; } -/*if (normr/normr0 <= tolerance || (niters > 0 && doPreconditioning)) { - printf( " done %d iters (%s)\n",niters, (converged ? "Converged" : "Not Converged") ); - printf( " done (%s)\n",(doPreconditioning ? "Precond" : "Not Precond") ); - for (int i = 0; i < nrow; i++) printf( "x[%d] = %e\n",i,x.values[i] ); -}*/ // do forward GS instead of symmetric GS bool symmetric = false; @@ -154,37 +159,61 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b GetVector(Q, k, Qk); TICK(); - if (doPreconditioning) - ComputeMG(A, Qkm1, z, symmetric); // Apply preconditioner - else + if (doPreconditioning) { + ComputeMG(A, Qkm1, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // Apply preconditioner + } else { CopyVector(Qkm1, z); // copy r to z (no preconditioning) + } TOCK(t5); // Preconditioner apply time // Qk = A*z - TICK(); ComputeSPMV(A, z, Qk); TOCK(t3); - - // MGS to orthogonalize z against Q(:,0:k-1), using dots - for (int j = 0; j < k; j++) { - // get j-th column of Q - GetVector(Q, j, Qj); + TICK(); ComputeSPMV(A, z, Qk); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); - alpha = zero; - for (int i = 0; i < 2; i++) { - // beta = Qk'*Qj - TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); - // Qk = Qk - beta * Qj - TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); - alpha += beta; + // orthogonalize z against Q(:,0:k-1), using dots + bool use_mgs = false; + TICK(); + if (use_mgs) { + // MGS2 + for (int j = 0; j < k; j++) { + // get j-th column of Q + GetVector(Q, j, Qj); + + alpha = zero; + for (int i = 0; i < 2; i++) { + // beta = Qk'*Qj + TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); + + // Qk = Qk - beta * Qj + TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); + alpha += beta; + 
} + SetMatrixValue(H, j, k-1, alpha); + } + } else { + // CGS2 + GetMultiVector(Q, 0, k-1, P); + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + SetMatrixValue(H, i, k-1, h.values[i]); + } + // reorthogonalize + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + AddMatrixValue(H, i, k-1, h.values[i]); } - SetMatrixValue(H, j, k-1, alpha); } + TOCK(t6); // Ortho time + flops_orth += (2*k*Nrow); // beta = norm(Qk) - TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); beta = sqrt(beta); // Qk = Qk / beta - TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qk, one/beta); *flops += Nrow; TOCK(t2); SetMatrixValue(H, k, k-1, beta); // Given's rotation @@ -220,49 +249,27 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b SetMatrixValue(cs, k-1, 0, cj); normr = std::abs(v2); - #ifdef HPCG_DEBUG - if (A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) - HPCG_fout << "GMRES Iteration = "<< k << " (" << niters << ") Scaled Residual = " - << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) { + HPCG_fout << "GMRES Iteration = "<< k << " (" << niters << ") Scaled Residual = " + << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; + } niters ++; k ++; } // end of restart-cycle // prepare to restart - #ifdef 
HPCG_DEBUG - if (A.geom->rank==0) - HPCG_fout << "GMRES restart: k = "<< k << " (" << niters << ")" << std::endl; - #endif - // > update x -/*if (A.geom->rank==0) { - printf( "\n k = %d\n",k ); - printf( "R=[\n" ); - for (int i = 0; i < k; i++) { - for (int j = 0; j < k; j++) printf("%e ",H.values[i + j * H.m] ); - printf("\n"); - } - printf("];\n\n"); - printf( "t=[\n" ); - for (int i = 0; i < k; i++) printf( "%e\n",t.values[i]); - printf("];\n\n"); - - if (niters == 1) { - printf( " nrow = %d, max_iter = %d\n",nrow,max_iter ); - printf( " Q = [\n" ); - for (int i = 0; i < nrow; i++) { - for (int j = 0; j <= k-1; j++) printf( "%e ",Q.values[i + j * nrow] ); - printf("\n"); + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES restart: k = "<< k << " (" << niters << ")" << std::endl; } - printf( " ];\n\n" ); - } -}*/ + // > update x ComputeTRSM(k-1, one, H, t); if (doPreconditioning) { - ComputeGEMV (nrow, k-1, one, Q, t, zero, r); // r = Q*t - ComputeMG(A, r, z, symmetric); // z = M*r - TICK(); ComputeWAXPBY(nrow, one, x, one, z, x, A.isWaxpbyOptimized); TOCK(t2); // x += z + ComputeGEMV(nrow, k-1, one, Q, t, zero, r, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // r = Q*t + TICK(); + ComputeMG(A, r, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // z = M*r + TOCK(t5); // Preconditioner apply time + TICK(); ComputeWAXPBY(nrow, one, x, one, z, x, A.isWaxpbyOptimized); TOCK(t2); *flops += (2*Nrow); // x += z } else { - ComputeGEMV (nrow, k-1, one, Q, t, one, x); // x += Q*t + ComputeGEMV (nrow, k-1, one, Q, t, one, x, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // x += Q*t } } // end of outer-loop @@ -277,6 +284,28 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b // times[6] += t6; // exchange halo time //#endif times[0] += mytimer() - t_begin; // Total time. All done... 
+ if (verbose && A.geom->rank==0) { + HPCG_fout << " > nnz(A) : " << A.totalNumberOfNonzeros << std::endl; + HPCG_fout << " > nnz(MG) : " << A.totalNumberOfMGNonzeros << " (" << numSpMVs_MG << ")" << std::endl; + HPCG_fout << " > SpMV : " << (flops_spmv / 1000000000.0) << " / " << t3 << " = " + << (flops_spmv / 1000000000.0) / t3 << " Gflop/s" << std::endl; + HPCG_fout << " > GMG : " << (flops_gmg / 1000000000.0) << " / " << t5 << " = " + << (flops_gmg / 1000000000.0) / t5 << " Gflop/s" << std::endl; + HPCG_fout << " > Orth : " << (flops_orth / 1000000000.0) << " / " << t6 << " = " + << (flops_orth / 1000000000.0) / t6 << " Gflop/s" << std::endl; + HPCG_fout << std::endl; + } + *flops += flops_gmg; + *flops += flops_spmv; + *flops += flops_orth; + + DeleteDenseMatrix(H); + DeleteDenseMatrix(h); + DeleteDenseMatrix(t); + DeleteDenseMatrix(cs); + DeleteDenseMatrix(ss); + DeleteMultiVector(Q); + return 0; } @@ -288,9 +317,9 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b template int GMRES< SparseMatrix, CGData, Vector > (SparseMatrix const&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); template int GMRES< SparseMatrix, CGData, Vector > (SparseMatrix const&, CGData&, Vector const&, Vector&, - const int, const int, float, int&, float&, float&, double*, bool); + const int, const int, float, int&, float&, float&, double*, double*, bool); diff --git a/src/GMRES.hpp b/src/GMRES.hpp index af1e2db8..609419a5 100644 --- a/src/GMRES.hpp +++ b/src/GMRES.hpp @@ -25,7 +25,7 @@ template int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b, Vector_type & x, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * 
times, bool doPreconditioning); + double * times, double * flops, bool doPreconditioning); // this function will compute the Conjugate Gradient iterations. // geom - Domain and processor topology information diff --git a/src/GMRES_IR.cpp b/src/GMRES_IR.cpp index a62ad933..e3e9047f 100644 --- a/src/GMRES_IR.cpp +++ b/src/GMRES_IR.cpp @@ -15,13 +15,13 @@ /*! @file GMRES_IR.cpp - HPCG routine + GMRES-IR routine */ #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "GMRES_IR.hpp" #include "mytimer.hpp" @@ -31,6 +31,7 @@ #include "ComputeWAXPBY.hpp" #include "ComputeTRSM.hpp" #include "ComputeGEMV.hpp" +#include "ComputeGEMVT.hpp" // Use TICK and TOCK to time a code section in MATLAB-like fashion @@ -55,14 +56,14 @@ @return Returns zero on success and a non-zero value otherwise. - @see CG_ref() + @see GMRES_IR_ref() */ template int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, const Vector_type & b_hi, Vector_type & x_hi, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr_hi, typename SparseMatrix_type::scalar_type & normr0_hi, - double * times, bool doPreconditioning) { + double * times, double *flops, bool doPreconditioning) { // higher precision for outer loop typedef typename SparseMatrix_type::scalar_type scalar_type; @@ -75,11 +76,8 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, typedef Vector Vector_type2; double t_begin = mytimer(); // Start timing right away - double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0; + double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0, t6 = 0.0; -//#ifndef HPCG_NO_MPI -// double t6 = 0.0; -//#endif // vectors/matrices in scalar_type2 (lower) const scalar_type2 one (1.0); const scalar_type2 zero (0.0); @@ -87,6 +85,7 @@ int GMRES_IR(const SparseMatrix_type & A, const 
SparseMatrix_type2 & A_lo, scalar_type2 rtz = zero, oldrtz = zero, alpha = zero, beta = zero, pAp = zero; local_int_t nrow = A_lo.localNumberOfRows; + local_int_t Nrow = A.totalNumberOfRows; Vector_type2 & x = data_lo.w; // Intermediate solution vector Vector_type2 & r = data_lo.r; // Residual vector Vector_type2 & z = data_lo.z; // Preconditioned residual vector @@ -94,14 +93,17 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, Vector_type2 & Ap = data_lo.Ap; SerialDenseMatrix_type2 H; + SerialDenseMatrix_type2 h; + SerialDenseMatrix_type2 t; SerialDenseMatrix_type2 cs; SerialDenseMatrix_type2 ss; - SerialDenseMatrix_type2 t; MultiVector_type2 Q; + MultiVector_type2 P; Vector_type2 Qkm1; Vector_type2 Qk; Vector_type2 Qj; InitializeMatrix(H, restart_length+1, restart_length); + InitializeMatrix(h, restart_length+1, 1); InitializeMatrix(t, restart_length+1, 1); InitializeMatrix(cs, restart_length+1, 1); InitializeMatrix(ss, restart_length+1, 1); @@ -116,45 +118,50 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, if (!doPreconditioning && A.geom->rank==0) HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl; -#ifdef HPCG_DEBUG int print_freq = 1; + bool verbose = true; if (print_freq>50) print_freq=50; if (print_freq<1) print_freq=1; - if (A.geom->rank==0) HPCG_fout << std::endl << " Running GMRES_IR(" << restart_length - << ") with max-iters = " << max_iter - << " and tol = " << tolerance - << (doPreconditioning ? " with precond " : " without precond ") - << ", nrow = " << nrow << std::endl; -#endif + if (verbose && A.geom->rank==0) { + HPCG_fout << std::endl << " Running GMRES_IR(" << restart_length + << ") with max-iters = " << max_iter + << " and tol = " << tolerance + << (doPreconditioning ? 
" with precond " : " without precond ") + << ", nrow = " << nrow << std::endl; + } + *flops = 0.0; + double flops_gmg = 0.0; + double flops_spmv = 0.0; + double flops_orth = 0.0; + global_int_t numSpMVs_MG = 1+(A.mgData->numberOfPresmootherSteps + A.mgData->numberOfPostsmootherSteps); niters = 0; bool converged = false; while (niters <= max_iter && !converged) { // > Compute residual vector (higher working precision) // p is of length ncols, copy x to p for sparse MV operation CopyVector(x_hi, p_hi); - TICK(); ComputeSPMV(A, p_hi, Ap_hi); TOCK(t3); // Ap = A*p - TICK(); ComputeWAXPBY(nrow, one_hi, b_hi, -one_hi, Ap_hi, r_hi, A.isWaxpbyOptimized); TOCK(t2); // r = b - Ax (x stored in p) - TICK(); ComputeDotProduct(nrow, r_hi, r_hi, normr_hi, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeSPMV(A, p_hi, Ap_hi); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); // Ap = A*p + TICK(); ComputeWAXPBY(nrow, one_hi, b_hi, -one_hi, Ap_hi, r_hi, A.isWaxpbyOptimized); *flops += (2*Nrow); TOCK(t2); // r = b - Ax (x stored in p) + TICK(); ComputeDotProduct(nrow, r_hi, r_hi, normr_hi, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); normr_hi = sqrt(normr_hi); // > Copy r and scale to the initial basis vector GetVector(Q, 0, Qj); CopyVector(r_hi, Qj); - TICK(); ComputeWAXPBY(nrow, zero, Qj, one_hi/normr_hi, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qj, one_hi/normr_hi, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qj, one_hi/normr_hi); *flops += Nrow; TOCK(t2); // Record initial residual for convergence testing if (niters == 0) normr0 = normr_hi; normr = normr_hi; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << "GMRES_IR Residual at the start of restart cycle = "<< normr - << ", " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES_IR Residual at the start of restart cycle = "<< normr + << ", " << normr/normr0 << std::endl; + } if (normr/normr0 <= 
tolerance) { converged = true; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << " > GMRES_IR converged " << std::endl; - #endif + if (verbose && A.geom->rank==0) HPCG_fout << " > GMRES_IR converged " << std::endl; } // do forward GS instead of symmetric GS @@ -168,37 +175,61 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, GetVector(Q, k, Qk); TICK(); - if (doPreconditioning) - ComputeMG(A_lo, Qkm1, z, symmetric); // Apply preconditioner - else - CopyVector(Qkm1, z); // copy r to z (no preconditioning) + if (doPreconditioning) { + ComputeMG(A_lo, Qkm1, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // Apply preconditioner + } else { + CopyVector(Qkm1, z); // copy r to z (no preconditioning) + } TOCK(t5); // Preconditioner apply time // Qk = A*z - TICK(); ComputeSPMV(A_lo, z, Qk); TOCK(t3); + TICK(); ComputeSPMV(A_lo, z, Qk); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); - // MGS to orthogonalize z against Q(:,0:k-1), using dots - for (int j = 0; j < k; j++) { - // get j-th column of Q - GetVector(Q, j, Qj); - - alpha = zero; - for (int i = 0; i < 2; i++) { - // beta = Qk'*Qj - TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); - - // Qk = Qk - beta * Qj - TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); - alpha += beta; + // orthogonalize z against Q(:,0:k-1), using dots + bool use_mgs = false; + TICK(); + if (use_mgs) { + // MGS2 + for (int j = 0; j < k; j++) { + // get j-th column of Q + GetVector(Q, j, Qj); + + alpha = zero; + for (int i = 0; i < 2; i++) { + // beta = Qk'*Qj + TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); + + // Qk = Qk - beta * Qj + TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); + alpha += beta; + } + SetMatrixValue(H, j, k-1, alpha); + } + } else { + // CGS2 + GetMultiVector(Q, 0, k-1, P); + ComputeGEMVT (nrow, k, one, P, Qk, zero, 
h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + SetMatrixValue(H, i, k-1, h.values[i]); + } + // reorthogonalize + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + AddMatrixValue(H, i, k-1, h.values[i]); } - SetMatrixValue(H, j, k-1, alpha); } + TOCK(t6); // Ortho time + flops_orth += (2*k*Nrow); + // beta = norm(Qk) TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); TOCK(t1); beta = sqrt(beta); // Qk = Qk / beta - TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qk, one/beta); *flops += Nrow; TOCK(t2); SetMatrixValue(H, k, k-1, beta); // Given's rotation @@ -234,50 +265,28 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, SetMatrixValue(cs, k-1, 0, cj); normr = std::abs(v2); - #ifdef HPCG_DEBUG - if (A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) - HPCG_fout << "GMRES_IR Iteration = "<< k << " (" << niters << ") Scaled Residual = " - << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) { + HPCG_fout << "GMRES_IR Iteration = "<< k << " (" << niters << ") Scaled Residual = " + << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; + } niters ++; k ++; } // end of restart-cycle // prepare to restart - #ifdef HPCG_DEBUG - if (A.geom->rank==0) - HPCG_fout << "GMRES_IR restart: k = "<< k << " (" << niters << ")" << std::endl; - #endif + if (verbose && A.geom->rank==0) + HPCG_fout << "GMRES_IR restart: k = "<< k << " (" << niters << ")" << std::endl; // > update 
x -#if 0 -printf( "\n k = %d\n",k ); -printf( "t=[\n" ); -for (int i = 0; i < k; i++) printf( "%e\n",t.values[i]); -printf("];\n\n"); - -printf( "R=[\n" ); -for (int i = 0; i < k; i++) { - for (int j = 0; j < k; j++) printf("%e ",H.values[i + j * H.m] ); - printf("\n"); -} -printf("];\n\n"); - -if (niters == 1) { - printf( " nrow = %d, max_iter = %d\n",nrow,max_iter ); - printf( " Q = [\n" ); - for (int i = 0; i < nrow; i++) { - for (int j = 0; j <= k-1; j++) printf( "%e ",Q.values[i + j * nrow] ); - printf("\n"); - } - printf( " ];\n\n" ); -} -#endif ComputeTRSM(k-1, one, H, t); if (doPreconditioning) { - ComputeGEMV (nrow, k-1, one, Q, t, zero, r); // r = Q*t - ComputeMG(A_lo, r, z, symmetric); // z = M*r - TICK(); ComputeWAXPBY(nrow, one_hi, x_hi, one, z, x_hi, A.isWaxpbyOptimized); TOCK(t2); // x += z + ComputeGEMV (nrow, k-1, one, Q, t, zero, r, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // r = Q*t + TICK(); + ComputeMG(A_lo, r, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // z = M*r + TOCK(t5); // Preconditioner apply time + // mixed-precision + TICK(); ComputeWAXPBY(nrow, one_hi, x_hi, one, z, x_hi, A.isWaxpbyOptimized); *flops += (2*Nrow); TOCK(t2); // x += z } else { - ComputeGEMV (nrow, k-1, one_hi, Q, t, one_hi, x_hi); // x += Q*t + // mixed-precision + ComputeGEMV (nrow, k-1, one_hi, Q, t, one_hi, x_hi, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // x += Q*t } } // end of outer-loop @@ -288,10 +297,29 @@ if (niters == 1) { times[3] += t3; // SPMV time times[4] += t4; // AllReduce time times[5] += t5; // preconditioner apply time -//#ifndef HPCG_NO_MPI -// times[6] += t6; // exchange halo time -//#endif times[0] += mytimer() - t_begin; // Total time. All done... 
+ if (verbose && A.geom->rank==0) { + HPCG_fout << " > nnz(A) : " << A.totalNumberOfNonzeros << std::endl; + HPCG_fout << " > nnz(MG) : " << A.totalNumberOfMGNonzeros << " (" << numSpMVs_MG << ")" << std::endl; + HPCG_fout << " > SpMV : " << (flops_spmv / 1000000000.0) << " / " << t3 << " = " + << (flops_spmv / 1000000000.0) / t3 << " Gflop/s" << std::endl; + HPCG_fout << " > GMG : " << (flops_gmg / 1000000000.0) << " / " << t5 << " = " + << (flops_gmg / 1000000000.0) / t5 << " Gflop/s" << std::endl; + HPCG_fout << " > Orth : " << (flops_orth / 1000000000.0) << " / " << t6 << " = " + << (flops_orth / 1000000000.0) / t6 << " Gflop/s" << std::endl; + HPCG_fout << std::endl; + } + *flops += flops_gmg; + *flops += flops_spmv; + *flops += flops_orth; + + DeleteDenseMatrix(H); + DeleteDenseMatrix(t); + DeleteDenseMatrix(h); + DeleteDenseMatrix(cs); + DeleteDenseMatrix(ss); + DeleteMultiVector(Q); + return 0; } @@ -304,17 +332,17 @@ if (niters == 1) { template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, float, int&, float&, float&, double*, bool); + const int, const int, float, int&, float&, float&, double*, double*, bool); // mixed template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); diff --git a/src/GMRES_IR.hpp b/src/GMRES_IR.hpp index a0e095b7..df6ea326 100644 --- 
a/src/GMRES_IR.hpp +++ b/src/GMRES_IR.hpp @@ -26,7 +26,7 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, const Vector_type & b_hi, Vector_type & x_hi, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * times, bool doPreconditioning); + double * times, double *flops, bool doPreconditioning); // this function will compute the Conjugate Gradient iterations. // geom - Domain and processor topology information diff --git a/src/GenerateGeometry.cpp b/src/GenerateGeometry.cpp index 9b33dfbb..90e9d94a 100644 --- a/src/GenerateGeometry.cpp +++ b/src/GenerateGeometry.cpp @@ -27,7 +27,7 @@ #ifdef HPCG_DEBUG #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" using std::endl; #endif diff --git a/src/GenerateNonsymProblem.cpp b/src/GenerateNonsymProblem.cpp index 7f46497f..f0880b47 100644 --- a/src/GenerateNonsymProblem.cpp +++ b/src/GenerateNonsymProblem.cpp @@ -49,8 +49,11 @@ void GenerateNonsymProblem(SparseMatrix_type & A, Vector_type * b, Vector_type * // Furthermore, any code must work for general unstructured sparse matrices. Special knowledge about the // specific nature of the sparsity pattern may not be explicitly used. 
- return(GenerateNonsymProblem_v1_ref(A, b, x, xexact, init_vect)); - //return(GenerateNonsymProblem_ref(A, b, x, xexact, init_vect)); + #if 1 + return GenerateNonsymProblem_v1_ref(A, b, x, xexact, init_vect); + #else + return GenerateNonsymProblem_ref(A, b, x, xexact, init_vect); + #endif } diff --git a/src/GenerateNonsymProblem_ref.cpp b/src/GenerateNonsymProblem_ref.cpp new file mode 100644 index 00000000..27284fdc --- /dev/null +++ b/src/GenerateNonsymProblem_ref.cpp @@ -0,0 +1,271 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file GenerateProblem_ref.cpp + + HPCG routine + */ + +#ifndef HPCG_NO_MPI +#include +#endif + +#ifndef HPCG_NO_OPENMP +#include +#endif + +#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) +#include +using std::endl; +#include "hpcg.hpp" +#endif +#include +#include + +#include "GenerateNonsymProblem_ref.hpp" + + +/*! + Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact solution. 
+ + @param[in] A The known system matrix + @param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry) + @param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry) + @param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0 non-zero on entry) + + @see GenerateGeometry +*/ + +template +void GenerateNonsymProblem_ref(SparseMatrix_type & A, Vector_type * b, Vector_type * x, Vector_type * xexact, bool init_vect) { + + typedef typename SparseMatrix_type::scalar_type matrix_scalar_type; + typedef typename Vector_type::scalar_type vector_scalar_type; + const matrix_scalar_type zero (0.0); + const matrix_scalar_type one (1.0); + const vector_scalar_type two = one + one; + + // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations + // below may result in global range values. + global_int_t nx = A.geom->nx; + global_int_t ny = A.geom->ny; + global_int_t nz = A.geom->nz; + global_int_t gnx = A.geom->gnx; + global_int_t gny = A.geom->gny; + global_int_t gnz = A.geom->gnz; + global_int_t gix0 = A.geom->gix0; + global_int_t giy0 = A.geom->giy0; + global_int_t giz0 = A.geom->giz0; + + local_int_t localNumberOfRows = nx*ny*nz; // This is the size of our subblock + // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long + assert(localNumberOfRows>0); // Throw an exception of the number of rows is less than zero (can happen if int overflow) + local_int_t numberOfNonzerosPerRow = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil + + global_int_t totalNumberOfRows = gnx*gny*gnz; // Total number of grid points in mesh + // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long + assert(totalNumberOfRows>0); // Throw an exception of the number of rows is less than zero (can 
happen if int overflow) + + + // Allocate arrays that are of length localNumberOfRows + char * nonzerosInRow = new char[localNumberOfRows]; + global_int_t ** mtxIndG = new global_int_t*[localNumberOfRows]; + local_int_t ** mtxIndL = new local_int_t*[localNumberOfRows]; + matrix_scalar_type ** matrixValues = new matrix_scalar_type*[localNumberOfRows]; + matrix_scalar_type ** matrixDiagonal = new matrix_scalar_type*[localNumberOfRows]; + + vector_scalar_type * bv = 0; + vector_scalar_type * xv = 0; + vector_scalar_type * xexactv = 0; + if (init_vect) { + InitializeVector(*b, localNumberOfRows); + InitializeVector(*x, localNumberOfRows); + InitializeVector(*xexact, localNumberOfRows); + bv = b->values; // Only compute exact solution if requested + xv = x->values; // Only compute exact solution if requested + xexactv = xexact->values; // Only compute exact solution if requested + } + A.localToGlobalMap.resize(localNumberOfRows); + + // Use a parallel loop to do initial assignment: + // distributes the physical placement of arrays of pointers across the memory system +#ifndef HPCG_NO_OPENMP + #pragma omp parallel for +#endif + for (local_int_t i=0; i< localNumberOfRows; ++i) { + matrixValues[i] = 0; + matrixDiagonal[i] = 0; + mtxIndG[i] = 0; + mtxIndL[i] = 0; + } + +#ifndef HPCG_CONTIGUOUS_ARRAYS + // Now allocate the arrays pointed to + for (local_int_t i=0; i< localNumberOfRows; ++i) + mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow]; + for (local_int_t i=0; i< localNumberOfRows; ++i) + matrixValues[i] = new matrix_scalar_type[numberOfNonzerosPerRow]; + for (local_int_t i=0; i< localNumberOfRows; ++i) + mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow]; + +#else + // Now allocate the arrays pointed to + mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow]; + matrixValues[0] = new matrix_scalar_type[localNumberOfRows * numberOfNonzerosPerRow]; + mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow]; + + for (local_int_t 
i=1; i< localNumberOfRows; ++i) { + mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow; + matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow; + mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow; + } +#endif + + matrix_scalar_type beta (1.0); + matrix_scalar_type gamma (10.0); //one; + local_int_t localNumberOfNonzeros = 0; + // TODO: This triply nested loop could be flattened or use nested parallelism +#ifndef HPCG_NO_OPENMP + #pragma omp parallel for +#endif + for (local_int_t iz=0; iz-1 && giz+sz-1 && giy+sy-1 && gix+sxrank << " of " << A.geom->size <<" has " << localNumberOfRows << " rows." << endl + << "Process " << A.geom->rank << " of " << A.geom->size <<" has " << localNumberOfNonzeros<< " nonzeros." <0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow) + + A.title = 0; + A.totalNumberOfRows = totalNumberOfRows; + A.totalNumberOfNonzeros = totalNumberOfNonzeros; + A.localNumberOfRows = localNumberOfRows; + A.localNumberOfColumns = localNumberOfRows; + A.localNumberOfNonzeros = localNumberOfNonzeros; + A.nonzerosInRow = nonzerosInRow; + A.mtxIndG = mtxIndG; + A.mtxIndL = mtxIndL; + A.matrixValues = matrixValues; + A.matrixDiagonal = matrixDiagonal; + + return; +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + + +// mixed +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + diff --git a/src/GenerateNonsymProblem_v1_ref.cpp b/src/GenerateNonsymProblem_v1_ref.cpp index 0e0a3723..447057ad 100644 --- a/src/GenerateNonsymProblem_v1_ref.cpp +++ b/src/GenerateNonsymProblem_v1_ref.cpp @@ -29,7 +29,7 @@ #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) #include using 
std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include diff --git a/src/hpgmp.hpp b/src/Hpgmp_Params.hpp similarity index 70% rename from src/hpgmp.hpp rename to src/Hpgmp_Params.hpp index c2a4b830..185ef78e 100644 --- a/src/hpgmp.hpp +++ b/src/Hpgmp_Params.hpp @@ -13,13 +13,13 @@ //@HEADER /*! - @file hpgmp.hpp + @file Hpgmp_Params.hpp HPCG data structures and functions */ -#ifndef HPCG_HPP -#define HPCG_HPP +#ifndef HPGMP_PARAMS_HPP +#define HPGMP_PARAMS_HPP #include #include "Geometry.hpp" @@ -30,13 +30,13 @@ struct HPCG_Params_STRUCT { int comm_size; //!< Number of MPI processes in MPI_COMM_WORLD int comm_rank; //!< This process' MPI rank in the range [0 to comm_size - 1] int numThreads; //!< This process' number of threads - local_int_t nx; //!< Number of processes in x-direction of 3D process grid - local_int_t ny; //!< Number of processes in y-direction of 3D process grid - local_int_t nz; //!< Number of processes in z-direction of 3D process grid + local_int_t nx; //!< Number of x-direction grid points for each local subdomain + local_int_t ny; //!< Number of y-direction grid points for each local subdomain + local_int_t nz; //!< Number of z-direction grid points for each local subdomain int runningTime; //!< Number of seconds to run the timed portion of the benchmark - int npx; //!< Number of x-direction grid points for each local subdomain - int npy; //!< Number of y-direction grid points for each local subdomain - int npz; //!< Number of z-direction grid points for each local subdomain + int npx; //!< Number of processes in z-direction of 3D process grid + int npy; //!< Number of processes in z-direction of 3D process grid + int npz; //!< Number of processes in z-direction of 3D process grid int pz; //!< Partition in the z processor dimension, default is npz local_int_t zl; //!< nz for processors in the z dimension with value less than pz local_int_t zu; //!< nz for processors in the z dimension with value greater than pz diff 
--git a/src/MGData.hpp b/src/MGData.hpp index cdcd6717..c6b6c98e 100644 --- a/src/MGData.hpp +++ b/src/MGData.hpp @@ -25,6 +25,11 @@ #include "SparseMatrix.hpp" #include "Vector.hpp" +#ifdef HPCG_WITH_CUDA +#include +#include +#endif + template class MGData { public: @@ -40,6 +45,13 @@ class MGData { used inside optimized ComputeSPMV(). */ void * optimizationData; + #ifdef HPCG_WITH_CUDA + // to store the restrictiion as CRS matrix on device + cusparseMatDescr_t descrA; + int *d_row_ptr; + int *d_col_idx; + SC *d_nzvals; //!< values of matrix entries + #endif }; /*! diff --git a/src/MultiVector.hpp b/src/MultiVector.hpp index b04beae2..3c8536ef 100644 --- a/src/MultiVector.hpp +++ b/src/MultiVector.hpp @@ -21,6 +21,13 @@ #ifndef MULTIVECTOR_HPP #define MULTIVECTOR_HPP +#if defined(HPCG_WITH_CUDA) + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif #include #include #include "Vector.hpp" @@ -32,6 +39,14 @@ class MultiVector { local_int_t n; //!< number of vectors local_int_t localLength; //!< length of local portion of the vector SC * values; //!< array of values + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + SC * d_values; //!< array of values + #if defined(HPCG_WITH_CUDA) + cublasHandle_t handle; + #elif defined(HPCG_WITH_HIP) + rocblas_handle handle; + #endif + #endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -54,6 +69,21 @@ inline void InitializeMultiVector(MultiVector_type & V, local_int_t localLength, V.localLength = localLength; V.n = n; V.values = new scalar_type[localLength * n]; + #if defined(HPCG_WITH_CUDA) + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&V.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&V.d_values, (localLength*n)*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (rocblas_status_success != rocblas_create_handle(&V.handle)) { + printf( " InitializeMultiVector :: Failed to create Handle\n" ); + } + if (hipSuccess != hipMalloc ((void**)&V.d_values, (localLength*n)*sizeof(scalar_type))) { + printf( " InitializeMultiVector :: Failed to allocate d_values\n" ); + } + #endif V.optimizationData = 0; return; } @@ -76,6 +106,21 @@ inline void ZeroMultiVector(MultiVector_type & V) { return; } +/*! + @param[inout] v - On entrance v is initialized, on exit all its values are zero. + */ +template +inline void GetMultiVector(MultiVector_type & V, local_int_t j1, local_int_t j2, MultiVector_type & Vj) { + Vj.n = j2-j1+1; + Vj.localLength = V.localLength; + Vj.values = &V.values[V.localLength*j1]; + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + Vj.d_values = &V.d_values[V.localLength*j1]; + Vj.handle = V.handle; + #endif + return; +} + /*! @param[inout] v - On entrance v is initialized, on exit all its values are zero. 
*/ @@ -83,6 +128,10 @@ template inline void GetVector(MultiVector_type & V, local_int_t j, Vector_type & vj) { vj.localLength = V.localLength; vj.values = &V.values[V.localLength*j]; + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + vj.d_values = &V.d_values[V.localLength*j]; + vj.handle = V.handle; + #endif return; } @@ -96,6 +145,13 @@ inline void DeleteMultiVector(MultiVector_type & V) { delete [] V.values; V.localLength = 0; + #if defined(HPCG_WITH_CUDA) + cudaFree (V.d_values); + cublasDestroy(V.handle); + #elif defined(HPCG_WITH_HIP) + hipFree(V.d_values); + rocblas_destroy_handle(V.handle); + #endif V.n = 0; return; } diff --git a/src/OptimizeProblem.cpp b/src/OptimizeProblem.cpp index 63d677da..594412d8 100644 --- a/src/OptimizeProblem.cpp +++ b/src/OptimizeProblem.cpp @@ -18,6 +18,14 @@ HPCG routine */ +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include "OptimizeProblem.hpp" /*! Optimizes the data structures used for CG iteration to increase the @@ -97,6 +105,357 @@ int OptimizeProblem(SparseMatrix_type & A, CGData_type & data, Vector_type & b, colors[i] = counters[colors[i]]++; #endif +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + { + typedef typename SparseMatrix_type::scalar_type SC; + + SparseMatrix_type * curLevelMatrix = &A; + do { + // ------------------------- + // form CSR on host + const local_int_t nrow = curLevelMatrix->localNumberOfRows; + const local_int_t ncol = curLevelMatrix->localNumberOfColumns; + global_int_t nnzL = 0; + global_int_t nnz = curLevelMatrix->localNumberOfNonzeros; + int *h_row_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_col_ind = (int*)malloc( nnz * sizeof(int)); + SC *h_nzvals = (SC *)malloc( nnz * sizeof(SC)); + + nnz = 0; + h_row_ptr[0] = 0; + for (local_int_t i=0; imatrixValues[i]; + const local_int_t * const cur_inds = curLevelMatrix->mtxIndL[i]; + + const int cur_nnz = curLevelMatrix->nonzerosInRow[i]; + for (int j=0; jd_row_ptr), 
(nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nrow=%d)\n",nrow ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_col_idx), nnz*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nnz=%d)\n",nnz ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_nzvals), nnz*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nnz=%d)\n",nnz ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_row_ptr, h_row_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_col_idx, h_col_ind, nnz*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_nzvals, h_nzvals, nnz*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_row_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nrow=%d)\n",nrow ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_col_idx), nnz*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nnz=%d)\n",nnz ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_nzvals), nnz*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nnz=%d)\n",nnz ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_row_ptr, h_row_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_col_idx, h_col_ind, nnz*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_nzvals, h_nzvals, nnz*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #endif + + // free matrix on host + free(h_row_ptr); + free(h_col_ind); + free(h_nzvals); + + // 
------------------------- + // Extract lower/upper-triangular matrix + global_int_t nnzU = nnz-nnzL; + int *h_Lrow_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_Lcol_ind = (int*)malloc( nnzL * sizeof(int)); + SC *h_Lnzvals = (SC *)malloc( nnzL * sizeof(SC)); + int *h_Urow_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_Ucol_ind = (int*)malloc( nnzU * sizeof(int)); + SC *h_Unzvals = (SC *)malloc( nnzU * sizeof(SC)); + nnzL = 0; + nnzU = 0; + h_Lrow_ptr[0] = 0; + h_Urow_ptr[0] = 0; + for (local_int_t i=0; imatrixValues[i]; + const local_int_t * const cur_inds = curLevelMatrix->mtxIndL[i]; + + const int cur_nnz = curLevelMatrix->nonzerosInRow[i]; + for (int j=0; jnnzL = nnzL; + curLevelMatrix->nnzU = nnzU; + + // copy CSR(L) to device + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lrow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lcol_idx), nnzL*sizeof(int))) { + printf( " Failed to allocate A.d_Lcol_idx\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lnzvals), nnzL*sizeof(SC))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lrow_ptr, h_Lrow_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lcol_idx, h_Lcol_ind, nnzL*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lcol_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lnzvals, h_Lnzvals, nnzL*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lrow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lcol_idx), 
nnzL*sizeof(int))) { + printf( " Failed to allocate A.d_Lcol_idx\n" ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lnzvals), nnzL*sizeof(SC))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lrow_ptr, h_Lrow_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lcol_idx, h_Lcol_ind, nnzL*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lcol_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lnzvals, h_Lnzvals, nnzL*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + #endif + + // copy CSR(U) to device + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Urow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Urow_ptr(nrow=%d)\n",nrow ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Ucol_idx), nnzU*sizeof(int))) { + printf( " Failed to allocate A.d_Ucol_idx(nnzU=%d)\n",nnzU ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Unzvals), nnzU*sizeof(SC))) { + printf( " Failed to allocate A.d_Urow_ptr(nnzU=%d)\n",nnzU ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Urow_ptr, h_Urow_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Ucol_idx, h_Ucol_ind, nnzU*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Ucol_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Unzvals, h_Unzvals, nnzU*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Urow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Urow_ptr(nrow=%d)\n",nrow ); + } + if (hipSuccess != 
hipMalloc ((void**)&(curLevelMatrix->d_Ucol_idx), nnzU*sizeof(int))) { + printf( " Failed to allocate A.d_Ucol_idx(nnzU=%d)\n",nnzU ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Unzvals), nnzU*sizeof(SC))) { + printf( " Failed to allocate A.d_Urow_ptr(nnzU=%d)\n",nnzU ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Urow_ptr, h_Urow_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Ucol_idx, h_Ucol_ind, nnzU*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Ucol_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Unzvals, h_Unzvals, nnzU*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + #endif + + // free matrix on host + free(h_Lrow_ptr); + free(h_Lcol_ind); + free(h_Lnzvals); + free(h_Urow_ptr); + free(h_Ucol_ind); + free(h_Unzvals); + + #if defined(HPCG_WITH_CUDA) + // ------------------------- + // create Handle (for each matrix) + cusparseCreate(&(curLevelMatrix->cusparseHandle)); + + // ------------------------- + // descriptor for A + cusparseCreateMatDescr(&(curLevelMatrix->descrA)); + cusparseSetMatType(curLevelMatrix->descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->descrA, CUSPARSE_INDEX_BASE_ZERO); + + // ------------------------- + // run analysis for triangular solve + cusparseCreateMatDescr(&(curLevelMatrix->descrL)); + cusparseCreateSolveAnalysisInfo(&(curLevelMatrix->infoL)); + cusparseSetMatType(curLevelMatrix->descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR); + cusparseSetMatIndexBase(curLevelMatrix->descrL, CUSPARSE_INDEX_BASE_ZERO); + if (std::is_same::value) { + cusparseDcsrsv_analysis(curLevelMatrix->cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, nrow, nnzL, + curLevelMatrix->descrL, + (double *)curLevelMatrix->d_Lnzvals, curLevelMatrix->d_Lrow_ptr, curLevelMatrix->d_Lcol_idx, + curLevelMatrix->infoL); + } else if 
(std::is_same::value) { + cusparseScsrsv_analysis(curLevelMatrix->cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, nrow, nnzL, + curLevelMatrix->descrL, + (float *)curLevelMatrix->d_Lnzvals, curLevelMatrix->d_Lrow_ptr, curLevelMatrix->d_Lcol_idx, + curLevelMatrix->infoL); + } + + // ------------------------- + // descriptor for U + cusparseCreateMatDescr(&(curLevelMatrix->descrU)); + cusparseSetMatType(curLevelMatrix->descrU, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->descrU, CUSPARSE_INDEX_BASE_ZERO); + #elif defined(HPCG_WITH_HIP) + #endif + + if (curLevelMatrix->mgData!=0) { + // ------------------------- + // store restriction as CRS + local_int_t * f2c = curLevelMatrix->mgData->f2cOperator; + local_int_t nc = curLevelMatrix->mgData->rc->localLength; + h_row_ptr = (int*)malloc((nc+1)* sizeof(int)); + h_col_ind = (int*)malloc( nc * sizeof(int)); + h_nzvals = (SC *)malloc( nc * sizeof(SC)); + + h_row_ptr[0] = 0; + for (local_int_t i=0; imgData->d_row_ptr), (nc+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nc=%d)\n",nc ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->mgData->d_col_idx), nc*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nc=%d)\n",nc ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->mgData->d_nzvals), nc*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nc=%d)\n",nc ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_row_ptr, h_row_ptr, (nc+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_col_idx, h_col_ind, nc*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_nzvals, h_nzvals, nc*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc 
((void**)&(curLevelMatrix->mgData->d_row_ptr), (nc+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nc=%d)\n",nc ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->mgData->d_col_idx), nc*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nc=%d)\n",nc ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->mgData->d_nzvals), nc*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nc=%d)\n",nc ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_row_ptr, h_row_ptr, (nc+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_col_idx, h_col_ind, nc*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_nzvals, h_nzvals, nc*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #endif + + // ------------------------- + // descriptor for restrictor + #if defined(HPCG_WITH_CUDA) + cusparseCreateMatDescr(&(curLevelMatrix->mgData->descrA)); + cusparseSetMatType(curLevelMatrix->mgData->descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->mgData->descrA, CUSPARSE_INDEX_BASE_ZERO); + #elif defined(HPCG_WITH_HIP) + #endif + + // free matrix on host + free(h_row_ptr); + free(h_col_ind); + free(h_nzvals); + } //A.mgData!=0 + + // for debuging, TODO: remove these + InitializeVector(curLevelMatrix->x, nrow); + InitializeVector(curLevelMatrix->y, ncol); + + // next matrix + curLevelMatrix = curLevelMatrix->Ac; + } while (curLevelMatrix != 0); + } + { + typedef typename Vector_type::scalar_type vector_SC; + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMemcpy(b.d_values, b.values, (b.localLength)*sizeof(vector_SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy b\n" ); + } + if (cudaSuccess != cudaMemcpy(x.d_values, x.values, (x.localLength)*sizeof(vector_SC), 
cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy x\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMemcpy(b.d_values, b.values, (b.localLength)*sizeof(vector_SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy b\n" ); + } + if (hipSuccess != hipMemcpy(x.d_values, x.values, (x.localLength)*sizeof(vector_SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy x\n" ); + } + #endif + } +#endif + return 0; } @@ -128,3 +487,9 @@ int OptimizeProblem< SparseMatrix, CGData, Vector > template double OptimizeProblemMemoryUse< SparseMatrix > (SparseMatrix const&); + +// mixed-precision +template +int OptimizeProblem< SparseMatrix, CGData, Vector > + (SparseMatrix&, CGData&, Vector&, Vector&, Vector&); + diff --git a/src/ReportResults.cpp b/src/ReportResults.cpp index 3eaed87b..99dcb4ee 100644 --- a/src/ReportResults.cpp +++ b/src/ReportResults.cpp @@ -31,7 +31,7 @@ #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif /*! diff --git a/src/SerialDenseMatrix.hpp b/src/SerialDenseMatrix.hpp index 02162fdc..2faa13a1 100644 --- a/src/SerialDenseMatrix.hpp +++ b/src/SerialDenseMatrix.hpp @@ -21,17 +21,25 @@ #ifndef SERIAL_DENSE_MATRIX_HPP #define SERIAL_DENSE_MATRIX_HPP +#include #include #include +#ifdef HPCG_WITH_CUDA + #include + #include +#endif template class SerialDenseMatrix { public: typedef SC scalar_type; - local_int_t m; //!< number of rows - local_int_t n; //!< number of columns + local_int_t m; //!< number of rows + local_int_t n; //!< number of columns SC * values; //!< array of values +#ifdef HPCG_WITH_CUDA + SC * d_values; //!< array of values +#endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -54,6 +62,11 @@ inline void InitializeMatrix(SerialDenseMatrix_type & A, local_int_t m, local_in A.m = m; A.n = n; A.values = new scalar_type[m*n]; +#ifdef HPCG_WITH_CUDA + if (cudaSuccess != cudaMalloc ((void**)&A.d_values, m*n*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } +#endif A.optimizationData = 0; return; } @@ -101,6 +114,18 @@ inline void CopyMatrix(const SerialDenseMatrix_type & A, SerialDenseMatrix_type return; } +template +inline void AddMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j, typename SerialDenseMatrix_type::scalar_type value) { + + typedef typename SerialDenseMatrix_type::scalar_type scalar_type; + + assert(i>=0 && i < A.m); + assert(j>=0 && j < A.n); + scalar_type * vv = A.values; + vv[i + j*A.m] += value; + return; +} + template inline void SetMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j, typename SerialDenseMatrix_type::scalar_type value) { @@ -131,7 +156,7 @@ GetMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j) { @param[in] A the known system matrix */ template -inline void DeleteSerialDenseMatrix(SerialDenseMatrix_type & A) { +inline void DeleteDenseMatrix(SerialDenseMatrix_type & A) { delete [] A.values; A.m = 0; diff --git a/src/SetupHalo_ref.cpp b/src/SetupHalo_ref.cpp index 647f5df5..06527e72 100644 --- a/src/SetupHalo_ref.cpp +++ b/src/SetupHalo_ref.cpp @@ -31,7 +31,7 @@ #ifdef HPCG_DETAILED_DEBUG #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include #endif diff --git a/src/SetupProblem.cpp b/src/SetupProblem.cpp index 500b32c5..0735c128 100644 --- a/src/SetupProblem.cpp +++ b/src/SetupProblem.cpp @@ -40,9 +40,14 @@ void SetupProblem(int numberOfMgLevels, SparseMatrix_type & A, Geometry * geom, GenerateNonsymProblem(A, b, x, xexact, init_vect); SetupHalo(A); //TODO: This is currently called in main... Should it really be called in both places? Which one? 
+ A.localNumberOfMGNonzeros = A.localNumberOfNonzeros; + A.totalNumberOfMGNonzeros = A.totalNumberOfNonzeros; + SparseMatrix_type * curLevelMatrix = &A; for (int level = 1; level< numberOfMgLevels; ++level) { GenerateNonsymCoarseProblem(*curLevelMatrix); + A.localNumberOfMGNonzeros += curLevelMatrix->Ac->localNumberOfNonzeros; + A.totalNumberOfMGNonzeros += curLevelMatrix->Ac->totalNumberOfNonzeros; curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level } diff --git a/src/SparseMatrix.hpp b/src/SparseMatrix.hpp index 639438c2..4dc8cf1a 100644 --- a/src/SparseMatrix.hpp +++ b/src/SparseMatrix.hpp @@ -36,6 +36,11 @@ typedef std::map< global_int_t, local_int_t > GlobalToLocalMap; using GlobalToLocalMap = std::unordered_map< global_int_t, local_int_t >; #endif +#ifdef HPCG_WITH_CUDA +#include +#include +#endif + template class SparseMatrix { public: @@ -44,9 +49,11 @@ class SparseMatrix { Geometry * geom; //!< geometry associated with this matrix global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes + global_int_t totalNumberOfMGNonzeros; //!< total number of matrix nonzeros across all processes, for MG local_int_t localNumberOfRows; //!< number of rows local to this process local_int_t localNumberOfColumns; //!< number of columns local to this process local_int_t localNumberOfNonzeros; //!< number of nonzeros local to this process + local_int_t localNumberOfMGNonzeros; //!< number of nonzeros local to this process, for MG char * nonzerosInRow; //!< The number of nonzeros in a row will always be 27 or fewer global_int_t ** mtxIndG; //!< matrix indices as global values local_int_t ** mtxIndL; //!< matrix indices as local values @@ -58,6 +65,7 @@ class SparseMatrix { mutable bool isSpmvOptimized; mutable bool isMgOptimized; mutable bool isWaxpbyOptimized; + mutable bool isGemvOptimized; /*! 
This is for storing optimized data structres created in OptimizeProblem and used inside optimized ComputeSPMV(). @@ -76,6 +84,39 @@ class SparseMatrix { local_int_t * sendLength; //!< lenghts of messages sent to neighboring processes SC * sendBuffer; //!< send buffer for non-blocking sends #endif +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + #if defined(HPCG_WITH_CUDA) + cusparseHandle_t cusparseHandle; + cusparseMatDescr_t descrA; + #endif + + // to store the local matrix on device + int *d_row_ptr; + int *d_col_idx; + SC *d_nzvals; //!< values of matrix entries + + // to store the lower-triangular matrix on device + local_int_t nnzL; + #if defined(HPCG_WITH_CUDA) + cusparseMatDescr_t descrL; + cusparseSolveAnalysisInfo_t infoL; + #endif + int *d_Lrow_ptr; + int *d_Lcol_idx; + SC *d_Lnzvals; //!< values of matrix entries + // to store the strictly upper-triangular matrix on device + local_int_t nnzU; + #if defined(HPCG_WITH_CUDA) + cusparseMatDescr_t descrU; + #endif + int *d_Urow_ptr; + int *d_Ucol_idx; + SC *d_Unzvals; //!< values of matrix entries + + // TODO: remove + Vector x; // nrow + Vector y; // ncol +#endif }; /*! @@ -89,9 +130,11 @@ inline void InitializeSparseMatrix(SparseMatrix_type & A, Geometry * geom) { A.geom = geom; A.totalNumberOfRows = 0; A.totalNumberOfNonzeros = 0; + A.totalNumberOfMGNonzeros = 0; A.localNumberOfRows = 0; A.localNumberOfColumns = 0; A.localNumberOfNonzeros = 0; + A.localNumberOfMGNonzeros = 0; A.nonzerosInRow = 0; A.mtxIndG = 0; A.mtxIndL = 0; @@ -102,8 +145,9 @@ inline void InitializeSparseMatrix(SparseMatrix_type & A, Geometry * geom) { // functions that are meant to be optimized. 
A.isDotProductOptimized = true; A.isSpmvOptimized = true; - A.isMgOptimized = true; + A.isMgOptimized = true; A.isWaxpbyOptimized = true; + A.isGemvOptimized = true; #ifndef HPCG_NO_MPI A.numberOfExternalValues = 0; @@ -169,24 +213,61 @@ inline void DeleteMatrix(SparseMatrix_type & A) { delete [] A.mtxIndG[0]; delete [] A.mtxIndL[0]; #endif - if (A.title) delete [] A.title; - if (A.nonzerosInRow) delete [] A.nonzerosInRow; - if (A.mtxIndG) delete [] A.mtxIndG; - if (A.mtxIndL) delete [] A.mtxIndL; - if (A.matrixValues) delete [] A.matrixValues; - if (A.matrixDiagonal) delete [] A.matrixDiagonal; + if (A.title) delete [] A.title; + if (A.nonzerosInRow) delete [] A.nonzerosInRow; + if (A.mtxIndG) delete [] A.mtxIndG; + if (A.mtxIndL) delete [] A.mtxIndL; + if (A.matrixValues) delete [] A.matrixValues; + if (A.matrixDiagonal) delete [] A.matrixDiagonal; #ifndef HPCG_NO_MPI - if (A.elementsToSend) delete [] A.elementsToSend; - if (A.neighbors) delete [] A.neighbors; - if (A.receiveLength) delete [] A.receiveLength; + if (A.elementsToSend) delete [] A.elementsToSend; + if (A.neighbors) delete [] A.neighbors; + if (A.receiveLength) delete [] A.receiveLength; if (A.sendLength) delete [] A.sendLength; if (A.sendBuffer) delete [] A.sendBuffer; #endif - if (A.geom!=0) { DeleteGeometry(*A.geom); delete A.geom; A.geom = 0;} - if (A.Ac!=0) { DeleteMatrix(*A.Ac); delete A.Ac; A.Ac = 0;} // Delete coarse matrix - if (A.mgData!=0) { DeleteMGData(*A.mgData); delete A.mgData; A.mgData = 0;} // Delete MG data + if (A.geom!=0) { + DeleteGeometry(*A.geom); + delete A.geom; + A.geom = 0; + } + if (A.Ac!=0) { + // Delete coarse matrix + DeleteMatrix(*A.Ac); + delete A.Ac; + A.Ac = 0; + } + if (A.mgData!=0) { + // Delete MG data + DeleteMGData(*A.mgData); + delete A.mgData; + A.mgData = 0; + } + +#ifdef HPCG_WITH_CUDA + cudaFree (A.d_row_ptr); + cudaFree (A.d_col_idx); + cudaFree (A.d_nzvals); + + cudaFree (A.d_Lrow_ptr); + cudaFree (A.d_Lcol_idx); + cudaFree (A.d_Lnzvals); + + cudaFree 
(A.d_Urow_ptr); + cudaFree (A.d_Ucol_idx); + cudaFree (A.d_Unzvals); + + DeleteVector (A.x); + DeleteVector (A.y); + + cusparseDestroy(A.cusparseHandle); + cusparseDestroyMatDescr(A.descrA); + cusparseDestroyMatDescr(A.descrL); + cusparseDestroyMatDescr(A.descrU); + cusparseDestroySolveAnalysisInfo(A.infoL); +#endif return; } diff --git a/src/TestGMRES.cpp b/src/TestGMRES.cpp index cee9ae70..4fb07f21 100644 --- a/src/TestGMRES.cpp +++ b/src/TestGMRES.cpp @@ -30,7 +30,7 @@ #include using std::endl; #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "TestGMRES.hpp" #include "GMRES.hpp" @@ -54,13 +54,15 @@ using std::endl; template -int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data) { +int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, + TestCGData_type & testcg_data, bool test_diagonal_exaggeration, bool test_noprecond) { typedef typename SparseMatrix_type::scalar_type scalar_type; typedef typename SparseMatrix_type2::scalar_type scalar_type2; typedef Vector Vector_type2; // Use this array for collecting timing information + double flops; std::vector< double > times(8,0.0); // Temporary storage for holding original diagonal and RHS Vector_type origDiagA, exaggeratedDiagA, origB; @@ -76,29 +78,30 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da CopyVector(origDiagA2, exagDiagA2); CopyVector(b, origB); -#if 0 - if (A.geom->rank==0) HPCG_fout << std::endl << " ** skippping diagonal exaggeration ** " << std::endl << std::endl; -#else - // Modify the matrix diagonal to greatly exaggerate diagonal values. 
- // CG should converge in about 10 iterations for this problem, regardless of problem size - if (A.geom->rank==0) HPCG_fout << std::endl << " ** applying diagonal exaggeration ** " << std::endl << std::endl; - for (local_int_t i=0; i< A.localNumberOfRows; ++i) { - global_int_t globalRowID = A.localToGlobalMap[i]; - if (globalRowID<9) { - scalar_type scale = (globalRowID+2)*1.0e6; - scalar_type2 scale2 = (globalRowID+2)*1.0e6; - ScaleVectorValue(exaggeratedDiagA, i, scale); - ScaleVectorValue(exagDiagA2, i, scale2); - ScaleVectorValue(b, i, scale); - } else { - ScaleVectorValue(exaggeratedDiagA, i, 1.0e6); - ScaleVectorValue(exagDiagA2, i, 1.0e6); - ScaleVectorValue(b, i, 1.0e6); + // TODO: This should be moved to somewhere-else, e.g., SetupProblem + if (test_diagonal_exaggeration) { + // Modify the matrix diagonal to greatly exaggerate diagonal values. + // CG should converge in about 10 iterations for this problem, regardless of problem size + if (A.geom->rank==0) HPCG_fout << std::endl << " ** applying diagonal exaggeration ** " << std::endl << std::endl; + for (local_int_t i=0; i< A.localNumberOfRows; ++i) { + global_int_t globalRowID = A.localToGlobalMap[i]; + if (globalRowID<9) { + scalar_type scale = (globalRowID+2)*1.0e6; + scalar_type2 scale2 = (globalRowID+2)*1.0e6; + ScaleVectorValue(exaggeratedDiagA, i, scale); + ScaleVectorValue(exagDiagA2, i, scale2); + ScaleVectorValue(b, i, scale); + } else { + ScaleVectorValue(exaggeratedDiagA, i, 1.0e6); + ScaleVectorValue(exagDiagA2, i, 1.0e6); + ScaleVectorValue(b, i, 1.0e6); + } } + ReplaceMatrixDiagonal(A, exaggeratedDiagA); + ReplaceMatrixDiagonal(A_lo, exagDiagA2);//TODO probably some funny casting here... need to do properly. + } else { + if (A.geom->rank==0) HPCG_fout << std::endl << " ** skippping diagonal exaggeration ** " << std::endl << std::endl; } - ReplaceMatrixDiagonal(A, exaggeratedDiagA); - ReplaceMatrixDiagonal(A_lo, exagDiagA2);//TODO probably some funny casting here... need to do properly. 
-#endif int niters = 0; scalar_type normr (0.0); @@ -111,7 +114,7 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2 testcg_data.niters_max_no_prec = 0; testcg_data.niters_max_prec = 0; - for (int k=0; k<2; ++k) + for (int k=(test_noprecond ? 0 : 1); k<2; ++k) { // This loop tests both unpreconditioned and preconditioned runs int expected_niters = testcg_data.expected_niters_no_prec; if (k==1) expected_niters = testcg_data.expected_niters_prec; @@ -119,7 +122,7 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da ZeroVector(x); // Zero out x double time_tic = mytimer(); - int ierr = GMRES(A, data, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], k); + int ierr = GMRES(A, data, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], &flops, k==1); double time_solve = mytimer() - time_tic; if (ierr) HPCG_fout << "Error in call to GMRES: " << ierr << ".\n" << endl; if (niters <= expected_niters) { @@ -134,20 +137,22 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da HPCG_fout << "Call [" << i << "] Number of GMRES Iterations [" << niters <<"] Scaled Residual [" << normr/normr0 << "]" << endl; HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "." << endl; HPCG_fout << " Time " << time_solve << " seconds." << endl; + HPCG_fout << " Gflop/s " << flops/1000000000.0 << "/" << time_solve << " = " << (flops/1000000000.0)/time_solve + << " (n = " << A.totalNumberOfRows << ")" << endl; + HPCG_fout << " Time/itr " << time_solve / niters << endl; } } } #if 1 - //for (int k=0; k<2; ++k) - for (int k=1; k<2; ++k) + for (int k=(test_noprecond ? 
0 : 1); k<2; ++k) { // This loop tests both unpreconditioned and preconditioned runs int expected_niters = testcg_data.expected_niters_no_prec; if (k==1) expected_niters = testcg_data.expected_niters_prec; for (int i=0; i< numberOfCgCalls; ++i) { ZeroVector(x); // Zero out x double time_tic = mytimer(); - int ierr = GMRES_IR(A, A_lo, data, data_lo, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], k); + int ierr = GMRES_IR(A, A_lo, data, data_lo, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], &flops, k); double time_solve = mytimer() - time_tic; if (ierr) HPCG_fout << "Error in call to GMRES-IR: " << ierr << ".\n" << endl; if (niters <= expected_niters) { @@ -161,6 +166,9 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da HPCG_fout << "Call [" << i << "] Number of GMRES-IR Iterations [" << niters <<"] Scaled Residual [" << normr/normr0 << "]" << endl; HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "." << endl; HPCG_fout << " Time " << time_solve << " seconds." 
<< endl; + HPCG_fout << " Gflop/s " << flops/1000000000.0 << "/" << time_solve << " = " << (flops/1000000000.0)/time_solve + << " (n = " << A.totalNumberOfRows << ")" << endl; + HPCG_fout << " Time/itr " << time_solve / niters << endl; } } } @@ -172,6 +180,8 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da // Delete vectors DeleteVector(origDiagA); DeleteVector(exaggeratedDiagA); + DeleteVector(origDiagA2); + DeleteVector(exagDiagA2); DeleteVector(origB); testcg_data.normr = normr; @@ -179,8 +189,9 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da } template -int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data) { - TestGMRES(A, A, data, data, b, x, testcg_data); +int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond) { + return TestGMRES(A, A, data, data, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond); } @@ -191,25 +202,25 @@ int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector // uniform template int TestGMRES< SparseMatrix, CGData, Vector, TestCGData > - (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); template int TestGMRES< SparseMatrix, CGData, Vector, TestCGData > - (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); // uniform version template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, 
SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); // mixed version template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); diff --git a/src/TestGMRES.hpp b/src/TestGMRES.hpp index 1fa4a8cc..0f95a104 100644 --- a/src/TestGMRES.hpp +++ b/src/TestGMRES.hpp @@ -21,7 +21,7 @@ #ifndef TESTGMRES_HPP #define TESTGMRES_HPP -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SparseMatrix.hpp" #include "Vector.hpp" #include "CGData.hpp" @@ -39,10 +39,12 @@ class TestCGData { }; template -extern int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data); +extern int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond); template -extern int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data); +extern int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond); #endif // TESTGMRES_HPP diff --git a/src/TestSymmetry.cpp b/src/TestSymmetry.cpp index 6375eda0..30d7aafb 100644 --- a/src/TestSymmetry.cpp +++ b/src/TestSymmetry.cpp @@ -29,7 +29,7 @@ using std::endl; #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "ComputeSPMV.hpp" #include "ComputeMG.hpp" diff --git a/src/TestSymmetry.hpp b/src/TestSymmetry.hpp index 35b8ba50..d7dd2438 100644 --- a/src/TestSymmetry.hpp 
+++ b/src/TestSymmetry.hpp @@ -21,7 +21,7 @@ #ifndef TESTSYMMETRY_HPP #define TESTSYMMETRY_HPP -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SparseMatrix.hpp" #include "CGData.hpp" diff --git a/src/Utils_MPI.hpp b/src/Utils_MPI.hpp index 1715602e..9f85d624 100644 --- a/src/Utils_MPI.hpp +++ b/src/Utils_MPI.hpp @@ -4,6 +4,8 @@ #ifndef HPGMP_UTILS_MPI_HPP #define HPGMP_UTILS_MPI_HPP +#include + // MpiTypeTraits (from Teuchos) template class MpiTypeTraits { diff --git a/src/Vector.hpp b/src/Vector.hpp index 58fcbbda..d91e333f 100644 --- a/src/Vector.hpp +++ b/src/Vector.hpp @@ -24,6 +24,16 @@ #include #include #include +#include "Hpgmp_Params.hpp" + +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include "Geometry.hpp" template @@ -32,6 +42,14 @@ class Vector { typedef SC scalar_type; local_int_t localLength; //!< length of local portion of the vector SC * values; //!< array of values +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + SC * d_values; //!< array of values + #if defined(HPCG_WITH_CUDA) + cublasHandle_t handle; + #elif defined(HPCG_WITH_HIP) + rocblas_handle handle; + #endif +#endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -50,6 +68,21 @@ inline void InitializeVector(Vector_type & v, local_int_t localLength) { typedef typename Vector_type::scalar_type scalar_type; v.localLength = localLength; v.values = new scalar_type[localLength]; + #if defined(HPCG_WITH_CUDA) + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&v.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&v.d_values, localLength*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (rocblas_status_success != rocblas_create_handle(&v.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (hipSuccess != hipMalloc ((void**)&v.d_values, localLength*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #endif v.optimizationData = 0; return; } @@ -62,9 +95,17 @@ inline void InitializeVector(Vector_type & v, local_int_t localLength) { template inline void ZeroVector(Vector_type & v) { typedef typename Vector_type::scalar_type scalar_type; + const scalar_type zero (0.0); + local_int_t localLength = v.localLength; scalar_type * vv = v.values; - for (int i=0; i +inline void ScaleVectorValue(Vector_type & v, typename Vector_type::scalar_type value) { + typedef typename Vector_type::scalar_type scalar_type; + const scalar_type zero (0.0); + + local_int_t localLength = v.localLength; + scalar_type * vv = v.values; + if (value == zero) { + for (int i=0; i::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDscal (v.handle, localLength, (const double*)&value, (double*)d_vv, 1)) { + printf( " Failed cublasDscal\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSscal (v.handle, localLength, (const float*)&value, (float*)d_vv, 1)) { + printf( " Failed cublasSscal\n" ); + } + } +#endif + return; +} /*! Fill the input vector with pseudo-random values. 
@@ -109,7 +182,35 @@ inline void CopyVector(const Vector_src & v, Vector_dst & w) { assert(w.localLength >= localLength); scalar_src * vv = v.values; scalar_dst * wv = w.values; +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) for (int i=0; i::value) { + #ifdef HPCG_DEBUG + HPCG_fout << " CopyVector ( Unit-precision )" << std::endl; + #endif + if (cudaSuccess != cudaMemcpy(w.d_values, v.d_values, localLength*sizeof(scalar_src), cudaMemcpyDeviceToDevice)) { + printf( " CopyVector :: Failed to memcpy d_x\n" ); + } + } else { + HPCG_fout << " CopyVector :: Mixed-precision not supported" << std::endl; + + // Copy input vector to Host + if (cudaSuccess != cudaMemcpy(vv, v.d_values, localLength*sizeof(scalar_src), cudaMemcpyDeviceToHost)) { + printf( " CopyVector :: Failed to memcpy d_v\n" ); + } + + // Copy on Host + for (int i=0; i inline void DeleteVector(Vector_type & v) { delete [] v.values; + #if defined(HPCG_WITH_CUDA) + cudaFree(v.d_values); + cublasDestroy(v.handle); + #elif defined(HPCG_WITH_HIP) + hipFree(v.d_values); + rocblas_destroy_handle(v.handle); + #endif v.localLength = 0; return; } diff --git a/src/finalize.cpp b/src/finalize.cpp index f4b46e46..344f6fcb 100644 --- a/src/finalize.cpp +++ b/src/finalize.cpp @@ -14,7 +14,7 @@ #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" /*! Closes the I/O stream used for logging information throughout the HPCG run. 
diff --git a/src/init.cpp b/src/init.cpp index 1e23854b..df7cff3f 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -34,7 +34,7 @@ const char* NULLDEVICE="/dev/null"; #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "ReadHpcgDat.hpp" @@ -79,11 +79,13 @@ HPCG_Init(int * argc_p, char ** *argv_p, HPCG_Params & params) { iparams = (int *)malloc(sizeof(int) * nparams); // Initialize iparams - for (i = 0; i < nparams; ++i) iparams[i] = 0; + for (i = 0; i < nparams; ++i) + iparams[i] = 0; /* for sequential and some MPI implementations it's OK to read first three args */ for (i = 0; i < nparams; ++i) - if (argc <= i+1 || sscanf(argv[i+1], "%d", iparams+i) != 1 || iparams[i] < 10) iparams[i] = 0; + if (argc <= i+1 || sscanf(argv[i+1], "%d", iparams+i) != 1 || iparams[i] < 10) + iparams[i] = 0; /* for some MPI environments, command line arguments may get complicated so we need a prefix */ for (i = 1; i <= argc && argv[i]; ++i) @@ -94,8 +96,10 @@ HPCG_Init(int * argc_p, char ** *argv_p, HPCG_Params & params) { // Check if --rt was specified on the command line int * rt = iparams+3; // Assume runtime was not specified and will be read from the hpcg.dat file - if (iparams[3]) rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file + if (iparams[3]) + rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file if (! iparams[0] && ! iparams[1] && ! iparams[2]) { /* no geometry arguments on the command line */ + //TODO: If I am rank 0, then read the file. 
ReadHpcgDat(iparams, rt, iparams+7); broadcastParams = true; } diff --git a/src/main_hpgmp.cpp b/src/main_hpgmp.cpp index 13cbfa51..30b0f0a0 100644 --- a/src/main_hpgmp.cpp +++ b/src/main_hpgmp.cpp @@ -35,7 +35,7 @@ using std::endl; #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SetupProblem.hpp" #include "CheckAspectRatio.hpp" @@ -58,8 +58,6 @@ using std::endl; #include "GMRES.hpp" #include "TestGMRES.hpp" -#include "GenerateNonsymProblem.hpp" -#include "GenerateNonsymCoarseProblem.hpp" typedef double scalar_type; typedef Vector Vector_type; @@ -91,16 +89,83 @@ int main(int argc, char * argv[]) { MPI_Init(&argc, &argv); #endif + //Initialize params for full-scale benchmark run: HPCG_Params params; - HPCG_Init(&argc, &argv, params); + int size = params.comm_size; // Num MPI processes + int rank = params.comm_rank; // My process ID + +// **************************************************************************8 +// PHASE I: VERIFY CONVERGENCE WITH SMALL PROBLEM +// **************************************************************************8 + + MPI_Comm SM_COMM; + MPI_Group group_world; + MPI_Group sm_group; + if( size <= 64 ) //Use all MPI ranks + SM_COMM = MPI_COMM_WORLD; + else{ + // Get MPI Sub-communicator: + int num_ranks = 64; + int *process_ranks; + // make a list of processes in the new communicator + process_ranks = (int*) malloc(num_ranks*sizeof(int)); + for(int i = 0; i < num_ranks; i++) + process_ranks[i] = i; + //get the group under MPI_COMM_WORLD + MPI_Comm_group(MPI_COMM_WORLD, &group_world); + // create the new group + MPI_Group_incl(group_world, num_ranks, process_ranks, &sm_group); + // create the new communicator + MPI_Comm_create(MPI_COMM_WORLD, sm_group, &SM_COMM); + } + + HPCG_Params sm_params; + sm_params.nx = 32; + sm_params.ny = 32; + sm_params.nz = 32; + + sm_params.runningTime = 0.0005; //Something really small since we just want one run?? 
+ sm_params.pz = 0; + sm_params.zl = 0; + sm_params.zu = 0; + + sm_params.npx = 0; + sm_params.npy = 0; + sm_params.npz = 0; + +#ifndef HPCG_NO_MPI + MPI_Comm_rank( SM_COMM, &sm_params.comm_rank ); + MPI_Comm_size( SM_COMM, &sm_params.comm_size ); +#else + sm_params.comm_rank = 0; + sm_params.comm_size = 1; +#endif + +#ifdef HPCG_NO_OPENMP + sm_params.numThreads = 1; +#else + #pragma omp parallel + sm_params.numThreads = omp_get_num_threads(); +#endif + + // Construct the geometry and linear system + local_int_t nx,ny,nz; + sm_nx = (local_int_t)sm_params.nx; + sm_ny = (local_int_t)sm_params.ny; + sm_nz = (local_int_t)sm_params.nz; + Geometry * sm_geom = new Geometry; + GenerateGeometry(sm_params.comm_size, sm_params.comm_rank, sm_params.numThreads, sm_params.pz, + sm_params.zl, sm_params.zu, sm_nx, sm_ny, sm_nz, sm_params.npx, sm_params.npy, sm_params.npz, sm_geom); + +// **************************************************************************8 +// PHASE II: BENCHMARKING PHASE +// **************************************************************************8 // Check if QuickPath option is enabled. // If the running time is set to zero, we minimize all paths through the program bool quickPath = 1; //TODO: Change back to the following after=(params.runningTime==0); - int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID - #ifdef HPCG_DETAILED_DEBUG if (size < 100 && rank==0) HPCG_fout << "Process "<rank==0) { + HPCG_fout << " Setup Time " << setup_time << " seconds." << endl; + HPCG_fout << " Optimize Time " << t7 << " seconds." << endl; } - times[8] = (mytimer() - t_begin)/((double) numberOfCalls); // Total time divided by number of calls. 
-#ifdef HPCG_DEBUG - if (rank==0) HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl; -#endif - /////////////////////////////// // Reference GMRES Timing Phase // @@ -210,23 +252,21 @@ int main(int argc, char * argv[]) { scalar_type normr0 = 0.0; int restart_length = 50; int refMaxIters = 50; - numberOfCalls = 1; // Only need to run the residual reduction analysis once // Compute the residual reduction for the natural ordering and reference kernels + double flops = 0.0; std::vector< double > ref_times(9,0.0); scalar_type tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations int err_count = 0; - for (int i=0; i< numberOfCalls; ++i) { - ZeroVector(x); - ierr = GMRES(A, data, b, x, restart_length, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true); - if (ierr) ++err_count; // count the number of errors in GMRES. - totalNiters_ref += niters; - } + ZeroVector(x); + ierr = GMRES(A, data, b, x, restart_length, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], &flops, true); + if (ierr) ++err_count; // count the number of errors in GMRES. + totalNiters_ref += niters; if (rank == 0 && err_count) HPCG_fout << err_count << " error(s) in call(s) to reference GMRES." << endl; scalar_type refTolerance = normr / normr0; // Call user-tunable set up function. 
- double t7 = mytimer(); + t7 = mytimer(); OptimizeProblem(A, data, b, x, xexact); t7 = mytimer() - t7; times[7] = t7; @@ -248,6 +288,7 @@ int main(int argc, char * argv[]) { /* #ifdef HPCG_DEBUG t1 = mytimer(); + if (rank==0) HPCG_fout << endl << "Running Uniform-precision Test" << endl; #endif testcg_data.count_pass = testcg_data.count_fail = 0; TestGMRES(A, data, b, x, testcg_data); @@ -272,12 +313,24 @@ int main(int argc, char * argv[]) { SparseMatrix_type2 A2; CGData_type2 data2; SetupProblem(numberOfMgLevels, A2, geom, data2, &b, &x, &xexact, init_vect); + setup_time = mytimer() - setup_time; // Capture total time of setup + + t7 = mytimer(); + OptimizeProblem(A2, data, b, x, xexact); + t7 = mytimer() - t7; + testcg_data.count_pass = testcg_data.count_fail = 0; + if (A.geom->rank==0) { + HPCG_fout << " Setup Time " << setup_time << " seconds." << endl; + HPCG_fout << " Optimize Time " << t7 << " seconds." << endl; + } + bool test_diagonal_exaggeration = true; + bool test_noprecond = true; #ifdef HPCG_DEBUG t1 = mytimer(); #endif - TestGMRES(A, A2, data, data2, b, x, testcg_data); + TestGMRES(A, A2, data, data2, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond); #ifdef HPCG_DEBUG if (rank==0) HPCG_fout << "Total validation (mixed-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl; #endif @@ -309,8 +362,6 @@ int main(int argc, char * argv[]) { DeleteVector(x); DeleteVector(b); DeleteVector(xexact); - DeleteVector(x_overlap); - DeleteVector(b_computed); //delete [] testnorms_data.values; // Finish up diff --git a/src/main_time.cpp b/src/main_time.cpp new file mode 100644 index 00000000..c61aa201 --- /dev/null +++ b/src/main_time.cpp @@ -0,0 +1,265 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file main.cpp + + HPCG routine + */ + +// Main routine of a program that calls the HPCG conjugate gradient +// solver to solve the problem, and then prints results. + +#ifndef HPCG_NO_MPI +#include +#endif + +#include +#include +#include +#ifdef HPCG_DETAILED_DEBUG +using std::cin; +#endif +using std::endl; + +#include + +#include "Hpgmp_Params.hpp" + +#include "SetupProblem.hpp" +#include "CheckAspectRatio.hpp" +#include "GenerateGeometry.hpp" +#include "CheckProblem.hpp" +#include "OptimizeProblem.hpp" +#include "WriteProblem.hpp" +#include "ReportResults.hpp" +#include "mytimer.hpp" +#include "ComputeSPMV_ref.hpp" +#include "ComputeMG_ref.hpp" +#include "ComputeResidual.hpp" +#include "Geometry.hpp" +#include "SparseMatrix.hpp" +#include "Vector.hpp" +#include "CGData.hpp" +#include "TestNorms.hpp" + +#include "TestGMRES.hpp" + +typedef double scalar_type; +//typedef float scalar_type; +typedef Vector Vector_type; +typedef SparseMatrix SparseMatrix_type; +typedef CGData CGData_type; +typedef TestCGData TestCGData_type; +typedef TestNormsData TestNormsData_type; + +typedef float scalar_type2; +typedef Vector Vector_type2; +typedef SparseMatrix SparseMatrix_type2; +typedef CGData CGData_type2; +typedef TestCGData TestCGData_type2; +typedef TestNormsData TestNormsData_type2; + + +/*! + Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report results. + + @param[in] argc Standard argument count. Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in) + @param[in] argv Standard argument array. If argc==1, argv is unused. If argc==4, argv[1], argv[2], argv[3] will be interpreted as nx, ny, nz, resp. + + @return Returns zero on success and a non-zero value otherwise. 
+
+*/
+int main(int argc, char * argv[]) {
+
+#ifndef HPCG_NO_MPI
+  MPI_Init(&argc, &argv);
+#endif
+
+  HPCG_Params params;
+
+  HPCG_Init(&argc, &argv, params);
+
+  // Check if QuickPath option is enabled.
+  // If the running time is set to zero, we minimize all paths through the program
+  bool quickPath = (params.runningTime==0);
+
+  int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
+
+#ifdef HPCG_DETAILED_DEBUG
+  if (size < 100 && rank==0) HPCG_fout << "Process "<<rank<<" of "<<size<<" is alive with " << params.numThreads << " threads." <<endl;
+#endif
+
+  local_int_t nx,ny,nz;
+  nx = (local_int_t)params.nx;
+  ny = (local_int_t)params.ny;
+  nz = (local_int_t)params.nz;
+  int ierr = 0;  // Used to check return codes on function calls
+
+  ierr = CheckAspectRatio(0.125, nx, ny, nz, "local problem", rank==0);
+  if (ierr)
+    return ierr;
+
+  /////////////////////////
+  // Problem setup Phase //
+  /////////////////////////
+
+#ifdef HPCG_DEBUG
+  double t1 = mytimer();
+#endif
+
+  // Construct the geometry and linear system
+  Geometry * geom = new Geometry;
+  GenerateGeometry(size, rank, params.numThreads, params.pz, params.zl, params.zu, nx, ny, nz, params.npx, params.npy, params.npz, geom);
+
+  ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank==0);
+  if (ierr)
+    return ierr;
+
+  // Use this array for collecting timing information
+  std::vector< double > times(10,0.0);
+
+  double setup_time = mytimer();
+
+  // Setup the problem
+  SparseMatrix_type A;
+  CGData_type data;
+
+  bool init_vect = true;
+  Vector_type b, x, xexact;
+
+  int numberOfMgLevels = 4; // Number of levels including first
+  SetupProblem(numberOfMgLevels, A, geom, data, &b, &x, &xexact, init_vect);
+
+  setup_time = mytimer() - setup_time; // Capture total time of setup
+  times[9] = setup_time; // Save it for reporting
+
+  // Call user-tunable set up function.
+  double t7 = mytimer();
+  OptimizeProblem(A, data, b, x, xexact);
+  t7 = mytimer() - t7;
+  times[7] = t7;
+
+  if (A.geom->rank==0) {
+    HPCG_fout << " Setup Time " << setup_time << " seconds." << endl;
+    HPCG_fout << " Optimize Time " << t7 << " seconds." << endl;
+  }
+
+  ////////////////////////////////////
+  // Reference SpMV+MG Timing Phase //
+  ////////////////////////////////////
+
+  // Call Reference SpMV and MG.
Compute Optimization time as ratio of times in these routines + + local_int_t nrow = A.localNumberOfRows; + local_int_t ncol = A.localNumberOfColumns; + + Vector_type x_overlap, b_computed; + InitializeVector(x_overlap, ncol); // Overlapped copy of x vector + InitializeVector(b_computed, nrow); // Computed RHS vector + + + // Record execution time of reference SpMV and MG kernels for reporting times + // First load vector with random values + FillRandomVector(x_overlap); + +#if 0 + int numberOfCalls = 10; + if (quickPath) numberOfCalls = 1; //QuickPath means we do on one call of each block of repetitive code + double t_begin = mytimer(); + for (int i=0; i< numberOfCalls; ++i) { + ierr = ComputeSPMV_ref(A, x_overlap, b_computed); // b_computed = A*x_overlap + if (ierr) HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl; + ierr = ComputeMG_ref(A, b_computed, x_overlap); // b_computed = Minv*y_overlap + if (ierr) HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl; + } + times[8] = (mytimer() - t_begin)/((double) numberOfCalls); // Total time divided by number of calls. 
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+#endif
+
+
+  //////////////////////////////
+  // Validation Testing Phase //
+  //////////////////////////////
+
+  TestCGData_type testcg_data;
+  testcg_data.count_pass = testcg_data.count_fail = 0;
+
+  bool test_diagonal_exaggeration = false;
+  bool test_noprecond = false;
+
+#ifdef HPCG_DEBUG
+  t1 = mytimer();
+  if (rank==0) HPCG_fout << endl << "Running Uniform-precision Test" << endl;
+#endif
+  TestGMRES(A, data, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond);
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total validation (uniform-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+
+  setup_time = mytimer();
+  init_vect = false;
+  SparseMatrix_type2 A2;
+  CGData_type2 data2;
+  SetupProblem(numberOfMgLevels, A2, geom, data2, &b, &x, &xexact, init_vect);
+  setup_time = mytimer() - setup_time; // Capture total time of setup
+
+  t7 = mytimer();
+  OptimizeProblem(A2, data2, b, x, xexact);
+  t7 = mytimer() - t7;
+
+  testcg_data.count_pass = testcg_data.count_fail = 0;
+  if (A.geom->rank==0) {
+    HPCG_fout << " Setup Time " << setup_time << " seconds." << endl;
+    HPCG_fout << " Optimize Time " << t7 << " seconds." << endl;
+  }
+
+
+#ifdef HPCG_DEBUG
+  t1 = mytimer();
+#endif
+  TestGMRES(A, A2, data, data2, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond);
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total validation (mixed-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+
+  // free
+  //DeleteMatrix(A2);
+  //DeleteMatrix(A);
+  DeleteVector(x);
+  DeleteVector(b);
+  DeleteVector(xexact);
+  DeleteVector(x_overlap);
+  DeleteVector(b_computed);
+  DeleteCGData(data);
+  DeleteCGData(data2);
+
+  // Finish up
+  HPCG_Finalize();
+#ifndef HPCG_NO_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+}