diff --git a/Makefile b/Makefile index 2e472be2..29308854 100644 --- a/Makefile +++ b/Makefile @@ -11,13 +11,19 @@ HPCG_DEPS = src/ComputeResidual.o \ src/CheckProblem.o \ src/OptimizeProblem.o src/ReadHpcgDat.o src/ReportResults.o \ src/SetupHalo.o src/SetupHalo_ref.o src/TestSymmetry.o src/TestNorms.o src/WriteProblem.o \ - src/YAML_Doc.o src/YAML_Element.o src/ComputeDotProduct.o \ - src/ComputeDotProduct_ref.o src/finalize.o src/init.o src/mytimer.o src/ComputeSPMV.o \ - src/ComputeSPMV_ref.o src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \ - src/ComputeMG_ref.o src/ComputeMG.o src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o \ + src/YAML_Doc.o src/YAML_Element.o \ + src/ComputeDotProduct.o src/ComputeDotProduct_ref.o \ + src/finalize.o src/init.o src/mytimer.o \ + src/ComputeSPMV.o src/ComputeSPMV_ref.o \ + src/ComputeSYMGS.o src/ComputeSYMGS_ref.o \ + src/ComputeWAXPBY.o src/ComputeWAXPBY_ref.o \ + src/ComputeMG_ref.o src/ComputeMG.o \ + src/ComputeProlongation_ref.o src/ComputeRestriction_ref.o \ src/ComputeOptimalShapeXYZ.o src/MixedBaseCounter.o src/CheckAspectRatio.o src/OutputFile.o \ \ - src/TestGMRES.o src/ComputeTRSM.o src/ComputeGEMV.o \ + src/TestGMRES.o src/ComputeTRSM.o src/ComputeGEMV.o src/ComputeGEMVT.o \ + src/ComputeGEMV.o src/ComputeGEMV_ref.o \ + src/ComputeGEMVT.o src/ComputeGEMVT_ref.o \ src/GMRES.o src/GMRES_IR.o \ src/ComputeGS_Forward.o src/ComputeGS_Forward_ref.o \ src/SetupProblem.o \ @@ -27,9 +33,13 @@ HPCG_DEPS = src/ComputeResidual.o \ bin/xhpgmp: src/main_hpgmp.o $(HPCG_DEPS) $(LINKER) $(LINKFLAGS) src/main_hpgmp.o $(HPCG_DEPS) -o bin/xhpgmp $(HPCG_LIBS) +bin/xhpgmp_time: src/main_time.o $(HPCG_DEPS) + $(LINKER) $(LINKFLAGS) src/main_time.o $(HPCG_DEPS) -o bin/xhpgmp_time $(HPCG_LIBS) + clean: rm -f $(HPCG_DEPS) \ - bin/xhpgmp src/main_hpgmp.o + bin/xhpgmp src/main_hpgmp.o \ + bin/xhpgmp_time src/main_time.o .PHONY: clean diff --git a/Makefile.ext b/Makefile.ext index 3cee767d..7c3b5a04 100644 --- a/Makefile.ext +++ 
b/Makefile.ext @@ -27,6 +27,8 @@ HPCG_DEPS = src/ComputeResidual.o \ src/ComputeOptimalShapeXYZ.o \ src/ComputeSPMV.o \ src/ComputeSPMV_ref.o \ + src/ComputeSYMGS.o \ + src/ComputeSYMGS_ref.o \ src/ComputeWAXPBY.o \ src/ComputeWAXPBY_ref.o \ src/ComputeMG_ref.o \ @@ -45,28 +47,37 @@ HPCG_DEPS = src/ComputeResidual.o \ src/ComputeGS_Forward_ref.o \ src/ComputeTRSM.o \ src/ComputeGEMV.o \ - src/SetupProblem.o \ + src/ComputeGEMV_ref.o \ + src/ComputeGEMVT.o \ + src/ComputeGEMVT_ref.o \ + src/SetupProblem.o \ src/GenerateNonsymProblem.o \ src/GenerateNonsymProblem_v1_ref.o \ src/GenerateNonsymCoarseProblem.o \ # These header files are included in many source files, so we recompile every file if one or more of these header is modified. -PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/CGData.hpp \ - HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/hpcg.hpp +PRIMARY_HEADERS = HPCG_SRC_PATH/src/Geometry.hpp HPCG_SRC_PATH/src/SparseMatrix.hpp HPCG_SRC_PATH/src/Vector.hpp HPCG_SRC_PATH/src/MultiVector.hpp \ + HPCG_SRC_PATH/src/CGData.hpp HPCG_SRC_PATH/src/MGData.hpp HPCG_SRC_PATH/src/Hpgmp_Params.hpp -all: bin/xhpgmp +all: bin/xhpgmp bin/xhpgmp_time bin/xhpgmp: src/main_hpgmp.o $(HPCG_DEPS) $(LINKER) $(LINKFLAGS) src/main_hpgmp.o $(HPCG_DEPS) $(HPCG_LIBS) -o bin/xhpgmp +bin/xhpgmp_time: src/main_time.o $(HPCG_DEPS) + $(LINKER) $(LINKFLAGS) src/main_time.o $(HPCG_DEPS) $(HPCG_LIBS) -o bin/xhpgmp_time + clean: - rm -f src/*.o bin/xhpgmp + rm -f src/*.o bin/xhpgmp bin/xhpgmp_time .PHONY: all clean src/main_hpgmp.o: HPCG_SRC_PATH/src/main_hpgmp.cpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/main_time.o: HPCG_SRC_PATH/src/main_time.cpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/ComputeResidual.o: HPCG_SRC_PATH/src/ComputeResidual.cpp HPCG_SRC_PATH/src/ComputeResidual.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< 
-o $@ @@ -139,6 +150,9 @@ src/ComputeSPMV_ref.o: HPCG_SRC_PATH/src/ComputeSPMV_ref.cpp HPCG_SRC_PATH/src/C src/ComputeSYMGS.o: HPCG_SRC_PATH/src/ComputeSYMGS.cpp HPCG_SRC_PATH/src/ComputeSYMGS.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/ComputeSYMGS_ref.o: HPCG_SRC_PATH/src/ComputeSYMGS_ref.cpp HPCG_SRC_PATH/src/ComputeSYMGS_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/ComputeWAXPBY.o: HPCG_SRC_PATH/src/ComputeWAXPBY.cpp HPCG_SRC_PATH/src/ComputeWAXPBY.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ @@ -185,6 +199,15 @@ src/ComputeTRSM.o: HPCG_SRC_PATH/src/ComputeTRSM.cpp HPCG_SRC_PATH/src/ComputeTR src/ComputeGEMV.o: HPCG_SRC_PATH/src/ComputeGEMV.cpp HPCG_SRC_PATH/src/ComputeGEMV.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ +src/ComputeGEMV_ref.o: HPCG_SRC_PATH/src/ComputeGEMV_ref.cpp HPCG_SRC_PATH/src/ComputeGEMV_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + +src/ComputeGEMVT.o: HPCG_SRC_PATH/src/ComputeGEMVT.cpp HPCG_SRC_PATH/src/ComputeGEMVT.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + +src/ComputeGEMVT_ref.o: HPCG_SRC_PATH/src/ComputeGEMVT_ref.cpp HPCG_SRC_PATH/src/ComputeGEMVT_ref.hpp $(PRIMARY_HEADERS) + $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ + src/SetupProblem.o: HPCG_SRC_PATH/src/SetupProblem.cpp HPCG_SRC_PATH/src/SetupProblem.hpp $(PRIMARY_HEADERS) $(CXX) -c $(CXXFLAGS) -IHPCG_SRC_PATH/src $< -o $@ diff --git a/QUICKSTART b/QUICKSTART index 466ffff1..f65db16f 100644 --- a/QUICKSTART +++ b/QUICKSTART @@ -37,6 +37,11 @@ NOTE: The instructions in this file assume you are working with a version export OMP_NUM_THREADS 4 mpiexec -np 64 ./xhpcg +5) To set parameters, hpcg.dat ... First two lines ignored. + Line 3: nx ny nz + Line 4: Time to run the bechmark (seconds) + Line 5: + 5) The benchmark has completed execution. 
This should take a few minutes when running in evaluation mode, and take about 30 minutes in official benchmark mode. If you are running on a production system, you may be able diff --git a/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt b/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt new file mode 100644 index 00000000..2c10174f --- /dev/null +++ b/bin/HPGMP-Benchmark_1.1_2022-03-07_16-04-57.txt @@ -0,0 +1,112 @@ +HPGMP-Benchmark +version=1.1 +Release date=March 28, 2019 +Machine Summary= +Machine Summary::Distributed Processes=1 +Machine Summary::Threads per processes=1 +Global Problem Dimensions= +Global Problem Dimensions::Global nx=16 +Global Problem Dimensions::Global ny=16 +Global Problem Dimensions::Global nz=16 +Processor Dimensions= +Processor Dimensions::npx=1 +Processor Dimensions::npy=1 +Processor Dimensions::npz=1 +Local Domain Dimensions= +Local Domain Dimensions::nx=16 +Local Domain Dimensions::ny=16 +Local Domain Dimensions::Lower ipz=0 +Local Domain Dimensions::Upper ipz=0 +Local Domain Dimensions::nz=16 +########## Problem Summary ##########= +Setup Information= +Setup Information::Setup Time=0.002537 +Linear System Information= +Linear System Information::Number of Equations=4096 +Linear System Information::Number of Nonzero Terms=97336 +Multigrid Information= +Multigrid Information::Number of coarse grid levels=3 +Multigrid Information::Coarse Grids= +Multigrid Information::Coarse Grids::Grid Level=1 +Multigrid Information::Coarse Grids::Number of Equations=512 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=10648 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse Grids::Number of Postsmoother Steps=1 +Multigrid Information::Coarse Grids::Grid Level=2 +Multigrid Information::Coarse Grids::Number of Equations=64 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=1000 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse 
Grids::Number of Postsmoother Steps=1 +Multigrid Information::Coarse Grids::Grid Level=3 +Multigrid Information::Coarse Grids::Number of Equations=8 +Multigrid Information::Coarse Grids::Number of Nonzero Terms=64 +Multigrid Information::Coarse Grids::Number of Presmoother Steps=1 +Multigrid Information::Coarse Grids::Number of Postsmoother Steps=1 +########## Memory Use Summary ##########= +Memory Use Information= +Memory Use Information::Total memory used for data (Gbytes)=0.00292882 +Memory Use Information::Memory used for OptimizeProblem data (Gbytes)=0 +Memory Use Information::Bytes per equation (Total memory / Number of Equations)=715.045 +Memory Use Information::Memory used for linear system and CG (Gbytes)=0.00257652 +Memory Use Information::Coarse Grids= +Memory Use Information::Coarse Grids::Grid Level=1 +Memory Use Information::Coarse Grids::Memory used=0.000308152 +Memory Use Information::Coarse Grids::Grid Level=2 +Memory Use Information::Coarse Grids::Memory used=3.8904e-05 +Memory Use Information::Coarse Grids::Grid Level=3 +Memory Use Information::Coarse Grids::Memory used=5.248e-06 +########## V&V Testing Summary ##########= +Spectral Convergence Tests= +Spectral Convergence Tests::Result=FAILED +Spectral Convergence Tests::Unpreconditioned= +Spectral Convergence Tests::Unpreconditioned::Maximum iteration count=21 +Spectral Convergence Tests::Unpreconditioned::Expected iteration count=12 +Spectral Convergence Tests::Preconditioned= +Spectral Convergence Tests::Preconditioned::Maximum iteration count=3 +Spectral Convergence Tests::Preconditioned::Expected iteration count=2 +########## Iterations Summary ##########= +Iteration Count Information= +Iteration Count Information::Result=PASSED +Iteration Count Information::Reference CG iterations per set=50 +Iteration Count Information::Optimized CG iterations per set=500 +Iteration Count Information::Total number of reference iterations=50 +Iteration Count Information::Total number of optimized 
iterations=500 +########## Reproducibility Summary ##########= +Reproducibility Information= +Reproducibility Information::Result=FAILED +Reproducibility Information::Scaled residual mean=2.122e-314 +Reproducibility Information::Scaled residual variance=2.122e-314 +########## Performance Summary (times in sec) ##########= +Benchmark Time Summary= +Benchmark Time Summary::Optimization phase=0 +Benchmark Time Summary::DDOT=0 +Benchmark Time Summary::WAXPBY=0 +Benchmark Time Summary::SpMV=0 +Benchmark Time Summary::MG=0 +Benchmark Time Summary::Total=0 +Floating Point Operations Summary= +Floating Point Operations Summary::Raw DDOT=1.22962e+07 +Floating Point Operations Summary::Raw WAXPBY=1.22962e+07 +Floating Point Operations Summary::Raw SpMV=9.75307e+07 +Floating Point Operations Summary::Raw MG=5.45048e+08 +Floating Point Operations Summary::Total=6.67171e+08 +Floating Point Operations Summary::Total with convergence overhead=6.67171e+07 +GB/s Summary= +GB/s Summary::Raw Read B/W=inf +GB/s Summary::Raw Write B/W=inf +GB/s Summary::Raw Total B/W=inf +GB/s Summary::Total with convergence and optimization phase overhead=2002.73 +GFLOP/s Summary= +GFLOP/s Summary::Raw DDOT=inf +GFLOP/s Summary::Raw WAXPBY=inf +GFLOP/s Summary::Raw SpMV=inf +GFLOP/s Summary::Raw MG=inf +GFLOP/s Summary::Raw Total=inf +GFLOP/s Summary::Total with convergence overhead=inf +GFLOP/s Summary::Total with convergence and optimization phase overhead=262.976 +User Optimization Overheads= +User Optimization Overheads::Optimization phase time (sec)=0 +User Optimization Overheads::Optimization phase time vs reference SpMV+MG time=nan +Final Summary= +Final Summary::HPCG result is=INVALID. +Final Summary::Please review the YAML file contents=You may NOT submit these results for consideration. 
diff --git a/bin/hpcg.dat b/bin/hpcg.dat deleted file mode 100644 index 0e4508e1..00000000 --- a/bin/hpcg.dat +++ /dev/null @@ -1,4 +0,0 @@ -HPCG benchmark input file -Sandia National Laboratories; University of Tennessee, Knoxville -104 104 104 -60 diff --git a/bin/hpgmp20220307T160457.txt b/bin/hpgmp20220307T160457.txt new file mode 100644 index 00000000..dbc9c791 --- /dev/null +++ b/bin/hpgmp20220307T160457.txt @@ -0,0 +1,287 @@ + Setup Time 0.002537 seconds. + Optimize Time 0 seconds. + + Running GMRES(50) with max-iters = 50 and tol = 0 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 368.706, 1 +GMRES Iteration = 1 (0) Scaled Residual = 80.0312 / 368.706 = 0.21706 +GMRES Iteration = 2 (1) Scaled Residual = 39.0951 / 368.706 = 0.106033 +GMRES Iteration = 3 (2) Scaled Residual = 24.0276 / 368.706 = 0.0651674 +GMRES Iteration = 4 (3) Scaled Residual = 15.5707 / 368.706 = 0.0422307 +GMRES Iteration = 5 (4) Scaled Residual = 8.56239 / 368.706 = 0.0232228 +GMRES Iteration = 6 (5) Scaled Residual = 3.9145 / 368.706 = 0.0106169 +GMRES Iteration = 7 (6) Scaled Residual = 1.88736 / 368.706 = 0.00511888 +GMRES Iteration = 8 (7) Scaled Residual = 1.00341 / 368.706 = 0.00272143 +GMRES Iteration = 9 (8) Scaled Residual = 0.526511 / 368.706 = 0.001428 +GMRES Iteration = 10 (9) Scaled Residual = 0.181835 / 368.706 = 0.00049317 +GMRES Iteration = 11 (10) Scaled Residual = 0.0469161 / 368.706 = 0.000127245 +GMRES Iteration = 12 (11) Scaled Residual = 0.0142933 / 368.706 = 3.87662e-05 +GMRES Iteration = 13 (12) Scaled Residual = 0.0029118 / 368.706 = 7.89736e-06 +GMRES Iteration = 14 (13) Scaled Residual = 0.000913253 / 368.706 = 2.47691e-06 +GMRES Iteration = 15 (14) Scaled Residual = 0.000297245 / 368.706 = 8.06186e-07 +GMRES Iteration = 16 (15) Scaled Residual = 6.74022e-05 / 368.706 = 1.82807e-07 +GMRES Iteration = 17 (16) Scaled Residual = 1.14551e-05 / 368.706 = 3.10683e-08 +GMRES Iteration = 18 (17) Scaled Residual = 
2.53274e-06 / 368.706 = 6.86927e-09 +GMRES Iteration = 19 (18) Scaled Residual = 1.26028e-06 / 368.706 = 3.41811e-09 +GMRES Iteration = 20 (19) Scaled Residual = 3.7527e-07 / 368.706 = 1.0178e-09 +GMRES Iteration = 21 (20) Scaled Residual = 7.73786e-08 / 368.706 = 2.09866e-10 +GMRES Iteration = 22 (21) Scaled Residual = 2.13376e-08 / 368.706 = 5.78715e-11 +GMRES Iteration = 23 (22) Scaled Residual = 4.23387e-09 / 368.706 = 1.14831e-11 +GMRES Iteration = 24 (23) Scaled Residual = 9.2695e-10 / 368.706 = 2.51406e-12 +GMRES Iteration = 25 (24) Scaled Residual = 1.93412e-10 / 368.706 = 5.24569e-13 +GMRES Iteration = 26 (25) Scaled Residual = 2.89247e-11 / 368.706 = 7.84491e-14 +GMRES Iteration = 27 (26) Scaled Residual = 9.95438e-12 / 368.706 = 2.69982e-14 +GMRES Iteration = 28 (27) Scaled Residual = 2.41354e-12 / 368.706 = 6.54599e-15 +GMRES Iteration = 29 (28) Scaled Residual = 1.11778e-12 / 368.706 = 3.03162e-15 +GMRES Iteration = 30 (29) Scaled Residual = 4.0636e-13 / 368.706 = 1.10212e-15 +GMRES Iteration = 31 (30) Scaled Residual = 1.10214e-13 / 368.706 = 2.98921e-16 +GMRES Iteration = 32 (31) Scaled Residual = 4.95379e-14 / 368.706 = 1.34356e-16 +GMRES Iteration = 33 (32) Scaled Residual = 1.03209e-14 / 368.706 = 2.79921e-17 +GMRES Iteration = 34 (33) Scaled Residual = 2.6522e-15 / 368.706 = 7.19326e-18 +GMRES Iteration = 35 (34) Scaled Residual = 5.91396e-16 / 368.706 = 1.60398e-18 +GMRES Iteration = 36 (35) Scaled Residual = 6.94601e-17 / 368.706 = 1.88389e-19 +GMRES Iteration = 37 (36) Scaled Residual = 8.83338e-18 / 368.706 = 2.39578e-20 +GMRES Iteration = 38 (37) Scaled Residual = 1.02085e-18 / 368.706 = 2.76873e-21 +GMRES Iteration = 39 (38) Scaled Residual = 1.93548e-19 / 368.706 = 5.24938e-22 +GMRES Iteration = 40 (39) Scaled Residual = 7.02131e-20 / 368.706 = 1.90431e-22 +GMRES Iteration = 41 (40) Scaled Residual = 1.56706e-20 / 368.706 = 4.25017e-23 +GMRES Iteration = 42 (41) Scaled Residual = 2.64481e-21 / 368.706 = 7.17322e-24 +GMRES Iteration = 43 
(42) Scaled Residual = 4.96943e-22 / 368.706 = 1.3478e-24 +GMRES Iteration = 44 (43) Scaled Residual = 1.01574e-22 / 368.706 = 2.75487e-25 +GMRES Iteration = 45 (44) Scaled Residual = 3.17017e-23 / 368.706 = 8.5981e-26 +GMRES Iteration = 46 (45) Scaled Residual = 4.14824e-24 / 368.706 = 1.12508e-26 +GMRES Iteration = 47 (46) Scaled Residual = 1.0295e-24 / 368.706 = 2.79219e-27 +GMRES Iteration = 48 (47) Scaled Residual = 1.77179e-25 / 368.706 = 4.80544e-28 +GMRES Iteration = 49 (48) Scaled Residual = 3.76859e-26 / 368.706 = 1.02211e-28 +GMRES Iteration = 50 (49) Scaled Residual = 6.42962e-27 / 368.706 = 1.74383e-29 +GMRES restart: k = 51 (50) +GMRES Residual at the start of restart cycle = 4.01002e-13, 1.08759e-15 +GMRES Iteration = 1 (50) Scaled Residual = 1.69505e-13 / 368.706 = 4.59729e-16 +GMRES Iteration = 2 (51) Scaled Residual = 1.02679e-13 / 368.706 = 2.78485e-16 +GMRES Iteration = 3 (52) Scaled Residual = 4.90765e-14 / 368.706 = 1.33105e-16 +GMRES Iteration = 4 (53) Scaled Residual = 2.11347e-14 / 368.706 = 5.73214e-17 +GMRES Iteration = 5 (54) Scaled Residual = 8.10829e-15 / 368.706 = 2.19912e-17 +GMRES Iteration = 6 (55) Scaled Residual = 3.07713e-15 / 368.706 = 8.34575e-18 +GMRES Iteration = 7 (56) Scaled Residual = 1.41193e-15 / 368.706 = 3.82943e-18 +GMRES Iteration = 8 (57) Scaled Residual = 6.96752e-16 / 368.706 = 1.88972e-18 +GMRES Iteration = 9 (58) Scaled Residual = 3.4625e-16 / 368.706 = 9.39097e-19 +GMRES Iteration = 10 (59) Scaled Residual = 9.42334e-17 / 368.706 = 2.55579e-19 +GMRES Iteration = 11 (60) Scaled Residual = 3.61901e-17 / 368.706 = 9.81543e-20 +GMRES Iteration = 12 (61) Scaled Residual = 9.13127e-18 / 368.706 = 2.47657e-20 +GMRES Iteration = 13 (62) Scaled Residual = 2.59868e-18 / 368.706 = 7.04812e-21 +GMRES Iteration = 14 (63) Scaled Residual = 8.61639e-19 / 368.706 = 2.33693e-21 +GMRES Iteration = 15 (64) Scaled Residual = 1.84923e-19 / 368.706 = 5.01546e-22 +GMRES Iteration = 16 (65) Scaled Residual = 3.68866e-20 / 368.706 = 
1.00043e-22 +GMRES Iteration = 17 (66) Scaled Residual = 6.63075e-21 / 368.706 = 1.79839e-23 +GMRES Iteration = 18 (67) Scaled Residual = 1.22725e-21 / 368.706 = 3.32854e-24 +GMRES Iteration = 19 (68) Scaled Residual = 5.79654e-22 / 368.706 = 1.57213e-24 +GMRES Iteration = 20 (69) Scaled Residual = 2.38304e-22 / 368.706 = 6.46324e-25 +GMRES Iteration = 21 (70) Scaled Residual = 4.59033e-23 / 368.706 = 1.24499e-25 +GMRES Iteration = 22 (71) Scaled Residual = 1.1072e-23 / 368.706 = 3.00293e-26 +GMRES Iteration = 23 (72) Scaled Residual = 3.17896e-24 / 368.706 = 8.62193e-27 +GMRES Iteration = 24 (73) Scaled Residual = 6.01624e-25 / 368.706 = 1.63172e-27 +GMRES Iteration = 25 (74) Scaled Residual = 1.27563e-25 / 368.706 = 3.45974e-28 +GMRES Iteration = 26 (75) Scaled Residual = 3.54711e-26 / 368.706 = 9.62044e-29 +GMRES Iteration = 27 (76) Scaled Residual = 1.0024e-26 / 368.706 = 2.71871e-29 +GMRES Iteration = 28 (77) Scaled Residual = 1.95945e-27 / 368.706 = 5.3144e-30 +GMRES Iteration = 29 (78) Scaled Residual = 6.72153e-28 / 368.706 = 1.82301e-30 +GMRES Iteration = 30 (79) Scaled Residual = 1.96036e-28 / 368.706 = 5.31687e-31 +GMRES Iteration = 31 (80) Scaled Residual = 4.87233e-29 / 368.706 = 1.32147e-31 +GMRES Iteration = 32 (81) Scaled Residual = 2.54153e-29 / 368.706 = 6.89312e-32 +GMRES Iteration = 33 (82) Scaled Residual = 6.76297e-30 / 368.706 = 1.83424e-32 +GMRES Iteration = 34 (83) Scaled Residual = 1.25642e-30 / 368.706 = 3.40766e-33 +GMRES Iteration = 35 (84) Scaled Residual = 3.06865e-31 / 368.706 = 8.32275e-34 +GMRES Iteration = 36 (85) Scaled Residual = 3.41268e-32 / 368.706 = 9.25582e-35 +GMRES Iteration = 37 (86) Scaled Residual = 4.67277e-33 / 368.706 = 1.26734e-35 +GMRES Iteration = 38 (87) Scaled Residual = 7.10258e-34 / 368.706 = 1.92635e-36 +GMRES Iteration = 39 (88) Scaled Residual = 2.18752e-34 / 368.706 = 5.93297e-37 +GMRES Iteration = 40 (89) Scaled Residual = 6.13372e-35 / 368.706 = 1.66358e-37 +GMRES Iteration = 41 (90) Scaled Residual = 
9.78421e-36 / 368.706 = 2.65366e-38 +GMRES Iteration = 42 (91) Scaled Residual = 2.1363e-36 / 368.706 = 5.79405e-39 +GMRES Iteration = 43 (92) Scaled Residual = 4.43795e-37 / 368.706 = 1.20366e-39 +GMRES Iteration = 44 (93) Scaled Residual = 1.31239e-37 / 368.706 = 3.55946e-40 +GMRES Iteration = 45 (94) Scaled Residual = 2.68152e-38 / 368.706 = 7.2728e-41 +GMRES Iteration = 46 (95) Scaled Residual = 2.66743e-39 / 368.706 = 7.23457e-42 +GMRES Iteration = 47 (96) Scaled Residual = 6.82831e-40 / 368.706 = 1.85197e-42 +GMRES Iteration = 48 (97) Scaled Residual = 1.30438e-40 / 368.706 = 3.53773e-43 +GMRES Iteration = 49 (98) Scaled Residual = 3.0921e-41 / 368.706 = 8.38636e-44 +GMRES Iteration = 50 (99) Scaled Residual = 3.49219e-42 / 368.706 = 9.47148e-45 +GMRES restart: k = 51 (100) + Setup Time 0.08111 seconds. + Optimize Time 0 seconds. + + ** applying diagonal exaggeration ** + +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 3.26848e+08 / 4.69816e+08 = 0.695694 +GMRES Iteration = 2 (1) Scaled Residual = 2.07945e+08 / 4.69816e+08 = 0.44261 +GMRES Iteration = 3 (2) Scaled Residual = 7.8398e+07 / 4.69816e+08 = 0.16687 +GMRES Iteration = 4 (3) Scaled Residual = 2.80854e+07 / 4.69816e+08 = 0.0597796 +GMRES Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238609 +GMRES Iteration = 6 (5) Scaled Residual = 4.66992e+06 / 4.69816e+08 = 0.00993989 +GMRES Iteration = 7 (6) Scaled Residual = 1.84189e+06 / 4.69816e+08 = 0.00392045 +GMRES Iteration = 8 (7) Scaled Residual = 630885 / 4.69816e+08 = 0.00134283 +GMRES Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES Iteration = 10 (9) Scaled Residual = 12.3098 / 4.69816e+08 = 2.62013e-08 +GMRES Iteration = 11 (10) Scaled Residual = 4.18333e-06 / 4.69816e+08 = 
8.90418e-15 +GMRES restart: k = 12 (11) +GMRES Residual at the start of restart cycle = 4.28567e-06, 9.12202e-15 + > GMRES converged +GMRES restart: k = 1 (11) +Calling GMRES (all double) for testing: +Call [0] Number of GMRES Iterations [11] Scaled Residual [9.12202e-15] + Expected 12 iterations. Performed 11. + Time 0.002079 seconds. + Gflop/s 0.00334584/0.002079 = 1.60935 (n = 4096) +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 3.26848e+08 / 4.69816e+08 = 0.695694 +GMRES Iteration = 2 (1) Scaled Residual = 2.07945e+08 / 4.69816e+08 = 0.44261 +GMRES Iteration = 3 (2) Scaled Residual = 7.8398e+07 / 4.69816e+08 = 0.16687 +GMRES Iteration = 4 (3) Scaled Residual = 2.80854e+07 / 4.69816e+08 = 0.0597796 +GMRES Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238609 +GMRES Iteration = 6 (5) Scaled Residual = 4.66992e+06 / 4.69816e+08 = 0.00993989 +GMRES Iteration = 7 (6) Scaled Residual = 1.84189e+06 / 4.69816e+08 = 0.00392045 +GMRES Iteration = 8 (7) Scaled Residual = 630885 / 4.69816e+08 = 0.00134283 +GMRES Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES Iteration = 10 (9) Scaled Residual = 12.3098 / 4.69816e+08 = 2.62013e-08 +GMRES Iteration = 11 (10) Scaled Residual = 4.18333e-06 / 4.69816e+08 = 8.90418e-15 +GMRES restart: k = 12 (11) +GMRES Residual at the start of restart cycle = 4.28567e-06, 9.12202e-15 + > GMRES converged +GMRES restart: k = 1 (11) +Calling GMRES (all double) for testing: +Call [1] Number of GMRES Iterations [11] Scaled Residual [9.12202e-15] + Expected 12 iterations. Performed 11. + Time 0.002014 seconds. 
+ Gflop/s 0.00334584/0.002014 = 1.66129 (n = 4096) + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 12.7853 / 4.69816e+08 = 2.72134e-08 +GMRES Iteration = 2 (1) Scaled Residual = 7.74604e-07 / 4.69816e+08 = 1.64874e-15 +GMRES restart: k = 3 (2) +GMRES Residual at the start of restart cycle = 7.86762e-07, 1.67462e-15 + > GMRES converged +GMRES restart: k = 1 (2) +Calling GMRES (all double) for testing: +Call [0] Number of GMRES Iterations [2] Scaled Residual [1.67462e-15] + Expected 2 iterations. Performed 2. + Time 0.001602 seconds. + Gflop/s 0.00256032/0.001602 = 1.5982 (n = 4096) + + Running GMRES(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 on ( 1 x 1 x 1 ) MPI grid +GMRES Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES Iteration = 1 (0) Scaled Residual = 12.7853 / 4.69816e+08 = 2.72134e-08 +GMRES Iteration = 2 (1) Scaled Residual = 7.74604e-07 / 4.69816e+08 = 1.64874e-15 +GMRES restart: k = 3 (2) +GMRES Residual at the start of restart cycle = 7.86762e-07, 1.67462e-15 + > GMRES converged +GMRES restart: k = 1 (2) +Calling GMRES (all double) for testing: +Call [1] Number of GMRES Iterations [2] Scaled Residual [1.67462e-15] + Expected 2 iterations. Performed 2. + Time 0.001724 seconds. 
+ Gflop/s 0.00256032/0.001724 = 1.4851 (n = 4096) +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 3.26845e+08 / 4.69816e+08 = 0.695688 +GMRES_IR Iteration = 2 (1) Scaled Residual = 2.07943e+08 / 4.69816e+08 = 0.442606 +GMRES_IR Iteration = 3 (2) Scaled Residual = 7.83975e+07 / 4.69816e+08 = 0.166868 +GMRES_IR Iteration = 4 (3) Scaled Residual = 2.80855e+07 / 4.69816e+08 = 0.0597798 +GMRES_IR Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238608 +GMRES_IR Iteration = 6 (5) Scaled Residual = 4.66999e+06 / 4.69816e+08 = 0.00994004 +GMRES_IR Iteration = 7 (6) Scaled Residual = 1.84191e+06 / 4.69816e+08 = 0.0039205 +GMRES_IR Iteration = 8 (7) Scaled Residual = 630883 / 4.69816e+08 = 0.00134283 +GMRES_IR Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES_IR Iteration = 10 (9) Scaled Residual = 12.5592 / 4.69816e+08 = 2.67322e-08 +GMRES_IR Iteration = 11 (10) Scaled Residual = 4.3181e-06 / 4.69816e+08 = 9.19105e-15 +GMRES_IR restart: k = 12 (11) +GMRES_IR Residual at the start of restart cycle = 183.723, 3.91054e-07 +GMRES_IR Iteration = 1 (11) Scaled Residual = 66.9452 / 4.69816e+08 = 1.42492e-07 +GMRES_IR Iteration = 2 (12) Scaled Residual = 47.8823 / 4.69816e+08 = 1.01917e-07 +GMRES_IR Iteration = 3 (13) Scaled Residual = 21.8527 / 4.69816e+08 = 4.65133e-08 +GMRES_IR Iteration = 4 (14) Scaled Residual = 8.31923 / 4.69816e+08 = 1.77074e-08 +GMRES_IR Iteration = 5 (15) Scaled Residual = 3.51961 / 4.69816e+08 = 7.49146e-09 +GMRES_IR Iteration = 6 (16) Scaled Residual = 1.49216 / 4.69816e+08 = 3.17605e-09 +GMRES_IR Iteration = 7 (17) Scaled Residual = 0.398246 / 4.69816e+08 = 8.47663e-10 +GMRES_IR Iteration = 8 (18) Scaled Residual = 0.127894 / 4.69816e+08 = 2.72222e-10 +GMRES_IR Iteration = 9 (19) Scaled Residual = 
0.0255666 / 4.69816e+08 = 5.44183e-11 +GMRES_IR Iteration = 10 (20) Scaled Residual = 2.47246e-06 / 4.69816e+08 = 5.26262e-15 +GMRES_IR restart: k = 11 (21) +GMRES_IR Residual at the start of restart cycle = 2.2783e-05, 4.84934e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (21) +Call [0] Number of GMRES-IR Iterations [21] Scaled Residual [4.84934e-14] + Expected 12 iterations. Performed 21. + Time 0.005603 seconds. +WARNING: PERFORMING UNPRECONDITIONED ITERATIONS + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 without precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 3.26845e+08 / 4.69816e+08 = 0.695688 +GMRES_IR Iteration = 2 (1) Scaled Residual = 2.07943e+08 / 4.69816e+08 = 0.442606 +GMRES_IR Iteration = 3 (2) Scaled Residual = 7.83975e+07 / 4.69816e+08 = 0.166868 +GMRES_IR Iteration = 4 (3) Scaled Residual = 2.80855e+07 / 4.69816e+08 = 0.0597798 +GMRES_IR Iteration = 5 (4) Scaled Residual = 1.12102e+07 / 4.69816e+08 = 0.0238608 +GMRES_IR Iteration = 6 (5) Scaled Residual = 4.66999e+06 / 4.69816e+08 = 0.00994004 +GMRES_IR Iteration = 7 (6) Scaled Residual = 1.84191e+06 / 4.69816e+08 = 0.0039205 +GMRES_IR Iteration = 8 (7) Scaled Residual = 630883 / 4.69816e+08 = 0.00134283 +GMRES_IR Iteration = 9 (8) Scaled Residual = 160230 / 4.69816e+08 = 0.000341048 +GMRES_IR Iteration = 10 (9) Scaled Residual = 12.5592 / 4.69816e+08 = 2.67322e-08 +GMRES_IR Iteration = 11 (10) Scaled Residual = 4.3181e-06 / 4.69816e+08 = 9.19105e-15 +GMRES_IR restart: k = 12 (11) +GMRES_IR Residual at the start of restart cycle = 183.723, 3.91054e-07 +GMRES_IR Iteration = 1 (11) Scaled Residual = 66.9452 / 4.69816e+08 = 1.42492e-07 +GMRES_IR Iteration = 2 (12) Scaled Residual = 47.8823 / 4.69816e+08 = 1.01917e-07 +GMRES_IR Iteration = 3 (13) Scaled Residual = 21.8527 / 4.69816e+08 = 4.65133e-08 +GMRES_IR Iteration = 4 (14) Scaled Residual = 8.31923 / 4.69816e+08 = 1.77074e-08 +GMRES_IR 
Iteration = 5 (15) Scaled Residual = 3.51961 / 4.69816e+08 = 7.49146e-09 +GMRES_IR Iteration = 6 (16) Scaled Residual = 1.49216 / 4.69816e+08 = 3.17605e-09 +GMRES_IR Iteration = 7 (17) Scaled Residual = 0.398246 / 4.69816e+08 = 8.47663e-10 +GMRES_IR Iteration = 8 (18) Scaled Residual = 0.127894 / 4.69816e+08 = 2.72222e-10 +GMRES_IR Iteration = 9 (19) Scaled Residual = 0.0255666 / 4.69816e+08 = 5.44183e-11 +GMRES_IR Iteration = 10 (20) Scaled Residual = 2.47246e-06 / 4.69816e+08 = 5.26262e-15 +GMRES_IR restart: k = 11 (21) +GMRES_IR Residual at the start of restart cycle = 2.2783e-05, 4.84934e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (21) +Call [1] Number of GMRES-IR Iterations [21] Scaled Residual [4.84934e-14] + Expected 12 iterations. Performed 21. + Time 0.005244 seconds. + + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 35.18 / 4.69816e+08 = 7.48803e-08 +GMRES_IR Iteration = 2 (1) Scaled Residual = 0.00035599 / 4.69816e+08 = 7.57722e-13 +GMRES_IR restart: k = 3 (2) +GMRES_IR Residual at the start of restart cycle = 61.156, 1.3017e-07 +GMRES_IR Iteration = 1 (2) Scaled Residual = 4.60599e-06 / 4.69816e+08 = 9.80382e-15 +GMRES_IR restart: k = 2 (3) +GMRES_IR Residual at the start of restart cycle = 6.01636e-06, 1.28058e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (3) +Call [0] Number of GMRES-IR Iterations [3] Scaled Residual [1.28058e-14] + Expected 2 iterations. Performed 3. + Time 0.002948 seconds. 
+ + Running GMRES_IR(30) with max-iters = 5000 and tol = 1e-12 with precond , nrow = 4096 +GMRES_IR Residual at the start of restart cycle = 4.69816e+08, 1 +GMRES_IR Iteration = 1 (0) Scaled Residual = 35.18 / 4.69816e+08 = 7.48803e-08 +GMRES_IR Iteration = 2 (1) Scaled Residual = 0.00035599 / 4.69816e+08 = 7.57722e-13 +GMRES_IR restart: k = 3 (2) +GMRES_IR Residual at the start of restart cycle = 61.156, 1.3017e-07 +GMRES_IR Iteration = 1 (2) Scaled Residual = 4.60599e-06 / 4.69816e+08 = 9.80382e-15 +GMRES_IR restart: k = 2 (3) +GMRES_IR Residual at the start of restart cycle = 6.01636e-06, 1.28058e-14 + > GMRES_IR converged +GMRES_IR restart: k = 1 (3) +Call [1] Number of GMRES-IR Iterations [3] Scaled Residual [1.28058e-14] + Expected 2 iterations. Performed 3. + Time 0.002821 seconds. diff --git a/bin/xhpgmp b/bin/xhpgmp new file mode 100755 index 00000000..6723845f Binary files /dev/null and b/bin/xhpgmp differ diff --git a/src/CGData.hpp b/src/CGData.hpp index 98841c47..aa5357e1 100644 --- a/src/CGData.hpp +++ b/src/CGData.hpp @@ -63,6 +63,7 @@ inline void DeleteCGData(CGData_type & data) { DeleteVector (data.r); DeleteVector (data.z); DeleteVector (data.p); + DeleteVector (data.w); DeleteVector (data.Ap); return; } diff --git a/src/CheckAspectRatio.cpp b/src/CheckAspectRatio.cpp index d625148d..e454e53f 100644 --- a/src/CheckAspectRatio.cpp +++ b/src/CheckAspectRatio.cpp @@ -23,7 +23,7 @@ #include #endif -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "CheckAspectRatio.hpp" diff --git a/src/CheckProblem.cpp b/src/CheckProblem.cpp index 19419a0c..f153d881 100644 --- a/src/CheckProblem.cpp +++ b/src/CheckProblem.cpp @@ -29,7 +29,7 @@ #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include diff --git a/src/ComputeDotProduct_ref.cpp b/src/ComputeDotProduct_ref.cpp index 7daf8a5c..922223de 100644 --- a/src/ComputeDotProduct_ref.cpp +++ 
b/src/ComputeDotProduct_ref.cpp @@ -24,11 +24,23 @@ #include "Utils_MPI.hpp" #endif #ifndef HPCG_NO_OPENMP -#include + #include #endif +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include #include "ComputeDotProduct_ref.hpp" +#ifdef HPCG_DEBUG +#include "Hpgmp_Params.hpp" +#endif + /*! Routine to compute the dot product of two vectors where: @@ -51,33 +63,86 @@ int ComputeDotProduct_ref(const local_int_t n, const Vector_type & x, const Vect assert(y.localLength>=n); typedef typename Vector_type::scalar_type scalar_type; -#ifndef HPCG_NO_MPI - MPI_Datatype MPI_SCALAR_TYPE = MpiTypeTraits::getType (); -#endif - scalar_type local_result (0.0); + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) scalar_type * xv = x.values; scalar_type * yv = y.values; if (yv==xv) { -#ifndef HPCG_NO_OPENMP + #ifndef HPCG_NO_OPENMP #pragma omp parallel for reduction (+:local_result) -#endif + #endif for (local_int_t i=0; i::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDdot (handle, n, (double*)d_x, 1, (double*)d_y, 1, (double*)&local_result)) { + printf( " Failed cublasDdot\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSdot (handle, n, (float*)d_x, 1, (float*)d_y, 1, (float*)&local_result)) { + printf( " Failed cublasSdot\n" ); + } + } + #elif defined(HPCG_WITH_HIP) + // Compute dot on AMD GPU + rocblas_handle handle = x.handle; + #if 1 // TODO remove this + if (hipSuccess != hipMemcpy(d_x, xv, sizeof(scalar_type) * n, hipMemcpyHostToDevice)) { + printf( " Failed hipMemcpy d_x\n" ); + } + if (hipSuccess != hipMemcpy(d_y, yv, sizeof(scalar_type) * n, hipMemcpyHostToDevice)) { + printf( " Failed hipMemcpy d_y\n" ); + } + #endif + if (std::is_same::value) { + if (rocblas_status_success != rocblas_ddot (handle, n, (double*)d_x, 1, (double*)d_y, 1, (double*)&local_result)) { + printf( " Failed rocblas_ddot\n" ); + } + } else if (std::is_same::value) { + if (rocblas_status_success != 
rocblas_sdot (handle, n, (float*)d_x, 1, (float*)d_y, 1, (float*)&local_result)) { + printf( " Failed rocblas_sdot\n" ); + } + } + #endif +#endif #ifndef HPCG_NO_MPI // Use MPI's reduce function to collect all partial sums + MPI_Datatype MPI_SCALAR_TYPE = MpiTypeTraits::getType (); double t0 = mytimer(); scalar_type global_result (0.0); MPI_Allreduce(&local_result, &global_result, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); result = global_result; time_allreduce += mytimer() - t0; + + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + scalar_type global_tmp (0.0); + MPI_Allreduce(&local_tmp, &global_tmp, 1, MPI_SCALAR_TYPE, MPI_SUM, + MPI_COMM_WORLD); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : DotProduct(" << n << "): error = " << global_tmp-global_result << " (dot=" << global_result << ")" << std::endl; + } + #endif #else time_allreduce += 0.0; result = local_result; diff --git a/src/ComputeGEMV.cpp b/src/ComputeGEMV.cpp index 6cfaaca1..b7a190e3 100644 --- a/src/ComputeGEMV.cpp +++ b/src/ComputeGEMV.cpp @@ -19,44 +19,17 @@ */ #include "ComputeGEMV.hpp" +#include "ComputeGEMV_ref.hpp" template int ComputeGEMV(const local_int_t m, const local_int_t n, const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, - const typename Vector_type::scalar_type beta, const Vector_type & y) { + const typename Vector_type::scalar_type beta, const Vector_type & y, + bool & isOptimized) { - typedef typename MultiVector_type::scalar_type scalarA_type; - typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; - typedef typename Vector_type::scalar_type scalarY_type; - - const scalarA_type one (1.0); - const scalarA_type zero (0.0); - - assert(x.m >= n); // Test vector lengths - assert(x.n == 1); - assert(y.localLength >= m); - - const scalarA_type * const Av = A.values; - const scalarX_type * const xv = x.values; - scalarY_type * yv = y.values; - if (beta 
== zero) { - for (local_int_t i = 0; i < m; i++) yv[i] = zero; - } else if (beta != one) { - for (local_int_t i = 0; i < m; i++) yv[i] *= beta; - } - - if (alpha == one) { - for (local_int_t i=0; i, Vector, SerialDenseMatrix > - (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&, bool&); template int ComputeGEMV< MultiVector, Vector, SerialDenseMatrix > - (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&); + (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&, bool&); // mixed template int ComputeGEMV< MultiVector, Vector, SerialDenseMatrix > - (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&, bool&); diff --git a/src/ComputeGEMV.hpp b/src/ComputeGEMV.hpp index 0e972771..393219fc 100644 --- a/src/ComputeGEMV.hpp +++ b/src/ComputeGEMV.hpp @@ -29,6 +29,7 @@ template int ComputeGEMV(const local_int_t m, const local_int_t n, const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, - const typename Vector_type::scalar_type beta, const Vector_type & y); + const typename Vector_type::scalar_type beta, const Vector_type & y, + bool & isOptimized); #endif // COMPUTE_GEMV diff --git a/src/ComputeGEMVT.cpp b/src/ComputeGEMVT.cpp new file mode 100644 index 00000000..6f456545 --- /dev/null +++ b/src/ComputeGEMVT.cpp @@ -0,0 +1,48 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT.cpp + + Routine to compute the GEMV of transpose of a matrix and a vector. + */ +#include "ComputeGEMVT.hpp" +#include "ComputeGEMVT_ref.hpp" + +template +int ComputeGEMVT(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y, + bool & isOptimized) { + + // This line and the next two lines should be removed and your version of ComputeGEMV should be used. + isOptimized = false; + return ComputeGEMVT_ref(m, n, alpha, A, x, beta, y); +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +int ComputeGEMVT< MultiVector, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, Vector const&, double, SerialDenseMatrix const&, bool&); + +template +int ComputeGEMVT< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, Vector const&, float, SerialDenseMatrix const&, bool&); + + diff --git a/src/ComputeGEMVT.hpp b/src/ComputeGEMVT.hpp new file mode 100644 index 00000000..e718942b --- /dev/null +++ b/src/ComputeGEMVT.hpp @@ -0,0 +1,35 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMVT_HPP +#define COMPUTE_GEMVT_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMVT(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y, + bool & isOptimized); + +#endif // COMPUTE_GEMVT diff --git a/src/ComputeGEMVT_ref.cpp b/src/ComputeGEMVT_ref.cpp new file mode 100644 index 00000000..153d9264 --- /dev/null +++ b/src/ComputeGEMVT_ref.cpp @@ -0,0 +1,153 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ +#ifndef HPCG_NO_MPI + #include "Utils_MPI.hpp" +#endif +#include "ComputeGEMVT_ref.hpp" + +template +int ComputeGEMVT_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y) { + + typedef typename MultiVector_type::scalar_type scalarA_type; + typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; + typedef typename Vector_type::scalar_type scalarY_type; + + const scalarA_type one (1.0); + const scalarA_type zero (0.0); + + assert(x.localLength >= m); // Test vector lengths + assert(y.m >= n); + assert(y.n == 1); + + // Input serial dense vector + scalarA_type * const Av = A.values; + scalarX_type * const xv = x.values; + scalarY_type * const yv = y.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < n; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < n; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Perform GEMV on device + if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDgemv(x.handle, CUBLAS_OP_T, + m, n, + (double*)&alpha, (double*)d_Av, m, + (double*)d_xv, 1, + (double*)&beta, (double*)d_yv, 1)){ + printf( " Failed cublasDgemv\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSgemv(x.handle, CUBLAS_OP_T, + m, n, + (float*)&alpha, (float*)d_Av, m, + (float*)d_xv, 1, + (float*)&beta, (float*)d_yv, 1)){ + printf( " Failed cublasSgemv\n" ); + } + } + + // Copy input serial dense vector to host + if (cudaSuccess != cudaMemcpy(yv, d_yv, n*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( 
" Failed to memcpy d_x\n" ); + } + } else { + HPCG_fout << " Mixed-precision GEMV not supported" << std::endl; + + // Copy input matrix A from HOST CPU + if (cudaSuccess != cudaMemcpy(Av, d_Av, m*n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + if (cudaSuccess != cudaMemcpy(xv, d_xv, m*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < n; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < n; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::getType (); + MPI_Allreduce(MPI_IN_PLACE, yv, n, MPI_SCALAR_TYPE, MPI_SUM, + MPI_COMM_WORLD); +#endif + + return 0; +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +int ComputeGEMVT_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, Vector const&, double, SerialDenseMatrix const&); + +template +int ComputeGEMVT_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, Vector const&, float, SerialDenseMatrix const&); + + diff --git a/src/ComputeGEMVT_ref.hpp b/src/ComputeGEMVT_ref.hpp new file mode 100644 index 00000000..f9a49256 --- /dev/null +++ b/src/ComputeGEMVT_ref.hpp @@ -0,0 +1,34 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file ComputeGEMVT_re.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMVT_REF_HPP +#define COMPUTE_GEMVT_REF_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMVT_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const Vector_type & x, + const typename Vector_type::scalar_type beta, const SerialDenseMatrix_type & y); + +#endif // COMPUTE_GEMVT diff --git a/src/ComputeGEMV_ref.cpp b/src/ComputeGEMV_ref.cpp new file mode 100644 index 00000000..c67408b0 --- /dev/null +++ b/src/ComputeGEMV_ref.cpp @@ -0,0 +1,152 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ + +#include "ComputeGEMV_ref.hpp" + +template +int ComputeGEMV_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, + const typename Vector_type::scalar_type beta, const Vector_type & y) { + + typedef typename MultiVector_type::scalar_type scalarA_type; + typedef typename SerialDenseMatrix_type::scalar_type scalarX_type; + typedef typename Vector_type::scalar_type scalarY_type; + + const scalarA_type one (1.0); + const scalarA_type zero (0.0); + + assert(x.m >= n); // Test vector lengths + assert(x.n == 1); + assert(y.localLength >= m); + + // Input serial dense vector + const scalarX_type * const xv = x.values; + + scalarA_type * const Av = A.values; + scalarY_type * const yv = y.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < m; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < m; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t j=0; j::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Copy input serial dense vector to device + if (cudaSuccess != cudaMemcpy(d_xv, xv, n*sizeof(scalarX_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + + // Perform GEMV on device + if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDgemv(y.handle, CUBLAS_OP_N, + m, n, + (double*)&alpha, (double*)d_Av, m, + (double*)d_xv, 1, + (double*)&beta, (double*)d_yv, 1)){ + printf( " Failed cublasDgemv\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSgemv(y.handle, CUBLAS_OP_N, + m, n, + (float*)&alpha, (float*)d_Av, m, + (float*)d_xv, 1, + (float*)&beta, (float*)d_yv, 1)){ + printf( " Failed cublasSgemv\n" ); + } + } + } else { + 
HPCG_fout << " Mixed-precision GEMV not supported" << std::endl; + + // Copy input matrix A from HOST CPU + if (cudaSuccess != cudaMemcpy(Av, d_Av, m*n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + + // GEMV on HOST CPU + if (beta == zero) { + for (local_int_t i = 0; i < m; i++) yv[i] = zero; + } else if (beta != one) { + for (local_int_t i = 0; i < m; i++) yv[i] *= beta; + } + + if (alpha == one) { + for (local_int_t i=0; i, Vector, SerialDenseMatrix > + (int, int, double, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + +template +int ComputeGEMV_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, SerialDenseMatrix const&, float, Vector const&); + + +// mixed +template +int ComputeGEMV_ref< MultiVector, Vector, SerialDenseMatrix > + (int, int, float, MultiVector const&, SerialDenseMatrix const&, double, Vector const&); + diff --git a/src/ComputeGEMV_ref.hpp b/src/ComputeGEMV_ref.hpp new file mode 100644 index 00000000..197c53cd --- /dev/null +++ b/src/ComputeGEMV_ref.hpp @@ -0,0 +1,34 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! 
+ @file Vector.hpp + + HPCG data structures for dense vectors + */ + +#ifndef COMPUTE_GEMV_REF_HPP +#define COMPUTE_GEMV_REF_HPP + +#include "Geometry.hpp" +#include "MultiVector.hpp" +#include "Vector.hpp" +#include "SerialDenseMatrix.hpp" + +template +int ComputeGEMV_ref(const local_int_t m, const local_int_t n, + const typename MultiVector_type::scalar_type alpha, const MultiVector_type & A, const SerialDenseMatrix_type & x, + const typename Vector_type::scalar_type beta, const Vector_type & y); + +#endif // COMPUTE_GEMV_REF diff --git a/src/ComputeGS_Forward_ref.cpp b/src/ComputeGS_Forward_ref.cpp index 2e99f2d5..7a088598 100644 --- a/src/ComputeGS_Forward_ref.cpp +++ b/src/ComputeGS_Forward_ref.cpp @@ -13,13 +13,24 @@ //@HEADER /*! - @file ComputeSYMGS_ref.cpp + @file ComputeGS_Forward_ref.cpp HPCG routine */ #ifndef HPCG_NO_MPI -#include "ExchangeHalo.hpp" + #include "ExchangeHalo.hpp" +#endif +#ifdef HPCG_WITH_CUDA + #include + #include + #include "ComputeSPMV.hpp" + #include "ComputeWAXPBY.hpp" + #ifdef HPCG_DEBUG + #include + #include "Utils_MPI.hpp" + #include "Hpgmp_Params.hpp" + #endif #endif #include "ComputeGS_Forward_ref.hpp" #include @@ -53,14 +64,53 @@ int ComputeGS_Forward_ref(const SparseMatrix_type & A, const Vector_type & r, Ve assert(x.localLength==A.localNumberOfColumns); // Make sure x contain space for halo values typedef typename SparseMatrix_type::scalar_type scalar_type; + const local_int_t nrow = A.localNumberOfRows; + const local_int_t ncol = A.localNumberOfColumns; + + const scalar_type * const rv = r.values; + scalar_type * const xv = x.values; + #ifndef HPCG_NO_MPI - ExchangeHalo(A,x); + #ifdef HPCG_WITH_CUDA + // workspace + Vector_type b = A.x; // nrow + scalar_type * const d_bv = b.d_values; + scalar_type * const d_xv = x.d_values; + + // Copy local part of X to HOST CPU + if (A.geom->rank==0) printf( " HaloExchange on Host for GS_Forward\n" ); + if (cudaSuccess != cudaMemcpy(xv, d_xv, nrow*sizeof(scalar_type), 
cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + #endif + + // Exchange Halo on HOST CPU + ExchangeHalo(A, x); + + #ifdef HPCG_WITH_CUDA + // Copy X (after Halo Exchange on host) to device + #define HPCG_COMPACT_GS + #ifdef HPCG_COMPACT_GS + // Copy non-local part of X (after Halo Exchange) into x0 on device + //if (cudaSuccess != cudaMemcpy(&d_xv[nrow], &xv[nrow], (ncol-nrow)*sizeof(scalar_type), cudaMemcpyHostToDevice)) { + // printf( " Failed to memcpy d_y\n" ); + //} + #else + Vector_type x0 = A.y; // ncol + scalar_type * const x0v = x0.values; + CopyVector(x, x0); // this also copy on CPU, which is needed only for debug + #endif + + #ifdef HPCG_DEBUG + if (A.geom->rank==0) { + HPCG_fout << A.geom->rank << " : ComputeGS(" << nrow << " x " << ncol << ") start" << std::endl; + } + #endif + #endif #endif - const local_int_t nrow = A.localNumberOfRows; +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) scalar_type ** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues - const scalar_type * const rv = r.values; - scalar_type * const xv = x.values; for (local_int_t i=0; i < nrow; i++) { const scalar_type * const currentValues = A.matrixValues[i]; @@ -76,8 +126,113 @@ int ComputeGS_Forward_ref(const SparseMatrix_type & A, const Vector_type & r, Ve sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop xv[i] = sum/currentDiagonal; + } +#endif + +#ifdef HPCG_WITH_CUDA + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + #ifdef HPCG_COMPACT_GS + // b = r - Ux + if (cudaSuccess != cudaMemcpy(d_bv, r.d_values, nrow*sizeof(scalar_type), cudaMemcpyDeviceToDevice)) { + printf( " Failed to memcpy d_r\n" ); } + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, A.nnzU, + (const double*)&mone, A.descrU, + (double*)A.d_Unzvals, A.d_Urow_ptr, A.d_Ucol_idx, + (double*)d_xv, + (const 
double*)&one, (double*)d_bv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, A.nnzU, + (const float*)&mone, A.descrA, + (float*)A.d_Unzvals, A.d_Urow_ptr, A.d_Ucol_idx, + (float*)d_xv, + (const float*)&one, (float*)d_bv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // b = r - Ax0 + ComputeSPMV(A, x0, b); + ComputeWAXPBY(nrow, -one, b, one, r, b, A.isWaxpbyOptimized); + #endif + + // x = L^{-1}b + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrsv_solve(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, + (const double*)&one, A.descrL, + (double*)A.d_Lnzvals, A.d_Lrow_ptr, A.d_Lcol_idx, + A.infoL, + (double*)d_bv, (double*)d_xv)) { + printf( " Failed cusparseDcsrv_solve\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrsv_solve(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, + (const float*)&one, A.descrL, + (float*)A.d_Lnzvals, A.d_Lrow_ptr, A.d_Lcol_idx, + A.infoL, + (float*)d_bv, (float*)d_xv)) { + printf( " Failed cusparseScsrv_solve\n" ); + } + } + + #ifdef HPCG_DEBUG + scalar_type * tv = (scalar_type *)malloc(nrow * sizeof(scalar_type)); + for (int i=0; i::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_rnorm, &rnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + xnorm = l_xnorm; + rnorm = l_rnorm; + #endif + enorm = sqrt(enorm); + xnorm = sqrt(xnorm); + rnorm = sqrt(rnorm); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : GS_forward(" << nrow << " x " << ncol << "): error = " << enorm << " (x=" << xnorm << ", r=" << rnorm << ")" << std::endl; + } + free(tv); + #endif +#endif return 0; } diff 
--git a/src/ComputeMG_ref.cpp b/src/ComputeMG_ref.cpp index f5c85530..ac40b24b 100644 --- a/src/ComputeMG_ref.cpp +++ b/src/ComputeMG_ref.cpp @@ -19,10 +19,14 @@ */ #include "ComputeMG_ref.hpp" +#include "ComputeSYMGS_ref.hpp" #include "ComputeGS_Forward_ref.hpp" #include "ComputeSPMV_ref.hpp" #include "ComputeRestriction_ref.hpp" #include "ComputeProlongation_ref.hpp" +#ifdef HPCG_DEBUG +#include "Hpgmp_Params.hpp" +#endif #include #include @@ -45,20 +49,42 @@ int ComputeMG_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_typ int ierr = 0; if (A.mgData!=0) { // Go to next coarse level if defined int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps; - for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << std::endl << " > PreSmooth( " << numberOfPresmootherSteps << " ) " << std::endl; + #endif + if (symmetric) { + for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeSYMGS_ref(A, r, x); + } else { + for (int i=0; i< numberOfPresmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf); if (ierr!=0) return ierr; + // Perform restriction operation using simple injection ierr = ComputeRestriction_ref(A, r); if (ierr!=0) return ierr; ierr = ComputeMG_ref(*A.Ac,*A.mgData->rc, *A.mgData->xc, symmetric); if (ierr!=0) return ierr; ierr = ComputeProlongation_ref(A, x); if (ierr!=0) return ierr; int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps; - for (int i=0; i< numberOfPostsmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << " > PostSmooth( " << numberOfPostsmootherSteps << " ) " << std::endl; + #endif + if (symmetric) { + for (int i=0; i< numberOfPostsmootherSteps; ++i) ierr += ComputeSYMGS_ref(A, r, x); + } else { + for (int i=0; 
i< numberOfPostsmootherSteps; ++i) ierr += ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; } else { - ierr = ComputeGS_Forward_ref(A, r, x); + #if defined(HPCG_WITH_CUDA) & defined(HPCG_DEBUG) + if (A.geom->rank==0) HPCG_fout << std::endl << " > CoarseSolve( " << 1 << " ) " << std::endl; + #endif + if (symmetric) { + ierr = ComputeSYMGS_ref(A, r, x); + } else { + ierr = ComputeGS_Forward_ref(A, r, x); + } if (ierr!=0) return ierr; } return 0; diff --git a/src/ComputeProlongation_ref.cpp b/src/ComputeProlongation_ref.cpp index 096c9073..5df9b451 100644 --- a/src/ComputeProlongation_ref.cpp +++ b/src/ComputeProlongation_ref.cpp @@ -45,11 +45,58 @@ int ComputeProlongation_ref(const SparseMatrix_type & Af, Vector_type & xf) { local_int_t * f2c = Af.mgData->f2cOperator; local_int_t nc = Af.mgData->rc->localLength; -#ifndef HPCG_NO_OPENMP -#pragma omp parallel for -#endif -// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices - for (local_int_t i=0; ixc->d_values; + #if 1 + const scalar_type zero ( 0.0); + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(Af.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + nc, n, nc, + (const double*)&one, Af.mgData->descrA, + (double*)Af.mgData->d_nzvals, Af.mgData->d_row_ptr, Af.mgData->d_col_idx, + (double*)d_xcv, + (const double*)&one, (double*)d_xfv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(Af.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, + nc, n, nc, + (const float*)&one, Af.mgData->descrA, + (float*)Af.mgData->d_nzvals, Af.mgData->d_row_ptr, Af.mgData->d_col_idx, + (float*)d_xcv, + (const float*)&one, (float*)d_xfv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // Copy the whole compressed vector from Device to Host.. 
+ if (Af.geom->rank==0) printf( " Prologation on CPU ..\n" ); + if (cudaSuccess != cudaMemcpy(xfv, d_xfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + if (cudaSuccess != cudaMemcpy(xcv, d_xcv, nc*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + + // Prologation on host + for (local_int_t i=0; i -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include // needed for fabs diff --git a/src/ComputeRestriction_ref.cpp b/src/ComputeRestriction_ref.cpp index 10b363ab..025172c2 100644 --- a/src/ComputeRestriction_ref.cpp +++ b/src/ComputeRestriction_ref.cpp @@ -48,10 +48,75 @@ int ComputeRestriction_ref(const SparseMatrix_type & A, const Vector_type & rf) local_int_t * f2c = A.mgData->f2cOperator; local_int_t nc = A.mgData->rc->localLength; -#ifndef HPCG_NO_OPENMP -#pragma omp parallel for -#endif - for (local_int_t i=0; iAxf->d_values; + scalar_type * d_rfv = rf.d_values; + scalar_type * d_rcv = A.mgData->rc->d_values; + #if 1 + const scalar_type zero ( 0.0); + const scalar_type one ( 1.0); + const scalar_type mone (-1.0); + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const double*)&one, A.mgData->descrA, + (double*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (double*)d_rfv, + (const double*)&zero, (double*)d_rcv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const double*)&mone, A.mgData->descrA, + (double*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (double*)d_Axfv, + (const double*)&one, (double*)d_rcv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const float*)&one, 
A.mgData->descrA, + (float*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (float*)d_rfv, + (const float*)&zero, (float*)d_rcv)) { + printf( " Failed cusparseScsrmv\n" ); + } + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nc, n, nc, + (const float*)&mone, A.mgData->descrA, + (float*)A.mgData->d_nzvals, A.mgData->d_row_ptr, A.mgData->d_col_idx, + (float*)d_Axfv, + (const float*)&one, (float*)d_rcv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #else + // Copy the whole prologated vector from Device to Host + if (cudaSuccess != cudaMemcpy(rfv, d_rfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_rfv\n" ); + } + if (cudaSuccess != cudaMemcpy(Axfv, d_Axfv, n*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_Axfv\n" ); + } + + // Restriction on CPU + if (A.geom->rank==0) printf( " Restriction on CPU\n" ); + for (local_int_t i=0; i + #include +#endif +#ifdef HPCG_WITH_CUDA + #include + #include + + #if defined(HPCG_DEBUG) & !defined(HPCG_NO_MPI) + #include + #include "Utils_MPI.hpp" + #include "Hpgmp_Params.hpp" + #endif #endif #include @@ -51,15 +61,28 @@ int ComputeSPMV_ref(const SparseMatrix_type & A, Vector_type & x, Vector_type & assert(y.localLength>=A.localNumberOfRows); typedef typename SparseMatrix_type::scalar_type scalar_type; + const local_int_t nrow = A.localNumberOfRows; + scalar_type * const xv = x.values; + scalar_type * const yv = y.values; + #ifndef HPCG_NO_MPI - ExchangeHalo(A,x); + if (A.geom->size > 1) { + #ifdef HPCG_WITH_CUDA + // Copy local part of X to HOST CPU + if (A.geom->rank==0) printf( " HaloExchange on Host for SpMV\n" ); + if (cudaSuccess != cudaMemcpy(xv, x.d_values, nrow*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + #endif + + ExchangeHalo(A, x); + } #endif - const scalar_type * const xv = x.values; - scalar_type * const yv = y.values; - const 
local_int_t nrow = A.localNumberOfRows; -#ifndef HPCG_NO_OPENMP + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) + #ifndef HPCG_NO_OPENMP #pragma omp parallel for -#endif + #endif for (local_int_t i=0; i< nrow; i++) { scalar_type sum = 0.0; const scalar_type * const cur_vals = A.matrixValues[i]; @@ -70,6 +93,77 @@ int ComputeSPMV_ref(const SparseMatrix_type & A, Vector_type & x, Vector_type & sum += cur_vals[j]*xv[cur_inds[j]]; yv[i] = sum; } +#endif + +#ifdef HPCG_WITH_CUDA + const scalar_type one (1.0); + const scalar_type zero (0.0); + const local_int_t ncol = A.localNumberOfColumns; + const global_int_t nnz = A.localNumberOfNonzeros; + + scalar_type * const d_xv = x.d_values; + scalar_type * const d_yv = y.d_values; + if (A.geom->size > 1) { + // copy non-local part of X to device (after Halo exchange) + if (cudaSuccess != cudaMemcpy(&d_xv[nrow], &xv[nrow], (ncol-nrow)*sizeof(scalar_type), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy d_x\n" ); + } + } + + if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseDcsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, + (const double*)&one, A.descrA, + (double*)A.d_nzvals, A.d_row_ptr, A.d_col_idx, + (double*)d_xv, + (const double*)&zero, (double*)d_yv)) { + printf( " Failed cusparseDcsrmv\n" ); + } + } else if (std::is_same::value) { + if (CUSPARSE_STATUS_SUCCESS != cusparseScsrmv(A.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + nrow, ncol, nnz, + (const float*)&one, A.descrA, + (float*)A.d_nzvals, A.d_row_ptr, A.d_col_idx, + (float*)d_xv, + (const float*)&zero, (float*)d_yv)) { + printf( " Failed cusparseScsrmv\n" ); + } + } + #ifdef HPCG_DEBUG + scalar_type * tv = (scalar_type *)malloc(nrow * sizeof(scalar_type)); + if (cudaSuccess != cudaMemcpy(tv, d_yv, nrow*sizeof(scalar_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_y\n" ); + } + scalar_type l_enorm = 0.0; + scalar_type l_xnorm = 0.0; + scalar_type l_ynorm = 0.0; + for (int j=0; 
j::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_ynorm, &ynorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + xnorm = l_xnorm; + ynorm = l_ynorm; + #endif + enorm = sqrt(enorm); + xnorm = sqrt(xnorm); + ynorm = sqrt(ynorm); + if (A.geom->rank == 0) { + HPCG_fout << A.geom->rank << " : SpMV(" << nrow << " x " << ncol << "): error = " << enorm << "(x=" << xnorm << ", y=" << ynorm << ")" << std::endl; + } + free(tv); + #endif +#endif return 0; } diff --git a/src/ComputeSYMGS.cpp b/src/ComputeSYMGS.cpp new file mode 100644 index 00000000..f854dfd1 --- /dev/null +++ b/src/ComputeSYMGS.cpp @@ -0,0 +1,56 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file ComputeSYMGS.cpp + + HPCG routine + */ + +#include "ComputeSYMGS.hpp" +#include "ComputeSYMGS_ref.hpp" + +/*! + Routine to compute one step of symmetric Gauss-Seidel: + + Assumption about the structure of matrix A: + - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i] + - Entries in row 'i' are ordered such that: + - lower triangular terms are stored before the diagonal element. + - upper triangular terms are stored after the diagonal element. + - No other assumptions are made about entry ordering. + + Symmetric Gauss-Seidel notes: + - We use the input vector x as the RHS and start with an initial guess for y of all zeros. + - We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A. + - We then perform one back sweep. 
+ - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after + + @param[in] A the known system matrix + @param[in] r the input vector + @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep with r as the RHS. + + @return returns 0 upon success and non-zero otherwise + + @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out of sync with other kernels. + + @see ComputeSYMGS_ref +*/ +template +int ComputeSYMGS(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x) { + + // This line and the next two lines should be removed and your version of ComputeSYMGS should be used. + return ComputeSYMGS_ref(A, r, x); + +} diff --git a/src/ComputeSYMGS.hpp b/src/ComputeSYMGS.hpp new file mode 100644 index 00000000..d203873a --- /dev/null +++ b/src/ComputeSYMGS.hpp @@ -0,0 +1,23 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +#ifndef COMPUTESYMGS_HPP +#define COMPUTESYMGS_HPP +#include "SparseMatrix.hpp" +#include "Vector.hpp" + +template +int ComputeSYMGS(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x); + +#endif // COMPUTESYMGS_HPP diff --git a/src/ComputeSYMGS_ref.cpp b/src/ComputeSYMGS_ref.cpp new file mode 100644 index 00000000..13e85603 --- /dev/null +++ b/src/ComputeSYMGS_ref.cpp @@ -0,0 +1,117 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file ComputeSYMGS_ref.cpp + + HPCG routine + */ + +#ifndef HPCG_NO_MPI +#include "ExchangeHalo.hpp" +#endif +#include "ComputeSYMGS_ref.hpp" +#include + +/*! + Computes one step of symmetric Gauss-Seidel: + + Assumption about the structure of matrix A: + - Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i] + - Entries in row 'i' are ordered such that: + - lower triangular terms are stored before the diagonal element. + - upper triangular terms are stored after the diagonal element. + - No other assumptions are made about entry ordering. + + Symmetric Gauss-Seidel notes: + - We use the input vector x as the RHS and start with an initial guess for y of all zeros. + - We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit this fact. + - We then perform one back sweep. + - For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after + + @param[in] A the known system matrix + @param[in] r the input vector + @param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep with r as the RHS. + + + @warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out of sync with other kernels. 
+ + @return returns 0 upon success and non-zero otherwise + + @see ComputeSYMGS +*/ +template +int ComputeSYMGS_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x) { + + assert(x.localLength==A.localNumberOfColumns); // Make sure x contain space for halo values + + typedef typename SparseMatrix_type::scalar_type scalar_type; +#ifndef HPCG_NO_MPI + ExchangeHalo(A,x); +#endif + + const local_int_t nrow = A.localNumberOfRows; + scalar_type ** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues + const scalar_type * const rv = r.values; + scalar_type * const xv = x.values; + + for (local_int_t i=0; i< nrow; i++) { + const scalar_type * const currentValues = A.matrixValues[i]; + const local_int_t * const currentColIndices = A.mtxIndL[i]; + const int currentNumberOfNonzeros = A.nonzerosInRow[i]; + const scalar_type currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value + scalar_type sum = rv[i]; // RHS value + + for (int j=0; j< currentNumberOfNonzeros; j++) { + local_int_t curCol = currentColIndices[j]; + sum -= currentValues[j] * xv[curCol]; + } + sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop + + xv[i] = sum/currentDiagonal; + + } + + // Now the back sweep. 
+ + for (local_int_t i=nrow-1; i>=0; i--) { + const scalar_type * const currentValues = A.matrixValues[i]; + const local_int_t * const currentColIndices = A.mtxIndL[i]; + const int currentNumberOfNonzeros = A.nonzerosInRow[i]; + const scalar_type currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value + scalar_type sum = rv[i]; // RHS value + + for (int j = 0; j< currentNumberOfNonzeros; j++) { + local_int_t curCol = currentColIndices[j]; + sum -= currentValues[j]*xv[curCol]; + } + sum += xv[i]*currentDiagonal; // Remove diagonal contribution from previous loop + + xv[i] = sum/currentDiagonal; + } + + return 0; +} + + +/* --------------- * + * specializations * + * --------------- */ + +template +int ComputeSYMGS_ref< SparseMatrix, Vector >(SparseMatrix const&, Vector const&, Vector&); + +template +int ComputeSYMGS_ref< SparseMatrix, Vector >(SparseMatrix const&, Vector const&, Vector&); + diff --git a/src/ComputeSYMGS_ref.hpp b/src/ComputeSYMGS_ref.hpp new file mode 100644 index 00000000..0487ebef --- /dev/null +++ b/src/ComputeSYMGS_ref.hpp @@ -0,0 +1,23 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +#ifndef COMPUTESYMGS_REF_HPP +#define COMPUTESYMGS_REF_HPP +#include "SparseMatrix.hpp" +#include "Vector.hpp" + +template +int ComputeSYMGS_ref(const SparseMatrix_type & A, const Vector_type & r, Vector_type & x); + +#endif // COMPUTESYMGS_REF_HPP diff --git a/src/ComputeWAXPBY_ref.cpp b/src/ComputeWAXPBY_ref.cpp index ed7487f4..396b59ca 100644 --- a/src/ComputeWAXPBY_ref.cpp +++ b/src/ComputeWAXPBY_ref.cpp @@ -20,7 +20,17 @@ #include "ComputeWAXPBY_ref.hpp" #ifndef HPCG_NO_OPENMP -#include + #include +#endif +#ifdef HPCG_WITH_CUDA + #include + #include "cublas_v2.h" + + #if defined(HPCG_DEBUG) & !defined(HPCG_NO_MPI) + #include + #include "Hpgmp_Params.hpp" + #include "Utils_MPI.hpp" + #endif #endif #include /*! @@ -49,30 +59,134 @@ int ComputeWAXPBY_ref(const local_int_t n, assert(x.localLength>=n); // Test vector lengths assert(y.localLength>=n); + // quick return + if (n <= 0) return 0; + typedef typename VectorX_type::scalar_type scalarX_type; typedef typename VectorY_type::scalar_type scalarY_type; typedef typename VectorW_type::scalar_type scalarW_type; - const scalarX_type * const xv = x.values; - const scalarY_type * const yv = y.values; - scalarW_type * const wv = w.values; + scalarX_type * const xv = x.values; + scalarY_type * const yv = y.values; + scalarW_type * const wv = w.values; + +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) if (alpha==1.0) { -#ifndef HPCG_NO_OPENMP + #ifndef HPCG_NO_OPENMP #pragma omp parallel for -#endif + #endif for (local_int_t i=0; i::value && std::is_same::value && std::is_same::value) || + (std::is_same::value && std::is_same::value && std::is_same::value)) { + + // Compute axpy on Nvidia GPU + // w = x (assuming y is not w) + if (cudaSuccess != cudaMemcpy(d_wv, d_xv, n*sizeof(scalarW_type), cudaMemcpyDeviceToDevice)) { + printf( " 
Failed to memcpy d_w\n" ); + } + if (std::is_same::value) { + // w = alpha*w + if (CUBLAS_STATUS_SUCCESS != cublasDscal (w.handle, n, (const double*)&alpha, (double*)d_wv, 1)) { + printf( " Failed cublasDscal\n" ); + } + // w += alpha*x + if (CUBLAS_STATUS_SUCCESS != cublasDaxpy (w.handle, n, (const double*)&beta, (double*)d_yv, 1, (double*)d_wv, 1)) { + printf( " Failed cublasDdot\n" ); + } + } else if (std::is_same::value) { + // w = beta*y + if (CUBLAS_STATUS_SUCCESS != cublasSscal (w.handle, n, (const float*)&alpha, (float*)d_wv, 1)) { + printf( " Failed cublasSscal\n" ); + } + // w += alpha*x + if (CUBLAS_STATUS_SUCCESS != cublasSaxpy (w.handle, n, (const float*)&beta, (float*) d_yv, 1, (float*) d_wv, 1)) { + printf( " Failed cublasDdot\n" ); + } + } + + #ifdef HPCG_DEBUG + scalarW_type * tv = (scalarW_type *)malloc(n * sizeof(scalarW_type)); + if (cudaSuccess != cudaMemcpy(tv, d_wv, n*sizeof(scalarW_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_w\n" ); + } + scalarW_type l_enorm = 0.0; + scalarW_type l_wnorm = 0.0; + scalarW_type l_xnorm = 0.0; + scalarW_type l_ynorm = 0.0; + for (int j=0; j::getType (); + MPI_Allreduce(&l_enorm, &enorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_wnorm, &wnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_xnorm, &xnorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&l_ynorm, &ynorm, 1, MPI_SCALAR_TYPE, MPI_SUM, MPI_COMM_WORLD); + #else + enorm = l_enorm; + wnorm = l_wnorm; + xnorm = l_xnorm; + ynorm = l_ynorm; + #endif + enorm = sqrt(enorm); + wnorm = sqrt(wnorm); + xnorm = sqrt(xnorm); + ynorm = sqrt(ynorm); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + HPCG_fout << rank << " : WAXPBY(" << n << "): error = " << enorm << " (alpha=" << alpha << ", beta=" << beta + << ", x=" << xnorm << ", y=" << ynorm << ", w=" << wnorm << ")" << std::endl; + } + free(tv); + #endif + } else { + HPCG_fout << " Mixed-precision WAXPBY not 
supported" << std::endl; + + // copy Input vectors to Host + if (cudaSuccess != cudaMemcpy(xv, d_xv, n*sizeof(scalarX_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_x\n" ); + } + if (cudaSuccess != cudaMemcpy(yv, d_yv, n*sizeof(scalarY_type), cudaMemcpyDeviceToHost)) { + printf( " Failed to memcpy d_w\n" ); + } + + // WAXPBY on Host for (local_int_t i=0; i #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "GMRES.hpp" #include "mytimer.hpp" @@ -32,6 +32,7 @@ #include "ComputeWAXPBY.hpp" #include "ComputeTRSM.hpp" #include "ComputeGEMV.hpp" +#include "ComputeGEMVT.hpp" // Use TICK and TOCK to time a code section in MATLAB-like fashion @@ -56,14 +57,14 @@ @return Returns zero on success and a non-zero value otherwise. - @see CG_ref() + @see GMRES_ref() */ template int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b, Vector_type & x, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * times, bool doPreconditioning) { - + double * times, double * flops, bool doPreconditioning) { + typedef typename SparseMatrix_type::scalar_type scalar_type; typedef MultiVector MultiVector_type; typedef SerialDenseMatrix SerialDenseMatrix_type; @@ -71,7 +72,7 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b const scalar_type one (1.0); const scalar_type zero (0.0); double t_begin = mytimer(); // Start timing right away - double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0; + double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0, t6 = 0.0; normr = 0.0; scalar_type rtz = zero, oldrtz = zero, alpha = zero, beta = zero, pAp = zero; @@ -80,20 +81,24 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b // double t6 = 0.0; //#endif local_int_t nrow = 
A.localNumberOfRows; + local_int_t Nrow = A.totalNumberOfRows; Vector_type & r = data.r; // Residual vector Vector_type & z = data.z; // Preconditioned residual vector Vector_type & p = data.p; // Direction vector (in MPI mode ncol>=nrow) Vector_type & Ap = data.Ap; SerialDenseMatrix_type H; + SerialDenseMatrix_type h; + SerialDenseMatrix_type t; SerialDenseMatrix_type cs; SerialDenseMatrix_type ss; - SerialDenseMatrix_type t; MultiVector_type Q; + MultiVector_type P; Vector_type Qkm1; Vector_type Qk; Vector_type Qj; InitializeMatrix(H, restart_length+1, restart_length); + InitializeMatrix(h, restart_length+1, 1); InitializeMatrix(t, restart_length+1, 1); InitializeMatrix(cs, restart_length+1, 1); InitializeMatrix(ss, restart_length+1, 1); @@ -101,47 +106,47 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b if (!doPreconditioning && A.geom->rank==0) HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl; -#ifdef HPCG_DEBUG + bool verbose = true; int print_freq = 1; - if (print_freq>50) print_freq=50; - if (print_freq<1) print_freq=1; - if (A.geom->rank==0) HPCG_fout << std::endl << " Running GMRES(" << restart_length - << ") with max-iters = " << max_iter - << " and tol = " << tolerance - << (doPreconditioning ? " with precond " : " without precond ") - << ", nrow = " << nrow << std::endl; -#endif + if (verbose && A.geom->rank==0) { + HPCG_fout << std::endl << " Running GMRES(" << restart_length + << ") with max-iters = " << max_iter + << " and tol = " << tolerance + << (doPreconditioning ? 
" with precond " : " without precond ") + << ", nrow = " << nrow + << " on ( " << A.geom->npx << " x " << A.geom->npy << " x " << A.geom->npz + << " ) MPI grid " + << std::endl; + } + *flops = 0.0; + double flops_gmg = 0.0; + double flops_spmv = 0.0; + double flops_orth = 0.0; + global_int_t numSpMVs_MG = 1+(A.mgData->numberOfPresmootherSteps + A.mgData->numberOfPostsmootherSteps); niters = 0; bool converged = false; while (niters <= max_iter && !converged) { // p is of length ncols, copy x to p for sparse MV operation CopyVector(x, p); - TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p - TICK(); ComputeWAXPBY(nrow, one, b, -one, Ap, r, A.isWaxpbyOptimized); TOCK(t2); // r = b - Ax (x stored in p) - TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); flops_spmv += (2*A.totalNumberOfNonzeros); // Ap = A*p + TICK(); ComputeWAXPBY(nrow, one, b, -one, Ap, r, A.isWaxpbyOptimized); TOCK(t2); *flops += (2*Nrow); // r = b - Ax (x stored in p) + TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); normr = sqrt(normr); GetVector(Q, 0, Qj); CopyVector(r, Qj); - TICK(); ComputeWAXPBY(nrow, zero, Qj, one/normr, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qj, one/normr, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qj, one/normr); TOCK(t2); *flops += Nrow; // Record initial residual for convergence testing if (niters == 0) normr0 = normr; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << "GMRES Residual at the start of restart cycle = "<< normr - << ", " << normr/normr0 << std::endl; - #endif - + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES Residual at the start of restart cycle = "<< normr + << ", " << normr/normr0 << std::endl; + } if (normr/normr0 <= tolerance) { converged = true; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << " > GMRES converged " << std::endl; - 
#endif + if (verbose && A.geom->rank==0) HPCG_fout << " > GMRES converged " << std::endl; } -/*if (normr/normr0 <= tolerance || (niters > 0 && doPreconditioning)) { - printf( " done %d iters (%s)\n",niters, (converged ? "Converged" : "Not Converged") ); - printf( " done (%s)\n",(doPreconditioning ? "Precond" : "Not Precond") ); - for (int i = 0; i < nrow; i++) printf( "x[%d] = %e\n",i,x.values[i] ); -}*/ // do forward GS instead of symmetric GS bool symmetric = false; @@ -154,37 +159,61 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b GetVector(Q, k, Qk); TICK(); - if (doPreconditioning) - ComputeMG(A, Qkm1, z, symmetric); // Apply preconditioner - else + if (doPreconditioning) { + ComputeMG(A, Qkm1, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // Apply preconditioner + } else { CopyVector(Qkm1, z); // copy r to z (no preconditioning) + } TOCK(t5); // Preconditioner apply time // Qk = A*z - TICK(); ComputeSPMV(A, z, Qk); TOCK(t3); - - // MGS to orthogonalize z against Q(:,0:k-1), using dots - for (int j = 0; j < k; j++) { - // get j-th column of Q - GetVector(Q, j, Qj); + TICK(); ComputeSPMV(A, z, Qk); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); - alpha = zero; - for (int i = 0; i < 2; i++) { - // beta = Qk'*Qj - TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); - // Qk = Qk - beta * Qj - TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); - alpha += beta; + // orthogonalize z against Q(:,0:k-1), using dots + bool use_mgs = false; + TICK(); + if (use_mgs) { + // MGS2 + for (int j = 0; j < k; j++) { + // get j-th column of Q + GetVector(Q, j, Qj); + + alpha = zero; + for (int i = 0; i < 2; i++) { + // beta = Qk'*Qj + TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); + + // Qk = Qk - beta * Qj + TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); + alpha += beta; + 
} + SetMatrixValue(H, j, k-1, alpha); + } + } else { + // CGS2 + GetMultiVector(Q, 0, k-1, P); + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + SetMatrixValue(H, i, k-1, h.values[i]); + } + // reorthogonalize + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + AddMatrixValue(H, i, k-1, h.values[i]); } - SetMatrixValue(H, j, k-1, alpha); } + TOCK(t6); // Ortho time + flops_orth += (2*k*Nrow); // beta = norm(Qk) - TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); beta = sqrt(beta); // Qk = Qk / beta - TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qk, one/beta); *flops += Nrow; TOCK(t2); SetMatrixValue(H, k, k-1, beta); // Given's rotation @@ -220,49 +249,27 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b SetMatrixValue(cs, k-1, 0, cj); normr = std::abs(v2); - #ifdef HPCG_DEBUG - if (A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) - HPCG_fout << "GMRES Iteration = "<< k << " (" << niters << ") Scaled Residual = " - << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) { + HPCG_fout << "GMRES Iteration = "<< k << " (" << niters << ") Scaled Residual = " + << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; + } niters ++; k ++; } // end of restart-cycle // prepare to restart - #ifdef 
HPCG_DEBUG - if (A.geom->rank==0) - HPCG_fout << "GMRES restart: k = "<< k << " (" << niters << ")" << std::endl; - #endif - // > update x -/*if (A.geom->rank==0) { - printf( "\n k = %d\n",k ); - printf( "R=[\n" ); - for (int i = 0; i < k; i++) { - for (int j = 0; j < k; j++) printf("%e ",H.values[i + j * H.m] ); - printf("\n"); - } - printf("];\n\n"); - printf( "t=[\n" ); - for (int i = 0; i < k; i++) printf( "%e\n",t.values[i]); - printf("];\n\n"); - - if (niters == 1) { - printf( " nrow = %d, max_iter = %d\n",nrow,max_iter ); - printf( " Q = [\n" ); - for (int i = 0; i < nrow; i++) { - for (int j = 0; j <= k-1; j++) printf( "%e ",Q.values[i + j * nrow] ); - printf("\n"); + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES restart: k = "<< k << " (" << niters << ")" << std::endl; } - printf( " ];\n\n" ); - } -}*/ + // > update x ComputeTRSM(k-1, one, H, t); if (doPreconditioning) { - ComputeGEMV (nrow, k-1, one, Q, t, zero, r); // r = Q*t - ComputeMG(A, r, z, symmetric); // z = M*r - TICK(); ComputeWAXPBY(nrow, one, x, one, z, x, A.isWaxpbyOptimized); TOCK(t2); // x += z + ComputeGEMV(nrow, k-1, one, Q, t, zero, r, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // r = Q*t + TICK(); + ComputeMG(A, r, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // z = M*r + TOCK(t5); // Preconditioner apply time + TICK(); ComputeWAXPBY(nrow, one, x, one, z, x, A.isWaxpbyOptimized); TOCK(t2); *flops += (2*Nrow); // x += z } else { - ComputeGEMV (nrow, k-1, one, Q, t, one, x); // x += Q*t + ComputeGEMV (nrow, k-1, one, Q, t, one, x, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // x += Q*t } } // end of outer-loop @@ -277,6 +284,28 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b // times[6] += t6; // exchange halo time //#endif times[0] += mytimer() - t_begin; // Total time. All done... 
+ if (verbose && A.geom->rank==0) { + HPCG_fout << " > nnz(A) : " << A.totalNumberOfNonzeros << std::endl; + HPCG_fout << " > nnz(MG) : " << A.totalNumberOfMGNonzeros << " (" << numSpMVs_MG << ")" << std::endl; + HPCG_fout << " > SpMV : " << (flops_spmv / 1000000000.0) << " / " << t3 << " = " + << (flops_spmv / 1000000000.0) / t3 << " Gflop/s" << std::endl; + HPCG_fout << " > GMG : " << (flops_gmg / 1000000000.0) << " / " << t5 << " = " + << (flops_gmg / 1000000000.0) / t5 << " Gflop/s" << std::endl; + HPCG_fout << " > Orth : " << (flops_orth / 1000000000.0) << " / " << t6 << " = " + << (flops_orth / 1000000000.0) / t6 << " Gflop/s" << std::endl; + HPCG_fout << std::endl; + } + *flops += flops_gmg; + *flops += flops_spmv; + *flops += flops_orth; + + DeleteDenseMatrix(H); + DeleteDenseMatrix(h); + DeleteDenseMatrix(t); + DeleteDenseMatrix(cs); + DeleteDenseMatrix(ss); + DeleteMultiVector(Q); + return 0; } @@ -288,9 +317,9 @@ int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b template int GMRES< SparseMatrix, CGData, Vector > (SparseMatrix const&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); template int GMRES< SparseMatrix, CGData, Vector > (SparseMatrix const&, CGData&, Vector const&, Vector&, - const int, const int, float, int&, float&, float&, double*, bool); + const int, const int, float, int&, float&, float&, double*, double*, bool); diff --git a/src/GMRES.hpp b/src/GMRES.hpp index af1e2db8..609419a5 100644 --- a/src/GMRES.hpp +++ b/src/GMRES.hpp @@ -25,7 +25,7 @@ template int GMRES(const SparseMatrix_type & A, CGData_type & data, const Vector_type & b, Vector_type & x, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * 
times, bool doPreconditioning); + double * times, double * flops, bool doPreconditioning); // this function will compute the Conjugate Gradient iterations. // geom - Domain and processor topology information diff --git a/src/GMRES_IR.cpp b/src/GMRES_IR.cpp index a62ad933..e3e9047f 100644 --- a/src/GMRES_IR.cpp +++ b/src/GMRES_IR.cpp @@ -15,13 +15,13 @@ /*! @file GMRES_IR.cpp - HPCG routine + GMRES-IR routine */ #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "GMRES_IR.hpp" #include "mytimer.hpp" @@ -31,6 +31,7 @@ #include "ComputeWAXPBY.hpp" #include "ComputeTRSM.hpp" #include "ComputeGEMV.hpp" +#include "ComputeGEMVT.hpp" // Use TICK and TOCK to time a code section in MATLAB-like fashion @@ -55,14 +56,14 @@ @return Returns zero on success and a non-zero value otherwise. - @see CG_ref() + @see GMRES_IR_ref() */ template int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, const Vector_type & b_hi, Vector_type & x_hi, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr_hi, typename SparseMatrix_type::scalar_type & normr0_hi, - double * times, bool doPreconditioning) { + double * times, double *flops, bool doPreconditioning) { // higher precision for outer loop typedef typename SparseMatrix_type::scalar_type scalar_type; @@ -75,11 +76,8 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, typedef Vector Vector_type2; double t_begin = mytimer(); // Start timing right away - double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0; + double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0, t6 = 0.0; -//#ifndef HPCG_NO_MPI -// double t6 = 0.0; -//#endif // vectors/matrices in scalar_type2 (lower) const scalar_type2 one (1.0); const scalar_type2 zero (0.0); @@ -87,6 +85,7 @@ int GMRES_IR(const SparseMatrix_type & A, const 
SparseMatrix_type2 & A_lo, scalar_type2 rtz = zero, oldrtz = zero, alpha = zero, beta = zero, pAp = zero; local_int_t nrow = A_lo.localNumberOfRows; + local_int_t Nrow = A.totalNumberOfRows; Vector_type2 & x = data_lo.w; // Intermediate solution vector Vector_type2 & r = data_lo.r; // Residual vector Vector_type2 & z = data_lo.z; // Preconditioned residual vector @@ -94,14 +93,17 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, Vector_type2 & Ap = data_lo.Ap; SerialDenseMatrix_type2 H; + SerialDenseMatrix_type2 h; + SerialDenseMatrix_type2 t; SerialDenseMatrix_type2 cs; SerialDenseMatrix_type2 ss; - SerialDenseMatrix_type2 t; MultiVector_type2 Q; + MultiVector_type2 P; Vector_type2 Qkm1; Vector_type2 Qk; Vector_type2 Qj; InitializeMatrix(H, restart_length+1, restart_length); + InitializeMatrix(h, restart_length+1, 1); InitializeMatrix(t, restart_length+1, 1); InitializeMatrix(cs, restart_length+1, 1); InitializeMatrix(ss, restart_length+1, 1); @@ -116,45 +118,50 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, if (!doPreconditioning && A.geom->rank==0) HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl; -#ifdef HPCG_DEBUG int print_freq = 1; + bool verbose = true; if (print_freq>50) print_freq=50; if (print_freq<1) print_freq=1; - if (A.geom->rank==0) HPCG_fout << std::endl << " Running GMRES_IR(" << restart_length - << ") with max-iters = " << max_iter - << " and tol = " << tolerance - << (doPreconditioning ? " with precond " : " without precond ") - << ", nrow = " << nrow << std::endl; -#endif + if (verbose && A.geom->rank==0) { + HPCG_fout << std::endl << " Running GMRES_IR(" << restart_length + << ") with max-iters = " << max_iter + << " and tol = " << tolerance + << (doPreconditioning ? 
" with precond " : " without precond ") + << ", nrow = " << nrow << std::endl; + } + *flops = 0.0; + double flops_gmg = 0.0; + double flops_spmv = 0.0; + double flops_orth = 0.0; + global_int_t numSpMVs_MG = 1+(A.mgData->numberOfPresmootherSteps + A.mgData->numberOfPostsmootherSteps); niters = 0; bool converged = false; while (niters <= max_iter && !converged) { // > Compute residual vector (higher working precision) // p is of length ncols, copy x to p for sparse MV operation CopyVector(x_hi, p_hi); - TICK(); ComputeSPMV(A, p_hi, Ap_hi); TOCK(t3); // Ap = A*p - TICK(); ComputeWAXPBY(nrow, one_hi, b_hi, -one_hi, Ap_hi, r_hi, A.isWaxpbyOptimized); TOCK(t2); // r = b - Ax (x stored in p) - TICK(); ComputeDotProduct(nrow, r_hi, r_hi, normr_hi, t4, A.isDotProductOptimized); TOCK(t1); + TICK(); ComputeSPMV(A, p_hi, Ap_hi); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); // Ap = A*p + TICK(); ComputeWAXPBY(nrow, one_hi, b_hi, -one_hi, Ap_hi, r_hi, A.isWaxpbyOptimized); *flops += (2*Nrow); TOCK(t2); // r = b - Ax (x stored in p) + TICK(); ComputeDotProduct(nrow, r_hi, r_hi, normr_hi, t4, A.isDotProductOptimized); *flops += (2*Nrow); TOCK(t1); normr_hi = sqrt(normr_hi); // > Copy r and scale to the initial basis vector GetVector(Q, 0, Qj); CopyVector(r_hi, Qj); - TICK(); ComputeWAXPBY(nrow, zero, Qj, one_hi/normr_hi, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qj, one_hi/normr_hi, Qj, Qj, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qj, one_hi/normr_hi); *flops += Nrow; TOCK(t2); // Record initial residual for convergence testing if (niters == 0) normr0 = normr_hi; normr = normr_hi; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << "GMRES_IR Residual at the start of restart cycle = "<< normr - << ", " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0) { + HPCG_fout << "GMRES_IR Residual at the start of restart cycle = "<< normr + << ", " << normr/normr0 << std::endl; + } if (normr/normr0 <= 
tolerance) { converged = true; - #ifdef HPCG_DEBUG - if (A.geom->rank==0) HPCG_fout << " > GMRES_IR converged " << std::endl; - #endif + if (verbose && A.geom->rank==0) HPCG_fout << " > GMRES_IR converged " << std::endl; } // do forward GS instead of symmetric GS @@ -168,37 +175,61 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, GetVector(Q, k, Qk); TICK(); - if (doPreconditioning) - ComputeMG(A_lo, Qkm1, z, symmetric); // Apply preconditioner - else - CopyVector(Qkm1, z); // copy r to z (no preconditioning) + if (doPreconditioning) { + ComputeMG(A_lo, Qkm1, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // Apply preconditioner + } else { + CopyVector(Qkm1, z); // copy r to z (no preconditioning) + } TOCK(t5); // Preconditioner apply time // Qk = A*z - TICK(); ComputeSPMV(A_lo, z, Qk); TOCK(t3); + TICK(); ComputeSPMV(A_lo, z, Qk); flops_spmv += (2*A.totalNumberOfNonzeros); TOCK(t3); - // MGS to orthogonalize z against Q(:,0:k-1), using dots - for (int j = 0; j < k; j++) { - // get j-th column of Q - GetVector(Q, j, Qj); - - alpha = zero; - for (int i = 0; i < 2; i++) { - // beta = Qk'*Qj - TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); - - // Qk = Qk - beta * Qj - TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); - alpha += beta; + // orthogonalize z against Q(:,0:k-1), using dots + bool use_mgs = false; + TICK(); + if (use_mgs) { + // MGS2 + for (int j = 0; j < k; j++) { + // get j-th column of Q + GetVector(Q, j, Qj); + + alpha = zero; + for (int i = 0; i < 2; i++) { + // beta = Qk'*Qj + TICK(); ComputeDotProduct(nrow, Qk, Qj, beta, t4, A.isDotProductOptimized); TOCK(t1); + + // Qk = Qk - beta * Qj + TICK(); ComputeWAXPBY(nrow, one, Qk, -beta, Qj, Qk, A.isWaxpbyOptimized); TOCK(t2); + alpha += beta; + } + SetMatrixValue(H, j, k-1, alpha); + } + } else { + // CGS2 + GetMultiVector(Q, 0, k-1, P); + ComputeGEMVT (nrow, k, one, P, Qk, zero, 
h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + SetMatrixValue(H, i, k-1, h.values[i]); + } + // reorthogonalize + ComputeGEMVT (nrow, k, one, P, Qk, zero, h, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + ComputeGEMV (nrow, k, -one, P, h, one, Qk, A.isGemvOptimized); // h = Q(1:k)'*q(k+1) + for(int i = 0; i < k; i++) { + AddMatrixValue(H, i, k-1, h.values[i]); } - SetMatrixValue(H, j, k-1, alpha); } + TOCK(t6); // Ortho time + flops_orth += (2*k*Nrow); + // beta = norm(Qk) TICK(); ComputeDotProduct(nrow, Qk, Qk, beta, t4, A.isDotProductOptimized); TOCK(t1); beta = sqrt(beta); // Qk = Qk / beta - TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + //TICK(); ComputeWAXPBY(nrow, zero, Qk, one/beta, Qk, Qk, A.isWaxpbyOptimized); TOCK(t2); + TICK(); ScaleVectorValue(Qk, one/beta); *flops += Nrow; TOCK(t2); SetMatrixValue(H, k, k-1, beta); // Given's rotation @@ -234,50 +265,28 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, SetMatrixValue(cs, k-1, 0, cj); normr = std::abs(v2); - #ifdef HPCG_DEBUG - if (A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) - HPCG_fout << "GMRES_IR Iteration = "<< k << " (" << niters << ") Scaled Residual = " - << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; - #endif + if (verbose && A.geom->rank==0 && (k%print_freq == 0 || k+1 == restart_length)) { + HPCG_fout << "GMRES_IR Iteration = "<< k << " (" << niters << ") Scaled Residual = " + << normr << " / " << normr0 << " = " << normr/normr0 << std::endl; + } niters ++; k ++; } // end of restart-cycle // prepare to restart - #ifdef HPCG_DEBUG - if (A.geom->rank==0) - HPCG_fout << "GMRES_IR restart: k = "<< k << " (" << niters << ")" << std::endl; - #endif + if (verbose && A.geom->rank==0) + HPCG_fout << "GMRES_IR restart: k = "<< k << " (" << niters << ")" << std::endl; // > update 
x -#if 0 -printf( "\n k = %d\n",k ); -printf( "t=[\n" ); -for (int i = 0; i < k; i++) printf( "%e\n",t.values[i]); -printf("];\n\n"); - -printf( "R=[\n" ); -for (int i = 0; i < k; i++) { - for (int j = 0; j < k; j++) printf("%e ",H.values[i + j * H.m] ); - printf("\n"); -} -printf("];\n\n"); - -if (niters == 1) { - printf( " nrow = %d, max_iter = %d\n",nrow,max_iter ); - printf( " Q = [\n" ); - for (int i = 0; i < nrow; i++) { - for (int j = 0; j <= k-1; j++) printf( "%e ",Q.values[i + j * nrow] ); - printf("\n"); - } - printf( " ];\n\n" ); -} -#endif ComputeTRSM(k-1, one, H, t); if (doPreconditioning) { - ComputeGEMV (nrow, k-1, one, Q, t, zero, r); // r = Q*t - ComputeMG(A_lo, r, z, symmetric); // z = M*r - TICK(); ComputeWAXPBY(nrow, one_hi, x_hi, one, z, x_hi, A.isWaxpbyOptimized); TOCK(t2); // x += z + ComputeGEMV (nrow, k-1, one, Q, t, zero, r, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // r = Q*t + TICK(); + ComputeMG(A_lo, r, z, symmetric); flops_gmg += (2*numSpMVs_MG*A.totalNumberOfMGNonzeros); // z = M*r + TOCK(t5); // Preconditioner apply time + // mixed-precision + TICK(); ComputeWAXPBY(nrow, one_hi, x_hi, one, z, x_hi, A.isWaxpbyOptimized); *flops += (2*Nrow); TOCK(t2); // x += z } else { - ComputeGEMV (nrow, k-1, one_hi, Q, t, one_hi, x_hi); // x += Q*t + // mixed-precision + ComputeGEMV (nrow, k-1, one_hi, Q, t, one_hi, x_hi, A.isGemvOptimized); *flops += (2*Nrow*(k-1)); // x += Q*t } } // end of outer-loop @@ -288,10 +297,29 @@ if (niters == 1) { times[3] += t3; // SPMV time times[4] += t4; // AllReduce time times[5] += t5; // preconditioner apply time -//#ifndef HPCG_NO_MPI -// times[6] += t6; // exchange halo time -//#endif times[0] += mytimer() - t_begin; // Total time. All done... 
+ if (verbose && A.geom->rank==0) { + HPCG_fout << " > nnz(A) : " << A.totalNumberOfNonzeros << std::endl; + HPCG_fout << " > nnz(MG) : " << A.totalNumberOfMGNonzeros << " (" << numSpMVs_MG << ")" << std::endl; + HPCG_fout << " > SpMV : " << (flops_spmv / 1000000000.0) << " / " << t3 << " = " + << (flops_spmv / 1000000000.0) / t3 << " Gflop/s" << std::endl; + HPCG_fout << " > GMG : " << (flops_gmg / 1000000000.0) << " / " << t5 << " = " + << (flops_gmg / 1000000000.0) / t5 << " Gflop/s" << std::endl; + HPCG_fout << " > Orth : " << (flops_orth / 1000000000.0) << " / " << t6 << " = " + << (flops_orth / 1000000000.0) / t6 << " Gflop/s" << std::endl; + HPCG_fout << std::endl; + } + *flops += flops_gmg; + *flops += flops_spmv; + *flops += flops_orth; + + DeleteDenseMatrix(H); + DeleteDenseMatrix(t); + DeleteDenseMatrix(h); + DeleteDenseMatrix(cs); + DeleteDenseMatrix(ss); + DeleteMultiVector(Q); + return 0; } @@ -304,17 +332,17 @@ if (niters == 1) { template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, float, int&, float&, float&, double*, bool); + const int, const int, float, int&, float&, float&, double*, double*, bool); // mixed template int GMRES_IR< SparseMatrix, SparseMatrix, CGData, CGData, Vector > (SparseMatrix const&, SparseMatrix const&, CGData&, CGData&, Vector const&, Vector&, - const int, const int, double, int&, double&, double&, double*, bool); + const int, const int, double, int&, double&, double&, double*, double*, bool); diff --git a/src/GMRES_IR.hpp b/src/GMRES_IR.hpp index a0e095b7..df6ea326 100644 --- 
a/src/GMRES_IR.hpp +++ b/src/GMRES_IR.hpp @@ -26,7 +26,7 @@ int GMRES_IR(const SparseMatrix_type & A, const SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, const Vector_type & b_hi, Vector_type & x_hi, const int restart_length, const int max_iter, const typename SparseMatrix_type::scalar_type tolerance, int & niters, typename SparseMatrix_type::scalar_type & normr, typename SparseMatrix_type::scalar_type & normr0, - double * times, bool doPreconditioning); + double * times, double *flops, bool doPreconditioning); // this function will compute the Conjugate Gradient iterations. // geom - Domain and processor topology information diff --git a/src/GenerateGeometry.cpp b/src/GenerateGeometry.cpp index 9b33dfbb..90e9d94a 100644 --- a/src/GenerateGeometry.cpp +++ b/src/GenerateGeometry.cpp @@ -27,7 +27,7 @@ #ifdef HPCG_DEBUG #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" using std::endl; #endif diff --git a/src/GenerateNonsymProblem.cpp b/src/GenerateNonsymProblem.cpp index 7f46497f..f0880b47 100644 --- a/src/GenerateNonsymProblem.cpp +++ b/src/GenerateNonsymProblem.cpp @@ -49,8 +49,11 @@ void GenerateNonsymProblem(SparseMatrix_type & A, Vector_type * b, Vector_type * // Furthermore, any code must work for general unstructured sparse matrices. Special knowledge about the // specific nature of the sparsity pattern may not be explicitly used. 
- return(GenerateNonsymProblem_v1_ref(A, b, x, xexact, init_vect)); - //return(GenerateNonsymProblem_ref(A, b, x, xexact, init_vect)); + #if 1 + return GenerateNonsymProblem_v1_ref(A, b, x, xexact, init_vect); + #else + return GenerateNonsymProblem_ref(A, b, x, xexact, init_vect); + #endif } diff --git a/src/GenerateNonsymProblem_ref.cpp b/src/GenerateNonsymProblem_ref.cpp new file mode 100644 index 00000000..27284fdc --- /dev/null +++ b/src/GenerateNonsymProblem_ref.cpp @@ -0,0 +1,271 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file GenerateProblem_ref.cpp + + HPCG routine + */ + +#ifndef HPCG_NO_MPI +#include +#endif + +#ifndef HPCG_NO_OPENMP +#include +#endif + +#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) +#include +using std::endl; +#include "hpcg.hpp" +#endif +#include +#include + +#include "GenerateNonsymProblem_ref.hpp" + + +/*! + Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact solution. 
+ + @param[in] A The known system matrix + @param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry) + @param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry) + @param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0 non-zero on entry) + + @see GenerateGeometry +*/ + +template +void GenerateNonsymProblem_ref(SparseMatrix_type & A, Vector_type * b, Vector_type * x, Vector_type * xexact, bool init_vect) { + + typedef typename SparseMatrix_type::scalar_type matrix_scalar_type; + typedef typename Vector_type::scalar_type vector_scalar_type; + const matrix_scalar_type zero (0.0); + const matrix_scalar_type one (1.0); + const vector_scalar_type two = one + one; + + // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations + // below may result in global range values. + global_int_t nx = A.geom->nx; + global_int_t ny = A.geom->ny; + global_int_t nz = A.geom->nz; + global_int_t gnx = A.geom->gnx; + global_int_t gny = A.geom->gny; + global_int_t gnz = A.geom->gnz; + global_int_t gix0 = A.geom->gix0; + global_int_t giy0 = A.geom->giy0; + global_int_t giz0 = A.geom->giz0; + + local_int_t localNumberOfRows = nx*ny*nz; // This is the size of our subblock + // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long + assert(localNumberOfRows>0); // Throw an exception of the number of rows is less than zero (can happen if int overflow) + local_int_t numberOfNonzerosPerRow = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil + + global_int_t totalNumberOfRows = gnx*gny*gnz; // Total number of grid points in mesh + // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long + assert(totalNumberOfRows>0); // Throw an exception of the number of rows is less than zero (can 
happen if int overflow) + + + // Allocate arrays that are of length localNumberOfRows + char * nonzerosInRow = new char[localNumberOfRows]; + global_int_t ** mtxIndG = new global_int_t*[localNumberOfRows]; + local_int_t ** mtxIndL = new local_int_t*[localNumberOfRows]; + matrix_scalar_type ** matrixValues = new matrix_scalar_type*[localNumberOfRows]; + matrix_scalar_type ** matrixDiagonal = new matrix_scalar_type*[localNumberOfRows]; + + vector_scalar_type * bv = 0; + vector_scalar_type * xv = 0; + vector_scalar_type * xexactv = 0; + if (init_vect) { + InitializeVector(*b, localNumberOfRows); + InitializeVector(*x, localNumberOfRows); + InitializeVector(*xexact, localNumberOfRows); + bv = b->values; // Only compute exact solution if requested + xv = x->values; // Only compute exact solution if requested + xexactv = xexact->values; // Only compute exact solution if requested + } + A.localToGlobalMap.resize(localNumberOfRows); + + // Use a parallel loop to do initial assignment: + // distributes the physical placement of arrays of pointers across the memory system +#ifndef HPCG_NO_OPENMP + #pragma omp parallel for +#endif + for (local_int_t i=0; i< localNumberOfRows; ++i) { + matrixValues[i] = 0; + matrixDiagonal[i] = 0; + mtxIndG[i] = 0; + mtxIndL[i] = 0; + } + +#ifndef HPCG_CONTIGUOUS_ARRAYS + // Now allocate the arrays pointed to + for (local_int_t i=0; i< localNumberOfRows; ++i) + mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow]; + for (local_int_t i=0; i< localNumberOfRows; ++i) + matrixValues[i] = new matrix_scalar_type[numberOfNonzerosPerRow]; + for (local_int_t i=0; i< localNumberOfRows; ++i) + mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow]; + +#else + // Now allocate the arrays pointed to + mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow]; + matrixValues[0] = new matrix_scalar_type[localNumberOfRows * numberOfNonzerosPerRow]; + mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow]; + + for (local_int_t 
i=1; i< localNumberOfRows; ++i) { + mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow; + matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow; + mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow; + } +#endif + + matrix_scalar_type beta (1.0); + matrix_scalar_type gamma (10.0); //one; + local_int_t localNumberOfNonzeros = 0; + // TODO: This triply nested loop could be flattened or use nested parallelism +#ifndef HPCG_NO_OPENMP + #pragma omp parallel for +#endif + for (local_int_t iz=0; iz-1 && giz+sz-1 && giy+sy-1 && gix+sxrank << " of " << A.geom->size <<" has " << localNumberOfRows << " rows." << endl + << "Process " << A.geom->rank << " of " << A.geom->size <<" has " << localNumberOfNonzeros<< " nonzeros." <0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow) + + A.title = 0; + A.totalNumberOfRows = totalNumberOfRows; + A.totalNumberOfNonzeros = totalNumberOfNonzeros; + A.localNumberOfRows = localNumberOfRows; + A.localNumberOfColumns = localNumberOfRows; + A.localNumberOfNonzeros = localNumberOfNonzeros; + A.nonzerosInRow = nonzerosInRow; + A.mtxIndG = mtxIndG; + A.mtxIndL = mtxIndL; + A.matrixValues = matrixValues; + A.matrixDiagonal = matrixDiagonal; + + return; +} + + +/* --------------- * + * specializations * + * --------------- */ + +// uniform +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + + +// mixed +template +void GenerateNonsymProblem_ref< SparseMatrix, Vector >(SparseMatrix&, Vector*, Vector*, Vector*, bool); + diff --git a/src/GenerateNonsymProblem_v1_ref.cpp b/src/GenerateNonsymProblem_v1_ref.cpp index 0e0a3723..447057ad 100644 --- a/src/GenerateNonsymProblem_v1_ref.cpp +++ b/src/GenerateNonsymProblem_v1_ref.cpp @@ -29,7 +29,7 @@ #if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG) #include using 
std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif #include diff --git a/src/hpgmp.hpp b/src/Hpgmp_Params.hpp similarity index 70% rename from src/hpgmp.hpp rename to src/Hpgmp_Params.hpp index c2a4b830..185ef78e 100644 --- a/src/hpgmp.hpp +++ b/src/Hpgmp_Params.hpp @@ -13,13 +13,13 @@ //@HEADER /*! - @file hpgmp.hpp + @file Hpgmp_Params.hpp HPCG data structures and functions */ -#ifndef HPCG_HPP -#define HPCG_HPP +#ifndef HPGMP_PARAMS_HPP +#define HPGMP_PARAMS_HPP #include #include "Geometry.hpp" @@ -30,13 +30,13 @@ struct HPCG_Params_STRUCT { int comm_size; //!< Number of MPI processes in MPI_COMM_WORLD int comm_rank; //!< This process' MPI rank in the range [0 to comm_size - 1] int numThreads; //!< This process' number of threads - local_int_t nx; //!< Number of processes in x-direction of 3D process grid - local_int_t ny; //!< Number of processes in y-direction of 3D process grid - local_int_t nz; //!< Number of processes in z-direction of 3D process grid + local_int_t nx; //!< Number of x-direction grid points for each local subdomain + local_int_t ny; //!< Number of y-direction grid points for each local subdomain + local_int_t nz; //!< Number of z-direction grid points for each local subdomain int runningTime; //!< Number of seconds to run the timed portion of the benchmark - int npx; //!< Number of x-direction grid points for each local subdomain - int npy; //!< Number of y-direction grid points for each local subdomain - int npz; //!< Number of z-direction grid points for each local subdomain + int npx; //!< Number of processes in z-direction of 3D process grid + int npy; //!< Number of processes in z-direction of 3D process grid + int npz; //!< Number of processes in z-direction of 3D process grid int pz; //!< Partition in the z processor dimension, default is npz local_int_t zl; //!< nz for processors in the z dimension with value less than pz local_int_t zu; //!< nz for processors in the z dimension with value greater than pz diff 
--git a/src/MGData.hpp b/src/MGData.hpp index cdcd6717..c6b6c98e 100644 --- a/src/MGData.hpp +++ b/src/MGData.hpp @@ -25,6 +25,11 @@ #include "SparseMatrix.hpp" #include "Vector.hpp" +#ifdef HPCG_WITH_CUDA +#include +#include +#endif + template class MGData { public: @@ -40,6 +45,13 @@ class MGData { used inside optimized ComputeSPMV(). */ void * optimizationData; + #ifdef HPCG_WITH_CUDA + // to store the restrictiion as CRS matrix on device + cusparseMatDescr_t descrA; + int *d_row_ptr; + int *d_col_idx; + SC *d_nzvals; //!< values of matrix entries + #endif }; /*! diff --git a/src/MultiVector.hpp b/src/MultiVector.hpp index b04beae2..3c8536ef 100644 --- a/src/MultiVector.hpp +++ b/src/MultiVector.hpp @@ -21,6 +21,13 @@ #ifndef MULTIVECTOR_HPP #define MULTIVECTOR_HPP +#if defined(HPCG_WITH_CUDA) + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif #include #include #include "Vector.hpp" @@ -32,6 +39,14 @@ class MultiVector { local_int_t n; //!< number of vectors local_int_t localLength; //!< length of local portion of the vector SC * values; //!< array of values + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + SC * d_values; //!< array of values + #if defined(HPCG_WITH_CUDA) + cublasHandle_t handle; + #elif defined(HPCG_WITH_HIP) + rocblas_handle handle; + #endif + #endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -54,6 +69,21 @@ inline void InitializeMultiVector(MultiVector_type & V, local_int_t localLength, V.localLength = localLength; V.n = n; V.values = new scalar_type[localLength * n]; + #if defined(HPCG_WITH_CUDA) + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&V.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&V.d_values, (localLength*n)*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (rocblas_status_success != rocblas_create_handle(&V.handle)) { + printf( " InitializeMultiVector :: Failed to create Handle\n" ); + } + if (hipSuccess != hipMalloc ((void**)&V.d_values, (localLength*n)*sizeof(scalar_type))) { + printf( " InitializeMultiVector :: Failed to allocate d_values\n" ); + } + #endif V.optimizationData = 0; return; } @@ -76,6 +106,21 @@ inline void ZeroMultiVector(MultiVector_type & V) { return; } +/*! + @param[inout] v - On entrance v is initialized, on exit all its values are zero. + */ +template +inline void GetMultiVector(MultiVector_type & V, local_int_t j1, local_int_t j2, MultiVector_type & Vj) { + Vj.n = j2-j1+1; + Vj.localLength = V.localLength; + Vj.values = &V.values[V.localLength*j1]; + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + Vj.d_values = &V.d_values[V.localLength*j1]; + Vj.handle = V.handle; + #endif + return; +} + /*! @param[inout] v - On entrance v is initialized, on exit all its values are zero. 
*/ @@ -83,6 +128,10 @@ template inline void GetVector(MultiVector_type & V, local_int_t j, Vector_type & vj) { vj.localLength = V.localLength; vj.values = &V.values[V.localLength*j]; + #if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + vj.d_values = &V.d_values[V.localLength*j]; + vj.handle = V.handle; + #endif return; } @@ -96,6 +145,13 @@ inline void DeleteMultiVector(MultiVector_type & V) { delete [] V.values; V.localLength = 0; + #if defined(HPCG_WITH_CUDA) + cudaFree (V.d_values); + cublasDestroy(V.handle); + #elif defined(HPCG_WITH_HIP) + hipFree(V.d_values); + rocblas_destroy_handle(V.handle); + #endif V.n = 0; return; } diff --git a/src/OptimizeProblem.cpp b/src/OptimizeProblem.cpp index 63d677da..594412d8 100644 --- a/src/OptimizeProblem.cpp +++ b/src/OptimizeProblem.cpp @@ -18,6 +18,14 @@ HPCG routine */ +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include "OptimizeProblem.hpp" /*! Optimizes the data structures used for CG iteration to increase the @@ -97,6 +105,357 @@ int OptimizeProblem(SparseMatrix_type & A, CGData_type & data, Vector_type & b, colors[i] = counters[colors[i]]++; #endif +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + { + typedef typename SparseMatrix_type::scalar_type SC; + + SparseMatrix_type * curLevelMatrix = &A; + do { + // ------------------------- + // form CSR on host + const local_int_t nrow = curLevelMatrix->localNumberOfRows; + const local_int_t ncol = curLevelMatrix->localNumberOfColumns; + global_int_t nnzL = 0; + global_int_t nnz = curLevelMatrix->localNumberOfNonzeros; + int *h_row_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_col_ind = (int*)malloc( nnz * sizeof(int)); + SC *h_nzvals = (SC *)malloc( nnz * sizeof(SC)); + + nnz = 0; + h_row_ptr[0] = 0; + for (local_int_t i=0; imatrixValues[i]; + const local_int_t * const cur_inds = curLevelMatrix->mtxIndL[i]; + + const int cur_nnz = curLevelMatrix->nonzerosInRow[i]; + for (int j=0; jd_row_ptr), 
(nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nrow=%d)\n",nrow ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_col_idx), nnz*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nnz=%d)\n",nnz ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_nzvals), nnz*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nnz=%d)\n",nnz ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_row_ptr, h_row_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_col_idx, h_col_ind, nnz*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_nzvals, h_nzvals, nnz*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_row_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nrow=%d)\n",nrow ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_col_idx), nnz*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nnz=%d)\n",nnz ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_nzvals), nnz*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nnz=%d)\n",nnz ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_row_ptr, h_row_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_col_idx, h_col_ind, nnz*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_nzvals, h_nzvals, nnz*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #endif + + // free matrix on host + free(h_row_ptr); + free(h_col_ind); + free(h_nzvals); + + // 
------------------------- + // Extract lower/upper-triangular matrix + global_int_t nnzU = nnz-nnzL; + int *h_Lrow_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_Lcol_ind = (int*)malloc( nnzL * sizeof(int)); + SC *h_Lnzvals = (SC *)malloc( nnzL * sizeof(SC)); + int *h_Urow_ptr = (int*)malloc((nrow+1)* sizeof(int)); + int *h_Ucol_ind = (int*)malloc( nnzU * sizeof(int)); + SC *h_Unzvals = (SC *)malloc( nnzU * sizeof(SC)); + nnzL = 0; + nnzU = 0; + h_Lrow_ptr[0] = 0; + h_Urow_ptr[0] = 0; + for (local_int_t i=0; imatrixValues[i]; + const local_int_t * const cur_inds = curLevelMatrix->mtxIndL[i]; + + const int cur_nnz = curLevelMatrix->nonzerosInRow[i]; + for (int j=0; jnnzL = nnzL; + curLevelMatrix->nnzU = nnzU; + + // copy CSR(L) to device + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lrow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lcol_idx), nnzL*sizeof(int))) { + printf( " Failed to allocate A.d_Lcol_idx\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Lnzvals), nnzL*sizeof(SC))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lrow_ptr, h_Lrow_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lcol_idx, h_Lcol_ind, nnzL*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lcol_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Lnzvals, h_Lnzvals, nnzL*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lrow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lcol_idx), 
nnzL*sizeof(int))) { + printf( " Failed to allocate A.d_Lcol_idx\n" ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Lnzvals), nnzL*sizeof(SC))) { + printf( " Failed to allocate A.d_Lrow_ptr\n" ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lrow_ptr, h_Lrow_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lcol_idx, h_Lcol_ind, nnzL*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lcol_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Lnzvals, h_Lnzvals, nnzL*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Lrow_ptr\n" ); + } + #endif + + // copy CSR(U) to device + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Urow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Urow_ptr(nrow=%d)\n",nrow ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Ucol_idx), nnzU*sizeof(int))) { + printf( " Failed to allocate A.d_Ucol_idx(nnzU=%d)\n",nnzU ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->d_Unzvals), nnzU*sizeof(SC))) { + printf( " Failed to allocate A.d_Urow_ptr(nnzU=%d)\n",nnzU ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Urow_ptr, h_Urow_ptr, (nrow+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Ucol_idx, h_Ucol_ind, nnzU*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Ucol_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->d_Unzvals, h_Unzvals, nnzU*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Urow_ptr), (nrow+1)*sizeof(int))) { + printf( " Failed to allocate A.d_Urow_ptr(nrow=%d)\n",nrow ); + } + if (hipSuccess != 
hipMalloc ((void**)&(curLevelMatrix->d_Ucol_idx), nnzU*sizeof(int))) { + printf( " Failed to allocate A.d_Ucol_idx(nnzU=%d)\n",nnzU ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->d_Unzvals), nnzU*sizeof(SC))) { + printf( " Failed to allocate A.d_Urow_ptr(nnzU=%d)\n",nnzU ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Urow_ptr, h_Urow_ptr, (nrow+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Ucol_idx, h_Ucol_ind, nnzU*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Ucol_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->d_Unzvals, h_Unzvals, nnzU*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_Urow_ptr\n" ); + #endif + + // free matrix on host + free(h_Lrow_ptr); + free(h_Lcol_ind); + free(h_Lnzvals); + free(h_Urow_ptr); + free(h_Ucol_ind); + free(h_Unzvals); + + #if defined(HPCG_WITH_CUDA) + // ------------------------- + // create Handle (for each matrix) + cusparseCreate(&(curLevelMatrix->cusparseHandle)); + + // ------------------------- + // descriptor for A + cusparseCreateMatDescr(&(curLevelMatrix->descrA)); + cusparseSetMatType(curLevelMatrix->descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->descrA, CUSPARSE_INDEX_BASE_ZERO); + + // ------------------------- + // run analysis for triangular solve + cusparseCreateMatDescr(&(curLevelMatrix->descrL)); + cusparseCreateSolveAnalysisInfo(&(curLevelMatrix->infoL)); + cusparseSetMatType(curLevelMatrix->descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR); + cusparseSetMatIndexBase(curLevelMatrix->descrL, CUSPARSE_INDEX_BASE_ZERO); + if (std::is_same::value) { + cusparseDcsrsv_analysis(curLevelMatrix->cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, nrow, nnzL, + curLevelMatrix->descrL, + (double *)curLevelMatrix->d_Lnzvals, curLevelMatrix->d_Lrow_ptr, curLevelMatrix->d_Lcol_idx, + curLevelMatrix->infoL); + } else if 
(std::is_same::value) { + cusparseScsrsv_analysis(curLevelMatrix->cusparseHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, nrow, nnzL, + curLevelMatrix->descrL, + (float *)curLevelMatrix->d_Lnzvals, curLevelMatrix->d_Lrow_ptr, curLevelMatrix->d_Lcol_idx, + curLevelMatrix->infoL); + } + + // ------------------------- + // descriptor for U + cusparseCreateMatDescr(&(curLevelMatrix->descrU)); + cusparseSetMatType(curLevelMatrix->descrU, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->descrU, CUSPARSE_INDEX_BASE_ZERO); + #elif defined(HPCG_WITH_HIP) + #endif + + if (curLevelMatrix->mgData!=0) { + // ------------------------- + // store restriction as CRS + local_int_t * f2c = curLevelMatrix->mgData->f2cOperator; + local_int_t nc = curLevelMatrix->mgData->rc->localLength; + h_row_ptr = (int*)malloc((nc+1)* sizeof(int)); + h_col_ind = (int*)malloc( nc * sizeof(int)); + h_nzvals = (SC *)malloc( nc * sizeof(SC)); + + h_row_ptr[0] = 0; + for (local_int_t i=0; imgData->d_row_ptr), (nc+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nc=%d)\n",nc ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->mgData->d_col_idx), nc*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nc=%d)\n",nc ); + } + if (cudaSuccess != cudaMalloc ((void**)&(curLevelMatrix->mgData->d_nzvals), nc*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nc=%d)\n",nc ); + } + + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_row_ptr, h_row_ptr, (nc+1)*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_col_idx, h_col_ind, nc*sizeof(int), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (cudaSuccess != cudaMemcpy(curLevelMatrix->mgData->d_nzvals, h_nzvals, nc*sizeof(SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMalloc 
((void**)&(curLevelMatrix->mgData->d_row_ptr), (nc+1)*sizeof(int))) { + printf( " Failed to allocate A.d_row_ptr(nc=%d)\n",nc ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->mgData->d_col_idx), nc*sizeof(int))) { + printf( " Failed to allocate A.d_col_idx(nc=%d)\n",nc ); + } + if (hipSuccess != hipMalloc ((void**)&(curLevelMatrix->mgData->d_nzvals), nc*sizeof(SC))) { + printf( " Failed to allocate A.d_nzvals(nc=%d)\n",nc ); + } + + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_row_ptr, h_row_ptr, (nc+1)*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_row_ptr\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_col_idx, h_col_ind, nc*sizeof(int), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_col_idx\n" ); + } + if (hipSuccess != hipMemcpy(curLevelMatrix->mgData->d_nzvals, h_nzvals, nc*sizeof(SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy A.d_nzvals\n" ); + } + #endif + + // ------------------------- + // descriptor for restrictor + #if defined(HPCG_WITH_CUDA) + cusparseCreateMatDescr(&(curLevelMatrix->mgData->descrA)); + cusparseSetMatType(curLevelMatrix->mgData->descrA, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(curLevelMatrix->mgData->descrA, CUSPARSE_INDEX_BASE_ZERO); + #elif defined(HPCG_WITH_HIP) + #endif + + // free matrix on host + free(h_row_ptr); + free(h_col_ind); + free(h_nzvals); + } //A.mgData!=0 + + // for debuging, TODO: remove these + InitializeVector(curLevelMatrix->x, nrow); + InitializeVector(curLevelMatrix->y, ncol); + + // next matrix + curLevelMatrix = curLevelMatrix->Ac; + } while (curLevelMatrix != 0); + } + { + typedef typename Vector_type::scalar_type vector_SC; + #if defined(HPCG_WITH_CUDA) + if (cudaSuccess != cudaMemcpy(b.d_values, b.values, (b.localLength)*sizeof(vector_SC), cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy b\n" ); + } + if (cudaSuccess != cudaMemcpy(x.d_values, x.values, (x.localLength)*sizeof(vector_SC), 
cudaMemcpyHostToDevice)) { + printf( " Failed to memcpy x\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (hipSuccess != hipMemcpy(b.d_values, b.values, (b.localLength)*sizeof(vector_SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy b\n" ); + } + if (hipSuccess != hipMemcpy(x.d_values, x.values, (x.localLength)*sizeof(vector_SC), hipMemcpyHostToDevice)) { + printf( " Failed to memcpy x\n" ); + } + #endif + } +#endif + return 0; } @@ -128,3 +487,9 @@ int OptimizeProblem< SparseMatrix, CGData, Vector > template double OptimizeProblemMemoryUse< SparseMatrix > (SparseMatrix const&); + +// mixed-precision +template +int OptimizeProblem< SparseMatrix, CGData, Vector > + (SparseMatrix&, CGData&, Vector&, Vector&, Vector&); + diff --git a/src/ReportResults.cpp b/src/ReportResults.cpp index 3eaed87b..99dcb4ee 100644 --- a/src/ReportResults.cpp +++ b/src/ReportResults.cpp @@ -31,7 +31,7 @@ #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #endif /*! diff --git a/src/SerialDenseMatrix.hpp b/src/SerialDenseMatrix.hpp index 02162fdc..2faa13a1 100644 --- a/src/SerialDenseMatrix.hpp +++ b/src/SerialDenseMatrix.hpp @@ -21,17 +21,25 @@ #ifndef SERIAL_DENSE_MATRIX_HPP #define SERIAL_DENSE_MATRIX_HPP +#include #include #include +#ifdef HPCG_WITH_CUDA + #include + #include +#endif template class SerialDenseMatrix { public: typedef SC scalar_type; - local_int_t m; //!< number of rows - local_int_t n; //!< number of columns + local_int_t m; //!< number of rows + local_int_t n; //!< number of columns SC * values; //!< array of values +#ifdef HPCG_WITH_CUDA + SC * d_values; //!< array of values +#endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -54,6 +62,11 @@ inline void InitializeMatrix(SerialDenseMatrix_type & A, local_int_t m, local_in A.m = m; A.n = n; A.values = new scalar_type[m*n]; +#ifdef HPCG_WITH_CUDA + if (cudaSuccess != cudaMalloc ((void**)&A.d_values, m*n*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } +#endif A.optimizationData = 0; return; } @@ -101,6 +114,18 @@ inline void CopyMatrix(const SerialDenseMatrix_type & A, SerialDenseMatrix_type return; } +template +inline void AddMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j, typename SerialDenseMatrix_type::scalar_type value) { + + typedef typename SerialDenseMatrix_type::scalar_type scalar_type; + + assert(i>=0 && i < A.m); + assert(j>=0 && j < A.n); + scalar_type * vv = A.values; + vv[i + j*A.m] += value; + return; +} + template inline void SetMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j, typename SerialDenseMatrix_type::scalar_type value) { @@ -131,7 +156,7 @@ GetMatrixValue(SerialDenseMatrix_type & A, local_int_t i, local_int_t j) { @param[in] A the known system matrix */ template -inline void DeleteSerialDenseMatrix(SerialDenseMatrix_type & A) { +inline void DeleteDenseMatrix(SerialDenseMatrix_type & A) { delete [] A.values; A.m = 0; diff --git a/src/SetupHalo_ref.cpp b/src/SetupHalo_ref.cpp index 647f5df5..06527e72 100644 --- a/src/SetupHalo_ref.cpp +++ b/src/SetupHalo_ref.cpp @@ -31,7 +31,7 @@ #ifdef HPCG_DETAILED_DEBUG #include using std::endl; -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include #endif diff --git a/src/SetupProblem.cpp b/src/SetupProblem.cpp index 500b32c5..0735c128 100644 --- a/src/SetupProblem.cpp +++ b/src/SetupProblem.cpp @@ -40,9 +40,14 @@ void SetupProblem(int numberOfMgLevels, SparseMatrix_type & A, Geometry * geom, GenerateNonsymProblem(A, b, x, xexact, init_vect); SetupHalo(A); //TODO: This is currently called in main... Should it really be called in both places? Which one? 
+ A.localNumberOfMGNonzeros = A.localNumberOfNonzeros; + A.totalNumberOfMGNonzeros = A.totalNumberOfNonzeros; + SparseMatrix_type * curLevelMatrix = &A; for (int level = 1; level< numberOfMgLevels; ++level) { GenerateNonsymCoarseProblem(*curLevelMatrix); + A.localNumberOfMGNonzeros += curLevelMatrix->Ac->localNumberOfNonzeros; + A.totalNumberOfMGNonzeros += curLevelMatrix->Ac->totalNumberOfNonzeros; curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level } diff --git a/src/SparseMatrix.hpp b/src/SparseMatrix.hpp index 639438c2..4dc8cf1a 100644 --- a/src/SparseMatrix.hpp +++ b/src/SparseMatrix.hpp @@ -36,6 +36,11 @@ typedef std::map< global_int_t, local_int_t > GlobalToLocalMap; using GlobalToLocalMap = std::unordered_map< global_int_t, local_int_t >; #endif +#ifdef HPCG_WITH_CUDA +#include +#include +#endif + template class SparseMatrix { public: @@ -44,9 +49,11 @@ class SparseMatrix { Geometry * geom; //!< geometry associated with this matrix global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes + global_int_t totalNumberOfMGNonzeros; //!< total number of matrix nonzeros across all processes, for MG local_int_t localNumberOfRows; //!< number of rows local to this process local_int_t localNumberOfColumns; //!< number of columns local to this process local_int_t localNumberOfNonzeros; //!< number of nonzeros local to this process + local_int_t localNumberOfMGNonzeros; //!< number of nonzeros local to this process, for MG char * nonzerosInRow; //!< The number of nonzeros in a row will always be 27 or fewer global_int_t ** mtxIndG; //!< matrix indices as global values local_int_t ** mtxIndL; //!< matrix indices as local values @@ -58,6 +65,7 @@ class SparseMatrix { mutable bool isSpmvOptimized; mutable bool isMgOptimized; mutable bool isWaxpbyOptimized; + mutable bool isGemvOptimized; /*! 
This is for storing optimized data structres created in OptimizeProblem and used inside optimized ComputeSPMV(). @@ -76,6 +84,39 @@ class SparseMatrix { local_int_t * sendLength; //!< lenghts of messages sent to neighboring processes SC * sendBuffer; //!< send buffer for non-blocking sends #endif +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + #if defined(HPCG_WITH_CUDA) + cusparseHandle_t cusparseHandle; + cusparseMatDescr_t descrA; + #endif + + // to store the local matrix on device + int *d_row_ptr; + int *d_col_idx; + SC *d_nzvals; //!< values of matrix entries + + // to store the lower-triangular matrix on device + local_int_t nnzL; + #if defined(HPCG_WITH_CUDA) + cusparseMatDescr_t descrL; + cusparseSolveAnalysisInfo_t infoL; + #endif + int *d_Lrow_ptr; + int *d_Lcol_idx; + SC *d_Lnzvals; //!< values of matrix entries + // to store the strictly upper-triangular matrix on device + local_int_t nnzU; + #if defined(HPCG_WITH_CUDA) + cusparseMatDescr_t descrU; + #endif + int *d_Urow_ptr; + int *d_Ucol_idx; + SC *d_Unzvals; //!< values of matrix entries + + // TODO: remove + Vector x; // nrow + Vector y; // ncol +#endif }; /*! @@ -89,9 +130,11 @@ inline void InitializeSparseMatrix(SparseMatrix_type & A, Geometry * geom) { A.geom = geom; A.totalNumberOfRows = 0; A.totalNumberOfNonzeros = 0; + A.totalNumberOfMGNonzeros = 0; A.localNumberOfRows = 0; A.localNumberOfColumns = 0; A.localNumberOfNonzeros = 0; + A.localNumberOfMGNonzeros = 0; A.nonzerosInRow = 0; A.mtxIndG = 0; A.mtxIndL = 0; @@ -102,8 +145,9 @@ inline void InitializeSparseMatrix(SparseMatrix_type & A, Geometry * geom) { // functions that are meant to be optimized. 
A.isDotProductOptimized = true; A.isSpmvOptimized = true; - A.isMgOptimized = true; + A.isMgOptimized = true; A.isWaxpbyOptimized = true; + A.isGemvOptimized = true; #ifndef HPCG_NO_MPI A.numberOfExternalValues = 0; @@ -169,24 +213,61 @@ inline void DeleteMatrix(SparseMatrix_type & A) { delete [] A.mtxIndG[0]; delete [] A.mtxIndL[0]; #endif - if (A.title) delete [] A.title; - if (A.nonzerosInRow) delete [] A.nonzerosInRow; - if (A.mtxIndG) delete [] A.mtxIndG; - if (A.mtxIndL) delete [] A.mtxIndL; - if (A.matrixValues) delete [] A.matrixValues; - if (A.matrixDiagonal) delete [] A.matrixDiagonal; + if (A.title) delete [] A.title; + if (A.nonzerosInRow) delete [] A.nonzerosInRow; + if (A.mtxIndG) delete [] A.mtxIndG; + if (A.mtxIndL) delete [] A.mtxIndL; + if (A.matrixValues) delete [] A.matrixValues; + if (A.matrixDiagonal) delete [] A.matrixDiagonal; #ifndef HPCG_NO_MPI - if (A.elementsToSend) delete [] A.elementsToSend; - if (A.neighbors) delete [] A.neighbors; - if (A.receiveLength) delete [] A.receiveLength; + if (A.elementsToSend) delete [] A.elementsToSend; + if (A.neighbors) delete [] A.neighbors; + if (A.receiveLength) delete [] A.receiveLength; if (A.sendLength) delete [] A.sendLength; if (A.sendBuffer) delete [] A.sendBuffer; #endif - if (A.geom!=0) { DeleteGeometry(*A.geom); delete A.geom; A.geom = 0;} - if (A.Ac!=0) { DeleteMatrix(*A.Ac); delete A.Ac; A.Ac = 0;} // Delete coarse matrix - if (A.mgData!=0) { DeleteMGData(*A.mgData); delete A.mgData; A.mgData = 0;} // Delete MG data + if (A.geom!=0) { + DeleteGeometry(*A.geom); + delete A.geom; + A.geom = 0; + } + if (A.Ac!=0) { + // Delete coarse matrix + DeleteMatrix(*A.Ac); + delete A.Ac; + A.Ac = 0; + } + if (A.mgData!=0) { + // Delete MG data + DeleteMGData(*A.mgData); + delete A.mgData; + A.mgData = 0; + } + +#ifdef HPCG_WITH_CUDA + cudaFree (A.d_row_ptr); + cudaFree (A.d_col_idx); + cudaFree (A.d_nzvals); + + cudaFree (A.d_Lrow_ptr); + cudaFree (A.d_Lcol_idx); + cudaFree (A.d_Lnzvals); + + cudaFree 
(A.d_Urow_ptr); + cudaFree (A.d_Ucol_idx); + cudaFree (A.d_Unzvals); + + DeleteVector (A.x); + DeleteVector (A.y); + + cusparseDestroy(A.cusparseHandle); + cusparseDestroyMatDescr(A.descrA); + cusparseDestroyMatDescr(A.descrL); + cusparseDestroyMatDescr(A.descrU); + cusparseDestroySolveAnalysisInfo(A.infoL); +#endif return; } diff --git a/src/TestGMRES.cpp b/src/TestGMRES.cpp index cee9ae70..4fb07f21 100644 --- a/src/TestGMRES.cpp +++ b/src/TestGMRES.cpp @@ -30,7 +30,7 @@ #include using std::endl; #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "TestGMRES.hpp" #include "GMRES.hpp" @@ -54,13 +54,15 @@ using std::endl; template -int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data) { +int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, + TestCGData_type & testcg_data, bool test_diagonal_exaggeration, bool test_noprecond) { typedef typename SparseMatrix_type::scalar_type scalar_type; typedef typename SparseMatrix_type2::scalar_type scalar_type2; typedef Vector Vector_type2; // Use this array for collecting timing information + double flops; std::vector< double > times(8,0.0); // Temporary storage for holding original diagonal and RHS Vector_type origDiagA, exaggeratedDiagA, origB; @@ -76,29 +78,30 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da CopyVector(origDiagA2, exagDiagA2); CopyVector(b, origB); -#if 0 - if (A.geom->rank==0) HPCG_fout << std::endl << " ** skippping diagonal exaggeration ** " << std::endl << std::endl; -#else - // Modify the matrix diagonal to greatly exaggerate diagonal values. 
- // CG should converge in about 10 iterations for this problem, regardless of problem size - if (A.geom->rank==0) HPCG_fout << std::endl << " ** applying diagonal exaggeration ** " << std::endl << std::endl; - for (local_int_t i=0; i< A.localNumberOfRows; ++i) { - global_int_t globalRowID = A.localToGlobalMap[i]; - if (globalRowID<9) { - scalar_type scale = (globalRowID+2)*1.0e6; - scalar_type2 scale2 = (globalRowID+2)*1.0e6; - ScaleVectorValue(exaggeratedDiagA, i, scale); - ScaleVectorValue(exagDiagA2, i, scale2); - ScaleVectorValue(b, i, scale); - } else { - ScaleVectorValue(exaggeratedDiagA, i, 1.0e6); - ScaleVectorValue(exagDiagA2, i, 1.0e6); - ScaleVectorValue(b, i, 1.0e6); + // TODO: This should be moved to somewhere-else, e.g., SetupProblem + if (test_diagonal_exaggeration) { + // Modify the matrix diagonal to greatly exaggerate diagonal values. + // CG should converge in about 10 iterations for this problem, regardless of problem size + if (A.geom->rank==0) HPCG_fout << std::endl << " ** applying diagonal exaggeration ** " << std::endl << std::endl; + for (local_int_t i=0; i< A.localNumberOfRows; ++i) { + global_int_t globalRowID = A.localToGlobalMap[i]; + if (globalRowID<9) { + scalar_type scale = (globalRowID+2)*1.0e6; + scalar_type2 scale2 = (globalRowID+2)*1.0e6; + ScaleVectorValue(exaggeratedDiagA, i, scale); + ScaleVectorValue(exagDiagA2, i, scale2); + ScaleVectorValue(b, i, scale); + } else { + ScaleVectorValue(exaggeratedDiagA, i, 1.0e6); + ScaleVectorValue(exagDiagA2, i, 1.0e6); + ScaleVectorValue(b, i, 1.0e6); + } } + ReplaceMatrixDiagonal(A, exaggeratedDiagA); + ReplaceMatrixDiagonal(A_lo, exagDiagA2);//TODO probably some funny casting here... need to do properly. + } else { + if (A.geom->rank==0) HPCG_fout << std::endl << " ** skippping diagonal exaggeration ** " << std::endl << std::endl; } - ReplaceMatrixDiagonal(A, exaggeratedDiagA); - ReplaceMatrixDiagonal(A_lo, exagDiagA2);//TODO probably some funny casting here... need to do properly. 
-#endif int niters = 0; scalar_type normr (0.0); @@ -111,7 +114,7 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2 testcg_data.niters_max_no_prec = 0; testcg_data.niters_max_prec = 0; - for (int k=0; k<2; ++k) + for (int k=(test_noprecond ? 0 : 1); k<2; ++k) { // This loop tests both unpreconditioned and preconditioned runs int expected_niters = testcg_data.expected_niters_no_prec; if (k==1) expected_niters = testcg_data.expected_niters_prec; @@ -119,7 +122,7 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da ZeroVector(x); // Zero out x double time_tic = mytimer(); - int ierr = GMRES(A, data, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], k); + int ierr = GMRES(A, data, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], &flops, k==1); double time_solve = mytimer() - time_tic; if (ierr) HPCG_fout << "Error in call to GMRES: " << ierr << ".\n" << endl; if (niters <= expected_niters) { @@ -134,20 +137,22 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da HPCG_fout << "Call [" << i << "] Number of GMRES Iterations [" << niters <<"] Scaled Residual [" << normr/normr0 << "]" << endl; HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "." << endl; HPCG_fout << " Time " << time_solve << " seconds." << endl; + HPCG_fout << " Gflop/s " << flops/1000000000.0 << "/" << time_solve << " = " << (flops/1000000000.0)/time_solve + << " (n = " << A.totalNumberOfRows << ")" << endl; + HPCG_fout << " Time/itr " << time_solve / niters << endl; } } } #if 1 - //for (int k=0; k<2; ++k) - for (int k=1; k<2; ++k) + for (int k=(test_noprecond ? 
0 : 1); k<2; ++k) { // This loop tests both unpreconditioned and preconditioned runs int expected_niters = testcg_data.expected_niters_no_prec; if (k==1) expected_niters = testcg_data.expected_niters_prec; for (int i=0; i< numberOfCgCalls; ++i) { ZeroVector(x); // Zero out x double time_tic = mytimer(); - int ierr = GMRES_IR(A, A_lo, data, data_lo, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], k); + int ierr = GMRES_IR(A, A_lo, data, data_lo, b, x, restart_length, maxIters, tolerance, niters, normr, normr0, ×[0], &flops, k); double time_solve = mytimer() - time_tic; if (ierr) HPCG_fout << "Error in call to GMRES-IR: " << ierr << ".\n" << endl; if (niters <= expected_niters) { @@ -161,6 +166,9 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da HPCG_fout << "Call [" << i << "] Number of GMRES-IR Iterations [" << niters <<"] Scaled Residual [" << normr/normr0 << "]" << endl; HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "." << endl; HPCG_fout << " Time " << time_solve << " seconds." 
<< endl; + HPCG_fout << " Gflop/s " << flops/1000000000.0 << "/" << time_solve << " = " << (flops/1000000000.0)/time_solve + << " (n = " << A.totalNumberOfRows << ")" << endl; + HPCG_fout << " Time/itr " << time_solve / niters << endl; } } } @@ -172,6 +180,8 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da // Delete vectors DeleteVector(origDiagA); DeleteVector(exaggeratedDiagA); + DeleteVector(origDiagA2); + DeleteVector(exagDiagA2); DeleteVector(origB); testcg_data.normr = normr; @@ -179,8 +189,9 @@ int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & da } template -int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data) { - TestGMRES(A, A, data, data, b, x, testcg_data); +int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond) { + return TestGMRES(A, A, data, data, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond); } @@ -191,25 +202,25 @@ int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector // uniform template int TestGMRES< SparseMatrix, CGData, Vector, TestCGData > - (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); template int TestGMRES< SparseMatrix, CGData, Vector, TestCGData > - (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); // uniform version template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, 
SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); // mixed version template int TestGMRES< SparseMatrix, SparseMatrix, CGData, CGData, Vector, TestCGData > - (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&); + (SparseMatrix&, SparseMatrix&, CGData&, CGData&, Vector&, Vector&, TestCGData&, bool, bool); diff --git a/src/TestGMRES.hpp b/src/TestGMRES.hpp index 1fa4a8cc..0f95a104 100644 --- a/src/TestGMRES.hpp +++ b/src/TestGMRES.hpp @@ -21,7 +21,7 @@ #ifndef TESTGMRES_HPP #define TESTGMRES_HPP -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SparseMatrix.hpp" #include "Vector.hpp" #include "CGData.hpp" @@ -39,10 +39,12 @@ class TestCGData { }; template -extern int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data); +extern int TestGMRES(SparseMatrix_type & A, SparseMatrix_type2 & A_lo, CGData_type & data, CGData_type2 & data_lo, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond); template -extern int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data); +extern int TestGMRES(SparseMatrix_type & A, CGData_type & data, Vector_type & b, Vector_type & x, TestCGData_type & testcg_data, + bool test_diagonal_exaggeration, bool test_noprecond); #endif // TESTGMRES_HPP diff --git a/src/TestSymmetry.cpp b/src/TestSymmetry.cpp index 6375eda0..30d7aafb 100644 --- a/src/TestSymmetry.cpp +++ b/src/TestSymmetry.cpp @@ -29,7 +29,7 @@ using std::endl; #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "ComputeSPMV.hpp" #include "ComputeMG.hpp" diff --git a/src/TestSymmetry.hpp b/src/TestSymmetry.hpp index 35b8ba50..d7dd2438 100644 --- a/src/TestSymmetry.hpp 
+++ b/src/TestSymmetry.hpp @@ -21,7 +21,7 @@ #ifndef TESTSYMMETRY_HPP #define TESTSYMMETRY_HPP -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SparseMatrix.hpp" #include "CGData.hpp" diff --git a/src/Utils_MPI.hpp b/src/Utils_MPI.hpp index 1715602e..9f85d624 100644 --- a/src/Utils_MPI.hpp +++ b/src/Utils_MPI.hpp @@ -4,6 +4,8 @@ #ifndef HPGMP_UTILS_MPI_HPP #define HPGMP_UTILS_MPI_HPP +#include + // MpiTypeTraits (from Teuchos) template class MpiTypeTraits { diff --git a/src/Vector.hpp b/src/Vector.hpp index 58fcbbda..d91e333f 100644 --- a/src/Vector.hpp +++ b/src/Vector.hpp @@ -24,6 +24,16 @@ #include #include #include +#include "Hpgmp_Params.hpp" + +#ifdef HPCG_WITH_CUDA + #include + #include +#elif defined(HPCG_WITH_HIP) + #include + #include +#endif + #include "Geometry.hpp" template @@ -32,6 +42,14 @@ class Vector { typedef SC scalar_type; local_int_t localLength; //!< length of local portion of the vector SC * values; //!< array of values +#if defined(HPCG_WITH_CUDA) | defined(HPCG_WITH_HIP) + SC * d_values; //!< array of values + #if defined(HPCG_WITH_CUDA) + cublasHandle_t handle; + #elif defined(HPCG_WITH_HIP) + rocblas_handle handle; + #endif +#endif /*! This is for storing optimized data structures created in OptimizeProblem and used inside optimized ComputeSPMV(). 
@@ -50,6 +68,21 @@ inline void InitializeVector(Vector_type & v, local_int_t localLength) { typedef typename Vector_type::scalar_type scalar_type; v.localLength = localLength; v.values = new scalar_type[localLength]; + #if defined(HPCG_WITH_CUDA) + if (CUBLAS_STATUS_SUCCESS != cublasCreate(&v.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (cudaSuccess != cudaMalloc ((void**)&v.d_values, localLength*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #elif defined(HPCG_WITH_HIP) + if (rocblas_status_success != rocblas_create_handle(&v.handle)) { + printf( " InitializeVector :: Failed to create Handle\n" ); + } + if (hipSuccess != hipMalloc ((void**)&v.d_values, localLength*sizeof(scalar_type))) { + printf( " InitializeVector :: Failed to allocate d_values\n" ); + } + #endif v.optimizationData = 0; return; } @@ -62,9 +95,17 @@ inline void InitializeVector(Vector_type & v, local_int_t localLength) { template inline void ZeroVector(Vector_type & v) { typedef typename Vector_type::scalar_type scalar_type; + const scalar_type zero (0.0); + local_int_t localLength = v.localLength; scalar_type * vv = v.values; - for (int i=0; i +inline void ScaleVectorValue(Vector_type & v, typename Vector_type::scalar_type value) { + typedef typename Vector_type::scalar_type scalar_type; + const scalar_type zero (0.0); + + local_int_t localLength = v.localLength; + scalar_type * vv = v.values; + if (value == zero) { + for (int i=0; i::value) { + if (CUBLAS_STATUS_SUCCESS != cublasDscal (v.handle, localLength, (const double*)&value, (double*)d_vv, 1)) { + printf( " Failed cublasDscal\n" ); + } + } else if (std::is_same::value) { + if (CUBLAS_STATUS_SUCCESS != cublasSscal (v.handle, localLength, (const float*)&value, (float*)d_vv, 1)) { + printf( " Failed cublasSscal\n" ); + } + } +#endif + return; +} /*! Fill the input vector with pseudo-random values. 
@@ -109,7 +182,35 @@ inline void CopyVector(const Vector_src & v, Vector_dst & w) { assert(w.localLength >= localLength); scalar_src * vv = v.values; scalar_dst * wv = w.values; +#if !defined(HPCG_WITH_CUDA) | defined(HPCG_DEBUG) for (int i=0; i::value) { + #ifdef HPCG_DEBUG + HPCG_fout << " CopyVector ( Unit-precision )" << std::endl; + #endif + if (cudaSuccess != cudaMemcpy(w.d_values, v.d_values, localLength*sizeof(scalar_src), cudaMemcpyDeviceToDevice)) { + printf( " CopyVector :: Failed to memcpy d_x\n" ); + } + } else { + HPCG_fout << " CopyVector :: Mixed-precision not supported" << std::endl; + + // Copy input vector to Host + if (cudaSuccess != cudaMemcpy(vv, v.d_values, localLength*sizeof(scalar_src), cudaMemcpyDeviceToHost)) { + printf( " CopyVector :: Failed to memcpy d_v\n" ); + } + + // Copy on Host + for (int i=0; i inline void DeleteVector(Vector_type & v) { delete [] v.values; + #if defined(HPCG_WITH_CUDA) + cudaFree(v.d_values); + cublasDestroy(v.handle); + #elif defined(HPCG_WITH_HIP) + hipFree(v.d_values); + rocblas_destroy_handle(v.handle); + #endif v.localLength = 0; return; } diff --git a/src/finalize.cpp b/src/finalize.cpp index f4b46e46..344f6fcb 100644 --- a/src/finalize.cpp +++ b/src/finalize.cpp @@ -14,7 +14,7 @@ #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" /*! Closes the I/O stream used for logging information throughout the HPCG run. 
diff --git a/src/init.cpp b/src/init.cpp index 1e23854b..df7cff3f 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -34,7 +34,7 @@ const char* NULLDEVICE="/dev/null"; #include #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "ReadHpcgDat.hpp" @@ -79,11 +79,13 @@ HPCG_Init(int * argc_p, char ** *argv_p, HPCG_Params & params) { iparams = (int *)malloc(sizeof(int) * nparams); // Initialize iparams - for (i = 0; i < nparams; ++i) iparams[i] = 0; + for (i = 0; i < nparams; ++i) + iparams[i] = 0; /* for sequential and some MPI implementations it's OK to read first three args */ for (i = 0; i < nparams; ++i) - if (argc <= i+1 || sscanf(argv[i+1], "%d", iparams+i) != 1 || iparams[i] < 10) iparams[i] = 0; + if (argc <= i+1 || sscanf(argv[i+1], "%d", iparams+i) != 1 || iparams[i] < 10) + iparams[i] = 0; /* for some MPI environments, command line arguments may get complicated so we need a prefix */ for (i = 1; i <= argc && argv[i]; ++i) @@ -94,8 +96,10 @@ HPCG_Init(int * argc_p, char ** *argv_p, HPCG_Params & params) { // Check if --rt was specified on the command line int * rt = iparams+3; // Assume runtime was not specified and will be read from the hpcg.dat file - if (iparams[3]) rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file + if (iparams[3]) + rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file if (! iparams[0] && ! iparams[1] && ! iparams[2]) { /* no geometry arguments on the command line */ + //TODO: If I am rank 0, then read the file. 
ReadHpcgDat(iparams, rt, iparams+7); broadcastParams = true; } diff --git a/src/main_hpgmp.cpp b/src/main_hpgmp.cpp index 13cbfa51..30b0f0a0 100644 --- a/src/main_hpgmp.cpp +++ b/src/main_hpgmp.cpp @@ -35,7 +35,7 @@ using std::endl; #include -#include "hpgmp.hpp" +#include "Hpgmp_Params.hpp" #include "SetupProblem.hpp" #include "CheckAspectRatio.hpp" @@ -58,8 +58,6 @@ using std::endl; #include "GMRES.hpp" #include "TestGMRES.hpp" -#include "GenerateNonsymProblem.hpp" -#include "GenerateNonsymCoarseProblem.hpp" typedef double scalar_type; typedef Vector Vector_type; @@ -91,16 +89,83 @@ int main(int argc, char * argv[]) { MPI_Init(&argc, &argv); #endif + //Initialize params for full-scale benchmark run: HPCG_Params params; - HPCG_Init(&argc, &argv, params); + int size = params.comm_size; // Num MPI processes + int rank = params.comm_rank; // My process ID + +// **************************************************************************8 +// PHASE I: VERIFY CONVERGENCE WITH SMALL PROBLEM +// **************************************************************************8 + + MPI_Comm SM_COMM; + MPI_Group group_world; + MPI_Group sm_group; + if( size <= 64 ) //Use all MPI ranks + SM_COMM = MPI_COMM_WORLD; + else{ + // Get MPI Sub-communicator: + int num_ranks = 64; + int *process_ranks; + // make a list of processes in the new communicator + process_ranks = (int*) malloc(num_ranks*sizeof(int)); + for(int i = 0; i < num_ranks; i++) + process_ranks[i] = i; + //get the group under MPI_COMM_WORLD + MPI_Comm_group(MPI_COMM_WORLD, &group_world); + // create the new group + MPI_Group_incl(group_world, num_ranks, process_ranks, &sm_group); + // create the new communicator + MPI_Comm_create(MPI_COMM_WORLD, sm_group, &SM_COMM); + } + + HPCG_Params sm_params; + sm_params.nx = 32; + sm_params.ny = 32; + sm_params.nz = 32; + + sm_params.runningTime = 0.0005; //Something really small since we just want one run?? 
+ sm_params.pz = 0; + sm_params.zl = 0; + sm_params.zu = 0; + + sm_params.npx = 0; + sm_params.npy = 0; + sm_params.npz = 0; + +#ifndef HPCG_NO_MPI + MPI_Comm_rank( SM_COMM, &sm_params.comm_rank ); + MPI_Comm_size( SM_COMM, &sm_params.comm_size ); +#else + sm_params.comm_rank = 0; + sm_params.comm_size = 1; +#endif + +#ifdef HPCG_NO_OPENMP + sm_params.numThreads = 1; +#else + #pragma omp parallel + sm_params.numThreads = omp_get_num_threads(); +#endif + + // Construct the geometry and linear system + local_int_t nx,ny,nz; + sm_nx = (local_int_t)sm_params.nx; + sm_ny = (local_int_t)sm_params.ny; + sm_nz = (local_int_t)sm_params.nz; + Geometry * sm_geom = new Geometry; + GenerateGeometry(sm_params.comm_size, sm_params.comm_rank, sm_params.numThreads, sm_params.pz, + sm_params.zl, sm_params.zu, sm_nx, sm_ny, sm_nz, sm_params.npx, sm_params.npy, sm_params.npz, sm_geom); + +// **************************************************************************8 +// PHASE II: BENCHMARKING PHASE +// **************************************************************************8 // Check if QuickPath option is enabled. // If the running time is set to zero, we minimize all paths through the program bool quickPath = 1; //TODO: Change back to the following after=(params.runningTime==0); - int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID - #ifdef HPCG_DETAILED_DEBUG if (size < 100 && rank==0) HPCG_fout << "Process "<rank==0) { + HPCG_fout << " Setup Time " << setup_time << " seconds." << endl; + HPCG_fout << " Optimize Time " << t7 << " seconds." << endl; } - times[8] = (mytimer() - t_begin)/((double) numberOfCalls); // Total time divided by number of calls. 
-#ifdef HPCG_DEBUG - if (rank==0) HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl; -#endif - /////////////////////////////// // Reference GMRES Timing Phase // @@ -210,23 +252,21 @@ int main(int argc, char * argv[]) { scalar_type normr0 = 0.0; int restart_length = 50; int refMaxIters = 50; - numberOfCalls = 1; // Only need to run the residual reduction analysis once // Compute the residual reduction for the natural ordering and reference kernels + double flops = 0.0; std::vector< double > ref_times(9,0.0); scalar_type tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations int err_count = 0; - for (int i=0; i< numberOfCalls; ++i) { - ZeroVector(x); - ierr = GMRES(A, data, b, x, restart_length, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true); - if (ierr) ++err_count; // count the number of errors in GMRES. - totalNiters_ref += niters; - } + ZeroVector(x); + ierr = GMRES(A, data, b, x, restart_length, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], &flops, true); + if (ierr) ++err_count; // count the number of errors in GMRES. + totalNiters_ref += niters; if (rank == 0 && err_count) HPCG_fout << err_count << " error(s) in call(s) to reference GMRES." << endl; scalar_type refTolerance = normr / normr0; // Call user-tunable set up function. 
- double t7 = mytimer(); + t7 = mytimer(); OptimizeProblem(A, data, b, x, xexact); t7 = mytimer() - t7; times[7] = t7; @@ -248,6 +288,7 @@ int main(int argc, char * argv[]) { /* #ifdef HPCG_DEBUG t1 = mytimer(); + if (rank==0) HPCG_fout << endl << "Running Uniform-precision Test" << endl; #endif testcg_data.count_pass = testcg_data.count_fail = 0; TestGMRES(A, data, b, x, testcg_data); @@ -272,12 +313,24 @@ int main(int argc, char * argv[]) { SparseMatrix_type2 A2; CGData_type2 data2; SetupProblem(numberOfMgLevels, A2, geom, data2, &b, &x, &xexact, init_vect); + setup_time = mytimer() - setup_time; // Capture total time of setup + + t7 = mytimer(); + OptimizeProblem(A2, data, b, x, xexact); + t7 = mytimer() - t7; + testcg_data.count_pass = testcg_data.count_fail = 0; + if (A.geom->rank==0) { + HPCG_fout << " Setup Time " << setup_time << " seconds." << endl; + HPCG_fout << " Optimize Time " << t7 << " seconds." << endl; + } + bool test_diagonal_exaggeration = true; + bool test_noprecond = true; #ifdef HPCG_DEBUG t1 = mytimer(); #endif - TestGMRES(A, A2, data, data2, b, x, testcg_data); + TestGMRES(A, A2, data, data2, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond); #ifdef HPCG_DEBUG if (rank==0) HPCG_fout << "Total validation (mixed-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl; #endif @@ -309,8 +362,6 @@ int main(int argc, char * argv[]) { DeleteVector(x); DeleteVector(b); DeleteVector(xexact); - DeleteVector(x_overlap); - DeleteVector(b_computed); //delete [] testnorms_data.values; // Finish up diff --git a/src/main_time.cpp b/src/main_time.cpp new file mode 100644 index 00000000..c61aa201 --- /dev/null +++ b/src/main_time.cpp @@ -0,0 +1,265 @@ + +//@HEADER +// *************************************************** +// +// HPCG: High Performance Conjugate Gradient Benchmark +// +// Contact: +// Michael A. 
Heroux ( maherou@sandia.gov) +// Jack Dongarra (dongarra@eecs.utk.edu) +// Piotr Luszczek (luszczek@eecs.utk.edu) +// +// *************************************************** +//@HEADER + +/*! + @file main.cpp + + HPCG routine + */ + +// Main routine of a program that calls the HPCG conjugate gradient +// solver to solve the problem, and then prints results. + +#ifndef HPCG_NO_MPI +#include +#endif + +#include +#include +#include +#ifdef HPCG_DETAILED_DEBUG +using std::cin; +#endif +using std::endl; + +#include + +#include "Hpgmp_Params.hpp" + +#include "SetupProblem.hpp" +#include "CheckAspectRatio.hpp" +#include "GenerateGeometry.hpp" +#include "CheckProblem.hpp" +#include "OptimizeProblem.hpp" +#include "WriteProblem.hpp" +#include "ReportResults.hpp" +#include "mytimer.hpp" +#include "ComputeSPMV_ref.hpp" +#include "ComputeMG_ref.hpp" +#include "ComputeResidual.hpp" +#include "Geometry.hpp" +#include "SparseMatrix.hpp" +#include "Vector.hpp" +#include "CGData.hpp" +#include "TestNorms.hpp" + +#include "TestGMRES.hpp" + +typedef double scalar_type; +//typedef float scalar_type; +typedef Vector Vector_type; +typedef SparseMatrix SparseMatrix_type; +typedef CGData CGData_type; +typedef TestCGData TestCGData_type; +typedef TestNormsData TestNormsData_type; + +typedef float scalar_type2; +typedef Vector Vector_type2; +typedef SparseMatrix SparseMatrix_type2; +typedef CGData CGData_type2; +typedef TestCGData TestCGData_type2; +typedef TestNormsData TestNormsData_type2; + + +/*! + Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report results. + + @param[in] argc Standard argument count. Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in) + @param[in] argv Standard argument array. If argc==1, argv is unused. If argc==4, argv[1], argv[2], argv[3] will be interpreted as nx, ny, nz, resp. + + @return Returns zero on success and a non-zero value otherwise. 
+
+*/
+int main(int argc, char * argv[]) {
+
+#ifndef HPCG_NO_MPI
+  MPI_Init(&argc, &argv);
+#endif
+
+  HPCG_Params params;
+
+  HPCG_Init(&argc, &argv, params);
+
+  // Check if QuickPath option is enabled.
+  // If the running time is set to zero, we minimize all paths through the program
+  bool quickPath = (params.runningTime==0);
+
+  int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
+
+#ifdef HPCG_DETAILED_DEBUG
+  if (size < 100 && rank==0) HPCG_fout << "Process "<<rank<<" of "<<size<<" is alive with " << params.numThreads << " threads." <<endl;
+#endif
+
+  local_int_t nx,ny,nz;
+  nx = (local_int_t)params.nx;
+  ny = (local_int_t)params.ny;
+  nz = (local_int_t)params.nz;
+  int ierr = 0;  // Used to check return codes on function calls
+
+  ierr = CheckAspectRatio(0.125, nx, ny, nz, "local problem", rank==0);
+  if (ierr)
+    return ierr;
+
+  /////////////////////////
+  // Problem setup Phase //
+  /////////////////////////
+
+#ifdef HPCG_DEBUG
+  double t1 = mytimer();
+#endif
+
+  // Construct the geometry and linear system
+  Geometry * geom = new Geometry;
+  GenerateGeometry(size, rank, params.numThreads, params.pz, params.zl, params.zu, nx, ny, nz, params.npx, params.npy, params.npz, geom);
+
+  ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank==0);
+  if (ierr)
+    return ierr;
+
+  // Use this array for collecting timing information
+  std::vector< double > times(10,0.0);
+
+  double setup_time = mytimer();
+
+  // Setup the problem
+  SparseMatrix_type A;
+  CGData_type data;
+
+  bool init_vect = true;
+  Vector_type b, x, xexact;
+
+  int numberOfMgLevels = 4; // Number of levels including first
+  SetupProblem(numberOfMgLevels, A, geom, data, &b, &x, &xexact, init_vect);
+
+  setup_time = mytimer() - setup_time; // Capture total time of setup
+  times[9] = setup_time; // Save it for reporting
+
+  // Call user-tunable set up function.
+  double t7 = mytimer();
+  OptimizeProblem(A, data, b, x, xexact);
+  t7 = mytimer() - t7;
+  times[7] = t7;
+
+  if (A.geom->rank==0) {
+    HPCG_fout << " Setup Time " << setup_time << " seconds." << endl;
+    HPCG_fout << " Optimize Time " << t7 << " seconds." << endl;
+  }
+
+  ////////////////////////////////////
+  // Reference SpMV+MG Timing Phase //
+  ////////////////////////////////////
+
+  // Call Reference SpMV and MG.
Compute Optimization time as ratio of times in these routines + + local_int_t nrow = A.localNumberOfRows; + local_int_t ncol = A.localNumberOfColumns; + + Vector_type x_overlap, b_computed; + InitializeVector(x_overlap, ncol); // Overlapped copy of x vector + InitializeVector(b_computed, nrow); // Computed RHS vector + + + // Record execution time of reference SpMV and MG kernels for reporting times + // First load vector with random values + FillRandomVector(x_overlap); + +#if 0 + int numberOfCalls = 10; + if (quickPath) numberOfCalls = 1; //QuickPath means we do on one call of each block of repetitive code + double t_begin = mytimer(); + for (int i=0; i< numberOfCalls; ++i) { + ierr = ComputeSPMV_ref(A, x_overlap, b_computed); // b_computed = A*x_overlap + if (ierr) HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl; + ierr = ComputeMG_ref(A, b_computed, x_overlap); // b_computed = Minv*y_overlap + if (ierr) HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl; + } + times[8] = (mytimer() - t_begin)/((double) numberOfCalls); // Total time divided by number of calls. 
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+#endif
+
+
+  //////////////////////////////
+  // Validation Testing Phase //
+  //////////////////////////////
+
+  TestCGData_type testcg_data;
+  testcg_data.count_pass = testcg_data.count_fail = 0;
+
+  bool test_diagonal_exaggeration = false;
+  bool test_noprecond = false;
+
+#ifdef HPCG_DEBUG
+  t1 = mytimer();
+  if (rank==0) HPCG_fout << endl << "Running Uniform-precision Test" << endl;
+#endif
+  TestGMRES(A, data, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond);
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total validation (uniform-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+
+  setup_time = mytimer();
+  init_vect = false;
+  SparseMatrix_type2 A2;
+  CGData_type2 data2;
+  SetupProblem(numberOfMgLevels, A2, geom, data2, &b, &x, &xexact, init_vect);
+  setup_time = mytimer() - setup_time; // Capture total time of setup
+
+  t7 = mytimer();
+  OptimizeProblem(A2, data2, b, x, xexact);
+  t7 = mytimer() - t7;
+
+  testcg_data.count_pass = testcg_data.count_fail = 0;
+  if (A.geom->rank==0) {
+    HPCG_fout << " Setup Time " << setup_time << " seconds." << endl;
+    HPCG_fout << " Optimize Time " << t7 << " seconds." << endl;
+  }
+
+
+#ifdef HPCG_DEBUG
+  t1 = mytimer();
+#endif
+  TestGMRES(A, A2, data, data2, b, x, testcg_data, test_diagonal_exaggeration, test_noprecond);
+#ifdef HPCG_DEBUG
+  if (rank==0) HPCG_fout << "Total validation (mixed-precision TestGMRES) execution time in main (sec) = " << mytimer() - t1 << endl;
+#endif
+
+  // free
+  //DeleteMatrix(A2);
+  //DeleteMatrix(A);
+  DeleteVector(x);
+  DeleteVector(b);
+  DeleteVector(xexact);
+  DeleteVector(x_overlap);
+  DeleteVector(b_computed);
+  DeleteCGData(data);
+  DeleteCGData(data2);
+
+  // Finish up
+  HPCG_Finalize();
+#ifndef HPCG_NO_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+}