Merge pull request #109 from DrTimothyAldenDavis/master
DrTimothyAldenDavis authored Mar 1, 2022
2 parents f17d16c + f9363f7 commit fd1e1b4
Showing 18 changed files with 349 additions and 523 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "Feb 16, 2022" )
set ( GraphBLAS_DATE "Feb 28, 2022" )
set ( GraphBLAS_VERSION_MAJOR 6 )
set ( GraphBLAS_VERSION_MINOR 2 )
set ( GraphBLAS_VERSION_SUB 1 )
set ( GraphBLAS_VERSION_SUB 2 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

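For context only (not part of this diff): a program can report which SuiteSparse:GraphBLAS release it was built against, assuming the GxB_IMPLEMENTATION_* macros defined in GraphBLAS.h. A minimal sketch, not taken from this commit:

    #include <cstdio>
    #include "GraphBLAS.h"

    int main (void)
    {
        // compile-time version of the library this program was built against
        printf ("SuiteSparse:GraphBLAS v%d.%d.%d (%s)\n",
            GxB_IMPLEMENTATION_MAJOR, GxB_IMPLEMENTATION_MINOR,
            GxB_IMPLEMENTATION_SUB, GxB_IMPLEMENTATION_DATE) ;
        return (0) ;
    }

Built against this release, the sketch would report v6.2.2 with the "Feb 28, 2022" date set above.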
10 changes: 7 additions & 3 deletions CUDA/GB_AxB_dot3_cuda.cu
@@ -324,8 +324,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
// phase1: assign each C(i,j) to a bucket, and count them
//----------------------------------------------------------------------
dim3 grid( ntasks) ;
dim3 p2grid( (ntasks + SYMBOLIC_PHASE_NTHREADS -1)
/ (SYMBOLIC_PHASE_NTHREADS) ) ;

dim3 block( SYMBOLIC_PHASE_NTHREADS ) ;

std::string base_name = "GB_jit_AxB_dot3_";
@@ -368,6 +367,11 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------

// p2grid is for phase2, which uses # of tasks (aka thread blocks)
// equal to ceil (ntasks / SYMBOLIC_PHASE_NTHREADS).
int p2ntasks = ( (ntasks + SYMBOLIC_PHASE_NTHREADS -1) / (SYMBOLIC_PHASE_NTHREADS) ) ;
dim3 p2grid( p2ntasks ) ;

Opname = "phase2";

std::stringstream phase2_program ;
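The p2grid sizing above is the standard ceiling-division launch pattern: enough thread blocks so that every one of the ntasks items is covered. A minimal CUDA sketch of that pattern, with an assumed placeholder kernel and an assumed block size of 32 standing in for SYMBOLIC_PHASE_NTHREADS:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void touch_each_task (int ntasks)
    {
        int t = blockIdx.x * blockDim.x + threadIdx.x ;
        if (t < ntasks) { /* one thread handles task t */ }
    }

    int main (void)
    {
        int ntasks = 1000 ;                                  // e.g. # of phase1 tasks
        int nthreads = 32 ;                                  // stand-in for SYMBOLIC_PHASE_NTHREADS
        int p2ntasks = (ntasks + nthreads - 1) / nthreads ;  // ceil (ntasks / nthreads)
        dim3 p2grid (p2ntasks) ;
        dim3 block (nthreads) ;
        touch_each_task <<<p2grid, block>>> (ntasks) ;
        cudaDeviceSynchronize ( ) ;
        printf ("%d blocks of %d threads cover %d tasks\n", p2ntasks, nthreads, ntasks) ;
        return (0) ;
    }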
@@ -428,7 +432,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
//----------------------------------------------------------------------
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------
int nblock = ntasks;
int nblock = ntasks; // # of tasks from phase1

phase2Kernel.launch( // input
Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x
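To see what the phase2 "cumsum across the blockbuckets" produces, here is a conceptual CPU reference, not the CUDA kernel; the bucket-major layout of blockbucket is an assumption made only for illustration. Phase1 leaves a count per (bucket, task) pair, and phase2 turns the per-bucket totals into global starting offsets bucketp:

    #include <cstdint>
    #include <vector>

    // returns bucketp of length nbuckets+1, with bucketp[b] = start of bucket b in C
    std::vector<int64_t> cumsum_blockbuckets (const std::vector<int64_t> &blockbucket,
                                              int nbuckets, int ntasks)
    {
        std::vector<int64_t> bucketp (nbuckets + 1, 0) ;
        int64_t s = 0 ;
        for (int b = 0 ; b < nbuckets ; b++)
        {
            bucketp [b] = s ;
            for (int t = 0 ; t < ntasks ; t++)
            {
                s += blockbucket [(size_t) b * ntasks + t] ;   // count from task t, bucket b
            }
        }
        bucketp [nbuckets] = s ;    // total # of entries across all buckets
        return bucketp ;
    }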
49 changes: 35 additions & 14 deletions CUDA/test/jitTestFactory.hpp
@@ -83,10 +83,15 @@ void print_array(void *arr, I size, const char *name) {
std::cout << std::endl << "Done." << std::endl;
}

//------------------------------------------------------------------------------
// test_AxB_phase1_factory: test phase1
//------------------------------------------------------------------------------

// Test generator code, to allow parameterized tests
// Uses jitFactory, dataFactory and GB_jit
template <typename T_C, typename T_M, typename T_A,typename T_B>
bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_Monoid monoid, GrB_BinaryOp binop) {
bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_Monoid monoid, GrB_BinaryOp binop)
{

int gpuID;
cudaGetDevice( &gpuID);
@@ -110,7 +115,6 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

int nblck = N;
int nthrd = 32;
int sz = 4;
//int m = 256/sz;
@@ -127,7 +131,11 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
int ntasks = ( mnz + chunksize - 1)/chunksize;

// Idea is to have each task work on a contiguous block of columns of C
//ntasks =; //GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x
// Note: for small tests, mnz is small so ntasks is governed by
// chunksize, not 128*number_of_sms. For large problems in production,
// chunksize is less important since ntasks will likely be bounded by
// 128*number_of_sms (say 128*80 = 10,240 on a V100).
ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x

// TODO: Verify that RMM is checking and throwing exceptions
int nanobuckets_size = NBUCKETS * nthrd * ntasks;
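A small self-contained sketch of the task-count rule described in that note; the values of mnz, chunksize, and number_of_sms below are assumptions chosen only for illustration:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main (void)
    {
        int64_t mnz = 5000 ;            // # of entries in the mask (a small test)
        int64_t chunksize = 128 * 64 ;  // assumed amount of work per task
        int number_of_sms = 80 ;        // e.g. a V100
        int ntasks = (int) ((mnz + chunksize - 1) / chunksize) ;  // ceil (mnz / chunksize)
        ntasks = std::min (ntasks, 128 * number_of_sms) ;         // cap at 128 blocks per SM
        printf ("ntasks = %d (cap = %d)\n", ntasks, 128 * number_of_sms) ;
        return (0) ;
    }

With these numbers the chunksize bound wins (ntasks = 1), the small-test case the note describes; a large mnz would instead hit the 128*number_of_sms = 10,240 cap.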
@@ -152,9 +160,13 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
return true;
}

//------------------------------------------------------------------------------
// test_AxB_phase2_factory: test phase2 and phase2end
//------------------------------------------------------------------------------

template <typename T_C>
bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz)
{

int gpuID;
cudaGetDevice( &gpuID);
@@ -164,7 +176,6 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
phase2launchFactory<T_C> p2lF;
phase2endlaunchFactory<T_C> p2elF;


SpGEMM_problem_generator<T_C, T_C, T_C, T_C> G(N, N);
int64_t Annz = N*N;
int64_t Bnnz = N*N;
@@ -176,14 +187,14 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

matrix<T_C>* C = G.getCptr();
matrix<T_C>* M = G.getMptr(); // note: values are not accessed

// matrix<T_C>* A = G.getAptr();
// matrix<T_C>* B = G.getBptr();
//
// T_C *Cx = C->mat->x;
// T_C *Ax = A->mat->x;
// T_C *Bx = B->mat->x;

int nblck = N;
int nthrd = 32;
// int sz = 4;
//int m = 256/sz;
@@ -197,21 +208,26 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

std::cout << "mnz: " << mnz << std::endl;

// int number_of_sms = GB_Global_gpu_sm_get (0) ;
int number_of_sms = 1;
int number_of_sms = GB_Global_gpu_sm_get (0) ;

printf("number of sms: %d\n", number_of_sms);

int ntasks = ( mnz +chunksize -1)/chunksize;

int ntasks = ( mnz +chunksize -1)/chunksize;

printf("ntasks before: %d\n", ntasks);

// Idea is to have each task work on a contiguous block of columns of C
ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x

printf("ntasks after: %d\n", ntasks);
printf("ntasks after: %d (done in phase1)\n", ntasks);

// ntasks is the # of tasks that were created by phase1. phase2 uses a
// much smaller grid (fewer thread blocks) than phase1 or phase2end,
// with p2ntasks.
int p2ntasks = (ntasks + nthrd - 1) / nthrd ;
printf("p2ntasks %d\n", p2ntasks);

// fabricate data as if it came from phase1:
// TODO: Verify that RMM is checking and throwing exceptions
int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t));
int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t));
@@ -227,8 +243,11 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
print_array<int64_t>(nanobuckets, NBUCKETS*nthrd*ntasks, "nanobuckets");
print_array<int64_t>(blockbucket, NBUCKETS*ntasks, "blockbucket");

p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, nblck);
// launch phase2 (just with p2ntasks as the # of tasks)
p2lF.jitGridBlockLaunch( p2ntasks, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, /* phase1 size was: */ ntasks);

// do the reduction between phase2 and phase2end
int64_t s= 0;
for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
{
@@ -237,8 +256,10 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
//printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]);
}

// launch phase2end: note same # of tasks as phase1
const int64_t cnz = mnz ; // the size of M and C (including the zombie count in C)
p2elF.jitGridBlockLaunch( ntasks, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, C->get_grb_matrix(), Cnz);
bucketp, bucket, offset, C->get_grb_matrix(), cnz);
kernTimer.Stop();
std::cout<<"returned from phase2 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

6 changes: 6 additions & 0 deletions Doc/ChangeLog
@@ -1,3 +1,9 @@
Version 6.2.2 Feb 28, 2022

* revised output of GxB_*_sort: to return newly created matrices
C and P as full or bitmap matrices, as appropriate, instead of
sparse/hypersparse, following their sparsity control settings

Version 6.2.1 Feb 16, 2022

* (41) bug fix: GxB_Iterator_get used (void *) + siz arithmetic
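The GxB_*_sort entry above changes only the sparsity format of the outputs; usage is unchanged. A minimal sketch, assuming the GxB_Matrix_sort signature documented in the SuiteSparse:GraphBLAS user guide (C receives the sorted values, P the permutation, and the comparator is an ordinary binary op); the matrix contents are made up for illustration:

    #include "GraphBLAS.h"

    int main (void)
    {
        GrB_init (GrB_NONBLOCKING) ;
        GrB_Matrix A, C, P ;
        GrB_Matrix_new (&A, GrB_FP64, 4, 4) ;
        GrB_Matrix_setElement_FP64 (A, 3.0, 0, 1) ;    // a few arbitrary entries
        GrB_Matrix_setElement_FP64 (A, 1.0, 0, 3) ;
        GrB_Matrix_setElement_FP64 (A, 2.0, 2, 0) ;
        GrB_Matrix_new (&C, GrB_FP64, 4, 4) ;
        GrB_Matrix_new (&P, GrB_INT64, 4, 4) ;
        // sort each row of A in ascending order
        GxB_Matrix_sort (C, P, GrB_LT_FP64, A, NULL) ;
        // per the 6.2.2 ChangeLog entry above, C and P are now returned as full
        // or bitmap matrices where appropriate, following their sparsity control
        GrB_Matrix_free (&A) ;
        GrB_Matrix_free (&C) ;
        GrB_Matrix_free (&P) ;
        GrB_finalize ( ) ;
        return (0) ;
    }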
Binary file modified Doc/GraphBLAS_UserGuide.pdf
