Merge pull request #109 from DrTimothyAldenDavis/master
DrTimothyAldenDavis authored Mar 1, 2022
2 parents f17d16c + f9363f7 commit fd1e1b4
Showing 18 changed files with 349 additions and 523 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "Feb 16, 2022" )
set ( GraphBLAS_DATE "Feb 28, 2022" )
set ( GraphBLAS_VERSION_MAJOR 6 )
set ( GraphBLAS_VERSION_MINOR 2 )
set ( GraphBLAS_VERSION_SUB 1 )
set ( GraphBLAS_VERSION_SUB 2 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

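For context only (not part of this diff): a program can report which SuiteSparse:GraphBLAS release it was built against, assuming the GxB_IMPLEMENTATION_* macros defined in GraphBLAS.h. A minimal sketch, not taken from this commit:

    #include <cstdio>
    #include "GraphBLAS.h"

    int main (void)
    {
        // compile-time version of the library this program was built against
        printf ("SuiteSparse:GraphBLAS v%d.%d.%d (%s)\n",
            GxB_IMPLEMENTATION_MAJOR, GxB_IMPLEMENTATION_MINOR,
            GxB_IMPLEMENTATION_SUB, GxB_IMPLEMENTATION_DATE) ;
        return (0) ;
    }

Built against this release, the sketch would report v6.2.2 with the "Feb 28, 2022" date set above.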
10 changes: 7 additions & 3 deletions CUDA/GB_AxB_dot3_cuda.cu
@@ -324,8 +324,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
// phase1: assign each C(i,j) to a bucket, and count them
//----------------------------------------------------------------------
dim3 grid( ntasks) ;
dim3 p2grid( (ntasks + SYMBOLIC_PHASE_NTHREADS -1)
/ (SYMBOLIC_PHASE_NTHREADS) ) ;

dim3 block( SYMBOLIC_PHASE_NTHREADS ) ;

std::string base_name = "GB_jit_AxB_dot3_";
@@ -368,6 +367,11 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------

// p2grid is for phase2, which uses # of tasks (aka thread blocks)
// equal to ceil (ntasks / SYMBOLIC_PHASE_NTHREADS).
int p2ntasks = ( (ntasks + SYMBOLIC_PHASE_NTHREADS -1) / (SYMBOLIC_PHASE_NTHREADS) ) ;
dim3 p2grid( p2ntasks ) ;

Opname = "phase2";

std::stringstream phase2_program ;
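The p2grid sizing above is the standard ceiling-division launch pattern: enough thread blocks so that every one of the ntasks items is covered. A minimal CUDA sketch of that pattern, with an assumed placeholder kernel and an assumed block size of 32 standing in for SYMBOLIC_PHASE_NTHREADS:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void touch_each_task (int ntasks)
    {
        int t = blockIdx.x * blockDim.x + threadIdx.x ;
        if (t < ntasks) { /* one thread handles task t */ }
    }

    int main (void)
    {
        int ntasks = 1000 ;                                  // e.g. # of phase1 tasks
        int nthreads = 32 ;                                  // stand-in for SYMBOLIC_PHASE_NTHREADS
        int p2ntasks = (ntasks + nthreads - 1) / nthreads ;  // ceil (ntasks / nthreads)
        dim3 p2grid (p2ntasks) ;
        dim3 block (nthreads) ;
        touch_each_task <<<p2grid, block>>> (ntasks) ;
        cudaDeviceSynchronize ( ) ;
        printf ("%d blocks of %d threads cover %d tasks\n", p2ntasks, nthreads, ntasks) ;
        return (0) ;
    }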
@@ -428,7 +432,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
//----------------------------------------------------------------------
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------
int nblock = ntasks;
int nblock = ntasks; // # of tasks from phase1

phase2Kernel.launch( // input
Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x
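To see what the phase2 "cumsum across the blockbuckets" produces, here is a conceptual CPU reference, not the CUDA kernel; the bucket-major layout of blockbucket is an assumption made only for illustration. Phase1 leaves a count per (bucket, task) pair, and phase2 turns the per-bucket totals into global starting offsets bucketp:

    #include <cstdint>
    #include <vector>

    // returns bucketp of length nbuckets+1, with bucketp[b] = start of bucket b in C
    std::vector<int64_t> cumsum_blockbuckets (const std::vector<int64_t> &blockbucket,
                                              int nbuckets, int ntasks)
    {
        std::vector<int64_t> bucketp (nbuckets + 1, 0) ;
        int64_t s = 0 ;
        for (int b = 0 ; b < nbuckets ; b++)
        {
            bucketp [b] = s ;
            for (int t = 0 ; t < ntasks ; t++)
            {
                s += blockbucket [(size_t) b * ntasks + t] ;   // count from task t, bucket b
            }
        }
        bucketp [nbuckets] = s ;    // total # of entries across all buckets
        return bucketp ;
    }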
49 changes: 35 additions & 14 deletions CUDA/test/jitTestFactory.hpp
@@ -83,10 +83,15 @@ void print_array(void *arr, I size, const char *name) {
std::cout << std::endl << "Done." << std::endl;
}

//------------------------------------------------------------------------------
// test_AxB_phase1_factory: test phase1
//------------------------------------------------------------------------------

// Test generator code, to allow parameterized tests
// Uses jitFactory, dataFactory and GB_jit
template <typename T_C, typename T_M, typename T_A,typename T_B>
bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_Monoid monoid, GrB_BinaryOp binop) {
bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_Monoid monoid, GrB_BinaryOp binop)
{

int gpuID;
cudaGetDevice( &gpuID);
@@ -110,7 +115,6 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
matrix<T_A>* A = G.getAptr();
matrix<T_B>* B = G.getBptr();

int nblck = N;
int nthrd = 32;
int sz = 4;
//int m = 256/sz;
@@ -127,7 +131,11 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
int ntasks = ( mnz + chunksize - 1)/chunksize;

// Idea is to have each task work on a contiguous block of columns of C
//ntasks =; //GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x
// Note: for small tests, mnz is small so ntasks is governed by
// chunksize, not 128*number_of_sms. For large problems in production,
// chunksize is less important since ntasks will likely be bounded by
// 128*number_of_sms (say 128*80 = 10,240 on a V100).
ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x

// TODO: Verify that RMM is checking and throwing exceptions
int nanobuckets_size = NBUCKETS * nthrd * ntasks;
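A small self-contained sketch of the task-count rule described in that note; the values of mnz, chunksize, and number_of_sms below are assumptions chosen only for illustration:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main (void)
    {
        int64_t mnz = 5000 ;            // # of entries in the mask (a small test)
        int64_t chunksize = 128 * 64 ;  // assumed amount of work per task
        int number_of_sms = 80 ;        // e.g. a V100
        int ntasks = (int) ((mnz + chunksize - 1) / chunksize) ;  // ceil (mnz / chunksize)
        ntasks = std::min (ntasks, 128 * number_of_sms) ;         // cap at 128 blocks per SM
        printf ("ntasks = %d (cap = %d)\n", ntasks, 128 * number_of_sms) ;
        return (0) ;
    }

With these numbers the chunksize bound wins (ntasks = 1), the small-test case the note describes; a large mnz would instead hit the 128*number_of_sms = 10,240 cap.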
@@ -152,9 +160,13 @@ bool test_AxB_phase1_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz, GrB_M
return true;
}

//------------------------------------------------------------------------------
// test_AxB_phase2_factory: test phase2 and phase2end
//------------------------------------------------------------------------------

template <typename T_C>
bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz)
{

int gpuID;
cudaGetDevice( &gpuID);
@@ -164,7 +176,6 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
phase2launchFactory<T_C> p2lF;
phase2endlaunchFactory<T_C> p2elF;


SpGEMM_problem_generator<T_C, T_C, T_C, T_C> G(N, N);
int64_t Annz = N*N;
int64_t Bnnz = N*N;
@@ -176,14 +187,14 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

matrix<T_C>* C = G.getCptr();
matrix<T_C>* M = G.getMptr(); // note: values are not accessed

// matrix<T_C>* A = G.getAptr();
// matrix<T_C>* B = G.getBptr();
//
// T_C *Cx = C->mat->x;
// T_C *Ax = A->mat->x;
// T_C *Bx = B->mat->x;

int nblck = N;
int nthrd = 32;
// int sz = 4;
//int m = 256/sz;
@@ -197,21 +208,26 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {

std::cout << "mnz: " << mnz << std::endl;

// int number_of_sms = GB_Global_gpu_sm_get (0) ;
int number_of_sms = 1;
int number_of_sms = GB_Global_gpu_sm_get (0) ;

printf("number of sms: %d\n", number_of_sms);

int ntasks = ( mnz +chunksize -1)/chunksize;

int ntasks = ( mnz +chunksize -1)/chunksize;

printf("ntasks before: %d\n", ntasks);

// Idea is to have each task work on a contiguous block of columns of C
ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x

printf("ntasks after: %d\n", ntasks);
printf("ntasks after: %d (done in phase1)\n", ntasks);

// ntasks is the # of tasks that were created by phase1. phase2 uses a
// much smaller grid (fewer thread blocks) than phase1 or phase2end,
// with p2ntasks.
int p2ntasks = (ntasks + nthrd - 1) / nthrd ;
printf("p2ntasks %d\n", p2ntasks);

// fabricate data as if it came from phase1:
// TODO: Verify that RMM is checking and throwing exceptions
int64_t *nanobuckets = (int64_t*)rmm_wrap_malloc(NBUCKETS * nthrd * ntasks * sizeof (int64_t));
int64_t *blockbucket = (int64_t*)rmm_wrap_malloc(NBUCKETS * ntasks * sizeof (int64_t));
@@ -227,8 +243,11 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
print_array<int64_t>(nanobuckets, NBUCKETS*nthrd*ntasks, "nanobuckets");
print_array<int64_t>(blockbucket, NBUCKETS*ntasks, "blockbucket");

p2lF.jitGridBlockLaunch( nblck, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, nblck);
// launch phase2 (just with p2ntasks as the # of tasks)
p2lF.jitGridBlockLaunch( p2ntasks, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, /* phase1 size was: */ ntasks);

// do the reduction between phase2 and phase2end
int64_t s= 0;
for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
{
@@ -237,8 +256,10 @@ bool test_AxB_phase2_factory( int TB, int64_t N, int64_t Anz, int64_t Bnz) {
//printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]);
}

// launch phase2end: note same # of tasks as phase1
const int64_t cnz = mnz ; // the size of M and C (including the zombie count in C)
p2elF.jitGridBlockLaunch( ntasks, nthrd, nanobuckets, blockbucket,
bucketp, bucket, offset, C->get_grb_matrix(), Cnz);
bucketp, bucket, offset, C->get_grb_matrix(), cnz);
kernTimer.Stop();
std::cout<<"returned from phase2 kernel "<<kernTimer.Elapsed()<<"ms"<<std::endl;

6 changes: 6 additions & 0 deletions Doc/ChangeLog
@@ -1,3 +1,9 @@
Version 6.2.2 Feb 28, 2022

* revised output of GxB_*_sort: to return newly created matrices
C and P as full or bitmap matrices, as appropriate, instead of
sparse/hypersparse, following their sparsity control settings

Version 6.2.1 Feb 16, 2022

* (41) bug fix: GxB_Iterator_get used (void *) + siz arithmetic
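The GxB_*_sort entry above changes only the sparsity format of the outputs; usage is unchanged. A minimal sketch, assuming the GxB_Matrix_sort signature documented in the SuiteSparse:GraphBLAS user guide (C receives the sorted values, P the permutation, and the comparator is an ordinary binary op); the matrix contents are made up for illustration:

    #include "GraphBLAS.h"

    int main (void)
    {
        GrB_init (GrB_NONBLOCKING) ;
        GrB_Matrix A, C, P ;
        GrB_Matrix_new (&A, GrB_FP64, 4, 4) ;
        GrB_Matrix_setElement_FP64 (A, 3.0, 0, 1) ;    // a few arbitrary entries
        GrB_Matrix_setElement_FP64 (A, 1.0, 0, 3) ;
        GrB_Matrix_setElement_FP64 (A, 2.0, 2, 0) ;
        GrB_Matrix_new (&C, GrB_FP64, 4, 4) ;
        GrB_Matrix_new (&P, GrB_INT64, 4, 4) ;
        // sort each row of A in ascending order
        GxB_Matrix_sort (C, P, GrB_LT_FP64, A, NULL) ;
        // per the 6.2.2 ChangeLog entry above, C and P are now returned as full
        // or bitmap matrices where appropriate, following their sparsity control
        GrB_Matrix_free (&A) ;
        GrB_Matrix_free (&C) ;
        GrB_Matrix_free (&P) ;
        GrB_finalize ( ) ;
        return (0) ;
    }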
Binary file modified Doc/GraphBLAS_UserGuide.pdf
