Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate Modified AC-SpGEMM / GALATIC #26

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
6 changes: 3 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[submodule "ext/moderngpu"]
path = ext/moderngpu
url = https://[email protected]/ctcyang/moderngpu.git
Copy link
Collaborator

@ctcyang ctcyang Jun 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: Do you mind changing both of these submodule URLs to a consistent format such as https://github.com/ctcyang/moderngpu.git and https://github.com/richardlett/GALATIC.git?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed them both to SSH, as that will be more universal until the repo is public (SSH automatically uses your SSH key).

[submodule "ext/cub"]
path = ext/cub
url = https://ctcyang@github.com/NVlabs/cub.git
[submodule "ext/GALATIC"]
path = ext/GALATIC
url = git@github.com:richardlett/GALATIC.git
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ set( PROJ_PATH ${CMAKE_SOURCE_DIR})
set( PROJ_OUT_PATH ${CMAKE_BINARY_DIR})
set( PROJ_HEADERS "" )
set( PROJ_LIBRARIES "" )
set( PROJ_INCLUDES "./" "ext/moderngpu/include" "ext/cub/cub")
set( PROJ_INCLUDES "./" "ext/moderngpu/include")
set( mgpu_SRC_FILES "ext/moderngpu/src/mgpucontext.cu" "ext/moderngpu/src/mgpuutil.cpp")
set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin )
#set( CUDA_CURAND_LIBRARY "$ENV{CUDA_HOME}/lib64/libcurand.so" )
Expand All @@ -23,12 +23,12 @@ set( CUDA_CUSPARSE_LIBRARY "$ENV{CUDA_HOME}/lib64/libcusparse.so" )
#FILE( GLOB_RECURSE PROJ_LIBRARIES ext/cublas1.1/*.cu )
FILE( GLOB_RECURSE PROJ_HEADERS graphblas/*.hpp)
# nvcc flags
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_35 -lineinfo -O3 -use_fast_math -Xptxas=-v --expt-relaxed-constexpr ")
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-fpermissive;-arch=sm_35;-lineinfo;-Xptxas=-v;-dlcm=ca;-maxrregcount=64)
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_20,code=sm_21)
# needed for cudamalloc
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11" )
set(CMAKE_CXX_FLAGS "-fpermissive -g -std=c++14" )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: one space instead of two between -g and -std=c++14

#set(CMAKE_CXX_FLAGS "-fpermissive -pg -m64 -std=c++11" )
#set(CMAKE_CXX_FLAGS "-fpermissive -g -m64 -std=c++11 -H" )
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include common.mk
#-------------------------------------------------------------------------------

# Includes
INC += -I$(MGPU_DIR) -I$(CUB_DIR) -I$(BOOST_DIR) -I$(GRB_DIR)
INC += -I$(MGPU_DIR) -I$(BOOST_DIR) -I$(GRB_DIR)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: one space instead of two between MGPU_DIR and BOOST_DIR


#-------------------------------------------------------------------------------
# Dependency Lists
Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ GraphBLAST is a GPU implementation of [GraphBLAS](http://www.graphblas.org), an

## Prerequisites

This software has been tested on the following dependencies:
This software has been tested to build with the following dependencies:

* CUDA 9.1, 9.2
* Boost 1.58
* g++ 4.9.3, 5.4.0
* CUDA 11.3
* (Change: CUDA 11 or newer is now required)
* Boost 1.74
* g++ 8.3.0
* (Change: C++14 is required)

Optional:

Expand Down
1 change: 1 addition & 0 deletions ext/GALATIC
Submodule GALATIC added at fca085
1 change: 0 additions & 1 deletion ext/cub
Submodule cub deleted from d62284
8 changes: 8 additions & 0 deletions graphblas/backend/cuda/descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,14 @@ Info Descriptor::loadArgs(const po::variables_map& vm) {
std::cout << "Error: incorrect nthread selection!\n";
}

if(mode_ == "galatic") {
CHECK(set(GrB_MODE, GrB_GALATIC));
} else if (mode_ == "cusparse2") {
CHECK(set(GrB_MODE, GrB_CUSPARSE2));
} else {
std::cout << R"(Invalid mode: Options are "galatic" and "cusparse2")" << std::endl;
}

// TODO(@ctcyang): Enable device selection using ndevice_
// if( ndevice_!=0 )

Expand Down
27 changes: 23 additions & 4 deletions graphblas/backend/cuda/operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,30 @@ Info mxm(Matrix<c>* C,
desc));
} else if (typeid(c) == typeid(float) && typeid(a) == typeid(float) &&
typeid(b) == typeid(float)) {
CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_,
&B->sparse_, desc));

Desc_value s_mode;
CHECK(desc->get(GrB_MODE, &s_mode));

if (s_mode == GrB_CUSPARSE2)
CHECK(cusparse_spgemm2(&C->sparse_, mask, accum, op, &A->sparse_,
&B->sparse_, desc));
else {
if (s_mode != GrB_GALATIC) {
std::cout << R"(Unknown mode (Options are: "cusspare2" and "galatic"; defaulting to galatic)" << std::endl;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: fix typo cusspare2 -> cusparse2

}
CHECK(GALATIC_spgemm(&C->sparse_,
op,
&A->sparse_,
&B->sparse_,
desc));

}
} else {
std::cout << "Error: Unmasked SpGEMM not implemented yet!\n";
return GrB_NOT_IMPLEMENTED;
CHECK(GALATIC_spgemm(&C->sparse_,
op,
&A->sparse_,
&B->sparse_,
desc));
}
} else {
std::cout << "Error: SpMM and GEMM not implemented yet!\n";
Expand Down
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/reduce.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_
#define GRAPHBLAS_BACKEND_CUDA_REDUCE_HPP_

#include <cub.cuh>
#include <cub/cub.cuh>

#include <iostream>

Expand Down
195 changes: 195 additions & 0 deletions graphblas/backend/cuda/spgemm.hpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
#ifndef GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_
#define GRAPHBLAS_BACKEND_CUDA_SPGEMM_HPP_


Copy link
Collaborator

@ctcyang ctcyang Jun 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: stylistic thing - I like having only single-spaced lines between lines with code, so could you get rid of all the newline spacing? 1-line gaps are fine, so don't worry about those.

#include "../../../ext/GALATIC/include/dCSR.cuh"
Copy link
Collaborator

@ctcyang ctcyang Jun 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a big fan of relative path includes that are dependent on a file being in the right folder. Could you add the galatic directory "GALATIC/include" to the CMakeLists.txt includes?

Something else that would be nice to have is a single file helper file in GALATIC that includes all helper files part of your public interface. Then we can just have a one-liner #include "galatic.cuh". This will let you iterate on GALATIC really quickly, for example add new include + source files or change naming of files in your repo, without breaking GALATIC's dependencies (e.g. graphblast).

Some examples for how to do that are "cub.cuh", ModernGPU's "moderngpu.cuh" and "graphblast.hpp".

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out! You can tell I don't have the most experience with C++ :p

Copy link
Collaborator

@ctcyang ctcyang Jun 25, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel that way about myself too, but most important thing is to code lots and learn something new every day =P

#include "../../../ext/GALATIC/include/SemiRingInterface.h"
#include "../../../ext/GALATIC/source/device/Multiply.cuh"

#include "graphblas/backend/cuda/sparse_matrix.hpp"


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: get rid of empty line

#include <cuda.h>
#include <cusparse.h>

#include <iostream>
#include <vector>


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: get rid of empty lines




namespace graphblas {
namespace backend {
template <typename T>
Expand Down Expand Up @@ -108,6 +118,191 @@ Info spgemmMasked(SparseMatrix<c>* C,
C->csc_initialized_ = false;
return GrB_SUCCESS;
}
// Shallow copy graphblast sparsematrix -> Galatic dCSR format
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: please add a newline between following template function and above function.

template<typename T>
static void matrixToGalatic(const SparseMatrix<T> *input , dCSR<T>& output) {
output.col_ids = reinterpret_cast<unsigned int*>(input->d_csrColInd_);
output.data = input->d_csrVal_;
output.row_offsets = reinterpret_cast<unsigned int*>(input->d_csrRowPtr_);
output.rows = input->nrows_;
output.cols = input->ncols_;
output.nnz = input->nvals_;
}

// Shallow copy Galatic dCSR format -> graphblast sparsematrix
template<typename T>
static void galaticToSparse(SparseMatrix<T> *output , const dCSR<T>& input) {
output->d_csrColInd_ = reinterpret_cast<Index*>(input.col_ids);
output->d_csrVal_ = input.data;
output->d_csrRowPtr_ = reinterpret_cast<Index*>(input.row_offsets);
output->nvals_ = input.nnz;
output->ncapacity_ = input.nnz;
}

// Nullize pointers in Galatic's sparse matrices;
// Galatic's destructors check for null. Doing this will prevent double
// freeing when shallowcopying with matrixToGalatic & galaticToSparse
template<typename T>
static void nullizeGalaticMatrix(dCSR<T>& m) {
m.data = nullptr;
m.col_ids = nullptr;
m.row_offsets = nullptr;
}

// A generic shim between graphblast's and GALATIC's semiring interfaces.
// Wraps a graphblast-style semiring (mul_op / add_op / identity) so it can be
// passed where GALATIC expects a SemiRing<a, b, c>.
template <typename NativeSR, typename a, typename b, typename c>
struct GalaticSemiring : SemiRing<a, b, c> {
  NativeSR nativeSemiring;  // the wrapped graphblast semiring

  __device__ c multiply(const a& left, const b& right) const {
    return nativeSemiring.mul_op(left, right);
  }

  __device__ c add(const c& left, const c& right) const {
    return nativeSemiring.add_op(left, right);
  }

  __device__ static c AdditiveIdentity() {
    return NativeSR::identity();
  }
};

// Sparse-times-sparse matrix multiply C = A * B under the semiring `op`,
// using GALATIC's (modified AC-SpGEMM) GPU implementation.
//
// The graphblast inputs are shallow-copied into GALATIC's dCSR format, the
// multiply runs on the device, and the GALATIC output buffers are adopted by
// C (again a shallow copy) — no matrix data is duplicated.
//
// Returns GrB_DIMENSION_MISMATCH on incompatible dimensions,
// GrB_OUT_OF_MEMORY if GALATIC throws, GrB_SUCCESS otherwise.
template <typename c, typename a, typename b, typename SemiringT>
Info GALATIC_spgemm(SparseMatrix<c>* C,
                    SemiringT op,
                    const SparseMatrix<a>* A,
                    const SparseMatrix<b>* B,
                    Descriptor* desc) {
  Index A_nrows = A->nrows_;
  Index A_ncols = A->ncols_;
  Index B_nrows = B->nrows_;
  Index B_ncols = B->ncols_;
  Index C_nrows = C->nrows_;
  Index C_ncols = C->ncols_;

  // Dimension compatibility check: (m x k) * (k x n) -> (m x n).
  if ((A_ncols != B_nrows) || (C_ncols != B_ncols) || (C_nrows != A_nrows)) {
    std::cout << "Dim mismatch mxm" << std::endl;
    std::cout << A_ncols << " " << B_nrows << std::endl;
    std::cout << C_ncols << " " << B_ncols << std::endl;
    std::cout << C_nrows << " " << A_nrows << std::endl;
    return GrB_DIMENSION_MISMATCH;
  }

  // Release any storage C already owns; C adopts GALATIC's freshly
  // allocated output buffers below.
  if (C->d_csrColInd_ != NULL) {
    CUDA_CALL(cudaFree(C->d_csrColInd_));
    CUDA_CALL(cudaFree(C->d_csrVal_));
    C->d_csrColInd_ = NULL;
    C->d_csrVal_ = NULL;
  }

  if (C->d_csrRowPtr_ != NULL) {
    CUDA_CALL(cudaFree(C->d_csrRowPtr_));
    C->d_csrRowPtr_ = NULL;
  }

  if (C->h_csrColInd_ != NULL) {
    free(C->h_csrColInd_);
    free(C->h_csrVal_);
    C->h_csrColInd_ = NULL;
    C->h_csrVal_ = NULL;
  }

  dCSR<c> outMatrixGPU;
  dCSR<a> leftInputMatrixGPU;
  dCSR<b> rightInputMatrixGPU;

  // Shallow copy input matrices to galatic format.
  matrixToGalatic(A, leftInputMatrixGPU);
  matrixToGalatic(B, rightInputMatrixGPU);

  GPUMatrixMatrixMultiplyTraits DefaultTraits;

  // GALATIC has its own semiring interface; GalaticSemiring is a shim here
  // for conversion of graphblast-style SemiringT type. GalaticSemiring
  // definition is above this function.
  GalaticSemiring<SemiringT, a, b, c> semiring_shim;
  semiring_shim.nativeSemiring = op;

  ExecutionStats stats;
  try {
    // The GrB_NT descriptor value selects the kernel launch configuration;
    // the template arguments must be compile-time constants, hence the
    // switch over the supported thread counts.
    Desc_value nt_mode;
    CHECK(desc->get(GrB_NT, &nt_mode));
    const int num_threads = static_cast<int>(nt_mode);

    switch (num_threads) {
      case 64:
        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
            64, 4, 2, 8, 4, 16, 512, 8, 0, a, b, c,
            GalaticSemiring<SemiringT, a, b, c>>
            (leftInputMatrixGPU, rightInputMatrixGPU,
             outMatrixGPU, DefaultTraits, stats, semiring_shim);
        break;
      case 128:
        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
            128, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c,
            GalaticSemiring<SemiringT, a, b, c>>
            (leftInputMatrixGPU, rightInputMatrixGPU,
             outMatrixGPU, DefaultTraits, stats, semiring_shim);
        break;
      case 512:
        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
            512, 1, 1, 1, 2, 16, 512, 8, 0, a, b, c,
            GalaticSemiring<SemiringT, a, b, c>>
            (leftInputMatrixGPU, rightInputMatrixGPU,
             outMatrixGPU, DefaultTraits, stats, semiring_shim);
        break;
      default: // 256
        ACSpGEMM::MultiplyImplementation<GalaticSemiring<SemiringT, a, b, c>,
            256, 4, 2, 4, 4, 16, 512, 8, 0, a, b, c,
            GalaticSemiring<SemiringT, a, b, c>>
            (leftInputMatrixGPU, rightInputMatrixGPU,
             outMatrixGPU, DefaultTraits, stats, semiring_shim);
        break;
    }
  } catch (std::exception& e) {
    std::cerr
        << "Exception occurred in GALATIC SpGEMM, called from GALATIC_spgemm\n"
        << "Exception:\n"
        << e.what()
        << std::endl;
    return GrB_OUT_OF_MEMORY;  // The most likely cause. FIXME: inspect the
                               // exception to report a more precise error.
  }

  // Shallow copy the result back to native format.
  galaticToSparse(C, outMatrixGPU);

  // Prevent allocations being freed twice when destructors are run,
  // as we are doing shallow copies:
  //
  //   A, B         -> leftInputMatrixGPU, rightInputMatrixGPU
  //   outMatrixGPU -> C
  nullizeGalaticMatrix(outMatrixGPU);
  nullizeGalaticMatrix(leftInputMatrixGPU);
  nullizeGalaticMatrix(rightInputMatrixGPU);

  // (Re)allocate host-side mirrors sized for the new result; the data itself
  // is copied down from the GPU lazily (see need_update_ below).
  if (C->h_csrRowPtr_ == NULL)
    C->h_csrRowPtr_ = static_cast<Index*>(malloc((A_nrows+1)*sizeof(Index)));
  C->h_csrColInd_ = static_cast<Index*>(malloc(C->ncapacity_*sizeof(Index)));
  C->h_csrVal_ = static_cast<c*>(malloc(C->ncapacity_*sizeof(c)));

  C->need_update_ = true;  // Set flag that we need to copy data from GPU
  C->csr_initialized_ = true;
  C->csc_initialized_ = false;
  return GrB_SUCCESS;
}



template <typename c, typename a, typename b, typename m,
typename BinaryOpT, typename SemiringT>
Expand Down
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/spmspv_inner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <cusparse.h>

#include <moderngpu.cuh>
#include <cub.cuh>
#include <cub/cub.cuh>

#include <iostream>
#include <algorithm>
Expand Down
2 changes: 1 addition & 1 deletion graphblas/backend/cuda/spmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <cusparse.h>

#include <moderngpu.cuh>
#include <cub.cuh>
#include <cub/cub.cuh>

#include <iostream>
#include <string>
Expand Down
Loading