Skip to content

Commit

Permalink
Support cublasLt
Browse files Browse the repository at this point in the history
  • Loading branch information
JieRen98 committed Dec 1, 2024
1 parent 8deb18d commit f720941
Show file tree
Hide file tree
Showing 11 changed files with 225 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ versinclude_HEADERS = \
include/starpu_disk.h \
include/starpu_cublas.h \
include/starpu_cublas_v2.h \
include/starpu_cublasLt.h \
include/starpu_cusolver.h \
include/starpu_cusparse.h \
include/starpu_hipblas.h \
Expand Down
4 changes: 4 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1844,6 +1844,10 @@ if test x$enable_cuda = xyes; then
SAVED_LIBS="${LIBS}"
AC_CHECK_HEADERS([cuda_gl_interop.h])

AC_CHECK_LIB([cublasLt], [cublasLtCreate],
[AC_DEFINE([STARPU_HAVE_LIBCUBLASLT], [1], [Define to 1 if you have the cublasLt library])
STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcublasLt"])

AC_CHECK_LIB([cusparse], [cusparseCreate],
[AC_DEFINE([STARPU_HAVE_LIBCUSPARSE], [1], [Define to 1 if you have the cusparse library])
STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"])
Expand Down
1 change: 1 addition & 0 deletions doc/doxygen/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ dox_inputs = $(DOX_CONFIG) \
$(top_srcdir)/include/starpu_bound.h \
$(top_srcdir)/include/starpu_cublas.h \
$(top_srcdir)/include/starpu_cublas_v2.h \
$(top_srcdir)/include/starpu_cublasLt.h \
$(top_srcdir)/include/starpu_cusparse.h \
$(top_srcdir)/include/starpu_cuda.h \
$(top_srcdir)/include/starpu_cusolver.h \
Expand Down
1 change: 1 addition & 0 deletions doc/doxygen/chapters/files.doxy
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
\file starpu_cublas.h
\file starpu_cublas_v2.h
\file starpu_cuda.h
\file starpu_cublasLt.h
\file starpu_cusolver.h
\file starpu_cusparse.h
\file starpu_data_filters.h
Expand Down
7 changes: 7 additions & 0 deletions doc/doxygen/chapters/starpu_faq/check_list_performance.doxy
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ the beginning of the codelet to make sure that CUBLAS is really using the proper
stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
kernels with the proper configuration.

Similary, calling starpu_cublasLt_init() makes StarPU create CUBLASLT handles
on each CUDA device, starpu_cublasLt_get_local_handle() can then be used to
queue CUSPARSE kernels with the proper configuration. starpu_cublasLt_shutdown() will synchronously deinitialize the CUBLASLT library on every CUDA device.
Since CUBLASLT handles are not along with CUDA streams, users may need to call
starpu_cuda_get_local_stream() to get a CUDA stream before calling a
CUBLASLT API.

Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles
on each CUDA device, starpu_cusparse_get_local_handle() can then be used to
queue CUSPARSE kernels with the proper configuration. starpu_cusparse_shutdown() will synchronously deinitialize the CUSPARSE library on every CUDA device.
Expand Down
1 change: 1 addition & 0 deletions doc/doxygen/doxygen-config-include.cfg.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ INPUT += @top_builddir@/doc/doxygen/starpu_config.h \
@top_srcdir@/include/starpu_task_list.h \
@top_srcdir@/include/starpu_task_util.h \
@top_srcdir@/include/starpu_cuda.h \
@top_srcdir@/include/starpu_cublasLt.h \
@top_srcdir@/include/starpu_cusparse.h \
@top_srcdir@/include/starpu_cublas.h \
@top_srcdir@/include/starpu_cublas_v2.h \
Expand Down
1 change: 1 addition & 0 deletions include/starpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ typedef INT_PTR intptr_t;
#include <starpu_hip.h>
#include <starpu_hipblas.h>
#include <starpu_cublas.h>
#include <starpu_cublasLt.h>
#include <starpu_cusparse.h>
#include <starpu_bound.h>
#include <starpu_hash.h>
Expand Down
61 changes: 61 additions & 0 deletions include/starpu_cublasLt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/

#ifndef __STARPU_CUBLASLT_H__
#define __STARPU_CUBLASLT_H__

#ifdef STARPU_USE_CUDA
#include <cublasLt.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/**
@ingroup API_CUDA_Extensions
@{
*/

/**
Initialize CUBLASLT on every CUDA device
controlled by StarPU. This call blocks until CUBLASLT has been properly
initialized on every device. See \ref CUDA-specificOptimizations for more details.
*/
void starpu_cublasLt_init(void);

/**
Synchronously deinitialize the CUBLASLT library on
every CUDA device. See \ref CUDA-specificOptimizations for more details.
*/
void starpu_cublasLt_shutdown(void);

#ifdef STARPU_USE_CUDA
/**
Return the CUBLASLT handle to be used to queue CUBLASLT
kernels. It is properly initialized and configured for multistream by
starpu_cublasLt_init(). See \ref CUDA-specificOptimizations for more details.
*/
cublasLtHandle_t starpu_cublasLt_get_local_handle(void);
#endif

/** @} */

#ifdef __cplusplus
}
#endif

#endif /* __STARPU_CUBLASLT_H__ */
1 change: 1 addition & 0 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ endif

libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas_v2.c
libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublasLt.c
libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c
libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusolver.c

Expand Down
76 changes: 76 additions & 0 deletions src/drivers/cuda/starpu_cublasLt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2009-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/

#include <common/config.h>

#include <starpu.h>
#include <starpu_cuda.h>
#include <core/workers.h>

#ifdef STARPU_HAVE_LIBCUBLASLT
#include <cublasLt.h>

static cublasLtHandle_t cublasLt_handles[STARPU_NMAXWORKERS];
static cublasLtHandle_t main_handle;

static void init_cublasLt_func(void *args STARPU_ATTRIBUTE_UNUSED)
{
cublasLtCreate(&cublasLt_handles[starpu_worker_get_id_check()]);
// No need for setting streams, because the cublasLt handles are not along with streams
}

static void shutdown_cublasLt_func(void *args STARPU_ATTRIBUTE_UNUSED)
{
cublasLtDestroy(cublasLt_handles[starpu_worker_get_id_check()]);
}
#endif

void starpu_cublasLt_init(void)
{
#ifdef STARPU_HAVE_LIBCUBLASLT
if (!starpu_cuda_worker_get_count())
return;
starpu_execute_on_each_worker_ex(init_cublasLt_func, NULL, STARPU_CUDA, "init_cublasLt");

if (cublasLtCreate(&main_handle) != CUBLAS_STATUS_SUCCESS)
main_handle = NULL;
#endif
}

void starpu_cublasLt_shutdown(void)
{
#ifdef STARPU_HAVE_LIBCUBLASLT
if (!starpu_cuda_worker_get_count())
return;
starpu_execute_on_each_worker_ex(shutdown_cublasLt_func, NULL, STARPU_CUDA, "shutdown_cublasLt");

if (main_handle)
cublasLtDestroy(main_handle);
#endif
}

#ifdef STARPU_HAVE_LIBCUBLASLT
cublasLtHandle_t starpu_cublasLt_get_local_handle(void)
{
if (!starpu_cuda_worker_get_count())
return NULL;
int workerid = starpu_worker_get_id();
if (workerid >= 0)
return cublasLt_handles[workerid];
else
return main_handle;
}
#endif
71 changes: 71 additions & 0 deletions tests/helper/cublasLt_init.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2009-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/

#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <starpu.h>
#include <stdlib.h>
#include "../helper.h"

/*
* Test initializing cublasLt, and how much time that takes
*/

static double start;
static double end;

//static float *data = NULL;

int main(int argc, char **argv)
{
int ret;

ret = starpu_initialize(NULL, &argc, &argv);
if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

unsigned ngpus = starpu_cuda_worker_get_count();

double init_timing;
double shutdown_timing;

start = starpu_timing_now();
starpu_cublasLt_init();
end = starpu_timing_now();
init_timing = end - start;

start = starpu_timing_now();
starpu_cublasLt_shutdown();
end = starpu_timing_now();
shutdown_timing = end - start;

FPRINTF(stderr, "Total:\n");
FPRINTF(stderr, "\tinit: %2.2f ms\n", init_timing/(1000));
FPRINTF(stderr, "\tshutdown: %2.2f ms\n", shutdown_timing/(1000));

if (ngpus != 0)
{
FPRINTF(stderr, "per-GPU (#gpu = %u):\n", ngpus);

FPRINTF(stderr, "\tinit: %2.2f ms\n", init_timing/(1000*ngpus));
FPRINTF(stderr, "\tshutdown: %2.2f ms\n", shutdown_timing/(1000*ngpus));
}

starpu_shutdown();

return EXIT_SUCCESS;
}

0 comments on commit f720941

Please sign in to comment.