From f72094112f0bcd2e488970b0d03ae0a2672e5a43 Mon Sep 17 00:00:00 2001 From: Jie Ren Date: Sun, 1 Dec 2024 15:50:48 +0300 Subject: [PATCH] Support cublasLt --- Makefile.am | 1 + configure.ac | 4 + doc/doxygen/Makefile.am | 1 + doc/doxygen/chapters/files.doxy | 1 + .../starpu_faq/check_list_performance.doxy | 7 ++ doc/doxygen/doxygen-config-include.cfg.in | 1 + include/starpu.h | 1 + include/starpu_cublasLt.h | 61 +++++++++++++++ src/Makefile.am | 1 + src/drivers/cuda/starpu_cublasLt.c | 76 +++++++++++++++++++ tests/helper/cublasLt_init.c | 71 +++++++++++++++++ 11 files changed, 225 insertions(+) create mode 100644 include/starpu_cublasLt.h create mode 100644 src/drivers/cuda/starpu_cublasLt.c create mode 100644 tests/helper/cublasLt_init.c diff --git a/Makefile.am b/Makefile.am index 1c03279510..303428fe23 100644 --- a/Makefile.am +++ b/Makefile.am @@ -128,6 +128,7 @@ versinclude_HEADERS = \ include/starpu_disk.h \ include/starpu_cublas.h \ include/starpu_cublas_v2.h \ + include/starpu_cublasLt.h \ include/starpu_cusolver.h \ include/starpu_cusparse.h \ include/starpu_hipblas.h \ diff --git a/configure.ac b/configure.ac index bcb7f5e3d7..8ca257ad5b 100644 --- a/configure.ac +++ b/configure.ac @@ -1844,6 +1844,10 @@ if test x$enable_cuda = xyes; then SAVED_LIBS="${LIBS}" AC_CHECK_HEADERS([cuda_gl_interop.h]) + AC_CHECK_LIB([cublasLt], [cublasLtCreate], + [AC_DEFINE([STARPU_HAVE_LIBCUBLASLT], [1], [Define to 1 if you have the cublasLt library]) + STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcublasLt"]) + AC_CHECK_LIB([cusparse], [cusparseCreate], [AC_DEFINE([STARPU_HAVE_LIBCUSPARSE], [1], [Define to 1 if you have the cusparse library]) STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"]) diff --git a/doc/doxygen/Makefile.am b/doc/doxygen/Makefile.am index 807c9216ad..5876ccb0f1 100644 --- a/doc/doxygen/Makefile.am +++ b/doc/doxygen/Makefile.am @@ -173,6 +173,7 @@ dox_inputs = $(DOX_CONFIG) \ $(top_srcdir)/include/starpu_bound.h \ $(top_srcdir)/include/starpu_cublas.h \ $(top_srcdir)/include/starpu_cublas_v2.h \ + $(top_srcdir)/include/starpu_cublasLt.h \ $(top_srcdir)/include/starpu_cusparse.h \ $(top_srcdir)/include/starpu_cuda.h \ $(top_srcdir)/include/starpu_cusolver.h \ diff --git a/doc/doxygen/chapters/files.doxy b/doc/doxygen/chapters/files.doxy index b6a27fd95c..add03eb888 100644 --- a/doc/doxygen/chapters/files.doxy +++ b/doc/doxygen/chapters/files.doxy @@ -24,6 +24,7 @@ \file starpu_cublas.h \file starpu_cublas_v2.h \file starpu_cuda.h +\file starpu_cublasLt.h \file starpu_cusolver.h \file starpu_cusparse.h \file starpu_data_filters.h diff --git a/doc/doxygen/chapters/starpu_faq/check_list_performance.doxy b/doc/doxygen/chapters/starpu_faq/check_list_performance.doxy index 13aa53498c..c0dfc4e2b0 100644 --- a/doc/doxygen/chapters/starpu_faq/check_list_performance.doxy +++ b/doc/doxygen/chapters/starpu_faq/check_list_performance.doxy @@ -151,6 +151,13 @@ the beginning of the codelet to make sure that CUBLAS is really using the proper stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS kernels with the proper configuration. +Similary, calling starpu_cublasLt_init() makes StarPU create CUBLASLT handles +on each CUDA device, starpu_cublasLt_get_local_handle() can then be used to +queue CUSPARSE kernels with the proper configuration. starpu_cublasLt_shutdown() will synchronously deinitialize the CUBLASLT library on every CUDA device. +Since CUBLASLT handles are not along with CUDA streams, users may need to call +starpu_cuda_get_local_stream() to get a CUDA stream before calling a +CUBLASLT API. + Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles on each CUDA device, starpu_cusparse_get_local_handle() can then be used to queue CUSPARSE kernels with the proper configuration. starpu_cusparse_shutdown() will synchronously deinitialize the CUSPARSE library on every CUDA device. diff --git a/doc/doxygen/doxygen-config-include.cfg.in b/doc/doxygen/doxygen-config-include.cfg.in index d028cba2bf..51a75fea74 100644 --- a/doc/doxygen/doxygen-config-include.cfg.in +++ b/doc/doxygen/doxygen-config-include.cfg.in @@ -25,6 +25,7 @@ INPUT += @top_builddir@/doc/doxygen/starpu_config.h \ @top_srcdir@/include/starpu_task_list.h \ @top_srcdir@/include/starpu_task_util.h \ @top_srcdir@/include/starpu_cuda.h \ + @top_srcdir@/include/starpu_cublasLt.h \ @top_srcdir@/include/starpu_cusparse.h \ @top_srcdir@/include/starpu_cublas.h \ @top_srcdir@/include/starpu_cublas_v2.h \ diff --git a/include/starpu.h b/include/starpu.h index ce7f543fd0..47ffd52011 100644 --- a/include/starpu.h +++ b/include/starpu.h @@ -73,6 +73,7 @@ typedef INT_PTR intptr_t; #include #include #include +#include #include #include #include diff --git a/include/starpu_cublasLt.h b/include/starpu_cublasLt.h new file mode 100644 index 0000000000..1b0dd78d0b --- /dev/null +++ b/include/starpu_cublasLt.h @@ -0,0 +1,61 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. +* + * Copyright (C) 2010-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#ifndef __STARPU_CUBLASLT_H__ +#define __STARPU_CUBLASLT_H__ + +#ifdef STARPU_USE_CUDA +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + /** + @ingroup API_CUDA_Extensions + @{ + */ + + /** + Initialize CUBLASLT on every CUDA device + controlled by StarPU. This call blocks until CUBLASLT has been properly + initialized on every device. See \ref CUDA-specificOptimizations for more details. + */ + void starpu_cublasLt_init(void); + + /** + Synchronously deinitialize the CUBLASLT library on + every CUDA device. See \ref CUDA-specificOptimizations for more details. + */ + void starpu_cublasLt_shutdown(void); + +#ifdef STARPU_USE_CUDA + /** + Return the CUBLASLT handle to be used to queue CUBLASLT + kernels. It is properly initialized and configured for multistream by + starpu_cublasLt_init(). See \ref CUDA-specificOptimizations for more details. + */ + cublasLtHandle_t starpu_cublasLt_get_local_handle(void); +#endif + + /** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* __STARPU_CUBLASLT_H__ */ diff --git a/src/Makefile.am b/src/Makefile.am index 4b2a282fce..16596cd701 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -375,6 +375,7 @@ endif libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas_v2.c +libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublasLt.c libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusolver.c diff --git a/src/drivers/cuda/starpu_cublasLt.c b/src/drivers/cuda/starpu_cublasLt.c new file mode 100644 index 0000000000..82a8ca56c4 --- /dev/null +++ b/src/drivers/cuda/starpu_cublasLt.c @@ -0,0 +1,76 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2009-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#include + +#include +#include +#include + +#ifdef STARPU_HAVE_LIBCUBLASLT +#include + +static cublasLtHandle_t cublasLt_handles[STARPU_NMAXWORKERS]; +static cublasLtHandle_t main_handle; + +static void init_cublasLt_func(void *args STARPU_ATTRIBUTE_UNUSED) +{ + cublasLtCreate(&cublasLt_handles[starpu_worker_get_id_check()]); + // No need for setting streams, because the cublasLt handles are not along with streams +} + +static void shutdown_cublasLt_func(void *args STARPU_ATTRIBUTE_UNUSED) +{ + cublasLtDestroy(cublasLt_handles[starpu_worker_get_id_check()]); +} +#endif + +void starpu_cublasLt_init(void) +{ +#ifdef STARPU_HAVE_LIBCUBLASLT + if (!starpu_cuda_worker_get_count()) + return; + starpu_execute_on_each_worker_ex(init_cublasLt_func, NULL, STARPU_CUDA, "init_cublasLt"); + + if (cublasLtCreate(&main_handle) != CUBLAS_STATUS_SUCCESS) + main_handle = NULL; +#endif +} + +void starpu_cublasLt_shutdown(void) +{ +#ifdef STARPU_HAVE_LIBCUBLASLT + if (!starpu_cuda_worker_get_count()) + return; + starpu_execute_on_each_worker_ex(shutdown_cublasLt_func, NULL, STARPU_CUDA, "shutdown_cublasLt"); + + if (main_handle) + cublasLtDestroy(main_handle); +#endif +} + +#ifdef STARPU_HAVE_LIBCUBLASLT +cublasLtHandle_t starpu_cublasLt_get_local_handle(void) +{ + if (!starpu_cuda_worker_get_count()) + return NULL; + int workerid = starpu_worker_get_id(); + if (workerid >= 0) + return cublasLt_handles[workerid]; + else + return main_handle; +} +#endif diff --git a/tests/helper/cublasLt_init.c b/tests/helper/cublasLt_init.c new file mode 100644 index 0000000000..6c4971ee76 --- /dev/null +++ b/tests/helper/cublasLt_init.c @@ -0,0 +1,71 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. +* + * Copyright (C) 2009-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#include +#include +#include +#include +#include +#include "../helper.h" + +/* + * Test initializing cublasLt, and how much time that takes + */ + +static double start; +static double end; + +//static float *data = NULL; + +int main(int argc, char **argv) +{ + int ret; + + ret = starpu_initialize(NULL, &argc, &argv); + if (ret == -ENODEV) return STARPU_TEST_SKIPPED; + STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); + + unsigned ngpus = starpu_cuda_worker_get_count(); + + double init_timing; + double shutdown_timing; + + start = starpu_timing_now(); + starpu_cublasLt_init(); + end = starpu_timing_now(); + init_timing = end - start; + + start = starpu_timing_now(); + starpu_cublasLt_shutdown(); + end = starpu_timing_now(); + shutdown_timing = end - start; + + FPRINTF(stderr, "Total:\n"); + FPRINTF(stderr, "\tinit: %2.2f ms\n", init_timing/(1000)); + FPRINTF(stderr, "\tshutdown: %2.2f ms\n", shutdown_timing/(1000)); + + if (ngpus != 0) + { + FPRINTF(stderr, "per-GPU (#gpu = %u):\n", ngpus); + + FPRINTF(stderr, "\tinit: %2.2f ms\n", init_timing/(1000*ngpus)); + FPRINTF(stderr, "\tshutdown: %2.2f ms\n", shutdown_timing/(1000*ngpus)); + } + + starpu_shutdown(); + + return EXIT_SUCCESS; +}