diff --git a/include/galahad_blas.h b/include/galahad_blas.h
index 2e3077f28b..0456227e0e 100644
--- a/include/galahad_blas.h
+++ b/include/galahad_blas.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define DASUM DASUM64
 #define DCABS1 DCABS164
 #define DDOT DDOT64
@@ -69,7 +69,7 @@
 #define ZTRMM ZTRMM64
 #define ZTRMV ZTRMV64
 #define ZTRSM ZTRSM64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define DASUM DASUM__64
 #define DCABS1 DCABS1__64
 #define DDOT DDOT__64
@@ -138,7 +138,7 @@
 #define ZTRMM ZTRMM__64
 #define ZTRMV ZTRMV__64
 #define ZTRSM ZTRSM__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define DASUM DASUM_64
 #define DCABS1 DCABS1_64
diff --git a/include/galahad_blas_original.h b/include/galahad_blas_original.h
index 5a24f81068..7972a8b267 100644
--- a/include/galahad_blas_original.h
+++ b/include/galahad_blas_original.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define SNRM2 SNRM264
 #define DNRM2 DNRM264
 #define ISAMAX ISAMAX64
@@ -25,7 +25,7 @@
 #define DGEMM DGEMM64
 #define SGER SGER64
 #define DGER DGER64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define SNRM2 SNRM2__64
 #define DNRM2 DNRM2__64
 #define ISAMAX ISAMAX__64
@@ -50,7 +50,7 @@
 #define DGEMM DGEMM__64
 #define SGER SGER__64
 #define DGER DGER__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #define SNRM2 SNRM2
 #define DNRM2 DNRM2
 #define ISAMAX ISAMAX
diff --git a/include/galahad_kinds.h b/include/galahad_kinds.h
index dd34d8890b..789e8c7dc3 100644
--- a/include/galahad_kinds.h
+++ b/include/galahad_kinds.h
@@ -1,4 +1,4 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_KINDS_single GALAHAD_KINDS_single_64
 #define GALAHAD_KINDS_double GALAHAD_KINDS_double_64
 #endif
diff --git a/include/galahad_lapack.h b/include/galahad_lapack.h
index eb07daa2a5..878d7c10b5 100644
--- a/include/galahad_lapack.h
+++ b/include/galahad_lapack.h
@@ -1,7 +1,7 @@
 #include "galahad_blas.h"
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define DISNAN DISNAN64
 #define DLADIV DLADIV64
 #define DLAISN DLAISN64
@@ -275,7 +275,7 @@
 #define ZLARFB ZLARFB64
 #define ZLARFG ZLARFG64
 #define ZLARFT ZLARFT64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define DISNAN DISNAN__64
 #define DLADIV DLADIV__64
 #define DLAISN DLAISN__64
@@ -549,7 +549,7 @@
 #define ZLARFB ZLARFB__64
 #define ZLARFG ZLARFG__64
 #define ZLARFT ZLARFT__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define DISNAN DISNAN_64
 #define DLADIV DLADIV_64
diff --git a/include/galahad_lapack_original.h b/include/galahad_lapack_original.h
index 03da26d27e..e2dc2247a7 100644
--- a/include/galahad_lapack_original.h
+++ b/include/galahad_lapack_original.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define SGETRF SGETRF64
 #define DGETRF DGETRF64
 #define SGETRS SGETRS64
 #define DGETRS DGETRS64
@@ -39,7 +39,7 @@
 #define DSTERF DSTERF64
 #define SLAEV2 SLAEV264
 #define DLAEV2 DLAEV264
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define SGETRF SGETRF__64
 #define DGETRF DGETRF__64
 #define SGETRS SGETRS__64
@@ -78,45 +78,7 @@
 #define DSTERF DSTERF__64
 #define SLAEV2 SLAEV2__64
 #define DLAEV2 DLAEV2__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
-#define SGETRF SGETRF
-#define DGETRF DGETRF
-#define SGETRS SGETRS
-#define DGETRS DGETRS
-#define SGELS SGELS
-#define DGELS DGELS
-#define SGELSY SGELSY
-#define DGELSY DGELSY
-#define SGELSS SGELSS
-#define DGELSS DGELSS
-#define SGELSD SGELSD
-#define DGELSD DGELSD
-#define SGESVD SGESVD
-#define DGESVD DGESVD
-#define SPTTRF SPTTRF
-#define DPTTRF DPTTRF
-#define SPOTRF SPOTRF
-#define DPOTRF DPOTRF
-#define SPOTRS SPOTRS
-#define DPOTRS DPOTRS
-#define SSYTRF SSYTRF
-#define DSYTRF DSYTRF
-#define SSYTRS SSYTRS
-#define DSYTRS DSYTRS
-#define SPBTRF SPBTRF
-#define DPBTRF DPBTRF
-#define SPBTRS SPBTRS
-#define DPBTRS DPBTRS
-#define SSYEV SSYEV
-#define DSYEV DSYEV
-#define SSYGV SSYGV
-#define DSYGV DSYGV
-#define SHSEQR SHSEQR
-#define DHSEQR DHSEQR
-#define SSTERF SSTERF
-#define DSTERF DSTERF
-#define SLAEV2 SLAEV2
-#define DLAEV2 DLAEV2
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define SGETRF SGETRF_64
 #define DGETRF DGETRF_64
diff --git a/include/galahad_modules.h b/include/galahad_modules.h
index dba91fdb0d..687c15b362 100644
--- a/include/galahad_modules.h
+++ b/include/galahad_modules.h
@@ -1,4 +1,4 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
 #define GALAHAD_KINDS_single GALAHAD_KINDS_single_64
@@ -6,7 +6,7 @@
 #endif
 #ifdef GALAHAD_SINGLE
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define CUTEst_interface_precision CUTEST_interface_single_64
 #define CUTEST_interface_precision CUTEST_interface_single_64
@@ -743,7 +743,7 @@
 #else
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define CUTEst_interface_precision CUTEST_interface_double_64
 #define CUTEST_interface_precision CUTEST_interface_double_64
@@ -1479,7 +1479,7 @@
 #ifdef GALAHAD_SINGLE
 #define mumps_struc smumps_struc
 #define MUMPS_STRUC SMUMPS_STRUC
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_MUMPS_TYPES_precision GALAHAD_MUMPS_TYPES_single_64
 #ifdef DUMMY_SMUMPS
 #define MUMPS_precision GALAHAD_SMUMPS_64
@@ -1497,7 +1497,7 @@
 #else
 #define mumps_struc dmumps_struc
 #define MUMPS_STRUC DMUMPS_STRUC
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_MUMPS_TYPES_precision GALAHAD_MUMPS_TYPES_double_64
 #ifdef DUMMY_DMUMPS
 #define MUMPS_precision GALAHAD_DMUMPS_64
diff --git a/include/metis.h b/include/metis.h
index e3e029ced1..7184844075 100644
--- a/include/metis.h
+++ b/include/metis.h
@@ -30,7 +30,7 @@
 GCC does provides these definitions in stdint.h, but it may require some modifications on other architectures.
--------------------------------------------------------------------------*/ -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define IDXTYPEWIDTH 64 #else #define IDXTYPEWIDTH 32 diff --git a/include/spral_procedures.h b/include/spral_procedures.h index 425a3ec830..c83b023e6f 100644 --- a/include/spral_procedures.h +++ b/include/spral_procedures.h @@ -1,5 +1,5 @@ #ifdef GALAHAD_SINGLE -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_KINDS_precision galahad_kinds_single_64 #define SPRAL_SSIDS_precision spral_ssids_single_64 #else @@ -7,7 +7,7 @@ #define SPRAL_SSIDS_precision spral_ssids_single #endif #else -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_KINDS_precision galahad_kinds_double_64 #define SPRAL_SSIDS_precision spral_ssids_double_64 #else @@ -16,13 +16,13 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_ssids_lapack_iface spral_ssids_lapack_iface_64 #define spral_ssids_blas_iface spral_ssids_blas_iface_64 #endif #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision spral_kinds_single_64 #define spral_kinds_precision spral_kinds_single_64 #define spral_ssids_precision spral_ssids_single_64 @@ -66,7 +66,7 @@ #define spral_matrix_util_precision spral_matrix_util_single #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision spral_kinds_double_64 #define spral_kinds_precision spral_kinds_double_64 #define spral_ssids_precision spral_ssids_double_64 diff --git a/include/spral_ssids.h b/include/spral_ssids.h index f25e0b9361..435e5bfb98 100644 --- a/include/spral_ssids.h +++ b/include/spral_ssids.h @@ -1,4 +1,7 @@ //* \file spral_ssids.h */ +/** + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT + */ #ifdef __cplusplus extern "C" { @@ -13,62 +16,63 @@ extern "C" { // precision #include "galahad_precision.h" +#include "ssids_rip.hxx" /************************************ * Derived types ************************************/ struct spral_ssids_options { - int array_base; // Not in Fortran type - int print_level; - int unit_diagnostics; - int unit_error; - int unit_warning; - int ordering; - int nemin; + ipc_ array_base; // Not in Fortran type + ipc_ print_level; + ipc_ unit_diagnostics; + ipc_ unit_error; + ipc_ unit_warning; + ipc_ ordering; + ipc_ nemin; bool ignore_numa; bool use_gpu; bool gpu_only; - int64_t min_gpu_work; + longc_ min_gpu_work; float max_load_inbalance; float gpu_perf_coeff; - int scaling; - int64_t small_subtree_threshold; - int cpu_block_size; + ipc_ scaling; + longc_ small_subtree_threshold; + ipc_ cpu_block_size; bool action; - int pivot_method; + ipc_ pivot_method; real_wp_ small; real_wp_ u; - int nstream; + ipc_ nstream; real_wp_ multiplier; float min_loadbalance; - int failed_pivot_method; + ipc_ failed_pivot_method; // char unused[80]; // Allow for future expansion }; struct spral_ssids_inform { - int flag; - int matrix_dup; - int matrix_missing_diag; - int matrix_outrange; - int matrix_rank; - int maxdepth; - int maxfront; - int maxsupernode; - int num_delay; - int64_t num_factor; - int64_t num_flops; - int num_neg; - int num_sup; - int num_two; - int stat; - int cuda_error; - int cublas_error; - int not_first_pass; - int not_second_pass; - int nparts; - int64_t cpu_flops; - int64_t gpu_flops; + ipc_ flag; + ipc_ matrix_dup; + ipc_ matrix_missing_diag; + ipc_ matrix_outrange; + ipc_ matrix_rank; + ipc_ maxdepth; + ipc_ maxfront; + ipc_ maxsupernode; + ipc_ num_delay; + longc_ num_factor; + longc_ num_flops; + ipc_ 
num_neg; + ipc_ num_sup; + ipc_ num_two; + ipc_ stat; + ipc_ cuda_error; + ipc_ cublas_error; + ipc_ not_first_pass; + ipc_ not_second_pass; + ipc_ nparts; + longc_ cpu_flops; + longc_ gpu_flops; // char unused[76]; // Allow for future expansion }; @@ -79,40 +83,40 @@ struct spral_ssids_inform { /* Initialize options to defaults */ void spral_ssids_default_options(struct spral_ssids_options *options); /* Perform analysis phase for CSC data */ -void spral_ssids_analyse(bool check, int n, int *order, const int64_t *ptr, - const int *row, const real_wp_ *val, void **akeep, +void spral_ssids_analyse(bool check, ipc_ n, ipc_ *order, const longc_ *ptr, + const ipc_ *row, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); -void spral_ssids_analyse_ptr32(bool check, int n, int *order, const int *ptr, - const int *row, const real_wp_ *val, void **akeep, +void spral_ssids_analyse_ptr32(bool check, ipc_ n, ipc_ *order, const int *ptr, + const ipc_ *row, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform analysis phase for coordinate data */ -void spral_ssids_analyse_coord(int n, int *order, int64_t ne, const int *row, - const int *col, const real_wp_ *val, void **akeep, +void spral_ssids_analyse_coord(ipc_ n, ipc_ *order, longc_ ne, const ipc_ *row, + const ipc_ *col, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform numerical factorization */ -void spral_ssids_factor(bool posdef, const int64_t *ptr, const int *row, +void spral_ssids_factor(bool posdef, const longc_ *ptr, const ipc_ *row, const real_wp_ *val, real_wp_ *scale, void *akeep, void **fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); -void spral_ssids_factor_ptr32(bool posdef, const int *ptr, const int *row, +void spral_ssids_factor_ptr32(bool posdef, const int *ptr, const ipc_ *row, const real_wp_ *val, real_wp_ *scale, void *akeep, void **fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform triangular solve(s) for single rhs */ -void spral_ssids_solve1(int job, real_wp_ *x1, void *akeep, void *fkeep, +void spral_ssids_solve1(ipc_ job, real_wp_ *x1, void *akeep, void *fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform triangular solve(s) for one or more rhs */ -void spral_ssids_solve(int job, int nrhs, real_wp_ *x, int ldx, void *akeep, +void spral_ssids_solve(ipc_ job, ipc_ nrhs, real_wp_ *x, ipc_ ldx, void *akeep, void *fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Free memory */ -int spral_ssids_free_akeep(void **akeep); -int spral_ssids_free_fkeep(void **fkeep); -int spral_ssids_free(void **akeep, void **fkeep); +ipc_ spral_ssids_free_akeep(void **akeep); +ipc_ spral_ssids_free_fkeep(void **fkeep); +ipc_ spral_ssids_free(void **akeep, void **fkeep); /************************************ * Advanced subroutines @@ -125,7 +129,7 @@ void spral_ssids_enquire_posdef(const void *akeep, const void *fkeep, /* Retrieve information on pivots (indefinite case) */ void spral_ssids_enquire_indef(const void *akeep, const void *fkeep, const struct spral_ssids_options *options, - struct spral_ssids_inform *inform, int *piv_order, real_wp_ *d); + struct spral_ssids_inform *inform, ipc_ *piv_order, real_wp_ *d); /* Alter pivots (indefinite case only) */ void 
spral_ssids_alter(const real_wp_ *d, const void *akeep, void *fkeep, const struct spral_ssids_options *options, diff --git a/include/ssids_contrib.h b/include/ssids_contrib.h index 1028c82e59..dbd7a529b2 100644 --- a/include/ssids_contrib.h +++ b/include/ssids_contrib.h @@ -2,10 +2,14 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 13:30 GMT * * \brief Defines C++ interface to routines from spral_ssids_contrib and * spral_ssids_contrib_free modules. */ + +#include "ssids_rip.hxx" + #ifndef SPRAL_SSIDS_CONTRIB_H #define SPRAL_SSIDS_CONTRIB_H @@ -15,15 +19,17 @@ extern "C" { #ifdef SPRAL_SINGLE void spral_ssids_contrib_get_data_single(const void *const contrib, - int *const n, const float* *const val, int *const ldval, - const int* *const rlist, int *const ndelay, const int* *const delay_perm, - const float* *const delay_val, int *const lddelay); + ipc_ *const n, const float* *const val, ipc_ *const ldval, + const ipc_* *const rlist, ipc_ *const ndelay, + const ipc_* *const delay_perm, + const float* *const delay_val, ipc_ *const lddelay); void spral_ssids_contrib_free_sgl(void *const contrib); #else void spral_ssids_contrib_get_data_double(const void *const contrib, - int *const n, const double* *const val, int *const ldval, - const int* *const rlist, int *const ndelay, const int* *const delay_perm, - const double* *const delay_val, int *const lddelay); + ipc_ *const n, const double* *const val, ipc_ *const ldval, + const ipc_* *const rlist, ipc_ *const ndelay, + const ipc_* *const delay_perm, + const double* *const delay_val, ipc_ *const lddelay); void spral_ssids_contrib_free_dbl(void *const contrib); #endif diff --git a/include/ssids_cpu_AppendAlloc.hxx b/include/ssids_cpu_AppendAlloc.hxx index a86a774199..9315ae5274 100644 --- a/include/ssids_cpu_AppendAlloc.hxx +++ b/include/ssids_cpu_AppendAlloc.hxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ + #pragma once //#define MEM_STATS @@ -10,6 +12,7 @@ #include #include "spral_compat.hxx" // for std::align if required +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -22,11 +25,11 @@ namespace append_alloc_internal { */ class Page { #if defined(__AVX512F__) - static const int align = 64; // 64 byte alignment + static const ipc_ align = 64; // 64 byte alignment #elif defined(__AVX__) - static const int align = 32; // 32 byte alignment + static const ipc_ align = 32; // 32 byte alignment #else - static const int align = 16; // 16 byte alignment + static const ipc_ align = 16; // 16 byte alignment #endif public: Page(size_t sz, Page* next=nullptr) diff --git a/include/ssids_cpu_BuddyAllocator.hxx b/include/ssids_cpu_BuddyAllocator.hxx index d84f275e61..fb990ff417 100644 --- a/include/ssids_cpu_BuddyAllocator.hxx +++ b/include/ssids_cpu_BuddyAllocator.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ #pragma once @@ -10,6 +11,7 @@ #include #include "spral_omp.hxx" +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -36,18 +38,18 @@ namespace buddy_alloc_internal { template > class Page { // \{ - typedef 
typename std::allocator_traits::template rebind_traits IntAllocTraits; + typedef typename std::allocator_traits::template rebind_traits IntAllocTraits; // \} - static int const nlevel=16; ///< Number of divisions to smallest allocation unit. + static ipc_ const nlevel=16; ///< Number of divisions to smallest allocation unit. #if defined(__AVX512F__) - static int const align=64; ///< Underlying alignment of all pointers returned + static ipc_ const align=64; ///< Underlying alignment of all pointers returned #elif defined(__AVX__) - static int const align=32; ///< Underlying alignment of all pointers returned + static ipc_ const align=32; ///< Underlying alignment of all pointers returned #else - static int const align=16; ///< Underlying alignment of all pointers returned + static ipc_ const align=16; ///< Underlying alignment of all pointers returned #endif - static int const ISSUED_FLAG = -2; ///< Flag: value is issued + static ipc_ const ISSUED_FLAG = -2; ///< Flag: value is issued public: // \{ Page(Page const&) =delete; // not copyable @@ -78,7 +80,7 @@ public: next_ = IntAllocTraits::allocate(intAlloc, 1<<(nlevel-1)); /* Initialize data structures */ head_[nlevel-1] = 0; next_[0] = -1; // a single free block at top level - for(int i=0; i size_) return nullptr; // too big: don't even try // Determine which level of block we're trying to find - int level = sz_to_level(sz); + ipc_ level = sz_to_level(sz); void* ptr = addr_to_ptr(get_next_ptr(level)); #ifdef MEM_STATS if(ptr) { @@ -139,8 +141,8 @@ public: } /** \brief Release memory associated with ptr for reuse. */ void deallocate(void* ptr, std::size_t sz) { - int idx = ptr_to_addr(ptr); - int level = sz_to_level(sz); + ipc_ idx = ptr_to_addr(ptr); + ipc_ level = sz_to_level(sz); mark_free(idx, level); #ifdef MEM_STATS used_ -= sz; @@ -148,7 +150,7 @@ public: } /** \brief Return true if this Page owners given pointer */ bool is_owner(void* ptr) { - int idx = ptr_to_addr(ptr); + ipc_ idx = ptr_to_addr(ptr); return (idx>=0 && idx<(1<<(nlevel-1))); } /** @@ -159,8 +161,8 @@ public: * */ size_t count_free() const { size_t free=0; - for(int i=0; i=nlevel) return -1; // invalid level if(head_[level] == -1) { // Need to split next level up to get one - int above = get_next_ptr(level+1); + ipc_ above = get_next_ptr(level+1); if(above==-1) return -1; // couldn't find one split_block(level+1, above); } - int p = head_[level]; + ipc_ p = head_[level]; head_[level] = next_[p]; next_[p] = ISSUED_FLAG; return p; } /** Marks given block as free, tries to merge with partner if possible */ - void mark_free(int idx, int level) { + void mark_free(ipc_ idx, ipc_ level) { if(level < nlevel-1) { // There exists a partner, see if we can merge with it - int partner = get_partner(idx, level); + ipc_ partner = get_partner(idx, level); if(next_[partner] != ISSUED_FLAG) { // Partner is free in *some* list, not necessarily this level if(remove_from_free_list(partner, level)) { @@ -208,9 +210,9 @@ private: /** Finds the given address in free list for level and removes it. * Returns false if it cannot be found, true otherwise. 
*/ - bool remove_from_free_list(int idx, int level) { - int prev = -1; - int current = head_[level]; + bool remove_from_free_list(ipc_ idx, ipc_ level) { + ipc_ prev = -1; + ipc_ current = head_[level]; while(current!=-1 && current != idx) { prev = current; current = next_[current]; @@ -227,36 +229,36 @@ private: } /** Splits the given block */ - void split_block(int level, int block) { - int left = block; - int right = get_partner(block, level-1); + void split_block(ipc_ level, ipc_ block) { + ipc_ left = block; + ipc_ right = get_partner(block, level-1); next_[right] = head_[level-1]; next_[left] = right; head_[level-1] = left; } /** Given address location, return pointer */ - void* addr_to_ptr(int idx) { + void* addr_to_ptr(ipc_ idx) { return (idx==-1) ? nullptr : base_ + idx*min_size_; } /** Given pointer, return address */ - int ptr_to_addr(void* ptr) { + ipc_ ptr_to_addr(void* ptr) { return static_cast(static_cast(ptr)-base_) / min_size_; } /** Given a size, find the relevant level */ - int sz_to_level(std::size_t sz) { - int val = sz / min_size_; + ipc_ sz_to_level(std::size_t sz) { + ipc_ val = sz / min_size_; // Find next power of 2 higher than val - int level = 0; + ipc_ level = 0; while((val>>level) > 0) ++level; return level; } /** Given an index find its partner at given level */ - int get_partner(int idx, int level) { + ipc_ get_partner(ipc_ idx, ipc_ level) { return idx ^ (1<* next_child; // Pointer to parent's next child /* Data that changes during factorize */ - int ndelay_in; // Number of delays arising from children - int ndelay_out; // Number of delays arising to push into parent - int nelim; // Number of columns succesfully eliminated + ipc_ ndelay_in; // Number of delays arising from children + ipc_ ndelay_out; // Number of delays arising to push into parent + ipc_ nelim; // Number of columns succesfully eliminated T *lcol; // Pointer to start of factor data - int *perm; // Pointer to permutation + ipc_ *perm; // Pointer to permutation T *contrib; // Pointer to contribution block private: PoolAllocator pool_alloc_; // Our own version of pool allocator for freeing diff --git a/include/ssids_cpu_NumericSubtree.hxx b/include/ssids_cpu_NumericSubtree.hxx index 9e5959d9ca..9b161063e4 100644 --- a/include/ssids_cpu_NumericSubtree.hxx +++ b/include/ssids_cpu_NumericSubtree.hxx @@ -2,9 +2,12 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 07:40 GMT */ + #pragma once +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_factor.hxx" @@ -77,7 +80,7 @@ public: { /* Associate symbolic nodes to numeric ones; copy tree structure */ nodes_.reserve(symbolic_subtree.nnodes_+1); - for(int ni=0; niidx] : nullptr; @@ -86,11 +89,11 @@ public: } /* Allocate workspaces */ - int num_threads = omp_get_num_threads(); + ipc_ num_threads = omp_get_num_threads(); std::vector thread_stats(num_threads); std::vector work; work.reserve(num_threads); - for(int i=0; i(m); + for(ipc_ ni=0; ni(m); T *d = nodes_[ni].lcol + n*ldl; - for(int i=0; i(m+ndin); + ipc_ ldl = align_lda(m+ndin); /* Build map (indef only) */ - int const *map; + ipc_ const *map; if(!posdef) { // indef need to allow for permutation and/or delays - for(int i=0; i m, just use beta=0 // in dgemm call and then add as we scatter - for(int r=0; r - void solve_diag_bwd_inner(int nrhs, T* x, int ldx) const { + void solve_diag_bwd_inner(ipc_ nrhs, 
T* x, ipc_ ldx) const { if(posdef && !do_bwd) return; // diagonal solve is a no-op for posdef /* Allocate memory - map only needed for indef bwd/diag_bwd solve */ T* xlocal = new T[nrhs*symb_.n]; - int* map_alloc = (!posdef && do_bwd) ? new int[symb_.n] + ipc_* map_alloc = (!posdef && do_bwd) ? new ipc_[symb_.n] : nullptr; /* Perform solve */ - for(int ni=symb_.nnodes_-1; ni>=0; --ni) { - int m = symb_[ni].nrow; - int n = symb_[ni].ncol; - int nelim = (posdef) ? n + for(ipc_ ni=symb_.nnodes_-1; ni>=0; --ni) { + ipc_ m = symb_[ni].nrow; + ipc_ n = symb_[ni].ncol; + ipc_ nelim = (posdef) ? n : nodes_[ni].nelim; - int ndin = (posdef) ? 0 + ipc_ ndin = (posdef) ? 0 : nodes_[ni].ndelay_in; /* Build map (indef only) */ - int const *map; + ipc_ const *map; if(!posdef) { // indef need to allow for permutation and/or delays if(do_bwd) { - for(int i=0; i(m+ndin); - for(int r=0; r(m+ndin); + for(ipc_ r=0; r(nrhs, x, ldx); } - void solve_diag_bwd(int nrhs, T* x, int ldx) const { + void solve_diag_bwd(ipc_ nrhs, T* x, ipc_ ldx) const { solve_diag_bwd_inner(nrhs, x, ldx); } - void solve_bwd(int nrhs, T* x, int ldx) const { + void solve_bwd(ipc_ nrhs, T* x, ipc_ ldx) const { solve_diag_bwd_inner(nrhs, x, ldx); } @@ -434,21 +436,21 @@ public: * Note that piv_order is only set in indefinite case. * One of piv_order or d may be null in indefinite case. */ - void enquire(int *piv_order, T* d) const { + void enquire(ipc_ *piv_order, T* d) const { if(posdef) { - for(int ni=0; ni(blkm); - for(int i=0; i(blkm); + for(ipc_ i=0; i(blkm); - int nelim = nodes_[ni].nelim; + for(ipc_ ni=0, piv=0; ni(blkm); + ipc_ nelim = nodes_[ni].nelim; T const* dptr = &nodes_[ni].lcol[blkn*ldl]; // if (d) { // printf("d01 = %.1f %.1f\n", dptr[0], dptr[1]); @@ -457,7 +459,7 @@ public: // printf("d67 = %.1f %.1f\n", dptr[6], dptr[7]); // } // printf("ni = %i, nelim = %i\n", ni+1, nelim); - for(int i=0; i(blkm); - int nelim = nodes_[ni].nelim; + for(ipc_ ni=0; ni(blkm); + ipc_ nelim = nodes_[ni].nelim; T* dptr = &nodes_[ni].lcol[blkn*ldl]; - T dum; +// T dum; - for(int i=0; i(m); - int nelim = nodes_[node].nelim; - int const* rlist = &symb_[node].rlist[ symb_[node].ncol ]; - for(int i=0; i(m); + ipc_ nelim = nodes_[node].nelim; + ipc_ const* rlist = &symb_[node].rlist[ symb_[node].ncol ]; + for(ipc_ i=0; i +#include "ssids_rip.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_factor.hxx" #include "ssids_cpu_NumericNode.hxx" @@ -14,18 +17,16 @@ #include "ssids_cpu_ThreadStats.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define FAPrecisionTraits FASingleTraits #define factor_alloc_precision factor_alloc_single #define ldlt_tpp_factor ldlt_tpp_factor_sgl #else -#define precision_ double #define FAPrecisionTraits FADoubleTraits #define factor_alloc_precision factor_alloc_double #define ldlt_tpp_factor ldlt_tpp_factor_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif @@ -46,8 +47,8 @@ template class SmallLeafNumericSubtree { - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typedef std::allocator_traits PATraits; public: SmallLeafNumericSubtree(SmallLeafSymbolicSubtree const& symb, std::vector>& old_nodes, T const* aval, T const* scaling, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, std::vector& 
work_vec, struct cpu_factor_options const& options, ThreadStats& stats) @@ -55,30 +56,30 @@ public: { Workspace& work = work_vec[omp_get_thread_num()]; /* Initialize nodes */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { old_nodes_[ni].ndelay_in = 0; old_nodes_[ni].lcol = lcol_ + symb_[ni-symb_.sa_].lcol_offset; } memset(lcol_, 0, symb_.nfactor_*sizeof(T)); /* Add aval entries */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) add_a(ni-symb_.sa_, symb_.symb_[ni], aval, scaling); /* Perform factorization */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { // Assembly - int* map = work.get_ptr(symb_.symb_.n+1); + ipc_* map = work.get_ptr(symb_.symb_.n+1); assemble (ni-symb_.sa_, symb_.symb_[ni], &old_nodes_[ni], factor_alloc, pool_alloc, map, aval, scaling); // Update stats - int nrow = symb_.symb_[ni].nrow; + ipc_ nrow = symb_.symb_[ni].nrow; stats.maxfront = std::max(stats.maxfront, nrow); - int ncol = symb_.symb_[ni].ncol; + ipc_ ncol = symb_.symb_[ni].ncol; stats.maxsupernode = std::max(stats.maxsupernode, ncol); // Factorization - precision_ one_val = 1.0; + rpc_ one_val = 1.0; factor_node_posdef (one_val, symb_.symb_[ni], old_nodes_[ni], options, stats); if(stats.flag(snode.nrow); + rpc_ *lcol = lcol_ + symb_[si].lcol_offset; + size_t ldl = align_lda(snode.nrow); if(scaling) { /* Scaling to apply */ - for(int i=0; i* node, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, - int* map, + ipc_* map, T const* aval, T const* scaling ) { @@ -133,18 +134,19 @@ void assemble( typename FAIntTraits::allocator_type factor_alloc_int(factor_alloc); /* Count incoming delays and determine size of node */ - int nrow = snode.nrow; - int ncol = snode.ncol; + ipc_ nrow = snode.nrow; + ipc_ ncol = snode.ncol; /* Get space for contribution block + zero it */ - long contrib_dimn = snode.nrow - snode.ncol; - node->contrib = (contrib_dimn > 0) ? PATraits::allocate(pool_alloc, contrib_dimn*contrib_dimn) : nullptr; + longc_ contrib_dimn = snode.nrow - snode.ncol; + node->contrib = (contrib_dimn > 0) ? 
PATraits::allocate(pool_alloc, + contrib_dimn*contrib_dimn) : nullptr; if(node->contrib) memset(node->contrib, 0, contrib_dimn*contrib_dimn*sizeof(T)); /* Alloc + set perm */ node->perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; iperm[i] = snode.rlist[i]; /* Add children */ @@ -152,32 +154,32 @@ void assemble( /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; ifirst_child; child!=NULL; child=child->next_child) { SymbolicNode const& csnode = child->symb; /* Handle expected contributions (only if something there) */ if(child->contrib) { - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; if(c < snode.ncol) { // Contribution added to lcol - int ldd = align_lda(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node->lcol[c*ldd]; - for(int j=i; jcontrib[(c-ncol)*ldd]; - for(int j=i; j class SmallLeafNumericSubtree { - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typedef std::allocator_traits PATraits; public: SmallLeafNumericSubtree(SmallLeafSymbolicSubtree const& symb, std::vector>& old_nodes, T const* aval, T const* scaling, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, std::vector& work_vec, struct cpu_factor_options const& options, ThreadStats& stats) : old_nodes_(old_nodes), symb_(symb) { Workspace& work = work_vec[omp_get_thread_num()]; - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { /*printf("%d: Node %d parent %d (of %d) size %d x %d\n", omp_get_thread_num(), ni, symb_[ni].parent, symb_.nnodes_, symb_[ni].nrow, symb_[ni].ncol);*/ // Assembly of node (not of contribution block) - int* map = work.get_ptr(symb_.symb_.n+1); + ipc_* map = work.get_ptr(symb_.symb_.n+1); assemble_pre (symb_.symb_[ni], old_nodes_[ni], factor_alloc, pool_alloc, map, aval, scaling); // Update stats - int nrow = symb_.symb_[ni].nrow + old_nodes_[ni].ndelay_in; + ipc_ nrow = symb_.symb_[ni].nrow + old_nodes_[ni].ndelay_in; stats.maxfront = std::max(stats.maxfront, nrow); - int ncol = symb_.symb_[ni].ncol + old_nodes_[ni].ndelay_in; + ipc_ ncol = symb_.symb_[ni].ncol + old_nodes_[ni].ndelay_in; stats.maxsupernode = std::max(stats.maxsupernode, ncol); // Factorization @@ -241,7 +243,7 @@ private: NumericNode& node, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, - int* map, + ipc_* map, T const* aval, T const* scaling ) { @@ -254,35 +256,35 @@ private: for(auto* child=node.first_child; child!=NULL; child=child->next_child) { node.ndelay_in += child->ndelay_out; } - int nrow = snode.nrow + node.ndelay_in; - int ncol = snode.ncol + node.ndelay_in; + ipc_ nrow = snode.nrow + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Get space for node now we know it size using Fortran allocator + zero it*/ // NB L is nrow x ncol and D is 2 x ncol (but no D if posdef) - size_t ldl = align_lda(nrow); + size_t ldl = align_lda(nrow); size_t len = (ldl+2) * ncol; // +2 is for D node.lcol = FAPrecisionTraits::allocate(factor_alloc_precision, len); memset(node.lcol, 0, len*sizeof(T)); /* Get space for contribution block + (explicitly do not zero it!) 
*/ - long contrib_dimn = snode.nrow - snode.ncol; + longc_ contrib_dimn = snode.nrow - snode.ncol; node.contrib = (contrib_dimn > 0) ? PATraits::allocate(pool_alloc, contrib_dimn*contrib_dimn) : nullptr; /* Alloc + set perm for expected eliminations at this node (delays are set * when they are imported from children) */ node.perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; i= snode.ncol) k += node.ndelay_in; T rscale = scaling[ snode.rlist[r]-1 ]; T cscale = scaling[ snode.rlist[c]-1 ]; @@ -290,12 +292,12 @@ private: } } else { /* No scaling to apply */ - for(int i=0; i= snode.ncol) k += node.ndelay_in; node.lcol[k] = aval[src]; } @@ -306,30 +308,30 @@ private: /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; /* Handle delays - go to back of node * (i.e. become the last rows as in lower triangular format) */ - for(int i=0; indelay_out; i++) { + for(ipc_ i=0; indelay_out; i++) { // Add delayed rows (from delayed cols) T *dest = &node.lcol[delay_col*(ldl+1)]; - int lds = align_lda(csnode.nrow + child->ndelay_in); + ipc_ lds = align_lda(csnode.nrow + child->ndelay_in); T *src = &child->lcol[(child->nelim+i)*(lds+1)]; node.perm[delay_col] = child->perm[child->nelim+i]; - for(int j=0; jndelay_out-i; j++) { + for(ipc_ j=0; jndelay_out-i; j++) { dest[j] = src[j]; } // Add child's non-fully summed rows (from delayed cols) dest = node.lcol; src = &child->lcol[child->nelim*lds + child->ndelay_in +i*lds]; - for(int j=csnode.ncol; jcontrib) { - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; // NB: we handle contribution to contrib in assemble_post() if(c < snode.ncol) { // Contribution added to lcol - int ldd = align_lda(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node.lcol[c*ldd]; - for(int j=i; jndelay_in; - int n = snode.ncol + node->ndelay_in; + ipc_ m = snode.nrow + node->ndelay_in; + ipc_ n = snode.ncol + node->ndelay_in; size_t ldl = align_lda(m); T *lcol = node->lcol; T *d = &node->lcol[ n*ldl ]; - int *perm = node->perm; + ipc_ *perm = node->perm; /* Perform factorization */ //Verify verifier(m, n, perm, lcol, ldl); @@ -385,8 +387,8 @@ private: //verifier.verify(node->nelim, perm, lcol, ldl, d); if(m-n>0 && node->nelim>0) { - int nelim = node->nelim; - int ldld = align_lda(m-n); + ipc_ nelim = node->nelim; + ipc_ ldld = align_lda(m-n); T *ld = work.get_ptr(nelim*ldld); calcLD(m-n, nelim, &lcol[n], ldl, d, ld, ldld); host_gemm(OP_N, OP_T, m-n, m-n, nelim, @@ -397,7 +399,7 @@ private: /* Record information */ node->ndelay_out = n - node->nelim; stats.num_delay += node->ndelay_out; - for (int64_t j = m; j >= m-(node->nelim)+1; --j) { + for (longc_ j = m; j >= m-(node->nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -409,7 +411,7 @@ private: node->free_contrib(); } else if(node->nelim==0) { // FIXME: If we fix the above, we don't need this explict zeroing - long contrib_size = m-n; + longc_ contrib_size = m-n; memset(node->contrib, 0, contrib_size*contrib_size*sizeof(T)); } } @@ -418,35 +420,35 @@ private: SymbolicNode const& snode, NumericNode& node, PoolAllocator& pool_alloc, - int* map + ipc_* map ) { /* Initialise variables */ - int ncol = snode.ncol + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Add children */ if(node.first_child != NULL) { /* Build lookup 
vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; if(!child->contrib) continue; - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; // NB: only interested in contribution to generated element if(c >= snode.ncol) { // Contribution added to contrib - int ldd = snode.nrow - snode.ncol; + ipc_ ldd = snode.nrow - snode.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; - for(int j=i; j +#include "ssids_rip.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_SymbolicNode.hxx" -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif namespace spral { namespace ssids { namespace cpu { class SymbolicSubtree; @@ -33,11 +31,11 @@ class SmallLeafSymbolicSubtree { private: class Node { public: - int nrow; - int ncol; - int sparent; - int* rlist; - int lcol_offset; + ipc_ nrow; + ipc_ ncol; + ipc_ sparent; + ipc_* rlist; + ipc_ lcol_offset; }; public: @@ -76,35 +74,40 @@ public: * nlist[2*i+1] of the relevant supernode (as per nptr) of \f$ L \f$. * \param symb Underlying SymbolicSubtree for containing parttree. */ - SmallLeafSymbolicSubtree(int sa, int en, int part_offset, int const* sptr, int const* sparent, long const* rptr, int const* rlist, long const* nptr, long const* nlist, SymbolicSubtree const& symb) - : sa_(sa), en_(en), nnodes_(en-sa+1), parent_(sparent[part_offset+en]-1-part_offset), - nodes_(nnodes_), - rlist_(new int[rptr[part_offset+en+1]-rptr[part_offset+sa]], std::default_delete()), + SmallLeafSymbolicSubtree(ipc_ sa, ipc_ en, ipc_ part_offset, + ipc_ const* sptr, ipc_ const* sparent, + longc_ const* rptr, ipc_ const* rlist, + longc_ const* nptr, longc_ const* nlist, + SymbolicSubtree const& symb) + : sa_(sa), en_(en), nnodes_(en-sa+1), + parent_(sparent[part_offset+en]-1-part_offset), nodes_(nnodes_), + rlist_(new ipc_[rptr[part_offset+en+1]-rptr[part_offset+sa]], + std::default_delete()), nptr_(nptr), nlist_(nlist), symb_(symb) { /* Setup basic node information */ nfactor_ = 0; - int* newrlist = rlist_.get(); - for(int ni=sa; ni<=en; ++ni) { + ipc_* newrlist = rlist_.get(); + for(ipc_ ni=sa; ni<=en; ++ni) { nodes_[ni-sa].nrow = rptr[part_offset+ni+1] - rptr[part_offset+ni]; nodes_[ni-sa].ncol = sptr[part_offset+ni+1] - sptr[part_offset+ni]; nodes_[ni-sa].sparent = sparent[part_offset+ni]-sa-1; // sparent is Fortran indexed // FIXME: subtract ncol off rlist for elim'd vars nodes_[ni-sa].rlist = &newrlist[rptr[part_offset+ni]-rptr[part_offset+sa]]; nodes_[ni-sa].lcol_offset = nfactor_; - size_t ldl = align_lda(nodes_[ni-sa].nrow); + size_t ldl = align_lda(nodes_[ni-sa].nrow); nfactor_ += nodes_[ni-sa].ncol*ldl; } /* Construct rlist_ being offsets into parent node */ - for(int ni=sa; ni<=en; ++ni) { + for(ipc_ ni=sa; ni<=en; ++ni) { if(nodes_[ni-sa].ncol == nodes_[ni-sa].nrow) continue; // is root - int const* ilist = &rlist[rptr[part_offset+ni]-1]; // rptr is Fortran indexed + ipc_ const* ilist = &rlist[rptr[part_offset+ni]-1]; // rptr is Fortran indexed ilist += nodes_[ni-sa].ncol; // Skip eliminated vars - int pnode = sparent[part_offset+ni]-1; //Fortran indexed - int const* jlist = &rlist[rptr[pnode]-1]; // rptr is Fortran indexed - int const* jstart = jlist; - int *outlist = nodes_[ni-sa].rlist; - for(int i=nodes_[ni-sa].ncol; i nodes_; //< Nodes of this subtree. - std::shared_ptr rlist_; //< Row entries of this subtree. 
- long const* nptr_; //< Node mapping into nlist_. - long const* nlist_; //< Mapping from \f$ A \f$ to \f$ L \f$. + std::shared_ptr rlist_; //< Row entries of this subtree. + longc_ const* nptr_; //< Node mapping into nlist_. + longc_ const* nlist_; //< Mapping from \f$ A \f$ to \f$ L \f$. SymbolicSubtree const& symb_; //< Underlying parttree template +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { /** Symbolic representation of a node */ struct SymbolicNode { bool insmallleaf; - int idx; //< Index of node - int nrow; //< Number of rows - int ncol; //< Number of columns + ipc_ idx; //< Index of node + ipc_ nrow; //< Number of rows + ipc_ ncol; //< Number of columns SymbolicNode* first_child; //< Pointer to first child in linked list SymbolicNode* next_child; //< Pointer to second child in linked list - int const* rlist; //< Pointer to row lists - int num_a; //< Number of entries mapped from A to L - long const* amap; //< Pointer to map from A to L locations - int parent; //< index of parent node - std::vector contrib; //< index of expected contribution(s) + ipc_ const* rlist; //< Pointer to row lists + ipc_ num_a; //< Number of entries mapped from A to L + longc_ const* amap; //< Pointer to map from A to L locations + ipc_ parent; //< index of parent node + std::vector contrib; //< index of expected contribution(s) }; }}} /* end of namespace spral::ssids::cpu */ diff --git a/include/ssids_cpu_SymbolicSubtree.hxx b/include/ssids_cpu_SymbolicSubtree.hxx index 943f92587e..9407cf4f23 100644 --- a/include/ssids_cpu_SymbolicSubtree.hxx +++ b/include/ssids_cpu_SymbolicSubtree.hxx @@ -2,27 +2,29 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:00 GMT */ #pragma once #include #include +#include +#include "ssids_rip.hxx" #include "ssids_cpu_SmallLeafSymbolicSubtree.hxx" #include "ssids_cpu_SymbolicNode.hxx" -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif - namespace spral { namespace ssids { namespace cpu { /** Symbolic factorization of a subtree to be factored on the CPU */ class SymbolicSubtree { public: - SymbolicSubtree(int n, int sa, int en, int const* sptr, int const* sparent, long const* rptr, int const* rlist, long const* nptr, long const* nlist, int ncontrib, int const* contrib_idx, struct cpu_factor_options const& options) + SymbolicSubtree(ipc_ n, ipc_ sa, ipc_ en, ipc_ const* sptr, + ipc_ const* sparent, longc_ const* rptr, + ipc_ const* rlist, longc_ const* nptr, + longc_ const* nlist, ipc_ ncontrib, + ipc_ const* contrib_idx, + struct cpu_factor_options const& options) : n(n), nnodes_(en-sa), nodes_(nnodes_+1) { // Adjust sa to C indexing (en is not used except in nnodes_ init above) @@ -30,9 +32,9 @@ public: // FIXME: don't process nodes that are in small leaf subtrees /* Fill out basic details */ maxfront_ = 0; - for(int ni=0; ni(rptr[sa+ni+1] - rptr[sa+ni]); + nodes_[ni].nrow = static_cast(rptr[sa+ni+1] - rptr[sa+ni]); nodes_[ni].ncol = sptr[sa+ni+1] - sptr[sa+ni]; nodes_[ni].first_child = nullptr; nodes_[ni].next_child = nullptr; @@ -45,36 +47,36 @@ public: } nodes_[nnodes_].first_child = nullptr; // List of roots /* Build child linked lists */ - for(int ni=0; nifirst_child; parent->first_child = &nodes_[ni]; } /* Record contribution block inputs */ - for(int ci=0; ci(nodes_[ni].nrow)*nodes_[ni].ncol; /* Find small leaf subtrees */ // Count flops below each node - 
std::vector flops(nnodes_+1, 0); - for(int ni=0; ni flops(nnodes_+1, 0); + for(ipc_ ni=0; ni 0) // not a leaf! flops[ni] += options.small_subtree_threshold; - int parent = std::min(nodes_[ni].parent, nnodes_); + ipc_ parent = std::min(nodes_[ni].parent, nnodes_); flops[parent] += flops[ni]; } // Start at least node and work way up using parents until too large - for(int ni=0; ni= options.small_subtree_threshold) break; last = current; } @@ -83,27 +85,27 @@ public: small_leafs_.emplace_back( ni, last, sa, sptr, sparent, rptr, rlist, nptr, nlist, *this ); - for(int i=ni; i<=last; ++i) + for(ipc_ i=ni; i<=last; ++i) nodes_[i].insmallleaf = true; ni = last+1; // Skip to next node not in this subtree } } - SymbolicNode const& operator[](int idx) const { + SymbolicNode const& operator[](ipc_ idx) const { return nodes_[idx]; } - size_t get_factor_mem_est(precision_ multiplier) const { - size_t mem = n*sizeof(int) + (2*n+nfactor_)*sizeof(precision_); + size_t get_factor_mem_est(rpc_ multiplier) const { + size_t mem = n*sizeof(ipc_) + (2*n+nfactor_)*sizeof(rpc_); return std::max(mem, static_cast(mem*multiplier)); } template size_t get_pool_size() const { - return maxfront_*align_lda(maxfront_); + return maxfront_*align_lda(maxfront_); } public: - int const n; //< Maximum row index + ipc_ const n; //< Maximum row index private: - int nnodes_; + ipc_ nnodes_; size_t nfactor_; size_t maxfront_; std::vector nodes_; diff --git a/include/ssids_cpu_ThreadStats.hxx b/include/ssids_cpu_ThreadStats.hxx index 4948dced81..9673330dc6 100644 --- a/include/ssids_cpu_ThreadStats.hxx +++ b/include/ssids_cpu_ThreadStats.hxx @@ -2,19 +2,23 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:00 GMT */ + #pragma once #include #include +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace cpu { /** \brief SSIDS error/warning flags. 
* * Must match Fortran definitions in src/ssids/datatypes.f90 */ -enum Flag : int { +enum Flag : ipc_ { SUCCESS = 0, ERROR_SINGULAR = -5, @@ -29,11 +33,11 @@ enum Flag : int { */ class SingularError: public std::runtime_error { public: - SingularError(int col) + SingularError(ipc_ col) : std::runtime_error("Matrix is singular"), col(col) {} - int const col; + ipc_ const col; }; /** @@ -47,16 +51,16 @@ public: */ struct ThreadStats { Flag flag = Flag::SUCCESS; ///< Error flag for thread - int num_delay = 0; ///< Number of delays - int64_t num_factor = 0; ///< Number of entries in factors - int64_t num_flops = 0; ///< Number of floating point operations - int num_neg = 0; ///< Number of negative pivots - int num_two = 0; ///< Number of 2x2 pivots - int num_zero = 0; ///< Number of zero pivots - int maxfront = 0; ///< Maximum front size - int maxsupernode = 0; ///< Maximum supernode size - int not_first_pass = 0; ///< Number of pivots not eliminated in APP - int not_second_pass = 0; ///< Number of pivots not eliminated in APP or TPP + ipc_ num_delay = 0; ///< Number of delays + longc_ num_factor = 0; ///< Number of entries in factors + longc_ num_flops = 0; ///< Number of floating point operations + ipc_ num_neg = 0; ///< Number of negative pivots + ipc_ num_two = 0; ///< Number of 2x2 pivots + ipc_ num_zero = 0; ///< Number of zero pivots + ipc_ maxfront = 0; ///< Maximum front size + ipc_ maxsupernode = 0; ///< Maximum supernode size + ipc_ not_first_pass = 0; ///< Number of pivots not eliminated in APP + ipc_ not_second_pass = 0; ///< Number of pivots not eliminated in APP or TPP ThreadStats& operator+=(ThreadStats const& other); }; diff --git a/include/ssids_cpu_Workspace.hxx b/include/ssids_cpu_Workspace.hxx index f099fd50fe..b73bf29195 100644 --- a/include/ssids_cpu_Workspace.hxx +++ b/include/ssids_cpu_Workspace.hxx @@ -2,12 +2,15 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 08:30 GMT */ + #pragma once #include #include "spral_compat.hxx" // in case std::align not defined +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -16,11 +19,11 @@ namespace spral { namespace ssids { namespace cpu { * given size. 
*/ class Workspace { #if defined(__AVX512F__) - static int const align = 64; + static ipc_ const align = 64; #elif defined(__AVX__) - static int const align = 32; + static ipc_ const align = 32; #else - static int const align = 16; + static ipc_ const align = 16; #endif public: Workspace(size_t sz) diff --git a/include/ssids_cpu_cpu_iface.hxx b/include/ssids_cpu_cpu_iface.hxx index cd9c9a0b6a..f87dc6105e 100644 --- a/include/ssids_cpu_cpu_iface.hxx +++ b/include/ssids_cpu_cpu_iface.hxx @@ -2,40 +2,36 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 10:30 GMT */ + #pragma once #include - -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif - +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { -enum struct PivotMethod : int { +enum struct PivotMethod : ipc_ { app_aggressive = 1, app_block = 2, tpp = 3 }; -enum struct FailedPivotMethod : int { +enum struct FailedPivotMethod : ipc_ { tpp = 1, pass = 2 }; struct cpu_factor_options { - int print_level; + ipc_ print_level; bool action; - precision_ small; - precision_ u; - precision_ multiplier; - long small_subtree_threshold; - int cpu_block_size; + rpc_ small; + rpc_ u; + rpc_ multiplier; + longc_ small_subtree_threshold; + ipc_ cpu_block_size; PivotMethod pivot_method; FailedPivotMethod failed_pivot_method; }; @@ -44,14 +40,14 @@ struct cpu_factor_options { template size_t align_lda(size_t lda) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif static_assert(align % sizeof(T) == 0, "Can only align if T divides align"); - int const Talign = align / sizeof(T); + ipc_ const Talign = align / sizeof(T); return Talign*((lda-1)/Talign + 1); } diff --git a/include/ssids_cpu_factor.hxx b/include/ssids_cpu_factor.hxx index 611758ebd0..9310abd53a 100644 --- a/include/ssids_cpu_factor.hxx +++ b/include/ssids_cpu_factor.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT */ #pragma once @@ -16,6 +17,8 @@ #endif /* _OPENMP */ /* SPRAL headers */ + +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_SymbolicNode.hxx" @@ -40,7 +43,7 @@ #define ldlt_tpp_factor ldlt_tpp_factor_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif @@ -49,7 +52,7 @@ namespace spral { namespace ssids { namespace cpu { /* Factorize a node (indef) */ template void factor_node_indef( - int ni, // FIXME: remove post debug + ipc_ ni, // FIXME: remove post debug SymbolicNode const& snode, NumericNode &node, struct cpu_factor_options const& options, @@ -58,12 +61,12 @@ void factor_node_indef( PoolAlloc& pool_alloc ) { /* Extract useful information about node */ - int m = snode.nrow + node.ndelay_in; - int n = snode.ncol + node.ndelay_in; + ipc_ m = snode.nrow + node.ndelay_in; + ipc_ n = snode.ncol + node.ndelay_in; size_t ldl = align_lda(m); T *lcol = node.lcol; T *d = &node.lcol[ n*ldl ]; - int *perm = node.perm; + ipc_ *perm = node.perm; T *contrib = node.contrib; /* Perform factorization */ @@ -87,7 +90,7 @@ void factor_node_indef( /* Finish factorization worth 
simplistic code */ if(node.nelim < n) { - int nelim = node.nelim; + ipc_ nelim = node.nelim; if(options.pivot_method!=PivotMethod::tpp) stats.not_first_pass += n-nelim; // Only use TPP to finish off if we're a root node, it's not finishing @@ -104,8 +107,8 @@ void factor_node_indef( options.small, nelim, &lcol[nelim], ldl ); if(m-n>0 && node.nelim>nelim) { - int nelim2 = node.nelim - nelim; - int ldld = align_lda(m-n); + ipc_ nelim2 = node.nelim - nelim; + ipc_ ldld = align_lda(m-n); T *ld = work[omp_get_thread_num()].get_ptr(nelim2*ldld); calcLD( m-n, nelim2, &lcol[nelim*ldl+n], ldl, &d[2*nelim], ld, ldld @@ -132,7 +135,7 @@ void factor_node_indef( /* Record information */ node.ndelay_out = n - node.nelim; stats.num_delay += node.ndelay_out; - for (int64_t j = m; j >= m-(node.nelim)+1; --j) { + for (longc_ j = m; j >= m-(node.nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -144,7 +147,7 @@ void factor_node_indef( node.free_contrib(); } else if(node.nelim==0) { // FIXME: If we fix the above, we don't need this explict zeroing - long contrib_size = m-n; + longc_ contrib_size = m-n; memset(node.contrib, 0, contrib_size*contrib_size*sizeof(T)); } } @@ -158,14 +161,14 @@ void factor_node_posdef( ThreadStats& stats ) { /* Extract useful information about node */ - int m = snode.nrow; - int n = snode.ncol; - int ldl = align_lda(m); + ipc_ m = snode.nrow; + ipc_ n = snode.ncol; + ipc_ ldl = align_lda(m); T *lcol = node.lcol; T *contrib = node.contrib; /* Perform factorization */ - int flag; + ipc_ flag; cholesky_factor( m, n, lcol, ldl, beta, contrib, m-n, options.cpu_block_size, &flag ); @@ -178,7 +181,7 @@ void factor_node_posdef( /* Record information */ node.ndelay_out = 0; - for (int64_t j = m; j >= m-(node.nelim)+1; --j) { + for (longc_ j = m; j >= m-(node.nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -186,7 +189,7 @@ void factor_node_posdef( /* Factorize a node (wrapper) */ template void factor_node( - int ni, + ipc_ ni, SymbolicNode const& snode, NumericNode &node, struct cpu_factor_options const& options, diff --git a/include/ssids_cpu_kernels_SimdVec.hxx b/include/ssids_cpu_kernels_SimdVec.hxx index 58ebebada2..e3ac62ee3a 100644 --- a/include/ssids_cpu_kernels_SimdVec.hxx +++ b/include/ssids_cpu_kernels_SimdVec.hxx @@ -2,13 +2,17 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ + #pragma once #include #include #include +#include "ssids_rip.hxx" + #if defined(__AVX2__) || defined(__AVX__) #include #endif @@ -35,12 +39,12 @@ public: #if defined(__AVX2__) || defined(__AVX__) /// Length of underlying vector type - static const int vector_length = 4; /* this should probably be 8 !! */ + static const ipc_ vector_length = 4; /* this should probably be 8 !! */ /// Typedef for underlying vector type containing singles typedef __m256 simd_precision_type; #else /// Length of underlying vector type - static const int vector_length = 1; + static const ipc_ vector_length = 1; /// Typedef for underlying vector type containing floats typedef float simd_precision_type; #endif @@ -243,7 +247,7 @@ public: /// Returns a vector with all positions idx or above set to true, otherwise /// false. 
static - SimdVec gt_mask(int idx) { + SimdVec gt_mask(ipc_ idx) { #if defined(__AVX2__) || defined(__AVX__) const float avx_true = -std::numeric_limits::quiet_NaN(); const float avx_false = 0.0; @@ -265,7 +269,7 @@ public: /// Prints the vector (inefficient, use for debug only) void print() { - for(int i=0; i::quiet_NaN(); const double avx_false = 0.0; @@ -517,7 +521,7 @@ public: /// Prints the vector (inefficient, use for debug only) void print() { - for(int i=0; i #include +#include "ssids_rip.hxx" #include "ssids_contrib.h" #include "ssids_profile.hxx" #include "ssids_cpu_NumericNode.hxx" @@ -20,13 +22,11 @@ #define spral_ssids_contrib_free spral_ssids_contrib_free_sgl #define FAPrecisionTraits FASingleTraits #define factor_alloc_precision factor_alloc_single -#define precision float #else #define spral_ssids_contrib_get_data spral_ssids_contrib_get_data_double #define spral_ssids_contrib_free spral_ssids_contrib_free_dbl #define FAPrecisionTraits FADoubleTraits #define factor_alloc_precision factor_alloc_double -#define precision double #endif namespace spral { namespace ssids { namespace cpu { @@ -37,16 +37,16 @@ namespace spral { namespace ssids { namespace cpu { */ template inline -void asm_col(int n, int const* idx, T const* src, T* dest) { - int const nunroll = 4; - int n2 = nunroll*(n/nunroll); - for(int j=0; j -void add_a_block(int from, int to, NumericNode& node, T const* aval, +void add_a_block(ipc_ from, ipc_ to, NumericNode& node, T const* aval, T const* scaling) { SymbolicNode const& snode = node.symb; size_t ldl = node.get_ldl(); if(scaling) { /* Scaling to apply */ - for(int i=from; i= snode.ncol) k += node.ndelay_in; T rscale = scaling[ snode.rlist[r]-1 ]; T cscale = scaling[ snode.rlist[c]-1 ]; @@ -80,12 +80,12 @@ void add_a_block(int from, int to, NumericNode& node, T const* aval, } } else { /* No scaling to apply */ - for(int i=from; i= snode.ncol) k += node.ndelay_in; node.lcol[k] = aval[src]; } @@ -103,18 +103,18 @@ void add_a_block(int from, int to, NumericNode& node, T const* aval, * \param cache Length cm lookup vector. */ template -void assemble_expected(int from, int to, NumericNode& node, NumericNode const& cnode, MapVector const& map, int* cache) { +void assemble_expected(ipc_ from, ipc_ to, NumericNode& node, NumericNode const& cnode, MapVector const& map, ipc_* cache) { SymbolicNode const& csnode = cnode.symb; - int cm = csnode.nrow - csnode.ncol; - for(int j=from; j& node, Numeric * \param cache Length cm lookup vector. 
*/ template -void assemble_expected_contrib(int from, int to, NumericNode& node, NumericNode const& cnode, MapVector const& map, int* cache) { +void assemble_expected_contrib(ipc_ from, ipc_ to, NumericNode& node, NumericNode const& cnode, MapVector const& map, ipc_* cache) { SymbolicNode const& csnode = cnode.symb; - int cm = csnode.nrow - csnode.ncol; - int ncol = node.symb.ncol + node.ndelay_in; - for(int j=from; j= node.symb.ncol) { // Contribution added to contrib - int ldd = node.symb.nrow - node.symb.ncol; + ipc_ ldd = node.symb.nrow - node.symb.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; asm_col(cm-i, &cache[i], &src[i], dest); } @@ -155,7 +155,7 @@ template void assemble_pre( bool posdef, - int n, + ipc_ n, SymbolicNode const& snode, void** child_contrib, NumericNode& node, @@ -169,11 +169,11 @@ void assemble_pre( Profile::Task task_asm_pre("TA_ASM_PRE"); #endif /* Rebind allocators */ - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; typename FAPrecisionTraits::allocator_type factor_alloc_precision(factor_alloc); - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typename FAIntTraits::allocator_type factor_alloc_int(factor_alloc); - typedef typename std::allocator_traits::template rebind_traits PAIntTraits; + typedef typename std::allocator_traits::template rebind_traits PAIntTraits; typename PAIntTraits::allocator_type pool_alloc_int(pool_alloc); /* Count incoming delays and determine size of node */ @@ -182,21 +182,21 @@ void assemble_pre( node.ndelay_in += child->ndelay_out; } for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); node.ndelay_in += ndelay; } - int nrow = snode.nrow + node.ndelay_in; - int ncol = snode.ncol + node.ndelay_in; + ipc_ nrow = snode.nrow + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Get space for node now we know it size using Fortran allocator + zero it*/ // NB L is nrow x ncol and D is 2 x ncol (but no D if posdef) - size_t ldl = align_lda(nrow); + size_t ldl = align_lda(nrow); size_t len = posdef ? ldl * ncol // posdef : (ldl+2) * ncol; // indef (includes D) node.lcol = FAPrecisionTraits::allocate(factor_alloc_precision, len); @@ -209,18 +209,18 @@ void assemble_pre( /* Alloc + set perm for expected eliminations at this node (delays are set * when they are imported from children) */ node.perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; i( + auto map = std::unique_ptr( PAIntTraits::allocate(pool_alloc_int, n+1), map_deleter); - for(int i=0; isymb; /* Handle delays - go to back of node * (i.e. 
become the last rows as in lower triangular format) */ - for(int i=0; indelay_out; i++) { + for(ipc_ i=0; indelay_out; i++) { // Add delayed rows (from delayed cols) T *dest = &node.lcol[delay_col*(ldl+1)]; - int lds = align_lda(csnode.nrow + child->ndelay_in); + ipc_ lds = align_lda(csnode.nrow + child->ndelay_in); T *src = &child->lcol[(child->nelim+i)*(lds+1)]; node.perm[delay_col] = child->perm[child->nelim+i]; - for(int j=0; jndelay_out-i; j++) { + for(ipc_ j=0; jndelay_out-i; j++) { dest[j] = src[j]; } // Add child's non-fully summed rows (from delayed cols) dest = node.lcol; src = &child->lcol[child->nelim*lds + child->ndelay_in +i*lds]; - for(int j=csnode.ncol; jcontrib) { - int cm = csnode.nrow - csnode.ncol; - int const block_size = 256; // FIXME: make configurable? + ipc_ cm = csnode.nrow - csnode.ncol; + ipc_ const block_size = 256; // FIXME: make configurable? if(cm < block_size) { // Single block - int* cache = work[omp_get_thread_num()].get_ptr(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected(0, cm, node, *child, map, cache); } else { // Multiple blocks #pragma omp taskgroup - for(int iblk=0; iblk(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected(iblk, std::min(iblk+block_size,cm), node, *child, map, cache); #ifdef PROFILE @@ -318,32 +318,32 @@ void assemble_pre( } } /* Add any contribution block from other subtrees */ - for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + for(ipc_ contrib_idx : snode.contrib) { + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); - int* cache = work[omp_get_thread_num()].get_ptr(cn); - for(int j=0; j(cn); + for(ipc_ j=0; j(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node.lcol[c*ldd]; asm_col(cn-i, &cache[i], &src[i], dest); } @@ -369,7 +369,7 @@ template void assemble_post( - int n, + ipc_ n, SymbolicNode const& snode, void** child_contrib, NumericNode& node, @@ -377,36 +377,36 @@ void assemble_post( std::vector& work ) { /* Rebind allocators */ - typedef typename std::allocator_traits::template rebind_traits PAIntTraits; + typedef typename std::allocator_traits::template rebind_traits PAIntTraits; typename PAIntTraits::allocator_type pool_alloc_int(pool_alloc); /* Initialise variables */ - int ncol = snode.ncol + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Add children */ - int* map = nullptr; + ipc_* map = nullptr; if(node.first_child != NULL || snode.contrib.size() > 0) { /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ if(!map) map = PAIntTraits::allocate(pool_alloc_int, n+1); // FIXME: probably don't need to worry about first ncol? 
- for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; if(!child->contrib) continue; - int cm = csnode.nrow - csnode.ncol; - int const block_size = 256; + ipc_ cm = csnode.nrow - csnode.ncol; + ipc_ const block_size = 256; if(cm < block_size) { - int* cache = work[omp_get_thread_num()].get_ptr(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected_contrib(0, cm, node, *child, map, cache); } else { #pragma omp taskgroup - for(int iblk=0; iblk(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected_contrib(iblk, std::min(iblk+block_size,cm), node, *child, map, cache); #ifdef PROFILE @@ -429,25 +429,25 @@ void assemble_post( } } /* Add any contribution block from other subtrees */ - for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + for(ipc_ contrib_idx : snode.contrib) { + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); if(!cval) continue; // child was all delays, nothing to do - int* cache = work[omp_get_thread_num()].get_ptr(cn); - for(int j=0; j(cn); + for(ipc_ j=0; j= snode.ncol) { // Contribution added to contrib - int ldd = snode.nrow - snode.ncol; + ipc_ ldd = snode.nrow - snode.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; asm_col(cn-i, &cache[i], &src[i], dest); } diff --git a/include/ssids_cpu_kernels_block_ldlt.hxx b/include/ssids_cpu_kernels_block_ldlt.hxx index 79f9c59e39..e180680c2a 100644 --- a/include/ssids_cpu_kernels_block_ldlt.hxx +++ b/include/ssids_cpu_kernels_block_ldlt.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:30 GMT */ #pragma once @@ -16,27 +17,27 @@ namespace block_ldlt_internal { /** Swaps two columns of A */ /* NB: ldwork only well defined for c -void swap_cols(int idx1, int idx2, int n, T *a, int lda, T *ldwork, int *perm) { +template +void swap_cols(ipc_ idx1, ipc_ idx2, ipc_ n, T *a, ipc_ lda, T *ldwork, ipc_ *perm) { if(idx1==idx2) return; // noop /* Ensure wlog idx1 < idx2 */ if(idx1 > idx2) { - int temp = idx1; + ipc_ temp = idx1; idx1 = idx2; idx2 = temp; } /* Swap perm */ if(perm) { - int temp = perm[idx1]; + ipc_ temp = perm[idx1]; perm[idx1] = perm[idx2]; perm[idx2] = temp; } /* Swap ldwork */ if(ldwork) { - for(int c=0; c -void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, int &cloc) { +template +void find_maxloc(const ipc_ from, const T *a, ipc_ lda, T &bestv_out, ipc_ &rloc, ipc_ &cloc) { typedef SimdVec SimdVecT; /* Handle special cases: @@ -85,8 +86,8 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i BLOCK_SIZE % (2*SimdVecT::vector_length) != 0) { T bestv = -1.0; rloc = BLOCK_SIZE; cloc = BLOCK_SIZE; - for(int c=from; c bestv) { bestv = fabs(v); @@ -104,7 +105,7 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i // Define a union that lets us abuse T to store ints and still use // avx blend. 
union intT { - int i; + ipc_ i; T d; }; @@ -112,19 +113,19 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i SimdVecT bestv(-1.0); SimdVecT bestv2(-1.0); intT imax; - imax.i = std::numeric_limits::max(); + imax.i = std::numeric_limits::max(); SimdVecT bestr(imax.d); SimdVecT bestr2(imax.d); SimdVecT bestc(imax.d); SimdVecT bestc2(imax.d); // Loop over array at stride equal to vector length - for(int c=from; c bestv_out) { bestv_out = bv2[i]; rloc = br2[i].i + i; // NB rloc only stores base of vector, so need +i @@ -218,51 +219,51 @@ bool test_2x2(T a11, T a21, T a22, T &detpiv, T &detscale) { } /** Updates the trailing submatrix (2x2 case) */ -template -void update_2x2(int p, T *a, int lda, const T *ld) { - for(int c=p+2; c +void update_2x2(ipc_ p, T *a, ipc_ lda, const T *ld) { + for(ipc_ c=p+2; c -void update_1x1(int p, T *a, int lda, const T *ld) { +template +void update_1x1(ipc_ p, T *a, ipc_ lda, const T *ld) { #if 0 - for(int c=p+1; c::vector_length; - const int unroll=4; // How many iteration of loop we're doing + const ipc_ vlen = SimdVec::vector_length; + const ipc_ unroll=4; // How many iteration of loop we're doing // Handle case of small BLOCK_SIZE safely if(BLOCK_SIZE < vlen || BLOCK_SIZE%vlen != 0 || BLOCK_SIZE < unroll) { - for(int c=p+1; c ldvec( -ld[c] ); // NB minus so we can use fma below - for(int r=vlen*(c/vlen); r lvec = SimdVec::load_aligned(&a[p*lda+r]); SimdVec avec = SimdVec::load_aligned(&a[c*lda+r]); avec = fmadd(avec, lvec, ldvec); avec.store_aligned(&a[c*lda+r]); } } - for(int c=unroll*((p+1-1)/unroll+1); c ldvec0( -ld[c] ); // NB minus so we can use fma below SimdVec ldvec1( -ld[c+1] ); // NB minus so we can use fma below SimdVec ldvec2( -ld[c+2] ); // NB minus so we can use fma below SimdVec ldvec3( -ld[c+3] ); // NB minus so we can use fma below - for(int r=vlen*(c/vlen); r lvec = SimdVec::load_aligned(&a[p*lda+r]); SimdVec avec0 = SimdVec::load_aligned(&a[(c+0)*lda+r]); SimdVec avec1 = SimdVec::load_aligned(&a[(c+1)*lda+r]); @@ -286,16 +287,16 @@ void update_1x1(int p, T *a, int lda, const T *ld) { /** Factorize a square block without restricting pivots * Expects to be given a square block of size BLOCK_SIZE with numbers of * interest in bottom right part. */ -template -void block_ldlt(int from, int *perm, T *a, int lda, T *d, T *ldwork, - bool action, const T u, const T small, int *lperm=nullptr) { +template +void block_ldlt(ipc_ from, ipc_ *perm, T *a, ipc_ lda, T *d, T *ldwork, + bool action, const T u, const T small, ipc_ *lperm=nullptr) { using namespace block_ldlt_internal; /* Main loop */ - for(int p=from; p(p, a, lda, bestv, t, m); // Handle case where everything remaining is small @@ -306,9 +307,9 @@ void block_ldlt(int from, int *perm, T *a, int lda, T *d, T *ldwork, for(; p (p, t, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p]; lperm[p]=lperm[t]; lperm[t]=temp; } + if(lperm) { ipc_ temp=lperm[p]; lperm[p]=lperm[t]; lperm[t]=temp; } /* Divide through, preserving a copy */ T *work = &ldwork[p*BLOCK_SIZE]; - for(int r=p+1; r m by construction. 
Hence m>=p, t>=p+1 and swaps are safe */ swap_cols (p, m, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p]; lperm[p]=lperm[m]; lperm[m]=temp; } + if(lperm) { ipc_ temp=lperm[p]; lperm[p]=lperm[m]; lperm[m]=temp; } swap_cols (p+1, t, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p+1]; lperm[p+1]=lperm[t]; lperm[t]=temp; } + if(lperm) { ipc_ temp=lperm[p+1]; lperm[p+1]=lperm[t]; lperm[t]=temp; } /* Calculate 2x2 inverse */ T d11 = (a22*detscale)/detpiv; T d22 = (a11*detscale)/detpiv; T d21 = (-a21*detscale)/detpiv; /* Divide through, preserving a copy */ T *work = &ldwork[p*BLOCK_SIZE]; - for(int r=p+2; r #include +#include "ssids_rip.hxx" #include "ssids_cpu_kernels_common.hxx" #include "ssids_cpu_kernels_SimdVec.hxx" @@ -19,18 +22,18 @@ namespace spral { namespace ssids { namespace cpu { * Note this will mostly just fail if sizeof(T) doesn't divide into alignment. */ template -int offset_to_align(T* ptr) { +ipc_ offset_to_align(T* ptr) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif uintptr_t offset = align - (reinterpret_cast(ptr) % align); offset /= sizeof(T); if((reinterpret_cast(ptr+offset) % align) == 0) return offset; - else return std::numeric_limits::max(); + else return std::numeric_limits::max(); } /** Calculates LD from L and D. @@ -39,32 +42,35 @@ int offset_to_align(T* ptr) { * multiples of 32 bytes, so we can use AVX. */ template -void calcLD(int m, int n, T const* l, int ldl, T const* d, T* ld, int ldld) { +void calcLD(ipc_ m, ipc_ n, T const* l, ipc_ ldl, T const* d, T* ld, + ipc_ ldld) { typedef SimdVec SimdVecT; - for(int col=0; col #include "ssids_cpu_Workspace.hxx" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE #define ldlt_app_factor ldlt_app_factor_sgl @@ -24,19 +27,19 @@ namespace spral { namespace ssids { namespace cpu { template -int ldlt_app_factor(int m, int n, int *perm, T *a, int lda, T *d, T beta, - T* upd, int ldupd, struct cpu_factor_options const& options, +ipc_ ldlt_app_factor(ipc_ m, ipc_ n, ipc_ *perm, T *a, ipc_ lda, T *d, T beta, + T* upd, ipc_ ldupd, struct cpu_factor_options const& options, std::vector& work, Allocator const& alloc); template -void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx); +void ldlt_app_solve_fwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx); template -void ldlt_app_solve_diag(int n, T const* d, int nrhs, T* x, int ldx); +void ldlt_app_solve_diag(ipc_ n, T const* d, ipc_ nrhs, T* x, ipc_ ldx); template -void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx); +void ldlt_app_solve_bwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_ldlt_nopiv.hxx b/include/ssids_cpu_kernels_ldlt_nopiv.hxx index 348bbbb7f9..22e3d7a5f3 100644 --- a/include/ssids_cpu_kernels_ldlt_nopiv.hxx +++ b/include/ssids_cpu_kernels_ldlt_nopiv.hxx @@ -2,17 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:40 GMT */ + #pragma once +#include "ssids_rip.hxx" + #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_nopiv_factor ldlt_nopiv_factor_sgl #define ldlt_nopiv_solve_fwd ldlt_nopiv_solve_fwd_sgl #define ldlt_nopiv_solve_diag 
ldlt_nopiv_solve_diag_sgl #define ldlt_nopiv_solve_bwd ldlt_nopiv_solve_bwd_sgl #else -#define precision_ double #define ldlt_nopiv_factor ldlt_nopiv_factor_dbl #define ldlt_nopiv_solve_fwd ldlt_nopiv_solve_fwd_dbl #define ldlt_nopiv_solve_diag ldlt_nopiv_solve_diag_dbl @@ -21,12 +23,12 @@ namespace spral { namespace ssids { namespace cpu { -int ldlt_nopiv_factor(int m, int n, precision_* a, int lda, precision_* work); -void ldlt_nopiv_solve_fwd(int m, int n, precision_ const* a, int lda, - precision_ *x); -void ldlt_nopiv_solve_diag(int m, int n, precision_ const* a, int lda, - precision_ *x); -void ldlt_nopiv_solve_bwd(int m, int n, precision_ const* a, int lda, - precision_ *x); +ipc_ ldlt_nopiv_factor(ipc_ m, ipc_ n, rpc_* a, ipc_ lda, rpc_* work); +void ldlt_nopiv_solve_fwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); +void ldlt_nopiv_solve_diag(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); +void ldlt_nopiv_solve_bwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_ldlt_tpp.hxx b/include/ssids_cpu_kernels_ldlt_tpp.hxx index fdf95b85d3..3b66eccd0f 100644 --- a/include/ssids_cpu_kernels_ldlt_tpp.hxx +++ b/include/ssids_cpu_kernels_ldlt_tpp.hxx @@ -2,17 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:40 GMT */ + #pragma once +#include "ssids_rip.hxx" + #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_tpp_factor ldlt_tpp_factor_sgl #define ldlt_tpp_solve_fwd ldlt_tpp_solve_fwd_sgl #define ldlt_tpp_solve_diag ldlt_tpp_solve_diag_sgl #define ldlt_tpp_solve_bwd ldlt_tpp_solve_bwd_sgl #else -#define precision_ double #define ldlt_tpp_factor ldlt_tpp_factor_dbl #define ldlt_tpp_solve_fwd ldlt_tpp_solve_fwd_dbl #define ldlt_tpp_solve_diag ldlt_tpp_solve_diag_dbl @@ -21,14 +23,13 @@ namespace spral { namespace ssids { namespace cpu { -int ldlt_tpp_factor(int m, int n, int* perm, precision_* a, - int lda, precision_* d, - precision_* ld, int ldld, bool action, precision_ u, precision_ small, - int nleft=0, precision_ *aleft=nullptr, int ldleft=0); -void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx); -void ldlt_tpp_solve_diag(int n, precision_ const* d, precision_* x); -void ldlt_tpp_solve_bwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx); +ipc_ ldlt_tpp_factor(ipc_ m, ipc_ n, ipc_* perm, rpc_* a, ipc_ lda, rpc_* d, + rpc_* ld, ipc_ ldld, bool action, rpc_ u, rpc_ small, + ipc_ nleft=0, rpc_ *aleft=nullptr, ipc_ ldleft=0); +void ldlt_tpp_solve_fwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, ipc_ nrhs, + rpc_* x, ipc_ ldx); +void ldlt_tpp_solve_diag(ipc_ n, rpc_ const* d, rpc_* x); +void ldlt_tpp_solve_bwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, ipc_ nrhs, + rpc_* x, ipc_ ldx); }}} /* end of namespace spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_verify.hxx b/include/ssids_cpu_kernels_verify.hxx index f7704c2aac..6d43510013 100644 --- a/include/ssids_cpu_kernels_verify.hxx +++ b/include/ssids_cpu_kernels_verify.hxx @@ -2,33 +2,36 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:50 GMT */ + #pragma once #include #include "ssids_cpu_kernels_wrappers.hxx" +#include "ssids_rip.hxx" namespace spral { 
namespace ssids { namespace cpu { namespace verify_internal { template -void calcLD(int m, int n, T const* lcol, int ldl, T const* d, T* ld) { - for(int j=0; j class Verify { public: - Verify(int m, int n, int const* perm, T const* a, int lda) + Verify(ipc_ m, ipc_ n, ipc_ const* perm, T const* a, ipc_ lda) : m_(m), n_(n), lda_(m), a_(m*n), perm_(n) { // Take a copy - for(int j=0; j= c) { if(std::abs(a_[c*lda_+r] - ldlt[j*nelim+i]) > 1e-10) { printf("Mismatch1 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, @@ -128,10 +131,10 @@ public: ld, nelim, 0.0, below, m_-nelim ); // rows nelim:n may be permuted - for(int j=0; j= c) { if(std::abs(a_[c*lda_+r] - below[j*(m_-nelim)+i-nelim]) > 1e-10) { printf("Mismatch2 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, @@ -150,10 +153,10 @@ public: } } // rows nelim:n are only column permuted - for(int j=0; j 1e-10) { printf("Mismatch3 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, a_[c*lda_+r], i, j, below[j*(m_-nelim)+i-nelim], @@ -172,11 +175,11 @@ public: } private: - int m_; - int n_; - int lda_; + ipc_ m_; + ipc_ n_; + ipc_ lda_; std::vector a_; - std::vector perm_; + std::vector perm_; }; diff --git a/include/ssids_cpu_kernels_wrappers.hxx b/include/ssids_cpu_kernels_wrappers.hxx index a57adcf3ad..a5afc9c750 100644 --- a/include/ssids_cpu_kernels_wrappers.hxx +++ b/include/ssids_cpu_kernels_wrappers.hxx @@ -2,10 +2,15 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:00 GMT */ + #pragma once + #include + #include "ssids_cpu_kernels_common.hxx" +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -57,53 +62,53 @@ void host_trsm(enum spral::ssids::cpu::side side, int m, int n, T alpha, const T* a, int lda, T* b, int ldb); -/* _GEMM */ +/* _GEMM_64 */ template void host_gemm_64(enum spral::ssids::cpu::operation transa, enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, T alpha, const T* a, - int64_t lda, const T* b, int64_t ldb, T beta, - T* c, int64_t ldc); + longc_ m, longc_ n, longc_ k, T alpha, const T* a, + longc_ lda, const T* b, longc_ ldb, T beta, + T* c, longc_ ldc); -/* _GEMV */ +/* _GEMV_64 */ template void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, T alpha, const T* a, int64_t lda, - const T* x, int64_t incx, T beta, T* y, int64_t incy); + longc_ m, longc_ n, T alpha, const T* a, longc_ lda, + const T* x, longc_ incx, T beta, T* y, longc_ incy); -/* _POTRF */ +/* _POTRF_64 */ template -int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, int64_t n, - T* a, int64_t lda); +longc_ lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, longc_ n, + T* a, longc_ lda); -/* _SYTRF - Bunch-Kaufman factorization */ +/* _SYTRF_64 - Bunch-Kaufman factorization */ template -int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, T* a, int64_t lda, int64_t* ipiv, - T* work, int64_t lwork); +longc_ lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, + longc_ n, T* a, longc_ lda, longc_* ipiv, + T* work, longc_ lwork); -/* _SYRK */ +/* _SYRK_64 */ template void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, T alpha, const T* a, int64_t lda, - T beta, T* c, int64_t ldc); + longc_ n, longc_ k, T alpha, const T* a, longc_ lda, + T beta, T* c, longc_ ldc); -/* _TRSV */ +/* _TRSV_64 */ template void host_trsv_64(enum spral::ssids::cpu::fillmode 
uplo, enum spral::ssids::cpu::operation trans, enum spral::ssids::cpu::diagonal diag, - int64_t n, const T* a, int64_t lda, T* x, int64_t incx); + longc_ n, const T* a, longc_ lda, T* x, longc_ incx); -/* _TRSM */ +/* _TRSM_64 */ template void host_trsm_64(enum spral::ssids::cpu::side side, enum spral::ssids::cpu::fillmode uplo, enum spral::ssids::cpu::operation transa, enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, T alpha, const T* a, int64_t lda, - T* b, int64_t ldb); + longc_ m, longc_ n, T alpha, const T* a, longc_ lda, + T* b, longc_ ldb); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_gpu_kernels_datatypes.h b/include/ssids_gpu_kernels_datatypes.h index f4dc45daa1..78cadbc884 100644 --- a/include/ssids_gpu_kernels_datatypes.h +++ b/include/ssids_gpu_kernels_datatypes.h @@ -1,29 +1,38 @@ +/** \file + * \copyright 2016 The Science and Technology Facilities Council (STFC) + * \licence BSD licence, see LICENCE file for details + * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:30 GMT + */ + #define MAX_CUDA_BLOCKS 65535 +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace gpu { /** \brief Represents work for a a node to be factorized * (as part of a batched call). */ struct multinode_fact_type { - int nrows; ///< number of rows in node - int ncols; ///< number of columns in node - double *lval; ///< pointer to factors L - double *ldval; ///< pointer to workspace for storing L*D - double *dval; ///< pointer to factors D - int offp; ///< offset into permutation vector for this node - int ib; ///< ??? - int jb; ///< ??? - int done; ///< number of columns sucessfully factorized? - int rght; ///< ??? - int lbuf; ///< ??? + ipc_ nrows; ///< number of rows in node + ipc_ ncols; ///< number of columns in node + rpc_ *lval; ///< pointer to factors L + rpc_ *ldval; ///< pointer to workspace for storing L*D + rpc_ *dval; ///< pointer to factors D + ipc_ offp; ///< offset into permutation vector for this node + ipc_ ib; ///< ??? + ipc_ jb; ///< ??? + ipc_ done; ///< number of columns sucessfully factorized? + ipc_ rght; ///< ??? + ipc_ lbuf; ///< ??? }; /** \brief Statistics to be returned to user. */ struct cuda_stats { - int num_two; ///< Number of 2x2 pivots - int num_neg; ///< Number of negative pivots - int num_zero; ///< Number of zero pivots + ipc_ num_two; ///< Number of 2x2 pivots + ipc_ num_neg; ///< Number of negative pivots + ipc_ num_zero; ///< Number of zero pivots }; }}} /* namespace spral::ssids::gpu */ diff --git a/include/ssids_gpu_kernels_dtrsv.h b/include/ssids_gpu_kernels_dtrsv.h index 7c74696973..07932a7179 100644 --- a/include/ssids_gpu_kernels_dtrsv.h +++ b/include/ssids_gpu_kernels_dtrsv.h @@ -6,12 +6,15 @@ Other Contributors: Christopher Munro (STFC) Philippe Vandermersch (NVIDIA) All rights reserved. +Current version - GALAHAD 4.3 - 2024-02-03 AT 15:15 GMT This file is a modified version of the ASEArch blas version. It has had a lookup capability added to allow execution on multiple small matrices simulateously. */ +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace gpu { /** \brief Return value at address vptr using volatile load. */ @@ -43,8 +46,8 @@ __inline__ __device__ T_ELEM loadVolatile(const volatile T_ELEM *const vptr) #endif /** \brief Return physical SM id as per special register %smid. 
*/ -unsigned int __inline__ __device__ getSM(void) { - volatile unsigned int output; +uipc_ __inline__ __device__ getSM(void) { + volatile uipc_ output; asm volatile("mov.u32 %0,%smid;" : "=r"(output) : ); return output; } @@ -60,13 +63,13 @@ unsigned int __inline__ __device__ getSM(void) { * \param val This thread's element of x. * \sa dblkSolve_trans() */ -template -void __device__ dblkSolve(const volatile T_ELEM *const minus_a, const int lda, T_ELEM &val) +template +void __device__ dblkSolve(const volatile T_ELEM *const minus_a, const ipc_ lda, T_ELEM &val) { volatile T_ELEM __shared__ xs; #pragma unroll 16 - for (int i=0; i -void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const int lda, T_ELEM &val) +template +void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const ipc_ lda, T_ELEM &val) { volatile T_ELEM __shared__ xs; #pragma unroll 16 - for (int i=blkSize-1; i>=0; --i) { + for (ipc_ i=blkSize-1; i>=0; --i) { if (threadIdx.x==i) { if (!ISUNIT) val *= minus_a[i*lda+i]; xs = val; @@ -116,21 +119,21 @@ void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const int * \param cache Location to copy to, leading dimension nbi. * \sa tocache_small() */ -template -void __device__ tocache(const unsigned int tid, const volatile T_ELEM *const a, const int lda, volatile T_ELEM *const cache) +template +void __device__ tocache(const uipc_ tid, const volatile T_ELEM *const a, const ipc_ lda, volatile T_ELEM *const cache) { - const int x = tid % nbi; - const int y = tid / nbi; - const int ty = ntid/nbi; + const ipc_ x = tid % nbi; + const ipc_ y = tid / nbi; + const ipc_ ty = ntid/nbi; if (!TRANS) { - for (int i=0; i(i+y)) cache[(i+y)*nbi+x] = -a[(i+y)*lda+x]; else if ((i+y)(i+y)) cache[(i+y)+nbi*x] = -a[(i+y)*lda+x]; else if ((i+y) -void __device__ tocache_small(const int n, const unsigned int tid, const volatile T_ELEM *const a, int lda, volatile T_ELEM *const cache) +template +void __device__ tocache_small(const ipc_ n, const uipc_ tid, const volatile T_ELEM *const a, ipc_ lda, volatile T_ELEM *const cache) { - const int x = tid % nbi; - const int y = tid / nbi; - const int ty = ntid/nbi; + const ipc_ x = tid % nbi; + const ipc_ y = tid / nbi; + const ipc_ ty = ntid/nbi; if (!TRANS) { - for (int i=0; i=nbi) continue; // past end of cache array if ((i+y)(i+y) && x=nbi) continue; // past end of cache array if ((i+y)(i+y) && x +template void __device__ slv21(const volatile T_ELEM *const x11, volatile T_ELEM *const a21, const volatile T_ELEM *const l22, volatile T_ELEM *const xsarray) { - const int tid = threadsx*threadIdx.y+threadIdx.x; - const int ntid = threadsx*threadsy; - const int x = (n>0) ? tid % n : 0; - const int y = (n>0) ? tid / n : 0; - const int ty = (n>0) ? ntid/n : 1; + const ipc_ tid = threadsx*threadIdx.y+threadIdx.x; + const ipc_ ntid = threadsx*threadsy; + const ipc_ x = (n>0) ? tid % n : 0; + const ipc_ y = (n>0) ? tid / n : 0; + const ipc_ ty = (n>0) ? 
ntid/n : 1; /* Note: as different threads within a warp can work on different columns, we need different xs variables (one per col being worked on) */ @@ -245,19 +248,19 @@ void __device__ slv21(const volatile T_ELEM *const x11, volatile T_ELEM *const a if (y>n) return; #pragma unroll - for (int j=0; j=n) continue; /* construct col (j+y) of -L_21 X_11 */ T_ELEM val = 0; - for (int k=j; k -void __device__ transpose(const int n, const volatile T_ELEM *const a, volatile T_ELEM *const at) +template +void __device__ transpose(const ipc_ n, const volatile T_ELEM *const a, volatile T_ELEM *const at) { if (threadIdx.y==0 && threadIdx.x +template void __device__ invert(volatile T_ELEM *const a, volatile T_ELEM /*__shared__*/ *const xsarray) { if (n==2) { @@ -355,7 +358,7 @@ void __device__ invert(volatile T_ELEM *const a, volatile T_ELEM /*__shared__*/ * solution on output * \param partSum workspace??? */ -template +template void __device__ slvinv(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum) { a += threadIdx.y*n+threadIdx.x; @@ -369,14 +372,14 @@ void __device__ slvinv(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELE /* matrix-vector multiply for solution */ if (threadIdx.y -void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum, const int row) +template +void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum, const ipc_ row) { a += threadIdx.y*n+threadIdx.x; xshared += threadIdx.y; @@ -408,7 +411,7 @@ void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, /* matrix-vector multiply for solution */ val=0; if (threadIdx.x +template #ifndef DOXYGEN_SHOULD_SKIP_THIS __launch_bounds__(threadsx*threadsy, 4) #endif /* DOXYGEN_SHOULD_SKIP_THIS */ -void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, int *sync +void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, ipc_ *sync #ifdef TIMING , struct trsv_times *times #endif ) { lookup += blockIdx.x; - const int n = lookup->n; + const ipc_ n = lookup->n; const T_ELEM *const a = lookup->a; - const int lda = lookup->lda; + const ipc_ lda = lookup->lda; xglobal += lookup->x_offset; sync += lookup->sync_offset; #ifdef TIMING - const unsigned int sa = clock(); + const uipc_ sa = clock(); #endif - const int nblk = (n + (nb-1)) / nb; - const int tid = threadsx*threadIdx.y + threadIdx.x; + const ipc_ nblk = (n + (nb-1)) / nb; + const ipc_ tid = threadsx*threadIdx.y + threadIdx.x; /* sync components: * sync[0] => nblk - Last ready column [init to -1] @@ -493,14 +496,14 @@ void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, T_ELEM ps[nb/threadsy]; /* Get row handled by this block */ - const int row = nblk-1 - nextRow(&sync[1]); + const ipc_ row = nblk-1 - nextRow(&sync[1]); const bool short_row = ((n-1)/nb==row && n%nb!=0); /* requires special handling */ if (row!=nblk-1) { const T_ELEM *const aval = &a[(row*nb+threadIdx.x)*lda+(row+1)*nb+threadIdx.y]; #pragma unroll - for (int j=0; jrow+1; --col) { + for(ipc_ j=0; jrow+1; --col) { /* apply update from block (row, col) */ const T_ELEM *const aval = &a[(row*nb+threadIdx.y)*lda + col*nb+threadIdx.x]; - T_ELEM *const xg = &(xglobal[int(col*nb)]); + T_ELEM *const xg = &(xglobal[ipc_(col*nb)]); wait_until_ge(tid, &sync[0], nblk-1-col, &col_done); // Wait for diagonal block to be done T_ELEM xl; if (col=i && 
threadIdx.x(cache, xshared, val, partSum, row); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.xsm = getSM(); @@ -673,23 +676,23 @@ void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, * \param sync sync array to offset into. * \param lookup batch lookup array. */ -template +template #ifndef DOXYGEN_SHOULD_SKIP_THIS __launch_bounds__(threadsx*threadsy, 4) #endif /* DOXYGEN_SHOULD_SKIP_THIS */ /* Note: setting above occupany to 5 causes random errors on large problems: suspect compiler bug */ -void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ sync, struct trsv_lookup *lookup) +void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, ipc_ *__restrict__ sync, struct trsv_lookup *lookup) { lookup += blockIdx.x; - const int n = lookup->n; + const ipc_ n = lookup->n; const T_ELEM *const a = lookup->a; - const int lda = lookup->lda; + const ipc_ lda = lookup->lda; xglobal += lookup->x_offset; sync += lookup->sync_offset; - const int incx=1; + const ipc_ incx=1; - const int tid = threadsx*threadIdx.y + threadIdx.x; + const ipc_ tid = threadsx*threadIdx.y + threadIdx.x; /* sync components: * sync[0] => Last ready column [init to -1] @@ -704,14 +707,14 @@ void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ syn if (incx<0) xglobal+=(1-n)*incx; /* Get row handled by this block */ - const int row = nextRow(&sync[1]); + const ipc_ row = nextRow(&sync[1]); const bool short_row = ((n-1)/nb==row && n%nb!=0); /* requires special handling */ if (row!=0) { const T_ELEM *const aval = &a[((row-1)*nb+threadIdx.y)*lda+row*nb+threadIdx.x]; #pragma unroll - for (int j=0; j=n%nb) val = 0.0; @@ -785,14 +788,14 @@ void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ syn slvinv(cache, xlocal, val, partSum); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x //#define PROFILE @@ -56,7 +60,7 @@ public: * \param name Predefined name of task, as setup in Profile::init(). * \param thread Optional thread number, otherwise use best guess. */ - Task(char const* name, int thread=Profile::guess_core()) + Task(char const* name, ipc_ thread=Profile::guess_core()) : name(name), thread(thread), t1(Profile::now()) {} @@ -73,7 +77,7 @@ public: private: char const* name; //< Name of task, one defined in Profile::init(). - int thread; //< Thread of task. + ipc_ thread; //< Thread of task. double t1; //< Start time of task. }; @@ -83,7 +87,7 @@ public: * \param thread Optional thread number, otherwise use best guess. */ static - void setState(char const* name, int thread=Profile::guess_core()) { + void setState(char const* name, ipc_ thread=Profile::guess_core()) { #if defined(PROFILE) && defined(HAVE_GTG) double t = Profile::now(); ::setState(t, "ST_TASK", Profile::get_thread_name(thread), name); @@ -108,7 +112,7 @@ public: * \param thread Optional thread number, otherwise use best guess. */ static - void setNullState(int thread=Profile::guess_core()) { + void setNullState(ipc_ thread=Profile::guess_core()) { setState("0", thread); } @@ -120,7 +124,7 @@ public: */ static void addEvent(char const* type, char const*val, - int thread=Profile::guess_core()) { + ipc_ thread=Profile::guess_core()) { #if defined(PROFILE) && defined(HAVE_GTG) ::addEvent(now(), type, get_thread_name(thread), val); #endif @@ -132,8 +136,8 @@ public: * \note Times are all measured from the end of this subroutine. 
*/ static - // void init(int nregions, spral::hw_topology::NumaRegion* regions) { - void init(int nnodes, spral::hw_topology::NumaRegion* nodes) { + // void init(ipc_ nregions, spral::hw_topology::NumaRegion* regions) { + void init(ipc_ nnodes, spral::hw_topology::NumaRegion* nodes) { #if defined(PROFILE) && defined(HAVE_GTG) // Initialise profiling setTraceType(PAJE); @@ -142,23 +146,23 @@ public: addContType("CT_NODE", "0", "Node"); addContType("CT_THREAD", "CT_NODE", "Thread"); addContType("CT_GPU", "CT_NODE", "GPU"); - // int nnodes = 0; + // ipc_ nnodes = 0; // spral::hw_topology::NumaRegion* nodes; if (!nodes) spral_hw_topology_guess(&nnodes, &nodes); - int core_idx=0; - for(int node=0; node + +/* real precision employed */ + +#ifdef SPRAL_SINGLE +#define rpc_ float +#else +#define rpc_ double +#endif + +/* integer storage employed */ + +#ifdef INTEGER_64 +#define ipc_ int64_t +#define uipc_ uint64_t +#else +#define ipc_ int +#define uipc_ unsigned int +#endif + +/* generic storage */ + +#define longc_ int64_t diff --git a/meson.build b/meson.build index ef2550a95e..24f3ea225c 100644 --- a/meson.build +++ b/meson.build @@ -293,8 +293,8 @@ endif # Compile GALAHAD with 64-bit integer if galahad_int64 - extra_args_single += ['-DGALAHAD_64BIT_INTEGER', '-DSPRAL_64BIT_INTEGER'] - extra_args_double += ['-DGALAHAD_64BIT_INTEGER', '-DSPRAL_64BIT_INTEGER'] + extra_args_single += '-DINTEGER_64' + extra_args_double += '-DINTEGER_64' endif # Sources diff --git a/src/dum/pastixf_enums.F90 b/src/dum/pastixf_enums.F90 index 5630a80048..71794b7cfe 100644 --- a/src/dum/pastixf_enums.F90 +++ b/src/dum/pastixf_enums.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.1 - 2022-10-25 AT 16:25 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:25 GMT. !-*- G A L A H A D - D U M M Y P A S T I X F _ E N U M S M O D U L E -*- @@ -7,7 +7,7 @@ MODULE pastixf_enums USE spmf_enums USE iso_c_binding, ONLY : c_double, c_int, c_ptr, c_int32_t, c_int64_t -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: pastix_int_t = c_int64_t #else INTEGER, PARAMETER :: pastix_int_t = c_int32_t diff --git a/src/dum/spmf_enums.F90 b/src/dum/spmf_enums.F90 index 75a6aeb65f..d68f29d021 100644 --- a/src/dum/spmf_enums.F90 +++ b/src/dum/spmf_enums.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-17 AT 07:30 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT. !-*-*- G A L A H A D - D U M M Y S P M F _ E N U M S M O D U L E -*-*- @@ -7,7 +7,7 @@ MODULE spmf_enums USE iso_c_binding, ONLY : c_float, c_double, c_ptr, & c_int, c_int32_t, c_int64_t -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: spm_int_t = c_int64_t #else INTEGER, PARAMETER :: spm_int_t = c_int32_t diff --git a/src/forthcoming/colt/colt.F90 b/src/forthcoming/colt/colt.F90 index 4d7c3a09e6..d025fd753d 100644 --- a/src/forthcoming/colt/colt.F90 +++ b/src/forthcoming/colt/colt.F90 @@ -3148,7 +3148,7 @@ END SUBROUTINE eval_HOCPRODS / REAL( n_points - 1, KIND = rp_ ) ) * ( t_upper - t_lower ) nlp%X( : nlp%n ) = zero - nlp%X( 2 ) = one + nlp%X( 1 ) = inform%target IF ( data%printd ) THEN WRITE( data%out, "( A, ' X ', /, ( 5ES12.4 ) )" ) & prefix, nlp%X( : nlp%n ) diff --git a/src/kinds/kinds.F90 b/src/kinds/kinds.F90 index b685638d4c..e149e60dec 100644 --- a/src/kinds/kinds.F90 +++ b/src/kinds/kinds.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-26 AT 11:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:20 GMT. #include "galahad_modules.h" @@ -41,7 +41,7 @@ MODULE GALAHAD_KINDS ! 
integer and logical kinds (replace the latter in fortran 2023) -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: ip_ = INT64 INTEGER, PARAMETER :: ipc_ = C_INT64_T #else diff --git a/src/lancelot/makemaster b/src/lancelot/makemaster index 8c564af4fd..673102afa6 100644 --- a/src/lancelot/makemaster +++ b/src/lancelot/makemaster @@ -1,14 +1,14 @@ # Main body of the LANCELOT B installation makefile under GALAHAD # N. Gould and Ph. L. Toint. -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 else ifeq "$(PRECIS)" "single" DPREC = -DGALAHAD_SINGLE else diff --git a/src/lapack/rebuild.F90 b/src/lapack/rebuild.F90 index 29d5e18b2f..b907e2c156 100644 --- a/src/lapack/rebuild.F90 +++ b/src/lapack/rebuild.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-29 AT 09:45 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:25 GMT. ! read a file containg a subset of the reference blas, lapack, etc ! written in fortran 77, and output a multi-precision version capable @@ -117,14 +117,14 @@ PROGRAM BUILD DO i = 1, 4 SELECT CASE ( i ) CASE( 1 ) - WRITE( hout, "( '#ifdef GALAHAD_64BIT_INTEGER', /, & + WRITE( hout, "( '#ifdef INTEGER_64', /, & & '#define GALAHAD_', A, '_interface GALAHAD_', A, & & '_interface_64', /, & - & '#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER')" ) urefs, urefs + & '#ifdef NO_UNDERSCORE_INTEGER_64')" ) urefs, urefs CASE( 2 ) - WRITE( hout, "( '#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER' )" ) + WRITE( hout, "( '#elif DOUBLE_UNDERSCORE_INTEGER_64' )" ) CASE( 3 ) - WRITE( hout, "( '#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER' )" ) + WRITE( hout, "( '#elif NO_SYMBOL_INTEGER_64' )" ) CYCLE CASE( 4 ) WRITE( hout, "( '#else' )" ) diff --git a/src/makedefs/definitions b/src/makedefs/definitions index f5da20f02f..53d30428de 100644 --- a/src/makedefs/definitions +++ b/src/makedefs/definitions @@ -1,17 +1,17 @@ # Standard GALAHAD makefile definitions # Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 # makefile shell SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 HSL_PRECIS = s else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 HSL_PRECIS = d else ifeq "$(PRECIS)" "single" DPREC = -DGALAHAD_SINGLE diff --git a/src/makedefs/hsl_definitions b/src/makedefs/hsl_definitions index d439ae3a3b..07169aa1e6 100644 --- a/src/makedefs/hsl_definitions +++ b/src/makedefs/hsl_definitions @@ -1,16 +1,16 @@ # Standard GALAHAD HSL makefile definitions # Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 HSL_PRECIS = s INTEGER = 64bit else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 HSL_PRECIS = d INTEGER = 64bit else ifeq "$(PRECIS)" "single" diff --git a/src/spral/makemaster b/src/spral/makemaster index 0c52b67660..e9a6f44b90 100644 --- a/src/spral/makemaster +++ b/src/spral/makemaster @@ -3,14 +3,14 @@ # available under a BSD licence as part of GALAHAD 
# Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DSPRAL_SINGLE -DSPRAL_64BIT_INTEGER + DPREC = -DSPRAL_SINGLE -DINTEGER_64 else ifeq "$(PRECIS)" "double_64" - DPREC = -DSPRAL_DOUBLE -DSPRAL_64BIT_INTEGER + DPREC = -DSPRAL_DOUBLE -DINTEGER_64 else ifeq "$(PRECIS)" "single" DPREC = -DSPRAL_SINGLE else diff --git a/src/spral/spral_kinds.F90 b/src/spral/spral_kinds.F90 index 6cf3eb6b40..65c0a8a895 100644 --- a/src/spral/spral_kinds.F90 +++ b/src/spral/spral_kinds.F90 @@ -1,6 +1,6 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-26 AT 11:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT. -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_double spral_kinds_double_64 #define SPRAL_KINDS_single spral_kinds_single_64 #endif @@ -44,7 +44,7 @@ MODULE SPRAL_KINDS ! integer kinds -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: ip_ = INT64 INTEGER, PARAMETER :: ipc_ = C_INT64_T #else diff --git a/src/ssids/C/ssids_ciface.F90 b/src/ssids/C/ssids_ciface.F90 index a3c8199a28..ba0a92577c 100644 --- a/src/ssids/C/ssids_ciface.F90 +++ b/src/ssids/C/ssids_ciface.F90 @@ -1,7 +1,7 @@ -! THIS VERSION: GALAHAD 4.1 - 2023-01-25 AT 09:00 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:35 GMT. #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision SPRAL_KINDS_single_64 #define SPRAL_SSIDS_precision_ciface SPRAL_SSIDS_single_ciface_64 #define SPRAL_SSIDS_types_precision spral_ssids_types_single_64 @@ -13,7 +13,7 @@ #define SPRAL_SSIDS_inform_precision spral_ssids_inform_single #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision SPRAL_KINDS_double_64 #define SPRAL_SSIDS_precision_ciface SPRAL_SSIDS_double_ciface_64 #define SPRAL_SSIDS_types_precision spral_ssids_types_double_64 diff --git a/src/ssids/NumericSubtree.cxx b/src/ssids/NumericSubtree.cxx index e6eb594df1..ed6a89695b 100644 --- a/src/ssids/NumericSubtree.cxx +++ b/src/ssids/NumericSubtree.cxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 09:15 GMT */ + #include "ssids_cpu_NumericSubtree.hxx" #include @@ -11,9 +13,9 @@ #include "spral_omp.hxx" #include "ssids_cpu_AppendAlloc.hxx" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define spral_ssids_cpu_create_num_subtree \ spral_ssids_cpu_create_num_subtree_sgl #define spral_ssids_cpu_destroy_num_subtree \ @@ -35,7 +37,6 @@ #define spral_ssids_cpu_subtree_free_contrib \ spral_ssids_cpu_subtree_free_contrib_sgl #else -#define precision_ double #define spral_ssids_cpu_create_num_subtree \ spral_ssids_cpu_create_num_subtree_dbl #define spral_ssids_cpu_destroy_num_subtree \ @@ -69,7 +70,7 @@ typedef float T; #else typedef double T; #endif -const int PAGE_SIZE = 8*1024*1024; // 8MB +const ipc_ PAGE_SIZE = 8*1024*1024; // 8MB typedef NumericSubtree> NumericSubtreePosdef; typedef NumericSubtree> NumericSubtreeIndef; @@ -80,13 +81,14 @@ extern "C" void* spral_ssids_cpu_create_num_subtree( bool posdef, void const* symbolic_subtree_ptr, - const precision_ *const aval, // Values of A - const precision_ *const scaling, // Scaling vector (NULL if none) + const rpc_ *const aval, // Values of A + const rpc_ *const scaling, // Scaling vector (NULL if none) void** child_contrib, // Contributions from child subtrees 
struct cpu_factor_options const* options, // Options in ThreadStats* stats // Info out ) { - auto const& symbolic_subtree = *static_cast(symbolic_subtree_ptr); + auto const& symbolic_subtree = + *static_cast(symbolic_subtree_ptr); // Perform factorization if(posdef) { @@ -126,9 +128,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_fwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -153,9 +155,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_diag( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -178,9 +180,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_diag_bwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -205,9 +207,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_bwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -232,8 +234,8 @@ extern "C" void spral_ssids_cpu_subtree_enquire( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int* piv_order, // pivot order, may be null, only used if indef - precision_* d // diagonal entries, may be null + ipc_* piv_order, // pivot order, may be null, only used if indef + rpc_* d // diagonal entries, may be null ) { // Call method @@ -253,7 +255,7 @@ extern "C" void spral_ssids_cpu_subtree_alter( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void* subtree_ptr,// pointer to relevant type of NumericSubtree - precision_ const* d // new diagonal entries + rpc_ const* d // new diagonal entries ) { assert(!posdef); // Should never be called on positive definite matrices. 
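As an illustration (not part of this patch): the C interface hunks above replace raw int/double arguments with the ipc_ and rpc_ macros, whose definitions appear in the new ssids_rip.hxx header earlier in this diff. The stand-alone sketch below, a hypothetical file demo_rip.cxx invented purely for illustration, mirrors that macro scheme so the effect of the build flags wired up elsewhere in the patch (-DINTEGER_64 for 64-bit integers, -DSPRAL_SINGLE for single precision) can be seen in isolation; it is a minimal sketch under those assumptions, not GALAHAD source.

/* demo_rip.cxx - hypothetical, stand-alone illustration (not GALAHAD source).
 * Mirrors the rpc_/ipc_/longc_ macro scheme of the new ssids_rip.hxx.
 * Assumed builds, matching the makemaster/meson flags changed in this patch:
 *   g++ -DSPRAL_SINGLE -DINTEGER_64 demo_rip.cxx   (single precision, 64-bit ints)
 *   g++ demo_rip.cxx                               (double precision, 32-bit ints)
 */
#include <cstdint>
#include <cstdio>

/* real precision employed (as in ssids_rip.hxx) */
#ifdef SPRAL_SINGLE
#define rpc_ float
#else
#define rpc_ double
#endif

/* integer storage employed (as in ssids_rip.hxx) */
#ifdef INTEGER_64
#define ipc_ int64_t
#define uipc_ uint64_t
#else
#define ipc_ int
#define uipc_ unsigned int
#endif

/* generic storage (as in ssids_rip.hxx) */
#define longc_ int64_t

/* A signature written once against the macros, as the patched headers are;
 * the effective C types change with the flags, without editing the source. */
void solve_fwd_stub(ipc_ nrhs, rpc_* x, ipc_ ldx) {
  (void)x;   /* values not used; only the resolved types are of interest */
  std::printf("nrhs=%lld ldx=%lld  sizeof(ipc_)=%zu  sizeof(rpc_)=%zu\n",
              (long long)nrhs, (long long)ldx, sizeof(ipc_), sizeof(rpc_));
}

int main() {
  rpc_ x[4] = {0};        /* float[4] or double[4], depending on SPRAL_SINGLE */
  solve_fwd_stub(2, x, 2);
  return 0;
}

The same -DINTEGER_64 flag keeps the Fortran side in step, switching ip_/ipc_ to INT64/C_INT64_T in kinds.F90 and spral_kinds.F90 further down this diff.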
@@ -268,14 +270,14 @@ extern "C" void spral_ssids_cpu_subtree_get_contrib( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void* subtree_ptr,// pointer to relevant type of NumericSubtree - int* n, // returned dimension of contribution block - precision_ const** val, // returned pointer to contribution block - int* ldval, // leading dimension of val - int const** rlist, // returned pointer to row list - int* ndelay, // returned number of delays - int const** delay_perm, // returned pointer to delay values - precision_ const** delay_val, // returned pointer to delay values - int* lddelay // leading dimension of delay_val + ipc_* n, // returned dimension of contribution block + rpc_ const** val, // returned pointer to contribution block + ipc_* ldval, // leading dimension of val + ipc_ const** rlist, // returned pointer to row list + ipc_* ndelay, // returned number of delays + ipc_ const** delay_perm, // returned pointer to delay values + rpc_ const** delay_val, // returned pointer to delay values + ipc_* lddelay // leading dimension of delay_val ) { // Call method if(posdef) { // Converting from runtime to compile time posdef value diff --git a/src/ssids/SymbolicSubtree.cxx b/src/ssids/SymbolicSubtree.cxx index fd7ef25b18..5912738327 100644 --- a/src/ssids/SymbolicSubtree.cxx +++ b/src/ssids/SymbolicSubtree.cxx @@ -2,16 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 16:00 GMT */ + +#include "ssids_rip.hxx" #include "ssids_cpu_SymbolicSubtree.hxx" using namespace spral::ssids::cpu; extern "C" void* spral_ssids_cpu_create_symbolic_subtree( - int n, int sa, int en, int const* sptr, int const* sparent, - long const* rptr, int const* rlist, long const* nptr, long const* nlist, - int ncontrib, int const* contrib_idx, + ipc_ n, ipc_ sa, ipc_ en, ipc_ const* sptr, ipc_ const* sparent, + longc_ const* rptr, ipc_ const* rlist, longc_ const* nptr, + longc_ const* nlist, ipc_ ncontrib, ipc_ const* contrib_idx, struct cpu_factor_options const* options) { return (void*) new SymbolicSubtree( n, sa, en, sptr, sparent, rptr, rlist, nptr, nlist, ncontrib, diff --git a/src/ssids/assemble.cu b/src/ssids/assemble.cu index 002b808c0a..33a130c0f0 100644 --- a/src/ssids/assemble.cu +++ b/src/ssids/assemble.cu @@ -1,3 +1,10 @@ +/* Copyright (c) 2013 Science and Technology Facilities Council (STFC) + * Copyright (c) 2013 NVIDIA + * Authors: Evgueni Ovtchinnikov (STFC) + * Jeremy Appleyard (NVIDIA) + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:40 GMT + */ + #ifdef __cplusplus #include #else @@ -8,11 +15,11 @@ #include #include +#include "ssids_rip.hxx" #include "spral_cuda_cuda_check.h" #include "ssids_gpu_kernels_datatypes.h" #ifdef SPRAL_SINGLE -#define precision_ float #define load_nodes_type load_nodes_type_single #define assemble_cp_type assemble_cp_type_single #define assemble_blk_type assemble_blk_type_single @@ -28,7 +35,6 @@ #define spral_ssids_load_nodes_sc spral_ssids_load_nodes_sc_single #define spral_ssids_max_abs spral_ssids_max_abs_single #else -#define precision_ double #define load_nodes_type load_nodes_type_single #define assemble_cp_type assemble_cp_type_double #define assemble_blk_type assemble_blk_type_double @@ -55,12 +61,12 @@ namespace /* anon */ { struct load_nodes_type { - long nnz; // Number of entries to map - int lda; // Leading dimension of A - int ldl; // Leading dimension of L - precision_ *lcol; // Pointer to 
non-delay part of L - long offn; // Offset into nlist - long offr; // Offset into rlist + longc_ nnz; // Number of entries to map + ipc_ lda; // Leading dimension of A + ipc_ ldl; // Leading dimension of L + rpc_ *lcol; // Pointer to non-delay part of L + longc_ offn; // Offset into nlist + longc_ offr; // Offset into rlist }; /* @@ -73,22 +79,22 @@ struct load_nodes_type { __global__ void cu_load_nodes( const struct load_nodes_type *lndata, - const long *nlist, - const precision_ *aval + const longc_ *nlist, + const rpc_ *aval ) { lndata += blockIdx.x; - const long nnz = lndata->nnz; - const int lda = lndata->lda; - const int ldl = lndata->ldl; + const longc_ nnz = lndata->nnz; + const ipc_ lda = lndata->lda; + const ipc_ ldl = lndata->ldl; nlist += 2*lndata->offn; - precision_ *const lval = lndata->lcol; + rpc_ *const lval = lndata->lcol; - for (int i = threadIdx.x; i < nnz; i += blockDim.x) { + for (ipc_ i = threadIdx.x; i < nnz; i += blockDim.x) { // Note: nlist is 1-indexed, not 0 indexed, so we have to adjust - const int r = (nlist[2*i+1] - 1) % lda; // row index - const int c = (nlist[2*i+1] - 1) / lda; // col index - const long sidx = nlist[2*i+0] - 1; // source index + const ipc_ r = (nlist[2*i+1] - 1) % lda; // row index + const ipc_ c = (nlist[2*i+1] - 1) / lda; // col index + const longc_ sidx = nlist[2*i+0] - 1; // source index lval[r + c*ldl] = aval[sidx]; } } @@ -104,41 +110,41 @@ cu_load_nodes( __global__ void cu_load_nodes_sc( const struct load_nodes_type *lndata, - const long *nlist, - const int *rlist, - const precision_ *scale, - const precision_ *aval + const longc_ *nlist, + const ipc_ *rlist, + const rpc_ *scale, + const rpc_ *aval ) { lndata += blockIdx.x; - const int nnz = lndata->nnz; - const int lda = lndata->lda; - const int ldl = lndata->ldl; + const ipc_ nnz = lndata->nnz; + const ipc_ lda = lndata->lda; + const ipc_ ldl = lndata->ldl; nlist += 2*lndata->offn; - precision_ *const lval = lndata->lcol; + rpc_ *const lval = lndata->lcol; rlist += lndata->offr; - for (int i = threadIdx.x; i < nnz; i += blockDim.x) { + for (ipc_ i = threadIdx.x; i < nnz; i += blockDim.x) { // Note: nlist and rlist are 1-indexed, not 0 indexed, so we adjust - const int r = (nlist[2*i+1] - 1) % lda; // row index - const int c = (nlist[2*i+1] - 1) / lda; // col index - const long sidx = nlist[2*i+0] - 1; // source index - const precision_ rs = scale[rlist[r] - 1]; // row scaling - const precision_ cs = scale[rlist[c] - 1]; // col scaling + const ipc_ r = (nlist[2*i+1] - 1) % lda; // row index + const ipc_ c = (nlist[2*i+1] - 1) / lda; // col index + const longc_ sidx = nlist[2*i+0] - 1; // source index + const rpc_ rs = scale[rlist[r] - 1]; // row scaling + const rpc_ cs = scale[rlist[c] - 1]; // col scaling lval[r + c*ldl] = rs * aval[sidx] * cs; } } // BLOCK_SIZE = blockDim.x // maxabs must be initialized to zeros -template< typename ELEMENT_TYPE, unsigned int BLOCK_SIZE > +template< typename ELEMENT_TYPE, uipc_ BLOCK_SIZE > __global__ void -cu_max_abs( const long n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxabs ) +cu_max_abs( const longc_ n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxabs ) { __shared__ volatile ELEMENT_TYPE tmax[BLOCK_SIZE]; tmax[threadIdx.x] = 0.0; - for ( long i = threadIdx.x + blockDim.x*blockIdx.x; i < n; + for ( longc_ i = threadIdx.x + blockDim.x*blockIdx.x; i < n; i += blockDim.x*gridDim.x ) { const ELEMENT_TYPE v = fabs(u[i]); if ( v > tmax[threadIdx.x] ) @@ -146,7 +152,7 @@ cu_max_abs( const long n, const ELEMENT_TYPE *const u, 
ELEMENT_TYPE *const maxab } __syncthreads(); - for ( int inc = 1; inc < BLOCK_SIZE; inc *= 2 ) { + for ( ipc_ inc = 1; inc < BLOCK_SIZE; inc *= 2 ) { if ( 2*inc*threadIdx.x + inc < BLOCK_SIZE && tmax[2*inc*threadIdx.x + inc] > tmax[2*inc*threadIdx.x] ) tmax[2*inc*threadIdx.x] = tmax[2*inc*threadIdx.x + inc]; @@ -160,30 +166,30 @@ cu_max_abs( const long n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxab /* Following data type describes a single child-parent assembly */ struct assemble_cp_type { // Parent data - int pvoffset; // Offset to start of parent node values - precision_ *pval; // Pointer to non-delay part of parent L - int ldp; // Leading dimension of parent + ipc_ pvoffset; // Offset to start of parent node values + rpc_ *pval; // Pointer to non-delay part of parent L + ipc_ ldp; // Leading dimension of parent // Child data - int cm; // Number of rows in child - int cn; // Number of columns in child - int ldc; // Leading dimension of child - long cvoffset; // Offset to start of child node values - precision_ *cv; // Pointer to start of child node values + ipc_ cm; // Number of rows in child + ipc_ cn; // Number of columns in child + ipc_ ldc; // Leading dimension of child + longc_ cvoffset; // Offset to start of child node values + rpc_ *cv; // Pointer to start of child node values // Alignment data - int *rlist_direct; // Pointer to start of child's rlist - int *ind; // Pointer to start of child's contribution index + ipc_ *rlist_direct; // Pointer to start of child's rlist + ipc_ *ind; // Pointer to start of child's contribution index // Sync data - int sync_offset; // we watch sync[sync_offset] - int sync_wait_for; // and wait for it to have value >= sync_wait_for + ipc_ sync_offset; // we watch sync[sync_offset] + ipc_ sync_wait_for; // and wait for it to have value >= sync_wait_for }; /* Following data type describes actions of single CUDA block */ struct assemble_blk_type { - int cp; // node we're assembling into - int blk; // block number of that node + ipc_ cp; // node we're assembling into + ipc_ blk; // block number of that node }; /* Used to force volatile load of a declared non-volatile variable */ @@ -201,18 +207,17 @@ __inline__ __device__ T_ELEM loadVolatile(volatile T_ELEM *const vptr) { * next_blk is used to ensure all blocks run in exact desired order. * sync[] is used to ensure dependencies are completed in the correct order. 
*/ -template +template void __global__ assemble( const struct assemble_blk_type *blkdata, // block mapping const struct assemble_cp_type *cpdata, // child-parent data - const precision_ *const children, // pointer to array containing children - precision_ *const parents, // pointer to array containing parents - unsigned int *const next_blk, // gmem location used to determine next block - volatile unsigned int *const sync // sync[cp] is #blocks completed so far for cp + const rpc_ *const children, // pointer to array containing children + rpc_ *const parents, // pointer to array containing parents + uipc_ *const next_blk, // gmem location used to determine next block + volatile uipc_ *const sync // sync[cp] is #blocks completed so far for cp ) { // Get block number - __shared__ volatile unsigned int mynext_blk; + __shared__ volatile uipc_ mynext_blk; if(threadIdx.x==0 && threadIdx.y==0) mynext_blk = atomicAdd(next_blk, 1); __syncthreads(); @@ -220,21 +225,21 @@ void __global__ assemble( // Determine global information blkdata += mynext_blk; cpdata += blkdata->cp; - int blk = blkdata->blk; - int nx = (cpdata->cm-1) / blk_sz_x + 1; // number of blocks high child is - int bx = blk % nx; // coordinate of block in x direction - int by = blk / nx; // coordinate of block in y direction - int ldc = cpdata->ldc; - int ldp = cpdata->ldp; + ipc_ blk = blkdata->blk; + ipc_ nx = (cpdata->cm-1) / blk_sz_x + 1; // number of blocks high child is + ipc_ bx = blk % nx; // coordinate of block in x direction + ipc_ by = blk / nx; // coordinate of block in y direction + ipc_ ldc = cpdata->ldc; + ipc_ ldp = cpdata->ldp; // Initialize local information - int m = min(blk_sz_x, cpdata->cm - bx*blk_sz_x); - int n = min(blk_sz_y, cpdata->cn - by*blk_sz_y); - const precision_ *src = + ipc_ m = min(blk_sz_x, cpdata->cm - bx*blk_sz_x); + ipc_ n = min(blk_sz_y, cpdata->cn - by*blk_sz_y); + const rpc_ *src = cpdata->cv + ldc*by*blk_sz_y + bx*blk_sz_x; - precision_ *dest = cpdata->pval; - int *rows = cpdata->rlist_direct + bx*blk_sz_x; - int *cols = cpdata->rlist_direct + by*blk_sz_y; + rpc_ *dest = cpdata->pval; + ipc_ *rows = cpdata->rlist_direct + bx*blk_sz_x; + ipc_ *cols = cpdata->rlist_direct + by*blk_sz_y; // Wait for previous child of this parent to complete if(threadIdx.x==0 && threadIdx.y==0) { @@ -243,12 +248,12 @@ void __global__ assemble( __syncthreads(); // Perform assembly - for(int j=0; jcp]), 1); + atomicAdd((ipc_*)&(sync[blkdata->cp]), 1); } } struct assemble_delay_type { - int dskip; // Number of rows to skip for delays from later children - int m; // Number of rows in child to copy - int n; // Number of cols in child to copy - int ldd; // Leading dimension of dest (parent) - int lds; // Leading dimension of src (child) - precision_ *dval; // Pointer to dest (parent) - precision_ *sval; // Pointer to src (child) - long roffset; // Offset to rlist_direct + ipc_ dskip; // Number of rows to skip for delays from later children + ipc_ m; // Number of rows in child to copy + ipc_ n; // Number of cols in child to copy + ipc_ ldd; // Leading dimension of dest (parent) + ipc_ lds; // Leading dimension of src (child) + rpc_ *dval; // Pointer to dest (parent) + rpc_ *sval; // Pointer to src (child) + longc_ roffset; // Offset to rlist_direct }; /* Copies delays from child to parent using one block per parent @@ -279,26 +284,26 @@ struct assemble_delay_type { */ void __global__ add_delays( struct assemble_delay_type *dinfo, // information on each block - const int *rlist_direct // children's rows indices in parents 
+ const ipc_ *rlist_direct // children's rows indices in parents ) { dinfo += blockIdx.x; - const int dskip = dinfo->dskip; // number of delays - const int m = dinfo->m; // number of rows - const int n = dinfo->n; // number of cols - const int ldd = dinfo->ldd; // leading dimension of dest - const int lds = dinfo->lds; // leading dimension of src - - precision_ *const dest = dinfo->dval; - const precision_ *const src = dinfo->sval; + const ipc_ dskip = dinfo->dskip; // number of delays + const ipc_ m = dinfo->m; // number of rows + const ipc_ n = dinfo->n; // number of cols + const ipc_ ldd = dinfo->ldd; // leading dimension of dest + const ipc_ lds = dinfo->lds; // leading dimension of src + + rpc_ *const dest = dinfo->dval; + const rpc_ *const src = dinfo->sval; rlist_direct += dinfo->roffset; - for ( int y = threadIdx.y; y < n; y += blockDim.y ) { - for ( int x = threadIdx.x; x < m; x += blockDim.x ) { + for ( ipc_ y = threadIdx.y; y < n; y += blockDim.y ) { + for ( ipc_ x = threadIdx.x; x < m; x += blockDim.x ) { if ( x < n ) { dest[x + y*ldd] = src[x + y*lds]; } else { - int xt = dskip + rlist_direct[x - n] - 1; + ipc_ xt = dskip + rlist_direct[x - n] - 1; dest[xt + y*ldd] = src[x + y*lds]; } } @@ -314,12 +319,12 @@ void __global__ add_delays( extern "C" { /* Invokes the add_delays<<<>>>() kernel */ -void spral_ssids_add_delays( const cudaStream_t *stream, int ndblk, - struct assemble_delay_type *gpu_dinfo, int *rlist_direct ) { +void spral_ssids_add_delays( const cudaStream_t *stream, ipc_ ndblk, + struct assemble_delay_type *gpu_dinfo, ipc_ *rlist_direct ) { if ( ndblk == 0 ) return; // Nothing to see here dim3 threads(ADD_DELAYS_TX, ADD_DELAYS_TY); - for ( int i = 0; i < ndblk; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, ndblk - i); + for ( ipc_ i = 0; i < ndblk; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, ndblk - i); add_delays <<< nb, threads, 0, *stream >>> ( gpu_dinfo + i, rlist_direct ); @@ -328,25 +333,25 @@ void spral_ssids_add_delays( const cudaStream_t *stream, int ndblk, } /* Runs the kernel assemble<<<>>>() after setting up memory correctly. */ -/* Requires gpu_next_sync[] to be of size >= (1+ncp)*sizeof(unsigned int) */ -void spral_ssids_assemble(const cudaStream_t *stream, int nblk, int blkoffset, - struct assemble_blk_type *blkdata, int ncp, - struct assemble_cp_type *cpdata, precision_ *children, - precision_ *parents, unsigned int *gpu_next_sync) { +/* Requires gpu_next_sync[] to be of size >= (1+ncp)*sizeof(uipc_) */ +void spral_ssids_assemble(const cudaStream_t *stream, ipc_ nblk, ipc_ blkoffset, + struct assemble_blk_type *blkdata, ipc_ ncp, + struct assemble_cp_type *cpdata, rpc_ *children, + rpc_ *parents, uipc_ *gpu_next_sync) { /* Create and initialize synchronization objects using a single call: next_blk[1] sync[ncp] */ CudaSafeCall( - cudaMemsetAsync(gpu_next_sync,0,(1+ncp)*sizeof(unsigned int),*stream) + cudaMemsetAsync(gpu_next_sync,0,(1+ncp)*sizeof(uipc_),*stream) ); /* Note, that we can only have at most 65535 blocks per dimn. * For some problems, nblk can exceed this, so we use more than one launch. * As the next block we look at is specified by next_blk this works fine. 
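Editorial aside, not part of the patch: the launch loop that follows is partially garbled in this copy of the diff. In outline, spral_ssids_assemble presumably splits the grid into chunks of at most MAX_CUDA_BLOCKS so that no single launch exceeds the 65535 blocks-per-dimension limit mentioned in the comment above; a rough sketch is given below. The block-size template argument names and the exact argument list are assumptions here, only the batching idiom is the point.

// Sketch (assumed): batched launches of the assemble<> kernel. Because each
// block fetches its own work item from next_blk, splitting the grid into
// several launches does not change the work assignment.
dim3 threads(HOGG_ASSEMBLE_NTX, HOGG_ASSEMBLE_NTY);
for (ipc_ i = 0; i < nblk; i += MAX_CUDA_BLOCKS) {
   ipc_ nb = min(MAX_CUDA_BLOCKS, nblk - i);
   assemble<BLK_SZ_X, BLK_SZ_Y>      // hypothetical names for the block-size template arguments
      <<< nb, threads, 0, *stream >>>
      ( blkdata + blkoffset + i, cpdata, children, parents,
        gpu_next_sync,               // next_blk lives in gpu_next_sync[0]
        gpu_next_sync + 1 );         // sync[ncp] follows it, per the comment above
   CudaCheckError();
}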
*/ dim3 threads(HOGG_ASSEMBLE_NTX, HOGG_ASSEMBLE_NTY); - for(int i=0; i @@ -358,38 +363,38 @@ void spral_ssids_assemble(const cudaStream_t *stream, int nblk, int blkoffset, } // Note: modified value lval is passed in via pointer in lndata, not as argument -void spral_ssids_load_nodes( const cudaStream_t *stream, int nblocks, - const struct load_nodes_type *lndata, const long* list, - const precision_* mval ) { - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); +void spral_ssids_load_nodes( const cudaStream_t *stream, ipc_ nblocks, + const struct load_nodes_type *lndata, const longc_* list, + const rpc_* mval ) { + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_load_nodes <<< nb, 128, 0, *stream >>> ( lndata + i, list, mval ); CudaCheckError(); } } // Note: modified value lval is passed in via pointer in lndata, not as argument -void spral_ssids_load_nodes_sc( const cudaStream_t *stream, int nblocks, - const struct load_nodes_type *lndata, const long* list, const int* rlist, - const precision_* scale, const precision_* mval ) { - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); +void spral_ssids_load_nodes_sc( const cudaStream_t *stream, ipc_ nblocks, + const struct load_nodes_type *lndata, const longc_* list, const ipc_* rlist, + const rpc_* scale, const rpc_* mval ) { + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_load_nodes_sc <<< nb, 128, 0, *stream >>> ( lndata + i, list, rlist, scale, mval ); CudaCheckError(); } } void spral_ssids_max_abs( const cudaStream_t *stream, - int nb, long n, precision_* u, precision_* buff, precision_* maxabs ) + ipc_ nb, longc_ n, rpc_* u, rpc_* buff, rpc_* maxabs ) { - cudaMemsetAsync(buff, 0, nb*sizeof(precision_), *stream); + cudaMemsetAsync(buff, 0, nb*sizeof(rpc_), *stream); cudaStreamSynchronize(*stream); if ( n > 1024*nb ) - cu_max_abs< precision_, 256 ><<< nb, 256, 0, *stream >>>( n, u, buff ); + cu_max_abs< rpc_, 256 ><<< nb, 256, 0, *stream >>>( n, u, buff ); else - cu_max_abs< precision_, 32 ><<< nb, 32, 0, *stream >>>( n, u, buff ); + cu_max_abs< rpc_, 32 ><<< nb, 32, 0, *stream >>>( n, u, buff ); CudaCheckError(); - cu_max_abs< precision_, 1024 ><<< 1, 1024, 0, *stream >>>( nb, buff, maxabs ); + cu_max_abs< rpc_, 1024 ><<< 1, 1024, 0, *stream >>>( nb, buff, maxabs ); CudaCheckError(); } diff --git a/src/ssids/cholesky.cxx b/src/ssids/cholesky.cxx index 53f77fbad2..d272e7b72c 100644 --- a/src/ssids/cholesky.cxx +++ b/src/ssids/cholesky.cxx @@ -2,28 +2,29 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:45 GMT */ + #include "ssids_cpu_kernels_cholesky.hxx" #include #include // FIXME: remove as only used for debug +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_kernels_wrappers.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define cholesky_factor cholesky_factor_sgl #define cholesky_solve_fwd cholesky_solve_fwd_sgl #define cholesky_solve_bwd cholesky_solve_bwd_sgl #else -#define precision_ double #define cholesky_factor cholesky_factor_dbl #define cholesky_solve_fwd cholesky_solve_fwd_dbl #define cholesky_solve_bwd cholesky_solve_bwd_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define lapack_potrf 
lapack_potrf_64 #define host_syrk host_syrk_64 @@ -50,11 +51,11 @@ namespace spral { namespace ssids { namespace cpu { * \param info is initialized to -1, and will be changed to the index of any * column where a non-zero column is encountered. */ -void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, - precision_* upd, int ldupd, int blksz, int *info) { +void cholesky_factor(ipc_ m, ipc_ n, rpc_* a, ipc_ lda, rpc_ beta, + rpc_* upd, ipc_ ldupd, ipc_ blksz, ipc_ *info) { if(n < blksz) { // Adjust so blocks have blksz**2 entries - blksz = int((long(blksz)*blksz) / n); + blksz = ipc_((long(blksz)*blksz) / n); } #pragma omp atomic write @@ -64,36 +65,36 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, * its current col-wise implementation ensuring maximum work available??? */ #pragma omp taskgroup - for(int j = 0; j < n; j += blksz) { - int blkn = std::min(blksz, n-j); + for(ipc_ j = 0; j < n; j += blksz) { + ipc_ blkn = std::min(blksz, n-j); /* Diagonal Block Factorization Task */ #pragma omp task default(none) \ firstprivate(j, blkn) \ shared(m, a, lda, blksz, info, beta, upd, ldupd) \ depend(inout: a[j*(lda+1):1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_DIAG"); #endif - int blkm = std::min(blksz, m-j); - int flag = lapack_potrf(FILL_MODE_LWR, blkn, &a[j*(lda+1)], lda); + ipc_ blkm = std::min(blksz, m-j); + ipc_ flag = lapack_potrf(FILL_MODE_LWR, blkn, &a[j*(lda+1)], lda); if (flag > 0) { // Matrix was not positive definite #pragma omp atomic write *info = flag-1; // flag uses Fortran indexing } else if (blkm > blkn) { // Diagonal block factored OK, handle some rectangular part of block - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_NON_UNIT, blkm-blkn, blkn, one_val, &a[j*(lda+1)], lda, &a[j*(lda+1)+blkn], lda); if (upd) { - precision_ rbeta = (j==0) ? beta : 1.0; + rpc_ rbeta = (j==0) ? beta : 1.0; host_syrk(FILL_MODE_LWR, OP_N, blkm-blkn, blkn, minus_one_val, &a[j*(lda+1)+blkn], lda, rbeta, upd, ldupd); } @@ -104,27 +105,27 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, } } /* Column Solve Tasks */ - for (int i = j+blksz; i < m; i += blksz) { - int blkm = std::min(blksz, m-i); + for (ipc_ i = j+blksz; i < m; i += blksz) { + ipc_ blkm = std::min(blksz, m-i); #pragma omp task default(none) \ firstprivate(i, j, blkn, blkm) \ shared(a, lda, info, beta, upd, ldupd, blksz, n) \ depend(in: a[j*(lda+1):1]) \ depend(inout: a[j*lda + i:1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_TRSM"); #endif - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_NON_UNIT, blkm, blkn, one_val, &a[j*(lda+1)], lda, &a[j*lda+i], lda); if ((blkn < blksz) && upd) { - precision_ rbeta = (j==0) ? beta : 1.0; + rpc_ rbeta = (j==0) ? 
beta : 1.0; host_gemm(OP_N, OP_T, blkm, blksz-blkn, blkn, minus_one_val, &a[j*lda+i], lda, &a[j*(lda+1)+blkn], lda, rbeta, &upd[i-n], ldupd); @@ -136,9 +137,9 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, } } /* Schur Update Tasks: mostly internal */ - for (int k = j+blksz; k < n; k += blksz) { - int blkk = std::min(blksz, n-k); - for (int i = k; i < m; i += blksz) { + for (ipc_ k = j+blksz; k < n; k += blksz) { + ipc_ blkk = std::min(blksz, n-k); + for (ipc_ i = k; i < m; i += blksz) { #pragma omp task default(none) \ firstprivate(i, j, k, blkn, blkk) \ shared(m, a, lda, blksz, info, beta, upd, ldupd, n) \ @@ -146,22 +147,22 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, depend(in: a[j*lda+i:1]) \ depend(inout: a[k*lda+i:1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_UPD"); #endif - int blkm = std::min(blksz, m-i); - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + ipc_ blkm = std::min(blksz, m-i); + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_gemm(OP_N, OP_T, blkm, blkk, blkn, minus_one_val, &a[j*lda+i], lda, &a[j*lda+k], lda, one_val, &a[k*lda+i], lda); if ((blkk < blksz) && upd) { - precision_ rbeta = (j==0) ? beta : 1.0; - int upd_width = (m n) @@ -235,10 +236,10 @@ void cholesky_solve_fwd(int m, int n, precision_ const* a, int lda, } /* Backwards solve corresponding to cholesky_factor() */ -void cholesky_solve_bwd(int m, int n, precision_ const* a, int lda, - int nrhs, precision_* x, int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void cholesky_solve_bwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + ipc_ nrhs, rpc_* x, ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { if(m > n) gemv(OP_T, m-n, n, minus_one_val, &a[n], lda, &x[n], 1, one_val, x, 1); diff --git a/src/ssids/cpu_iface.F90 b/src/ssids/cpu_iface.F90 index 279c1dc40f..7c6fa44540 100644 --- a/src/ssids/cpu_iface.F90 +++ b/src/ssids/cpu_iface.F90 @@ -1,11 +1,36 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-02-01 AT 07:50 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-04 AT 11:50 GMT. 
#ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_kinds_precision spral_kinds_single_64 #define spral_ssids_cpu_iface_precision spral_ssids_cpu_iface_single_64 #define spral_ssids_inform_precision spral_ssids_inform_single_64 #define spral_ssids_types_precision spral_ssids_types_single_64 +#ifdef NO_UNDERSCORE_INTEGER_64 +#define gemv sgemv64 +#define trsv strsv64 +#define syrk ssyrk64 +#define trsm strsm64 +#define sytrf ssytrf64 +#define potrf spotrf64 +#define gemm sgemm64 +#elif DOUBLE_UNDERSCORE_INTEGER_64 +#define gemv sgemv__64 +#define trsv strsv__64 +#define syrk ssyrk__64 +#define trsm strsm__64 +#define sytrf ssytrf__64 +#define potrf spotrf__64 +#define gemm sgemm__64 +#elif NO_SYMBOL_INTEGER_64 +#define gemv sgemv +#define trsv strsv +#define syrk ssyrk +#define trsm strsm +#define sytrf ssytrf +#define potrf spotrf +#define gemm sgemm +#else #define gemv sgemv_64 #define trsv strsv_64 #define syrk ssyrk_64 @@ -13,6 +38,7 @@ #define sytrf ssytrf_64 #define potrf spotrf_64 #define gemm sgemm_64 +#endif #define spral_c_gemv spral_c_sgemv_64 #define spral_c_trsv spral_c_strsv_64 #define spral_c_syrk spral_c_ssyrk_64 @@ -41,11 +67,36 @@ #define spral_c_gemm spral_c_sgemm #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_kinds_precision spral_kinds_double_64 #define spral_ssids_cpu_iface_precision spral_ssids_cpu_iface_double_64 #define spral_ssids_inform_precision spral_ssids_inform_double_64 #define spral_ssids_types_precision spral_ssids_types_double_64 +#ifdef NO_UNDERSCORE_INTEGER_64 +#define gemv dgemv64 +#define trsv dtrsv64 +#define syrk dsyrk64 +#define trsm dtrsm64 +#define sytrf dsytrf64 +#define potrf dpotrf64 +#define gemm dgemm64 +#elif DOUBLE_UNDERSCORE_INTEGER_64 +#define gemv dgemv__64 +#define trsv dtrsv__64 +#define syrk dsyrk__64 +#define trsm dtrsm__64 +#define sytrf dsytrf__64 +#define potrf dpotrf__64 +#define gemm dgemm__64 +#elif NO_SYMBOL_INTEGER_64 +#define gemv dgemv +#define trsv dtrsv +#define syrk dsyrk +#define trsm dtrsm +#define sytrf dsytrf +#define potrf dpotrf +#define gemm dgemm +#else #define gemv dgemv_64 #define trsv dtrsv_64 #define syrk dsyrk_64 @@ -53,6 +104,7 @@ #define sytrf dsytrf_64 #define potrf dpotrf_64 #define gemm dgemm_64 +#endif #define spral_c_gemv spral_c_dgemv_64 #define spral_c_trsv spral_c_dtrsv_64 #define spral_c_syrk spral_c_dsyrk_64 @@ -82,7 +134,7 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64 #endif diff --git a/src/ssids/cpu_solve.F90 b/src/ssids/cpu_solve.F90 index a82e894646..b62a8148db 100644 --- a/src/ssids/cpu_solve.F90 +++ b/src/ssids/cpu_solve.F90 @@ -1,9 +1,9 @@ -! THIS VERSION: GALAHAD 4.1 - 2023-05-20 AT 14:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:40 GMT. 
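Editorial aside, not part of the patch: the macro tables above pick the mangled 64-bit BLAS symbol purely from preprocessor flags at build time, so a single gemv call site can resolve to four different library names. A minimal, self-contained C++ illustration of that selection follows; the flag names are the real macros introduced by this patch, while the program itself is hypothetical.

#include <cstdio>

// Hypothetical demo: which symbol a double-precision gemv call is mapped to.
#if defined(INTEGER_64) && defined(NO_UNDERSCORE_INTEGER_64)
#define GEMV_NAME "dgemv64"
#elif defined(INTEGER_64) && defined(DOUBLE_UNDERSCORE_INTEGER_64)
#define GEMV_NAME "dgemv__64"
#elif defined(INTEGER_64) && defined(NO_SYMBOL_INTEGER_64)
#define GEMV_NAME "dgemv"        // 64-bit library exports unsuffixed names
#elif defined(INTEGER_64)
#define GEMV_NAME "dgemv_64"     // default 64-bit suffix
#else
#define GEMV_NAME "dgemv"        // 32-bit integer build, behaviour unchanged
#endif

int main() { std::printf("gemv resolves to %s\n", GEMV_NAME); return 0; }

For example, compiling this with -DINTEGER_64 -DNO_UNDERSCORE_INTEGER_64 prints dgemv64, matching the first new branch added to cpu_iface.F90 above.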
#include "spral_procedures.h" #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define trsm strsm_64 #define trsv strsv_64 #define gemm sgemm_64 @@ -15,8 +15,7 @@ #define gemv sgemv #endif #else - -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define trsm dtrsm_64 #define trsv dtrsv_64 #define gemm dgemm_64 @@ -29,7 +28,7 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif diff --git a/src/ssids/cpu_subtree.F90 b/src/ssids/cpu_subtree.F90 index bc97cffaff..4915de6c51 100644 --- a/src/ssids/cpu_subtree.F90 +++ b/src/ssids/cpu_subtree.F90 @@ -394,6 +394,7 @@ function factor(this, posdef, aval, child_contrib, options, inform, scaling) cscaling = C_NULL_PTR if (present(scaling)) cscaling = C_LOC(scaling) call cpu_copy_options_in(options, coptions) + cpu_factor%csubtree = & c_create_numeric_subtree(cpu_factor%posdef, this%csubtree, & aval, cscaling, contrib_ptr, coptions, cstats) diff --git a/src/ssids/dense_factor.cu b/src/ssids/dense_factor.cu index 956f7805b5..5b745d42d4 100644 --- a/src/ssids/dense_factor.cu +++ b/src/ssids/dense_factor.cu @@ -1,5 +1,6 @@ /* Copyright (c) 2013 Science and Technology Facilities Council (STFC) * Authors: Evgueni Ovtchinnikov and Jonathan Hogg + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT * * This file contains CUDA kernels for partial LL^T and LDL^T factorization * of dense submatrices. @@ -15,11 +16,11 @@ #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define multinode_chol_type multinode_chol_type_single #define multiblock_fact_type multiblock_fact_type_single #define cstat_data_type cstat_data_type_single @@ -40,7 +41,6 @@ #define spral_ssids_multiblock_llt_setup spral_ssids_multiblock_llt_setup_single #define spral_ssids_square_ldlt spral_ssids_square_ldlt_single #else -#define precision_ double #define multinode_chol_type multinode_chol_type_double #define multiblock_fact_type multiblock_fact_type_double #define cstat_data_type cstat_data_type_double @@ -77,13 +77,13 @@ using namespace spral::ssids::gpu; namespace /* anon */ { -extern __shared__ volatile precision_ SharedMemory[]; +extern __shared__ volatile rpc_ SharedMemory[]; __global__ void cu_block_ldlt_init( - const int ncols, - int *const stat, - int *const ind + const ipc_ ncols, + ipc_ *const stat, + ipc_ *const ind ) { if (threadIdx.x == 0) { stat[0] = ncols; // successful pivots @@ -95,23 +95,23 @@ cu_block_ldlt_init( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_chol_fact( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof const ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a volatile ELEMENT_TYPE *const fs // initial L factor (shared mem) ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int x; // row index + ipc_ x; // row index - for ( int tile = 0; tile < TILES; tile++ ) { + for ( ipc_ tile = 0; tile < TILES; tile++ ) { if ( tile ) { // load A's offdiagonal tiles into shared memory x = ncols + threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; // 
offdiagonal row index in A @@ -131,23 +131,23 @@ dev_init_chol_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_save_chol_fact( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof const volatile ELEMENT_TYPE *const fs, // initial L factor (shared mem) ELEMENT_TYPE *const f, // array of elements of L - const int ldf // leading dimension of f + const ipc_ ldf // leading dimension of f ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int x; // row index + ipc_ x; // row index - for ( int tile = 0; tile < TILES; tile++ ) { + for ( ipc_ tile = 0; tile < TILES; tile++ ) { if ( tile ) { // upload the relevant elements of fs to f x = ncols + threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; @@ -167,23 +167,23 @@ dev_save_chol_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_block_chol( - const int block, - const int nrows, - const int ncols, + const ipc_ block, + const ipc_ nrows, + const ipc_ ncols, const ELEMENT_TYPE *const a, - const int lda, + const ipc_ lda, ELEMENT_TYPE *const f, - const int ldf, - int *const stat + const ipc_ ldf, + ipc_ *const stat ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int ip; + ipc_ ip; ELEMENT_TYPE v; volatile ELEMENT_TYPE *const work = (volatile ELEMENT_TYPE*)SharedMemory; @@ -210,7 +210,7 @@ dev_block_chol( __syncthreads(); if ((threadIdx.y > ip) && (threadIdx.y < ncols)) { - for (int x = threadIdx.x + TILE_SIZE; x < SIZE_X; x += TILE_SIZE) + for (ipc_ x = threadIdx.x + TILE_SIZE; x < SIZE_X; x += TILE_SIZE) work[x + SIZE_X*threadIdx.y] -= work[threadIdx.y + SIZE_X*ip] * work[x + SIZE_X*ip]; if (threadIdx.x > ip) @@ -231,72 +231,72 @@ dev_block_chol( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_block_chol( - const int nrows, - const int ncols, + const ipc_ nrows, + const ipc_ ncols, const ELEMENT_TYPE *const a, - const int lda, + const ipc_ lda, ELEMENT_TYPE *const f, - const int ldf, - int *const stat + const ipc_ ldf, + ipc_ *const stat ) { dev_block_chol< ELEMENT_TYPE, TILE_SIZE, TILES > ( blockIdx.x, nrows, ncols, a, lda, f, ldf, stat ); } struct multinode_chol_type { - int nrows; - int ncols; - precision_ *lcol; + ipc_ nrows; + ipc_ ncols; + rpc_ *lcol; }; // input data type for multiblock_fact and multiblock_chol // each CUDA block gets a copy struct multiblock_fact_type { - int nrows; // no node's rows - int ncols; // no node's cols - int ld; // node's leading dimension - int p; // no rows above the pivot block - precision_ *aptr; // pointer to this node's A matrix - precision_ *ldptr; // pointer to this node's LD matrix - int offf; // this node's L offset in the array of all Ls - precision_ *dptr; // pointer to this node's D in array of all Ds - int node; // node index - int offb; // the idx of the first CUDA block processing this node + ipc_ nrows; // no node's rows + ipc_ ncols; // no node's cols + ipc_ ld; // node's leading dimension + ipc_ p; // no rows above the pivot block + rpc_ *aptr; // pointer to this node's A matrix + rpc_ *ldptr; // pointer to this node's LD matrix + ipc_ offf; 
// this node's L offset in the array of all Ls + rpc_ *dptr; // pointer to this node's D in array of all Ds + ipc_ node; // node index + ipc_ offb; // the idx of the first CUDA block processing this node }; __global__ void cu_multiblock_fact_setup( struct multinode_fact_type *ndata, struct multiblock_fact_type *const mbfdata, - const int step, - const int block_size, - const int blocks, - const int offb, - int *const stat, - int *const ind, - int *const nl + const ipc_ step, + const ipc_ block_size, + const ipc_ blocks, + const ipc_ offb, + ipc_ *const stat, + ipc_ *const ind, + ipc_ *const nl ) { ndata += blockIdx.x; - const int ncols = ndata->ncols; - const int nrows = ndata->nrows; - precision_ *const lval = ndata->lval; - precision_ *const ldval = ndata->ldval; - precision_ *const dval = ndata->dval; - int ib = ndata->ib; - int jb = ndata->jb; - int done = ndata->done; - int rght = ndata->rght; - const int lbuf = ndata->lbuf; + const ipc_ ncols = ndata->ncols; + const ipc_ nrows = ndata->nrows; + rpc_ *const lval = ndata->lval; + rpc_ *const ldval = ndata->ldval; + rpc_ *const dval = ndata->dval; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; + ipc_ done = ndata->done; + ipc_ rght = ndata->rght; + const ipc_ lbuf = ndata->lbuf; if (jb < ib) return; - const int pivoted = stat[blockIdx.x]; + const ipc_ pivoted = stat[blockIdx.x]; if (pivoted > 0) { done += pivoted; @@ -325,8 +325,8 @@ cu_multiblock_fact_setup( ndata->rght = rght; } - const int rb = nrows - done; - int cb = rght - ib + 1; + const ipc_ rb = nrows - done; + ipc_ cb = rght - ib + 1; if (cb > block_size) cb = block_size; @@ -338,14 +338,14 @@ cu_multiblock_fact_setup( if (ind && (threadIdx.x < cb) && (threadIdx.y == 0)) ind[blockIdx.x*block_size + threadIdx.x] = cb + 1; - int k = (rb - cb - 1)/(block_size*(blocks - 1)) + 1; + ipc_ k = (rb - cb - 1)/(block_size*(blocks - 1)) + 1; - __shared__ volatile int ncb; + __shared__ volatile ipc_ ncb; if ((threadIdx.x == 0) && (threadIdx.y == 0)) ncb = atomicAdd(&nl[0], k); - __shared__ volatile int iwork[9]; - __shared__ precision_ *volatile lptr, *volatile ldptr, *volatile dptr; + __shared__ volatile ipc_ iwork[9]; + __shared__ rpc_ *volatile lptr, *volatile ldptr, *volatile dptr; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { iwork[0] = cb; iwork[1] = rb; @@ -360,7 +360,7 @@ cu_multiblock_fact_setup( } __syncthreads(); - for (int i = threadIdx.y; i < k; i += blockDim.y) { + for (ipc_ i = threadIdx.y; i < k; i += blockDim.y) { switch(threadIdx.x) { case 0: mbfdata[ncb+i].ncols = iwork[0]; break; case 1: mbfdata[ncb+i].nrows = iwork[1]; break; @@ -427,28 +427,28 @@ of size 2*TILE_SIZE, initialized to 0 by this kernel. 
*/ template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_fact( - const unsigned int block, // relative CUDA block number - const int nrows, - const int ncols, - const int offp, + const uipc_ block, // relative CUDA block number + const ipc_ nrows, + const ipc_ ncols, + const ipc_ offp, const ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a volatile ELEMENT_TYPE *const fs, // initial L factor (shared mem) volatile ELEMENT_TYPE *const ds // initial D**(-1) (shared mem) ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int x, y; // position indices + ipc_ x, y; // position indices y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread if ( threadIdx.y < TILE_SIZE ) { - for ( int tile = 0; tile < TILES; tile += 2 ) { + for ( ipc_ tile = 0; tile < TILES; tile += 2 ) { if ( tile ) { // load A_u and A_l's even tiles into shared memory x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; // offdiagonal row index in A @@ -467,7 +467,7 @@ dev_init_fact( } else { // load A_u and A_l's odd tiles into shared memory - for (int tile = 1; tile < TILES; tile += 2) { + for (ipc_ tile = 1; tile < TILES; tile += 2) { x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; if (x >= offp) @@ -486,33 +486,33 @@ dev_init_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_save_fact( - const unsigned int block, - const int nrows, - const int ncols, - const int offp, - const int my, // save only if my is non-zero + const uipc_ block, + const ipc_ nrows, + const ipc_ ncols, + const ipc_ offp, + const ipc_ my, // save only if my is non-zero const volatile ELEMENT_TYPE *const fs, // L (shared mem) const volatile ELEMENT_TYPE *const fds, // L*D (shared mem) const volatile ELEMENT_TYPE *const ds, // 2 diags of D**(-1) (shared mem) ELEMENT_TYPE *const f, // L (global mem) - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // L*D (global mem) - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d // 2 diags of D**(-1) (global mem) ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int x, y; // position indices + ipc_ x, y; // position indices y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread if ( threadIdx.y < TILE_SIZE ) { // warps 0, 1 - for ( int tile = 0; tile < TILES; tile += 2 ) { + for ( ipc_ tile = 0; tile < TILES; tile += 2 ) { if ( tile ) { // upload L_u, L_l, L_u*D and L_l*D's even tiles x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; @@ -540,7 +540,7 @@ dev_save_fact( } // loop through even tiles ends here } else { // upload L_u, L_l, L_u*D and L_l*D's odd tiles (warps 2, 3) - for (int tile = 1; tile < TILES; tile += 2) { + for (ipc_ tile = 1; tile < TILES; tile += 2) { x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; if (x >= offp) // skip L_d @@ -557,20 +557,20 @@ dev_save_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_max( - const int ncols, + const ipc_ ncols, const volatile ELEMENT_TYPE *const fs, - const int mx, // this thread mask - volatile int *const mask, 
// pivot index/mask + const ipc_ mx, // this thread mask + volatile ipc_ *const mask, // pivot index/mask volatile bool *const not_max, // "not largest" flag - volatile int &jps, // the index of the largest element - volatile int &quit // pivoting failure flag + volatile ipc_ &jps, // the index of the largest element + volatile ipc_ &quit // pivoting failure flag ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; if (threadIdx.y == 0) { mask[threadIdx.x] = mx; // initialize the pivot index @@ -592,7 +592,7 @@ dev_init_max( // select the leftmost among the largest elements of the row if ((threadIdx.y == 0) && (not_max[threadIdx.x] == 0)) - atomicMin((int*)&jps, threadIdx.x); // in case of a tie, choose the leftmost + atomicMin((ipc_*)&jps, threadIdx.x); // in case of a tie, choose the leftmost __syncthreads(); } @@ -622,9 +622,9 @@ template< typename ELEMENT_TYPE > __device__ void dev_select_pivots_at_root( const ELEMENT_TYPE *const fs, - const int ld, // leading dimension of fs - int &ip, - int &jp, + const ipc_ ld, // leading dimension of fs + ipc_ &ip, + ipc_ &jp, ELEMENT_TYPE &a11, ELEMENT_TYPE &a12, ELEMENT_TYPE &a22, @@ -659,9 +659,9 @@ template< typename ELEMENT_TYPE > __device__ void dev_select_pivots( const volatile ELEMENT_TYPE *const fs, - const int ld, // leading dimension of fs - int &ip, - int &jp, + const ipc_ ld, // leading dimension of fs + ipc_ &ip, + ipc_ &jp, ELEMENT_TYPE &a11, ELEMENT_TYPE &a12, ELEMENT_TYPE &a22, @@ -697,11 +697,11 @@ dev_select_pivots( template< typename ELEMENT_TYPE > __device__ bool dev_1x1_pivot_fails( - const int x, - const int ip, + const ipc_ x, + const ipc_ ip, volatile ELEMENT_TYPE *const fs, volatile ELEMENT_TYPE *const fds, - const int ld, + const ipc_ ld, const ELEMENT_TYPE det, const ELEMENT_TYPE delta, const ELEMENT_TYPE eps @@ -731,12 +731,12 @@ dev_1x1_pivot_fails( template< typename ELEMENT_TYPE > __device__ bool dev_2x2_pivot_fails( - const int x, - const int ip, - const int jp, + const ipc_ x, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, volatile ELEMENT_TYPE *const fds, - const int ld, + const ipc_ ld, const ELEMENT_TYPE a11, const ELEMENT_TYPE a12, const ELEMENT_TYPE a22, @@ -786,16 +786,16 @@ dev_2x2_pivot_fails( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES // = 7 for a single node and = 11 for many nodes +uipc_ TILE_SIZE, +uipc_ TILES // = 7 for a single node and = 11 for many nodes > __device__ void dev_eliminate_1x1( - int &x, // row for this thread - const int y, // column for this thread - const int ip, // pivoted column + ipc_ &x, // row for this thread + const ipc_ y, // column for this thread + const ipc_ ip, // pivoted column volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE p // pivot value ) { if ( x != ip ) @@ -813,15 +813,15 @@ dev_eliminate_1x1( /* The next function eliminates the two pivoted columns from non-pivoted */ template< typename ELEMENT_TYPE, -unsigned int TILE_SIZE, unsigned int TILES > +uipc_ TILE_SIZE, uipc_ TILES > __device__ void dev_eliminate_2x2( - int &x, - const int y, - const int ip, - const int jp, + ipc_ &x, + const ipc_ y, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE pi, const ELEMENT_TYPE pj ) { @@ -839,15 +839,15 @@ dev_eliminate_2x2( /* The next function performs elimination in one tile only */ -template< typename ELEMENT_TYPE, unsigned int TILE_SIZE > +template< typename ELEMENT_TYPE, uipc_ 
TILE_SIZE > inline __device__ void dev_eliminate( - int &x, - const int y, - const int ip, - const int jp, + ipc_ &x, + const ipc_ y, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE pi, const ELEMENT_TYPE pj ) { @@ -877,40 +877,40 @@ Called by cu_block_ldlt and cu_multiblock_ldlt factorization kernels. */ template< typename ELEMENT_TYPE, -unsigned int TILE_SIZE, unsigned int TILES > +uipc_ TILE_SIZE, uipc_ TILES > __device__ void dev_block_ldlt( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof - const int offp, // number of rows above the pivot block + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof + const ipc_ offp, // number of rows above the pivot block ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a ELEMENT_TYPE *const f, // array of elements of the L factor - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // array of elements of L*D - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d, // array for main diagonal and subdiagonal of D const ELEMENT_TYPE delta, // pivoting threashold const ELEMENT_TYPE eps, // zero pivot threashold - int *const index, // pivot order index - int *const stat // number of successful pivots + ipc_ *const index, // pivot order index + ipc_ *const stat // number of successful pivots ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int ip, jp; // pivot row and col indices - int x, y; // position indices - int mx, my; // masks + ipc_ ip, jp; // pivot row and col indices + ipc_ x, y; // position indices + ipc_ mx, my; // masks ELEMENT_TYPE a11, a12, a22, det; // 2x2 pivot data __shared__ volatile ELEMENT_TYPE fs[SIZE_X*TILE_SIZE]; // work array for f __shared__ volatile ELEMENT_TYPE fds[SIZE_X*TILE_SIZE]; // work array for fd __shared__ volatile ELEMENT_TYPE ds[2*TILE_SIZE]; // work array for d - __shared__ volatile int mask[TILE_SIZE]; // pivot mask/index + __shared__ volatile ipc_ mask[TILE_SIZE]; // pivot mask/index __shared__ volatile bool not_max[TILE_SIZE]; // flag for finding the largest row elm - __shared__ volatile int quit; // failure flag - __shared__ volatile int jps; // pivot column index + __shared__ volatile ipc_ quit; // failure flag + __shared__ volatile ipc_ jps; // pivot column index y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread @@ -924,7 +924,7 @@ dev_block_ldlt( dev_init_max< ELEMENT_TYPE, TILE_SIZE, TILES > ( ncols, fs, mx, mask, not_max, jps, quit ); - for ( int row = 0, pivoted = 0; row < ncols; ) { + for ( ipc_ row = 0, pivoted = 0; row < ncols; ) { // select the pivot based on the row's largest element index jps ip = row; @@ -1065,7 +1065,7 @@ dev_block_ldlt( // select leftmost largest element in the row if ( row < ncols ) { if ( threadIdx.y == 0 && not_max[threadIdx.x] == 0 ) - atomicMin((int*)&jps, threadIdx.x); // in case of a tie, choose the leftmost + atomicMin((ipc_*)&jps, threadIdx.x); // in case of a tie, choose the leftmost } } else { // do elimination in the (TILES)-th tile @@ -1096,26 +1096,26 @@ dev_block_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > 
__global__ void cu_block_ldlt( - const int nrows, // n.o. rows in A - const int ncols, // n.o. cols in A (<= TILE_SIZE) - const int offp, // n.o. rows in A_u + const ipc_ nrows, // n.o. rows in A + const ipc_ ncols, // n.o. cols in A (<= TILE_SIZE) + const ipc_ offp, // n.o. rows in A_u ELEMENT_TYPE *const a, // array of A's elements - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a ELEMENT_TYPE *const f, // array of L's elements - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // array of (L*D)'s elements - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d, // array of D**(-1)'s diagonal and subdiagonal elements const ELEMENT_TYPE delta, // pivoting threshold const ELEMENT_TYPE eps, // zero column threshold: // the column is zeroed if all elements are <= eps - int *const index, // pivot index (cf. permutation matrix P) - int *const stat // n.o. successful pivots + ipc_ *const index, // pivot index (cf. permutation matrix P) + ipc_ *const stat // n.o. successful pivots ) { dev_block_ldlt< ELEMENT_TYPE, TILE_SIZE, TILES > ( blockIdx.x, nrows, ncols, offp, a, lda, f, ldf, @@ -1130,8 +1130,8 @@ cu_block_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_multiblock_ldlt( @@ -1139,28 +1139,28 @@ cu_multiblock_ldlt( ELEMENT_TYPE *f, // same for L const ELEMENT_TYPE delta, // same as in cu_block_fact const ELEMENT_TYPE eps, // same as in cu_block_fact - int *const index, // array of all pivot indices - int *const stat // array of successful pivots' numbers + ipc_ *const index, // array of all pivot indices + ipc_ *const stat // array of successful pivots' numbers ) { /* * Read information on what to do from global memory */ mbfdata += blockIdx.x; // shift to the data for this CUDA block - int ncols = mbfdata->ncols; // n.o. cols in A processed by this CUDA block + ipc_ ncols = mbfdata->ncols; // n.o. cols in A processed by this CUDA block if ( ncols < 1 ) return; - int nrows = mbfdata->nrows; // n.o. rows in A - int lda = mbfdata->ld; // leading dimension of A - int p = mbfdata->p; // n.o. rows in A_u - int node = mbfdata->node; // A's number - int block = mbfdata->offb; // relative CUDA block index + ipc_ nrows = mbfdata->nrows; // n.o. rows in A + ipc_ lda = mbfdata->ld; // leading dimension of A + ipc_ p = mbfdata->p; // n.o. 
rows in A_u + ipc_ node = mbfdata->node; // A's number + ipc_ block = mbfdata->offb; // relative CUDA block index f += mbfdata->offf; // shift to the array of this L elements - precision_ *fd = mbfdata->ldptr; - precision_ *a = mbfdata->aptr; // pointer to A - precision_ *d = mbfdata->dptr; // pointer to D**(-1) + rpc_ *fd = mbfdata->ldptr; + rpc_ *a = mbfdata->aptr; // pointer to A + rpc_ *d = mbfdata->dptr; // pointer to D**(-1) - dev_block_ldlt < precision_, TILE_SIZE, TILES > + dev_block_ldlt < rpc_, TILE_SIZE, TILES > ( block, nrows, ncols, p, a, lda, f, lda, fd, lda, d, delta, eps, &index[node*TILE_SIZE], &stat[node]); } @@ -1191,26 +1191,26 @@ cu_multiblock_ldlt( template< typename ELEMENT_TYPE > __global__ void cu_square_ldlt( - const int n, + const ipc_ n, ELEMENT_TYPE *const a, // A on input, L on output ELEMENT_TYPE *const f, // L ELEMENT_TYPE *const w, // L*D ELEMENT_TYPE *const d, // main diag and subdiag of the inverse of D - const int ld, // leading dimension of a, f, w + const ipc_ ld, // leading dimension of a, f, w const ELEMENT_TYPE delta, // same as above const ELEMENT_TYPE eps, // same as above - int *const ind, // same as in cu_block_fact - int *const stat // same as in cu_block_fact + ipc_ *const ind, // same as in cu_block_fact + ipc_ *const stat // same as in cu_block_fact ) { - int x, y; - int col; - int ip, jp; - int pivoted, recent; + ipc_ x, y; + ipc_ col; + ipc_ ip, jp; + ipc_ pivoted, recent; ELEMENT_TYPE a11, a12, a22, det; volatile ELEMENT_TYPE *work = (volatile ELEMENT_TYPE*)SharedMemory; // work array - volatile int *const iwork = (volatile int*)&(work[blockDim.x]); // integer work array - volatile int *const iw = (volatile int*)&(iwork[blockDim.x]); // iw[0]: failure flag, + volatile ipc_ *const iwork = (volatile ipc_*)&(work[blockDim.x]); // integer work array + volatile ipc_ *const iw = (volatile ipc_*)&(iwork[blockDim.x]); // iw[0]: failure flag, // iw[1]: largest col. elem. index for ( x = threadIdx.x; x < n; x += blockDim.x ) { @@ -1224,7 +1224,7 @@ cu_square_ldlt( pivoted = 0; // n.o. pivoted cols - for ( int pass = 0; ; pass++ ) { // failed cols are skipped until next pass + for ( ipc_ pass = 0; ; pass++ ) { // failed cols are skipped until next pass recent = 0; // n.o. 
cols pivoted during this pass @@ -1388,26 +1388,26 @@ cu_square_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_multiblock_chol( struct multiblock_fact_type *mbfdata, ELEMENT_TYPE *f, // array of L nodes - int *stat // execution status + ipc_ *stat // execution status ) { /* * Read information on what to do from global memory */ mbfdata += blockIdx.x; - int ncols = mbfdata->ncols; + ipc_ ncols = mbfdata->ncols; if ( ncols < 1 ) return; - int nrows = mbfdata->nrows; - int ld = mbfdata->ld; - int node = mbfdata->node; - int block = mbfdata->offb; + ipc_ nrows = mbfdata->nrows; + ipc_ ld = mbfdata->ld; + ipc_ node = mbfdata->node; + ipc_ block = mbfdata->offb; ELEMENT_TYPE *const a = mbfdata->aptr; f += mbfdata->offf; @@ -1417,8 +1417,8 @@ cu_multiblock_chol( } struct cstat_data_type { - int nelim; - precision_ *dval; + ipc_ nelim; + rpc_ *dval; }; __global__ void @@ -1428,16 +1428,16 @@ cu_collect_stats( ) { // Designed to be run with a single thread csdata += blockIdx.x; - precision_ *const d = csdata->dval; - const int nelim = csdata->nelim; + rpc_ *const d = csdata->dval; + const ipc_ nelim = csdata->nelim; - int num_zero = 0; - int num_neg = 0; - int num_two = 0; + ipc_ num_zero = 0; + ipc_ num_neg = 0; + ipc_ num_two = 0; - for (int i = 0; i < nelim; ) { - const precision_ a11 = d[2*i]; - const precision_ a21 = d[2*i + 1]; + for (ipc_ i = 0; i < nelim; ) { + const rpc_ a11 = d[2*i]; + const rpc_ a21 = d[2*i + 1]; if ( a21 == 0.0 ) { // 1x1 pivot (can be a zero pivot) if ( a11 == 0 ) @@ -1448,15 +1448,15 @@ cu_collect_stats( } else { // 2x2 pivot (can't be a zero pivot) - const precision_ a22 = d[2*(i + 1)]; + const rpc_ a22 = d[2*(i + 1)]; num_two++; // To check for negative eigenvalues, we exploit // det = product of evals // trace = sum of evals // if det is negative, exactly one eval is negative; // otherwise, both have same sign, equal to sign of trace - const precision_ det = a11*a22 - a21*a21; - const precision_ trace = a11 + a22; + const rpc_ det = a11*a22 - a21*a21; + const rpc_ trace = a11 + a22; if ( det < 0 ) num_neg++; else if ( trace < 0 ) @@ -1482,64 +1482,64 @@ cu_collect_stats( extern "C" { void spral_ssids_block_ldlt( - cudaStream_t *stream, int nrows, int ncols, int p, - precision_* a, int lda, - precision_* f, int ldf, - precision_* fd, int ldfd, - precision_* d, - precision_ delta, precision_ eps, - int* index, int* stat + cudaStream_t *stream, ipc_ nrows, ipc_ ncols, ipc_ p, + rpc_* a, ipc_ lda, + rpc_* f, ipc_ ldf, + rpc_* fd, ipc_ ldfd, + rpc_* d, + rpc_ delta, rpc_ eps, + ipc_* index, ipc_* stat ) { - int nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(BLOCKS - 1)) + 1; + ipc_ nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(BLOCKS - 1)) + 1; cu_block_ldlt_init<<< 1, BLOCK_SIZE, 0, *stream >>>( ncols, stat, index ); dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); cu_block_ldlt - < precision_, BLOCK_SIZE, BLOCKS > + < rpc_, BLOCK_SIZE, BLOCKS > <<< nblocks, threads, 0, *stream >>> ( nrows, ncols, p, a, lda, f, ldf, fd, ldfd, d, delta, eps, index, stat ); } -void spral_ssids_block_llt( cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* f, int ldf, int* stat ) { - int smsize = CBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(precision_); - int nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(CBLOCKS - 1)) + 1; +void spral_ssids_block_llt( cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* f, ipc_ ldf, ipc_* stat ) { + ipc_ smsize = 
CBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(rpc_); + ipc_ nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(CBLOCKS - 1)) + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); cu_block_chol - < precision_, BLOCK_SIZE, CBLOCKS > + < rpc_, BLOCK_SIZE, CBLOCKS > <<< nblocks, threads, smsize, *stream >>> ( nrows, ncols, a, lda, f, ldf, stat ); } -void spral_ssids_collect_stats(cudaStream_t *stream, int nblk, +void spral_ssids_collect_stats(cudaStream_t *stream, ipc_ nblk, const struct cstat_data_type *csdata, struct cuda_stats *stats) { - for(int i=0; i>> (csdata+i, stats); CudaCheckError(); } } -void spral_ssids_multiblock_ldlt( cudaStream_t *stream, int nblocks, - struct multiblock_fact_type *mbfdata, precision_* f, precision_ delta, - precision_ eps, int* index, int* stat ) { +void spral_ssids_multiblock_ldlt( cudaStream_t *stream, ipc_ nblocks, + struct multiblock_fact_type *mbfdata, rpc_* f, rpc_ delta, + rpc_ eps, ipc_* index, ipc_* stat ) { dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_ldlt - < precision_, BLOCK_SIZE, MBLOCKS > + < rpc_, BLOCK_SIZE, MBLOCKS > <<< nb, threads, 0, *stream >>> ( mbfdata + i, f, delta, eps, index, stat ); } } -void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, int nblocks, +void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, ipc_ nblocks, struct multinode_fact_type *ndata, struct multiblock_fact_type *mbfdata, - int step, int block_size, int blocks, int* stat, int* ind, int* ncb ) { + ipc_ step, ipc_ block_size, ipc_ blocks, ipc_* stat, ipc_* ind, ipc_* ncb ) { dim3 threads(10,8); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_fact_setup <<< nb, threads, 0, *stream >>> ( ndata + i, mbfdata, step, block_size, blocks, @@ -1547,28 +1547,28 @@ void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, int nblocks, } } -void spral_ssids_multiblock_llt( cudaStream_t *stream, int nblocks, - struct multiblock_fact_type *mbfdata, precision_* f, int* stat ) { +void spral_ssids_multiblock_llt( cudaStream_t *stream, ipc_ nblocks, + struct multiblock_fact_type *mbfdata, rpc_* f, ipc_* stat ) { if ( nblocks < 1 ) return; - int smsize = MCBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(precision_); + ipc_ smsize = MCBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(rpc_); dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_chol - < precision_, BLOCK_SIZE, MCBLOCKS > + < rpc_, BLOCK_SIZE, MCBLOCKS > <<< nb, threads, smsize, *stream >>> ( mbfdata + i, f, stat ); } } -void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, int nblocks, +void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, ipc_ nblocks, struct multinode_fact_type *ndata, struct multiblock_fact_type *mbfdata, - int step, int block_size, int blocks, int* stat, int* ncb ) { + ipc_ step, ipc_ block_size, ipc_ blocks, ipc_* stat, ipc_* ncb ) { dim3 threads(16,8); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = 
min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_fact_setup <<< nb, threads, 0, *stream >>> ( ndata + i, mbfdata, step, block_size, blocks, i, stat + i, 0, ncb ); @@ -1577,20 +1577,20 @@ void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, int nblocks, void spral_ssids_square_ldlt( cudaStream_t *stream, - int n, - precision_* a, - precision_* f, - precision_* w, - precision_* d, - int ld, - precision_ delta, precision_ eps, - int* index, - int* stat + ipc_ n, + rpc_* a, + rpc_* f, + rpc_* w, + rpc_* d, + ipc_ ld, + rpc_ delta, rpc_ eps, + ipc_* index, + ipc_* stat ) { - int nt = min(n, 256); - int sm = nt*sizeof(precision_) + (nt + 2)*sizeof(int); - cu_square_ldlt< precision_ ><<< 1, nt, sm, *stream >>> + ipc_ nt = min(n, 256); + ipc_ sm = nt*sizeof(rpc_) + (nt + 2)*sizeof(ipc_); + cu_square_ldlt< rpc_ ><<< 1, nt, sm, *stream >>> ( n, a, f, w, d, ld, delta, eps, index, stat ); } diff --git a/src/ssids/fkeep.F90 b/src/ssids/fkeep.F90 index 11f89a72fe..af79d56b73 100644 --- a/src/ssids/fkeep.F90 +++ b/src/ssids/fkeep.F90 @@ -75,7 +75,6 @@ subroutine inner_factor_cpu(fkeep, akeep, val, options, inform) logical :: abort, all_region type(contrib_type), dimension(:), allocatable :: child_contrib type(ssids_inform), dimension(:), allocatable :: thread_inform - #ifdef PROFILE ! Begin profile trace (noop if not enabled) call profile_begin(akeep%topology) diff --git a/src/ssids/ldlt_app.cxx b/src/ssids/ldlt_app.cxx index bcab9225aa..8654f3391c 100644 --- a/src/ssids/ldlt_app.cxx +++ b/src/ssids/ldlt_app.cxx @@ -1,7 +1,6 @@ -/** \file - * \copyright 2016 The Science and Technology Facilities Council (STFC) - * \licence BSD licence, see LICENCE file for details - * \author Jonathan Hogg +/** \file \copyright 2016 The Science and Technology Facilities Council + * (STFC) \licence BSD licence, see LICENCE file for details \author + * Jonathan Hogg \version GALAHAD 4.3 - 2024-02-03 AT 61:10 GMT */ #include "ssids_cpu_kernels_ldlt_app.hxx" @@ -34,16 +33,14 @@ #include "ssids_cpu_kernels_wrappers.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_app_internal ldlt_app_internal_sgl #define ldlt_app_factor_mem_required ldlt_app_factor_mem_required_sgl #else -#define precision_ double #define ldlt_app_internal ldlt_app_internal_dbl #define ldlt_app_factor_mem_required ldlt_app_factor_mem_required_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define host_trsv host_trsv_64 #define host_trsm host_trsm_64 @@ -54,15 +51,15 @@ namespace spral { namespace ssids { namespace cpu { namespace ldlt_app_internal { -static const int INNER_BLOCK_SIZE = 32; +static const ipc_ INNER_BLOCK_SIZE = 32; /** \return number of blocks for given n */ -inline int calc_nblk(int n, int block_size) { +inline ipc_ calc_nblk(ipc_ n, ipc_ block_size) { return (n-1) / block_size + 1; } /** \return block size of block blk if maximum in dimension is n */ -inline int calc_blkn(int blk, int n, int block_size) { +inline ipc_ calc_blkn(ipc_ blk, ipc_ n, ipc_ block_size) { return std::min(block_size, n-blk*block_size); } @@ -74,7 +71,7 @@ template class Column { public: bool first_elim; ///< True if first column with eliminations - int nelim; ///< Number of eliminated entries in this column + ipc_ nelim; ///< Number of eliminated entries in this column T *d; ///< Pointer to local d // \{ @@ -86,7 +83,7 @@ class Column { /** \brief Initialize number of passed columns ready for reduction * \param passed number of variables passing a posteori pivot test in block */ - void init_passed(int 
passed) { + void init_passed(ipc_ passed) { spral::omp::AcquiredLock scopeLock(lock_); npass_ = passed; } @@ -94,7 +91,7 @@ class Column { * \details Aquires a lock before doing a minimum reduction across blocks * \param passed number of variables passing a posteori pivot test in block */ - void update_passed(int passed) { + void update_passed(ipc_ passed) { spral::omp::AcquiredLock scopeLock(lock_); npass_ = std::min(npass_, passed); } @@ -107,7 +104,7 @@ class Column { * sucessful columns in the case of a global cancellation. * \param passed number of pivots that succeeded for a block * \returns true if passed < nelim */ - bool test_fail(int passed) { + bool test_fail(ipc_ passed) { bool fail = (passed < nelim); if(!fail) { // Record number of blocks in column passing this test @@ -125,7 +122,7 @@ class Column { * a variable, and sets nelim for this column. * \param next_elim global number of eliminated pivots to be updated based * on number eliminated in this column. */ - void adjust(int& next_elim) { + void adjust(ipc_& next_elim) { // Test if last passed column was first part of a 2x2: if so, // decrement npass spral::omp::AcquiredLock scopeLock(lock_); @@ -156,25 +153,25 @@ class Column { * \internal Note that there is no need to consider a similar operation for * d[] as it is only used for eliminated variables. */ - void move_back(int n, int const* perm, int* elim_perm, int* failed_perm) { + void move_back(ipc_ n, ipc_ const* perm, ipc_* elim_perm, ipc_* failed_perm) { if(perm != elim_perm) { // Don't move if memory is identical - for(int i=0; i& operator[](int idx) { return cdata_[idx]; } + Column& operator[](ipc_ idx) { return cdata_[idx]; } /** \brief Return local permutation pointer for given column * \param blk block column * \return pointer to local permutation */ - int* get_lperm(int blk) { return &lperm_[blk*block_size_]; } + ipc_* get_lperm(ipc_ blk) { return &lperm_[blk*block_size_]; } /** \brief Calculate number of eliminated columns in unpivoted case * \param m number of rows in matrix * \return number of sucesfully eliminated columns */ - int calc_nelim(int m) const { - int mblk = calc_nblk(m, block_size_); - int nblk = calc_nblk(n_, block_size_); - int nelim = 0; - for(int j=0; j *cdata_; ///< underlying array of columns - int* lperm_; ///< underlying local permutation + ipc_* lperm_; ///< underlying local permutation }; /** Returns true if ptr is suitably aligned for AVX, false if not */ bool is_aligned(void* ptr) { #if defined(__AVX512F__) - const int align = 64; + const ipc_ align = 64; #elif defined(__AVX__) - const int align = 32; + const ipc_ align = 32; #else - const int align = 16; + const ipc_ align = 16; #endif return (reinterpret_cast(ptr) % align == 0); } @@ -271,10 +268,10 @@ bool is_aligned(void* ptr) { * within diagonal block. * Note that out and aval may overlap. 
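Editorial aside, not part of the patch: like the CUDA sources earlier in this diff, this file drops its local precision_ macro and relies on the shared ipc_/uipc_/longc_/rpc_ aliases, presumably supplied by the ssids_rip.hxx header that the other files now include. The patch never shows their definitions; a minimal sketch of what such a header would provide, with every detail below an assumption, is:

#include <cstdint>

// Assumed sketch only -- the real ssids_rip.hxx may differ.
#ifdef INTEGER_64
typedef int64_t      ipc_;    // integers that follow the 64-bit API
typedef uint64_t     uipc_;   // unsigned counterpart
#else
typedef int          ipc_;    // default 32-bit integer API
typedef unsigned int uipc_;
#endif
typedef int64_t      longc_;  // replaces C/C++ "long" (assumed 64-bit in both builds)

#ifdef SPRAL_SINGLE
typedef float        rpc_;    // working real precision, single-precision build
#else
typedef double       rpc_;    // working real precision, double-precision build
#endif

This matches how the substitutions read throughout the patch: int becomes ipc_, unsigned int becomes uipc_, long becomes longc_, and the old precision_ (float or double depending on SPRAL_SINGLE) becomes rpc_.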
*/ template -void move_up_diag(Column const& idata, Column const& jdata, T* out, T const* aval, int lda) { +void move_up_diag(Column const& idata, Column const& jdata, T* out, T const* aval, ipc_ lda) { if(out == aval) return; // don't bother moving if memory is the same - for(int j=0; j -void move_up_rect(int m, int rfrom, Column const& jdata, T* out, T const* aval, int lda) { +void move_up_rect(ipc_ m, ipc_ rfrom, Column const& jdata, T* out, T const* aval, ipc_ lda) { if(out == aval) return; // don't bother moving if memory is the same - for(int j=0; j -void copy_failed_diag(int m, int n, Column const& idata, Column const& jdata, T* rout, T* cout, T* dout, int ldout, T const* aval, int lda) { +void copy_failed_diag(ipc_ m, ipc_ n, Column const& idata, Column const& jdata, T* rout, T* cout, T* dout, ipc_ ldout, T const* aval, ipc_ lda) { /* copy rows */ - for(int j=0; j -void copy_failed_rect(int m, int n, int rfrom, Column const& jdata, T* cout, int ldout, T const* aval, int lda) { - for(int j=jdata.nelim, jout=0; j -int check_threshold(int rfrom, int rto, int cfrom, int cto, T u, T* aval, int lda) { +ipc_ check_threshold(ipc_ rfrom, ipc_ rto, ipc_ cfrom, ipc_ cto, T u, T* aval, ipc_ lda) { // Perform threshold test for each uneliminated row/column - int least_fail = (op==OP_N) ? cto : rto; - for(int j=cfrom; j 1.0/u) { if(op==OP_N) { // must be least failed col @@ -345,24 +342,24 @@ int check_threshold(int rfrom, int rto, int cfrom, int cto, T u, T* aval, int ld * 1x1 ( 0 ) stored as d = [ 0.0 0.0 ] */ template -void apply_pivot(int m, int n, int from, const T *diag, const T *d, - const T small, T* aval, int lda) { +void apply_pivot(ipc_ m, ipc_ n, ipc_ from, const T *diag, const T *d, + const T small, T* aval, ipc_ lda) { if(op==OP_N && from > m) return; // no-op if(op==OP_T && from > n) return; // no-op - precision_ one_val = 1.0; + rpc_ one_val = 1.0; if(op==OP_N) { // Perform solve L_11^-T host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_UNIT, m, n, one_val, diag, lda, aval, lda); // Perform solve L_21 D^-1 - for(int i=0; i(SIDE_LEFT, FILL_MODE_LWR, OP_N, DIAG_UNIT, m, n-from, one_val, diag, lda, &aval[from*lda], lda); // Perform solve D^-T L_21^T - for(int i=0; i(m_)), acopy_(alloc_.allocate(n_*ldcopy_)) @@ -484,7 +481,7 @@ class CopyBackup { * \param iblk row index of block. * \param jblk column index of block. */ - void release(int iblk, int jblk) { /* no-op */ } + void release(ipc_ iblk, ipc_ jblk) { /* no-op */ } /** \brief Create a restore point for the given block. * \param iblk row index of block. @@ -492,10 +489,10 @@ class CopyBackup { * \param aval pointer to block to be stored. * \param lda leading dimension of aval. */ - void create_restore_point(int iblk, int jblk, T const* aval, int lda) { + void create_restore_point(ipc_ iblk, ipc_ jblk, T const* aval, ipc_ lda) { T* lwork = get_lwork(iblk, jblk); - for(int j=0; jc) ? lwork[c*ldcopy_+r] : lwork[r*ldcopy_+c]; } - for(int i=get_ncol(jblk); ic) ? lwork[c*block_size_+r] : lwork[r*block_size_+c]; } - for(int i=get_ncol(jblk); i pool_; ///< pool of blocks std::vector ptr_; ///< map from pointer matrix entry to block }; template +template class Block { public: /** \brief Constuctor. @@ -808,8 +805,8 @@ class Block { * \param lda Leading dimension of a. * \param block_size The block size. 
*/ - Block(int i, int j, int m, int n, ColumnData& cdata, T* a, - int lda, int block_size) + Block(ipc_ i, ipc_ j, ipc_ m, ipc_ n, ColumnData& cdata, T* a, + ipc_ lda, ipc_ block_size) : i_(i), j_(j), m_(m), n_(n), lda_(lda), block_size_(block_size), cdata_(cdata), aval_(&a[j*block_size*lda+i*block_size]) {} @@ -838,19 +835,19 @@ class Block { * \param work Thread-specific workspace. */ void apply_rperm(Workspace& work) { - int ldl = align_lda(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(i_); + ipc_* lperm = cdata_.get_lperm(i_); // Copy into lwork with permutation - for(int j=0; j(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(i_); + ipc_* lperm = cdata_.get_lperm(i_); // Copy into lwork with permutation - for(int j=0; j(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(j_); + ipc_* lperm = cdata_.get_lperm(j_); // Copy into lwork with permutation - for(int j=0; j - void restore_if_required(Backup& backup, int elim_col) { + void restore_if_required(Backup& backup, ipc_ elim_col) { if(i_ == elim_col && j_ == elim_col) { // In eliminated diagonal block if(cdata_[i_].nelim < ncol()) { // If there are failed pivots backup.restore_part_with_sym_perm( @@ -945,7 +942,7 @@ class Block { } else if(j_ == elim_col) { // In eliminated col if(cdata_[j_].nelim < ncol()) { // If there are failed pivots - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; + ipc_ rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; backup.restore_part(i_, j_, rfrom, cdata_[j_].nelim, aval_, lda_); } // Release resources regardless, no longer required @@ -978,13 +975,13 @@ class Block { * LDLT::factor(). */ template - int factor(int next_elim, int* perm, T* d, + ipc_ factor(ipc_ next_elim, ipc_* perm, T* d, struct cpu_factor_options const &options, std::vector& work, Allocator const& alloc) { if(i_ != j_) throw std::runtime_error("factor called on non-diagonal block!"); - int* lperm = cdata_.get_lperm(i_); - for(int i=0; i(ncol()); - int* blkperm = &perm[i_*block_size_]; - for(int i=0; i(ncol()); + ipc_* blkperm = &perm[i_*block_size_]; + for(ipc_ i=0; i(ncol()); - int* blkperm = &perm[i_*INNER_BLOCK_SIZE]; - for(int i=0; i(ncol()); + ipc_* blkperm = &perm[i_*INNER_BLOCK_SIZE]; + for(ipc_ i=0; i( INNER_BLOCK_SIZE*INNER_BLOCK_SIZE ); @@ -1057,7 +1054,7 @@ class Block { * \param small The drop tolerance for zero testing. * \returns Number of successful pivots in this block. */ - int apply_pivot_app(Block const& dblk, T u, T small) { + ipc_ apply_pivot_app(Block const& dblk, T u, T small) { if(i_ == j_) throw std::runtime_error("apply_pivot called on diagonal block!"); if(i_ == dblk.i_) { // Apply within row (ApplyT) @@ -1098,16 +1095,16 @@ class Block { * \param ldupd Leading dimension of upd. */ void update(Block const& isrc, Block const& jsrc, Workspace& work, - precision_ beta=1.0, T* upd=nullptr, int ldupd=0) { + rpc_ beta=1.0, T* upd=nullptr, ipc_ ldupd=0) { if(isrc.i_ == i_ && isrc.j_ == jsrc.j_) { // Update to right of elim column (UpdateN) - int elim_col = isrc.j_; + ipc_ elim_col = isrc.j_; if(cdata_[elim_col].nelim == 0) return; // nothing to do - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; - int cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; - int ldld = align_lda(block_size_); - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + ipc_ rfrom = (i_ <= elim_col) ? 
cdata_[i_].nelim : 0; + ipc_ cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; + ipc_ ldld = align_lda(block_size_); + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; T* ld = work.get_ptr(block_size_*ldld); // NB: we use ld[rfrom] below so alignment matches that of aval[rfrom] calcLD( @@ -1121,7 +1118,7 @@ class Block { ); if(upd && j_==calc_nblk(n_,block_size_)-1) { // Handle fractional part of upd that "belongs" to this block - int u_ncol = std::min(block_size_-ncol(), m_-n_); // ncol for upd + ipc_ u_ncol = std::min(block_size_-ncol(), m_-n_); // ncol for upd beta = (cdata_[elim_col].first_elim) ? beta : 1.0; // user beta only on first update if(i_ == j_) { // diagonal block @@ -1144,11 +1141,11 @@ class Block { } } else { // Update to left of elim column (UpdateT) - int elim_col = jsrc.i_; + ipc_ elim_col = jsrc.i_; if(cdata_[elim_col].nelim == 0) return; // nothing to do - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; - int cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; - int ldld = align_lda(block_size_); + ipc_ rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; + ipc_ cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; + ipc_ ldld = align_lda(block_size_); T* ld = work.get_ptr(block_size_*ldld); // NB: we use ld[rfrom] below so alignment matches that of aval[rfrom] if(isrc.j_==elim_col) { @@ -1164,8 +1161,8 @@ class Block { cdata_[elim_col].d, &ld[rfrom], ldld ); } - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_gemm( OP_N, OP_N, nrow()-rfrom, ncol()-cfrom, cdata_[elim_col].nelim, minus_one_val, &ld[rfrom], ldld, &jsrc.aval_[cfrom*lda_], lda_, @@ -1189,9 +1186,9 @@ class Block { * \param upd_ij pointer to \f$ U_{ij} \f$ values to be updated. * \param ldupd leading dimension of upd_ij. */ - void form_contrib(Block const& isrc, Block const& jsrc, Workspace& work, precision_ beta, T* upd_ij, int ldupd) { - int elim_col = isrc.j_; - int ldld = align_lda(block_size_); + void form_contrib(Block const& isrc, Block const& jsrc, Workspace& work, rpc_ beta, T* upd_ij, ipc_ ldupd) { + ipc_ elim_col = isrc.j_; + ipc_ ldld = align_lda(block_size_); T* ld = work.get_ptr(block_size_*ldld); calcLD( nrow(), cdata_[elim_col].nelim, isrc.aval_, lda_, @@ -1199,8 +1196,8 @@ class Block { ); // User-supplied beta only on first update; otherwise 1.0 T rbeta = (cdata_[elim_col].first_elim) ? beta : 1.0; - int blkn = get_nrow(j_); // nrow not ncol as we're on contrib - precision_ minus_one_val = - 1.0; + ipc_ blkn = get_nrow(j_); // nrow not ncol as we're on contrib + rpc_ minus_one_val = - 1.0; host_gemm( OP_N, OP_T, nrow(), blkn, cdata_[elim_col].nelim, minus_one_val, ld, ldld, jsrc.aval_, lda_, @@ -1213,11 +1210,11 @@ class Block { * for elimination. Entries in that block row/column marked as * failed are ignored. */ - bool isnan(int elim_col=-1) const { - int m = (i_==elim_col) ? cdata_[i_].get_npass() : nrow(); - int n = (j_==elim_col) ? cdata_[j_].get_npass() : ncol(); - for(int j=0; j& cdata_; ///< global column data array T* aval_; ///< pointer to underlying matrix storage }; @@ -1278,7 +1275,7 @@ class Block { * \tparam Allocator allocator to use for internal memory allocations */ template class LDLT { /// \{ - typedef typename std::allocator_traits::template rebind_alloc IntAlloc; + typedef typename std::allocator_traits::template rebind_alloc IntAlloc; typedef typename std::allocator_traits::template rebind_alloc TAlloc; /// \} private: /** Performs LDL^T factorization with block pivoting. 
Detects failure * and aborts only column if an a posteori pivot test fails. */ static - int run_elim_pivoted(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - struct cpu_factor_options const& options, int const block_size, - T const beta, T* upd, int const ldupd, std::vector& work, - Allocator const& alloc, int const from_blk=0) { + ipc_ run_elim_pivoted(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + struct cpu_factor_options const& options, ipc_ const block_size, + T const beta, T* upd, ipc_ const ldupd, std::vector& work, + Allocator const& alloc, ipc_ const from_blk=0) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY PIV %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = from_blk*block_size; - int flag; + ipc_ next_elim = from_blk*block_size; + ipc_ flag; #pragma omp atomic write flag = 0; @@ -1316,7 +1313,7 @@ class LDLT { abort = false; #pragma omp taskgroup - for (int blk = from_blk; blk < nblk; blk++) { + for (ipc_ blk = from_blk; blk < nblk; blk++) { /*if(debug) { printf("Bcol %d:\n", blk); print_mat(mblk, nblk, m, n, blkdata, cdata, lda); @@ -1339,12 +1336,16 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_DIAG"); #endif +#ifdef INTEGER_64 + if (debug) printf("Factor(%ld)\n", blk); +#else if (debug) printf("Factor(%d)\n", blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); // Store a copy for recovery in case of a failed column dblk.backup(backup); // Perform actual factorization - int nelim = dblk.template factor(next_elim, perm, d, + ipc_ nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); if (nelim < 0) { #pragma omp atomic write @@ -1387,7 +1388,7 @@ class LDLT { } } /* task/abort */ // Loop over off-diagonal blocks applying pivot - for(int jblk = 0; jblk < blk; jblk++) { + for(ipc_ jblk = 0; jblk < blk; jblk++) { #pragma omp task \ firstprivate(blk, jblk) \ shared(a, abort, backup, cdata, options) \ @@ -1403,7 +1404,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyT(%ld,%ld)\n", blk, jblk); +#else if (debug) printf("ApplyT(%d,%d)\n", blk, jblk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec cblk(blk, jblk, m, n, cdata, a, lda, block_size); // Apply row permutation from factorization of dblk and in @@ -1412,7 +1417,7 @@ class LDLT { cblk.apply_rperm_and_backup(backup); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = cblk.apply_pivot_app(dblk, options.u, + ipc_ blkpass = cblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count cdata[blk].update_passed(blkpass); @@ -1421,7 +1426,7 @@ class LDLT { #endif } } /* task/abort */ } - for (int iblk = blk + 1; iblk < mblk; iblk++) { + for (ipc_ iblk = blk + 1; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk) \ shared(a, abort, backup, cdata, options) \ @@ -1437,7 +1442,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyN(%ld,%ld)\n", iblk, blk); +#else if (debug) printf("ApplyN(%d,%d)\n", iblk, blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); 
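// NB: when INTEGER_64 is defined, block indices such as blk and jblk are 64-bit
// (long) values, which is why the debug printf calls introduced above switch to
// the %ld format under that symbol; passing a 64-bit value to a plain %d
// conversion would be undefined behaviour, and %d is kept only for the default
// 32-bit integer build.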
BlockSpec rblk(iblk, blk, m, n, cdata, a, lda, block_size); // Apply column permutation from factorization of dblk and in @@ -1446,7 +1455,7 @@ class LDLT { rblk.apply_cperm_and_backup(backup); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = rblk.apply_pivot_app(dblk, options.u, + ipc_ blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count cdata[blk].update_passed(blkpass); @@ -1471,7 +1480,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_ADJUST"); #endif +#ifdef INTEGER_64 + if (debug) printf("Adjust(%ld)\n", blk); +#else if (debug) printf("Adjust(%d)\n", blk); +#endif cdata[blk].adjust(next_elim); #ifdef PROFILE task.done(); @@ -1479,11 +1492,11 @@ class LDLT { } } /* task/abort */ // Update uneliminated columns - for (int jblk = 0; jblk < blk; jblk++) { - for (int iblk = jblk; iblk < mblk; iblk++) { + for (ipc_ jblk = 0; jblk < blk; jblk++) { + for (ipc_ iblk = jblk; iblk < mblk; iblk++) { // Calculate block index we depend on for i // (we only work with lower half of matrix) - int adep_idx = (blk < iblk) ? blk*block_size*lda + iblk*block_size + ipc_ adep_idx = (blk < iblk) ? blk*block_size*lda + iblk*block_size : iblk*block_size*lda + blk*block_size; #pragma omp task \ firstprivate(blk, iblk, jblk) \ @@ -1501,11 +1514,15 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateT(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateT(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); - int isrc_row = (blk<=iblk) ? iblk : blk; - int isrc_col = (blk<=iblk) ? blk : iblk; + ipc_ isrc_row = (blk<=iblk) ? iblk : blk; + ipc_ isrc_col = (blk<=iblk) ? 
blk : iblk; BlockSpec isrc(isrc_row, isrc_col, m, n, cdata, a, lda, block_size); BlockSpec jsrc(blk, jblk, m, n, cdata, a, lda, block_size); @@ -1520,8 +1537,8 @@ class LDLT { } } /* task/abort */ } } - for(int jblk = blk; jblk < nblk; jblk++) { - for(int iblk = jblk; iblk < mblk; iblk++) { + for(ipc_ jblk = blk; jblk < nblk; jblk++) { + for(ipc_ iblk = jblk; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk, jblk) \ shared(a, abort, cdata, backup, work, upd) \ @@ -1538,8 +1555,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateN(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateN(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1557,10 +1578,10 @@ class LDLT { // Handle update to contribution block, if required if (upd && (mblk > nblk)) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk = nblk; jblk < mblk; ++jblk) - for(int iblk = jblk; iblk < mblk; ++iblk) { + for(ipc_ jblk = nblk; jblk < mblk; ++jblk) + for(ipc_ iblk = jblk; iblk < mblk; ++iblk) { T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; #pragma omp task \ firstprivate(iblk, jblk, blk, upd_ij) \ @@ -1578,8 +1599,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDC"); #endif +#ifdef INTEGER_64 + if (debug) printf("FormContrib(%ld,%ld,%ld)\n", iblk,jblk,blk); +#else if (debug) printf("FormContrib(%d,%d,%d)\n", iblk,jblk,blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1592,7 +1617,7 @@ class LDLT { } } } // taskgroup and for - int my_flag; + ipc_ my_flag; #pragma omp atomic read my_flag = flag; if (my_flag < 0) return my_flag; // Error @@ -1609,23 +1634,23 @@ class LDLT { * and aborts only column if an a posteori pivot test fails. * Serial version without tasks. 
*/ static - int run_elim_pivoted_notasks(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - struct cpu_factor_options const& options, int const block_size, - T const beta, T* upd, int const ldupd, std::vector& work, - Allocator const& alloc, int const from_blk=0) { + ipc_ run_elim_pivoted_notasks(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + struct cpu_factor_options const& options, ipc_ const block_size, + T const beta, T* upd, ipc_ const ldupd, std::vector& work, + Allocator const& alloc, ipc_ const from_blk=0) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY PIV %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = from_blk*block_size; + ipc_ next_elim = from_blk*block_size; /* Inner loop - iterate over block columns */ try { - for(int blk=from_blk; blk( + ipc_ nelim = dblk.template factor( next_elim, perm, d, options, work, alloc ); if(nelim<0) return nelim; @@ -1647,8 +1676,12 @@ class LDLT { } // Loop over off-diagonal blocks applying pivot - for(int jblk=0; jblknblk) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk=nblk; jblk& cdata, Backup& backup, - int* up_to_date, struct cpu_factor_options const& options, - int const block_size, T const beta, T* upd, int const ldupd, + ipc_ run_elim_unpivoted(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_* up_to_date, struct cpu_factor_options const& options, + ipc_ const block_size, T const beta, T* upd, ipc_ const ldupd, std::vector& work, Allocator const& alloc) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = 0; - int flag; + ipc_ next_elim = 0; + ipc_ flag; #pragma omp atomic write flag = 0; @@ -1779,7 +1833,7 @@ class LDLT { #pragma omp atomic write abort = false; #pragma omp taskgroup - for(int blk = 0; blk < nblk; blk++) { + for(ipc_ blk = 0; blk < nblk; blk++) { /*if(debug) { printf("Bcol %d:\n", blk); print_mat(mblk, nblk, m, n, blkdata, cdata, lda); @@ -1801,14 +1855,18 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_DIAG"); #endif +#ifdef INTEGER_64 + if(debug) printf("Factor(%ld)\n", blk); +#else if(debug) printf("Factor(%d)\n", blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); // On first access to this block, store copy in case of failure if (blk == 0) dblk.backup(backup); // Record block state as assuming we've done up to col blk up_to_date[blk*mblk+blk] = blk; // Perform actual factorization - int nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); + ipc_ nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); if (nelim < get_ncol(blk, n, block_size)) { cdata[blk].init_passed(0); // diagonal block has NOT passed #ifdef _OPENMP @@ -1850,7 +1908,7 @@ class LDLT { } } /* task/abort */ // Loop over off-diagonal blocks applying pivot - for (int jblk = 0; jblk < blk; jblk++) { + for (ipc_ jblk = 0; jblk < blk; jblk++) { #pragma omp 
task \ firstprivate(blk, jblk) \ shared(a, abort, backup, cdata, options, work, up_to_date) \ @@ -1865,8 +1923,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyT(%ld,%ld)\n", blk, jblk); +#else if (debug) printf("ApplyT(%d,%d)\n", blk, jblk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec cblk(blk, jblk, m, n, cdata, a, lda, block_size); // Record block state as assuming we've done up to col blk @@ -1880,7 +1942,7 @@ class LDLT { #endif } } /* task/abort */ } - for (int iblk = blk+1; iblk < mblk; iblk++) { + for (ipc_ iblk = blk+1; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk) \ shared(a, abort, backup, cdata, options, work, up_to_date) \ @@ -1895,8 +1957,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyN(%ld,%ld)\n", iblk, blk); +#else if (debug) printf("ApplyN(%d,%d)\n", iblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec rblk(iblk, blk, m, n, cdata, a, lda, block_size); // On first access to this block, store copy in case of failure @@ -1907,7 +1973,7 @@ class LDLT { rblk.apply_cperm(work[thread_num]); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); + ipc_ blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count if (cdata[blk].test_fail(blkpass)) { #ifdef _OPENMP @@ -1926,9 +1992,9 @@ class LDLT { // Update uneliminated columns // Column blk only needed if upd is present - int jsa = (upd) ? blk : blk + 1; - for(int jblk = jsa; jblk < nblk; jblk++) { - for(int iblk = jblk; iblk < mblk; iblk++) { + ipc_ jsa = (upd) ? 
blk : blk + 1; + for(ipc_ jblk = jsa; jblk < nblk; jblk++) { + for(ipc_ iblk = jblk; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk, jblk) \ shared(a, abort, cdata, backup, work, upd, up_to_date) \ @@ -1944,8 +2010,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateN(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateN(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1964,10 +2034,10 @@ class LDLT { // Handle update to contribution block, if required if (upd && (mblk > nblk)) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk = nblk; jblk < mblk; ++jblk) - for(int iblk = jblk; iblk < mblk; ++iblk) { + for(ipc_ jblk = nblk; jblk < mblk; ++jblk) + for(ipc_ iblk = jblk; iblk < mblk; ++iblk) { T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; #pragma omp task \ firstprivate(iblk, jblk, blk, upd_ij) \ @@ -1984,8 +2054,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDC"); #endif +#ifdef INTEGER_64 + if (debug) printf("FormContrib(%ld,%ld,%ld)\n", iblk, jblk,blk); +#else if (debug) printf("FormContrib(%d,%d,%d)\n", iblk, jblk,blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -2007,7 +2081,7 @@ class LDLT { print_mat(mblk, nblk, m, n, blkdata, cdata, lda); }*/ - int my_flag; + ipc_ my_flag; #pragma omp atomic read my_flag = flag; if (my_flag < 0) return my_flag; @@ -2017,22 +2091,22 @@ class LDLT { /** Performs LDL^T factorization assuming everything works. Detects failure * and aborts entire thing if a posteori pivot test fails. 
*/ static - int run_elim_unpivoted_notasks(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - int* up_to_date, struct cpu_factor_options const& options, - int const block_size, T const beta, T* upd, int const ldupd, + ipc_ run_elim_unpivoted_notasks(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_* up_to_date, struct cpu_factor_options const& options, + ipc_ const block_size, T const beta, T* upd, ipc_ const ldupd, std::vector& work, Allocator const& alloc) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = 0; + ipc_ next_elim = 0; /* Inner loop - iterate over block columns */ - for(int blk=0; blk( + ipc_ nelim = dblk.template factor( next_elim, perm, d, options, work, alloc ); if(nelim < get_ncol(blk, n, block_size)) { @@ -2065,9 +2143,13 @@ class LDLT { } // Loop over off-diagonal blocks applying pivot - for(int jblk=0; jblknblk) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk=nblk; jblk nelim_blk then we reset and recalculate completely * */ static - void restore(int const nelim_blk, int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - int const* old_perm, int const* up_to_date, int const block_size, - std::vector& work, T* upd, int const ldupd) { + void restore(ipc_ const nelim_blk, ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_ const* old_perm, ipc_ const* up_to_date, ipc_ const block_size, + std::vector& work, T* upd, ipc_ const ldupd) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); /* Restore perm for failed part */ - for(int i=nelim_blk*block_size; i= nelim_blk) { #pragma omp task \ firstprivate(iblk, jblk) \ shared(a, cdata, work) \ depend(inout: a[jblk*block_size*lda+iblk*block_size:1]) { - int thread_num = omp_get_thread_num(); + ipc_ thread_num = omp_get_thread_num(); BlockSpec rblk(iblk, jblk, m, n, cdata, a, lda, block_size); rblk.apply_inv_rperm(work[thread_num]); } @@ -2195,9 +2289,9 @@ class LDLT { } } // Now all eliminated columns are good, fix up remainder of node - for(int jblk=nelim_blk; jblk= nelim_blk) { // Bad updates applied, needs reset and full recalculation #pragma omp task \ @@ -2211,7 +2305,7 @@ class LDLT { progress = -1; } // Apply any missing updates to a - for(int kblk=progress+1; kblk= nelim_blk) progress = -1; // needs complete reset T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; - for(int kblk=progress+1; kblk const& eliminated, const T *a, int lda) { - for(int row=0; row const& eliminated, const T *a, ipc_ lda) { + for(ipc_ row=0; row& work, Allocator const& alloc=Allocator()) { + ipc_ factor(ipc_ m, ipc_ n, ipc_ *perm, T *a, ipc_ lda, T *d, Backup& backup, struct cpu_factor_options const& options, PivotMethod pivot_method, ipc_ block_size, T beta, T* upd, ipc_ ldupd, std::vector& work, Allocator const& alloc=Allocator()) { /* Sanity check arguments */ if(m < n) return -1; if(lda < n) 
return -4; /* Initialize useful quantities: */ - int nblk = calc_nblk(n, block_size); - int mblk = calc_nblk(m, block_size); + ipc_ nblk = calc_nblk(n, block_size); + ipc_ mblk = calc_nblk(m, block_size); /* Temporary workspaces */ ColumnData cdata(n, block_size, IntAlloc(alloc)); @@ -2324,7 +2426,7 @@ class LDLT { * - If no pivots selected across matrix, perform swaps to get large * entries into diagonal blocks */ - int num_elim; + ipc_ num_elim; if(pivot_method == PivotMethod::app_aggressive) { if(beta!=0.0) { // We don't support backup of contribution block at present, @@ -2336,12 +2438,12 @@ class LDLT { // Take a copy of perm typedef std::allocator_traits IATraits; IntAlloc intAlloc(alloc); - int* perm_copy = IATraits::allocate(intAlloc, n); - for(int i=0; i failed_perm(n-num_elim, 0, alloc); - for(int jblk=0, insert=0, fail_insert=0; jblk failed_perm(n-num_elim, 0, alloc); + for(ipc_ jblk=0, insert=0, fail_insert=0; jblk failed_diag(nfail*n, 0, alloc); std::vector failed_rect(nfail*(m-n), 0, alloc); - for(int jblk=0, jfail=0, jinsert=0; jblk eliminated(n); - for(int i=0; i -size_t ldlt_app_factor_mem_required(int m, int n, int block_size) { +size_t ldlt_app_factor_mem_required(ipc_ m, ipc_ n, ipc_ block_size) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif return align_lda(m) * n * sizeof(T) + align; // CopyBackup } template -int ldlt_app_factor(int m, int n, int* perm, T* a, int lda, T* d, T beta, - T* upd, int ldupd, struct cpu_factor_options const& options, +ipc_ ldlt_app_factor(ipc_ m, ipc_ n, ipc_* perm, T* a, ipc_ lda, T* d, T beta, + T* upd, ipc_ ldupd, struct cpu_factor_options const& options, std::vector& work, Allocator const& alloc) { // If we've got a tall and narrow node, adjust block size so each block // has roughly blksz**2 entries // FIXME: Decide if this reshape is actually useful, given it will generate // a lot more update tasks instead? 
- int outer_block_size = options.cpu_block_size; + ipc_ outer_block_size = options.cpu_block_size; /*if(n < outer_block_size) { - outer_block_size = int((long(outer_block_size)*outer_block_size) / n); + outer_block_size = ipc_((longc_(outer_block_size)*outer_block_size) / n); }*/ #ifdef PROFILE @@ -2567,13 +2669,18 @@ int ldlt_app_factor(int m, int n, int* perm, T* a, int lda, T* d, T beta, outer_block_size, beta, upd, ldupd, work, alloc ); } -template int ldlt_app_factor>>(int, int, int*, precision_*, int, precision_*, precision_, precision_*, int, struct cpu_factor_options const&, std::vector&, BuddyAllocator> const& alloc); +template ipc_ ldlt_app_factor>>(ipc_, ipc_, ipc_*, rpc_*, ipc_, rpc_*, rpc_, + rpc_*, ipc_, struct cpu_factor_options const&, + std::vector&, + BuddyAllocator> const& alloc); template -void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_app_solve_fwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { host_trsv(FILL_MODE_LWR, OP_N, DIAG_UNIT, n, l, ldl, x, 1); if(m > n) @@ -2586,16 +2693,16 @@ void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, ldl, x, ldx, one_val, &x[n], ldx); } } -template void ldlt_app_solve_fwd(int, int, precision_ const*, - int, int, precision_*, int); +template void ldlt_app_solve_fwd(ipc_, ipc_, rpc_ const*, + ipc_, ipc_, rpc_*, ipc_); template -void ldlt_app_solve_diag(int n, T const* d, int nrhs, T* x, int ldx) { - for(int i=0; i(int, precision_ const*, int, - precision_*, int); +template void ldlt_app_solve_diag(ipc_, rpc_ const*, ipc_, + rpc_*, ipc_); template -void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_app_solve_bwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { if(m > n) gemv(OP_T, m-n, n, minus_one_val, &l[n], ldl, &x[n], 1, one_val, x, 1); @@ -2633,7 +2740,7 @@ void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, one_val, l, ldl, x, ldx); } } -template void ldlt_app_solve_bwd(int, int, precision_ const*, int, - int, precision_*, int); +template void ldlt_app_solve_bwd(ipc_, ipc_, rpc_ const*, ipc_, + ipc_, rpc_*, ipc_); }}} /* namespaces spral::ssids::cpu */ diff --git a/src/ssids/ldlt_nopiv.cxx b/src/ssids/ldlt_nopiv.cxx index 21919e278b..788bd12118 100644 --- a/src/ssids/ldlt_nopiv.cxx +++ b/src/ssids/ldlt_nopiv.cxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT */ + #include "ssids_cpu_kernels_ldlt_nopiv.hxx" namespace spral { namespace ssids { namespace cpu { @@ -23,42 +25,42 @@ namespace spral { namespace ssids { namespace cpu { * * Returns -1 on success, otherwise location of negative or zero pivot. 
* */ -int ldlt_nopiv_factor(int m, int n, precision_* a, int lda, precision_* work) { - for(int j=0; j=0; j-=2) { - for(int i=j+2; i=0; j-=2) { + for(ipc_ i=j+2; i @@ -13,7 +15,7 @@ #include "ssids_cpu_ThreadStats.hxx" #include "ssids_cpu_kernels_wrappers.hxx" -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define host_trsv host_trsv_64 #define host_trsm host_trsm_64 @@ -25,7 +27,7 @@ namespace spral { namespace ssids { namespace cpu { namespace { /** overload fabs for floats and doubles */ -precision_ fabs_(precision_ x) { +rpc_ fabs_(rpc_ x) { #ifdef SPRAL_SINGLE double fabsd = fabs(double(x)); float fabss; @@ -37,21 +39,21 @@ precision_ fabs_(precision_ x) { } /** Returns true if all entries in col are less than small in abs value */ -bool check_col_small(int idx, int from, int to, precision_ const* a, - int lda, precision_ small) { +bool check_col_small(ipc_ idx, ipc_ from, ipc_ to, rpc_ const* a, + ipc_ lda, rpc_ small) { bool check = true; - for(int c=from; c=to) return -1; - int best_idx=from; precision_ best_val=fabs(a[from*lda]); - for(int idx=from+1; idx best_val) { best_idx = idx; best_val = fabs(a[idx*lda]); @@ -61,8 +63,8 @@ int find_row_abs_max(int from, int to, precision_ const* a, int lda) { /** Performs symmetric swap of col1 and col2 in lower triangle */ // FIXME: remove n only here for debug -void swap_cols(int col1, int col2, int m, int n, int* perm, precision_* a, - int lda, int nleft, precision_* aleft, int ldleft) { +void swap_cols(ipc_ col1, ipc_ col2, ipc_ m, ipc_ n, ipc_* perm, rpc_* a, + ipc_ lda, ipc_ nleft, rpc_* aleft, ipc_ ldleft) { if(col1 == col2) return; // No-op // Ensure col1 < col2 @@ -73,19 +75,19 @@ void swap_cols(int col1, int col2, int m, int n, int* perm, precision_* a, std::swap( perm[col1], perm[col2] ); // Swap aleft(col1, :) and aleft(col2, :) - for(int c=0; c::infinity(); + d[2] = std::numeric_limits::infinity(); d[3] = (a11*detscale)/detpiv; //printf("t2 %e < %e?\n", std::max(maxp, maxt), small); if(std::max(maxp, maxt) < small) return true; // Rest of col small - precision_ x1 = fabs(d[0])*maxt + fabs(d[1])*maxp; - precision_ x2 = fabs(d[1])*maxt + fabs(d[3])*maxp; + rpc_ x1 = fabs(d[0])*maxt + fabs(d[1])*maxp; + rpc_ x2 = fabs(d[1])*maxt + fabs(d[3])*maxp; //printf("t3 %e < %e?\n", std::max(x1, x2), 1.0/u); return ( u*std::max(x1, x2) < 1.0 ); } /** Applies the 2x2 pivot to rest of block column */ -void apply_2x2(int nelim, int m, precision_* a, int lda, precision_* ld, - int ldld, precision_* d) { +void apply_2x2(ipc_ nelim, ipc_ m, rpc_* a, ipc_ lda, rpc_* ld, + ipc_ ldld, rpc_* d) { /* Set diagonal block to identity */ - precision_* a1 = &a[nelim*lda]; - precision_* a2 = &a[(nelim+1)*lda]; + rpc_* a1 = &a[nelim*lda]; + rpc_* a2 = &a[(nelim+1)*lda]; a1[nelim] = 1.0; a1[nelim+1] = 0.0; a2[nelim+1] = 1.0; /* Extract D^-1 values */ - precision_ d11 = d[2*nelim]; - precision_ d21 = d[2*nelim+1]; - precision_ d22 = d[2*nelim+3]; + rpc_ d11 = d[2*nelim]; + rpc_ d21 = d[2*nelim+1]; + rpc_ d22 = d[2*nelim+3]; /* Divide through, preserving copy in ld */ - for(int r=nelim+2; r= u*maxp ) { //printf("1x1 pivot %d\n", p); swap_cols(p, nelim, m, n, perm, a, lda, nleft, aleft, ldleft); @@ -297,10 +299,10 @@ int ldlt_tpp_factor(int m, int n, int* perm, precision_* a, int lda, return nelim; } -void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_tpp_solve_fwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, 
ipc_ nrhs, + rpc_* x, ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { host_trsv(FILL_MODE_LWR, OP_N, DIAG_UNIT, n, l, ldl, x, 1); if(m > n) @@ -314,31 +316,31 @@ void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, } } -void ldlt_tpp_solve_diag(int n, precision_ const* d, precision_* x) { - for(int i=0; i n) gemv(OP_T, m-n, n, minus_one_val, &l[n], ldl, &x[n], 1, one_val, x, 1); diff --git a/src/ssids/profile.cxx b/src/ssids/profile.cxx index 2550676281..4885ac78fa 100644 --- a/src/ssids/profile.cxx +++ b/src/ssids/profile.cxx @@ -6,6 +6,7 @@ */ #include "ssids_profile.hxx" +#include "ssids_rip.hxx" #ifdef PROFILE struct timespec spral::ssids::Profile::tstart; @@ -14,7 +15,7 @@ struct timespec spral::ssids::Profile::tstart; using namespace spral::ssids; extern "C" -void spral_ssids_profile_begin(int nregions, void const* regions) { +void spral_ssids_profile_begin(ipc_ nregions, void const* regions) { Profile::init(nregions, (spral::hw_topology::NumaRegion*)regions); } @@ -24,7 +25,7 @@ void spral_ssids_profile_end() { } extern "C" -Profile::Task* spral_ssids_profile_create_task(char const* name, int thread) { +Profile::Task* spral_ssids_profile_create_task(char const* name, ipc_ thread) { // We interpret negative thread values as absent if(thread >= 0) { return new Profile::Task(name, thread); @@ -47,6 +48,6 @@ void spral_ssids_profile_set_state(char const* container, char const* type, extern "C" void spral_ssids_profile_add_event( - char const* type, char const*val, int thread) { + char const* type, char const*val, ipc_ thread) { Profile::addEvent(type, val, thread); } diff --git a/src/ssids/reorder.cu b/src/ssids/reorder.cu index a7978c024d..b1a5df7099 100644 --- a/src/ssids/reorder.cu +++ b/src/ssids/reorder.cu @@ -1,3 +1,9 @@ +/* Copyright (c) 2013 Science and Technology Facilities Council (STFC) + * Licence: BSD licence, see LICENCE file for details + * Author: Jonathan Hogg + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT + */ + #ifdef __cplusplus #include #else @@ -8,11 +14,11 @@ #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define multiswap_type multiswap_type_single #define multireorder_data multireorder_data_single #define multisymm_type multisymm_type_single @@ -44,7 +50,6 @@ #define spral_ssids_swap_ni2D_ic spral_ssids_swap_ni2D_ic_single #define spral_ssids_swap_ni2D_ir spral_ssids_swap_ni2D_ir_single #else -#define precision_ double #define multiswap_type multiswap_type_double #define multireorder_data multireorder_data_double #define multisymm_type multisymm_type_double @@ -92,43 +97,43 @@ namespace /* anon */ { template< typename ELEMENT_TYPE > __global__ void -cu_copy_mc( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* mask ) +cu_copy_mc( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* mask ) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; if ( i < nrows && j < ncols && mask[j] > 0 ) b[i + ldb*j] = a[i + lda*j]; } template< typename ELEMENT_TYPE > __global__ void -cu_copy_ic( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* ind ) +cu_copy_ic( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* ind ) { 
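// One CUDA thread handles one (i, j) entry: a[i + lda*j] is copied to
// b[i + ldb*j] only for columns with mask[j] > 0, so the mask selects whole
// columns. Index arithmetic now uses ipc_ rather than int, so the kernel
// follows the integer kind selected by INTEGER_64.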
- int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; if ( i < nrows && j < ncols && ind[j] > 0 ) b[i + ldb*(ind[j] - 1)] = a[i + lda*j]; } template< typename ELEMENT_TYPE > __global__ void -cu_swap_ni2D_ic( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index ) +cu_swap_ni2D_ic( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) // swaps columns of non-intersecting 2D arrays a(1:n,index(1:m)) and b(1:n,1:m) // index is one-based { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; - int k; - precision_ s; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ k; + rpc_ s; if ( i < nrows && j < ncols && (k = index[j] - 1) > -1 ) { s = a[i + lda*k]; @@ -139,17 +144,17 @@ cu_swap_ni2D_ic( int nrows, int ncols, template< typename ELEMENT_TYPE > __global__ void -cu_swap_ni2D_ir( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index ) +cu_swap_ni2D_ir( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) // swaps rows of non-intersecting 2D arrays a(index(1:n),1:m) and b(1:n,1:m) // index is one-based { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; - int k; - precision_ s; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ k; + rpc_ s; if ( i < nrows && j < ncols && (k = index[i] - 1) > -1 ) { s = a[k + lda*j]; @@ -159,12 +164,12 @@ cu_swap_ni2D_ir( int nrows, int ncols, } struct multiswap_type { - int nrows; - int ncols; - int k; - precision_ *lcol; - int lda; - int off; + ipc_ nrows; + ipc_ ncols; + ipc_ k; + rpc_ *lcol; + ipc_ lda; + ipc_ off; }; template< typename ELEMENT_TYPE > @@ -173,19 +178,19 @@ cu_multiswap_ni2D_c( struct multiswap_type *swapdata ) // swaps non-intersecting rows or cols of a 2D multiarray a { swapdata += blockIdx.x; - int nrows = swapdata->nrows; + ipc_ nrows = swapdata->nrows; if ( blockIdx.y*blockDim.x >= nrows ) return; - int k = swapdata->k; + ipc_ k = swapdata->k; ELEMENT_TYPE *a = swapdata->lcol; - int lda = swapdata->lda; - int off = lda*swapdata->off; + ipc_ lda = swapdata->lda; + ipc_ off = lda*swapdata->off; ELEMENT_TYPE s; - for ( int i = threadIdx.x + blockIdx.y*blockDim.x; i < nrows; + for ( ipc_ i = threadIdx.x + blockIdx.y*blockDim.x; i < nrows; i += blockDim.x*gridDim.y ) - for ( int j = threadIdx.y; j < k; j += blockDim.y ) { + for ( ipc_ j = threadIdx.y; j < k; j += blockDim.y ) { s = a[i + lda*j]; a[i + lda*j] = a[off + i + lda*j]; a[off + i + lda*j] = s; @@ -198,18 +203,18 @@ cu_multiswap_ni2D_r( struct multiswap_type *swapdata ) // swaps non-intersecting rows or cols of a 2D multiarray a { swapdata += blockIdx.x; - int ncols = swapdata->ncols; + ipc_ ncols = swapdata->ncols; if ( blockIdx.y*blockDim.y >= ncols ) return; - int k = swapdata->k; + ipc_ k = swapdata->k; ELEMENT_TYPE *a = swapdata->lcol; - int lda = swapdata->lda; - int off = swapdata->off; + ipc_ lda = swapdata->lda; + ipc_ off = swapdata->off; ELEMENT_TYPE s; - for ( int i = threadIdx.x; i < k; i += blockDim.x ) - for ( int j = threadIdx.y + blockIdx.y*blockDim.y; j < ncols; + for ( ipc_ i = threadIdx.x; i < k; i += blockDim.x ) + for ( ipc_ j = threadIdx.y + blockIdx.y*blockDim.y; j < ncols; j += 
blockDim.y*gridDim.y ) { s = a[i + lda*j]; a[i + lda*j] = a[off + i + lda*j]; @@ -220,14 +225,14 @@ cu_multiswap_ni2D_r( struct multiswap_type *swapdata ) template< typename ELEMENT_TYPE > __global__ void cu_reorder_rows( - int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index + ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) { - int x; - int y = threadIdx.y + blockIdx.y*blockDim.y; + ipc_ x; + ipc_ y = threadIdx.y + blockIdx.y*blockDim.y; for ( x = threadIdx.x; x < nrows; x += blockDim.x ) if ( y < ncols ) @@ -238,14 +243,14 @@ cu_reorder_rows( a[x + lda*y] = b[x + ldb*y]; } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void -cu_reorder_cols2( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index, int mode ) +cu_reorder_cols2( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index, ipc_ mode ) { - int ix = threadIdx.x + blockIdx.x*blockDim.x; + ipc_ ix = threadIdx.x + blockIdx.x*blockDim.x; __shared__ volatile ELEMENT_TYPE work[SIZE_X*SIZE_Y]; @@ -281,14 +286,14 @@ cu_reorder_cols2( int nrows, int ncols, } } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void -cu_reorder_rows2( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index, int mode ) +cu_reorder_rows2( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index, ipc_ mode ) { - int iy = threadIdx.y + blockIdx.x*blockDim.y; + ipc_ iy = threadIdx.y + blockIdx.x*blockDim.y; __shared__ volatile ELEMENT_TYPE work[SIZE_X*SIZE_Y]; @@ -327,33 +332,33 @@ cu_reorder_rows2( int nrows, int ncols, /* * Copies new L factors back to A array without any permutation */ -template< typename ELEMENT_TYPE, int NTX > +template< typename ELEMENT_TYPE, ipc_ NTX > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_no_perm( - int nblk, int bidx, int tid, - int nrows, int ncols, - ELEMENT_TYPE *dest, int ldd, - const ELEMENT_TYPE *src, int lds + ipc_ nblk, ipc_ bidx, ipc_ tid, + ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE *dest, ipc_ ldd, + const ELEMENT_TYPE *src, ipc_ lds ) { - int tx = tid % NTX; - int ty = tid / NTX; + ipc_ tx = tid % NTX; + ipc_ ty = tid / NTX; src += NTX*bidx; dest += NTX*bidx; nrows -= NTX*bidx; if ( ty < ncols ) { - for ( int x = tx; x < nrows; x += NTX*nblk ) + for ( ipc_ x = tx; x < nrows; x += NTX*nblk ) dest[x + ldd*ty] = src[x + lds*ty]; } } /* Shuffles the permutation vector using shared memory [in case it overlaps itself] */ -template < int SIZE_X > +template < ipc_ SIZE_X > __device__ void -shuffle_perm_shmem( int n, volatile const int *const indr, int *perm ) { +shuffle_perm_shmem( ipc_ n, volatile const ipc_ *const indr, ipc_ *perm ) { // Update permutation - __shared__ volatile int iwork[SIZE_X]; + __shared__ volatile ipc_ iwork[SIZE_X]; if ( threadIdx.x < n && threadIdx.y == 0 ) iwork[indr[threadIdx.x] - 1] = perm[threadIdx.x]; __syncthreads(); @@ -366,19 +371,19 @@ shuffle_perm_shmem( int n, volatile const int *const indr, int *perm ) { * This version uses shared memory and is designed for the case when the new * and old location of columns and rows overlap. 
*/ -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_perm_shmem( - int block, int nblocks, - int done, int pivoted, int delayed, - int nrows, int ncols, - int ib, int jb, - int offc, int offp, - int ld, - volatile int *const indr, - precision_ *a, precision_ *b, const precision_ *c, - int *perm + ipc_ block, ipc_ nblocks, + ipc_ done, ipc_ pivoted, ipc_ delayed, + ipc_ nrows, ipc_ ncols, + ipc_ ib, ipc_ jb, + ipc_ offc, ipc_ offp, + ipc_ ld, + volatile ipc_ *const indr, + rpc_ *a, rpc_ *b, const rpc_ *c, + ipc_ *perm ) { __shared__ volatile ELEMENT_TYPE work1[SIZE_X*SIZE_Y]; __shared__ volatile ELEMENT_TYPE work2[SIZE_X*SIZE_Y]; @@ -389,8 +394,8 @@ copy_L_LD_perm_shmem( // Extend permutation array to cover non-pivoted columns if ( threadIdx.x == 0 && threadIdx.y == 0 ) { - int i = 0; - int j = pivoted; + ipc_ i = 0; + ipc_ j = pivoted; for ( ; i < delayed; i++ ) indr[i] = ++j; for ( ; i < delayed + jb - ib + 1; i++ ) @@ -398,7 +403,7 @@ copy_L_LD_perm_shmem( indr[i] = ++j; } - int off = done*ld; + ipc_ off = done*ld; // We handle the (done-jb) x (done-jb) block that requires both // row and column permutations seperately using the first block. @@ -409,17 +414,17 @@ copy_L_LD_perm_shmem( // Swap columns of A and copy in L, but avoiding rows that need // permuted // Also, swap cols of LD but avoiding rows that need permuted - int baseStep = blockDim.x*(nblocks - 1); + ipc_ baseStep = blockDim.x*(nblocks - 1); #if (SM_3X) - for ( int i = jb + blockDim.x*(block - 1); i < nrows; + for ( ipc_ i = jb + blockDim.x*(block - 1); i < nrows; i += baseStep ) { #else - for ( int i = jb + blockDim.x*(block - 1); i < nrows + baseStep; + for ( ipc_ i = jb + blockDim.x*(block - 1); i < nrows + baseStep; i += baseStep * 2 ) { #endif - int ix = i + threadIdx.x; + ipc_ ix = i + threadIdx.x; #if (!SM_3X) - int ix2 = ix + baseStep; + ipc_ ix2 = ix + baseStep; #endif __syncthreads(); @@ -486,15 +491,15 @@ copy_L_LD_perm_shmem( // Swap rows of A baseStep = blockDim.y*(nblocks - 1); #if (SM_3X) - for ( int i = blockDim.y*(block - 1); i < ncols; + for ( ipc_ i = blockDim.y*(block - 1); i < ncols; i += baseStep ) { #else - for ( int i = blockDim.y*(block - 1); i < ncols + baseStep; + for ( ipc_ i = blockDim.y*(block - 1); i < ncols + baseStep; i += baseStep * 2 ) { #endif - int iy = i + threadIdx.y; + ipc_ iy = i + threadIdx.y; #if (!SM_3X) - int iy2 = iy + baseStep; + ipc_ iy2 = iy + baseStep; #endif __syncthreads(); @@ -538,7 +543,7 @@ copy_L_LD_perm_shmem( // row /and/ column permutations. shuffle_perm_shmem< SIZE_X > ( delayed + jb - ib + 1, indr, &perm[offp + done] ); - int pass = threadIdx.x < jb - done && threadIdx.y < jb - done; + ipc_ pass = threadIdx.x < jb - done && threadIdx.y < jb - done; // Handle L and LD if ( pass ) { @@ -591,26 +596,26 @@ copy_L_LD_perm_shmem( * This version does this directly in global memory and is designed for the case * when the new and old location of columns and rows DO NOT overlap. 
*/ -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_perm_noshmem( - int node, - int block, int nblocks, - int done, int pivoted, int delayed, - int nrows, int ncols, - int ib, int jb, - int offc, int offp, - int ld, - const int *ind, - const volatile int *const indf, - precision_ *a, precision_ *b, const precision_ *c, - int *perm + ipc_ node, + ipc_ block, ipc_ nblocks, + ipc_ done, ipc_ pivoted, ipc_ delayed, + ipc_ nrows, ipc_ ncols, + ipc_ ib, ipc_ jb, + ipc_ offc, ipc_ offp, + ipc_ ld, + const ipc_ *ind, + const volatile ipc_ *const indf, + rpc_ *a, rpc_ *b, const rpc_ *c, + ipc_ *perm ) { - int off1 = done; - int off2 = ib - 1; - int offi = node*SIZE_Y/2; + ipc_ off1 = done; + ipc_ off2 = ib - 1; + ipc_ offi = node*SIZE_Y/2; // We handle the two pivoted x pivoted blocks where row and columns cross // over seperately using the first block. @@ -618,13 +623,13 @@ copy_L_LD_perm_noshmem( // All remaining rows and columns are handlded by the remaining blocks. if ( block ) { // Handle parts of matrix that require EITHER row OR col shuffle - int tx = (threadIdx.y < SIZE_Y/2) ? threadIdx.x : threadIdx.x + blockDim.x; - int ty = (threadIdx.y < SIZE_Y/2) ? threadIdx.y : threadIdx.y - SIZE_Y/2; + ipc_ tx = (threadIdx.y < SIZE_Y/2) ? threadIdx.x : threadIdx.x + blockDim.x; + ipc_ ty = (threadIdx.y < SIZE_Y/2) ? threadIdx.y : threadIdx.y - SIZE_Y/2; // Swap a[:,done:done+pivoted] and a[:,ib:jb] pulling in c[] as we go - for ( int x = tx + 2*blockDim.x*(block - 1); + for ( ipc_ x = tx + 2*blockDim.x*(block - 1); x < nrows && ty < jb - ib + 1; x += 2*blockDim.x*(nblocks - 1) ) { - int y = ind[offi + ty] - 1; + ipc_ y = ind[offi + ty] - 1; if ( (x >= done && x < done + jb - ib + 1) || (x >= ib - 1 && x < jb) || y < 0 ) continue; // handled separately @@ -632,10 +637,10 @@ copy_L_LD_perm_noshmem( a[x + ld*(off1 + y)] = c[offc + x + ld*ty]; } // Swap b[:,done:done+pivoted] and b[:,ib:jb] - for ( int x = tx + 2*blockDim.x*(block - 1); + for ( ipc_ x = tx + 2*blockDim.x*(block - 1); x < nrows && ty < jb - ib + 1; x += 2*blockDim.x*(nblocks - 1) ) { - int y = ind[offi + ty] - 1; + ipc_ y = ind[offi + ty] - 1; if ( ( x >= done && x < done + jb - ib + 1 ) || ( x >= ib - 1 && x < jb ) || y < 0) continue; // handled separately @@ -647,10 +652,10 @@ copy_L_LD_perm_noshmem( if ( (block - 1)*blockDim.y >= ncols ) return; // swap a[done:done+pivoted,:] and a[ib:jb,:] - for ( int y = threadIdx.y + blockDim.y*(block - 1); + for ( ipc_ y = threadIdx.y + blockDim.y*(block - 1); y < ncols && threadIdx.x < jb - ib + 1; y += blockDim.y*(nblocks - 1) ) { - int x = ind[offi + threadIdx.x] - 1; + ipc_ x = ind[offi + threadIdx.x] - 1; if ( (y >= done && y < done + jb - ib + 1) || (y >= ib - 1 && y < jb) || x < 0 ) continue; // handled separately @@ -659,10 +664,10 @@ copy_L_LD_perm_noshmem( a[off2 + threadIdx.x + ld*y] = s; } // swap b[done:done+pivoted,:] and b[ib:jb,:] - for ( int y = threadIdx.y + blockDim.y*(block - 1); + for ( ipc_ y = threadIdx.y + blockDim.y*(block - 1); y < ncols && threadIdx.x < jb - ib + 1; y += blockDim.y*(nblocks - 1) ) { - int x = ind[offi + threadIdx.x] - 1; + ipc_ x = ind[offi + threadIdx.x] - 1; if ( (y >= done && y < done + jb - ib + 1) || (y >= ib - 1 && y < jb) || x < 0) continue; // handled separately @@ -675,9 +680,9 @@ copy_L_LD_perm_noshmem( // Handle part of matrix that requires BOTH row 
AND col shuffle if ( threadIdx.x < jb - ib + 1 && threadIdx.y == 0 ) { // Update permutation - int i = indf[threadIdx.x] - 1; + ipc_ i = indf[threadIdx.x] - 1; if ( i >= 0 ) { - int s = perm[offp + ib - 1 + threadIdx.x]; + ipc_ s = perm[offp + ib - 1 + threadIdx.x]; perm[offp + ib - 1 + threadIdx.x] = perm[offp + done + i]; perm[offp + done + i] = s; } @@ -688,8 +693,8 @@ copy_L_LD_perm_noshmem( // Swap a[done:done+pivoted,done:done+pivoted] and // a[done:done+pivoted,ib:jb] // pulling in new cols from c[] as we go. - int x = done + threadIdx.x; - int y = ind[offi + threadIdx.y] - 1; + ipc_ x = done + threadIdx.x; + ipc_ y = ind[offi + threadIdx.y] - 1; if ( x < done + jb - ib + 1 && threadIdx.y < jb - ib + 1 && y >= 0 ) { a[x + ld*(off2 + threadIdx.y)] = a[x + ld*(off1 + y)]; a[x + ld*(off1 + y)] = c[offc + x + ld*threadIdx.y]; @@ -765,12 +770,12 @@ copy_L_LD_perm_noshmem( } struct multireorder_data { - int node; - int block; - int nblocks; + ipc_ node; + ipc_ block; + ipc_ nblocks; }; -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > #if (SM_3X) __launch_bounds__(256, 8) #else @@ -781,13 +786,13 @@ cu_multireorder( const struct multinode_fact_type *ndata, const struct multireorder_data* rdata, const ELEMENT_TYPE* c, - const int* stat, - const int* ind, - int* perm, - int* ncb) { - __shared__ volatile int indf[SIZE_X]; // index from node_fact - __shared__ volatile int indr[SIZE_X]; // reorder index - __shared__ volatile int simple; + const ipc_* stat, + const ipc_* ind, + ipc_* perm, + ipc_* ncb) { + __shared__ volatile ipc_ indf[SIZE_X]; // index from node_fact + __shared__ volatile ipc_ indr[SIZE_X]; // reorder index + __shared__ volatile ipc_ simple; // Reset ncb ready for next call of muliblock_fact_setup() if ( blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0 ) { @@ -797,43 +802,43 @@ cu_multireorder( // Load data on block rdata += blockIdx.x; - int node = rdata->node; + ipc_ node = rdata->node; ndata += node; - int ib = ndata->ib; - int jb = ndata->jb; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; if ( jb < ib ) return; - int pivoted = stat[node]; + ipc_ pivoted = stat[node]; if ( pivoted < 1 ) return; - int nrows = ndata->nrows; - int bidx = rdata->block; + ipc_ nrows = ndata->nrows; + ipc_ bidx = rdata->block; if ( bidx > 1 && (bidx - 1)*blockDim.x >= nrows ) return; - int done = ndata->done; + ipc_ done = ndata->done; - int ld = nrows; - int delayed = ib - done - 1; // Number delayed before most recent factor + ipc_ ld = nrows; + ipc_ delayed = ib - done - 1; // Number delayed before most recent factor if ( threadIdx.x == 0 && threadIdx.y == 0 ) simple = (delayed == 0); // true if we don't need to offset __syncthreads(); - int next; + ipc_ next; if ( threadIdx.x < jb - ib + 1 && threadIdx.y == 0 ) { next = ind[node*SIZE_Y/2 + threadIdx.x]; // SIZE_Y=2*BLOCK_SIZE indf[threadIdx.x] = next; if ( jb - ib + 1 > delayed ) indr[delayed + threadIdx.x] = next; if ( indf[threadIdx.x] != threadIdx.x + 1 ) - atomicMin((int*)&simple, 0); + atomicMin((ipc_*)&simple, 0); } __syncthreads(); ELEMENT_TYPE *a = ndata->lval; ELEMENT_TYPE *b = ndata->ldval; - int offc = ndata->lbuf; - int nblk = rdata->nblocks; + ipc_ offc = ndata->lbuf; + ipc_ nblk = rdata->nblocks; if ( simple ) { // Copy successful columns from workspace c to factors a without an // offset or permutation. 
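Throughout this patch, int becomes ipc_, unsigned int becomes uipc_, and the old precision_ macro becomes rpc_, with the aliases supplied by the newly included ssids_rip.hxx header. That header is not shown in the diff, so the sketch below is only an illustration of how such aliases could behave; the controlling macro names other than INTEGER_64 (for instance SPRAL_SINGLE) and the exact underlying types are assumptions, not the real contents of the header.

    /* illustrative sketch only -- not the actual ssids_rip.hxx */
    #ifdef INTEGER_64
    typedef long ipc_;             /* 64-bit indices; matches the %ld debug formats */
    typedef unsigned long uipc_;   /* unsigned counterpart, e.g. for template sizes */
    #else
    typedef int ipc_;              /* default 32-bit indices                        */
    typedef unsigned int uipc_;
    #endif
    #ifdef SPRAL_SINGLE
    typedef float rpc_;            /* single-precision reals                        */
    #else
    typedef double rpc_;           /* double-precision reals                        */
    #endif

With aliases of this form, the same kernel source compiles against either the default or the _64-suffixed host_gemm/host_trsv/host_trsm wrappers selected elsewhere in this patch.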
@@ -843,8 +848,8 @@ cu_multireorder( } else { // We need a permutation - int ncols = ndata->ncols; - int offp = ndata->offp; + ipc_ ncols = ndata->ncols; + ipc_ offp = ndata->offp; if ( jb - ib + 1 > delayed ) { // Can't just shuffle along, as pivoted columns overlap with where they // need to be. However, we know that pivoted+delayed < 2*BLOCK_SIZE, so @@ -864,14 +869,14 @@ cu_multireorder( } } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void cu_multicopy( const struct multinode_fact_type *ndata, const struct multireorder_data* rdata, ELEMENT_TYPE* b, - int* stat, - int* ncb + ipc_* stat, + ipc_* ncb ) { @@ -881,25 +886,25 @@ cu_multicopy( } rdata += blockIdx.x; - int node = rdata->node; + ipc_ node = rdata->node; ndata += node; - int ib = ndata->ib; - int jb = ndata->jb; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; if ( jb < ib ) return; - int pivoted = stat[node]; + ipc_ pivoted = stat[node]; if ( pivoted < 1 ) return; - int nrows = ndata->nrows; - int block = rdata->block; - int nblocks = rdata->nblocks; + ipc_ nrows = ndata->nrows; + ipc_ block = rdata->block; + ipc_ nblocks = rdata->nblocks; if ( block > 1 && (block - 1)*blockDim.x >= nrows ) return; - int done = ndata->done; + ipc_ done = ndata->done; ELEMENT_TYPE *a = ndata->lval; - int offb = ndata->lbuf; - for ( int x = threadIdx.x + blockDim.x*block; + ipc_ offb = ndata->lbuf; + for ( ipc_ x = threadIdx.x + blockDim.x*block; x < nrows && threadIdx.y < pivoted; x += blockDim.x*nblocks ) { a[x + nrows*(done + threadIdx.y)] = b[offb + x + nrows*threadIdx.y]; @@ -907,9 +912,9 @@ cu_multicopy( } struct multisymm_type { - precision_ *lcol; - int ncols; - int nrows; + rpc_ *lcol; + ipc_ ncols; + ipc_ nrows; }; /* @@ -923,11 +928,11 @@ cu_multisymm( const struct multisymm_type* msdata ) { msdata += blockIdx.x; ELEMENT_TYPE *a = msdata->lcol; - int ncols = msdata->ncols; - int nrows = msdata->nrows; - for ( int i = threadIdx.x + blockDim.x*blockIdx.y; i < ncols; + ipc_ ncols = msdata->ncols; + ipc_ nrows = msdata->nrows; + for ( ipc_ i = threadIdx.x + blockDim.x*blockIdx.y; i < ncols; i += blockDim.x*gridDim.y ) - for ( int j = threadIdx.y + blockDim.y*blockIdx.z; j < i; + for ( ipc_ j = threadIdx.y + blockDim.y*blockIdx.z; j < i; j += blockDim.y*gridDim.z ) a[j + i*nrows] = a[i + j*nrows]; } @@ -940,151 +945,151 @@ cu_multisymm( const struct multisymm_type* msdata ) extern "C" { -void spral_ssids_copy_ic(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* ind) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_copy_ic(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* ind) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_copy_ic< precision_ > + cu_copy_ic< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, ind ); } -void spral_ssids_copy_mc(cudaStream_t *stream, int nrows, int ncols, precision_* a, - int lda, precision_* b, int ldb, int* mask) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_copy_mc(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, rpc_* a, + ipc_ lda, rpc_* b, ipc_ ldb, ipc_* mask) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - 
cu_copy_mc< precision_ > + cu_copy_mc< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, mask ); } -void spral_ssids_multisymm(cudaStream_t *stream, int nblocks, +void spral_ssids_multisymm(cudaStream_t *stream, ipc_ nblocks, const struct multisymm_type* msdata) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,4,4); - cu_multisymm< precision_ ><<< grid, threads, 0, *stream >>>( msdata + i ); + cu_multisymm< rpc_ ><<< grid, threads, 0, *stream >>>( msdata + i ); } } -void spral_ssids_multicopy(cudaStream_t *stream, int nblocks, +void spral_ssids_multicopy(cudaStream_t *stream, ipc_ nblocks, const struct multinode_fact_type *ndata, const struct multireorder_data *rdata, - precision_* a, precision_* b, int* stat, int* ncb) { + rpc_* a, rpc_* b, ipc_* stat, ipc_* ncb) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); - cu_multicopy< precision_, BLOCK_SIZE, BLOCK_SIZE > + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); + cu_multicopy< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< nb, threads, 0, *stream >>> ( ndata, rdata + i, b, stat, ncb ); } } -void spral_ssids_multireorder(cudaStream_t *stream, int nblocks, +void spral_ssids_multireorder(cudaStream_t *stream, ipc_ nblocks, const struct multinode_fact_type *ndata, const struct multireorder_data *rdata, - precision_* c, int* stat, int* ind, int* index, int* ncb) { + rpc_* c, ipc_* stat, ipc_* ind, ipc_* index, ipc_* ncb) { dim3 threads(2*BLOCK_SIZE, 2*BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,1); - cu_multireorder< precision_, 2*BLOCK_SIZE, 2*BLOCK_SIZE > + cu_multireorder< rpc_, 2*BLOCK_SIZE, 2*BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( ndata, rdata + i, c, stat, ind, index, ncb ); } } // ncols <= 2*BLOCK_SIZE -void spral_ssids_reorder_cols2(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index, int mode ) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_cols2(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index, ipc_ mode ) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; dim3 grid(rb, 2); if ( ncols <= BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_cols2< precision_, BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_cols2< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } else if ( ncols <= 2*BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); - cu_reorder_cols2< precision_, BLOCK_SIZE, 2*BLOCK_SIZE > + cu_reorder_cols2< rpc_, BLOCK_SIZE, 2*BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } } -void spral_ssids_reorder_rows(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_rows(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 grid(1, cb); - int 
tx = min(nrows, 1024/BLOCK_SIZE); + ipc_ tx = min(nrows, 1024/BLOCK_SIZE); dim3 threads(tx, BLOCK_SIZE); - cu_reorder_rows< precision_ > + cu_reorder_rows< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } // nrows <= 2*BLOCK_SIZE -void spral_ssids_reorder_rows2(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index, int mode ) { - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_rows2(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index, ipc_ mode ) { + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 grid(cb, 2); if ( nrows <= BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_rows2< precision_, BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_rows2< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } else if ( nrows <= 2*BLOCK_SIZE ) { dim3 threads(2*BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_rows2< precision_, 2*BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_rows2< rpc_, 2*BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } } -void spral_ssids_swap_ni2Dm(cudaStream_t *stream, int nblocks, +void spral_ssids_swap_ni2Dm(cudaStream_t *stream, ipc_ nblocks, struct multiswap_type *swapdata) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,8); cu_multiswap_ni2D_c - < precision_ > + < rpc_ > <<< grid, threads, 0, *stream >>> ( swapdata + i ); cu_multiswap_ni2D_r - < precision_ > + < rpc_ > <<< grid, threads, 0, *stream >>> ( swapdata + i ); } } -void spral_ssids_swap_ni2D_ic(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_swap_ni2D_ic(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_swap_ni2D_ic< precision_ > + cu_swap_ni2D_ic< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } -void spral_ssids_swap_ni2D_ir(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_swap_ni2D_ir(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_swap_ni2D_ir< precision_ > + cu_swap_ni2D_ir< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } diff --git a/src/ssids/solve.cu b/src/ssids/solve.cu index 7ac4357a1d..3ad774eb45 100644 --- a/src/ssids/solve.cu +++ b/src/ssids/solve.cu @@ -6,13 +6,14 @@ * Jeremey Appleyard NVIDIA * * This code has not yet been publically released under any licence. 
+ * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT */ #include #include "spral_cuda_cuda_check.h" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define gather gather_single #define gemv_transpose_lookup gemv_transpose_lookup_single #define gemv_transpose_sps_rhs gemv_transpose_sps_rhs_single @@ -39,7 +40,6 @@ #define spral_ssids_run_fwd_solve_kernels spral_ssids_run_fwd_solve_kernels_single #define spral_ssids_run_slv_contrib_fwd spral_ssids_run_slv_contrib_fwd_single #else -#define precision_ double #define gather gather_double #define gemv_transpose_lookup gemv_transpose_lookup_double #define gemv_transpose_sps_rhs gemv_transpose_sps_rhs_double @@ -89,11 +89,11 @@ using namespace spral::ssids::gpu; namespace /* anon */ { /* Perform the assignment xdense(:) = xsparse( idx(:) ) */ -template -void __device__ gather(const int n, const int *const idx, const precision_ *const xsparse, - volatile precision_ *const xdense) { - int tid = threadsx*threadIdx.y + threadIdx.x; - for(int i=tid; i +void __device__ gather(const ipc_ n, const ipc_ *const idx, const rpc_ *const xsparse, + volatile rpc_ *const xdense) { + ipc_ tid = threadsx*threadIdx.y + threadIdx.x; + for(ipc_ i=tid; i +template __launch_bounds__(threadsx*threadsy, 6) void __global__ gemv_transpose_sps_rhs(struct gemv_transpose_lookup *lookup, - precision_ *x, precision_ *y + rpc_ *x, rpc_ *y ) { // Reuse shmem for two different purposes - __shared__ volatile precision_ shmem[maxn*threadsx]; - volatile precision_ *const partSum = shmem; - volatile precision_ *const xlocal = shmem; + __shared__ volatile rpc_ shmem[maxn*threadsx]; + volatile rpc_ *const partSum = shmem; + volatile rpc_ *const xlocal = shmem; - precision_ partSumReg[maxn / threadsy]; // Assumes neat division + rpc_ partSumReg[maxn / threadsy]; // Assumes neat division lookup += blockIdx.x; - int m = lookup->m; - int n = lookup->n; - const precision_ *a = lookup->a; - const int *rlist = lookup->rlist; - int lda = lookup->lda; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + const rpc_ *a = lookup->a; + const ipc_ *rlist = lookup->rlist; + ipc_ lda = lookup->lda; y += lookup->yoffset; - /* Read x(rlist(:)) into xlocal(:) */ + /* Read x(rlist(:)) ipc_o xlocal(:) */ gather (m, rlist, x, xlocal); __syncthreads(); /* Perform matrix-vector multiply with answer y in register that is then stored in partSum for later reduction. */ if(m==maxm) { - volatile precision_ *const xl = xlocal + threadIdx.x; + volatile rpc_ *const xl = xlocal + threadIdx.x; #pragma unroll - for(int iLoop=0; iLoop= threadsx ? j - threadsx : j); + for(ipc_ j=threadIdx.x; j= threadsx ? j - threadsx : j); val += partSum[i*threadsx+j2]; } y[i] = val; @@ -202,13 +202,13 @@ void __global__ gemv_transpose_sps_rhs(struct gemv_transpose_lookup *lookup, /***********************************************************************/ struct reducing_d_solve_lookup { - int first_idx; // Index of supernode for thread 0 of this block. - int m; // Number of columns in upd to reduce. - int n; // Number of rows THIS BLOCK is responisble for. - int ldupd; // Leading dimension of upd. - int updoffset; // Offset into upd for supernode. - const precision_ *d; - const int *perm; // Offset into perm for supernode. + ipc_ first_idx; // Index of supernode for thread 0 of this block. + ipc_ m; // Number of columns in upd to reduce. + ipc_ n; // Number of rows THIS BLOCK is responisble for. + ipc_ ldupd; // Leading dimension of upd. + ipc_ updoffset; // Offset into upd for supernode. 
+ const rpc_ *d; + const ipc_ *perm; // Offset into perm for supernode. }; /* This subroutine performs two unrelated tasks and subtracts the result of the @@ -222,40 +222,40 @@ struct reducing_d_solve_lookup { */ template void __global__ reducing_d_solve(struct reducing_d_solve_lookup *lookup, - precision_ *upd, const precision_ *x + rpc_ *upd, const rpc_ *x ) { /* Read details from lookup */ lookup += blockIdx.x; - int idx = lookup->first_idx + threadIdx.x; - int m = lookup->m; - int n = lookup->n; - int ldupd = lookup->ldupd; + ipc_ idx = lookup->first_idx + threadIdx.x; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + ipc_ ldupd = lookup->ldupd; upd += lookup->updoffset; - const precision_ *d = lookup->d; - const int *perm = lookup->perm; + const rpc_ *d = lookup->d; + const ipc_ *perm = lookup->perm; /* Don't do anything on threads past end of arrays */ if(threadIdx.x>=m) return; /* Task 1: Sum upd and negate */ - precision_ val = upd[idx]; - for(int j=1; j void __global__ d_solve(struct reducing_d_solve_lookup *lookup, - const precision_ *x, precision_ *y) { + const rpc_ *x, rpc_ *y) { /* Read details from lookup */ lookup += blockIdx.x; - int idx = lookup->first_idx + threadIdx.x; - int m = lookup->m; - const precision_ *d = lookup->d; - const int *perm = lookup->perm; + ipc_ idx = lookup->first_idx + threadIdx.x; + ipc_ m = lookup->m; + const rpc_ *d = lookup->d; + const ipc_ *perm = lookup->perm; /* Don't do anything on threads past end of arrays */ if(threadIdx.x>=m) return; /* D solve (note that D is actually stored as inverse already) */ - int rp = perm[idx]; - precision_ val; + ipc_ rp = perm[idx]; + rpc_ val; if(idx!=0 && d[2*idx-1] != 0) { /* second part of 2x2 */ - int rp2 = perm[idx-1]; + ipc_ rp2 = perm[idx-1]; val = d[2*idx-1] * x[rp2] + d[2*idx] * x[rp]; } else if (d[2*idx+1] != 0) { /* first part of 2x2 */ - int rp2 = perm[idx+1]; + ipc_ rp2 = perm[idx+1]; val = d[2*idx] * x[rp] + d[2*idx+1] * x[rp2]; } else { @@ -321,44 +321,44 @@ void __global__ d_solve(struct reducing_d_solve_lookup *lookup, /***********************************************************************/ struct scatter_lookup { - int n; - int src_offset; - const int *index; - int dest_offset; + ipc_ n; + ipc_ src_offset; + const ipc_ *index; + ipc_ dest_offset; }; /* This subroutine performs the scatter operation dest( index(:) ) = src(:) */ -void __global__ scatter(struct scatter_lookup *lookup, const precision_ *src, - precision_ *dest +void __global__ scatter(struct scatter_lookup *lookup, const rpc_ *src, + rpc_ *dest ) { lookup += blockIdx.x; if(threadIdx.x >= lookup->n) return; // Skip on out of range threads src += lookup->src_offset; - const int *index = lookup->index; + const ipc_ *index = lookup->index; dest += lookup->dest_offset; - int idx = index[threadIdx.x]; + ipc_ idx = index[threadIdx.x]; dest[idx] = src[threadIdx.x]; } /* This subroutine performs the scatter operation dest( index(:) ) += src(:) */ -void __global__ scatter_sum(struct scatter_lookup *lookup, const precision_ *src, - precision_ *dest +void __global__ scatter_sum(struct scatter_lookup *lookup, const rpc_ *src, + rpc_ *dest ) { lookup += blockIdx.x; if(threadIdx.x >= lookup->n) return; // Skip on out of range threads src += lookup->src_offset; - const int *index = lookup->index; + const ipc_ *index = lookup->index; dest += lookup->dest_offset; - int idx = index[threadIdx.x]; + ipc_ idx = index[threadIdx.x]; dest[idx] += src[threadIdx.x]; } @@ -368,10 +368,10 @@ void __global__ scatter_sum(struct scatter_lookup *lookup, const 
precision_ *src /***********************************************************************/ struct lookups_gpu_bwd { - int ngemv; - int nrds; - int ntrsv; - int nscatter; + ipc_ ngemv; + ipc_ nrds; + ipc_ ntrsv; + ipc_ nscatter; struct gemv_transpose_lookup *gemv; struct reducing_d_solve_lookup *rds; struct trsv_lookup *trsv; @@ -383,34 +383,34 @@ struct lookups_gpu_bwd { * Result y actually output as array with leading dimn m that must be summed * externally. */ -template -void __global__ simple_gemv(int m, int n, const precision_ *a, int lda, - const precision_ *x, precision_ *y) { +template +void __global__ simple_gemv(ipc_ m, ipc_ n, const rpc_ *a, ipc_ lda, + const rpc_ *x, rpc_ *y) { a += blockIdx.x*maxm + (blockIdx.y*maxn)*lda; x += blockIdx.y*maxn; y += m*blockIdx.y + maxm*blockIdx.x; - __shared__ volatile precision_ partSum[maxm*threadsy]; + __shared__ volatile rpc_ partSum[maxm*threadsy]; m = MIN(maxm, m-blockIdx.x*maxm); n = MIN(maxn, n-blockIdx.y*maxn); - volatile precision_ *const ps = partSum + maxm*threadIdx.y; - for(int j=threadIdx.x; j -void __global__ simple_gemv_lookup(const precision_ *x, precision_ *y, +template +void __global__ simple_gemv_lookup(const rpc_ *x, rpc_ *y, struct gemv_notrans_lookup *lookup) { lookup += blockIdx.x; - int m = lookup->m; - int n = lookup->n; - precision_ const* a = lookup->a; - int lda = lookup->lda; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + rpc_ const* a = lookup->a; + ipc_ lda = lookup->lda; x += lookup->x_offset; y += lookup->y_offset; - __shared__ volatile precision_ partSum[maxm*threadsy]; + __shared__ volatile rpc_ partSum[maxm*threadsy]; - volatile precision_ *const ps = partSum + maxm*threadIdx.y; + volatile rpc_ *const ps = partSum + maxm*threadIdx.y; // Templated parameters for shortcut if (maxm <= threadsx) { ps[threadIdx.x] = 0; } else { - for(int j=threadIdx.x; j= numLookups) return; lookup += offset; - int m = lookup->m; + ipc_ m = lookup->m; if(threadIdx.x>=m) return; - int n = lookup->n; + ipc_ n = lookup->n; src += lookup->src_offset + threadIdx.x; - int ldsrc = lookup->ldsrc; - precision_ *d = dest[lookup->dest_idx] + lookup->dest_offset; + ipc_ ldsrc = lookup->ldsrc; + rpc_ *d = dest[lookup->dest_idx] + lookup->dest_offset; - precision_ val = 0; - for(int i=0; icp; - int blk = blkdata->blk; - int m = lookup->m; - int nelim = lookup->nelim; - precision_ *xparent = cvalues[lookup->cvparent]; - volatile const precision_ *xchild = cvalues[lookup->cvchild]; - const int * list = *(lookup->list); + ipc_ blk = blkdata->blk; + ipc_ m = lookup->m; + ipc_ nelim = lookup->nelim; + rpc_ *xparent = cvalues[lookup->cvparent]; + volatile const rpc_ *xchild = cvalues[lookup->cvchild]; + const ipc_ * list = *(lookup->list); xlocal += lookup->x_offset; // Wait for previous children to complete @@ -580,8 +580,8 @@ void __global__ assemble_lvl(struct assemble_lookup2 *lookup, struct assemble_bl xchild += blk*ASSEMBLE_NB; // Perform actual assembly - for(int i=threadIdx.x; isync_offset]), 1); + atomicAdd((ipc_*)&(sync[lookup->sync_offset]), 1); } } -void __global__ grabx(precision_ *xlocal, precision_ **xstack, const precision_ *x, +void __global__ grabx(rpc_ *xlocal, rpc_ **xstack, const rpc_ *x, struct assemble_lookup *lookup) { lookup += blockIdx.x; if(threadIdx.x>=lookup->m) return; - int xend = lookup->xend; - precision_ *contrib = + ipc_ xend = lookup->xend; + rpc_ *contrib = (threadIdx.x>=xend) ? 
xstack[lookup->contrib_idx]+lookup->contrib_offset : NULL; xlocal += lookup->x_offset; - int row = lookup->list[threadIdx.x]; + ipc_ row = lookup->list[threadIdx.x]; if(threadIdx.x0) { - for(int i=0; i>> (sync+2*i); CudaCheckError(); } - for(int i=0; inassemble; i+=65535) + for(ipc_ i=0; inassemble; i+=65535) grabx <<nassemble-i), ASSEMBLE_NB, 0, *stream>>> (xlocal_gpu, xstack_gpu, x_gpu, gpu->assemble+i); - cudaMemset(asm_sync, 0, (1+gpu->nasm_sync)*sizeof(int)); - for(int i=0; inasmblk; i+=65535) + cudaMemset(asm_sync, 0, (1+gpu->nasm_sync)*sizeof(ipc_)); + for(ipc_ i=0; inasmblk; i+=65535) assemble_lvl <<nasmblk-i), ASSEMBLE_NB, 0, *stream>>> (gpu->assemble2, gpu->asmblk, xlocal_gpu, &asm_sync[0], &asm_sync[1], cvalues_gpu); CudaCheckError(); if(gpu->ntrsv>0) { if(posdef) { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_ln_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (xlocal_gpu, sync, gpu->trsv+i); } else { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_ln_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (xlocal_gpu, sync, gpu->trsv+i); } CudaCheckError(); } if(gpu->ngemv>0) { - for(int i=0; ingemv; i+=65535) + for(ipc_ i=0; ingemv; i+=65535) simple_gemv_lookup <<ngemv-i), dim3(GEMV_THREADSX,GEMV_THREADSY), 0, *stream>>> @@ -699,15 +699,15 @@ void spral_ssids_run_fwd_solve_kernels(bool posdef, (work_gpu, cvalues_gpu, gpu->nreduce, gpu->reduce); CudaCheckError(); } - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, xlocal_gpu, x_gpu); CudaCheckError(); } -void spral_ssids_run_d_solve_kernel(precision_ *x_gpu, - precision_ *y_gpu, struct lookups_gpu_bwd *gpu, +void spral_ssids_run_d_solve_kernel(rpc_ *x_gpu, + rpc_ *y_gpu, struct lookups_gpu_bwd *gpu, const cudaStream_t *stream) { if(gpu->nrds>0) { @@ -720,18 +720,18 @@ void spral_ssids_run_d_solve_kernel(precision_ *x_gpu, } void spral_ssids_run_bwd_solve_kernels(bool dsolve, - bool unit_diagonal, precision_ *x_gpu, precision_ *work_gpu, - int nsync, int *sync_gpu, struct lookups_gpu_bwd *gpu, + bool unit_diagonal, rpc_ *x_gpu, rpc_ *work_gpu, + ipc_ nsync, ipc_ *sync_gpu, struct lookups_gpu_bwd *gpu, const cudaStream_t *stream) { /* === Kernel Launches === */ if(nsync>0) { - for(int i=0; i>> (sync_gpu+2*i); CudaCheckError(); } if(gpu->ngemv>0) { - for(int i=0; ingemv; i+=65535) + for(ipc_ i=0; ingemv; i+=65535) gemv_transpose_sps_rhs <<ngemv-i), dim3(TRSM_TR_THREADSX,TRSM_TR_THREADSY), 0, *stream>>> @@ -741,13 +741,13 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, if(gpu->nrds>0) { if(dsolve) { - for(int i=0; inrds; i+=65535) + for(ipc_ i=0; inrds; i+=65535) reducing_d_solve <<nrds-i), REDUCING_D_SOLVE_THREADS_PER_BLOCK, 0, *stream>>> (gpu->rds+i, work_gpu, x_gpu); } else { - for(int i=0; inrds; i+=65535) + for(ipc_ i=0; inrds; i+=65535) reducing_d_solve <<nrds-i), REDUCING_D_SOLVE_THREADS_PER_BLOCK, 0, *stream>>> @@ -758,15 +758,15 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, if(gpu->ntrsv>0) { if(unit_diagonal) { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_lt_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (gpu->trsv+i, work_gpu, sync_gpu); } else { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_lt_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (gpu->trsv+i, work_gpu, sync_gpu); } @@ -774,7 +774,7 @@ void 
spral_ssids_run_bwd_solve_kernels(bool dsolve, } if(gpu->nscatter>0) { - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, work_gpu, x_gpu); @@ -784,10 +784,10 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, void spral_ssids_run_slv_contrib_fwd( struct lookup_contrib_fwd const* gpu, - precision_* x_gpu, precision_ const* xstack_gpu, + rpc_* x_gpu, rpc_ const* xstack_gpu, const cudaStream_t *stream) { if(gpu->nscatter>0) { - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter_sum <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, xstack_gpu, x_gpu); diff --git a/src/ssids/syrk.cu b/src/ssids/syrk.cu index 59d7413429..235ee9f3d6 100644 --- a/src/ssids/syrk.cu +++ b/src/ssids/syrk.cu @@ -2,17 +2,18 @@ * Copyright (c) 2013 NVIDIA * Authors: Evgueni Ovtchinnikov (STFC) * Jeremy Appleyard (NVIDIA) + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:40 GMT */ #include #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define loadDevToSmem_generic loadDevToSmem_generic_single #define multisyrk_type multisyrk_type_single #define multielm_data multielm_data_single @@ -23,7 +24,6 @@ #define spral_ssids_multidsyrk spral_ssids_multidsyrk_single #define spral_ssids_multidsyrk_low_col spral_ssids_multidsyrk_low_col_single #else -#define precision_ double #define loadDevToSmem_generic loadDevToSmem_generic_double #define multisyrk_type multisyrk_type_double #define multielm_data multielm_data_double @@ -50,18 +50,18 @@ namespace /* anon */ { -template< int WIDTH > +template< ipc_ WIDTH > inline __device__ void -loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile precision_ *const __restrict__ bs, - const precision_* __restrict__ a, const precision_* __restrict__ b, - int bx, int by, int offa, int lda, int ldb, - int n, int i, int k) +loadDevToSmem_generic( volatile rpc_ *const __restrict__ as, volatile rpc_ *const __restrict__ bs, + const rpc_* __restrict__ a, const rpc_* __restrict__ b, + ipc_ bx, ipc_ by, ipc_ offa, ipc_ lda, ipc_ ldb, + ipc_ n, ipc_ i, ipc_ k) { switch (WIDTH) { case 4: if ( i + 3 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -70,7 +70,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -81,7 +81,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else if ( i + 2 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -90,7 +90,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + 
(threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -101,7 +101,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else if ( i + 1 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -110,7 +110,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -121,7 +121,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = 0.0; @@ -130,7 +130,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = 0.0; @@ -144,14 +144,14 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec case 2: if ( i + 1 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -160,14 +160,14 @@ case 2: } else { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = 0.0; } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = 0.0; @@ -182,14 +182,14 @@ case 2: } struct multisyrk_type { - int first; - precision_ *lval; - precision_ *ldval; + ipc_ first; + rpc_ *lval; + rpc_ *ldval; long offc; - int n; - int k; - int lda; - int ldb; + ipc_ n; + ipc_ k; + ipc_ lda; + ipc_ ldb; }; // multisyrk kernels below compute the low trangular part of a*b^T @@ -201,11 +201,12 @@ __launch_bounds__(64, 14) #endif __global__ void cu_multisyrk_lc_r4x4( - const struct multisyrk_type* msdata, int off, ELEMENT_TYPE* c + const struct multisyrk_type* msdata, ipc_ off, ELEMENT_TYPE* c ){ -// The number of elements we want in each shared memory buffer depends on the shared memory:register ratio -// SM 3.0+ has precision_ the number of registers per shared memory, so need half the shared memory here. 
+// The number of elements we want in each shared memory buffer depends on +// the shared memory:register ratio. SM 3.0+ has double the number of +// registers per shared memory, so need half the shared memory here. #if SM_3X #define SYRK_WIDTH 4 #define DOUBLE_BUFFERED 0 @@ -232,22 +233,22 @@ cu_multisyrk_lc_r4x4( #endif msdata += blockIdx.x; - int first = msdata->first; + ipc_ first = msdata->first; const ELEMENT_TYPE * __restrict__ a = msdata->lval; const ELEMENT_TYPE * __restrict__ b = msdata->ldval; - int offc = msdata->offc; - int n = msdata->n; - int k = msdata->k; - int lda = msdata->lda; - int ldb = msdata->ldb; + ipc_ offc = msdata->offc; + ipc_ n = msdata->n; + ipc_ k = msdata->k; + ipc_ lda = msdata->lda; + ipc_ ldb = msdata->ldb; if ( n < 1 ) return; - int bx, by; + ipc_ bx, by; { - int nb = (n - 1)/32 + 1; + ipc_ nb = (n - 1)/32 + 1; for ( bx = 0, by = 0; by < nb; by++ ) { if ( off + blockIdx.x - first - bx < nb - by ) { bx = off + blockIdx.x - first - bx + by; @@ -259,23 +260,23 @@ cu_multisyrk_lc_r4x4( #if (USE_DOUBLE2) double2 s[8]; - for ( int i = 0; i < 8; i++ ) { + for ( ipc_ i = 0; i < 8; i++ ) { s[i].x = 0.0; s[i].y = 0.0; } #else ELEMENT_TYPE s[16]; - for ( int i = 0; i < 16; i++ ) + for ( ipc_ i = 0; i < 16; i++ ) s[i] = 0.0; #endif #if (SYRK_WIDTH <= 2 && DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, 0, k ); #endif - for ( int i = 0; i < k; i += SYRK_WIDTH ) { + for ( ipc_ i = 0; i < k; i += SYRK_WIDTH ) { @@ -286,21 +287,21 @@ cu_multisyrk_lc_r4x4( // challenge to get it working without spilling. #if (DOUBLE_BUFFERED) if ( i + SYRK_WIDTH < k ) { - loadDevToSmem_generic( (volatile precision_*)as2, bs2, + loadDevToSmem_generic( (volatile rpc_*)as2, bs2, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); } #endif // (DOUBLE_BUFFERED) #if (SYRK_WIDTH > 2 || DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i, k ); #endif __syncthreads(); #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { #if (USE_DOUBLE2) s[iy*2 ].x += as[threadIdx.x + ix * 16 ].x*bs[threadIdx.y + 8*iy + ix * 32]; s[iy*2 ].y += as[threadIdx.x + ix * 16 ].y*bs[threadIdx.y + 8*iy + ix * 32]; @@ -324,13 +325,13 @@ cu_multisyrk_lc_r4x4( __syncthreads(); if ( i + SYRK_WIDTH < k ) { #if (SYRK_WIDTH <= 2) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); #endif } #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { #if (USE_DOUBLE2) s[iy*2 ].x += as2[threadIdx.x + ix * 16 ].x*bs2[threadIdx.y + 8*iy + ix * 32]; s[iy*2 ].y += as2[threadIdx.x + ix * 16 ].y*bs2[threadIdx.y + 8*iy + ix * 32]; @@ -351,10 +352,10 @@ cu_multisyrk_lc_r4x4( } #if (USE_DOUBLE2) - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 2; ix++ ) { - int x = threadIdx.x * 2 + ix*16 + bx*32; - int y = threadIdx.y + iy*8 + by*32; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 2; ix++ ) { + ipc_ x = threadIdx.x * 2 + ix*16 + bx*32; + ipc_ y = threadIdx.y + iy*8 + by*32; if ( x < n && y < n && y <=
x ) { c[offc + x + y*n] = -s[ix + iy*2].x; } @@ -366,38 +367,38 @@ cu_multisyrk_lc_r4x4( } } #else - int xMaxBase = (3 + bx*4)*8; - int yMaxBase = (3 + by*4)*8; + ipc_ xMaxBase = (3 + bx*4)*8; + ipc_ yMaxBase = (3 + by*4)*8; - int XNPass = xMaxBase + 8 < n; - int YNPass = yMaxBase + 8 < n; - int YXPass = yMaxBase + 8 <= xMaxBase; + ipc_ XNPass = xMaxBase + 8 < n; + ipc_ YNPass = yMaxBase + 8 < n; + ipc_ YXPass = yMaxBase + 8 <= xMaxBase; // This is only a small improvement (~1%) if (XNPass && YNPass && YXPass) { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; c[offc + x + y*n] = -s[ix + iy*4]; } } } else if (XNPass && YNPass) { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( y <= x ) c[offc + x + y*n] = -s[ix + iy*4]; } } } else { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( x < n && y < n && y <= x ) c[offc + x + y*n] = -s[ix + iy*4]; } @@ -412,8 +413,8 @@ cu_multisyrk_lc_r4x4( } struct multielm_data { - int node; - int offb; + ipc_ node; + ipc_ offb; }; template< typename ELEMENT_TYPE > @@ -423,16 +424,16 @@ template< typename ELEMENT_TYPE > __global__ void cu_multisyrk_r4x4( bool posdef, - int* stat, + ipc_* stat, multielm_data* mdata, - int off, + ipc_ off, struct multinode_fact_type *ndatat ){ - int bx, by; - int n, m, k; - int offa, offc; - int lda, ldb; - int nb; + ipc_ bx, by; + ipc_ n, m, k; + ipc_ offa, offc; + ipc_ lda, ldb; + ipc_ nb; ELEMENT_TYPE s[16]; #if SM_3X #define SYRK_WIDTH 2 @@ -467,9 +468,9 @@ cu_multisyrk_r4x4( if ( by >= n || by >= m ) return; - const precision_ * __restrict__ a = ndatat->lval; - const precision_ * __restrict__ b = posdef ? ndatat->lval : ndatat->ldval; - precision_ * __restrict__ c = ndatat->lval; + const rpc_ * __restrict__ a = ndatat->lval; + const rpc_ * __restrict__ b = posdef ? 
ndatat->lval : ndatat->ldval; + rpc_ * __restrict__ c = ndatat->lval; offa = by + lda*n; offc = by + by*n; @@ -486,17 +487,17 @@ cu_multisyrk_r4x4( bx = by%nb; by = by/nb; - for ( int i = 0; i < 16; i++ ) { + for ( ipc_ i = 0; i < 16; i++ ) { s[i] = 0.0; } #if (DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, offa, lda, ldb, n, 0, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, offa, lda, ldb, n, 0, k ); #endif - for ( int i = 0; i < k; i += SYRK_WIDTH ) { + for ( ipc_ i = 0; i < k; i += SYRK_WIDTH ) { #if (!DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, offa, lda, ldb, n, i, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, offa, lda, ldb, n, i, k ); #endif __syncthreads(); @@ -508,8 +509,8 @@ cu_multisyrk_r4x4( #endif #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 32 * ix ]*bs[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 1] += as[threadIdx.x + 32 * ix + 8 ]*bs[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 2] += as[threadIdx.x + 32 * ix + 16]*bs[threadIdx.y + 8*iy + 32 * ix]; @@ -529,8 +530,8 @@ cu_multisyrk_r4x4( } #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as2[threadIdx.x + 32 * ix ]*bs2[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 1] += as2[threadIdx.x + 32 * ix + 8 ]*bs2[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 2] += as2[threadIdx.x + 32 * ix + 16]*bs2[threadIdx.y + 8*iy + 32 * ix]; @@ -540,10 +541,10 @@ cu_multisyrk_r4x4( #endif } - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( x < n && y < m ) c[offc + x + y*lda] = c[offc + x + y*lda] - s[ix + iy*4]; } @@ -552,45 +553,45 @@ cu_multisyrk_r4x4( template< typename ELEMENT_TYPE > __global__ void cu_syrk_r4x4( - int n, int m, int k, - precision_ alpha, const precision_* a, int lda, const precision_* b, int ldb, - precision_ beta, precision_* c, int ldc + ipc_ n, ipc_ m, ipc_ k, + rpc_ alpha, const rpc_* a, ipc_ lda, const rpc_* b, ipc_ ldb, + rpc_ beta, rpc_* c, ipc_ ldc ){ ELEMENT_TYPE s[16]; __shared__ volatile ELEMENT_TYPE as[128], bs[128]; - for ( int i = 0; i < 16; i++ ) + for ( ipc_ i = 0; i < 16; i++ ) s[i] = 0; - for ( int i = 0; i < k; i += 4 ) { + for ( ipc_ i = 0; i < k; i += 4 ) { loadDevToSmem_generic< 4 >( as, bs, a, b, blockIdx.x, blockIdx.y, 0, lda, ldb, n, i, k ); __syncthreads(); - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x ]*bs[threadIdx.y + 8*iy]; s[iy*4 + 1] += as[threadIdx.x + 8 ]*bs[threadIdx.y + 8*iy]; s[iy*4 + 2] += as[threadIdx.x + 16]*bs[threadIdx.y + 8*iy]; s[iy*4 + 3] += as[threadIdx.x + 24]*bs[threadIdx.y + 8*iy]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 32]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 1] += as[threadIdx.x + 40]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 2] += as[threadIdx.x + 48]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 3] += as[threadIdx.x + 56]*bs[threadIdx.y + 8*iy + 32]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 
0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 64]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 1] += as[threadIdx.x + 72]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 2] += as[threadIdx.x + 80]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 3] += as[threadIdx.x + 88]*bs[threadIdx.y + 8*iy + 64]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 96 ]*bs[threadIdx.y + 8*iy + 96]; s[iy*4 + 1] += as[threadIdx.x + 104]*bs[threadIdx.y + 8*iy + 96]; s[iy*4 + 2] += as[threadIdx.x + 112]*bs[threadIdx.y + 8*iy + 96]; @@ -601,19 +602,19 @@ cu_syrk_r4x4( } if ( beta ) { - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + blockIdx.x*4)*8; - int y = threadIdx.y + (iy + blockIdx.y*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + blockIdx.x*4)*8; + ipc_ y = threadIdx.y + (iy + blockIdx.y*4)*8; if ( x < n && y < m ) c[x + y*ldc] = beta*c[x + y*ldc] + alpha*s[ix + iy*4]; } } else { - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + blockIdx.x*4)*8; - int y = threadIdx.y + (iy + blockIdx.y*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + blockIdx.x*4)*8; + ipc_ y = threadIdx.y + (iy + blockIdx.y*4)*8; if ( x < n && y < m ) c[x + y*ldc] = alpha*s[ix + iy*4]; } @@ -629,36 +630,36 @@ cu_syrk_r4x4( extern "C" { -void spral_ssids_dsyrk(cudaStream_t *stream, int n, int m, int k, - precision_ alpha, const precision_* a, int lda, const precision_* b, - int ldb, precision_ beta, precision_* c, int ldc) { - int nx, ny; +void spral_ssids_dsyrk(cudaStream_t *stream, ipc_ n, ipc_ m, ipc_ k, + rpc_ alpha, const rpc_* a, ipc_ lda, const rpc_* b, + ipc_ ldb, rpc_ beta, rpc_* c, ipc_ ldc) { + ipc_ nx, ny; nx = (n - 1)/32 + 1; ny = (m - 1)/32 + 1; dim3 threads(8,8); dim3 grid(nx,ny); - cu_syrk_r4x4< precision_ > <<< grid, threads, 0, *stream >>> + cu_syrk_r4x4< rpc_ > <<< grid, threads, 0, *stream >>> ( n, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); } -void spral_ssids_multidsyrk(cudaStream_t *stream, bool posdef, int nb, - int* stat, struct multielm_data* mdata, +void spral_ssids_multidsyrk(cudaStream_t *stream, bool posdef, ipc_ nb, + ipc_* stat, struct multielm_data* mdata, struct multinode_fact_type *ndata) { dim3 threads(8,8); - for ( int i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { - int blocks = min(MAX_CUDA_BLOCKS, nb - i); - cu_multisyrk_r4x4< precision_ > + for ( ipc_ i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { + ipc_ blocks = min(MAX_CUDA_BLOCKS, nb - i); + cu_multisyrk_r4x4< rpc_ > <<< blocks, threads, 0, *stream >>> ( posdef, stat, mdata + i, i, ndata ); } } -void spral_ssids_multidsyrk_low_col(cudaStream_t *stream, int nb, - struct multisyrk_type* msdata, precision_* c) { +void spral_ssids_multidsyrk_low_col(cudaStream_t *stream, ipc_ nb, + struct multisyrk_type* msdata, rpc_* c) { dim3 threads(8,8); - for ( int i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { - int blocks = min(MAX_CUDA_BLOCKS, nb - i); - cu_multisyrk_lc_r4x4< precision_ > + for ( ipc_ i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { + ipc_ blocks = min(MAX_CUDA_BLOCKS, nb - i); + cu_multisyrk_lc_r4x4< rpc_ > <<< blocks, threads, 0, *stream >>>( msdata + i, i, c ); } } diff --git a/src/ssids/wrappers.cxx b/src/ssids/wrappers.cxx index 813e005c2f..1222328e20 100644 --- a/src/ssids/wrappers.cxx +++ b/src/ssids/wrappers.cxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * 
\licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT */ #include "ssids_cpu_kernels_wrappers.hxx" @@ -11,34 +12,34 @@ /* ================ SINGLE PRECISION WITH 64 BIT INTEGERS =================== */ -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 extern "C" { void spral_c_sgemm_64(char* transa, char* transb, - int64_t* m, int64_t* n, int64_t* k, - float* alpha, const float* a, int64_t* lda, - const float* b, int64_t* ldb, float *beta, - float* c, int64_t* ldc); + int64_t* m, int64_t* n, int64_t* k, + float* alpha, const float* a, int64_t* lda, + const float* b, int64_t* ldb, float *beta, + float* c, int64_t* ldc); void spral_c_spotrf_64(char *uplo, int64_t *n, float *a, - int64_t *lda, int64_t *info); + int64_t *lda, int64_t *info); void spral_c_ssytrf_64(char *uplo, int64_t *n, float *a, - int64_t *lda, int64_t *ipiv, float *work, - int64_t *lwork, int64_t *info); + int64_t *lda, int64_t *ipiv, float *work, + int64_t *lwork, int64_t *info); void spral_c_strsm_64(char *side, char *uplo, char *transa, - char *diag, int64_t *m, int64_t *n, - const float *alpha, const float *a, - int64_t *lda, float *b, int64_t *ldb); + char *diag, int64_t *m, int64_t *n, + const float *alpha, const float *a, + int64_t *lda, float *b, int64_t *ldb); void spral_c_ssyrk_64(char *uplo, char *trans, - int64_t *n, int64_t *k, float *alpha, - const float *a, int64_t *lda, float *beta, - float *c, int64_t *ldc); + int64_t *n, int64_t *k, float *alpha, + const float *a, int64_t *lda, float *beta, + float *c, int64_t *ldc); void spral_c_strsv_64(char *uplo, char *trans, char *diag, - int64_t *n, const float *a, int64_t *lda, - float *x, int64_t *incx); + int64_t *n, const float *a, int64_t *lda, + float *x, int64_t *incx); void spral_c_sgemv_64(char *trans, int64_t *m, int64_t *n, - const float* alpha, const float* a, - int64_t *lda, const float* x, int64_t* incx, - const float* beta, float* y, int64_t* incy); + const float* alpha, const float* a, + int64_t *lda, const float* x, int64_t* incx, + const float* beta, float* y, int64_t* incy); } namespace spral { namespace ssids { namespace cpu { @@ -46,10 +47,10 @@ namespace spral { namespace ssids { namespace cpu { /* _GEMM */ template <> void host_gemm_64(enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, float alpha, - const float* a, int64_t lda, const float* b, - int64_t ldb, float beta, float* c, int64_t ldc) { + enum spral::ssids::cpu::operation transb, + int64_t m, int64_t n, int64_t k, float alpha, + const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc) { char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char ftransb = (transb==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_sgemm_64(&ftransa, &ftransb, &m, &n, &k, &alpha, a, &lda, @@ -59,9 +60,9 @@ void host_gemm_64(enum spral::ssids::cpu::operation transa, /* _GEMV */ template <> void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, float alpha, const float* a, - int64_t lda, const float* x, int64_t incx, - float beta, float* y, int64_t incy) { + int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, const float* x, int64_t incx, + float beta, float* y, int64_t incy) { char ftrans = (trans==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; spral_c_sgemv_64(&ftrans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); @@ -70,7 +71,7 @@ void gemv_64(enum spral::ssids::cpu::operation trans, /* _POTRF */ template<> int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, float* a, int64_t lda) { + int64_t n, float* a, int64_t lda) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -85,8 +86,8 @@ int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYTRF - Bunch-Kaufman factorization */ template<> int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, float* a, int64_t lda, - int64_t *ipiv, float* work, int64_t lwork) { + int64_t n, float* a, int64_t lda, + int64_t *ipiv, float* work, int64_t lwork) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -101,9 +102,9 @@ int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYRK */ template <> void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, float alpha, const float* a, - int64_t lda, float beta, float* c, int64_t ldc) { + enum spral::ssids::cpu::operation trans, + int64_t n, int64_t k, float alpha, const float* a, + int64_t lda, float beta, float* c, int64_t ldc) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_ssyrk_64(&fuplo, &ftrans, &n, &k, &alpha, a, &lda, &beta, c, &ldc); @@ -112,10 +113,10 @@ void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSV */ template <> void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - enum spral::ssids::cpu::diagonal diag, - int64_t n, const float* a, int64_t lda, - float* x, int64_t incx) { + enum spral::ssids::cpu::operation trans, + enum spral::ssids::cpu::diagonal diag, + int64_t n, const float* a, int64_t lda, + float* x, int64_t incx) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char fdiag = (diag==spral::ssids::cpu::DIAG_UNIT) ? 'U' : 'N'; @@ -125,11 +126,11 @@ void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSM */ template <> void host_trsm_64(enum spral::ssids::cpu::side side, - enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, float alpha, const float* a, - int64_t lda, float* b, int64_t ldb) { + enum spral::ssids::cpu::fillmode uplo, + enum spral::ssids::cpu::operation transa, + enum spral::ssids::cpu::diagonal diag, + int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, float* b, int64_t ldb) { char fside = (side==spral::ssids::cpu::SIDE_LEFT) ? 'L' : 'R'; char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftransa = (transa==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; @@ -277,34 +278,34 @@ void host_trsm(enum spral::ssids::cpu::side side, /* ================ DOUBLE PRECISION WITH 64 BIT INTEGERS =================== */ -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 extern "C" { void spral_c_dgemm_64(char* transa, char* transb, - int64_t* m, int64_t* n, int64_t* k, - double* alpha, const double* a, int64_t* lda, - const double* b, int64_t* ldb, double *beta, - double* c, int64_t* ldc); + int64_t* m, int64_t* n, int64_t* k, + double* alpha, const double* a, int64_t* lda, + const double* b, int64_t* ldb, double *beta, + double* c, int64_t* ldc); void spral_c_dpotrf_64(char *uplo, int64_t *n, double *a, - int64_t *lda, int64_t *info); + int64_t *lda, int64_t *info); void spral_c_dsytrf_64(char *uplo, int64_t *n, double *a, - int64_t *lda, int64_t *ipiv, double *work, - int64_t *lwork, int64_t *info); + int64_t *lda, int64_t *ipiv, double *work, + int64_t *lwork, int64_t *info); void spral_c_dtrsm_64(char *side, char *uplo, char *transa, - char *diag, int64_t *m, int64_t *n, - const double *alpha, const double *a, - int64_t *lda, double *b, int64_t *ldb); + char *diag, int64_t *m, int64_t *n, + const double *alpha, const double *a, + int64_t *lda, double *b, int64_t *ldb); void spral_c_dsyrk_64(char *uplo, char *trans, - int64_t *n, int64_t *k, double *alpha, - const double *a, int64_t *lda, double *beta, - double *c, int64_t *ldc); + int64_t *n, int64_t *k, double *alpha, + const double *a, int64_t *lda, double *beta, + double *c, int64_t *ldc); void spral_c_dtrsv_64(char *uplo, char *trans, char *diag, - int64_t *n, const double *a, int64_t *lda, - double *x, int64_t *incx); + int64_t *n, const double *a, int64_t *lda, + double *x, int64_t *incx); void spral_c_dgemv_64(char *trans, int64_t *m, int64_t *n, - const double* alpha, const double* a, - int64_t *lda, const double* x, int64_t* incx, - const double* beta, double* y, int64_t* incy); + const double* alpha, const double* a, + int64_t *lda, const double* x, int64_t* incx, + const double* beta, double* y, int64_t* incy); } namespace spral { namespace ssids { namespace cpu { @@ -312,10 +313,10 @@ namespace spral { namespace ssids { namespace cpu { /* _GEMM */ template <> void host_gemm_64(enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, double alpha, - const double* a, int64_t lda, const double* b, - int64_t ldb, double beta, double* c, int64_t ldc) { + enum spral::ssids::cpu::operation transb, + int64_t m, int64_t n, int64_t k, double alpha, + const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc) { char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char ftransb = (transb==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_dgemm_64(&ftransa, &ftransb, &m, &n, &k, &alpha, a, &lda, @@ -325,9 +326,9 @@ void host_gemm_64(enum spral::ssids::cpu::operation transa, /* _GEMV */ template <> void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, double alpha, const double* a, - int64_t lda, const double* x, int64_t incx, - double beta, double* y, int64_t incy) { + int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy) { char ftrans = (trans==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; spral_c_dgemv_64(&ftrans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); @@ -336,7 +337,7 @@ void gemv_64(enum spral::ssids::cpu::operation trans, /* _POTRF */ template<> int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, double* a, int64_t lda) { + int64_t n, double* a, int64_t lda) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -351,8 +352,9 @@ int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYTRF - Bunch-Kaufman factorization */ template<> int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, double* a, int64_t lda, int64_t *ipiv, - double* work, int64_t lwork) { + int64_t n, double* a, + int64_t lda, int64_t *ipiv, + double* work, int64_t lwork) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -367,9 +369,9 @@ int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYRK */ template <> void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, double alpha, const double* a, - int64_t lda, double beta, double* c, int64_t ldc) { + enum spral::ssids::cpu::operation trans, + int64_t n, int64_t k, double alpha, const double* a, + int64_t lda, double beta, double* c, int64_t ldc) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_dsyrk_64(&fuplo, &ftrans, &n, &k, &alpha, a, &lda, &beta, c, &ldc); @@ -378,10 +380,10 @@ void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSV */ template <> void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - enum spral::ssids::cpu::diagonal diag, - int64_t n, const double* a, int64_t lda, - double* x, int64_t incx) { + enum spral::ssids::cpu::operation trans, + enum spral::ssids::cpu::diagonal diag, + int64_t n, const double* a, int64_t lda, + double* x, int64_t incx) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char fdiag = (diag==spral::ssids::cpu::DIAG_UNIT) ? 'U' : 'N'; @@ -391,11 +393,11 @@ void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSM */ template <> void host_trsm_64(enum spral::ssids::cpu::side side, - enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, double alpha, const double* a, - int64_t lda, double* b, int64_t ldb) { + enum spral::ssids::cpu::fillmode uplo, + enum spral::ssids::cpu::operation transa, + enum spral::ssids::cpu::diagonal diag, + int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, double* b, int64_t ldb) { char fside = (side==spral::ssids::cpu::SIDE_LEFT) ? 'L' : 'R'; char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T';