diff --git a/include/galahad_blas.h b/include/galahad_blas.h
index 2e3077f28b..0456227e0e 100644
--- a/include/galahad_blas.h
+++ b/include/galahad_blas.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define DASUM DASUM64
 #define DCABS1 DCABS164
 #define DDOT DDOT64
@@ -69,7 +69,7 @@
 #define ZTRMM ZTRMM64
 #define ZTRMV ZTRMV64
 #define ZTRSM ZTRSM64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define DASUM DASUM__64
 #define DCABS1 DCABS1__64
 #define DDOT DDOT__64
@@ -138,7 +138,7 @@
 #define ZTRMM ZTRMM__64
 #define ZTRMV ZTRMV__64
 #define ZTRSM ZTRSM__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define DASUM DASUM_64
 #define DCABS1 DCABS1_64
diff --git a/include/galahad_blas_original.h b/include/galahad_blas_original.h
index 5a24f81068..7972a8b267 100644
--- a/include/galahad_blas_original.h
+++ b/include/galahad_blas_original.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define SNRM2 SNRM264
 #define DNRM2 DNRM264
 #define ISAMAX ISAMAX64
@@ -25,7 +25,7 @@
 #define DGEMM DGEMM64
 #define SGER SGER64
 #define DGER DGER64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define SNRM2 SNRM2__64
 #define DNRM2 DNRM2__64
 #define ISAMAX ISAMAX__64
@@ -50,7 +50,7 @@
 #define DGEMM DGEMM__64
 #define SGER SGER__64
 #define DGER DGER__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #define SNRM2 SNRM2
 #define DNRM2 DNRM2
 #define ISAMAX ISAMAX
diff --git a/include/galahad_kinds.h b/include/galahad_kinds.h
index dd34d8890b..789e8c7dc3 100644
--- a/include/galahad_kinds.h
+++ b/include/galahad_kinds.h
@@ -1,4 +1,4 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_KINDS_single GALAHAD_KINDS_single_64
 #define GALAHAD_KINDS_double GALAHAD_KINDS_double_64
 #endif
diff --git a/include/galahad_lapack.h b/include/galahad_lapack.h
index eb07daa2a5..878d7c10b5 100644
--- a/include/galahad_lapack.h
+++ b/include/galahad_lapack.h
@@ -1,7 +1,7 @@
 #include "galahad_blas.h"
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define DISNAN DISNAN64
 #define DLADIV DLADIV64
 #define DLAISN DLAISN64
@@ -275,7 +275,7 @@
 #define ZLARFB ZLARFB64
 #define ZLARFG ZLARFG64
 #define ZLARFT ZLARFT64
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define DISNAN DISNAN__64
 #define DLADIV DLADIV__64
 #define DLAISN DLAISN__64
@@ -549,7 +549,7 @@
 #define ZLARFB ZLARFB__64
 #define ZLARFG ZLARFG__64
 #define ZLARFT ZLARFT__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define DISNAN DISNAN_64
 #define DLADIV DLADIV_64
diff --git a/include/galahad_lapack_original.h b/include/galahad_lapack_original.h
index 03da26d27e..e2dc2247a7 100644
--- a/include/galahad_lapack_original.h
+++ b/include/galahad_lapack_original.h
@@ -1,6 +1,6 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
-#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER
+#ifdef NO_UNDERSCORE_INTEGER_64
 #define SGETRF SGETRF64
 #define DGETRF DGETRF64
 #define SGETRS SGETRS64
 #define DGETRS DGETRS64
@@ -39,7 +39,7 @@
 #define DSTERF DSTERF64
 #define SLAEV2 SLAEV264
 #define DLAEV2 DLAEV264
-#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER
+#elif DOUBLE_UNDERSCORE_INTEGER_64
 #define SGETRF SGETRF__64
 #define DGETRF DGETRF__64
 #define SGETRS SGETRS__64
@@ -78,45 +78,7 @@
 #define DSTERF DSTERF__64
 #define SLAEV2 SLAEV2__64
 #define DLAEV2 DLAEV2__64
-#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER
-#define SGETRF SGETRF
-#define DGETRF DGETRF
-#define SGETRS SGETRS
-#define DGETRS DGETRS
-#define SGELS SGELS
-#define DGELS DGELS
-#define SGELSY SGELSY
-#define DGELSY DGELSY
-#define SGELSS SGELSS
-#define DGELSS DGELSS
-#define SGELSD SGELSD
-#define DGELSD DGELSD
-#define SGESVD SGESVD
-#define DGESVD DGESVD
-#define SPTTRF SPTTRF
-#define DPTTRF DPTTRF
-#define SPOTRF SPOTRF
-#define DPOTRF DPOTRF
-#define SPOTRS SPOTRS
-#define DPOTRS DPOTRS
-#define SSYTRF SSYTRF
-#define DSYTRF DSYTRF
-#define SSYTRS SSYTRS
-#define DSYTRS DSYTRS
-#define SPBTRF SPBTRF
-#define DPBTRF DPBTRF
-#define SPBTRS SPBTRS
-#define DPBTRS DPBTRS
-#define SSYEV SSYEV
-#define DSYEV DSYEV
-#define SSYGV SSYGV
-#define DSYGV DSYGV
-#define SHSEQR SHSEQR
-#define DHSEQR DHSEQR
-#define SSTERF SSTERF
-#define DSTERF DSTERF
-#define SLAEV2 SLAEV2
-#define DLAEV2 DLAEV2
+#elif NO_SYMBOL_INTEGER_64
 #else
 #define SGETRF SGETRF_64
 #define DGETRF DGETRF_64
diff --git a/include/galahad_modules.h b/include/galahad_modules.h
index dba91fdb0d..687c15b362 100644
--- a/include/galahad_modules.h
+++ b/include/galahad_modules.h
@@ -1,4 +1,4 @@
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64
 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64
 #define GALAHAD_KINDS_single GALAHAD_KINDS_single_64
@@ -6,7 +6,7 @@
 #endif
 #ifdef GALAHAD_SINGLE
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define CUTEst_interface_precision CUTEST_interface_single_64
 #define CUTEST_interface_precision CUTEST_interface_single_64
@@ -743,7 +743,7 @@
 #else
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define CUTEst_interface_precision CUTEST_interface_double_64
 #define CUTEST_interface_precision CUTEST_interface_double_64
@@ -1479,7 +1479,7 @@
 #ifdef GALAHAD_SINGLE
 #define mumps_struc smumps_struc
 #define MUMPS_STRUC SMUMPS_STRUC
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_MUMPS_TYPES_precision GALAHAD_MUMPS_TYPES_single_64
 #ifdef DUMMY_SMUMPS
 #define MUMPS_precision GALAHAD_SMUMPS_64
@@ -1497,7 +1497,7 @@
 #else
 #define mumps_struc dmumps_struc
 #define MUMPS_STRUC DMUMPS_STRUC
-#ifdef GALAHAD_64BIT_INTEGER
+#ifdef INTEGER_64
 #define GALAHAD_MUMPS_TYPES_precision GALAHAD_MUMPS_TYPES_double_64
 #ifdef DUMMY_DMUMPS
 #define MUMPS_precision GALAHAD_DMUMPS_64
diff --git a/include/metis.h b/include/metis.h
index e3e029ced1..7184844075 100644
--- a/include/metis.h
+++ b/include/metis.h
@@ -30,7 +30,7 @@
 GCC does provides these definitions in stdint.h, but it may require some modifications on other architectures.
--------------------------------------------------------------------------*/ -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define IDXTYPEWIDTH 64 #else #define IDXTYPEWIDTH 32 diff --git a/include/spral_procedures.h b/include/spral_procedures.h index 425a3ec830..c83b023e6f 100644 --- a/include/spral_procedures.h +++ b/include/spral_procedures.h @@ -1,5 +1,5 @@ #ifdef GALAHAD_SINGLE -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_KINDS_precision galahad_kinds_single_64 #define SPRAL_SSIDS_precision spral_ssids_single_64 #else @@ -7,7 +7,7 @@ #define SPRAL_SSIDS_precision spral_ssids_single #endif #else -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_KINDS_precision galahad_kinds_double_64 #define SPRAL_SSIDS_precision spral_ssids_double_64 #else @@ -16,13 +16,13 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_ssids_lapack_iface spral_ssids_lapack_iface_64 #define spral_ssids_blas_iface spral_ssids_blas_iface_64 #endif #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision spral_kinds_single_64 #define spral_kinds_precision spral_kinds_single_64 #define spral_ssids_precision spral_ssids_single_64 @@ -66,7 +66,7 @@ #define spral_matrix_util_precision spral_matrix_util_single #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision spral_kinds_double_64 #define spral_kinds_precision spral_kinds_double_64 #define spral_ssids_precision spral_ssids_double_64 diff --git a/include/spral_ssids.h b/include/spral_ssids.h index f25e0b9361..435e5bfb98 100644 --- a/include/spral_ssids.h +++ b/include/spral_ssids.h @@ -1,4 +1,7 @@ //* \file spral_ssids.h */ +/** + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT + */ #ifdef __cplusplus extern "C" { @@ -13,62 +16,63 @@ extern "C" { // precision #include "galahad_precision.h" +#include "ssids_rip.hxx" /************************************ * Derived types ************************************/ struct spral_ssids_options { - int array_base; // Not in Fortran type - int print_level; - int unit_diagnostics; - int unit_error; - int unit_warning; - int ordering; - int nemin; + ipc_ array_base; // Not in Fortran type + ipc_ print_level; + ipc_ unit_diagnostics; + ipc_ unit_error; + ipc_ unit_warning; + ipc_ ordering; + ipc_ nemin; bool ignore_numa; bool use_gpu; bool gpu_only; - int64_t min_gpu_work; + longc_ min_gpu_work; float max_load_inbalance; float gpu_perf_coeff; - int scaling; - int64_t small_subtree_threshold; - int cpu_block_size; + ipc_ scaling; + longc_ small_subtree_threshold; + ipc_ cpu_block_size; bool action; - int pivot_method; + ipc_ pivot_method; real_wp_ small; real_wp_ u; - int nstream; + ipc_ nstream; real_wp_ multiplier; float min_loadbalance; - int failed_pivot_method; + ipc_ failed_pivot_method; // char unused[80]; // Allow for future expansion }; struct spral_ssids_inform { - int flag; - int matrix_dup; - int matrix_missing_diag; - int matrix_outrange; - int matrix_rank; - int maxdepth; - int maxfront; - int maxsupernode; - int num_delay; - int64_t num_factor; - int64_t num_flops; - int num_neg; - int num_sup; - int num_two; - int stat; - int cuda_error; - int cublas_error; - int not_first_pass; - int not_second_pass; - int nparts; - int64_t cpu_flops; - int64_t gpu_flops; + ipc_ flag; + ipc_ matrix_dup; + ipc_ matrix_missing_diag; + ipc_ matrix_outrange; + ipc_ matrix_rank; + ipc_ maxdepth; + ipc_ maxfront; + ipc_ maxsupernode; + ipc_ num_delay; + longc_ num_factor; + longc_ num_flops; + ipc_ 
num_neg; + ipc_ num_sup; + ipc_ num_two; + ipc_ stat; + ipc_ cuda_error; + ipc_ cublas_error; + ipc_ not_first_pass; + ipc_ not_second_pass; + ipc_ nparts; + longc_ cpu_flops; + longc_ gpu_flops; // char unused[76]; // Allow for future expansion }; @@ -79,40 +83,40 @@ struct spral_ssids_inform { /* Initialize options to defaults */ void spral_ssids_default_options(struct spral_ssids_options *options); /* Perform analysis phase for CSC data */ -void spral_ssids_analyse(bool check, int n, int *order, const int64_t *ptr, - const int *row, const real_wp_ *val, void **akeep, +void spral_ssids_analyse(bool check, ipc_ n, ipc_ *order, const longc_ *ptr, + const ipc_ *row, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); -void spral_ssids_analyse_ptr32(bool check, int n, int *order, const int *ptr, - const int *row, const real_wp_ *val, void **akeep, +void spral_ssids_analyse_ptr32(bool check, ipc_ n, ipc_ *order, const int *ptr, + const ipc_ *row, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform analysis phase for coordinate data */ -void spral_ssids_analyse_coord(int n, int *order, int64_t ne, const int *row, - const int *col, const real_wp_ *val, void **akeep, +void spral_ssids_analyse_coord(ipc_ n, ipc_ *order, longc_ ne, const ipc_ *row, + const ipc_ *col, const real_wp_ *val, void **akeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform numerical factorization */ -void spral_ssids_factor(bool posdef, const int64_t *ptr, const int *row, +void spral_ssids_factor(bool posdef, const longc_ *ptr, const ipc_ *row, const real_wp_ *val, real_wp_ *scale, void *akeep, void **fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); -void spral_ssids_factor_ptr32(bool posdef, const int *ptr, const int *row, +void spral_ssids_factor_ptr32(bool posdef, const int *ptr, const ipc_ *row, const real_wp_ *val, real_wp_ *scale, void *akeep, void **fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform triangular solve(s) for single rhs */ -void spral_ssids_solve1(int job, real_wp_ *x1, void *akeep, void *fkeep, +void spral_ssids_solve1(ipc_ job, real_wp_ *x1, void *akeep, void *fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Perform triangular solve(s) for one or more rhs */ -void spral_ssids_solve(int job, int nrhs, real_wp_ *x, int ldx, void *akeep, +void spral_ssids_solve(ipc_ job, ipc_ nrhs, real_wp_ *x, ipc_ ldx, void *akeep, void *fkeep, const struct spral_ssids_options *options, struct spral_ssids_inform *inform); /* Free memory */ -int spral_ssids_free_akeep(void **akeep); -int spral_ssids_free_fkeep(void **fkeep); -int spral_ssids_free(void **akeep, void **fkeep); +ipc_ spral_ssids_free_akeep(void **akeep); +ipc_ spral_ssids_free_fkeep(void **fkeep); +ipc_ spral_ssids_free(void **akeep, void **fkeep); /************************************ * Advanced subroutines @@ -125,7 +129,7 @@ void spral_ssids_enquire_posdef(const void *akeep, const void *fkeep, /* Retrieve information on pivots (indefinite case) */ void spral_ssids_enquire_indef(const void *akeep, const void *fkeep, const struct spral_ssids_options *options, - struct spral_ssids_inform *inform, int *piv_order, real_wp_ *d); + struct spral_ssids_inform *inform, ipc_ *piv_order, real_wp_ *d); /* Alter pivots (indefinite case only) */ void 
spral_ssids_alter(const real_wp_ *d, const void *akeep, void *fkeep, const struct spral_ssids_options *options, diff --git a/include/ssids_contrib.h b/include/ssids_contrib.h index 1028c82e59..dbd7a529b2 100644 --- a/include/ssids_contrib.h +++ b/include/ssids_contrib.h @@ -2,10 +2,14 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 13:30 GMT * * \brief Defines C++ interface to routines from spral_ssids_contrib and * spral_ssids_contrib_free modules. */ + +#include "ssids_rip.hxx" + #ifndef SPRAL_SSIDS_CONTRIB_H #define SPRAL_SSIDS_CONTRIB_H @@ -15,15 +19,17 @@ extern "C" { #ifdef SPRAL_SINGLE void spral_ssids_contrib_get_data_single(const void *const contrib, - int *const n, const float* *const val, int *const ldval, - const int* *const rlist, int *const ndelay, const int* *const delay_perm, - const float* *const delay_val, int *const lddelay); + ipc_ *const n, const float* *const val, ipc_ *const ldval, + const ipc_* *const rlist, ipc_ *const ndelay, + const ipc_* *const delay_perm, + const float* *const delay_val, ipc_ *const lddelay); void spral_ssids_contrib_free_sgl(void *const contrib); #else void spral_ssids_contrib_get_data_double(const void *const contrib, - int *const n, const double* *const val, int *const ldval, - const int* *const rlist, int *const ndelay, const int* *const delay_perm, - const double* *const delay_val, int *const lddelay); + ipc_ *const n, const double* *const val, ipc_ *const ldval, + const ipc_* *const rlist, ipc_ *const ndelay, + const ipc_* *const delay_perm, + const double* *const delay_val, ipc_ *const lddelay); void spral_ssids_contrib_free_dbl(void *const contrib); #endif diff --git a/include/ssids_cpu_AppendAlloc.hxx b/include/ssids_cpu_AppendAlloc.hxx index a86a774199..9315ae5274 100644 --- a/include/ssids_cpu_AppendAlloc.hxx +++ b/include/ssids_cpu_AppendAlloc.hxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ + #pragma once //#define MEM_STATS @@ -10,6 +12,7 @@ #include #include "spral_compat.hxx" // for std::align if required +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -22,11 +25,11 @@ namespace append_alloc_internal { */ class Page { #if defined(__AVX512F__) - static const int align = 64; // 64 byte alignment + static const ipc_ align = 64; // 64 byte alignment #elif defined(__AVX__) - static const int align = 32; // 32 byte alignment + static const ipc_ align = 32; // 32 byte alignment #else - static const int align = 16; // 16 byte alignment + static const ipc_ align = 16; // 16 byte alignment #endif public: Page(size_t sz, Page* next=nullptr) diff --git a/include/ssids_cpu_BuddyAllocator.hxx b/include/ssids_cpu_BuddyAllocator.hxx index d84f275e61..fb990ff417 100644 --- a/include/ssids_cpu_BuddyAllocator.hxx +++ b/include/ssids_cpu_BuddyAllocator.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ #pragma once @@ -10,6 +11,7 @@ #include #include "spral_omp.hxx" +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -36,18 +38,18 @@ namespace buddy_alloc_internal { template > class Page { // \{ - typedef 
typename std::allocator_traits::template rebind_traits IntAllocTraits; + typedef typename std::allocator_traits::template rebind_traits IntAllocTraits; // \} - static int const nlevel=16; ///< Number of divisions to smallest allocation unit. + static ipc_ const nlevel=16; ///< Number of divisions to smallest allocation unit. #if defined(__AVX512F__) - static int const align=64; ///< Underlying alignment of all pointers returned + static ipc_ const align=64; ///< Underlying alignment of all pointers returned #elif defined(__AVX__) - static int const align=32; ///< Underlying alignment of all pointers returned + static ipc_ const align=32; ///< Underlying alignment of all pointers returned #else - static int const align=16; ///< Underlying alignment of all pointers returned + static ipc_ const align=16; ///< Underlying alignment of all pointers returned #endif - static int const ISSUED_FLAG = -2; ///< Flag: value is issued + static ipc_ const ISSUED_FLAG = -2; ///< Flag: value is issued public: // \{ Page(Page const&) =delete; // not copyable @@ -78,7 +80,7 @@ public: next_ = IntAllocTraits::allocate(intAlloc, 1<<(nlevel-1)); /* Initialize data structures */ head_[nlevel-1] = 0; next_[0] = -1; // a single free block at top level - for(int i=0; i size_) return nullptr; // too big: don't even try // Determine which level of block we're trying to find - int level = sz_to_level(sz); + ipc_ level = sz_to_level(sz); void* ptr = addr_to_ptr(get_next_ptr(level)); #ifdef MEM_STATS if(ptr) { @@ -139,8 +141,8 @@ public: } /** \brief Release memory associated with ptr for reuse. */ void deallocate(void* ptr, std::size_t sz) { - int idx = ptr_to_addr(ptr); - int level = sz_to_level(sz); + ipc_ idx = ptr_to_addr(ptr); + ipc_ level = sz_to_level(sz); mark_free(idx, level); #ifdef MEM_STATS used_ -= sz; @@ -148,7 +150,7 @@ public: } /** \brief Return true if this Page owners given pointer */ bool is_owner(void* ptr) { - int idx = ptr_to_addr(ptr); + ipc_ idx = ptr_to_addr(ptr); return (idx>=0 && idx<(1<<(nlevel-1))); } /** @@ -159,8 +161,8 @@ public: * */ size_t count_free() const { size_t free=0; - for(int i=0; i=nlevel) return -1; // invalid level if(head_[level] == -1) { // Need to split next level up to get one - int above = get_next_ptr(level+1); + ipc_ above = get_next_ptr(level+1); if(above==-1) return -1; // couldn't find one split_block(level+1, above); } - int p = head_[level]; + ipc_ p = head_[level]; head_[level] = next_[p]; next_[p] = ISSUED_FLAG; return p; } /** Marks given block as free, tries to merge with partner if possible */ - void mark_free(int idx, int level) { + void mark_free(ipc_ idx, ipc_ level) { if(level < nlevel-1) { // There exists a partner, see if we can merge with it - int partner = get_partner(idx, level); + ipc_ partner = get_partner(idx, level); if(next_[partner] != ISSUED_FLAG) { // Partner is free in *some* list, not necessarily this level if(remove_from_free_list(partner, level)) { @@ -208,9 +210,9 @@ private: /** Finds the given address in free list for level and removes it. * Returns false if it cannot be found, true otherwise. 
*/ - bool remove_from_free_list(int idx, int level) { - int prev = -1; - int current = head_[level]; + bool remove_from_free_list(ipc_ idx, ipc_ level) { + ipc_ prev = -1; + ipc_ current = head_[level]; while(current!=-1 && current != idx) { prev = current; current = next_[current]; @@ -227,36 +229,36 @@ private: } /** Splits the given block */ - void split_block(int level, int block) { - int left = block; - int right = get_partner(block, level-1); + void split_block(ipc_ level, ipc_ block) { + ipc_ left = block; + ipc_ right = get_partner(block, level-1); next_[right] = head_[level-1]; next_[left] = right; head_[level-1] = left; } /** Given address location, return pointer */ - void* addr_to_ptr(int idx) { + void* addr_to_ptr(ipc_ idx) { return (idx==-1) ? nullptr : base_ + idx*min_size_; } /** Given pointer, return address */ - int ptr_to_addr(void* ptr) { + ipc_ ptr_to_addr(void* ptr) { return static_cast(static_cast(ptr)-base_) / min_size_; } /** Given a size, find the relevant level */ - int sz_to_level(std::size_t sz) { - int val = sz / min_size_; + ipc_ sz_to_level(std::size_t sz) { + ipc_ val = sz / min_size_; // Find next power of 2 higher than val - int level = 0; + ipc_ level = 0; while((val>>level) > 0) ++level; return level; } /** Given an index find its partner at given level */ - int get_partner(int idx, int level) { + ipc_ get_partner(ipc_ idx, ipc_ level) { return idx ^ (1<* next_child; // Pointer to parent's next child /* Data that changes during factorize */ - int ndelay_in; // Number of delays arising from children - int ndelay_out; // Number of delays arising to push into parent - int nelim; // Number of columns succesfully eliminated + ipc_ ndelay_in; // Number of delays arising from children + ipc_ ndelay_out; // Number of delays arising to push into parent + ipc_ nelim; // Number of columns succesfully eliminated T *lcol; // Pointer to start of factor data - int *perm; // Pointer to permutation + ipc_ *perm; // Pointer to permutation T *contrib; // Pointer to contribution block private: PoolAllocator pool_alloc_; // Our own version of pool allocator for freeing diff --git a/include/ssids_cpu_NumericSubtree.hxx b/include/ssids_cpu_NumericSubtree.hxx index 9e5959d9ca..9b161063e4 100644 --- a/include/ssids_cpu_NumericSubtree.hxx +++ b/include/ssids_cpu_NumericSubtree.hxx @@ -2,9 +2,12 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 07:40 GMT */ + #pragma once +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_factor.hxx" @@ -77,7 +80,7 @@ public: { /* Associate symbolic nodes to numeric ones; copy tree structure */ nodes_.reserve(symbolic_subtree.nnodes_+1); - for(int ni=0; niidx] : nullptr; @@ -86,11 +89,11 @@ public: } /* Allocate workspaces */ - int num_threads = omp_get_num_threads(); + ipc_ num_threads = omp_get_num_threads(); std::vector thread_stats(num_threads); std::vector work; work.reserve(num_threads); - for(int i=0; i(m); + for(ipc_ ni=0; ni(m); T *d = nodes_[ni].lcol + n*ldl; - for(int i=0; i(m+ndin); + ipc_ ldl = align_lda(m+ndin); /* Build map (indef only) */ - int const *map; + ipc_ const *map; if(!posdef) { // indef need to allow for permutation and/or delays - for(int i=0; i m, just use beta=0 // in dgemm call and then add as we scatter - for(int r=0; r - void solve_diag_bwd_inner(int nrhs, T* x, int ldx) const { + void solve_diag_bwd_inner(ipc_ nrhs, 
T* x, ipc_ ldx) const { if(posdef && !do_bwd) return; // diagonal solve is a no-op for posdef /* Allocate memory - map only needed for indef bwd/diag_bwd solve */ T* xlocal = new T[nrhs*symb_.n]; - int* map_alloc = (!posdef && do_bwd) ? new int[symb_.n] + ipc_* map_alloc = (!posdef && do_bwd) ? new ipc_[symb_.n] : nullptr; /* Perform solve */ - for(int ni=symb_.nnodes_-1; ni>=0; --ni) { - int m = symb_[ni].nrow; - int n = symb_[ni].ncol; - int nelim = (posdef) ? n + for(ipc_ ni=symb_.nnodes_-1; ni>=0; --ni) { + ipc_ m = symb_[ni].nrow; + ipc_ n = symb_[ni].ncol; + ipc_ nelim = (posdef) ? n : nodes_[ni].nelim; - int ndin = (posdef) ? 0 + ipc_ ndin = (posdef) ? 0 : nodes_[ni].ndelay_in; /* Build map (indef only) */ - int const *map; + ipc_ const *map; if(!posdef) { // indef need to allow for permutation and/or delays if(do_bwd) { - for(int i=0; i(m+ndin); - for(int r=0; r(m+ndin); + for(ipc_ r=0; r(nrhs, x, ldx); } - void solve_diag_bwd(int nrhs, T* x, int ldx) const { + void solve_diag_bwd(ipc_ nrhs, T* x, ipc_ ldx) const { solve_diag_bwd_inner(nrhs, x, ldx); } - void solve_bwd(int nrhs, T* x, int ldx) const { + void solve_bwd(ipc_ nrhs, T* x, ipc_ ldx) const { solve_diag_bwd_inner(nrhs, x, ldx); } @@ -434,21 +436,21 @@ public: * Note that piv_order is only set in indefinite case. * One of piv_order or d may be null in indefinite case. */ - void enquire(int *piv_order, T* d) const { + void enquire(ipc_ *piv_order, T* d) const { if(posdef) { - for(int ni=0; ni(blkm); - for(int i=0; i(blkm); + for(ipc_ i=0; i(blkm); - int nelim = nodes_[ni].nelim; + for(ipc_ ni=0, piv=0; ni(blkm); + ipc_ nelim = nodes_[ni].nelim; T const* dptr = &nodes_[ni].lcol[blkn*ldl]; // if (d) { // printf("d01 = %.1f %.1f\n", dptr[0], dptr[1]); @@ -457,7 +459,7 @@ public: // printf("d67 = %.1f %.1f\n", dptr[6], dptr[7]); // } // printf("ni = %i, nelim = %i\n", ni+1, nelim); - for(int i=0; i(blkm); - int nelim = nodes_[ni].nelim; + for(ipc_ ni=0; ni(blkm); + ipc_ nelim = nodes_[ni].nelim; T* dptr = &nodes_[ni].lcol[blkn*ldl]; - T dum; +// T dum; - for(int i=0; i(m); - int nelim = nodes_[node].nelim; - int const* rlist = &symb_[node].rlist[ symb_[node].ncol ]; - for(int i=0; i(m); + ipc_ nelim = nodes_[node].nelim; + ipc_ const* rlist = &symb_[node].rlist[ symb_[node].ncol ]; + for(ipc_ i=0; i +#include "ssids_rip.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_factor.hxx" #include "ssids_cpu_NumericNode.hxx" @@ -14,18 +17,16 @@ #include "ssids_cpu_ThreadStats.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define FAPrecisionTraits FASingleTraits #define factor_alloc_precision factor_alloc_single #define ldlt_tpp_factor ldlt_tpp_factor_sgl #else -#define precision_ double #define FAPrecisionTraits FADoubleTraits #define factor_alloc_precision factor_alloc_double #define ldlt_tpp_factor ldlt_tpp_factor_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif @@ -46,8 +47,8 @@ template class SmallLeafNumericSubtree { - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typedef std::allocator_traits PATraits; public: SmallLeafNumericSubtree(SmallLeafSymbolicSubtree const& symb, std::vector>& old_nodes, T const* aval, T const* scaling, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, std::vector& 
work_vec, struct cpu_factor_options const& options, ThreadStats& stats) @@ -55,30 +56,30 @@ public: { Workspace& work = work_vec[omp_get_thread_num()]; /* Initialize nodes */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { old_nodes_[ni].ndelay_in = 0; old_nodes_[ni].lcol = lcol_ + symb_[ni-symb_.sa_].lcol_offset; } memset(lcol_, 0, symb_.nfactor_*sizeof(T)); /* Add aval entries */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) add_a(ni-symb_.sa_, symb_.symb_[ni], aval, scaling); /* Perform factorization */ - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { // Assembly - int* map = work.get_ptr(symb_.symb_.n+1); + ipc_* map = work.get_ptr(symb_.symb_.n+1); assemble (ni-symb_.sa_, symb_.symb_[ni], &old_nodes_[ni], factor_alloc, pool_alloc, map, aval, scaling); // Update stats - int nrow = symb_.symb_[ni].nrow; + ipc_ nrow = symb_.symb_[ni].nrow; stats.maxfront = std::max(stats.maxfront, nrow); - int ncol = symb_.symb_[ni].ncol; + ipc_ ncol = symb_.symb_[ni].ncol; stats.maxsupernode = std::max(stats.maxsupernode, ncol); // Factorization - precision_ one_val = 1.0; + rpc_ one_val = 1.0; factor_node_posdef (one_val, symb_.symb_[ni], old_nodes_[ni], options, stats); if(stats.flag(snode.nrow); + rpc_ *lcol = lcol_ + symb_[si].lcol_offset; + size_t ldl = align_lda(snode.nrow); if(scaling) { /* Scaling to apply */ - for(int i=0; i* node, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, - int* map, + ipc_* map, T const* aval, T const* scaling ) { @@ -133,18 +134,19 @@ void assemble( typename FAIntTraits::allocator_type factor_alloc_int(factor_alloc); /* Count incoming delays and determine size of node */ - int nrow = snode.nrow; - int ncol = snode.ncol; + ipc_ nrow = snode.nrow; + ipc_ ncol = snode.ncol; /* Get space for contribution block + zero it */ - long contrib_dimn = snode.nrow - snode.ncol; - node->contrib = (contrib_dimn > 0) ? PATraits::allocate(pool_alloc, contrib_dimn*contrib_dimn) : nullptr; + longc_ contrib_dimn = snode.nrow - snode.ncol; + node->contrib = (contrib_dimn > 0) ? 
PATraits::allocate(pool_alloc, + contrib_dimn*contrib_dimn) : nullptr; if(node->contrib) memset(node->contrib, 0, contrib_dimn*contrib_dimn*sizeof(T)); /* Alloc + set perm */ node->perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; iperm[i] = snode.rlist[i]; /* Add children */ @@ -152,32 +154,32 @@ void assemble( /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; ifirst_child; child!=NULL; child=child->next_child) { SymbolicNode const& csnode = child->symb; /* Handle expected contributions (only if something there) */ if(child->contrib) { - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; if(c < snode.ncol) { // Contribution added to lcol - int ldd = align_lda(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node->lcol[c*ldd]; - for(int j=i; jcontrib[(c-ncol)*ldd]; - for(int j=i; j class SmallLeafNumericSubtree { - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typedef std::allocator_traits PATraits; public: SmallLeafNumericSubtree(SmallLeafSymbolicSubtree const& symb, std::vector>& old_nodes, T const* aval, T const* scaling, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, std::vector& work_vec, struct cpu_factor_options const& options, ThreadStats& stats) : old_nodes_(old_nodes), symb_(symb) { Workspace& work = work_vec[omp_get_thread_num()]; - for(int ni=symb_.sa_; ni<=symb_.en_; ++ni) { + for(ipc_ ni=symb_.sa_; ni<=symb_.en_; ++ni) { /*printf("%d: Node %d parent %d (of %d) size %d x %d\n", omp_get_thread_num(), ni, symb_[ni].parent, symb_.nnodes_, symb_[ni].nrow, symb_[ni].ncol);*/ // Assembly of node (not of contribution block) - int* map = work.get_ptr(symb_.symb_.n+1); + ipc_* map = work.get_ptr(symb_.symb_.n+1); assemble_pre (symb_.symb_[ni], old_nodes_[ni], factor_alloc, pool_alloc, map, aval, scaling); // Update stats - int nrow = symb_.symb_[ni].nrow + old_nodes_[ni].ndelay_in; + ipc_ nrow = symb_.symb_[ni].nrow + old_nodes_[ni].ndelay_in; stats.maxfront = std::max(stats.maxfront, nrow); - int ncol = symb_.symb_[ni].ncol + old_nodes_[ni].ndelay_in; + ipc_ ncol = symb_.symb_[ni].ncol + old_nodes_[ni].ndelay_in; stats.maxsupernode = std::max(stats.maxsupernode, ncol); // Factorization @@ -241,7 +243,7 @@ private: NumericNode& node, FactorAllocator& factor_alloc, PoolAllocator& pool_alloc, - int* map, + ipc_* map, T const* aval, T const* scaling ) { @@ -254,35 +256,35 @@ private: for(auto* child=node.first_child; child!=NULL; child=child->next_child) { node.ndelay_in += child->ndelay_out; } - int nrow = snode.nrow + node.ndelay_in; - int ncol = snode.ncol + node.ndelay_in; + ipc_ nrow = snode.nrow + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Get space for node now we know it size using Fortran allocator + zero it*/ // NB L is nrow x ncol and D is 2 x ncol (but no D if posdef) - size_t ldl = align_lda(nrow); + size_t ldl = align_lda(nrow); size_t len = (ldl+2) * ncol; // +2 is for D node.lcol = FAPrecisionTraits::allocate(factor_alloc_precision, len); memset(node.lcol, 0, len*sizeof(T)); /* Get space for contribution block + (explicitly do not zero it!) 
*/ - long contrib_dimn = snode.nrow - snode.ncol; + longc_ contrib_dimn = snode.nrow - snode.ncol; node.contrib = (contrib_dimn > 0) ? PATraits::allocate(pool_alloc, contrib_dimn*contrib_dimn) : nullptr; /* Alloc + set perm for expected eliminations at this node (delays are set * when they are imported from children) */ node.perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; i= snode.ncol) k += node.ndelay_in; T rscale = scaling[ snode.rlist[r]-1 ]; T cscale = scaling[ snode.rlist[c]-1 ]; @@ -290,12 +292,12 @@ private: } } else { /* No scaling to apply */ - for(int i=0; i= snode.ncol) k += node.ndelay_in; node.lcol[k] = aval[src]; } @@ -306,30 +308,30 @@ private: /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; /* Handle delays - go to back of node * (i.e. become the last rows as in lower triangular format) */ - for(int i=0; indelay_out; i++) { + for(ipc_ i=0; indelay_out; i++) { // Add delayed rows (from delayed cols) T *dest = &node.lcol[delay_col*(ldl+1)]; - int lds = align_lda(csnode.nrow + child->ndelay_in); + ipc_ lds = align_lda(csnode.nrow + child->ndelay_in); T *src = &child->lcol[(child->nelim+i)*(lds+1)]; node.perm[delay_col] = child->perm[child->nelim+i]; - for(int j=0; jndelay_out-i; j++) { + for(ipc_ j=0; jndelay_out-i; j++) { dest[j] = src[j]; } // Add child's non-fully summed rows (from delayed cols) dest = node.lcol; src = &child->lcol[child->nelim*lds + child->ndelay_in +i*lds]; - for(int j=csnode.ncol; jcontrib) { - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; // NB: we handle contribution to contrib in assemble_post() if(c < snode.ncol) { // Contribution added to lcol - int ldd = align_lda(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node.lcol[c*ldd]; - for(int j=i; jndelay_in; - int n = snode.ncol + node->ndelay_in; + ipc_ m = snode.nrow + node->ndelay_in; + ipc_ n = snode.ncol + node->ndelay_in; size_t ldl = align_lda(m); T *lcol = node->lcol; T *d = &node->lcol[ n*ldl ]; - int *perm = node->perm; + ipc_ *perm = node->perm; /* Perform factorization */ //Verify verifier(m, n, perm, lcol, ldl); @@ -385,8 +387,8 @@ private: //verifier.verify(node->nelim, perm, lcol, ldl, d); if(m-n>0 && node->nelim>0) { - int nelim = node->nelim; - int ldld = align_lda(m-n); + ipc_ nelim = node->nelim; + ipc_ ldld = align_lda(m-n); T *ld = work.get_ptr(nelim*ldld); calcLD(m-n, nelim, &lcol[n], ldl, d, ld, ldld); host_gemm(OP_N, OP_T, m-n, m-n, nelim, @@ -397,7 +399,7 @@ private: /* Record information */ node->ndelay_out = n - node->nelim; stats.num_delay += node->ndelay_out; - for (int64_t j = m; j >= m-(node->nelim)+1; --j) { + for (longc_ j = m; j >= m-(node->nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -409,7 +411,7 @@ private: node->free_contrib(); } else if(node->nelim==0) { // FIXME: If we fix the above, we don't need this explict zeroing - long contrib_size = m-n; + longc_ contrib_size = m-n; memset(node->contrib, 0, contrib_size*contrib_size*sizeof(T)); } } @@ -418,35 +420,35 @@ private: SymbolicNode const& snode, NumericNode& node, PoolAllocator& pool_alloc, - int* map + ipc_* map ) { /* Initialise variables */ - int ncol = snode.ncol + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Add children */ if(node.first_child != NULL) { /* Build lookup 
vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ - for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; if(!child->contrib) continue; - int cm = csnode.nrow - csnode.ncol; - for(int i=0; icontrib[i*cm]; // NB: only interested in contribution to generated element if(c >= snode.ncol) { // Contribution added to contrib - int ldd = snode.nrow - snode.ncol; + ipc_ ldd = snode.nrow - snode.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; - for(int j=i; j +#include "ssids_rip.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_SymbolicNode.hxx" -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif namespace spral { namespace ssids { namespace cpu { class SymbolicSubtree; @@ -33,11 +31,11 @@ class SmallLeafSymbolicSubtree { private: class Node { public: - int nrow; - int ncol; - int sparent; - int* rlist; - int lcol_offset; + ipc_ nrow; + ipc_ ncol; + ipc_ sparent; + ipc_* rlist; + ipc_ lcol_offset; }; public: @@ -76,35 +74,40 @@ public: * nlist[2*i+1] of the relevant supernode (as per nptr) of \f$ L \f$. * \param symb Underlying SymbolicSubtree for containing parttree. */ - SmallLeafSymbolicSubtree(int sa, int en, int part_offset, int const* sptr, int const* sparent, long const* rptr, int const* rlist, long const* nptr, long const* nlist, SymbolicSubtree const& symb) - : sa_(sa), en_(en), nnodes_(en-sa+1), parent_(sparent[part_offset+en]-1-part_offset), - nodes_(nnodes_), - rlist_(new int[rptr[part_offset+en+1]-rptr[part_offset+sa]], std::default_delete()), + SmallLeafSymbolicSubtree(ipc_ sa, ipc_ en, ipc_ part_offset, + ipc_ const* sptr, ipc_ const* sparent, + longc_ const* rptr, ipc_ const* rlist, + longc_ const* nptr, longc_ const* nlist, + SymbolicSubtree const& symb) + : sa_(sa), en_(en), nnodes_(en-sa+1), + parent_(sparent[part_offset+en]-1-part_offset), nodes_(nnodes_), + rlist_(new ipc_[rptr[part_offset+en+1]-rptr[part_offset+sa]], + std::default_delete()), nptr_(nptr), nlist_(nlist), symb_(symb) { /* Setup basic node information */ nfactor_ = 0; - int* newrlist = rlist_.get(); - for(int ni=sa; ni<=en; ++ni) { + ipc_* newrlist = rlist_.get(); + for(ipc_ ni=sa; ni<=en; ++ni) { nodes_[ni-sa].nrow = rptr[part_offset+ni+1] - rptr[part_offset+ni]; nodes_[ni-sa].ncol = sptr[part_offset+ni+1] - sptr[part_offset+ni]; nodes_[ni-sa].sparent = sparent[part_offset+ni]-sa-1; // sparent is Fortran indexed // FIXME: subtract ncol off rlist for elim'd vars nodes_[ni-sa].rlist = &newrlist[rptr[part_offset+ni]-rptr[part_offset+sa]]; nodes_[ni-sa].lcol_offset = nfactor_; - size_t ldl = align_lda(nodes_[ni-sa].nrow); + size_t ldl = align_lda(nodes_[ni-sa].nrow); nfactor_ += nodes_[ni-sa].ncol*ldl; } /* Construct rlist_ being offsets into parent node */ - for(int ni=sa; ni<=en; ++ni) { + for(ipc_ ni=sa; ni<=en; ++ni) { if(nodes_[ni-sa].ncol == nodes_[ni-sa].nrow) continue; // is root - int const* ilist = &rlist[rptr[part_offset+ni]-1]; // rptr is Fortran indexed + ipc_ const* ilist = &rlist[rptr[part_offset+ni]-1]; // rptr is Fortran indexed ilist += nodes_[ni-sa].ncol; // Skip eliminated vars - int pnode = sparent[part_offset+ni]-1; //Fortran indexed - int const* jlist = &rlist[rptr[pnode]-1]; // rptr is Fortran indexed - int const* jstart = jlist; - int *outlist = nodes_[ni-sa].rlist; - for(int i=nodes_[ni-sa].ncol; i nodes_; //< Nodes of this subtree. - std::shared_ptr rlist_; //< Row entries of this subtree. 
- long const* nptr_; //< Node mapping into nlist_. - long const* nlist_; //< Mapping from \f$ A \f$ to \f$ L \f$. + std::shared_ptr rlist_; //< Row entries of this subtree. + longc_ const* nptr_; //< Node mapping into nlist_. + longc_ const* nlist_; //< Mapping from \f$ A \f$ to \f$ L \f$. SymbolicSubtree const& symb_; //< Underlying parttree template +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { /** Symbolic representation of a node */ struct SymbolicNode { bool insmallleaf; - int idx; //< Index of node - int nrow; //< Number of rows - int ncol; //< Number of columns + ipc_ idx; //< Index of node + ipc_ nrow; //< Number of rows + ipc_ ncol; //< Number of columns SymbolicNode* first_child; //< Pointer to first child in linked list SymbolicNode* next_child; //< Pointer to second child in linked list - int const* rlist; //< Pointer to row lists - int num_a; //< Number of entries mapped from A to L - long const* amap; //< Pointer to map from A to L locations - int parent; //< index of parent node - std::vector contrib; //< index of expected contribution(s) + ipc_ const* rlist; //< Pointer to row lists + ipc_ num_a; //< Number of entries mapped from A to L + longc_ const* amap; //< Pointer to map from A to L locations + ipc_ parent; //< index of parent node + std::vector contrib; //< index of expected contribution(s) }; }}} /* end of namespace spral::ssids::cpu */ diff --git a/include/ssids_cpu_SymbolicSubtree.hxx b/include/ssids_cpu_SymbolicSubtree.hxx index 943f92587e..9407cf4f23 100644 --- a/include/ssids_cpu_SymbolicSubtree.hxx +++ b/include/ssids_cpu_SymbolicSubtree.hxx @@ -2,27 +2,29 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:00 GMT */ #pragma once #include #include +#include +#include "ssids_rip.hxx" #include "ssids_cpu_SmallLeafSymbolicSubtree.hxx" #include "ssids_cpu_SymbolicNode.hxx" -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif - namespace spral { namespace ssids { namespace cpu { /** Symbolic factorization of a subtree to be factored on the CPU */ class SymbolicSubtree { public: - SymbolicSubtree(int n, int sa, int en, int const* sptr, int const* sparent, long const* rptr, int const* rlist, long const* nptr, long const* nlist, int ncontrib, int const* contrib_idx, struct cpu_factor_options const& options) + SymbolicSubtree(ipc_ n, ipc_ sa, ipc_ en, ipc_ const* sptr, + ipc_ const* sparent, longc_ const* rptr, + ipc_ const* rlist, longc_ const* nptr, + longc_ const* nlist, ipc_ ncontrib, + ipc_ const* contrib_idx, + struct cpu_factor_options const& options) : n(n), nnodes_(en-sa), nodes_(nnodes_+1) { // Adjust sa to C indexing (en is not used except in nnodes_ init above) @@ -30,9 +32,9 @@ public: // FIXME: don't process nodes that are in small leaf subtrees /* Fill out basic details */ maxfront_ = 0; - for(int ni=0; ni(rptr[sa+ni+1] - rptr[sa+ni]); + nodes_[ni].nrow = static_cast(rptr[sa+ni+1] - rptr[sa+ni]); nodes_[ni].ncol = sptr[sa+ni+1] - sptr[sa+ni]; nodes_[ni].first_child = nullptr; nodes_[ni].next_child = nullptr; @@ -45,36 +47,36 @@ public: } nodes_[nnodes_].first_child = nullptr; // List of roots /* Build child linked lists */ - for(int ni=0; nifirst_child; parent->first_child = &nodes_[ni]; } /* Record contribution block inputs */ - for(int ci=0; ci(nodes_[ni].nrow)*nodes_[ni].ncol; /* Find small leaf subtrees */ // Count flops below each node - 
std::vector flops(nnodes_+1, 0); - for(int ni=0; ni flops(nnodes_+1, 0); + for(ipc_ ni=0; ni 0) // not a leaf! flops[ni] += options.small_subtree_threshold; - int parent = std::min(nodes_[ni].parent, nnodes_); + ipc_ parent = std::min(nodes_[ni].parent, nnodes_); flops[parent] += flops[ni]; } // Start at least node and work way up using parents until too large - for(int ni=0; ni= options.small_subtree_threshold) break; last = current; } @@ -83,27 +85,27 @@ public: small_leafs_.emplace_back( ni, last, sa, sptr, sparent, rptr, rlist, nptr, nlist, *this ); - for(int i=ni; i<=last; ++i) + for(ipc_ i=ni; i<=last; ++i) nodes_[i].insmallleaf = true; ni = last+1; // Skip to next node not in this subtree } } - SymbolicNode const& operator[](int idx) const { + SymbolicNode const& operator[](ipc_ idx) const { return nodes_[idx]; } - size_t get_factor_mem_est(precision_ multiplier) const { - size_t mem = n*sizeof(int) + (2*n+nfactor_)*sizeof(precision_); + size_t get_factor_mem_est(rpc_ multiplier) const { + size_t mem = n*sizeof(ipc_) + (2*n+nfactor_)*sizeof(rpc_); return std::max(mem, static_cast(mem*multiplier)); } template size_t get_pool_size() const { - return maxfront_*align_lda(maxfront_); + return maxfront_*align_lda(maxfront_); } public: - int const n; //< Maximum row index + ipc_ const n; //< Maximum row index private: - int nnodes_; + ipc_ nnodes_; size_t nfactor_; size_t maxfront_; std::vector nodes_; diff --git a/include/ssids_cpu_ThreadStats.hxx b/include/ssids_cpu_ThreadStats.hxx index 4948dced81..9673330dc6 100644 --- a/include/ssids_cpu_ThreadStats.hxx +++ b/include/ssids_cpu_ThreadStats.hxx @@ -2,19 +2,23 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:00 GMT */ + #pragma once #include #include +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace cpu { /** \brief SSIDS error/warning flags. 
* * Must match Fortran definitions in src/ssids/datatypes.f90 */ -enum Flag : int { +enum Flag : ipc_ { SUCCESS = 0, ERROR_SINGULAR = -5, @@ -29,11 +33,11 @@ enum Flag : int { */ class SingularError: public std::runtime_error { public: - SingularError(int col) + SingularError(ipc_ col) : std::runtime_error("Matrix is singular"), col(col) {} - int const col; + ipc_ const col; }; /** @@ -47,16 +51,16 @@ public: */ struct ThreadStats { Flag flag = Flag::SUCCESS; ///< Error flag for thread - int num_delay = 0; ///< Number of delays - int64_t num_factor = 0; ///< Number of entries in factors - int64_t num_flops = 0; ///< Number of floating point operations - int num_neg = 0; ///< Number of negative pivots - int num_two = 0; ///< Number of 2x2 pivots - int num_zero = 0; ///< Number of zero pivots - int maxfront = 0; ///< Maximum front size - int maxsupernode = 0; ///< Maximum supernode size - int not_first_pass = 0; ///< Number of pivots not eliminated in APP - int not_second_pass = 0; ///< Number of pivots not eliminated in APP or TPP + ipc_ num_delay = 0; ///< Number of delays + longc_ num_factor = 0; ///< Number of entries in factors + longc_ num_flops = 0; ///< Number of floating point operations + ipc_ num_neg = 0; ///< Number of negative pivots + ipc_ num_two = 0; ///< Number of 2x2 pivots + ipc_ num_zero = 0; ///< Number of zero pivots + ipc_ maxfront = 0; ///< Maximum front size + ipc_ maxsupernode = 0; ///< Maximum supernode size + ipc_ not_first_pass = 0; ///< Number of pivots not eliminated in APP + ipc_ not_second_pass = 0; ///< Number of pivots not eliminated in APP or TPP ThreadStats& operator+=(ThreadStats const& other); }; diff --git a/include/ssids_cpu_Workspace.hxx b/include/ssids_cpu_Workspace.hxx index f099fd50fe..b73bf29195 100644 --- a/include/ssids_cpu_Workspace.hxx +++ b/include/ssids_cpu_Workspace.hxx @@ -2,12 +2,15 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 08:30 GMT */ + #pragma once #include #include "spral_compat.hxx" // in case std::align not defined +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -16,11 +19,11 @@ namespace spral { namespace ssids { namespace cpu { * given size. 
*/ class Workspace { #if defined(__AVX512F__) - static int const align = 64; + static ipc_ const align = 64; #elif defined(__AVX__) - static int const align = 32; + static ipc_ const align = 32; #else - static int const align = 16; + static ipc_ const align = 16; #endif public: Workspace(size_t sz) diff --git a/include/ssids_cpu_cpu_iface.hxx b/include/ssids_cpu_cpu_iface.hxx index cd9c9a0b6a..f87dc6105e 100644 --- a/include/ssids_cpu_cpu_iface.hxx +++ b/include/ssids_cpu_cpu_iface.hxx @@ -2,40 +2,36 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 10:30 GMT */ + #pragma once #include - -#ifdef SPRAL_SINGLE -#define precision_ float -#else -#define precision_ double -#endif - +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { -enum struct PivotMethod : int { +enum struct PivotMethod : ipc_ { app_aggressive = 1, app_block = 2, tpp = 3 }; -enum struct FailedPivotMethod : int { +enum struct FailedPivotMethod : ipc_ { tpp = 1, pass = 2 }; struct cpu_factor_options { - int print_level; + ipc_ print_level; bool action; - precision_ small; - precision_ u; - precision_ multiplier; - long small_subtree_threshold; - int cpu_block_size; + rpc_ small; + rpc_ u; + rpc_ multiplier; + longc_ small_subtree_threshold; + ipc_ cpu_block_size; PivotMethod pivot_method; FailedPivotMethod failed_pivot_method; }; @@ -44,14 +40,14 @@ struct cpu_factor_options { template size_t align_lda(size_t lda) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif static_assert(align % sizeof(T) == 0, "Can only align if T divides align"); - int const Talign = align / sizeof(T); + ipc_ const Talign = align / sizeof(T); return Talign*((lda-1)/Talign + 1); } diff --git a/include/ssids_cpu_factor.hxx b/include/ssids_cpu_factor.hxx index 611758ebd0..9310abd53a 100644 --- a/include/ssids_cpu_factor.hxx +++ b/include/ssids_cpu_factor.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT */ #pragma once @@ -16,6 +17,8 @@ #endif /* _OPENMP */ /* SPRAL headers */ + +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_cpu_iface.hxx" #include "ssids_cpu_SymbolicNode.hxx" @@ -40,7 +43,7 @@ #define ldlt_tpp_factor ldlt_tpp_factor_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif @@ -49,7 +52,7 @@ namespace spral { namespace ssids { namespace cpu { /* Factorize a node (indef) */ template void factor_node_indef( - int ni, // FIXME: remove post debug + ipc_ ni, // FIXME: remove post debug SymbolicNode const& snode, NumericNode &node, struct cpu_factor_options const& options, @@ -58,12 +61,12 @@ void factor_node_indef( PoolAlloc& pool_alloc ) { /* Extract useful information about node */ - int m = snode.nrow + node.ndelay_in; - int n = snode.ncol + node.ndelay_in; + ipc_ m = snode.nrow + node.ndelay_in; + ipc_ n = snode.ncol + node.ndelay_in; size_t ldl = align_lda(m); T *lcol = node.lcol; T *d = &node.lcol[ n*ldl ]; - int *perm = node.perm; + ipc_ *perm = node.perm; T *contrib = node.contrib; /* Perform factorization */ @@ -87,7 +90,7 @@ void factor_node_indef( /* Finish factorization worth 
simplistic code */ if(node.nelim < n) { - int nelim = node.nelim; + ipc_ nelim = node.nelim; if(options.pivot_method!=PivotMethod::tpp) stats.not_first_pass += n-nelim; // Only use TPP to finish off if we're a root node, it's not finishing @@ -104,8 +107,8 @@ void factor_node_indef( options.small, nelim, &lcol[nelim], ldl ); if(m-n>0 && node.nelim>nelim) { - int nelim2 = node.nelim - nelim; - int ldld = align_lda(m-n); + ipc_ nelim2 = node.nelim - nelim; + ipc_ ldld = align_lda(m-n); T *ld = work[omp_get_thread_num()].get_ptr(nelim2*ldld); calcLD( m-n, nelim2, &lcol[nelim*ldl+n], ldl, &d[2*nelim], ld, ldld @@ -132,7 +135,7 @@ void factor_node_indef( /* Record information */ node.ndelay_out = n - node.nelim; stats.num_delay += node.ndelay_out; - for (int64_t j = m; j >= m-(node.nelim)+1; --j) { + for (longc_ j = m; j >= m-(node.nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -144,7 +147,7 @@ void factor_node_indef( node.free_contrib(); } else if(node.nelim==0) { // FIXME: If we fix the above, we don't need this explict zeroing - long contrib_size = m-n; + longc_ contrib_size = m-n; memset(node.contrib, 0, contrib_size*contrib_size*sizeof(T)); } } @@ -158,14 +161,14 @@ void factor_node_posdef( ThreadStats& stats ) { /* Extract useful information about node */ - int m = snode.nrow; - int n = snode.ncol; - int ldl = align_lda(m); + ipc_ m = snode.nrow; + ipc_ n = snode.ncol; + ipc_ ldl = align_lda(m); T *lcol = node.lcol; T *contrib = node.contrib; /* Perform factorization */ - int flag; + ipc_ flag; cholesky_factor( m, n, lcol, ldl, beta, contrib, m-n, options.cpu_block_size, &flag ); @@ -178,7 +181,7 @@ void factor_node_posdef( /* Record information */ node.ndelay_out = 0; - for (int64_t j = m; j >= m-(node.nelim)+1; --j) { + for (longc_ j = m; j >= m-(node.nelim)+1; --j) { stats.num_factor += j; stats.num_flops += j*j; } @@ -186,7 +189,7 @@ void factor_node_posdef( /* Factorize a node (wrapper) */ template void factor_node( - int ni, + ipc_ ni, SymbolicNode const& snode, NumericNode &node, struct cpu_factor_options const& options, diff --git a/include/ssids_cpu_kernels_SimdVec.hxx b/include/ssids_cpu_kernels_SimdVec.hxx index 58ebebada2..e3ac62ee3a 100644 --- a/include/ssids_cpu_kernels_SimdVec.hxx +++ b/include/ssids_cpu_kernels_SimdVec.hxx @@ -2,13 +2,17 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-04 AT 10:10 GMT */ + #pragma once #include #include #include +#include "ssids_rip.hxx" + #if defined(__AVX2__) || defined(__AVX__) #include #endif @@ -35,12 +39,12 @@ public: #if defined(__AVX2__) || defined(__AVX__) /// Length of underlying vector type - static const int vector_length = 4; /* this should probably be 8 !! */ + static const ipc_ vector_length = 4; /* this should probably be 8 !! */ /// Typedef for underlying vector type containing singles typedef __m256 simd_precision_type; #else /// Length of underlying vector type - static const int vector_length = 1; + static const ipc_ vector_length = 1; /// Typedef for underlying vector type containing floats typedef float simd_precision_type; #endif @@ -243,7 +247,7 @@ public: /// Returns a vector with all positions idx or above set to true, otherwise /// false. 
static - SimdVec gt_mask(int idx) { + SimdVec gt_mask(ipc_ idx) { #if defined(__AVX2__) || defined(__AVX__) const float avx_true = -std::numeric_limits::quiet_NaN(); const float avx_false = 0.0; @@ -265,7 +269,7 @@ public: /// Prints the vector (inefficient, use for debug only) void print() { - for(int i=0; i::quiet_NaN(); const double avx_false = 0.0; @@ -517,7 +521,7 @@ public: /// Prints the vector (inefficient, use for debug only) void print() { - for(int i=0; i #include +#include "ssids_rip.hxx" #include "ssids_contrib.h" #include "ssids_profile.hxx" #include "ssids_cpu_NumericNode.hxx" @@ -20,13 +22,11 @@ #define spral_ssids_contrib_free spral_ssids_contrib_free_sgl #define FAPrecisionTraits FASingleTraits #define factor_alloc_precision factor_alloc_single -#define precision float #else #define spral_ssids_contrib_get_data spral_ssids_contrib_get_data_double #define spral_ssids_contrib_free spral_ssids_contrib_free_dbl #define FAPrecisionTraits FADoubleTraits #define factor_alloc_precision factor_alloc_double -#define precision double #endif namespace spral { namespace ssids { namespace cpu { @@ -37,16 +37,16 @@ namespace spral { namespace ssids { namespace cpu { */ template inline -void asm_col(int n, int const* idx, T const* src, T* dest) { - int const nunroll = 4; - int n2 = nunroll*(n/nunroll); - for(int j=0; j -void add_a_block(int from, int to, NumericNode& node, T const* aval, +void add_a_block(ipc_ from, ipc_ to, NumericNode& node, T const* aval, T const* scaling) { SymbolicNode const& snode = node.symb; size_t ldl = node.get_ldl(); if(scaling) { /* Scaling to apply */ - for(int i=from; i= snode.ncol) k += node.ndelay_in; T rscale = scaling[ snode.rlist[r]-1 ]; T cscale = scaling[ snode.rlist[c]-1 ]; @@ -80,12 +80,12 @@ void add_a_block(int from, int to, NumericNode& node, T const* aval, } } else { /* No scaling to apply */ - for(int i=from; i= snode.ncol) k += node.ndelay_in; node.lcol[k] = aval[src]; } @@ -103,18 +103,18 @@ void add_a_block(int from, int to, NumericNode& node, T const* aval, * \param cache Length cm lookup vector. */ template -void assemble_expected(int from, int to, NumericNode& node, NumericNode const& cnode, MapVector const& map, int* cache) { +void assemble_expected(ipc_ from, ipc_ to, NumericNode& node, NumericNode const& cnode, MapVector const& map, ipc_* cache) { SymbolicNode const& csnode = cnode.symb; - int cm = csnode.nrow - csnode.ncol; - for(int j=from; j& node, Numeric * \param cache Length cm lookup vector. 
*/ template -void assemble_expected_contrib(int from, int to, NumericNode& node, NumericNode const& cnode, MapVector const& map, int* cache) { +void assemble_expected_contrib(ipc_ from, ipc_ to, NumericNode& node, NumericNode const& cnode, MapVector const& map, ipc_* cache) { SymbolicNode const& csnode = cnode.symb; - int cm = csnode.nrow - csnode.ncol; - int ncol = node.symb.ncol + node.ndelay_in; - for(int j=from; j= node.symb.ncol) { // Contribution added to contrib - int ldd = node.symb.nrow - node.symb.ncol; + ipc_ ldd = node.symb.nrow - node.symb.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; asm_col(cm-i, &cache[i], &src[i], dest); } @@ -155,7 +155,7 @@ template void assemble_pre( bool posdef, - int n, + ipc_ n, SymbolicNode const& snode, void** child_contrib, NumericNode& node, @@ -169,11 +169,11 @@ void assemble_pre( Profile::Task task_asm_pre("TA_ASM_PRE"); #endif /* Rebind allocators */ - typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; + typedef typename std::allocator_traits::template rebind_traits FAPrecisionTraits; typename FAPrecisionTraits::allocator_type factor_alloc_precision(factor_alloc); - typedef typename std::allocator_traits::template rebind_traits FAIntTraits; + typedef typename std::allocator_traits::template rebind_traits FAIntTraits; typename FAIntTraits::allocator_type factor_alloc_int(factor_alloc); - typedef typename std::allocator_traits::template rebind_traits PAIntTraits; + typedef typename std::allocator_traits::template rebind_traits PAIntTraits; typename PAIntTraits::allocator_type pool_alloc_int(pool_alloc); /* Count incoming delays and determine size of node */ @@ -182,21 +182,21 @@ void assemble_pre( node.ndelay_in += child->ndelay_out; } for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); node.ndelay_in += ndelay; } - int nrow = snode.nrow + node.ndelay_in; - int ncol = snode.ncol + node.ndelay_in; + ipc_ nrow = snode.nrow + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Get space for node now we know it size using Fortran allocator + zero it*/ // NB L is nrow x ncol and D is 2 x ncol (but no D if posdef) - size_t ldl = align_lda(nrow); + size_t ldl = align_lda(nrow); size_t len = posdef ? ldl * ncol // posdef : (ldl+2) * ncol; // indef (includes D) node.lcol = FAPrecisionTraits::allocate(factor_alloc_precision, len); @@ -209,18 +209,18 @@ void assemble_pre( /* Alloc + set perm for expected eliminations at this node (delays are set * when they are imported from children) */ node.perm = FAIntTraits::allocate(factor_alloc_int, ncol); // ncol fully summed variables - for(int i=0; i( + auto map = std::unique_ptr( PAIntTraits::allocate(pool_alloc_int, n+1), map_deleter); - for(int i=0; isymb; /* Handle delays - go to back of node * (i.e. 
become the last rows as in lower triangular format) */ - for(int i=0; indelay_out; i++) { + for(ipc_ i=0; indelay_out; i++) { // Add delayed rows (from delayed cols) T *dest = &node.lcol[delay_col*(ldl+1)]; - int lds = align_lda(csnode.nrow + child->ndelay_in); + ipc_ lds = align_lda(csnode.nrow + child->ndelay_in); T *src = &child->lcol[(child->nelim+i)*(lds+1)]; node.perm[delay_col] = child->perm[child->nelim+i]; - for(int j=0; jndelay_out-i; j++) { + for(ipc_ j=0; jndelay_out-i; j++) { dest[j] = src[j]; } // Add child's non-fully summed rows (from delayed cols) dest = node.lcol; src = &child->lcol[child->nelim*lds + child->ndelay_in +i*lds]; - for(int j=csnode.ncol; jcontrib) { - int cm = csnode.nrow - csnode.ncol; - int const block_size = 256; // FIXME: make configurable? + ipc_ cm = csnode.nrow - csnode.ncol; + ipc_ const block_size = 256; // FIXME: make configurable? if(cm < block_size) { // Single block - int* cache = work[omp_get_thread_num()].get_ptr(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected(0, cm, node, *child, map, cache); } else { // Multiple blocks #pragma omp taskgroup - for(int iblk=0; iblk(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected(iblk, std::min(iblk+block_size,cm), node, *child, map, cache); #ifdef PROFILE @@ -318,32 +318,32 @@ void assemble_pre( } } /* Add any contribution block from other subtrees */ - for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + for(ipc_ contrib_idx : snode.contrib) { + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); - int* cache = work[omp_get_thread_num()].get_ptr(cn); - for(int j=0; j(cn); + for(ipc_ j=0; j(nrow); + ipc_ ldd = align_lda(nrow); T *dest = &node.lcol[c*ldd]; asm_col(cn-i, &cache[i], &src[i], dest); } @@ -369,7 +369,7 @@ template void assemble_post( - int n, + ipc_ n, SymbolicNode const& snode, void** child_contrib, NumericNode& node, @@ -377,36 +377,36 @@ void assemble_post( std::vector& work ) { /* Rebind allocators */ - typedef typename std::allocator_traits::template rebind_traits PAIntTraits; + typedef typename std::allocator_traits::template rebind_traits PAIntTraits; typename PAIntTraits::allocator_type pool_alloc_int(pool_alloc); /* Initialise variables */ - int ncol = snode.ncol + node.ndelay_in; + ipc_ ncol = snode.ncol + node.ndelay_in; /* Add children */ - int* map = nullptr; + ipc_* map = nullptr; if(node.first_child != NULL || snode.contrib.size() > 0) { /* Build lookup vector, allowing for insertion of delayed vars */ /* Note that while rlist[] is 1-indexed this is fine so long as lookup * is also 1-indexed (which it is as it is another node's rlist[] */ if(!map) map = PAIntTraits::allocate(pool_alloc_int, n+1); // FIXME: probably don't need to worry about first ncol? 
- for(int i=0; inext_child) { SymbolicNode const& csnode = child->symb; if(!child->contrib) continue; - int cm = csnode.nrow - csnode.ncol; - int const block_size = 256; + ipc_ cm = csnode.nrow - csnode.ncol; + ipc_ const block_size = 256; if(cm < block_size) { - int* cache = work[omp_get_thread_num()].get_ptr(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected_contrib(0, cm, node, *child, map, cache); } else { #pragma omp taskgroup - for(int iblk=0; iblk(cm); + ipc_* cache = work[omp_get_thread_num()].get_ptr(cm); assemble_expected_contrib(iblk, std::min(iblk+block_size,cm), node, *child, map, cache); #ifdef PROFILE @@ -429,25 +429,25 @@ void assemble_post( } } /* Add any contribution block from other subtrees */ - for(int contrib_idx : snode.contrib) { - int cn, ldcontrib, ndelay, lddelay; - precision const *cval, *delay_val; - int const *crlist, *delay_perm; + for(ipc_ contrib_idx : snode.contrib) { + ipc_ cn, ldcontrib, ndelay, lddelay; + rpc_ const *cval, *delay_val; + ipc_ const *crlist, *delay_perm; spral_ssids_contrib_get_data( child_contrib[contrib_idx], &cn, &cval, &ldcontrib, &crlist, &ndelay, &delay_perm, &delay_val, &lddelay ); if(!cval) continue; // child was all delays, nothing to do - int* cache = work[omp_get_thread_num()].get_ptr(cn); - for(int j=0; j(cn); + for(ipc_ j=0; j= snode.ncol) { // Contribution added to contrib - int ldd = snode.nrow - snode.ncol; + ipc_ ldd = snode.nrow - snode.ncol; T *dest = &node.contrib[(c-ncol)*ldd]; asm_col(cn-i, &cache[i], &src[i], dest); } diff --git a/include/ssids_cpu_kernels_block_ldlt.hxx b/include/ssids_cpu_kernels_block_ldlt.hxx index 79f9c59e39..e180680c2a 100644 --- a/include/ssids_cpu_kernels_block_ldlt.hxx +++ b/include/ssids_cpu_kernels_block_ldlt.hxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:30 GMT */ #pragma once @@ -16,27 +17,27 @@ namespace block_ldlt_internal { /** Swaps two columns of A */ /* NB: ldwork only well defined for c -void swap_cols(int idx1, int idx2, int n, T *a, int lda, T *ldwork, int *perm) { +template +void swap_cols(ipc_ idx1, ipc_ idx2, ipc_ n, T *a, ipc_ lda, T *ldwork, ipc_ *perm) { if(idx1==idx2) return; // noop /* Ensure wlog idx1 < idx2 */ if(idx1 > idx2) { - int temp = idx1; + ipc_ temp = idx1; idx1 = idx2; idx2 = temp; } /* Swap perm */ if(perm) { - int temp = perm[idx1]; + ipc_ temp = perm[idx1]; perm[idx1] = perm[idx2]; perm[idx2] = temp; } /* Swap ldwork */ if(ldwork) { - for(int c=0; c -void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, int &cloc) { +template +void find_maxloc(const ipc_ from, const T *a, ipc_ lda, T &bestv_out, ipc_ &rloc, ipc_ &cloc) { typedef SimdVec SimdVecT; /* Handle special cases: @@ -85,8 +86,8 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i BLOCK_SIZE % (2*SimdVecT::vector_length) != 0) { T bestv = -1.0; rloc = BLOCK_SIZE; cloc = BLOCK_SIZE; - for(int c=from; c bestv) { bestv = fabs(v); @@ -104,7 +105,7 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i // Define a union that lets us abuse T to store ints and still use // avx blend. 
union intT { - int i; + ipc_ i; T d; }; @@ -112,19 +113,19 @@ void find_maxloc(const int from, const T *a, int lda, T &bestv_out, int &rloc, i SimdVecT bestv(-1.0); SimdVecT bestv2(-1.0); intT imax; - imax.i = std::numeric_limits::max(); + imax.i = std::numeric_limits::max(); SimdVecT bestr(imax.d); SimdVecT bestr2(imax.d); SimdVecT bestc(imax.d); SimdVecT bestc2(imax.d); // Loop over array at stride equal to vector length - for(int c=from; c bestv_out) { bestv_out = bv2[i]; rloc = br2[i].i + i; // NB rloc only stores base of vector, so need +i @@ -218,51 +219,51 @@ bool test_2x2(T a11, T a21, T a22, T &detpiv, T &detscale) { } /** Updates the trailing submatrix (2x2 case) */ -template -void update_2x2(int p, T *a, int lda, const T *ld) { - for(int c=p+2; c +void update_2x2(ipc_ p, T *a, ipc_ lda, const T *ld) { + for(ipc_ c=p+2; c -void update_1x1(int p, T *a, int lda, const T *ld) { +template +void update_1x1(ipc_ p, T *a, ipc_ lda, const T *ld) { #if 0 - for(int c=p+1; c::vector_length; - const int unroll=4; // How many iteration of loop we're doing + const ipc_ vlen = SimdVec::vector_length; + const ipc_ unroll=4; // How many iteration of loop we're doing // Handle case of small BLOCK_SIZE safely if(BLOCK_SIZE < vlen || BLOCK_SIZE%vlen != 0 || BLOCK_SIZE < unroll) { - for(int c=p+1; c ldvec( -ld[c] ); // NB minus so we can use fma below - for(int r=vlen*(c/vlen); r lvec = SimdVec::load_aligned(&a[p*lda+r]); SimdVec avec = SimdVec::load_aligned(&a[c*lda+r]); avec = fmadd(avec, lvec, ldvec); avec.store_aligned(&a[c*lda+r]); } } - for(int c=unroll*((p+1-1)/unroll+1); c ldvec0( -ld[c] ); // NB minus so we can use fma below SimdVec ldvec1( -ld[c+1] ); // NB minus so we can use fma below SimdVec ldvec2( -ld[c+2] ); // NB minus so we can use fma below SimdVec ldvec3( -ld[c+3] ); // NB minus so we can use fma below - for(int r=vlen*(c/vlen); r lvec = SimdVec::load_aligned(&a[p*lda+r]); SimdVec avec0 = SimdVec::load_aligned(&a[(c+0)*lda+r]); SimdVec avec1 = SimdVec::load_aligned(&a[(c+1)*lda+r]); @@ -286,16 +287,16 @@ void update_1x1(int p, T *a, int lda, const T *ld) { /** Factorize a square block without restricting pivots * Expects to be given a square block of size BLOCK_SIZE with numbers of * interest in bottom right part. */ -template -void block_ldlt(int from, int *perm, T *a, int lda, T *d, T *ldwork, - bool action, const T u, const T small, int *lperm=nullptr) { +template +void block_ldlt(ipc_ from, ipc_ *perm, T *a, ipc_ lda, T *d, T *ldwork, + bool action, const T u, const T small, ipc_ *lperm=nullptr) { using namespace block_ldlt_internal; /* Main loop */ - for(int p=from; p(p, a, lda, bestv, t, m); // Handle case where everything remaining is small @@ -306,9 +307,9 @@ void block_ldlt(int from, int *perm, T *a, int lda, T *d, T *ldwork, for(; p (p, t, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p]; lperm[p]=lperm[t]; lperm[t]=temp; } + if(lperm) { ipc_ temp=lperm[p]; lperm[p]=lperm[t]; lperm[t]=temp; } /* Divide through, preserving a copy */ T *work = &ldwork[p*BLOCK_SIZE]; - for(int r=p+1; r m by construction. 
Hence m>=p, t>=p+1 and swaps are safe */ swap_cols (p, m, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p]; lperm[p]=lperm[m]; lperm[m]=temp; } + if(lperm) { ipc_ temp=lperm[p]; lperm[p]=lperm[m]; lperm[m]=temp; } swap_cols (p+1, t, BLOCK_SIZE, a, lda, ldwork, perm); - if(lperm) { int temp=lperm[p+1]; lperm[p+1]=lperm[t]; lperm[t]=temp; } + if(lperm) { ipc_ temp=lperm[p+1]; lperm[p+1]=lperm[t]; lperm[t]=temp; } /* Calculate 2x2 inverse */ T d11 = (a22*detscale)/detpiv; T d22 = (a11*detscale)/detpiv; T d21 = (-a21*detscale)/detpiv; /* Divide through, preserving a copy */ T *work = &ldwork[p*BLOCK_SIZE]; - for(int r=p+2; r #include +#include "ssids_rip.hxx" #include "ssids_cpu_kernels_common.hxx" #include "ssids_cpu_kernels_SimdVec.hxx" @@ -19,18 +22,18 @@ namespace spral { namespace ssids { namespace cpu { * Note this will mostly just fail if sizeof(T) doesn't divide into alignment. */ template -int offset_to_align(T* ptr) { +ipc_ offset_to_align(T* ptr) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif uintptr_t offset = align - (reinterpret_cast(ptr) % align); offset /= sizeof(T); if((reinterpret_cast(ptr+offset) % align) == 0) return offset; - else return std::numeric_limits::max(); + else return std::numeric_limits::max(); } /** Calculates LD from L and D. @@ -39,32 +42,35 @@ int offset_to_align(T* ptr) { * multiples of 32 bytes, so we can use AVX. */ template -void calcLD(int m, int n, T const* l, int ldl, T const* d, T* ld, int ldld) { +void calcLD(ipc_ m, ipc_ n, T const* l, ipc_ ldl, T const* d, T* ld, + ipc_ ldld) { typedef SimdVec SimdVecT; - for(int col=0; col #include "ssids_cpu_Workspace.hxx" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE #define ldlt_app_factor ldlt_app_factor_sgl @@ -24,19 +27,19 @@ namespace spral { namespace ssids { namespace cpu { template -int ldlt_app_factor(int m, int n, int *perm, T *a, int lda, T *d, T beta, - T* upd, int ldupd, struct cpu_factor_options const& options, +ipc_ ldlt_app_factor(ipc_ m, ipc_ n, ipc_ *perm, T *a, ipc_ lda, T *d, T beta, + T* upd, ipc_ ldupd, struct cpu_factor_options const& options, std::vector& work, Allocator const& alloc); template -void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx); +void ldlt_app_solve_fwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx); template -void ldlt_app_solve_diag(int n, T const* d, int nrhs, T* x, int ldx); +void ldlt_app_solve_diag(ipc_ n, T const* d, ipc_ nrhs, T* x, ipc_ ldx); template -void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx); +void ldlt_app_solve_bwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_ldlt_nopiv.hxx b/include/ssids_cpu_kernels_ldlt_nopiv.hxx index 348bbbb7f9..22e3d7a5f3 100644 --- a/include/ssids_cpu_kernels_ldlt_nopiv.hxx +++ b/include/ssids_cpu_kernels_ldlt_nopiv.hxx @@ -2,17 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:40 GMT */ + #pragma once +#include "ssids_rip.hxx" + #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_nopiv_factor ldlt_nopiv_factor_sgl #define ldlt_nopiv_solve_fwd ldlt_nopiv_solve_fwd_sgl #define ldlt_nopiv_solve_diag 
ldlt_nopiv_solve_diag_sgl #define ldlt_nopiv_solve_bwd ldlt_nopiv_solve_bwd_sgl #else -#define precision_ double #define ldlt_nopiv_factor ldlt_nopiv_factor_dbl #define ldlt_nopiv_solve_fwd ldlt_nopiv_solve_fwd_dbl #define ldlt_nopiv_solve_diag ldlt_nopiv_solve_diag_dbl @@ -21,12 +23,12 @@ namespace spral { namespace ssids { namespace cpu { -int ldlt_nopiv_factor(int m, int n, precision_* a, int lda, precision_* work); -void ldlt_nopiv_solve_fwd(int m, int n, precision_ const* a, int lda, - precision_ *x); -void ldlt_nopiv_solve_diag(int m, int n, precision_ const* a, int lda, - precision_ *x); -void ldlt_nopiv_solve_bwd(int m, int n, precision_ const* a, int lda, - precision_ *x); +ipc_ ldlt_nopiv_factor(ipc_ m, ipc_ n, rpc_* a, ipc_ lda, rpc_* work); +void ldlt_nopiv_solve_fwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); +void ldlt_nopiv_solve_diag(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); +void ldlt_nopiv_solve_bwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + rpc_ *x); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_ldlt_tpp.hxx b/include/ssids_cpu_kernels_ldlt_tpp.hxx index fdf95b85d3..3b66eccd0f 100644 --- a/include/ssids_cpu_kernels_ldlt_tpp.hxx +++ b/include/ssids_cpu_kernels_ldlt_tpp.hxx @@ -2,17 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:40 GMT */ + #pragma once +#include "ssids_rip.hxx" + #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_tpp_factor ldlt_tpp_factor_sgl #define ldlt_tpp_solve_fwd ldlt_tpp_solve_fwd_sgl #define ldlt_tpp_solve_diag ldlt_tpp_solve_diag_sgl #define ldlt_tpp_solve_bwd ldlt_tpp_solve_bwd_sgl #else -#define precision_ double #define ldlt_tpp_factor ldlt_tpp_factor_dbl #define ldlt_tpp_solve_fwd ldlt_tpp_solve_fwd_dbl #define ldlt_tpp_solve_diag ldlt_tpp_solve_diag_dbl @@ -21,14 +23,13 @@ namespace spral { namespace ssids { namespace cpu { -int ldlt_tpp_factor(int m, int n, int* perm, precision_* a, - int lda, precision_* d, - precision_* ld, int ldld, bool action, precision_ u, precision_ small, - int nleft=0, precision_ *aleft=nullptr, int ldleft=0); -void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx); -void ldlt_tpp_solve_diag(int n, precision_ const* d, precision_* x); -void ldlt_tpp_solve_bwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx); +ipc_ ldlt_tpp_factor(ipc_ m, ipc_ n, ipc_* perm, rpc_* a, ipc_ lda, rpc_* d, + rpc_* ld, ipc_ ldld, bool action, rpc_ u, rpc_ small, + ipc_ nleft=0, rpc_ *aleft=nullptr, ipc_ ldleft=0); +void ldlt_tpp_solve_fwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, ipc_ nrhs, + rpc_* x, ipc_ ldx); +void ldlt_tpp_solve_diag(ipc_ n, rpc_ const* d, rpc_* x); +void ldlt_tpp_solve_bwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, ipc_ nrhs, + rpc_* x, ipc_ ldx); }}} /* end of namespace spral::ssids::cpu */ diff --git a/include/ssids_cpu_kernels_verify.hxx b/include/ssids_cpu_kernels_verify.hxx index f7704c2aac..6d43510013 100644 --- a/include/ssids_cpu_kernels_verify.hxx +++ b/include/ssids_cpu_kernels_verify.hxx @@ -2,33 +2,36 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:50 GMT */ + #pragma once #include #include "ssids_cpu_kernels_wrappers.hxx" +#include "ssids_rip.hxx" namespace spral { 
namespace ssids { namespace cpu { namespace verify_internal { template -void calcLD(int m, int n, T const* lcol, int ldl, T const* d, T* ld) { - for(int j=0; j class Verify { public: - Verify(int m, int n, int const* perm, T const* a, int lda) + Verify(ipc_ m, ipc_ n, ipc_ const* perm, T const* a, ipc_ lda) : m_(m), n_(n), lda_(m), a_(m*n), perm_(n) { // Take a copy - for(int j=0; j= c) { if(std::abs(a_[c*lda_+r] - ldlt[j*nelim+i]) > 1e-10) { printf("Mismatch1 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, @@ -128,10 +131,10 @@ public: ld, nelim, 0.0, below, m_-nelim ); // rows nelim:n may be permuted - for(int j=0; j= c) { if(std::abs(a_[c*lda_+r] - below[j*(m_-nelim)+i-nelim]) > 1e-10) { printf("Mismatch2 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, @@ -150,10 +153,10 @@ public: } } // rows nelim:n are only column permuted - for(int j=0; j 1e-10) { printf("Mismatch3 [%d,%d]=%e != [%d,%d]=%e diff %e\n", r, c, a_[c*lda_+r], i, j, below[j*(m_-nelim)+i-nelim], @@ -172,11 +175,11 @@ public: } private: - int m_; - int n_; - int lda_; + ipc_ m_; + ipc_ n_; + ipc_ lda_; std::vector a_; - std::vector perm_; + std::vector perm_; }; diff --git a/include/ssids_cpu_kernels_wrappers.hxx b/include/ssids_cpu_kernels_wrappers.hxx index a57adcf3ad..a5afc9c750 100644 --- a/include/ssids_cpu_kernels_wrappers.hxx +++ b/include/ssids_cpu_kernels_wrappers.hxx @@ -2,10 +2,15 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:00 GMT */ + #pragma once + #include + #include "ssids_cpu_kernels_common.hxx" +#include "ssids_rip.hxx" namespace spral { namespace ssids { namespace cpu { @@ -57,53 +62,53 @@ void host_trsm(enum spral::ssids::cpu::side side, int m, int n, T alpha, const T* a, int lda, T* b, int ldb); -/* _GEMM */ +/* _GEMM_64 */ template void host_gemm_64(enum spral::ssids::cpu::operation transa, enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, T alpha, const T* a, - int64_t lda, const T* b, int64_t ldb, T beta, - T* c, int64_t ldc); + longc_ m, longc_ n, longc_ k, T alpha, const T* a, + longc_ lda, const T* b, longc_ ldb, T beta, + T* c, longc_ ldc); -/* _GEMV */ +/* _GEMV_64 */ template void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, T alpha, const T* a, int64_t lda, - const T* x, int64_t incx, T beta, T* y, int64_t incy); + longc_ m, longc_ n, T alpha, const T* a, longc_ lda, + const T* x, longc_ incx, T beta, T* y, longc_ incy); -/* _POTRF */ +/* _POTRF_64 */ template -int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, int64_t n, - T* a, int64_t lda); +longc_ lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, longc_ n, + T* a, longc_ lda); -/* _SYTRF - Bunch-Kaufman factorization */ +/* _SYTRF_64 - Bunch-Kaufman factorization */ template -int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, T* a, int64_t lda, int64_t* ipiv, - T* work, int64_t lwork); +longc_ lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, + longc_ n, T* a, longc_ lda, longc_* ipiv, + T* work, longc_ lwork); -/* _SYRK */ +/* _SYRK_64 */ template void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, T alpha, const T* a, int64_t lda, - T beta, T* c, int64_t ldc); + longc_ n, longc_ k, T alpha, const T* a, longc_ lda, + T beta, T* c, longc_ ldc); -/* _TRSV */ +/* _TRSV_64 */ template void host_trsv_64(enum spral::ssids::cpu::fillmode 
uplo, enum spral::ssids::cpu::operation trans, enum spral::ssids::cpu::diagonal diag, - int64_t n, const T* a, int64_t lda, T* x, int64_t incx); + longc_ n, const T* a, longc_ lda, T* x, longc_ incx); -/* _TRSM */ +/* _TRSM_64 */ template void host_trsm_64(enum spral::ssids::cpu::side side, enum spral::ssids::cpu::fillmode uplo, enum spral::ssids::cpu::operation transa, enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, T alpha, const T* a, int64_t lda, - T* b, int64_t ldb); + longc_ m, longc_ n, T alpha, const T* a, longc_ lda, + T* b, longc_ ldb); }}} /* namespaces spral::ssids::cpu */ diff --git a/include/ssids_gpu_kernels_datatypes.h b/include/ssids_gpu_kernels_datatypes.h index f4dc45daa1..78cadbc884 100644 --- a/include/ssids_gpu_kernels_datatypes.h +++ b/include/ssids_gpu_kernels_datatypes.h @@ -1,29 +1,38 @@ +/** \file + * \copyright 2016 The Science and Technology Facilities Council (STFC) + * \licence BSD licence, see LICENCE file for details + * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 14:30 GMT + */ + #define MAX_CUDA_BLOCKS 65535 +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace gpu { /** \brief Represents work for a a node to be factorized * (as part of a batched call). */ struct multinode_fact_type { - int nrows; ///< number of rows in node - int ncols; ///< number of columns in node - double *lval; ///< pointer to factors L - double *ldval; ///< pointer to workspace for storing L*D - double *dval; ///< pointer to factors D - int offp; ///< offset into permutation vector for this node - int ib; ///< ??? - int jb; ///< ??? - int done; ///< number of columns sucessfully factorized? - int rght; ///< ??? - int lbuf; ///< ??? + ipc_ nrows; ///< number of rows in node + ipc_ ncols; ///< number of columns in node + rpc_ *lval; ///< pointer to factors L + rpc_ *ldval; ///< pointer to workspace for storing L*D + rpc_ *dval; ///< pointer to factors D + ipc_ offp; ///< offset into permutation vector for this node + ipc_ ib; ///< ??? + ipc_ jb; ///< ??? + ipc_ done; ///< number of columns sucessfully factorized? + ipc_ rght; ///< ??? + ipc_ lbuf; ///< ??? }; /** \brief Statistics to be returned to user. */ struct cuda_stats { - int num_two; ///< Number of 2x2 pivots - int num_neg; ///< Number of negative pivots - int num_zero; ///< Number of zero pivots + ipc_ num_two; ///< Number of 2x2 pivots + ipc_ num_neg; ///< Number of negative pivots + ipc_ num_zero; ///< Number of zero pivots }; }}} /* namespace spral::ssids::gpu */ diff --git a/include/ssids_gpu_kernels_dtrsv.h b/include/ssids_gpu_kernels_dtrsv.h index 7c74696973..07932a7179 100644 --- a/include/ssids_gpu_kernels_dtrsv.h +++ b/include/ssids_gpu_kernels_dtrsv.h @@ -6,12 +6,15 @@ Other Contributors: Christopher Munro (STFC) Philippe Vandermersch (NVIDIA) All rights reserved. +Current version - GALAHAD 4.3 - 2024-02-03 AT 15:15 GMT This file is a modified version of the ASEArch blas version. It has had a lookup capability added to allow execution on multiple small matrices simulateously. */ +#include "ssids_rip.hxx" + namespace spral { namespace ssids { namespace gpu { /** \brief Return value at address vptr using volatile load. */ @@ -43,8 +46,8 @@ __inline__ __device__ T_ELEM loadVolatile(const volatile T_ELEM *const vptr) #endif /** \brief Return physical SM id as per special register %smid. 
*/ -unsigned int __inline__ __device__ getSM(void) { - volatile unsigned int output; +uipc_ __inline__ __device__ getSM(void) { + volatile uipc_ output; asm volatile("mov.u32 %0,%smid;" : "=r"(output) : ); return output; } @@ -60,13 +63,13 @@ unsigned int __inline__ __device__ getSM(void) { * \param val This thread's element of x. * \sa dblkSolve_trans() */ -template -void __device__ dblkSolve(const volatile T_ELEM *const minus_a, const int lda, T_ELEM &val) +template +void __device__ dblkSolve(const volatile T_ELEM *const minus_a, const ipc_ lda, T_ELEM &val) { volatile T_ELEM __shared__ xs; #pragma unroll 16 - for (int i=0; i -void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const int lda, T_ELEM &val) +template +void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const ipc_ lda, T_ELEM &val) { volatile T_ELEM __shared__ xs; #pragma unroll 16 - for (int i=blkSize-1; i>=0; --i) { + for (ipc_ i=blkSize-1; i>=0; --i) { if (threadIdx.x==i) { if (!ISUNIT) val *= minus_a[i*lda+i]; xs = val; @@ -116,21 +119,21 @@ void __device__ dblkSolve_trans(const volatile T_ELEM *const minus_a, const int * \param cache Location to copy to, leading dimension nbi. * \sa tocache_small() */ -template -void __device__ tocache(const unsigned int tid, const volatile T_ELEM *const a, const int lda, volatile T_ELEM *const cache) +template +void __device__ tocache(const uipc_ tid, const volatile T_ELEM *const a, const ipc_ lda, volatile T_ELEM *const cache) { - const int x = tid % nbi; - const int y = tid / nbi; - const int ty = ntid/nbi; + const ipc_ x = tid % nbi; + const ipc_ y = tid / nbi; + const ipc_ ty = ntid/nbi; if (!TRANS) { - for (int i=0; i(i+y)) cache[(i+y)*nbi+x] = -a[(i+y)*lda+x]; else if ((i+y)(i+y)) cache[(i+y)+nbi*x] = -a[(i+y)*lda+x]; else if ((i+y) -void __device__ tocache_small(const int n, const unsigned int tid, const volatile T_ELEM *const a, int lda, volatile T_ELEM *const cache) +template +void __device__ tocache_small(const ipc_ n, const uipc_ tid, const volatile T_ELEM *const a, ipc_ lda, volatile T_ELEM *const cache) { - const int x = tid % nbi; - const int y = tid / nbi; - const int ty = ntid/nbi; + const ipc_ x = tid % nbi; + const ipc_ y = tid / nbi; + const ipc_ ty = ntid/nbi; if (!TRANS) { - for (int i=0; i=nbi) continue; // past end of cache array if ((i+y)(i+y) && x=nbi) continue; // past end of cache array if ((i+y)(i+y) && x +template void __device__ slv21(const volatile T_ELEM *const x11, volatile T_ELEM *const a21, const volatile T_ELEM *const l22, volatile T_ELEM *const xsarray) { - const int tid = threadsx*threadIdx.y+threadIdx.x; - const int ntid = threadsx*threadsy; - const int x = (n>0) ? tid % n : 0; - const int y = (n>0) ? tid / n : 0; - const int ty = (n>0) ? ntid/n : 1; + const ipc_ tid = threadsx*threadIdx.y+threadIdx.x; + const ipc_ ntid = threadsx*threadsy; + const ipc_ x = (n>0) ? tid % n : 0; + const ipc_ y = (n>0) ? tid / n : 0; + const ipc_ ty = (n>0) ? 
ntid/n : 1; /* Note: as different threads within a warp can work on different columns, we need different xs variables (one per col being worked on) */ @@ -245,19 +248,19 @@ void __device__ slv21(const volatile T_ELEM *const x11, volatile T_ELEM *const a if (y>n) return; #pragma unroll - for (int j=0; j=n) continue; /* construct col (j+y) of -L_21 X_11 */ T_ELEM val = 0; - for (int k=j; k -void __device__ transpose(const int n, const volatile T_ELEM *const a, volatile T_ELEM *const at) +template +void __device__ transpose(const ipc_ n, const volatile T_ELEM *const a, volatile T_ELEM *const at) { if (threadIdx.y==0 && threadIdx.x +template void __device__ invert(volatile T_ELEM *const a, volatile T_ELEM /*__shared__*/ *const xsarray) { if (n==2) { @@ -355,7 +358,7 @@ void __device__ invert(volatile T_ELEM *const a, volatile T_ELEM /*__shared__*/ * solution on output * \param partSum workspace??? */ -template +template void __device__ slvinv(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum) { a += threadIdx.y*n+threadIdx.x; @@ -369,14 +372,14 @@ void __device__ slvinv(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELE /* matrix-vector multiply for solution */ if (threadIdx.y -void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum, const int row) +template +void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, T_ELEM &val, volatile T_ELEM *const partSum, const ipc_ row) { a += threadIdx.y*n+threadIdx.x; xshared += threadIdx.y; @@ -408,7 +411,7 @@ void __device__ slvinv_trans(const volatile T_ELEM *a, volatile T_ELEM *xshared, /* matrix-vector multiply for solution */ val=0; if (threadIdx.x +template #ifndef DOXYGEN_SHOULD_SKIP_THIS __launch_bounds__(threadsx*threadsy, 4) #endif /* DOXYGEN_SHOULD_SKIP_THIS */ -void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, int *sync +void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, ipc_ *sync #ifdef TIMING , struct trsv_times *times #endif ) { lookup += blockIdx.x; - const int n = lookup->n; + const ipc_ n = lookup->n; const T_ELEM *const a = lookup->a; - const int lda = lookup->lda; + const ipc_ lda = lookup->lda; xglobal += lookup->x_offset; sync += lookup->sync_offset; #ifdef TIMING - const unsigned int sa = clock(); + const uipc_ sa = clock(); #endif - const int nblk = (n + (nb-1)) / nb; - const int tid = threadsx*threadIdx.y + threadIdx.x; + const ipc_ nblk = (n + (nb-1)) / nb; + const ipc_ tid = threadsx*threadIdx.y + threadIdx.x; /* sync components: * sync[0] => nblk - Last ready column [init to -1] @@ -493,14 +496,14 @@ void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, T_ELEM ps[nb/threadsy]; /* Get row handled by this block */ - const int row = nblk-1 - nextRow(&sync[1]); + const ipc_ row = nblk-1 - nextRow(&sync[1]); const bool short_row = ((n-1)/nb==row && n%nb!=0); /* requires special handling */ if (row!=nblk-1) { const T_ELEM *const aval = &a[(row*nb+threadIdx.x)*lda+(row+1)*nb+threadIdx.y]; #pragma unroll - for (int j=0; jrow+1; --col) { + for(ipc_ j=0; jrow+1; --col) { /* apply update from block (row, col) */ const T_ELEM *const aval = &a[(row*nb+threadIdx.y)*lda + col*nb+threadIdx.x]; - T_ELEM *const xg = &(xglobal[int(col*nb)]); + T_ELEM *const xg = &(xglobal[ipc_(col*nb)]); wait_until_ge(tid, &sync[0], nblk-1-col, &col_done); // Wait for diagonal block to be done T_ELEM xl; if (col=i && 
threadIdx.x(cache, xshared, val, partSum, row); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.xsm = getSM(); @@ -673,23 +676,23 @@ void __global__ trsv_lt_exec(const struct trsv_lookup *lookup, T_ELEM *xglobal, * \param sync sync array to offset into. * \param lookup batch lookup array. */ -template +template #ifndef DOXYGEN_SHOULD_SKIP_THIS __launch_bounds__(threadsx*threadsy, 4) #endif /* DOXYGEN_SHOULD_SKIP_THIS */ /* Note: setting above occupany to 5 causes random errors on large problems: suspect compiler bug */ -void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ sync, struct trsv_lookup *lookup) +void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, ipc_ *__restrict__ sync, struct trsv_lookup *lookup) { lookup += blockIdx.x; - const int n = lookup->n; + const ipc_ n = lookup->n; const T_ELEM *const a = lookup->a; - const int lda = lookup->lda; + const ipc_ lda = lookup->lda; xglobal += lookup->x_offset; sync += lookup->sync_offset; - const int incx=1; + const ipc_ incx=1; - const int tid = threadsx*threadIdx.y + threadIdx.x; + const ipc_ tid = threadsx*threadIdx.y + threadIdx.x; /* sync components: * sync[0] => Last ready column [init to -1] @@ -704,14 +707,14 @@ void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ syn if (incx<0) xglobal+=(1-n)*incx; /* Get row handled by this block */ - const int row = nextRow(&sync[1]); + const ipc_ row = nextRow(&sync[1]); const bool short_row = ((n-1)/nb==row && n%nb!=0); /* requires special handling */ if (row!=0) { const T_ELEM *const aval = &a[((row-1)*nb+threadIdx.y)*lda+row*nb+threadIdx.x]; #pragma unroll - for (int j=0; j=n%nb) val = 0.0; @@ -785,14 +788,14 @@ void __global__ trsv_ln_exec(T_ELEM *__restrict__ xglobal, int *__restrict__ syn slvinv(cache, xlocal, val, partSum); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x(cache, nb, val); if (!short_row || threadIdx.x //#define PROFILE @@ -56,7 +60,7 @@ public: * \param name Predefined name of task, as setup in Profile::init(). * \param thread Optional thread number, otherwise use best guess. */ - Task(char const* name, int thread=Profile::guess_core()) + Task(char const* name, ipc_ thread=Profile::guess_core()) : name(name), thread(thread), t1(Profile::now()) {} @@ -73,7 +77,7 @@ public: private: char const* name; //< Name of task, one defined in Profile::init(). - int thread; //< Thread of task. + ipc_ thread; //< Thread of task. double t1; //< Start time of task. }; @@ -83,7 +87,7 @@ public: * \param thread Optional thread number, otherwise use best guess. */ static - void setState(char const* name, int thread=Profile::guess_core()) { + void setState(char const* name, ipc_ thread=Profile::guess_core()) { #if defined(PROFILE) && defined(HAVE_GTG) double t = Profile::now(); ::setState(t, "ST_TASK", Profile::get_thread_name(thread), name); @@ -108,7 +112,7 @@ public: * \param thread Optional thread number, otherwise use best guess. */ static - void setNullState(int thread=Profile::guess_core()) { + void setNullState(ipc_ thread=Profile::guess_core()) { setState("0", thread); } @@ -120,7 +124,7 @@ public: */ static void addEvent(char const* type, char const*val, - int thread=Profile::guess_core()) { + ipc_ thread=Profile::guess_core()) { #if defined(PROFILE) && defined(HAVE_GTG) ::addEvent(now(), type, get_thread_name(thread), val); #endif @@ -132,8 +136,8 @@ public: * \note Times are all measured from the end of this subroutine. 
*/ static - // void init(int nregions, spral::hw_topology::NumaRegion* regions) { - void init(int nnodes, spral::hw_topology::NumaRegion* nodes) { + // void init(ipc_ nregions, spral::hw_topology::NumaRegion* regions) { + void init(ipc_ nnodes, spral::hw_topology::NumaRegion* nodes) { #if defined(PROFILE) && defined(HAVE_GTG) // Initialise profiling setTraceType(PAJE); @@ -142,23 +146,23 @@ public: addContType("CT_NODE", "0", "Node"); addContType("CT_THREAD", "CT_NODE", "Thread"); addContType("CT_GPU", "CT_NODE", "GPU"); - // int nnodes = 0; + // ipc_ nnodes = 0; // spral::hw_topology::NumaRegion* nodes; if (!nodes) spral_hw_topology_guess(&nnodes, &nodes); - int core_idx=0; - for(int node=0; node + +/* real precision employed */ + +#ifdef SPRAL_SINGLE +#define rpc_ float +#else +#define rpc_ double +#endif + +/* integer storage employed */ + +#ifdef INTEGER_64 +#define ipc_ int64_t +#define uipc_ uint64_t +#else +#define ipc_ int +#define uipc_ unsigned int +#endif + +/* generic storage */ + +#define longc_ int64_t diff --git a/meson.build b/meson.build index ef2550a95e..24f3ea225c 100644 --- a/meson.build +++ b/meson.build @@ -293,8 +293,8 @@ endif # Compile GALAHAD with 64-bit integer if galahad_int64 - extra_args_single += ['-DGALAHAD_64BIT_INTEGER', '-DSPRAL_64BIT_INTEGER'] - extra_args_double += ['-DGALAHAD_64BIT_INTEGER', '-DSPRAL_64BIT_INTEGER'] + extra_args_single += '-DINTEGER_64' + extra_args_double += '-DINTEGER_64' endif # Sources diff --git a/src/dum/pastixf_enums.F90 b/src/dum/pastixf_enums.F90 index 5630a80048..71794b7cfe 100644 --- a/src/dum/pastixf_enums.F90 +++ b/src/dum/pastixf_enums.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.1 - 2022-10-25 AT 16:25 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:25 GMT. !-*- G A L A H A D - D U M M Y P A S T I X F _ E N U M S M O D U L E -*- @@ -7,7 +7,7 @@ MODULE pastixf_enums USE spmf_enums USE iso_c_binding, ONLY : c_double, c_int, c_ptr, c_int32_t, c_int64_t -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: pastix_int_t = c_int64_t #else INTEGER, PARAMETER :: pastix_int_t = c_int32_t diff --git a/src/dum/spmf_enums.F90 b/src/dum/spmf_enums.F90 index 75a6aeb65f..d68f29d021 100644 --- a/src/dum/spmf_enums.F90 +++ b/src/dum/spmf_enums.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-17 AT 07:30 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT. !-*-*- G A L A H A D - D U M M Y S P M F _ E N U M S M O D U L E -*-*- @@ -7,7 +7,7 @@ MODULE spmf_enums USE iso_c_binding, ONLY : c_float, c_double, c_ptr, & c_int, c_int32_t, c_int64_t -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: spm_int_t = c_int64_t #else INTEGER, PARAMETER :: spm_int_t = c_int32_t diff --git a/src/forthcoming/colt/colt.F90 b/src/forthcoming/colt/colt.F90 index 4d7c3a09e6..d025fd753d 100644 --- a/src/forthcoming/colt/colt.F90 +++ b/src/forthcoming/colt/colt.F90 @@ -3148,7 +3148,7 @@ END SUBROUTINE eval_HOCPRODS / REAL( n_points - 1, KIND = rp_ ) ) * ( t_upper - t_lower ) nlp%X( : nlp%n ) = zero - nlp%X( 2 ) = one + nlp%X( 1 ) = inform%target IF ( data%printd ) THEN WRITE( data%out, "( A, ' X ', /, ( 5ES12.4 ) )" ) & prefix, nlp%X( : nlp%n ) diff --git a/src/kinds/kinds.F90 b/src/kinds/kinds.F90 index b685638d4c..e149e60dec 100644 --- a/src/kinds/kinds.F90 +++ b/src/kinds/kinds.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-26 AT 11:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:20 GMT. #include "galahad_modules.h" @@ -41,7 +41,7 @@ MODULE GALAHAD_KINDS ! 
integer and logical kinds (replace the latter in fortran 2023) -#ifdef GALAHAD_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: ip_ = INT64 INTEGER, PARAMETER :: ipc_ = C_INT64_T #else diff --git a/src/lancelot/makemaster b/src/lancelot/makemaster index 8c564af4fd..673102afa6 100644 --- a/src/lancelot/makemaster +++ b/src/lancelot/makemaster @@ -1,14 +1,14 @@ # Main body of the LANCELOT B installation makefile under GALAHAD # N. Gould and Ph. L. Toint. -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 else ifeq "$(PRECIS)" "single" DPREC = -DGALAHAD_SINGLE else diff --git a/src/lapack/rebuild.F90 b/src/lapack/rebuild.F90 index 29d5e18b2f..b907e2c156 100644 --- a/src/lapack/rebuild.F90 +++ b/src/lapack/rebuild.F90 @@ -1,4 +1,4 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-29 AT 09:45 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:25 GMT. ! read a file containg a subset of the reference blas, lapack, etc ! written in fortran 77, and output a multi-precision version capable @@ -117,14 +117,14 @@ PROGRAM BUILD DO i = 1, 4 SELECT CASE ( i ) CASE( 1 ) - WRITE( hout, "( '#ifdef GALAHAD_64BIT_INTEGER', /, & + WRITE( hout, "( '#ifdef INTEGER_64', /, & & '#define GALAHAD_', A, '_interface GALAHAD_', A, & & '_interface_64', /, & - & '#ifdef GALAHAD_NO_UNDERSCORE_64BIT_INTEGER')" ) urefs, urefs + & '#ifdef NO_UNDERSCORE_INTEGER_64')" ) urefs, urefs CASE( 2 ) - WRITE( hout, "( '#elif GALAHAD_DOUBLE_UNDERSCORE_64BIT_INTEGER' )" ) + WRITE( hout, "( '#elif DOUBLE_UNDERSCORE_INTEGER_64' )" ) CASE( 3 ) - WRITE( hout, "( '#elif GALAHAD_NO_SYMBOL_64BIT_INTEGER' )" ) + WRITE( hout, "( '#elif NO_SYMBOL_INTEGER_64' )" ) CYCLE CASE( 4 ) WRITE( hout, "( '#else' )" ) diff --git a/src/makedefs/definitions b/src/makedefs/definitions index f5da20f02f..53d30428de 100644 --- a/src/makedefs/definitions +++ b/src/makedefs/definitions @@ -1,17 +1,17 @@ # Standard GALAHAD makefile definitions # Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 # makefile shell SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 HSL_PRECIS = s else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 HSL_PRECIS = d else ifeq "$(PRECIS)" "single" DPREC = -DGALAHAD_SINGLE diff --git a/src/makedefs/hsl_definitions b/src/makedefs/hsl_definitions index d439ae3a3b..07169aa1e6 100644 --- a/src/makedefs/hsl_definitions +++ b/src/makedefs/hsl_definitions @@ -1,16 +1,16 @@ # Standard GALAHAD HSL makefile definitions # Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DGALAHAD_SINGLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_SINGLE -DINTEGER_64 HSL_PRECIS = s INTEGER = 64bit else ifeq "$(PRECIS)" "double_64" - DPREC = -DGALAHAD_DOUBLE -DGALAHAD_64BIT_INTEGER + DPREC = -DGALAHAD_DOUBLE -DINTEGER_64 HSL_PRECIS = d INTEGER = 64bit else ifeq "$(PRECIS)" "single" diff --git a/src/spral/makemaster b/src/spral/makemaster index 0c52b67660..e9a6f44b90 100644 --- a/src/spral/makemaster +++ b/src/spral/makemaster @@ -3,14 +3,14 @@ # available under a BSD licence as part of GALAHAD 
# Nick Gould, for GALAHAD production -# This version: 2024-01-26 +# This version: 2024-02-03 SHELL = /bin/$(BINSHELL) ifeq "$(PRECIS)" "single_64" - DPREC = -DSPRAL_SINGLE -DSPRAL_64BIT_INTEGER + DPREC = -DSPRAL_SINGLE -DINTEGER_64 else ifeq "$(PRECIS)" "double_64" - DPREC = -DSPRAL_DOUBLE -DSPRAL_64BIT_INTEGER + DPREC = -DSPRAL_DOUBLE -DINTEGER_64 else ifeq "$(PRECIS)" "single" DPREC = -DSPRAL_SINGLE else diff --git a/src/spral/spral_kinds.F90 b/src/spral/spral_kinds.F90 index 6cf3eb6b40..65c0a8a895 100644 --- a/src/spral/spral_kinds.F90 +++ b/src/spral/spral_kinds.F90 @@ -1,6 +1,6 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-01-26 AT 11:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT. -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_double spral_kinds_double_64 #define SPRAL_KINDS_single spral_kinds_single_64 #endif @@ -44,7 +44,7 @@ MODULE SPRAL_KINDS ! integer kinds -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 INTEGER, PARAMETER :: ip_ = INT64 INTEGER, PARAMETER :: ipc_ = C_INT64_T #else diff --git a/src/ssids/C/ssids_ciface.F90 b/src/ssids/C/ssids_ciface.F90 index a3c8199a28..ba0a92577c 100644 --- a/src/ssids/C/ssids_ciface.F90 +++ b/src/ssids/C/ssids_ciface.F90 @@ -1,7 +1,7 @@ -! THIS VERSION: GALAHAD 4.1 - 2023-01-25 AT 09:00 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:35 GMT. #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision SPRAL_KINDS_single_64 #define SPRAL_SSIDS_precision_ciface SPRAL_SSIDS_single_ciface_64 #define SPRAL_SSIDS_types_precision spral_ssids_types_single_64 @@ -13,7 +13,7 @@ #define SPRAL_SSIDS_inform_precision spral_ssids_inform_single #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define SPRAL_KINDS_precision SPRAL_KINDS_double_64 #define SPRAL_SSIDS_precision_ciface SPRAL_SSIDS_double_ciface_64 #define SPRAL_SSIDS_types_precision spral_ssids_types_double_64 diff --git a/src/ssids/NumericSubtree.cxx b/src/ssids/NumericSubtree.cxx index e6eb594df1..ed6a89695b 100644 --- a/src/ssids/NumericSubtree.cxx +++ b/src/ssids/NumericSubtree.cxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 09:15 GMT */ + #include "ssids_cpu_NumericSubtree.hxx" #include @@ -11,9 +13,9 @@ #include "spral_omp.hxx" #include "ssids_cpu_AppendAlloc.hxx" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define spral_ssids_cpu_create_num_subtree \ spral_ssids_cpu_create_num_subtree_sgl #define spral_ssids_cpu_destroy_num_subtree \ @@ -35,7 +37,6 @@ #define spral_ssids_cpu_subtree_free_contrib \ spral_ssids_cpu_subtree_free_contrib_sgl #else -#define precision_ double #define spral_ssids_cpu_create_num_subtree \ spral_ssids_cpu_create_num_subtree_dbl #define spral_ssids_cpu_destroy_num_subtree \ @@ -69,7 +70,7 @@ typedef float T; #else typedef double T; #endif -const int PAGE_SIZE = 8*1024*1024; // 8MB +const ipc_ PAGE_SIZE = 8*1024*1024; // 8MB typedef NumericSubtree> NumericSubtreePosdef; typedef NumericSubtree> NumericSubtreeIndef; @@ -80,13 +81,14 @@ extern "C" void* spral_ssids_cpu_create_num_subtree( bool posdef, void const* symbolic_subtree_ptr, - const precision_ *const aval, // Values of A - const precision_ *const scaling, // Scaling vector (NULL if none) + const rpc_ *const aval, // Values of A + const rpc_ *const scaling, // Scaling vector (NULL if none) void** child_contrib, // Contributions from child subtrees 
struct cpu_factor_options const* options, // Options in ThreadStats* stats // Info out ) { - auto const& symbolic_subtree = *static_cast(symbolic_subtree_ptr); + auto const& symbolic_subtree = + *static_cast(symbolic_subtree_ptr); // Perform factorization if(posdef) { @@ -126,9 +128,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_fwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -153,9 +155,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_diag( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -178,9 +180,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_diag_bwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -205,9 +207,9 @@ extern "C" Flag spral_ssids_cpu_subtree_solve_bwd( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int nrhs, // number of right-hand sides - precision_* x, // ldx x nrhs array of right-hand sides - int ldx // leading dimension of x + ipc_ nrhs, // number of right-hand sides + rpc_* x, // ldx x nrhs array of right-hand sides + ipc_ ldx // leading dimension of x ) { // Call method @@ -232,8 +234,8 @@ extern "C" void spral_ssids_cpu_subtree_enquire( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void const* subtree_ptr,// pointer to relevant type of NumericSubtree - int* piv_order, // pivot order, may be null, only used if indef - precision_* d // diagonal entries, may be null + ipc_* piv_order, // pivot order, may be null, only used if indef + rpc_* d // diagonal entries, may be null ) { // Call method @@ -253,7 +255,7 @@ extern "C" void spral_ssids_cpu_subtree_alter( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void* subtree_ptr,// pointer to relevant type of NumericSubtree - precision_ const* d // new diagonal entries + rpc_ const* d // new diagonal entries ) { assert(!posdef); // Should never be called on positive definite matrices. 
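As an illustration (not part of this patch): the C interface hunks above replace raw int/double arguments with the ipc_ and rpc_ macros, whose definitions appear in the new ssids_rip.hxx header earlier in this diff. The stand-alone sketch below, a hypothetical file demo_rip.cxx invented purely for illustration, mirrors that macro scheme so the effect of the build flags wired up elsewhere in the patch (-DINTEGER_64 for 64-bit integers, -DSPRAL_SINGLE for single precision) can be seen in isolation; it is a minimal sketch under those assumptions, not GALAHAD source.

/* demo_rip.cxx - hypothetical, stand-alone illustration (not GALAHAD source).
 * Mirrors the rpc_/ipc_/longc_ macro scheme of the new ssids_rip.hxx.
 * Assumed builds, matching the makemaster/meson flags changed in this patch:
 *   g++ -DSPRAL_SINGLE -DINTEGER_64 demo_rip.cxx   (single precision, 64-bit ints)
 *   g++ demo_rip.cxx                               (double precision, 32-bit ints)
 */
#include <cstdint>
#include <cstdio>

/* real precision employed (as in ssids_rip.hxx) */
#ifdef SPRAL_SINGLE
#define rpc_ float
#else
#define rpc_ double
#endif

/* integer storage employed (as in ssids_rip.hxx) */
#ifdef INTEGER_64
#define ipc_ int64_t
#define uipc_ uint64_t
#else
#define ipc_ int
#define uipc_ unsigned int
#endif

/* generic storage (as in ssids_rip.hxx) */
#define longc_ int64_t

/* A signature written once against the macros, as the patched headers are;
 * the effective C types change with the flags, without editing the source. */
void solve_fwd_stub(ipc_ nrhs, rpc_* x, ipc_ ldx) {
  (void)x;   /* values not used; only the resolved types are of interest */
  std::printf("nrhs=%lld ldx=%lld  sizeof(ipc_)=%zu  sizeof(rpc_)=%zu\n",
              (long long)nrhs, (long long)ldx, sizeof(ipc_), sizeof(rpc_));
}

int main() {
  rpc_ x[4] = {0};        /* float[4] or double[4], depending on SPRAL_SINGLE */
  solve_fwd_stub(2, x, 2);
  return 0;
}

The same -DINTEGER_64 flag keeps the Fortran side in step, switching ip_/ipc_ to INT64/C_INT64_T in kinds.F90 and spral_kinds.F90 further down this diff.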
@@ -268,14 +270,14 @@ extern "C" void spral_ssids_cpu_subtree_get_contrib( bool posdef, // If true, performs A=LL^T, if false do pivoted A=LDL^T void* subtree_ptr,// pointer to relevant type of NumericSubtree - int* n, // returned dimension of contribution block - precision_ const** val, // returned pointer to contribution block - int* ldval, // leading dimension of val - int const** rlist, // returned pointer to row list - int* ndelay, // returned number of delays - int const** delay_perm, // returned pointer to delay values - precision_ const** delay_val, // returned pointer to delay values - int* lddelay // leading dimension of delay_val + ipc_* n, // returned dimension of contribution block + rpc_ const** val, // returned pointer to contribution block + ipc_* ldval, // leading dimension of val + ipc_ const** rlist, // returned pointer to row list + ipc_* ndelay, // returned number of delays + ipc_ const** delay_perm, // returned pointer to delay values + rpc_ const** delay_val, // returned pointer to delay values + ipc_* lddelay // leading dimension of delay_val ) { // Call method if(posdef) { // Converting from runtime to compile time posdef value diff --git a/src/ssids/SymbolicSubtree.cxx b/src/ssids/SymbolicSubtree.cxx index fd7ef25b18..5912738327 100644 --- a/src/ssids/SymbolicSubtree.cxx +++ b/src/ssids/SymbolicSubtree.cxx @@ -2,16 +2,19 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 16:00 GMT */ + +#include "ssids_rip.hxx" #include "ssids_cpu_SymbolicSubtree.hxx" using namespace spral::ssids::cpu; extern "C" void* spral_ssids_cpu_create_symbolic_subtree( - int n, int sa, int en, int const* sptr, int const* sparent, - long const* rptr, int const* rlist, long const* nptr, long const* nlist, - int ncontrib, int const* contrib_idx, + ipc_ n, ipc_ sa, ipc_ en, ipc_ const* sptr, ipc_ const* sparent, + longc_ const* rptr, ipc_ const* rlist, longc_ const* nptr, + longc_ const* nlist, ipc_ ncontrib, ipc_ const* contrib_idx, struct cpu_factor_options const* options) { return (void*) new SymbolicSubtree( n, sa, en, sptr, sparent, rptr, rlist, nptr, nlist, ncontrib, diff --git a/src/ssids/assemble.cu b/src/ssids/assemble.cu index 002b808c0a..33a130c0f0 100644 --- a/src/ssids/assemble.cu +++ b/src/ssids/assemble.cu @@ -1,3 +1,10 @@ +/* Copyright (c) 2013 Science and Technology Facilities Council (STFC) + * Copyright (c) 2013 NVIDIA + * Authors: Evgueni Ovtchinnikov (STFC) + * Jeremy Appleyard (NVIDIA) + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:40 GMT + */ + #ifdef __cplusplus #include #else @@ -8,11 +15,11 @@ #include #include +#include "ssids_rip.hxx" #include "spral_cuda_cuda_check.h" #include "ssids_gpu_kernels_datatypes.h" #ifdef SPRAL_SINGLE -#define precision_ float #define load_nodes_type load_nodes_type_single #define assemble_cp_type assemble_cp_type_single #define assemble_blk_type assemble_blk_type_single @@ -28,7 +35,6 @@ #define spral_ssids_load_nodes_sc spral_ssids_load_nodes_sc_single #define spral_ssids_max_abs spral_ssids_max_abs_single #else -#define precision_ double #define load_nodes_type load_nodes_type_single #define assemble_cp_type assemble_cp_type_double #define assemble_blk_type assemble_blk_type_double @@ -55,12 +61,12 @@ namespace /* anon */ { struct load_nodes_type { - long nnz; // Number of entries to map - int lda; // Leading dimension of A - int ldl; // Leading dimension of L - precision_ *lcol; // Pointer to 
non-delay part of L - long offn; // Offset into nlist - long offr; // Offset into rlist + longc_ nnz; // Number of entries to map + ipc_ lda; // Leading dimension of A + ipc_ ldl; // Leading dimension of L + rpc_ *lcol; // Pointer to non-delay part of L + longc_ offn; // Offset into nlist + longc_ offr; // Offset into rlist }; /* @@ -73,22 +79,22 @@ struct load_nodes_type { __global__ void cu_load_nodes( const struct load_nodes_type *lndata, - const long *nlist, - const precision_ *aval + const longc_ *nlist, + const rpc_ *aval ) { lndata += blockIdx.x; - const long nnz = lndata->nnz; - const int lda = lndata->lda; - const int ldl = lndata->ldl; + const longc_ nnz = lndata->nnz; + const ipc_ lda = lndata->lda; + const ipc_ ldl = lndata->ldl; nlist += 2*lndata->offn; - precision_ *const lval = lndata->lcol; + rpc_ *const lval = lndata->lcol; - for (int i = threadIdx.x; i < nnz; i += blockDim.x) { + for (ipc_ i = threadIdx.x; i < nnz; i += blockDim.x) { // Note: nlist is 1-indexed, not 0 indexed, so we have to adjust - const int r = (nlist[2*i+1] - 1) % lda; // row index - const int c = (nlist[2*i+1] - 1) / lda; // col index - const long sidx = nlist[2*i+0] - 1; // source index + const ipc_ r = (nlist[2*i+1] - 1) % lda; // row index + const ipc_ c = (nlist[2*i+1] - 1) / lda; // col index + const longc_ sidx = nlist[2*i+0] - 1; // source index lval[r + c*ldl] = aval[sidx]; } } @@ -104,41 +110,41 @@ cu_load_nodes( __global__ void cu_load_nodes_sc( const struct load_nodes_type *lndata, - const long *nlist, - const int *rlist, - const precision_ *scale, - const precision_ *aval + const longc_ *nlist, + const ipc_ *rlist, + const rpc_ *scale, + const rpc_ *aval ) { lndata += blockIdx.x; - const int nnz = lndata->nnz; - const int lda = lndata->lda; - const int ldl = lndata->ldl; + const ipc_ nnz = lndata->nnz; + const ipc_ lda = lndata->lda; + const ipc_ ldl = lndata->ldl; nlist += 2*lndata->offn; - precision_ *const lval = lndata->lcol; + rpc_ *const lval = lndata->lcol; rlist += lndata->offr; - for (int i = threadIdx.x; i < nnz; i += blockDim.x) { + for (ipc_ i = threadIdx.x; i < nnz; i += blockDim.x) { // Note: nlist and rlist are 1-indexed, not 0 indexed, so we adjust - const int r = (nlist[2*i+1] - 1) % lda; // row index - const int c = (nlist[2*i+1] - 1) / lda; // col index - const long sidx = nlist[2*i+0] - 1; // source index - const precision_ rs = scale[rlist[r] - 1]; // row scaling - const precision_ cs = scale[rlist[c] - 1]; // col scaling + const ipc_ r = (nlist[2*i+1] - 1) % lda; // row index + const ipc_ c = (nlist[2*i+1] - 1) / lda; // col index + const longc_ sidx = nlist[2*i+0] - 1; // source index + const rpc_ rs = scale[rlist[r] - 1]; // row scaling + const rpc_ cs = scale[rlist[c] - 1]; // col scaling lval[r + c*ldl] = rs * aval[sidx] * cs; } } // BLOCK_SIZE = blockDim.x // maxabs must be initialized to zeros -template< typename ELEMENT_TYPE, unsigned int BLOCK_SIZE > +template< typename ELEMENT_TYPE, uipc_ BLOCK_SIZE > __global__ void -cu_max_abs( const long n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxabs ) +cu_max_abs( const longc_ n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxabs ) { __shared__ volatile ELEMENT_TYPE tmax[BLOCK_SIZE]; tmax[threadIdx.x] = 0.0; - for ( long i = threadIdx.x + blockDim.x*blockIdx.x; i < n; + for ( longc_ i = threadIdx.x + blockDim.x*blockIdx.x; i < n; i += blockDim.x*gridDim.x ) { const ELEMENT_TYPE v = fabs(u[i]); if ( v > tmax[threadIdx.x] ) @@ -146,7 +152,7 @@ cu_max_abs( const long n, const ELEMENT_TYPE *const u, 
ELEMENT_TYPE *const maxab } __syncthreads(); - for ( int inc = 1; inc < BLOCK_SIZE; inc *= 2 ) { + for ( ipc_ inc = 1; inc < BLOCK_SIZE; inc *= 2 ) { if ( 2*inc*threadIdx.x + inc < BLOCK_SIZE && tmax[2*inc*threadIdx.x + inc] > tmax[2*inc*threadIdx.x] ) tmax[2*inc*threadIdx.x] = tmax[2*inc*threadIdx.x + inc]; @@ -160,30 +166,30 @@ cu_max_abs( const long n, const ELEMENT_TYPE *const u, ELEMENT_TYPE *const maxab /* Following data type describes a single child-parent assembly */ struct assemble_cp_type { // Parent data - int pvoffset; // Offset to start of parent node values - precision_ *pval; // Pointer to non-delay part of parent L - int ldp; // Leading dimension of parent + ipc_ pvoffset; // Offset to start of parent node values + rpc_ *pval; // Pointer to non-delay part of parent L + ipc_ ldp; // Leading dimension of parent // Child data - int cm; // Number of rows in child - int cn; // Number of columns in child - int ldc; // Leading dimension of child - long cvoffset; // Offset to start of child node values - precision_ *cv; // Pointer to start of child node values + ipc_ cm; // Number of rows in child + ipc_ cn; // Number of columns in child + ipc_ ldc; // Leading dimension of child + longc_ cvoffset; // Offset to start of child node values + rpc_ *cv; // Pointer to start of child node values // Alignment data - int *rlist_direct; // Pointer to start of child's rlist - int *ind; // Pointer to start of child's contribution index + ipc_ *rlist_direct; // Pointer to start of child's rlist + ipc_ *ind; // Pointer to start of child's contribution index // Sync data - int sync_offset; // we watch sync[sync_offset] - int sync_wait_for; // and wait for it to have value >= sync_wait_for + ipc_ sync_offset; // we watch sync[sync_offset] + ipc_ sync_wait_for; // and wait for it to have value >= sync_wait_for }; /* Following data type describes actions of single CUDA block */ struct assemble_blk_type { - int cp; // node we're assembling into - int blk; // block number of that node + ipc_ cp; // node we're assembling into + ipc_ blk; // block number of that node }; /* Used to force volatile load of a declared non-volatile variable */ @@ -201,18 +207,17 @@ __inline__ __device__ T_ELEM loadVolatile(volatile T_ELEM *const vptr) { * next_blk is used to ensure all blocks run in exact desired order. * sync[] is used to ensure dependencies are completed in the correct order. 
*/ -template +template void __global__ assemble( const struct assemble_blk_type *blkdata, // block mapping const struct assemble_cp_type *cpdata, // child-parent data - const precision_ *const children, // pointer to array containing children - precision_ *const parents, // pointer to array containing parents - unsigned int *const next_blk, // gmem location used to determine next block - volatile unsigned int *const sync // sync[cp] is #blocks completed so far for cp + const rpc_ *const children, // pointer to array containing children + rpc_ *const parents, // pointer to array containing parents + uipc_ *const next_blk, // gmem location used to determine next block + volatile uipc_ *const sync // sync[cp] is #blocks completed so far for cp ) { // Get block number - __shared__ volatile unsigned int mynext_blk; + __shared__ volatile uipc_ mynext_blk; if(threadIdx.x==0 && threadIdx.y==0) mynext_blk = atomicAdd(next_blk, 1); __syncthreads(); @@ -220,21 +225,21 @@ void __global__ assemble( // Determine global information blkdata += mynext_blk; cpdata += blkdata->cp; - int blk = blkdata->blk; - int nx = (cpdata->cm-1) / blk_sz_x + 1; // number of blocks high child is - int bx = blk % nx; // coordinate of block in x direction - int by = blk / nx; // coordinate of block in y direction - int ldc = cpdata->ldc; - int ldp = cpdata->ldp; + ipc_ blk = blkdata->blk; + ipc_ nx = (cpdata->cm-1) / blk_sz_x + 1; // number of blocks high child is + ipc_ bx = blk % nx; // coordinate of block in x direction + ipc_ by = blk / nx; // coordinate of block in y direction + ipc_ ldc = cpdata->ldc; + ipc_ ldp = cpdata->ldp; // Initialize local information - int m = min(blk_sz_x, cpdata->cm - bx*blk_sz_x); - int n = min(blk_sz_y, cpdata->cn - by*blk_sz_y); - const precision_ *src = + ipc_ m = min(blk_sz_x, cpdata->cm - bx*blk_sz_x); + ipc_ n = min(blk_sz_y, cpdata->cn - by*blk_sz_y); + const rpc_ *src = cpdata->cv + ldc*by*blk_sz_y + bx*blk_sz_x; - precision_ *dest = cpdata->pval; - int *rows = cpdata->rlist_direct + bx*blk_sz_x; - int *cols = cpdata->rlist_direct + by*blk_sz_y; + rpc_ *dest = cpdata->pval; + ipc_ *rows = cpdata->rlist_direct + bx*blk_sz_x; + ipc_ *cols = cpdata->rlist_direct + by*blk_sz_y; // Wait for previous child of this parent to complete if(threadIdx.x==0 && threadIdx.y==0) { @@ -243,12 +248,12 @@ void __global__ assemble( __syncthreads(); // Perform assembly - for(int j=0; jcp]), 1); + atomicAdd((ipc_*)&(sync[blkdata->cp]), 1); } } struct assemble_delay_type { - int dskip; // Number of rows to skip for delays from later children - int m; // Number of rows in child to copy - int n; // Number of cols in child to copy - int ldd; // Leading dimension of dest (parent) - int lds; // Leading dimension of src (child) - precision_ *dval; // Pointer to dest (parent) - precision_ *sval; // Pointer to src (child) - long roffset; // Offset to rlist_direct + ipc_ dskip; // Number of rows to skip for delays from later children + ipc_ m; // Number of rows in child to copy + ipc_ n; // Number of cols in child to copy + ipc_ ldd; // Leading dimension of dest (parent) + ipc_ lds; // Leading dimension of src (child) + rpc_ *dval; // Pointer to dest (parent) + rpc_ *sval; // Pointer to src (child) + longc_ roffset; // Offset to rlist_direct }; /* Copies delays from child to parent using one block per parent @@ -279,26 +284,26 @@ struct assemble_delay_type { */ void __global__ add_delays( struct assemble_delay_type *dinfo, // information on each block - const int *rlist_direct // children's rows indices in parents 
+ const ipc_ *rlist_direct // children's rows indices in parents ) { dinfo += blockIdx.x; - const int dskip = dinfo->dskip; // number of delays - const int m = dinfo->m; // number of rows - const int n = dinfo->n; // number of cols - const int ldd = dinfo->ldd; // leading dimension of dest - const int lds = dinfo->lds; // leading dimension of src - - precision_ *const dest = dinfo->dval; - const precision_ *const src = dinfo->sval; + const ipc_ dskip = dinfo->dskip; // number of delays + const ipc_ m = dinfo->m; // number of rows + const ipc_ n = dinfo->n; // number of cols + const ipc_ ldd = dinfo->ldd; // leading dimension of dest + const ipc_ lds = dinfo->lds; // leading dimension of src + + rpc_ *const dest = dinfo->dval; + const rpc_ *const src = dinfo->sval; rlist_direct += dinfo->roffset; - for ( int y = threadIdx.y; y < n; y += blockDim.y ) { - for ( int x = threadIdx.x; x < m; x += blockDim.x ) { + for ( ipc_ y = threadIdx.y; y < n; y += blockDim.y ) { + for ( ipc_ x = threadIdx.x; x < m; x += blockDim.x ) { if ( x < n ) { dest[x + y*ldd] = src[x + y*lds]; } else { - int xt = dskip + rlist_direct[x - n] - 1; + ipc_ xt = dskip + rlist_direct[x - n] - 1; dest[xt + y*ldd] = src[x + y*lds]; } } @@ -314,12 +319,12 @@ void __global__ add_delays( extern "C" { /* Invokes the add_delays<<<>>>() kernel */ -void spral_ssids_add_delays( const cudaStream_t *stream, int ndblk, - struct assemble_delay_type *gpu_dinfo, int *rlist_direct ) { +void spral_ssids_add_delays( const cudaStream_t *stream, ipc_ ndblk, + struct assemble_delay_type *gpu_dinfo, ipc_ *rlist_direct ) { if ( ndblk == 0 ) return; // Nothing to see here dim3 threads(ADD_DELAYS_TX, ADD_DELAYS_TY); - for ( int i = 0; i < ndblk; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, ndblk - i); + for ( ipc_ i = 0; i < ndblk; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, ndblk - i); add_delays <<< nb, threads, 0, *stream >>> ( gpu_dinfo + i, rlist_direct ); @@ -328,25 +333,25 @@ void spral_ssids_add_delays( const cudaStream_t *stream, int ndblk, } /* Runs the kernel assemble<<<>>>() after setting up memory correctly. */ -/* Requires gpu_next_sync[] to be of size >= (1+ncp)*sizeof(unsigned int) */ -void spral_ssids_assemble(const cudaStream_t *stream, int nblk, int blkoffset, - struct assemble_blk_type *blkdata, int ncp, - struct assemble_cp_type *cpdata, precision_ *children, - precision_ *parents, unsigned int *gpu_next_sync) { +/* Requires gpu_next_sync[] to be of size >= (1+ncp)*sizeof(uipc_) */ +void spral_ssids_assemble(const cudaStream_t *stream, ipc_ nblk, ipc_ blkoffset, + struct assemble_blk_type *blkdata, ipc_ ncp, + struct assemble_cp_type *cpdata, rpc_ *children, + rpc_ *parents, uipc_ *gpu_next_sync) { /* Create and initialize synchronization objects using a single call: next_blk[1] sync[ncp] */ CudaSafeCall( - cudaMemsetAsync(gpu_next_sync,0,(1+ncp)*sizeof(unsigned int),*stream) + cudaMemsetAsync(gpu_next_sync,0,(1+ncp)*sizeof(uipc_),*stream) ); /* Note, that we can only have at most 65535 blocks per dimn. * For some problems, nblk can exceed this, so we use more than one launch. * As the next block we look at is specified by next_blk this works fine. 
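Editorial aside, not part of the patch: the launch loop that follows is partially garbled in this copy of the diff. In outline, spral_ssids_assemble presumably splits the grid into chunks of at most MAX_CUDA_BLOCKS so that no single launch exceeds the 65535 blocks-per-dimension limit mentioned in the comment above; a rough sketch is given below. The block-size template argument names and the exact argument list are assumptions here, only the batching idiom is the point.

// Sketch (assumed): batched launches of the assemble<> kernel. Because each
// block fetches its own work item from next_blk, splitting the grid into
// several launches does not change the work assignment.
dim3 threads(HOGG_ASSEMBLE_NTX, HOGG_ASSEMBLE_NTY);
for (ipc_ i = 0; i < nblk; i += MAX_CUDA_BLOCKS) {
   ipc_ nb = min(MAX_CUDA_BLOCKS, nblk - i);
   assemble<BLK_SZ_X, BLK_SZ_Y>      // hypothetical names for the block-size template arguments
      <<< nb, threads, 0, *stream >>>
      ( blkdata + blkoffset + i, cpdata, children, parents,
        gpu_next_sync,               // next_blk lives in gpu_next_sync[0]
        gpu_next_sync + 1 );         // sync[ncp] follows it, per the comment above
   CudaCheckError();
}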
*/ dim3 threads(HOGG_ASSEMBLE_NTX, HOGG_ASSEMBLE_NTY); - for(int i=0; i @@ -358,38 +363,38 @@ void spral_ssids_assemble(const cudaStream_t *stream, int nblk, int blkoffset, } // Note: modified value lval is passed in via pointer in lndata, not as argument -void spral_ssids_load_nodes( const cudaStream_t *stream, int nblocks, - const struct load_nodes_type *lndata, const long* list, - const precision_* mval ) { - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); +void spral_ssids_load_nodes( const cudaStream_t *stream, ipc_ nblocks, + const struct load_nodes_type *lndata, const longc_* list, + const rpc_* mval ) { + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_load_nodes <<< nb, 128, 0, *stream >>> ( lndata + i, list, mval ); CudaCheckError(); } } // Note: modified value lval is passed in via pointer in lndata, not as argument -void spral_ssids_load_nodes_sc( const cudaStream_t *stream, int nblocks, - const struct load_nodes_type *lndata, const long* list, const int* rlist, - const precision_* scale, const precision_* mval ) { - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); +void spral_ssids_load_nodes_sc( const cudaStream_t *stream, ipc_ nblocks, + const struct load_nodes_type *lndata, const longc_* list, const ipc_* rlist, + const rpc_* scale, const rpc_* mval ) { + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_load_nodes_sc <<< nb, 128, 0, *stream >>> ( lndata + i, list, rlist, scale, mval ); CudaCheckError(); } } void spral_ssids_max_abs( const cudaStream_t *stream, - int nb, long n, precision_* u, precision_* buff, precision_* maxabs ) + ipc_ nb, longc_ n, rpc_* u, rpc_* buff, rpc_* maxabs ) { - cudaMemsetAsync(buff, 0, nb*sizeof(precision_), *stream); + cudaMemsetAsync(buff, 0, nb*sizeof(rpc_), *stream); cudaStreamSynchronize(*stream); if ( n > 1024*nb ) - cu_max_abs< precision_, 256 ><<< nb, 256, 0, *stream >>>( n, u, buff ); + cu_max_abs< rpc_, 256 ><<< nb, 256, 0, *stream >>>( n, u, buff ); else - cu_max_abs< precision_, 32 ><<< nb, 32, 0, *stream >>>( n, u, buff ); + cu_max_abs< rpc_, 32 ><<< nb, 32, 0, *stream >>>( n, u, buff ); CudaCheckError(); - cu_max_abs< precision_, 1024 ><<< 1, 1024, 0, *stream >>>( nb, buff, maxabs ); + cu_max_abs< rpc_, 1024 ><<< 1, 1024, 0, *stream >>>( nb, buff, maxabs ); CudaCheckError(); } diff --git a/src/ssids/cholesky.cxx b/src/ssids/cholesky.cxx index 53f77fbad2..d272e7b72c 100644 --- a/src/ssids/cholesky.cxx +++ b/src/ssids/cholesky.cxx @@ -2,28 +2,29 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 15:45 GMT */ + #include "ssids_cpu_kernels_cholesky.hxx" #include #include // FIXME: remove as only used for debug +#include "ssids_rip.hxx" #include "ssids_profile.hxx" #include "ssids_cpu_kernels_wrappers.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define cholesky_factor cholesky_factor_sgl #define cholesky_solve_fwd cholesky_solve_fwd_sgl #define cholesky_solve_bwd cholesky_solve_bwd_sgl #else -#define precision_ double #define cholesky_factor cholesky_factor_dbl #define cholesky_solve_fwd cholesky_solve_fwd_dbl #define cholesky_solve_bwd cholesky_solve_bwd_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define lapack_potrf 
lapack_potrf_64 #define host_syrk host_syrk_64 @@ -50,11 +51,11 @@ namespace spral { namespace ssids { namespace cpu { * \param info is initialized to -1, and will be changed to the index of any * column where a non-zero column is encountered. */ -void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, - precision_* upd, int ldupd, int blksz, int *info) { +void cholesky_factor(ipc_ m, ipc_ n, rpc_* a, ipc_ lda, rpc_ beta, + rpc_* upd, ipc_ ldupd, ipc_ blksz, ipc_ *info) { if(n < blksz) { // Adjust so blocks have blksz**2 entries - blksz = int((long(blksz)*blksz) / n); + blksz = ipc_((long(blksz)*blksz) / n); } #pragma omp atomic write @@ -64,36 +65,36 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, * its current col-wise implementation ensuring maximum work available??? */ #pragma omp taskgroup - for(int j = 0; j < n; j += blksz) { - int blkn = std::min(blksz, n-j); + for(ipc_ j = 0; j < n; j += blksz) { + ipc_ blkn = std::min(blksz, n-j); /* Diagonal Block Factorization Task */ #pragma omp task default(none) \ firstprivate(j, blkn) \ shared(m, a, lda, blksz, info, beta, upd, ldupd) \ depend(inout: a[j*(lda+1):1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_DIAG"); #endif - int blkm = std::min(blksz, m-j); - int flag = lapack_potrf(FILL_MODE_LWR, blkn, &a[j*(lda+1)], lda); + ipc_ blkm = std::min(blksz, m-j); + ipc_ flag = lapack_potrf(FILL_MODE_LWR, blkn, &a[j*(lda+1)], lda); if (flag > 0) { // Matrix was not positive definite #pragma omp atomic write *info = flag-1; // flag uses Fortran indexing } else if (blkm > blkn) { // Diagonal block factored OK, handle some rectangular part of block - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_NON_UNIT, blkm-blkn, blkn, one_val, &a[j*(lda+1)], lda, &a[j*(lda+1)+blkn], lda); if (upd) { - precision_ rbeta = (j==0) ? beta : 1.0; + rpc_ rbeta = (j==0) ? beta : 1.0; host_syrk(FILL_MODE_LWR, OP_N, blkm-blkn, blkn, minus_one_val, &a[j*(lda+1)+blkn], lda, rbeta, upd, ldupd); } @@ -104,27 +105,27 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, } } /* Column Solve Tasks */ - for (int i = j+blksz; i < m; i += blksz) { - int blkm = std::min(blksz, m-i); + for (ipc_ i = j+blksz; i < m; i += blksz) { + ipc_ blkm = std::min(blksz, m-i); #pragma omp task default(none) \ firstprivate(i, j, blkn, blkm) \ shared(a, lda, info, beta, upd, ldupd, blksz, n) \ depend(in: a[j*(lda+1):1]) \ depend(inout: a[j*lda + i:1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_TRSM"); #endif - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_NON_UNIT, blkm, blkn, one_val, &a[j*(lda+1)], lda, &a[j*lda+i], lda); if ((blkn < blksz) && upd) { - precision_ rbeta = (j==0) ? beta : 1.0; + rpc_ rbeta = (j==0) ? 
beta : 1.0; host_gemm(OP_N, OP_T, blkm, blksz-blkn, blkn, minus_one_val, &a[j*lda+i], lda, &a[j*(lda+1)+blkn], lda, rbeta, &upd[i-n], ldupd); @@ -136,9 +137,9 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, } } /* Schur Update Tasks: mostly internal */ - for (int k = j+blksz; k < n; k += blksz) { - int blkk = std::min(blksz, n-k); - for (int i = k; i < m; i += blksz) { + for (ipc_ k = j+blksz; k < n; k += blksz) { + ipc_ blkk = std::min(blksz, n-k); + for (ipc_ i = k; i < m; i += blksz) { #pragma omp task default(none) \ firstprivate(i, j, k, blkn, blkk) \ shared(m, a, lda, blksz, info, beta, upd, ldupd, n) \ @@ -146,22 +147,22 @@ void cholesky_factor(int m, int n, precision_* a, int lda, precision_ beta, depend(in: a[j*lda+i:1]) \ depend(inout: a[k*lda+i:1]) { - int my_info; + ipc_ my_info; #pragma omp atomic read my_info = *info; if (my_info == -1) { #ifdef PROFILE Profile::Task task("TA_CHOL_UPD"); #endif - int blkm = std::min(blksz, m-i); - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + ipc_ blkm = std::min(blksz, m-i); + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_gemm(OP_N, OP_T, blkm, blkk, blkn, minus_one_val, &a[j*lda+i], lda, &a[j*lda+k], lda, one_val, &a[k*lda+i], lda); if ((blkk < blksz) && upd) { - precision_ rbeta = (j==0) ? beta : 1.0; - int upd_width = (m n) @@ -235,10 +236,10 @@ void cholesky_solve_fwd(int m, int n, precision_ const* a, int lda, } /* Backwards solve corresponding to cholesky_factor() */ -void cholesky_solve_bwd(int m, int n, precision_ const* a, int lda, - int nrhs, precision_* x, int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void cholesky_solve_bwd(ipc_ m, ipc_ n, rpc_ const* a, ipc_ lda, + ipc_ nrhs, rpc_* x, ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { if(m > n) gemv(OP_T, m-n, n, minus_one_val, &a[n], lda, &x[n], 1, one_val, x, 1); diff --git a/src/ssids/cpu_iface.F90 b/src/ssids/cpu_iface.F90 index 279c1dc40f..7c6fa44540 100644 --- a/src/ssids/cpu_iface.F90 +++ b/src/ssids/cpu_iface.F90 @@ -1,11 +1,36 @@ -! THIS VERSION: GALAHAD 4.3 - 2024-02-01 AT 07:50 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-04 AT 11:50 GMT. 
#ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_kinds_precision spral_kinds_single_64 #define spral_ssids_cpu_iface_precision spral_ssids_cpu_iface_single_64 #define spral_ssids_inform_precision spral_ssids_inform_single_64 #define spral_ssids_types_precision spral_ssids_types_single_64 +#ifdef NO_UNDERSCORE_INTEGER_64 +#define gemv sgemv64 +#define trsv strsv64 +#define syrk ssyrk64 +#define trsm strsm64 +#define sytrf ssytrf64 +#define potrf spotrf64 +#define gemm sgemm64 +#elif DOUBLE_UNDERSCORE_INTEGER_64 +#define gemv sgemv__64 +#define trsv strsv__64 +#define syrk ssyrk__64 +#define trsm strsm__64 +#define sytrf ssytrf__64 +#define potrf spotrf__64 +#define gemm sgemm__64 +#elif NO_SYMBOL_INTEGER_64 +#define gemv sgemv +#define trsv strsv +#define syrk ssyrk +#define trsm strsm +#define sytrf ssytrf +#define potrf spotrf +#define gemm sgemm +#else #define gemv sgemv_64 #define trsv strsv_64 #define syrk ssyrk_64 @@ -13,6 +38,7 @@ #define sytrf ssytrf_64 #define potrf spotrf_64 #define gemm sgemm_64 +#endif #define spral_c_gemv spral_c_sgemv_64 #define spral_c_trsv spral_c_strsv_64 #define spral_c_syrk spral_c_ssyrk_64 @@ -41,11 +67,36 @@ #define spral_c_gemm spral_c_sgemm #endif #else -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define spral_kinds_precision spral_kinds_double_64 #define spral_ssids_cpu_iface_precision spral_ssids_cpu_iface_double_64 #define spral_ssids_inform_precision spral_ssids_inform_double_64 #define spral_ssids_types_precision spral_ssids_types_double_64 +#ifdef NO_UNDERSCORE_INTEGER_64 +#define gemv dgemv64 +#define trsv dtrsv64 +#define syrk dsyrk64 +#define trsm dtrsm64 +#define sytrf dsytrf64 +#define potrf dpotrf64 +#define gemm dgemm64 +#elif DOUBLE_UNDERSCORE_INTEGER_64 +#define gemv dgemv__64 +#define trsv dtrsv__64 +#define syrk dsyrk__64 +#define trsm dtrsm__64 +#define sytrf dsytrf__64 +#define potrf dpotrf__64 +#define gemm dgemm__64 +#elif NO_SYMBOL_INTEGER_64 +#define gemv dgemv +#define trsv dtrsv +#define syrk dsyrk +#define trsm dtrsm +#define sytrf dsytrf +#define potrf dpotrf +#define gemm dgemm +#else #define gemv dgemv_64 #define trsv dtrsv_64 #define syrk dsyrk_64 @@ -53,6 +104,7 @@ #define sytrf dsytrf_64 #define potrf dpotrf_64 #define gemm dgemm_64 +#endif #define spral_c_gemv spral_c_dgemv_64 #define spral_c_trsv spral_c_dtrsv_64 #define spral_c_syrk spral_c_dsyrk_64 @@ -82,7 +134,7 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define GALAHAD_BLAS_interface GALAHAD_BLAS_interface_64 #define GALAHAD_LAPACK_interface GALAHAD_LAPACK_interface_64 #endif diff --git a/src/ssids/cpu_solve.F90 b/src/ssids/cpu_solve.F90 index a82e894646..b62a8148db 100644 --- a/src/ssids/cpu_solve.F90 +++ b/src/ssids/cpu_solve.F90 @@ -1,9 +1,9 @@ -! THIS VERSION: GALAHAD 4.1 - 2023-05-20 AT 14:10 GMT. +! THIS VERSION: GALAHAD 4.3 - 2024-02-03 AT 11:40 GMT. 
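Editorial aside, not part of the patch: the macro tables above pick the mangled 64-bit BLAS symbol purely from preprocessor flags at build time, so a single gemv call site can resolve to four different library names. A minimal, self-contained C++ illustration of that selection follows; the flag names are the real macros introduced by this patch, while the program itself is hypothetical.

#include <cstdio>

// Hypothetical demo: which symbol a double-precision gemv call is mapped to.
#if defined(INTEGER_64) && defined(NO_UNDERSCORE_INTEGER_64)
#define GEMV_NAME "dgemv64"
#elif defined(INTEGER_64) && defined(DOUBLE_UNDERSCORE_INTEGER_64)
#define GEMV_NAME "dgemv__64"
#elif defined(INTEGER_64) && defined(NO_SYMBOL_INTEGER_64)
#define GEMV_NAME "dgemv"        // 64-bit library exports unsuffixed names
#elif defined(INTEGER_64)
#define GEMV_NAME "dgemv_64"     // default 64-bit suffix
#else
#define GEMV_NAME "dgemv"        // 32-bit integer build, behaviour unchanged
#endif

int main() { std::printf("gemv resolves to %s\n", GEMV_NAME); return 0; }

For example, compiling this with -DINTEGER_64 -DNO_UNDERSCORE_INTEGER_64 prints dgemv64, matching the first new branch added to cpu_iface.F90 above.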
#include "spral_procedures.h" #ifdef SPRAL_SINGLE -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define trsm strsm_64 #define trsv strsv_64 #define gemm sgemm_64 @@ -15,8 +15,7 @@ #define gemv sgemv #endif #else - -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define trsm dtrsm_64 #define trsv dtrsv_64 #define gemm dgemm_64 @@ -29,7 +28,7 @@ #endif #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #endif diff --git a/src/ssids/cpu_subtree.F90 b/src/ssids/cpu_subtree.F90 index bc97cffaff..4915de6c51 100644 --- a/src/ssids/cpu_subtree.F90 +++ b/src/ssids/cpu_subtree.F90 @@ -394,6 +394,7 @@ function factor(this, posdef, aval, child_contrib, options, inform, scaling) cscaling = C_NULL_PTR if (present(scaling)) cscaling = C_LOC(scaling) call cpu_copy_options_in(options, coptions) + cpu_factor%csubtree = & c_create_numeric_subtree(cpu_factor%posdef, this%csubtree, & aval, cscaling, contrib_ptr, coptions, cstats) diff --git a/src/ssids/dense_factor.cu b/src/ssids/dense_factor.cu index 956f7805b5..5b745d42d4 100644 --- a/src/ssids/dense_factor.cu +++ b/src/ssids/dense_factor.cu @@ -1,5 +1,6 @@ /* Copyright (c) 2013 Science and Technology Facilities Council (STFC) * Authors: Evgueni Ovtchinnikov and Jonathan Hogg + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT * * This file contains CUDA kernels for partial LL^T and LDL^T factorization * of dense submatrices. @@ -15,11 +16,11 @@ #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define multinode_chol_type multinode_chol_type_single #define multiblock_fact_type multiblock_fact_type_single #define cstat_data_type cstat_data_type_single @@ -40,7 +41,6 @@ #define spral_ssids_multiblock_llt_setup spral_ssids_multiblock_llt_setup_single #define spral_ssids_square_ldlt spral_ssids_square_ldlt_single #else -#define precision_ double #define multinode_chol_type multinode_chol_type_double #define multiblock_fact_type multiblock_fact_type_double #define cstat_data_type cstat_data_type_double @@ -77,13 +77,13 @@ using namespace spral::ssids::gpu; namespace /* anon */ { -extern __shared__ volatile precision_ SharedMemory[]; +extern __shared__ volatile rpc_ SharedMemory[]; __global__ void cu_block_ldlt_init( - const int ncols, - int *const stat, - int *const ind + const ipc_ ncols, + ipc_ *const stat, + ipc_ *const ind ) { if (threadIdx.x == 0) { stat[0] = ncols; // successful pivots @@ -95,23 +95,23 @@ cu_block_ldlt_init( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_chol_fact( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof const ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a volatile ELEMENT_TYPE *const fs // initial L factor (shared mem) ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int x; // row index + ipc_ x; // row index - for ( int tile = 0; tile < TILES; tile++ ) { + for ( ipc_ tile = 0; tile < TILES; tile++ ) { if ( tile ) { // load A's offdiagonal tiles into shared memory x = ncols + threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; // 
offdiagonal row index in A @@ -131,23 +131,23 @@ dev_init_chol_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_save_chol_fact( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof const volatile ELEMENT_TYPE *const fs, // initial L factor (shared mem) ELEMENT_TYPE *const f, // array of elements of L - const int ldf // leading dimension of f + const ipc_ ldf // leading dimension of f ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int x; // row index + ipc_ x; // row index - for ( int tile = 0; tile < TILES; tile++ ) { + for ( ipc_ tile = 0; tile < TILES; tile++ ) { if ( tile ) { // upload the relevant elements of fs to f x = ncols + threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; @@ -167,23 +167,23 @@ dev_save_chol_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_block_chol( - const int block, - const int nrows, - const int ncols, + const ipc_ block, + const ipc_ nrows, + const ipc_ ncols, const ELEMENT_TYPE *const a, - const int lda, + const ipc_ lda, ELEMENT_TYPE *const f, - const int ldf, - int *const stat + const ipc_ ldf, + ipc_ *const stat ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int ip; + ipc_ ip; ELEMENT_TYPE v; volatile ELEMENT_TYPE *const work = (volatile ELEMENT_TYPE*)SharedMemory; @@ -210,7 +210,7 @@ dev_block_chol( __syncthreads(); if ((threadIdx.y > ip) && (threadIdx.y < ncols)) { - for (int x = threadIdx.x + TILE_SIZE; x < SIZE_X; x += TILE_SIZE) + for (ipc_ x = threadIdx.x + TILE_SIZE; x < SIZE_X; x += TILE_SIZE) work[x + SIZE_X*threadIdx.y] -= work[threadIdx.y + SIZE_X*ip] * work[x + SIZE_X*ip]; if (threadIdx.x > ip) @@ -231,72 +231,72 @@ dev_block_chol( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_block_chol( - const int nrows, - const int ncols, + const ipc_ nrows, + const ipc_ ncols, const ELEMENT_TYPE *const a, - const int lda, + const ipc_ lda, ELEMENT_TYPE *const f, - const int ldf, - int *const stat + const ipc_ ldf, + ipc_ *const stat ) { dev_block_chol< ELEMENT_TYPE, TILE_SIZE, TILES > ( blockIdx.x, nrows, ncols, a, lda, f, ldf, stat ); } struct multinode_chol_type { - int nrows; - int ncols; - precision_ *lcol; + ipc_ nrows; + ipc_ ncols; + rpc_ *lcol; }; // input data type for multiblock_fact and multiblock_chol // each CUDA block gets a copy struct multiblock_fact_type { - int nrows; // no node's rows - int ncols; // no node's cols - int ld; // node's leading dimension - int p; // no rows above the pivot block - precision_ *aptr; // pointer to this node's A matrix - precision_ *ldptr; // pointer to this node's LD matrix - int offf; // this node's L offset in the array of all Ls - precision_ *dptr; // pointer to this node's D in array of all Ds - int node; // node index - int offb; // the idx of the first CUDA block processing this node + ipc_ nrows; // no node's rows + ipc_ ncols; // no node's cols + ipc_ ld; // node's leading dimension + ipc_ p; // no rows above the pivot block + rpc_ *aptr; // pointer to this node's A matrix + rpc_ *ldptr; // pointer to this node's LD matrix + ipc_ offf; 
// this node's L offset in the array of all Ls + rpc_ *dptr; // pointer to this node's D in array of all Ds + ipc_ node; // node index + ipc_ offb; // the idx of the first CUDA block processing this node }; __global__ void cu_multiblock_fact_setup( struct multinode_fact_type *ndata, struct multiblock_fact_type *const mbfdata, - const int step, - const int block_size, - const int blocks, - const int offb, - int *const stat, - int *const ind, - int *const nl + const ipc_ step, + const ipc_ block_size, + const ipc_ blocks, + const ipc_ offb, + ipc_ *const stat, + ipc_ *const ind, + ipc_ *const nl ) { ndata += blockIdx.x; - const int ncols = ndata->ncols; - const int nrows = ndata->nrows; - precision_ *const lval = ndata->lval; - precision_ *const ldval = ndata->ldval; - precision_ *const dval = ndata->dval; - int ib = ndata->ib; - int jb = ndata->jb; - int done = ndata->done; - int rght = ndata->rght; - const int lbuf = ndata->lbuf; + const ipc_ ncols = ndata->ncols; + const ipc_ nrows = ndata->nrows; + rpc_ *const lval = ndata->lval; + rpc_ *const ldval = ndata->ldval; + rpc_ *const dval = ndata->dval; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; + ipc_ done = ndata->done; + ipc_ rght = ndata->rght; + const ipc_ lbuf = ndata->lbuf; if (jb < ib) return; - const int pivoted = stat[blockIdx.x]; + const ipc_ pivoted = stat[blockIdx.x]; if (pivoted > 0) { done += pivoted; @@ -325,8 +325,8 @@ cu_multiblock_fact_setup( ndata->rght = rght; } - const int rb = nrows - done; - int cb = rght - ib + 1; + const ipc_ rb = nrows - done; + ipc_ cb = rght - ib + 1; if (cb > block_size) cb = block_size; @@ -338,14 +338,14 @@ cu_multiblock_fact_setup( if (ind && (threadIdx.x < cb) && (threadIdx.y == 0)) ind[blockIdx.x*block_size + threadIdx.x] = cb + 1; - int k = (rb - cb - 1)/(block_size*(blocks - 1)) + 1; + ipc_ k = (rb - cb - 1)/(block_size*(blocks - 1)) + 1; - __shared__ volatile int ncb; + __shared__ volatile ipc_ ncb; if ((threadIdx.x == 0) && (threadIdx.y == 0)) ncb = atomicAdd(&nl[0], k); - __shared__ volatile int iwork[9]; - __shared__ precision_ *volatile lptr, *volatile ldptr, *volatile dptr; + __shared__ volatile ipc_ iwork[9]; + __shared__ rpc_ *volatile lptr, *volatile ldptr, *volatile dptr; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { iwork[0] = cb; iwork[1] = rb; @@ -360,7 +360,7 @@ cu_multiblock_fact_setup( } __syncthreads(); - for (int i = threadIdx.y; i < k; i += blockDim.y) { + for (ipc_ i = threadIdx.y; i < k; i += blockDim.y) { switch(threadIdx.x) { case 0: mbfdata[ncb+i].ncols = iwork[0]; break; case 1: mbfdata[ncb+i].nrows = iwork[1]; break; @@ -427,28 +427,28 @@ of size 2*TILE_SIZE, initialized to 0 by this kernel. 
*/ template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_fact( - const unsigned int block, // relative CUDA block number - const int nrows, - const int ncols, - const int offp, + const uipc_ block, // relative CUDA block number + const ipc_ nrows, + const ipc_ ncols, + const ipc_ offp, const ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a volatile ELEMENT_TYPE *const fs, // initial L factor (shared mem) volatile ELEMENT_TYPE *const ds // initial D**(-1) (shared mem) ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int x, y; // position indices + ipc_ x, y; // position indices y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread if ( threadIdx.y < TILE_SIZE ) { - for ( int tile = 0; tile < TILES; tile += 2 ) { + for ( ipc_ tile = 0; tile < TILES; tile += 2 ) { if ( tile ) { // load A_u and A_l's even tiles into shared memory x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; // offdiagonal row index in A @@ -467,7 +467,7 @@ dev_init_fact( } else { // load A_u and A_l's odd tiles into shared memory - for (int tile = 1; tile < TILES; tile += 2) { + for (ipc_ tile = 1; tile < TILES; tile += 2) { x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; if (x >= offp) @@ -486,33 +486,33 @@ dev_init_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_save_fact( - const unsigned int block, - const int nrows, - const int ncols, - const int offp, - const int my, // save only if my is non-zero + const uipc_ block, + const ipc_ nrows, + const ipc_ ncols, + const ipc_ offp, + const ipc_ my, // save only if my is non-zero const volatile ELEMENT_TYPE *const fs, // L (shared mem) const volatile ELEMENT_TYPE *const fds, // L*D (shared mem) const volatile ELEMENT_TYPE *const ds, // 2 diags of D**(-1) (shared mem) ELEMENT_TYPE *const f, // L (global mem) - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // L*D (global mem) - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d // 2 diags of D**(-1) (global mem) ) { - const int SIZE_X = TILES * TILE_SIZE; + const ipc_ SIZE_X = TILES * TILE_SIZE; - int x, y; // position indices + ipc_ x, y; // position indices y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread if ( threadIdx.y < TILE_SIZE ) { // warps 0, 1 - for ( int tile = 0; tile < TILES; tile += 2 ) { + for ( ipc_ tile = 0; tile < TILES; tile += 2 ) { if ( tile ) { // upload L_u, L_l, L_u*D and L_l*D's even tiles x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; @@ -540,7 +540,7 @@ dev_save_fact( } // loop through even tiles ends here } else { // upload L_u, L_l, L_u*D and L_l*D's odd tiles (warps 2, 3) - for (int tile = 1; tile < TILES; tile += 2) { + for (ipc_ tile = 1; tile < TILES; tile += 2) { x = threadIdx.x + (tile - 1)*TILE_SIZE + (TILES - 1)*TILE_SIZE*block; if (x >= offp) // skip L_d @@ -557,20 +557,20 @@ dev_save_fact( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __device__ void dev_init_max( - const int ncols, + const ipc_ ncols, const volatile ELEMENT_TYPE *const fs, - const int mx, // this thread mask - volatile int *const mask, 
// pivot index/mask + const ipc_ mx, // this thread mask + volatile ipc_ *const mask, // pivot index/mask volatile bool *const not_max, // "not largest" flag - volatile int &jps, // the index of the largest element - volatile int &quit // pivoting failure flag + volatile ipc_ &jps, // the index of the largest element + volatile ipc_ &quit // pivoting failure flag ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; if (threadIdx.y == 0) { mask[threadIdx.x] = mx; // initialize the pivot index @@ -592,7 +592,7 @@ dev_init_max( // select the leftmost among the largest elements of the row if ((threadIdx.y == 0) && (not_max[threadIdx.x] == 0)) - atomicMin((int*)&jps, threadIdx.x); // in case of a tie, choose the leftmost + atomicMin((ipc_*)&jps, threadIdx.x); // in case of a tie, choose the leftmost __syncthreads(); } @@ -622,9 +622,9 @@ template< typename ELEMENT_TYPE > __device__ void dev_select_pivots_at_root( const ELEMENT_TYPE *const fs, - const int ld, // leading dimension of fs - int &ip, - int &jp, + const ipc_ ld, // leading dimension of fs + ipc_ &ip, + ipc_ &jp, ELEMENT_TYPE &a11, ELEMENT_TYPE &a12, ELEMENT_TYPE &a22, @@ -659,9 +659,9 @@ template< typename ELEMENT_TYPE > __device__ void dev_select_pivots( const volatile ELEMENT_TYPE *const fs, - const int ld, // leading dimension of fs - int &ip, - int &jp, + const ipc_ ld, // leading dimension of fs + ipc_ &ip, + ipc_ &jp, ELEMENT_TYPE &a11, ELEMENT_TYPE &a12, ELEMENT_TYPE &a22, @@ -697,11 +697,11 @@ dev_select_pivots( template< typename ELEMENT_TYPE > __device__ bool dev_1x1_pivot_fails( - const int x, - const int ip, + const ipc_ x, + const ipc_ ip, volatile ELEMENT_TYPE *const fs, volatile ELEMENT_TYPE *const fds, - const int ld, + const ipc_ ld, const ELEMENT_TYPE det, const ELEMENT_TYPE delta, const ELEMENT_TYPE eps @@ -731,12 +731,12 @@ dev_1x1_pivot_fails( template< typename ELEMENT_TYPE > __device__ bool dev_2x2_pivot_fails( - const int x, - const int ip, - const int jp, + const ipc_ x, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, volatile ELEMENT_TYPE *const fds, - const int ld, + const ipc_ ld, const ELEMENT_TYPE a11, const ELEMENT_TYPE a12, const ELEMENT_TYPE a22, @@ -786,16 +786,16 @@ dev_2x2_pivot_fails( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES // = 7 for a single node and = 11 for many nodes +uipc_ TILE_SIZE, +uipc_ TILES // = 7 for a single node and = 11 for many nodes > __device__ void dev_eliminate_1x1( - int &x, // row for this thread - const int y, // column for this thread - const int ip, // pivoted column + ipc_ &x, // row for this thread + const ipc_ y, // column for this thread + const ipc_ ip, // pivoted column volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE p // pivot value ) { if ( x != ip ) @@ -813,15 +813,15 @@ dev_eliminate_1x1( /* The next function eliminates the two pivoted columns from non-pivoted */ template< typename ELEMENT_TYPE, -unsigned int TILE_SIZE, unsigned int TILES > +uipc_ TILE_SIZE, uipc_ TILES > __device__ void dev_eliminate_2x2( - int &x, - const int y, - const int ip, - const int jp, + ipc_ &x, + const ipc_ y, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE pi, const ELEMENT_TYPE pj ) { @@ -839,15 +839,15 @@ dev_eliminate_2x2( /* The next function performs elimination in one tile only */ -template< typename ELEMENT_TYPE, unsigned int TILE_SIZE > +template< typename ELEMENT_TYPE, uipc_ 
TILE_SIZE > inline __device__ void dev_eliminate( - int &x, - const int y, - const int ip, - const int jp, + ipc_ &x, + const ipc_ y, + const ipc_ ip, + const ipc_ jp, volatile ELEMENT_TYPE *const fs, - const int ld, + const ipc_ ld, const ELEMENT_TYPE pi, const ELEMENT_TYPE pj ) { @@ -877,40 +877,40 @@ Called by cu_block_ldlt and cu_multiblock_ldlt factorization kernels. */ template< typename ELEMENT_TYPE, -unsigned int TILE_SIZE, unsigned int TILES > +uipc_ TILE_SIZE, uipc_ TILES > __device__ void dev_block_ldlt( - const unsigned int block, - const int nrows, // number of rows of the factorized matrix - const int ncols, // number of columns thereof - const int offp, // number of rows above the pivot block + const uipc_ block, + const ipc_ nrows, // number of rows of the factorized matrix + const ipc_ ncols, // number of columns thereof + const ipc_ offp, // number of rows above the pivot block ELEMENT_TYPE *const a, // array of elements of A - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a ELEMENT_TYPE *const f, // array of elements of the L factor - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // array of elements of L*D - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d, // array for main diagonal and subdiagonal of D const ELEMENT_TYPE delta, // pivoting threashold const ELEMENT_TYPE eps, // zero pivot threashold - int *const index, // pivot order index - int *const stat // number of successful pivots + ipc_ *const index, // pivot order index + ipc_ *const stat // number of successful pivots ) { - const int SIZE_X = TILES*TILE_SIZE; + const ipc_ SIZE_X = TILES*TILE_SIZE; - int ip, jp; // pivot row and col indices - int x, y; // position indices - int mx, my; // masks + ipc_ ip, jp; // pivot row and col indices + ipc_ x, y; // position indices + ipc_ mx, my; // masks ELEMENT_TYPE a11, a12, a22, det; // 2x2 pivot data __shared__ volatile ELEMENT_TYPE fs[SIZE_X*TILE_SIZE]; // work array for f __shared__ volatile ELEMENT_TYPE fds[SIZE_X*TILE_SIZE]; // work array for fd __shared__ volatile ELEMENT_TYPE ds[2*TILE_SIZE]; // work array for d - __shared__ volatile int mask[TILE_SIZE]; // pivot mask/index + __shared__ volatile ipc_ mask[TILE_SIZE]; // pivot mask/index __shared__ volatile bool not_max[TILE_SIZE]; // flag for finding the largest row elm - __shared__ volatile int quit; // failure flag - __shared__ volatile int jps; // pivot column index + __shared__ volatile ipc_ quit; // failure flag + __shared__ volatile ipc_ jps; // pivot column index y = threadIdx.y % TILE_SIZE; // fs & fds column processed by this thread @@ -924,7 +924,7 @@ dev_block_ldlt( dev_init_max< ELEMENT_TYPE, TILE_SIZE, TILES > ( ncols, fs, mx, mask, not_max, jps, quit ); - for ( int row = 0, pivoted = 0; row < ncols; ) { + for ( ipc_ row = 0, pivoted = 0; row < ncols; ) { // select the pivot based on the row's largest element index jps ip = row; @@ -1065,7 +1065,7 @@ dev_block_ldlt( // select leftmost largest element in the row if ( row < ncols ) { if ( threadIdx.y == 0 && not_max[threadIdx.x] == 0 ) - atomicMin((int*)&jps, threadIdx.x); // in case of a tie, choose the leftmost + atomicMin((ipc_*)&jps, threadIdx.x); // in case of a tie, choose the leftmost } } else { // do elimination in the (TILES)-th tile @@ -1096,26 +1096,26 @@ dev_block_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > 
__global__ void cu_block_ldlt( - const int nrows, // n.o. rows in A - const int ncols, // n.o. cols in A (<= TILE_SIZE) - const int offp, // n.o. rows in A_u + const ipc_ nrows, // n.o. rows in A + const ipc_ ncols, // n.o. cols in A (<= TILE_SIZE) + const ipc_ offp, // n.o. rows in A_u ELEMENT_TYPE *const a, // array of A's elements - const int lda, // leading dimension of a + const ipc_ lda, // leading dimension of a ELEMENT_TYPE *const f, // array of L's elements - const int ldf, // leading dimension of f + const ipc_ ldf, // leading dimension of f ELEMENT_TYPE *const fd, // array of (L*D)'s elements - const int ldfd, // leading dimension of fd + const ipc_ ldfd, // leading dimension of fd ELEMENT_TYPE *const d, // array of D**(-1)'s diagonal and subdiagonal elements const ELEMENT_TYPE delta, // pivoting threshold const ELEMENT_TYPE eps, // zero column threshold: // the column is zeroed if all elements are <= eps - int *const index, // pivot index (cf. permutation matrix P) - int *const stat // n.o. successful pivots + ipc_ *const index, // pivot index (cf. permutation matrix P) + ipc_ *const stat // n.o. successful pivots ) { dev_block_ldlt< ELEMENT_TYPE, TILE_SIZE, TILES > ( blockIdx.x, nrows, ncols, offp, a, lda, f, ldf, @@ -1130,8 +1130,8 @@ cu_block_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_multiblock_ldlt( @@ -1139,28 +1139,28 @@ cu_multiblock_ldlt( ELEMENT_TYPE *f, // same for L const ELEMENT_TYPE delta, // same as in cu_block_fact const ELEMENT_TYPE eps, // same as in cu_block_fact - int *const index, // array of all pivot indices - int *const stat // array of successful pivots' numbers + ipc_ *const index, // array of all pivot indices + ipc_ *const stat // array of successful pivots' numbers ) { /* * Read information on what to do from global memory */ mbfdata += blockIdx.x; // shift to the data for this CUDA block - int ncols = mbfdata->ncols; // n.o. cols in A processed by this CUDA block + ipc_ ncols = mbfdata->ncols; // n.o. cols in A processed by this CUDA block if ( ncols < 1 ) return; - int nrows = mbfdata->nrows; // n.o. rows in A - int lda = mbfdata->ld; // leading dimension of A - int p = mbfdata->p; // n.o. rows in A_u - int node = mbfdata->node; // A's number - int block = mbfdata->offb; // relative CUDA block index + ipc_ nrows = mbfdata->nrows; // n.o. rows in A + ipc_ lda = mbfdata->ld; // leading dimension of A + ipc_ p = mbfdata->p; // n.o. 
rows in A_u + ipc_ node = mbfdata->node; // A's number + ipc_ block = mbfdata->offb; // relative CUDA block index f += mbfdata->offf; // shift to the array of this L elements - precision_ *fd = mbfdata->ldptr; - precision_ *a = mbfdata->aptr; // pointer to A - precision_ *d = mbfdata->dptr; // pointer to D**(-1) + rpc_ *fd = mbfdata->ldptr; + rpc_ *a = mbfdata->aptr; // pointer to A + rpc_ *d = mbfdata->dptr; // pointer to D**(-1) - dev_block_ldlt < precision_, TILE_SIZE, TILES > + dev_block_ldlt < rpc_, TILE_SIZE, TILES > ( block, nrows, ncols, p, a, lda, f, lda, fd, lda, d, delta, eps, &index[node*TILE_SIZE], &stat[node]); } @@ -1191,26 +1191,26 @@ cu_multiblock_ldlt( template< typename ELEMENT_TYPE > __global__ void cu_square_ldlt( - const int n, + const ipc_ n, ELEMENT_TYPE *const a, // A on input, L on output ELEMENT_TYPE *const f, // L ELEMENT_TYPE *const w, // L*D ELEMENT_TYPE *const d, // main diag and subdiag of the inverse of D - const int ld, // leading dimension of a, f, w + const ipc_ ld, // leading dimension of a, f, w const ELEMENT_TYPE delta, // same as above const ELEMENT_TYPE eps, // same as above - int *const ind, // same as in cu_block_fact - int *const stat // same as in cu_block_fact + ipc_ *const ind, // same as in cu_block_fact + ipc_ *const stat // same as in cu_block_fact ) { - int x, y; - int col; - int ip, jp; - int pivoted, recent; + ipc_ x, y; + ipc_ col; + ipc_ ip, jp; + ipc_ pivoted, recent; ELEMENT_TYPE a11, a12, a22, det; volatile ELEMENT_TYPE *work = (volatile ELEMENT_TYPE*)SharedMemory; // work array - volatile int *const iwork = (volatile int*)&(work[blockDim.x]); // integer work array - volatile int *const iw = (volatile int*)&(iwork[blockDim.x]); // iw[0]: failure flag, + volatile ipc_ *const iwork = (volatile ipc_*)&(work[blockDim.x]); // integer work array + volatile ipc_ *const iw = (volatile ipc_*)&(iwork[blockDim.x]); // iw[0]: failure flag, // iw[1]: largest col. elem. index for ( x = threadIdx.x; x < n; x += blockDim.x ) { @@ -1224,7 +1224,7 @@ cu_square_ldlt( pivoted = 0; // n.o. pivoted cols - for ( int pass = 0; ; pass++ ) { // failed cols are skipped until next pass + for ( ipc_ pass = 0; ; pass++ ) { // failed cols are skipped until next pass recent = 0; // n.o. 
cols pivoted during this pass @@ -1388,26 +1388,26 @@ cu_square_ldlt( template < typename ELEMENT_TYPE, -unsigned int TILE_SIZE, -unsigned int TILES +uipc_ TILE_SIZE, +uipc_ TILES > __global__ void cu_multiblock_chol( struct multiblock_fact_type *mbfdata, ELEMENT_TYPE *f, // array of L nodes - int *stat // execution status + ipc_ *stat // execution status ) { /* * Read information on what to do from global memory */ mbfdata += blockIdx.x; - int ncols = mbfdata->ncols; + ipc_ ncols = mbfdata->ncols; if ( ncols < 1 ) return; - int nrows = mbfdata->nrows; - int ld = mbfdata->ld; - int node = mbfdata->node; - int block = mbfdata->offb; + ipc_ nrows = mbfdata->nrows; + ipc_ ld = mbfdata->ld; + ipc_ node = mbfdata->node; + ipc_ block = mbfdata->offb; ELEMENT_TYPE *const a = mbfdata->aptr; f += mbfdata->offf; @@ -1417,8 +1417,8 @@ cu_multiblock_chol( } struct cstat_data_type { - int nelim; - precision_ *dval; + ipc_ nelim; + rpc_ *dval; }; __global__ void @@ -1428,16 +1428,16 @@ cu_collect_stats( ) { // Designed to be run with a single thread csdata += blockIdx.x; - precision_ *const d = csdata->dval; - const int nelim = csdata->nelim; + rpc_ *const d = csdata->dval; + const ipc_ nelim = csdata->nelim; - int num_zero = 0; - int num_neg = 0; - int num_two = 0; + ipc_ num_zero = 0; + ipc_ num_neg = 0; + ipc_ num_two = 0; - for (int i = 0; i < nelim; ) { - const precision_ a11 = d[2*i]; - const precision_ a21 = d[2*i + 1]; + for (ipc_ i = 0; i < nelim; ) { + const rpc_ a11 = d[2*i]; + const rpc_ a21 = d[2*i + 1]; if ( a21 == 0.0 ) { // 1x1 pivot (can be a zero pivot) if ( a11 == 0 ) @@ -1448,15 +1448,15 @@ cu_collect_stats( } else { // 2x2 pivot (can't be a zero pivot) - const precision_ a22 = d[2*(i + 1)]; + const rpc_ a22 = d[2*(i + 1)]; num_two++; // To check for negative eigenvalues, we exploit // det = product of evals // trace = sum of evals // if det is negative, exactly one eval is negative; // otherwise, both have same sign, equal to sign of trace - const precision_ det = a11*a22 - a21*a21; - const precision_ trace = a11 + a22; + const rpc_ det = a11*a22 - a21*a21; + const rpc_ trace = a11 + a22; if ( det < 0 ) num_neg++; else if ( trace < 0 ) @@ -1482,64 +1482,64 @@ cu_collect_stats( extern "C" { void spral_ssids_block_ldlt( - cudaStream_t *stream, int nrows, int ncols, int p, - precision_* a, int lda, - precision_* f, int ldf, - precision_* fd, int ldfd, - precision_* d, - precision_ delta, precision_ eps, - int* index, int* stat + cudaStream_t *stream, ipc_ nrows, ipc_ ncols, ipc_ p, + rpc_* a, ipc_ lda, + rpc_* f, ipc_ ldf, + rpc_* fd, ipc_ ldfd, + rpc_* d, + rpc_ delta, rpc_ eps, + ipc_* index, ipc_* stat ) { - int nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(BLOCKS - 1)) + 1; + ipc_ nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(BLOCKS - 1)) + 1; cu_block_ldlt_init<<< 1, BLOCK_SIZE, 0, *stream >>>( ncols, stat, index ); dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); cu_block_ldlt - < precision_, BLOCK_SIZE, BLOCKS > + < rpc_, BLOCK_SIZE, BLOCKS > <<< nblocks, threads, 0, *stream >>> ( nrows, ncols, p, a, lda, f, ldf, fd, ldfd, d, delta, eps, index, stat ); } -void spral_ssids_block_llt( cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* f, int ldf, int* stat ) { - int smsize = CBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(precision_); - int nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(CBLOCKS - 1)) + 1; +void spral_ssids_block_llt( cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* f, ipc_ ldf, ipc_* stat ) { + ipc_ smsize = 
CBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(rpc_); + ipc_ nblocks = (nrows - ncols - 1)/(BLOCK_SIZE*(CBLOCKS - 1)) + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); cu_block_chol - < precision_, BLOCK_SIZE, CBLOCKS > + < rpc_, BLOCK_SIZE, CBLOCKS > <<< nblocks, threads, smsize, *stream >>> ( nrows, ncols, a, lda, f, ldf, stat ); } -void spral_ssids_collect_stats(cudaStream_t *stream, int nblk, +void spral_ssids_collect_stats(cudaStream_t *stream, ipc_ nblk, const struct cstat_data_type *csdata, struct cuda_stats *stats) { - for(int i=0; i>> (csdata+i, stats); CudaCheckError(); } } -void spral_ssids_multiblock_ldlt( cudaStream_t *stream, int nblocks, - struct multiblock_fact_type *mbfdata, precision_* f, precision_ delta, - precision_ eps, int* index, int* stat ) { +void spral_ssids_multiblock_ldlt( cudaStream_t *stream, ipc_ nblocks, + struct multiblock_fact_type *mbfdata, rpc_* f, rpc_ delta, + rpc_ eps, ipc_* index, ipc_* stat ) { dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_ldlt - < precision_, BLOCK_SIZE, MBLOCKS > + < rpc_, BLOCK_SIZE, MBLOCKS > <<< nb, threads, 0, *stream >>> ( mbfdata + i, f, delta, eps, index, stat ); } } -void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, int nblocks, +void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, ipc_ nblocks, struct multinode_fact_type *ndata, struct multiblock_fact_type *mbfdata, - int step, int block_size, int blocks, int* stat, int* ind, int* ncb ) { + ipc_ step, ipc_ block_size, ipc_ blocks, ipc_* stat, ipc_* ind, ipc_* ncb ) { dim3 threads(10,8); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_fact_setup <<< nb, threads, 0, *stream >>> ( ndata + i, mbfdata, step, block_size, blocks, @@ -1547,28 +1547,28 @@ void spral_ssids_multiblock_ldlt_setup( cudaStream_t *stream, int nblocks, } } -void spral_ssids_multiblock_llt( cudaStream_t *stream, int nblocks, - struct multiblock_fact_type *mbfdata, precision_* f, int* stat ) { +void spral_ssids_multiblock_llt( cudaStream_t *stream, ipc_ nblocks, + struct multiblock_fact_type *mbfdata, rpc_* f, ipc_* stat ) { if ( nblocks < 1 ) return; - int smsize = MCBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(precision_); + ipc_ smsize = MCBLOCKS*BLOCK_SIZE*BLOCK_SIZE*sizeof(rpc_); dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_chol - < precision_, BLOCK_SIZE, MCBLOCKS > + < rpc_, BLOCK_SIZE, MCBLOCKS > <<< nb, threads, smsize, *stream >>> ( mbfdata + i, f, stat ); } } -void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, int nblocks, +void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, ipc_ nblocks, struct multinode_fact_type *ndata, struct multiblock_fact_type *mbfdata, - int step, int block_size, int blocks, int* stat, int* ncb ) { + ipc_ step, ipc_ block_size, ipc_ blocks, ipc_* stat, ipc_* ncb ) { dim3 threads(16,8); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = 
min(MAX_CUDA_BLOCKS, nblocks - i); cu_multiblock_fact_setup <<< nb, threads, 0, *stream >>> ( ndata + i, mbfdata, step, block_size, blocks, i, stat + i, 0, ncb ); @@ -1577,20 +1577,20 @@ void spral_ssids_multiblock_llt_setup( cudaStream_t *stream, int nblocks, void spral_ssids_square_ldlt( cudaStream_t *stream, - int n, - precision_* a, - precision_* f, - precision_* w, - precision_* d, - int ld, - precision_ delta, precision_ eps, - int* index, - int* stat + ipc_ n, + rpc_* a, + rpc_* f, + rpc_* w, + rpc_* d, + ipc_ ld, + rpc_ delta, rpc_ eps, + ipc_* index, + ipc_* stat ) { - int nt = min(n, 256); - int sm = nt*sizeof(precision_) + (nt + 2)*sizeof(int); - cu_square_ldlt< precision_ ><<< 1, nt, sm, *stream >>> + ipc_ nt = min(n, 256); + ipc_ sm = nt*sizeof(rpc_) + (nt + 2)*sizeof(ipc_); + cu_square_ldlt< rpc_ ><<< 1, nt, sm, *stream >>> ( n, a, f, w, d, ld, delta, eps, index, stat ); } diff --git a/src/ssids/fkeep.F90 b/src/ssids/fkeep.F90 index 11f89a72fe..af79d56b73 100644 --- a/src/ssids/fkeep.F90 +++ b/src/ssids/fkeep.F90 @@ -75,7 +75,6 @@ subroutine inner_factor_cpu(fkeep, akeep, val, options, inform) logical :: abort, all_region type(contrib_type), dimension(:), allocatable :: child_contrib type(ssids_inform), dimension(:), allocatable :: thread_inform - #ifdef PROFILE ! Begin profile trace (noop if not enabled) call profile_begin(akeep%topology) diff --git a/src/ssids/ldlt_app.cxx b/src/ssids/ldlt_app.cxx index bcab9225aa..8654f3391c 100644 --- a/src/ssids/ldlt_app.cxx +++ b/src/ssids/ldlt_app.cxx @@ -1,7 +1,6 @@ -/** \file - * \copyright 2016 The Science and Technology Facilities Council (STFC) - * \licence BSD licence, see LICENCE file for details - * \author Jonathan Hogg +/** \file \copyright 2016 The Science and Technology Facilities Council + * (STFC) \licence BSD licence, see LICENCE file for details \author + * Jonathan Hogg \version GALAHAD 4.3 - 2024-02-03 AT 61:10 GMT */ #include "ssids_cpu_kernels_ldlt_app.hxx" @@ -34,16 +33,14 @@ #include "ssids_cpu_kernels_wrappers.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define ldlt_app_internal ldlt_app_internal_sgl #define ldlt_app_factor_mem_required ldlt_app_factor_mem_required_sgl #else -#define precision_ double #define ldlt_app_internal ldlt_app_internal_dbl #define ldlt_app_factor_mem_required ldlt_app_factor_mem_required_dbl #endif -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define host_trsv host_trsv_64 #define host_trsm host_trsm_64 @@ -54,15 +51,15 @@ namespace spral { namespace ssids { namespace cpu { namespace ldlt_app_internal { -static const int INNER_BLOCK_SIZE = 32; +static const ipc_ INNER_BLOCK_SIZE = 32; /** \return number of blocks for given n */ -inline int calc_nblk(int n, int block_size) { +inline ipc_ calc_nblk(ipc_ n, ipc_ block_size) { return (n-1) / block_size + 1; } /** \return block size of block blk if maximum in dimension is n */ -inline int calc_blkn(int blk, int n, int block_size) { +inline ipc_ calc_blkn(ipc_ blk, ipc_ n, ipc_ block_size) { return std::min(block_size, n-blk*block_size); } @@ -74,7 +71,7 @@ template class Column { public: bool first_elim; ///< True if first column with eliminations - int nelim; ///< Number of eliminated entries in this column + ipc_ nelim; ///< Number of eliminated entries in this column T *d; ///< Pointer to local d // \{ @@ -86,7 +83,7 @@ class Column { /** \brief Initialize number of passed columns ready for reduction * \param passed number of variables passing a posteori pivot test in block */ - void init_passed(int 
passed) { + void init_passed(ipc_ passed) { spral::omp::AcquiredLock scopeLock(lock_); npass_ = passed; } @@ -94,7 +91,7 @@ class Column { * \details Aquires a lock before doing a minimum reduction across blocks * \param passed number of variables passing a posteori pivot test in block */ - void update_passed(int passed) { + void update_passed(ipc_ passed) { spral::omp::AcquiredLock scopeLock(lock_); npass_ = std::min(npass_, passed); } @@ -107,7 +104,7 @@ class Column { * sucessful columns in the case of a global cancellation. * \param passed number of pivots that succeeded for a block * \returns true if passed < nelim */ - bool test_fail(int passed) { + bool test_fail(ipc_ passed) { bool fail = (passed < nelim); if(!fail) { // Record number of blocks in column passing this test @@ -125,7 +122,7 @@ class Column { * a variable, and sets nelim for this column. * \param next_elim global number of eliminated pivots to be updated based * on number eliminated in this column. */ - void adjust(int& next_elim) { + void adjust(ipc_& next_elim) { // Test if last passed column was first part of a 2x2: if so, // decrement npass spral::omp::AcquiredLock scopeLock(lock_); @@ -156,25 +153,25 @@ class Column { * \internal Note that there is no need to consider a similar operation for * d[] as it is only used for eliminated variables. */ - void move_back(int n, int const* perm, int* elim_perm, int* failed_perm) { + void move_back(ipc_ n, ipc_ const* perm, ipc_* elim_perm, ipc_* failed_perm) { if(perm != elim_perm) { // Don't move if memory is identical - for(int i=0; i& operator[](int idx) { return cdata_[idx]; } + Column& operator[](ipc_ idx) { return cdata_[idx]; } /** \brief Return local permutation pointer for given column * \param blk block column * \return pointer to local permutation */ - int* get_lperm(int blk) { return &lperm_[blk*block_size_]; } + ipc_* get_lperm(ipc_ blk) { return &lperm_[blk*block_size_]; } /** \brief Calculate number of eliminated columns in unpivoted case * \param m number of rows in matrix * \return number of sucesfully eliminated columns */ - int calc_nelim(int m) const { - int mblk = calc_nblk(m, block_size_); - int nblk = calc_nblk(n_, block_size_); - int nelim = 0; - for(int j=0; j *cdata_; ///< underlying array of columns - int* lperm_; ///< underlying local permutation + ipc_* lperm_; ///< underlying local permutation }; /** Returns true if ptr is suitably aligned for AVX, false if not */ bool is_aligned(void* ptr) { #if defined(__AVX512F__) - const int align = 64; + const ipc_ align = 64; #elif defined(__AVX__) - const int align = 32; + const ipc_ align = 32; #else - const int align = 16; + const ipc_ align = 16; #endif return (reinterpret_cast(ptr) % align == 0); } @@ -271,10 +268,10 @@ bool is_aligned(void* ptr) { * within diagonal block. * Note that out and aval may overlap. 
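Editorial aside, not part of the patch: like the CUDA sources earlier in this diff, this file drops its local precision_ macro and relies on the shared ipc_/uipc_/longc_/rpc_ aliases, presumably supplied by the ssids_rip.hxx header that the other files now include. The patch never shows their definitions; a minimal sketch of what such a header would provide, with every detail below an assumption, is:

#include <cstdint>

// Assumed sketch only -- the real ssids_rip.hxx may differ.
#ifdef INTEGER_64
typedef int64_t      ipc_;    // integers that follow the 64-bit API
typedef uint64_t     uipc_;   // unsigned counterpart
#else
typedef int          ipc_;    // default 32-bit integer API
typedef unsigned int uipc_;
#endif
typedef int64_t      longc_;  // replaces C/C++ "long" (assumed 64-bit in both builds)

#ifdef SPRAL_SINGLE
typedef float        rpc_;    // working real precision, single-precision build
#else
typedef double       rpc_;    // working real precision, double-precision build
#endif

This matches how the substitutions read throughout the patch: int becomes ipc_, unsigned int becomes uipc_, long becomes longc_, and the old precision_ (float or double depending on SPRAL_SINGLE) becomes rpc_.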
*/ template -void move_up_diag(Column const& idata, Column const& jdata, T* out, T const* aval, int lda) { +void move_up_diag(Column const& idata, Column const& jdata, T* out, T const* aval, ipc_ lda) { if(out == aval) return; // don't bother moving if memory is the same - for(int j=0; j -void move_up_rect(int m, int rfrom, Column const& jdata, T* out, T const* aval, int lda) { +void move_up_rect(ipc_ m, ipc_ rfrom, Column const& jdata, T* out, T const* aval, ipc_ lda) { if(out == aval) return; // don't bother moving if memory is the same - for(int j=0; j -void copy_failed_diag(int m, int n, Column const& idata, Column const& jdata, T* rout, T* cout, T* dout, int ldout, T const* aval, int lda) { +void copy_failed_diag(ipc_ m, ipc_ n, Column const& idata, Column const& jdata, T* rout, T* cout, T* dout, ipc_ ldout, T const* aval, ipc_ lda) { /* copy rows */ - for(int j=0; j -void copy_failed_rect(int m, int n, int rfrom, Column const& jdata, T* cout, int ldout, T const* aval, int lda) { - for(int j=jdata.nelim, jout=0; j -int check_threshold(int rfrom, int rto, int cfrom, int cto, T u, T* aval, int lda) { +ipc_ check_threshold(ipc_ rfrom, ipc_ rto, ipc_ cfrom, ipc_ cto, T u, T* aval, ipc_ lda) { // Perform threshold test for each uneliminated row/column - int least_fail = (op==OP_N) ? cto : rto; - for(int j=cfrom; j 1.0/u) { if(op==OP_N) { // must be least failed col @@ -345,24 +342,24 @@ int check_threshold(int rfrom, int rto, int cfrom, int cto, T u, T* aval, int ld * 1x1 ( 0 ) stored as d = [ 0.0 0.0 ] */ template -void apply_pivot(int m, int n, int from, const T *diag, const T *d, - const T small, T* aval, int lda) { +void apply_pivot(ipc_ m, ipc_ n, ipc_ from, const T *diag, const T *d, + const T small, T* aval, ipc_ lda) { if(op==OP_N && from > m) return; // no-op if(op==OP_T && from > n) return; // no-op - precision_ one_val = 1.0; + rpc_ one_val = 1.0; if(op==OP_N) { // Perform solve L_11^-T host_trsm(SIDE_RIGHT, FILL_MODE_LWR, OP_T, DIAG_UNIT, m, n, one_val, diag, lda, aval, lda); // Perform solve L_21 D^-1 - for(int i=0; i(SIDE_LEFT, FILL_MODE_LWR, OP_N, DIAG_UNIT, m, n-from, one_val, diag, lda, &aval[from*lda], lda); // Perform solve D^-T L_21^T - for(int i=0; i(m_)), acopy_(alloc_.allocate(n_*ldcopy_)) @@ -484,7 +481,7 @@ class CopyBackup { * \param iblk row index of block. * \param jblk column index of block. */ - void release(int iblk, int jblk) { /* no-op */ } + void release(ipc_ iblk, ipc_ jblk) { /* no-op */ } /** \brief Create a restore point for the given block. * \param iblk row index of block. @@ -492,10 +489,10 @@ class CopyBackup { * \param aval pointer to block to be stored. * \param lda leading dimension of aval. */ - void create_restore_point(int iblk, int jblk, T const* aval, int lda) { + void create_restore_point(ipc_ iblk, ipc_ jblk, T const* aval, ipc_ lda) { T* lwork = get_lwork(iblk, jblk); - for(int j=0; jc) ? lwork[c*ldcopy_+r] : lwork[r*ldcopy_+c]; } - for(int i=get_ncol(jblk); ic) ? lwork[c*block_size_+r] : lwork[r*block_size_+c]; } - for(int i=get_ncol(jblk); i pool_; ///< pool of blocks std::vector ptr_; ///< map from pointer matrix entry to block }; template +template class Block { public: /** \brief Constuctor. @@ -808,8 +805,8 @@ class Block { * \param lda Leading dimension of a. * \param block_size The block size. 
*/ - Block(int i, int j, int m, int n, ColumnData& cdata, T* a, - int lda, int block_size) + Block(ipc_ i, ipc_ j, ipc_ m, ipc_ n, ColumnData& cdata, T* a, + ipc_ lda, ipc_ block_size) : i_(i), j_(j), m_(m), n_(n), lda_(lda), block_size_(block_size), cdata_(cdata), aval_(&a[j*block_size*lda+i*block_size]) {} @@ -838,19 +835,19 @@ class Block { * \param work Thread-specific workspace. */ void apply_rperm(Workspace& work) { - int ldl = align_lda(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(i_); + ipc_* lperm = cdata_.get_lperm(i_); // Copy into lwork with permutation - for(int j=0; j(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(i_); + ipc_* lperm = cdata_.get_lperm(i_); // Copy into lwork with permutation - for(int j=0; j(block_size_); + ipc_ ldl = align_lda(block_size_); T* lwork = work.get_ptr(ncol()*ldl); - int* lperm = cdata_.get_lperm(j_); + ipc_* lperm = cdata_.get_lperm(j_); // Copy into lwork with permutation - for(int j=0; j - void restore_if_required(Backup& backup, int elim_col) { + void restore_if_required(Backup& backup, ipc_ elim_col) { if(i_ == elim_col && j_ == elim_col) { // In eliminated diagonal block if(cdata_[i_].nelim < ncol()) { // If there are failed pivots backup.restore_part_with_sym_perm( @@ -945,7 +942,7 @@ class Block { } else if(j_ == elim_col) { // In eliminated col if(cdata_[j_].nelim < ncol()) { // If there are failed pivots - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; + ipc_ rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; backup.restore_part(i_, j_, rfrom, cdata_[j_].nelim, aval_, lda_); } // Release resources regardless, no longer required @@ -978,13 +975,13 @@ class Block { * LDLT::factor(). */ template - int factor(int next_elim, int* perm, T* d, + ipc_ factor(ipc_ next_elim, ipc_* perm, T* d, struct cpu_factor_options const &options, std::vector& work, Allocator const& alloc) { if(i_ != j_) throw std::runtime_error("factor called on non-diagonal block!"); - int* lperm = cdata_.get_lperm(i_); - for(int i=0; i(ncol()); - int* blkperm = &perm[i_*block_size_]; - for(int i=0; i(ncol()); + ipc_* blkperm = &perm[i_*block_size_]; + for(ipc_ i=0; i(ncol()); - int* blkperm = &perm[i_*INNER_BLOCK_SIZE]; - for(int i=0; i(ncol()); + ipc_* blkperm = &perm[i_*INNER_BLOCK_SIZE]; + for(ipc_ i=0; i( INNER_BLOCK_SIZE*INNER_BLOCK_SIZE ); @@ -1057,7 +1054,7 @@ class Block { * \param small The drop tolerance for zero testing. * \returns Number of successful pivots in this block. */ - int apply_pivot_app(Block const& dblk, T u, T small) { + ipc_ apply_pivot_app(Block const& dblk, T u, T small) { if(i_ == j_) throw std::runtime_error("apply_pivot called on diagonal block!"); if(i_ == dblk.i_) { // Apply within row (ApplyT) @@ -1098,16 +1095,16 @@ class Block { * \param ldupd Leading dimension of upd. */ void update(Block const& isrc, Block const& jsrc, Workspace& work, - precision_ beta=1.0, T* upd=nullptr, int ldupd=0) { + rpc_ beta=1.0, T* upd=nullptr, ipc_ ldupd=0) { if(isrc.i_ == i_ && isrc.j_ == jsrc.j_) { // Update to right of elim column (UpdateN) - int elim_col = isrc.j_; + ipc_ elim_col = isrc.j_; if(cdata_[elim_col].nelim == 0) return; // nothing to do - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; - int cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; - int ldld = align_lda(block_size_); - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + ipc_ rfrom = (i_ <= elim_col) ? 
cdata_[i_].nelim : 0; + ipc_ cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; + ipc_ ldld = align_lda(block_size_); + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; T* ld = work.get_ptr(block_size_*ldld); // NB: we use ld[rfrom] below so alignment matches that of aval[rfrom] calcLD( @@ -1121,7 +1118,7 @@ class Block { ); if(upd && j_==calc_nblk(n_,block_size_)-1) { // Handle fractional part of upd that "belongs" to this block - int u_ncol = std::min(block_size_-ncol(), m_-n_); // ncol for upd + ipc_ u_ncol = std::min(block_size_-ncol(), m_-n_); // ncol for upd beta = (cdata_[elim_col].first_elim) ? beta : 1.0; // user beta only on first update if(i_ == j_) { // diagonal block @@ -1144,11 +1141,11 @@ class Block { } } else { // Update to left of elim column (UpdateT) - int elim_col = jsrc.i_; + ipc_ elim_col = jsrc.i_; if(cdata_[elim_col].nelim == 0) return; // nothing to do - int rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; - int cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; - int ldld = align_lda(block_size_); + ipc_ rfrom = (i_ <= elim_col) ? cdata_[i_].nelim : 0; + ipc_ cfrom = (j_ <= elim_col) ? cdata_[j_].nelim : 0; + ipc_ ldld = align_lda(block_size_); T* ld = work.get_ptr(block_size_*ldld); // NB: we use ld[rfrom] below so alignment matches that of aval[rfrom] if(isrc.j_==elim_col) { @@ -1164,8 +1161,8 @@ class Block { cdata_[elim_col].d, &ld[rfrom], ldld ); } - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; host_gemm( OP_N, OP_N, nrow()-rfrom, ncol()-cfrom, cdata_[elim_col].nelim, minus_one_val, &ld[rfrom], ldld, &jsrc.aval_[cfrom*lda_], lda_, @@ -1189,9 +1186,9 @@ class Block { * \param upd_ij pointer to \f$ U_{ij} \f$ values to be updated. * \param ldupd leading dimension of upd_ij. */ - void form_contrib(Block const& isrc, Block const& jsrc, Workspace& work, precision_ beta, T* upd_ij, int ldupd) { - int elim_col = isrc.j_; - int ldld = align_lda(block_size_); + void form_contrib(Block const& isrc, Block const& jsrc, Workspace& work, rpc_ beta, T* upd_ij, ipc_ ldupd) { + ipc_ elim_col = isrc.j_; + ipc_ ldld = align_lda(block_size_); T* ld = work.get_ptr(block_size_*ldld); calcLD( nrow(), cdata_[elim_col].nelim, isrc.aval_, lda_, @@ -1199,8 +1196,8 @@ class Block { ); // User-supplied beta only on first update; otherwise 1.0 T rbeta = (cdata_[elim_col].first_elim) ? beta : 1.0; - int blkn = get_nrow(j_); // nrow not ncol as we're on contrib - precision_ minus_one_val = - 1.0; + ipc_ blkn = get_nrow(j_); // nrow not ncol as we're on contrib + rpc_ minus_one_val = - 1.0; host_gemm( OP_N, OP_T, nrow(), blkn, cdata_[elim_col].nelim, minus_one_val, ld, ldld, jsrc.aval_, lda_, @@ -1213,11 +1210,11 @@ class Block { * for elimination. Entries in that block row/column marked as * failed are ignored. */ - bool isnan(int elim_col=-1) const { - int m = (i_==elim_col) ? cdata_[i_].get_npass() : nrow(); - int n = (j_==elim_col) ? cdata_[j_].get_npass() : ncol(); - for(int j=0; j& cdata_; ///< global column data array T* aval_; ///< pointer to underlying matrix storage }; @@ -1278,7 +1275,7 @@ class Block { * \tparam Allocator allocator to use for internal memory allocations */ template class LDLT { /// \{ - typedef typename std::allocator_traits::template rebind_alloc IntAlloc; + typedef typename std::allocator_traits::template rebind_alloc IntAlloc; typedef typename std::allocator_traits::template rebind_alloc TAlloc; /// \} private: /** Performs LDL^T factorization with block pivoting. 
Detects failure * and aborts only column if an a posteori pivot test fails. */ static - int run_elim_pivoted(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - struct cpu_factor_options const& options, int const block_size, - T const beta, T* upd, int const ldupd, std::vector& work, - Allocator const& alloc, int const from_blk=0) { + ipc_ run_elim_pivoted(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + struct cpu_factor_options const& options, ipc_ const block_size, + T const beta, T* upd, ipc_ const ldupd, std::vector& work, + Allocator const& alloc, ipc_ const from_blk=0) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY PIV %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = from_blk*block_size; - int flag; + ipc_ next_elim = from_blk*block_size; + ipc_ flag; #pragma omp atomic write flag = 0; @@ -1316,7 +1313,7 @@ class LDLT { abort = false; #pragma omp taskgroup - for (int blk = from_blk; blk < nblk; blk++) { + for (ipc_ blk = from_blk; blk < nblk; blk++) { /*if(debug) { printf("Bcol %d:\n", blk); print_mat(mblk, nblk, m, n, blkdata, cdata, lda); @@ -1339,12 +1336,16 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_DIAG"); #endif +#ifdef INTEGER_64 + if (debug) printf("Factor(%ld)\n", blk); +#else if (debug) printf("Factor(%d)\n", blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); // Store a copy for recovery in case of a failed column dblk.backup(backup); // Perform actual factorization - int nelim = dblk.template factor(next_elim, perm, d, + ipc_ nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); if (nelim < 0) { #pragma omp atomic write @@ -1387,7 +1388,7 @@ class LDLT { } } /* task/abort */ // Loop over off-diagonal blocks applying pivot - for(int jblk = 0; jblk < blk; jblk++) { + for(ipc_ jblk = 0; jblk < blk; jblk++) { #pragma omp task \ firstprivate(blk, jblk) \ shared(a, abort, backup, cdata, options) \ @@ -1403,7 +1404,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyT(%ld,%ld)\n", blk, jblk); +#else if (debug) printf("ApplyT(%d,%d)\n", blk, jblk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec cblk(blk, jblk, m, n, cdata, a, lda, block_size); // Apply row permutation from factorization of dblk and in @@ -1412,7 +1417,7 @@ class LDLT { cblk.apply_rperm_and_backup(backup); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = cblk.apply_pivot_app(dblk, options.u, + ipc_ blkpass = cblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count cdata[blk].update_passed(blkpass); @@ -1421,7 +1426,7 @@ class LDLT { #endif } } /* task/abort */ } - for (int iblk = blk + 1; iblk < mblk; iblk++) { + for (ipc_ iblk = blk + 1; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk) \ shared(a, abort, backup, cdata, options) \ @@ -1437,7 +1442,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyN(%ld,%ld)\n", iblk, blk); +#else if (debug) printf("ApplyN(%d,%d)\n", iblk, blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); 
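// NB: when INTEGER_64 is defined, block indices such as blk and jblk are 64-bit
// (long) values, which is why the debug printf calls introduced above switch to
// the %ld format under that symbol; passing a 64-bit value to a plain %d
// conversion would be undefined behaviour, and %d is kept only for the default
// 32-bit integer build.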
BlockSpec rblk(iblk, blk, m, n, cdata, a, lda, block_size); // Apply column permutation from factorization of dblk and in @@ -1446,7 +1455,7 @@ class LDLT { rblk.apply_cperm_and_backup(backup); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = rblk.apply_pivot_app(dblk, options.u, + ipc_ blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count cdata[blk].update_passed(blkpass); @@ -1471,7 +1480,11 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_ADJUST"); #endif +#ifdef INTEGER_64 + if (debug) printf("Adjust(%ld)\n", blk); +#else if (debug) printf("Adjust(%d)\n", blk); +#endif cdata[blk].adjust(next_elim); #ifdef PROFILE task.done(); @@ -1479,11 +1492,11 @@ class LDLT { } } /* task/abort */ // Update uneliminated columns - for (int jblk = 0; jblk < blk; jblk++) { - for (int iblk = jblk; iblk < mblk; iblk++) { + for (ipc_ jblk = 0; jblk < blk; jblk++) { + for (ipc_ iblk = jblk; iblk < mblk; iblk++) { // Calculate block index we depend on for i // (we only work with lower half of matrix) - int adep_idx = (blk < iblk) ? blk*block_size*lda + iblk*block_size + ipc_ adep_idx = (blk < iblk) ? blk*block_size*lda + iblk*block_size : iblk*block_size*lda + blk*block_size; #pragma omp task \ firstprivate(blk, iblk, jblk) \ @@ -1501,11 +1514,15 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateT(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateT(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); - int isrc_row = (blk<=iblk) ? iblk : blk; - int isrc_col = (blk<=iblk) ? blk : iblk; + ipc_ isrc_row = (blk<=iblk) ? iblk : blk; + ipc_ isrc_col = (blk<=iblk) ? 
blk : iblk; BlockSpec isrc(isrc_row, isrc_col, m, n, cdata, a, lda, block_size); BlockSpec jsrc(blk, jblk, m, n, cdata, a, lda, block_size); @@ -1520,8 +1537,8 @@ class LDLT { } } /* task/abort */ } } - for(int jblk = blk; jblk < nblk; jblk++) { - for(int iblk = jblk; iblk < mblk; iblk++) { + for(ipc_ jblk = blk; jblk < nblk; jblk++) { + for(ipc_ iblk = jblk; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk, jblk) \ shared(a, abort, cdata, backup, work, upd) \ @@ -1538,8 +1555,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateN(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateN(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1557,10 +1578,10 @@ class LDLT { // Handle update to contribution block, if required if (upd && (mblk > nblk)) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk = nblk; jblk < mblk; ++jblk) - for(int iblk = jblk; iblk < mblk; ++iblk) { + for(ipc_ jblk = nblk; jblk < mblk; ++jblk) + for(ipc_ iblk = jblk; iblk < mblk; ++iblk) { T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; #pragma omp task \ firstprivate(iblk, jblk, blk, upd_ij) \ @@ -1578,8 +1599,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDC"); #endif +#ifdef INTEGER_64 + if (debug) printf("FormContrib(%ld,%ld,%ld)\n", iblk,jblk,blk); +#else if (debug) printf("FormContrib(%d,%d,%d)\n", iblk,jblk,blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1592,7 +1617,7 @@ class LDLT { } } } // taskgroup and for - int my_flag; + ipc_ my_flag; #pragma omp atomic read my_flag = flag; if (my_flag < 0) return my_flag; // Error @@ -1609,23 +1634,23 @@ class LDLT { * and aborts only column if an a posteori pivot test fails. * Serial version without tasks. 
*/ static - int run_elim_pivoted_notasks(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - struct cpu_factor_options const& options, int const block_size, - T const beta, T* upd, int const ldupd, std::vector& work, - Allocator const& alloc, int const from_blk=0) { + ipc_ run_elim_pivoted_notasks(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + struct cpu_factor_options const& options, ipc_ const block_size, + T const beta, T* upd, ipc_ const ldupd, std::vector& work, + Allocator const& alloc, ipc_ const from_blk=0) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY PIV %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = from_blk*block_size; + ipc_ next_elim = from_blk*block_size; /* Inner loop - iterate over block columns */ try { - for(int blk=from_blk; blk( + ipc_ nelim = dblk.template factor( next_elim, perm, d, options, work, alloc ); if(nelim<0) return nelim; @@ -1647,8 +1676,12 @@ class LDLT { } // Loop over off-diagonal blocks applying pivot - for(int jblk=0; jblknblk) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk=nblk; jblk& cdata, Backup& backup, - int* up_to_date, struct cpu_factor_options const& options, - int const block_size, T const beta, T* upd, int const ldupd, + ipc_ run_elim_unpivoted(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_* up_to_date, struct cpu_factor_options const& options, + ipc_ const block_size, T const beta, T* upd, ipc_ const ldupd, std::vector& work, Allocator const& alloc) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = 0; - int flag; + ipc_ next_elim = 0; + ipc_ flag; #pragma omp atomic write flag = 0; @@ -1779,7 +1833,7 @@ class LDLT { #pragma omp atomic write abort = false; #pragma omp taskgroup - for(int blk = 0; blk < nblk; blk++) { + for(ipc_ blk = 0; blk < nblk; blk++) { /*if(debug) { printf("Bcol %d:\n", blk); print_mat(mblk, nblk, m, n, blkdata, cdata, lda); @@ -1801,14 +1855,18 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_DIAG"); #endif +#ifdef INTEGER_64 + if(debug) printf("Factor(%ld)\n", blk); +#else if(debug) printf("Factor(%d)\n", blk); +#endif BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); // On first access to this block, store copy in case of failure if (blk == 0) dblk.backup(backup); // Record block state as assuming we've done up to col blk up_to_date[blk*mblk+blk] = blk; // Perform actual factorization - int nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); + ipc_ nelim = dblk.template factor(next_elim, perm, d, options, work, alloc); if (nelim < get_ncol(blk, n, block_size)) { cdata[blk].init_passed(0); // diagonal block has NOT passed #ifdef _OPENMP @@ -1850,7 +1908,7 @@ class LDLT { } } /* task/abort */ // Loop over off-diagonal blocks applying pivot - for (int jblk = 0; jblk < blk; jblk++) { + for (ipc_ jblk = 0; jblk < blk; jblk++) { #pragma omp 
task \ firstprivate(blk, jblk) \ shared(a, abort, backup, cdata, options, work, up_to_date) \ @@ -1865,8 +1923,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyT(%ld,%ld)\n", blk, jblk); +#else if (debug) printf("ApplyT(%d,%d)\n", blk, jblk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec cblk(blk, jblk, m, n, cdata, a, lda, block_size); // Record block state as assuming we've done up to col blk @@ -1880,7 +1942,7 @@ class LDLT { #endif } } /* task/abort */ } - for (int iblk = blk+1; iblk < mblk; iblk++) { + for (ipc_ iblk = blk+1; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk) \ shared(a, abort, backup, cdata, options, work, up_to_date) \ @@ -1895,8 +1957,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_APPLY"); #endif +#ifdef INTEGER_64 + if (debug) printf("ApplyN(%ld,%ld)\n", iblk, blk); +#else if (debug) printf("ApplyN(%d,%d)\n", iblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec dblk(blk, blk, m, n, cdata, a, lda, block_size); BlockSpec rblk(iblk, blk, m, n, cdata, a, lda, block_size); // On first access to this block, store copy in case of failure @@ -1907,7 +1973,7 @@ class LDLT { rblk.apply_cperm(work[thread_num]); // Perform elimination and determine number of rows in block // passing a posteori threshold pivot test - int blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); + ipc_ blkpass = rblk.apply_pivot_app(dblk, options.u, options.small); // Update column's passed pivot count if (cdata[blk].test_fail(blkpass)) { #ifdef _OPENMP @@ -1926,9 +1992,9 @@ class LDLT { // Update uneliminated columns // Column blk only needed if upd is present - int jsa = (upd) ? blk : blk + 1; - for(int jblk = jsa; jblk < nblk; jblk++) { - for(int iblk = jblk; iblk < mblk; iblk++) { + ipc_ jsa = (upd) ? 
blk : blk + 1; + for(ipc_ jblk = jsa; jblk < nblk; jblk++) { + for(ipc_ iblk = jblk; iblk < mblk; iblk++) { #pragma omp task \ firstprivate(blk, iblk, jblk) \ shared(a, abort, cdata, backup, work, upd, up_to_date) \ @@ -1944,8 +2010,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDA"); #endif +#ifdef INTEGER_64 + if (debug) printf("UpdateN(%ld,%ld,%ld)\n", iblk, jblk, blk); +#else if (debug) printf("UpdateN(%d,%d,%d)\n", iblk, jblk, blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -1964,10 +2034,10 @@ class LDLT { // Handle update to contribution block, if required if (upd && (mblk > nblk)) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk = nblk; jblk < mblk; ++jblk) - for(int iblk = jblk; iblk < mblk; ++iblk) { + for(ipc_ jblk = nblk; jblk < mblk; ++jblk) + for(ipc_ iblk = jblk; iblk < mblk; ++iblk) { T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; #pragma omp task \ firstprivate(iblk, jblk, blk, upd_ij) \ @@ -1984,8 +2054,12 @@ class LDLT { #ifdef PROFILE Profile::Task task("TA_LDLT_UPDC"); #endif +#ifdef INTEGER_64 + if (debug) printf("FormContrib(%ld,%ld,%ld)\n", iblk, jblk,blk); +#else if (debug) printf("FormContrib(%d,%d,%d)\n", iblk, jblk,blk); - int thread_num = omp_get_thread_num(); +#endif + ipc_ thread_num = omp_get_thread_num(); BlockSpec ublk(iblk, jblk, m, n, cdata, a, lda, block_size); BlockSpec isrc(iblk, blk, m, n, cdata, a, lda, block_size); BlockSpec jsrc(jblk, blk, m, n, cdata, a, lda, block_size); @@ -2007,7 +2081,7 @@ class LDLT { print_mat(mblk, nblk, m, n, blkdata, cdata, lda); }*/ - int my_flag; + ipc_ my_flag; #pragma omp atomic read my_flag = flag; if (my_flag < 0) return my_flag; @@ -2017,22 +2091,22 @@ class LDLT { /** Performs LDL^T factorization assuming everything works. Detects failure * and aborts entire thing if a posteori pivot test fails. 
*/ static - int run_elim_unpivoted_notasks(int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - int* up_to_date, struct cpu_factor_options const& options, - int const block_size, T const beta, T* upd, int const ldupd, + ipc_ run_elim_unpivoted_notasks(ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_* up_to_date, struct cpu_factor_options const& options, + ipc_ const block_size, T const beta, T* upd, ipc_ const ldupd, std::vector& work, Allocator const& alloc) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); //printf("ENTRY %d %d vis %d %d %d\n", m, n, mblk, nblk, block_size); /* Setup */ - int next_elim = 0; + ipc_ next_elim = 0; /* Inner loop - iterate over block columns */ - for(int blk=0; blk( + ipc_ nelim = dblk.template factor( next_elim, perm, d, options, work, alloc ); if(nelim < get_ncol(blk, n, block_size)) { @@ -2065,9 +2143,13 @@ class LDLT { } // Loop over off-diagonal blocks applying pivot - for(int jblk=0; jblknblk) { - int uoffset = std::min(nblk*block_size, m) - n; + ipc_ uoffset = std::min(nblk*block_size, m) - n; T *upd2 = &upd[uoffset*(ldupd+1)]; - for(int jblk=nblk; jblk nelim_blk then we reset and recalculate completely * */ static - void restore(int const nelim_blk, int const m, int const n, int* perm, T* a, - int const lda, T* d, ColumnData& cdata, Backup& backup, - int const* old_perm, int const* up_to_date, int const block_size, - std::vector& work, T* upd, int const ldupd) { + void restore(ipc_ const nelim_blk, ipc_ const m, ipc_ const n, ipc_* perm, T* a, + ipc_ const lda, T* d, ColumnData& cdata, Backup& backup, + ipc_ const* old_perm, ipc_ const* up_to_date, ipc_ const block_size, + std::vector& work, T* upd, ipc_ const ldupd) { typedef Block BlockSpec; - int const nblk = calc_nblk(n, block_size); - int const mblk = calc_nblk(m, block_size); + ipc_ const nblk = calc_nblk(n, block_size); + ipc_ const mblk = calc_nblk(m, block_size); /* Restore perm for failed part */ - for(int i=nelim_blk*block_size; i= nelim_blk) { #pragma omp task \ firstprivate(iblk, jblk) \ shared(a, cdata, work) \ depend(inout: a[jblk*block_size*lda+iblk*block_size:1]) { - int thread_num = omp_get_thread_num(); + ipc_ thread_num = omp_get_thread_num(); BlockSpec rblk(iblk, jblk, m, n, cdata, a, lda, block_size); rblk.apply_inv_rperm(work[thread_num]); } @@ -2195,9 +2289,9 @@ class LDLT { } } // Now all eliminated columns are good, fix up remainder of node - for(int jblk=nelim_blk; jblk= nelim_blk) { // Bad updates applied, needs reset and full recalculation #pragma omp task \ @@ -2211,7 +2305,7 @@ class LDLT { progress = -1; } // Apply any missing updates to a - for(int kblk=progress+1; kblk= nelim_blk) progress = -1; // needs complete reset T* upd_ij = &upd2[(jblk-nblk)*block_size*ldupd + (iblk-nblk)*block_size]; - for(int kblk=progress+1; kblk const& eliminated, const T *a, int lda) { - for(int row=0; row const& eliminated, const T *a, ipc_ lda) { + for(ipc_ row=0; row& work, Allocator const& alloc=Allocator()) { + ipc_ factor(ipc_ m, ipc_ n, ipc_ *perm, T *a, ipc_ lda, T *d, Backup& backup, struct cpu_factor_options const& options, PivotMethod pivot_method, ipc_ block_size, T beta, T* upd, ipc_ ldupd, std::vector& work, Allocator const& alloc=Allocator()) { /* Sanity check arguments */ if(m < n) return -1; if(lda < n) 
return -4; /* Initialize useful quantities: */ - int nblk = calc_nblk(n, block_size); - int mblk = calc_nblk(m, block_size); + ipc_ nblk = calc_nblk(n, block_size); + ipc_ mblk = calc_nblk(m, block_size); /* Temporary workspaces */ ColumnData cdata(n, block_size, IntAlloc(alloc)); @@ -2324,7 +2426,7 @@ class LDLT { * - If no pivots selected across matrix, perform swaps to get large * entries into diagonal blocks */ - int num_elim; + ipc_ num_elim; if(pivot_method == PivotMethod::app_aggressive) { if(beta!=0.0) { // We don't support backup of contribution block at present, @@ -2336,12 +2438,12 @@ class LDLT { // Take a copy of perm typedef std::allocator_traits IATraits; IntAlloc intAlloc(alloc); - int* perm_copy = IATraits::allocate(intAlloc, n); - for(int i=0; i failed_perm(n-num_elim, 0, alloc); - for(int jblk=0, insert=0, fail_insert=0; jblk failed_perm(n-num_elim, 0, alloc); + for(ipc_ jblk=0, insert=0, fail_insert=0; jblk failed_diag(nfail*n, 0, alloc); std::vector failed_rect(nfail*(m-n), 0, alloc); - for(int jblk=0, jfail=0, jinsert=0; jblk eliminated(n); - for(int i=0; i -size_t ldlt_app_factor_mem_required(int m, int n, int block_size) { +size_t ldlt_app_factor_mem_required(ipc_ m, ipc_ n, ipc_ block_size) { #if defined(__AVX512F__) - int const align = 64; + ipc_ const align = 64; #elif defined(__AVX__) - int const align = 32; + ipc_ const align = 32; #else - int const align = 16; + ipc_ const align = 16; #endif return align_lda(m) * n * sizeof(T) + align; // CopyBackup } template -int ldlt_app_factor(int m, int n, int* perm, T* a, int lda, T* d, T beta, - T* upd, int ldupd, struct cpu_factor_options const& options, +ipc_ ldlt_app_factor(ipc_ m, ipc_ n, ipc_* perm, T* a, ipc_ lda, T* d, T beta, + T* upd, ipc_ ldupd, struct cpu_factor_options const& options, std::vector& work, Allocator const& alloc) { // If we've got a tall and narrow node, adjust block size so each block // has roughly blksz**2 entries // FIXME: Decide if this reshape is actually useful, given it will generate // a lot more update tasks instead? 
- int outer_block_size = options.cpu_block_size; + ipc_ outer_block_size = options.cpu_block_size; /*if(n < outer_block_size) { - outer_block_size = int((long(outer_block_size)*outer_block_size) / n); + outer_block_size = ipc_((longc_(outer_block_size)*outer_block_size) / n); }*/ #ifdef PROFILE @@ -2567,13 +2669,18 @@ int ldlt_app_factor(int m, int n, int* perm, T* a, int lda, T* d, T beta, outer_block_size, beta, upd, ldupd, work, alloc ); } -template int ldlt_app_factor>>(int, int, int*, precision_*, int, precision_*, precision_, precision_*, int, struct cpu_factor_options const&, std::vector&, BuddyAllocator> const& alloc); +template ipc_ ldlt_app_factor>>(ipc_, ipc_, ipc_*, rpc_*, ipc_, rpc_*, rpc_, + rpc_*, ipc_, struct cpu_factor_options const&, + std::vector&, + BuddyAllocator> const& alloc); template -void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_app_solve_fwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { host_trsv(FILL_MODE_LWR, OP_N, DIAG_UNIT, n, l, ldl, x, 1); if(m > n) @@ -2586,16 +2693,16 @@ void ldlt_app_solve_fwd(int m, int n, T const* l, int ldl, int nrhs, T* x, ldl, x, ldx, one_val, &x[n], ldx); } } -template void ldlt_app_solve_fwd(int, int, precision_ const*, - int, int, precision_*, int); +template void ldlt_app_solve_fwd(ipc_, ipc_, rpc_ const*, + ipc_, ipc_, rpc_*, ipc_); template -void ldlt_app_solve_diag(int n, T const* d, int nrhs, T* x, int ldx) { - for(int i=0; i(int, precision_ const*, int, - precision_*, int); +template void ldlt_app_solve_diag(ipc_, rpc_ const*, ipc_, + rpc_*, ipc_); template -void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, - int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_app_solve_bwd(ipc_ m, ipc_ n, T const* l, ipc_ ldl, ipc_ nrhs, T* x, + ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { if(m > n) gemv(OP_T, m-n, n, minus_one_val, &l[n], ldl, &x[n], 1, one_val, x, 1); @@ -2633,7 +2740,7 @@ void ldlt_app_solve_bwd(int m, int n, T const* l, int ldl, int nrhs, T* x, one_val, l, ldl, x, ldx); } } -template void ldlt_app_solve_bwd(int, int, precision_ const*, int, - int, precision_*, int); +template void ldlt_app_solve_bwd(ipc_, ipc_, rpc_ const*, ipc_, + ipc_, rpc_*, ipc_); }}} /* namespaces spral::ssids::cpu */ diff --git a/src/ssids/ldlt_nopiv.cxx b/src/ssids/ldlt_nopiv.cxx index 21919e278b..788bd12118 100644 --- a/src/ssids/ldlt_nopiv.cxx +++ b/src/ssids/ldlt_nopiv.cxx @@ -2,7 +2,9 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * \licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT */ + #include "ssids_cpu_kernels_ldlt_nopiv.hxx" namespace spral { namespace ssids { namespace cpu { @@ -23,42 +25,42 @@ namespace spral { namespace ssids { namespace cpu { * * Returns -1 on success, otherwise location of negative or zero pivot. 
* */ -int ldlt_nopiv_factor(int m, int n, precision_* a, int lda, precision_* work) { - for(int j=0; j=0; j-=2) { - for(int i=j+2; i=0; j-=2) { + for(ipc_ i=j+2; i @@ -13,7 +15,7 @@ #include "ssids_cpu_ThreadStats.hxx" #include "ssids_cpu_kernels_wrappers.hxx" -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 #define host_gemm host_gemm_64 #define host_trsv host_trsv_64 #define host_trsm host_trsm_64 @@ -25,7 +27,7 @@ namespace spral { namespace ssids { namespace cpu { namespace { /** overload fabs for floats and doubles */ -precision_ fabs_(precision_ x) { +rpc_ fabs_(rpc_ x) { #ifdef SPRAL_SINGLE double fabsd = fabs(double(x)); float fabss; @@ -37,21 +39,21 @@ precision_ fabs_(precision_ x) { } /** Returns true if all entries in col are less than small in abs value */ -bool check_col_small(int idx, int from, int to, precision_ const* a, - int lda, precision_ small) { +bool check_col_small(ipc_ idx, ipc_ from, ipc_ to, rpc_ const* a, + ipc_ lda, rpc_ small) { bool check = true; - for(int c=from; c=to) return -1; - int best_idx=from; precision_ best_val=fabs(a[from*lda]); - for(int idx=from+1; idx best_val) { best_idx = idx; best_val = fabs(a[idx*lda]); @@ -61,8 +63,8 @@ int find_row_abs_max(int from, int to, precision_ const* a, int lda) { /** Performs symmetric swap of col1 and col2 in lower triangle */ // FIXME: remove n only here for debug -void swap_cols(int col1, int col2, int m, int n, int* perm, precision_* a, - int lda, int nleft, precision_* aleft, int ldleft) { +void swap_cols(ipc_ col1, ipc_ col2, ipc_ m, ipc_ n, ipc_* perm, rpc_* a, + ipc_ lda, ipc_ nleft, rpc_* aleft, ipc_ ldleft) { if(col1 == col2) return; // No-op // Ensure col1 < col2 @@ -73,19 +75,19 @@ void swap_cols(int col1, int col2, int m, int n, int* perm, precision_* a, std::swap( perm[col1], perm[col2] ); // Swap aleft(col1, :) and aleft(col2, :) - for(int c=0; c::infinity(); + d[2] = std::numeric_limits::infinity(); d[3] = (a11*detscale)/detpiv; //printf("t2 %e < %e?\n", std::max(maxp, maxt), small); if(std::max(maxp, maxt) < small) return true; // Rest of col small - precision_ x1 = fabs(d[0])*maxt + fabs(d[1])*maxp; - precision_ x2 = fabs(d[1])*maxt + fabs(d[3])*maxp; + rpc_ x1 = fabs(d[0])*maxt + fabs(d[1])*maxp; + rpc_ x2 = fabs(d[1])*maxt + fabs(d[3])*maxp; //printf("t3 %e < %e?\n", std::max(x1, x2), 1.0/u); return ( u*std::max(x1, x2) < 1.0 ); } /** Applies the 2x2 pivot to rest of block column */ -void apply_2x2(int nelim, int m, precision_* a, int lda, precision_* ld, - int ldld, precision_* d) { +void apply_2x2(ipc_ nelim, ipc_ m, rpc_* a, ipc_ lda, rpc_* ld, + ipc_ ldld, rpc_* d) { /* Set diagonal block to identity */ - precision_* a1 = &a[nelim*lda]; - precision_* a2 = &a[(nelim+1)*lda]; + rpc_* a1 = &a[nelim*lda]; + rpc_* a2 = &a[(nelim+1)*lda]; a1[nelim] = 1.0; a1[nelim+1] = 0.0; a2[nelim+1] = 1.0; /* Extract D^-1 values */ - precision_ d11 = d[2*nelim]; - precision_ d21 = d[2*nelim+1]; - precision_ d22 = d[2*nelim+3]; + rpc_ d11 = d[2*nelim]; + rpc_ d21 = d[2*nelim+1]; + rpc_ d22 = d[2*nelim+3]; /* Divide through, preserving copy in ld */ - for(int r=nelim+2; r= u*maxp ) { //printf("1x1 pivot %d\n", p); swap_cols(p, nelim, m, n, perm, a, lda, nleft, aleft, ldleft); @@ -297,10 +299,10 @@ int ldlt_tpp_factor(int m, int n, int* perm, precision_* a, int lda, return nelim; } -void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, - precision_* x, int ldx) { - precision_ one_val = 1.0; - precision_ minus_one_val = - 1.0; +void ldlt_tpp_solve_fwd(ipc_ m, ipc_ n, rpc_ const* l, ipc_ ldl, 
ipc_ nrhs, + rpc_* x, ipc_ ldx) { + rpc_ one_val = 1.0; + rpc_ minus_one_val = - 1.0; if(nrhs==1) { host_trsv(FILL_MODE_LWR, OP_N, DIAG_UNIT, n, l, ldl, x, 1); if(m > n) @@ -314,31 +316,31 @@ void ldlt_tpp_solve_fwd(int m, int n, precision_ const* l, int ldl, int nrhs, } } -void ldlt_tpp_solve_diag(int n, precision_ const* d, precision_* x) { - for(int i=0; i n) gemv(OP_T, m-n, n, minus_one_val, &l[n], ldl, &x[n], 1, one_val, x, 1); diff --git a/src/ssids/profile.cxx b/src/ssids/profile.cxx index 2550676281..4885ac78fa 100644 --- a/src/ssids/profile.cxx +++ b/src/ssids/profile.cxx @@ -6,6 +6,7 @@ */ #include "ssids_profile.hxx" +#include "ssids_rip.hxx" #ifdef PROFILE struct timespec spral::ssids::Profile::tstart; @@ -14,7 +15,7 @@ struct timespec spral::ssids::Profile::tstart; using namespace spral::ssids; extern "C" -void spral_ssids_profile_begin(int nregions, void const* regions) { +void spral_ssids_profile_begin(ipc_ nregions, void const* regions) { Profile::init(nregions, (spral::hw_topology::NumaRegion*)regions); } @@ -24,7 +25,7 @@ void spral_ssids_profile_end() { } extern "C" -Profile::Task* spral_ssids_profile_create_task(char const* name, int thread) { +Profile::Task* spral_ssids_profile_create_task(char const* name, ipc_ thread) { // We interpret negative thread values as absent if(thread >= 0) { return new Profile::Task(name, thread); @@ -47,6 +48,6 @@ void spral_ssids_profile_set_state(char const* container, char const* type, extern "C" void spral_ssids_profile_add_event( - char const* type, char const*val, int thread) { + char const* type, char const*val, ipc_ thread) { Profile::addEvent(type, val, thread); } diff --git a/src/ssids/reorder.cu b/src/ssids/reorder.cu index a7978c024d..b1a5df7099 100644 --- a/src/ssids/reorder.cu +++ b/src/ssids/reorder.cu @@ -1,3 +1,9 @@ +/* Copyright (c) 2013 Science and Technology Facilities Council (STFC) + * Licence: BSD licence, see LICENCE file for details + * Author: Jonathan Hogg + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT + */ + #ifdef __cplusplus #include #else @@ -8,11 +14,11 @@ #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define multiswap_type multiswap_type_single #define multireorder_data multireorder_data_single #define multisymm_type multisymm_type_single @@ -44,7 +50,6 @@ #define spral_ssids_swap_ni2D_ic spral_ssids_swap_ni2D_ic_single #define spral_ssids_swap_ni2D_ir spral_ssids_swap_ni2D_ir_single #else -#define precision_ double #define multiswap_type multiswap_type_double #define multireorder_data multireorder_data_double #define multisymm_type multisymm_type_double @@ -92,43 +97,43 @@ namespace /* anon */ { template< typename ELEMENT_TYPE > __global__ void -cu_copy_mc( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* mask ) +cu_copy_mc( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* mask ) { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; if ( i < nrows && j < ncols && mask[j] > 0 ) b[i + ldb*j] = a[i + lda*j]; } template< typename ELEMENT_TYPE > __global__ void -cu_copy_ic( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* ind ) +cu_copy_ic( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* ind ) { 
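// One CUDA thread handles one (i, j) entry: a[i + lda*j] is copied to
// b[i + ldb*j] only for columns with mask[j] > 0, so the mask selects whole
// columns. Index arithmetic now uses ipc_ rather than int, so the kernel
// follows the integer kind selected by INTEGER_64.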
- int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; if ( i < nrows && j < ncols && ind[j] > 0 ) b[i + ldb*(ind[j] - 1)] = a[i + lda*j]; } template< typename ELEMENT_TYPE > __global__ void -cu_swap_ni2D_ic( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index ) +cu_swap_ni2D_ic( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) // swaps columns of non-intersecting 2D arrays a(1:n,index(1:m)) and b(1:n,1:m) // index is one-based { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; - int k; - precision_ s; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ k; + rpc_ s; if ( i < nrows && j < ncols && (k = index[j] - 1) > -1 ) { s = a[i + lda*k]; @@ -139,17 +144,17 @@ cu_swap_ni2D_ic( int nrows, int ncols, template< typename ELEMENT_TYPE > __global__ void -cu_swap_ni2D_ir( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index ) +cu_swap_ni2D_ir( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) // swaps rows of non-intersecting 2D arrays a(index(1:n),1:m) and b(1:n,1:m) // index is one-based { - int i = threadIdx.x + blockDim.x*blockIdx.x; - int j = threadIdx.y + blockDim.y*blockIdx.y; - int k; - precision_ s; + ipc_ i = threadIdx.x + blockDim.x*blockIdx.x; + ipc_ j = threadIdx.y + blockDim.y*blockIdx.y; + ipc_ k; + rpc_ s; if ( i < nrows && j < ncols && (k = index[i] - 1) > -1 ) { s = a[k + lda*j]; @@ -159,12 +164,12 @@ cu_swap_ni2D_ir( int nrows, int ncols, } struct multiswap_type { - int nrows; - int ncols; - int k; - precision_ *lcol; - int lda; - int off; + ipc_ nrows; + ipc_ ncols; + ipc_ k; + rpc_ *lcol; + ipc_ lda; + ipc_ off; }; template< typename ELEMENT_TYPE > @@ -173,19 +178,19 @@ cu_multiswap_ni2D_c( struct multiswap_type *swapdata ) // swaps non-intersecting rows or cols of a 2D multiarray a { swapdata += blockIdx.x; - int nrows = swapdata->nrows; + ipc_ nrows = swapdata->nrows; if ( blockIdx.y*blockDim.x >= nrows ) return; - int k = swapdata->k; + ipc_ k = swapdata->k; ELEMENT_TYPE *a = swapdata->lcol; - int lda = swapdata->lda; - int off = lda*swapdata->off; + ipc_ lda = swapdata->lda; + ipc_ off = lda*swapdata->off; ELEMENT_TYPE s; - for ( int i = threadIdx.x + blockIdx.y*blockDim.x; i < nrows; + for ( ipc_ i = threadIdx.x + blockIdx.y*blockDim.x; i < nrows; i += blockDim.x*gridDim.y ) - for ( int j = threadIdx.y; j < k; j += blockDim.y ) { + for ( ipc_ j = threadIdx.y; j < k; j += blockDim.y ) { s = a[i + lda*j]; a[i + lda*j] = a[off + i + lda*j]; a[off + i + lda*j] = s; @@ -198,18 +203,18 @@ cu_multiswap_ni2D_r( struct multiswap_type *swapdata ) // swaps non-intersecting rows or cols of a 2D multiarray a { swapdata += blockIdx.x; - int ncols = swapdata->ncols; + ipc_ ncols = swapdata->ncols; if ( blockIdx.y*blockDim.y >= ncols ) return; - int k = swapdata->k; + ipc_ k = swapdata->k; ELEMENT_TYPE *a = swapdata->lcol; - int lda = swapdata->lda; - int off = swapdata->off; + ipc_ lda = swapdata->lda; + ipc_ off = swapdata->off; ELEMENT_TYPE s; - for ( int i = threadIdx.x; i < k; i += blockDim.x ) - for ( int j = threadIdx.y + blockIdx.y*blockDim.y; j < ncols; + for ( ipc_ i = threadIdx.x; i < k; i += blockDim.x ) + for ( ipc_ j = threadIdx.y + blockIdx.y*blockDim.y; j < ncols; j += 
blockDim.y*gridDim.y ) { s = a[i + lda*j]; a[i + lda*j] = a[off + i + lda*j]; @@ -220,14 +225,14 @@ cu_multiswap_ni2D_r( struct multiswap_type *swapdata ) template< typename ELEMENT_TYPE > __global__ void cu_reorder_rows( - int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index + ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index ) { - int x; - int y = threadIdx.y + blockIdx.y*blockDim.y; + ipc_ x; + ipc_ y = threadIdx.y + blockIdx.y*blockDim.y; for ( x = threadIdx.x; x < nrows; x += blockDim.x ) if ( y < ncols ) @@ -238,14 +243,14 @@ cu_reorder_rows( a[x + lda*y] = b[x + ldb*y]; } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void -cu_reorder_cols2( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index, int mode ) +cu_reorder_cols2( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index, ipc_ mode ) { - int ix = threadIdx.x + blockIdx.x*blockDim.x; + ipc_ ix = threadIdx.x + blockIdx.x*blockDim.x; __shared__ volatile ELEMENT_TYPE work[SIZE_X*SIZE_Y]; @@ -281,14 +286,14 @@ cu_reorder_cols2( int nrows, int ncols, } } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void -cu_reorder_rows2( int nrows, int ncols, - ELEMENT_TYPE* a, int lda, - ELEMENT_TYPE* b, int ldb, - int* index, int mode ) +cu_reorder_rows2( ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE* a, ipc_ lda, + ELEMENT_TYPE* b, ipc_ ldb, + ipc_* index, ipc_ mode ) { - int iy = threadIdx.y + blockIdx.x*blockDim.y; + ipc_ iy = threadIdx.y + blockIdx.x*blockDim.y; __shared__ volatile ELEMENT_TYPE work[SIZE_X*SIZE_Y]; @@ -327,33 +332,33 @@ cu_reorder_rows2( int nrows, int ncols, /* * Copies new L factors back to A array without any permutation */ -template< typename ELEMENT_TYPE, int NTX > +template< typename ELEMENT_TYPE, ipc_ NTX > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_no_perm( - int nblk, int bidx, int tid, - int nrows, int ncols, - ELEMENT_TYPE *dest, int ldd, - const ELEMENT_TYPE *src, int lds + ipc_ nblk, ipc_ bidx, ipc_ tid, + ipc_ nrows, ipc_ ncols, + ELEMENT_TYPE *dest, ipc_ ldd, + const ELEMENT_TYPE *src, ipc_ lds ) { - int tx = tid % NTX; - int ty = tid / NTX; + ipc_ tx = tid % NTX; + ipc_ ty = tid / NTX; src += NTX*bidx; dest += NTX*bidx; nrows -= NTX*bidx; if ( ty < ncols ) { - for ( int x = tx; x < nrows; x += NTX*nblk ) + for ( ipc_ x = tx; x < nrows; x += NTX*nblk ) dest[x + ldd*ty] = src[x + lds*ty]; } } /* Shuffles the permutation vector using shared memory [in case it overlaps itself] */ -template < int SIZE_X > +template < ipc_ SIZE_X > __device__ void -shuffle_perm_shmem( int n, volatile const int *const indr, int *perm ) { +shuffle_perm_shmem( ipc_ n, volatile const ipc_ *const indr, ipc_ *perm ) { // Update permutation - __shared__ volatile int iwork[SIZE_X]; + __shared__ volatile ipc_ iwork[SIZE_X]; if ( threadIdx.x < n && threadIdx.y == 0 ) iwork[indr[threadIdx.x] - 1] = perm[threadIdx.x]; __syncthreads(); @@ -366,19 +371,19 @@ shuffle_perm_shmem( int n, volatile const int *const indr, int *perm ) { * This version uses shared memory and is designed for the case when the new * and old location of columns and rows overlap. 
*/ -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_perm_shmem( - int block, int nblocks, - int done, int pivoted, int delayed, - int nrows, int ncols, - int ib, int jb, - int offc, int offp, - int ld, - volatile int *const indr, - precision_ *a, precision_ *b, const precision_ *c, - int *perm + ipc_ block, ipc_ nblocks, + ipc_ done, ipc_ pivoted, ipc_ delayed, + ipc_ nrows, ipc_ ncols, + ipc_ ib, ipc_ jb, + ipc_ offc, ipc_ offp, + ipc_ ld, + volatile ipc_ *const indr, + rpc_ *a, rpc_ *b, const rpc_ *c, + ipc_ *perm ) { __shared__ volatile ELEMENT_TYPE work1[SIZE_X*SIZE_Y]; __shared__ volatile ELEMENT_TYPE work2[SIZE_X*SIZE_Y]; @@ -389,8 +394,8 @@ copy_L_LD_perm_shmem( // Extend permutation array to cover non-pivoted columns if ( threadIdx.x == 0 && threadIdx.y == 0 ) { - int i = 0; - int j = pivoted; + ipc_ i = 0; + ipc_ j = pivoted; for ( ; i < delayed; i++ ) indr[i] = ++j; for ( ; i < delayed + jb - ib + 1; i++ ) @@ -398,7 +403,7 @@ copy_L_LD_perm_shmem( indr[i] = ++j; } - int off = done*ld; + ipc_ off = done*ld; // We handle the (done-jb) x (done-jb) block that requires both // row and column permutations seperately using the first block. @@ -409,17 +414,17 @@ copy_L_LD_perm_shmem( // Swap columns of A and copy in L, but avoiding rows that need // permuted // Also, swap cols of LD but avoiding rows that need permuted - int baseStep = blockDim.x*(nblocks - 1); + ipc_ baseStep = blockDim.x*(nblocks - 1); #if (SM_3X) - for ( int i = jb + blockDim.x*(block - 1); i < nrows; + for ( ipc_ i = jb + blockDim.x*(block - 1); i < nrows; i += baseStep ) { #else - for ( int i = jb + blockDim.x*(block - 1); i < nrows + baseStep; + for ( ipc_ i = jb + blockDim.x*(block - 1); i < nrows + baseStep; i += baseStep * 2 ) { #endif - int ix = i + threadIdx.x; + ipc_ ix = i + threadIdx.x; #if (!SM_3X) - int ix2 = ix + baseStep; + ipc_ ix2 = ix + baseStep; #endif __syncthreads(); @@ -486,15 +491,15 @@ copy_L_LD_perm_shmem( // Swap rows of A baseStep = blockDim.y*(nblocks - 1); #if (SM_3X) - for ( int i = blockDim.y*(block - 1); i < ncols; + for ( ipc_ i = blockDim.y*(block - 1); i < ncols; i += baseStep ) { #else - for ( int i = blockDim.y*(block - 1); i < ncols + baseStep; + for ( ipc_ i = blockDim.y*(block - 1); i < ncols + baseStep; i += baseStep * 2 ) { #endif - int iy = i + threadIdx.y; + ipc_ iy = i + threadIdx.y; #if (!SM_3X) - int iy2 = iy + baseStep; + ipc_ iy2 = iy + baseStep; #endif __syncthreads(); @@ -538,7 +543,7 @@ copy_L_LD_perm_shmem( // row /and/ column permutations. shuffle_perm_shmem< SIZE_X > ( delayed + jb - ib + 1, indr, &perm[offp + done] ); - int pass = threadIdx.x < jb - done && threadIdx.y < jb - done; + ipc_ pass = threadIdx.x < jb - done && threadIdx.y < jb - done; // Handle L and LD if ( pass ) { @@ -591,26 +596,26 @@ copy_L_LD_perm_shmem( * This version does this directly in global memory and is designed for the case * when the new and old location of columns and rows DO NOT overlap. 
*/ -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __device__ void __forceinline__ // Required to avoid errors about reg counts compiling with -G copy_L_LD_perm_noshmem( - int node, - int block, int nblocks, - int done, int pivoted, int delayed, - int nrows, int ncols, - int ib, int jb, - int offc, int offp, - int ld, - const int *ind, - const volatile int *const indf, - precision_ *a, precision_ *b, const precision_ *c, - int *perm + ipc_ node, + ipc_ block, ipc_ nblocks, + ipc_ done, ipc_ pivoted, ipc_ delayed, + ipc_ nrows, ipc_ ncols, + ipc_ ib, ipc_ jb, + ipc_ offc, ipc_ offp, + ipc_ ld, + const ipc_ *ind, + const volatile ipc_ *const indf, + rpc_ *a, rpc_ *b, const rpc_ *c, + ipc_ *perm ) { - int off1 = done; - int off2 = ib - 1; - int offi = node*SIZE_Y/2; + ipc_ off1 = done; + ipc_ off2 = ib - 1; + ipc_ offi = node*SIZE_Y/2; // We handle the two pivoted x pivoted blocks where row and columns cross // over seperately using the first block. @@ -618,13 +623,13 @@ copy_L_LD_perm_noshmem( // All remaining rows and columns are handlded by the remaining blocks. if ( block ) { // Handle parts of matrix that require EITHER row OR col shuffle - int tx = (threadIdx.y < SIZE_Y/2) ? threadIdx.x : threadIdx.x + blockDim.x; - int ty = (threadIdx.y < SIZE_Y/2) ? threadIdx.y : threadIdx.y - SIZE_Y/2; + ipc_ tx = (threadIdx.y < SIZE_Y/2) ? threadIdx.x : threadIdx.x + blockDim.x; + ipc_ ty = (threadIdx.y < SIZE_Y/2) ? threadIdx.y : threadIdx.y - SIZE_Y/2; // Swap a[:,done:done+pivoted] and a[:,ib:jb] pulling in c[] as we go - for ( int x = tx + 2*blockDim.x*(block - 1); + for ( ipc_ x = tx + 2*blockDim.x*(block - 1); x < nrows && ty < jb - ib + 1; x += 2*blockDim.x*(nblocks - 1) ) { - int y = ind[offi + ty] - 1; + ipc_ y = ind[offi + ty] - 1; if ( (x >= done && x < done + jb - ib + 1) || (x >= ib - 1 && x < jb) || y < 0 ) continue; // handled separately @@ -632,10 +637,10 @@ copy_L_LD_perm_noshmem( a[x + ld*(off1 + y)] = c[offc + x + ld*ty]; } // Swap b[:,done:done+pivoted] and b[:,ib:jb] - for ( int x = tx + 2*blockDim.x*(block - 1); + for ( ipc_ x = tx + 2*blockDim.x*(block - 1); x < nrows && ty < jb - ib + 1; x += 2*blockDim.x*(nblocks - 1) ) { - int y = ind[offi + ty] - 1; + ipc_ y = ind[offi + ty] - 1; if ( ( x >= done && x < done + jb - ib + 1 ) || ( x >= ib - 1 && x < jb ) || y < 0) continue; // handled separately @@ -647,10 +652,10 @@ copy_L_LD_perm_noshmem( if ( (block - 1)*blockDim.y >= ncols ) return; // swap a[done:done+pivoted,:] and a[ib:jb,:] - for ( int y = threadIdx.y + blockDim.y*(block - 1); + for ( ipc_ y = threadIdx.y + blockDim.y*(block - 1); y < ncols && threadIdx.x < jb - ib + 1; y += blockDim.y*(nblocks - 1) ) { - int x = ind[offi + threadIdx.x] - 1; + ipc_ x = ind[offi + threadIdx.x] - 1; if ( (y >= done && y < done + jb - ib + 1) || (y >= ib - 1 && y < jb) || x < 0 ) continue; // handled separately @@ -659,10 +664,10 @@ copy_L_LD_perm_noshmem( a[off2 + threadIdx.x + ld*y] = s; } // swap b[done:done+pivoted,:] and b[ib:jb,:] - for ( int y = threadIdx.y + blockDim.y*(block - 1); + for ( ipc_ y = threadIdx.y + blockDim.y*(block - 1); y < ncols && threadIdx.x < jb - ib + 1; y += blockDim.y*(nblocks - 1) ) { - int x = ind[offi + threadIdx.x] - 1; + ipc_ x = ind[offi + threadIdx.x] - 1; if ( (y >= done && y < done + jb - ib + 1) || (y >= ib - 1 && y < jb) || x < 0) continue; // handled separately @@ -675,9 +680,9 @@ copy_L_LD_perm_noshmem( // Handle part of matrix that requires BOTH row 
AND col shuffle if ( threadIdx.x < jb - ib + 1 && threadIdx.y == 0 ) { // Update permutation - int i = indf[threadIdx.x] - 1; + ipc_ i = indf[threadIdx.x] - 1; if ( i >= 0 ) { - int s = perm[offp + ib - 1 + threadIdx.x]; + ipc_ s = perm[offp + ib - 1 + threadIdx.x]; perm[offp + ib - 1 + threadIdx.x] = perm[offp + done + i]; perm[offp + done + i] = s; } @@ -688,8 +693,8 @@ copy_L_LD_perm_noshmem( // Swap a[done:done+pivoted,done:done+pivoted] and // a[done:done+pivoted,ib:jb] // pulling in new cols from c[] as we go. - int x = done + threadIdx.x; - int y = ind[offi + threadIdx.y] - 1; + ipc_ x = done + threadIdx.x; + ipc_ y = ind[offi + threadIdx.y] - 1; if ( x < done + jb - ib + 1 && threadIdx.y < jb - ib + 1 && y >= 0 ) { a[x + ld*(off2 + threadIdx.y)] = a[x + ld*(off1 + y)]; a[x + ld*(off1 + y)] = c[offc + x + ld*threadIdx.y]; @@ -765,12 +770,12 @@ copy_L_LD_perm_noshmem( } struct multireorder_data { - int node; - int block; - int nblocks; + ipc_ node; + ipc_ block; + ipc_ nblocks; }; -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > #if (SM_3X) __launch_bounds__(256, 8) #else @@ -781,13 +786,13 @@ cu_multireorder( const struct multinode_fact_type *ndata, const struct multireorder_data* rdata, const ELEMENT_TYPE* c, - const int* stat, - const int* ind, - int* perm, - int* ncb) { - __shared__ volatile int indf[SIZE_X]; // index from node_fact - __shared__ volatile int indr[SIZE_X]; // reorder index - __shared__ volatile int simple; + const ipc_* stat, + const ipc_* ind, + ipc_* perm, + ipc_* ncb) { + __shared__ volatile ipc_ indf[SIZE_X]; // index from node_fact + __shared__ volatile ipc_ indr[SIZE_X]; // reorder index + __shared__ volatile ipc_ simple; // Reset ncb ready for next call of muliblock_fact_setup() if ( blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0 ) { @@ -797,43 +802,43 @@ cu_multireorder( // Load data on block rdata += blockIdx.x; - int node = rdata->node; + ipc_ node = rdata->node; ndata += node; - int ib = ndata->ib; - int jb = ndata->jb; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; if ( jb < ib ) return; - int pivoted = stat[node]; + ipc_ pivoted = stat[node]; if ( pivoted < 1 ) return; - int nrows = ndata->nrows; - int bidx = rdata->block; + ipc_ nrows = ndata->nrows; + ipc_ bidx = rdata->block; if ( bidx > 1 && (bidx - 1)*blockDim.x >= nrows ) return; - int done = ndata->done; + ipc_ done = ndata->done; - int ld = nrows; - int delayed = ib - done - 1; // Number delayed before most recent factor + ipc_ ld = nrows; + ipc_ delayed = ib - done - 1; // Number delayed before most recent factor if ( threadIdx.x == 0 && threadIdx.y == 0 ) simple = (delayed == 0); // true if we don't need to offset __syncthreads(); - int next; + ipc_ next; if ( threadIdx.x < jb - ib + 1 && threadIdx.y == 0 ) { next = ind[node*SIZE_Y/2 + threadIdx.x]; // SIZE_Y=2*BLOCK_SIZE indf[threadIdx.x] = next; if ( jb - ib + 1 > delayed ) indr[delayed + threadIdx.x] = next; if ( indf[threadIdx.x] != threadIdx.x + 1 ) - atomicMin((int*)&simple, 0); + atomicMin((ipc_*)&simple, 0); } __syncthreads(); ELEMENT_TYPE *a = ndata->lval; ELEMENT_TYPE *b = ndata->ldval; - int offc = ndata->lbuf; - int nblk = rdata->nblocks; + ipc_ offc = ndata->lbuf; + ipc_ nblk = rdata->nblocks; if ( simple ) { // Copy successful columns from workspace c to factors a without an // offset or permutation. 
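Throughout this patch, int becomes ipc_, unsigned int becomes uipc_, and the old precision_ macro becomes rpc_, with the aliases supplied by the newly included ssids_rip.hxx header. That header is not shown in the diff, so the sketch below is only an illustration of how such aliases could behave; the controlling macro names other than INTEGER_64 (for instance SPRAL_SINGLE) and the exact underlying types are assumptions, not the real contents of the header.

    /* illustrative sketch only -- not the actual ssids_rip.hxx */
    #ifdef INTEGER_64
    typedef long ipc_;             /* 64-bit indices; matches the %ld debug formats */
    typedef unsigned long uipc_;   /* unsigned counterpart, e.g. for template sizes */
    #else
    typedef int ipc_;              /* default 32-bit indices                        */
    typedef unsigned int uipc_;
    #endif
    #ifdef SPRAL_SINGLE
    typedef float rpc_;            /* single-precision reals                        */
    #else
    typedef double rpc_;           /* double-precision reals                        */
    #endif

With aliases of this form, the same kernel source compiles against either the default or the _64-suffixed host_gemm/host_trsv/host_trsm wrappers selected elsewhere in this patch.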
@@ -843,8 +848,8 @@ cu_multireorder( } else { // We need a permutation - int ncols = ndata->ncols; - int offp = ndata->offp; + ipc_ ncols = ndata->ncols; + ipc_ offp = ndata->offp; if ( jb - ib + 1 > delayed ) { // Can't just shuffle along, as pivoted columns overlap with where they // need to be. However, we know that pivoted+delayed < 2*BLOCK_SIZE, so @@ -864,14 +869,14 @@ cu_multireorder( } } -template< typename ELEMENT_TYPE, unsigned int SIZE_X, unsigned int SIZE_Y > +template< typename ELEMENT_TYPE, uipc_ SIZE_X, uipc_ SIZE_Y > __global__ void cu_multicopy( const struct multinode_fact_type *ndata, const struct multireorder_data* rdata, ELEMENT_TYPE* b, - int* stat, - int* ncb + ipc_* stat, + ipc_* ncb ) { @@ -881,25 +886,25 @@ cu_multicopy( } rdata += blockIdx.x; - int node = rdata->node; + ipc_ node = rdata->node; ndata += node; - int ib = ndata->ib; - int jb = ndata->jb; + ipc_ ib = ndata->ib; + ipc_ jb = ndata->jb; if ( jb < ib ) return; - int pivoted = stat[node]; + ipc_ pivoted = stat[node]; if ( pivoted < 1 ) return; - int nrows = ndata->nrows; - int block = rdata->block; - int nblocks = rdata->nblocks; + ipc_ nrows = ndata->nrows; + ipc_ block = rdata->block; + ipc_ nblocks = rdata->nblocks; if ( block > 1 && (block - 1)*blockDim.x >= nrows ) return; - int done = ndata->done; + ipc_ done = ndata->done; ELEMENT_TYPE *a = ndata->lval; - int offb = ndata->lbuf; - for ( int x = threadIdx.x + blockDim.x*block; + ipc_ offb = ndata->lbuf; + for ( ipc_ x = threadIdx.x + blockDim.x*block; x < nrows && threadIdx.y < pivoted; x += blockDim.x*nblocks ) { a[x + nrows*(done + threadIdx.y)] = b[offb + x + nrows*threadIdx.y]; @@ -907,9 +912,9 @@ cu_multicopy( } struct multisymm_type { - precision_ *lcol; - int ncols; - int nrows; + rpc_ *lcol; + ipc_ ncols; + ipc_ nrows; }; /* @@ -923,11 +928,11 @@ cu_multisymm( const struct multisymm_type* msdata ) { msdata += blockIdx.x; ELEMENT_TYPE *a = msdata->lcol; - int ncols = msdata->ncols; - int nrows = msdata->nrows; - for ( int i = threadIdx.x + blockDim.x*blockIdx.y; i < ncols; + ipc_ ncols = msdata->ncols; + ipc_ nrows = msdata->nrows; + for ( ipc_ i = threadIdx.x + blockDim.x*blockIdx.y; i < ncols; i += blockDim.x*gridDim.y ) - for ( int j = threadIdx.y + blockDim.y*blockIdx.z; j < i; + for ( ipc_ j = threadIdx.y + blockDim.y*blockIdx.z; j < i; j += blockDim.y*gridDim.z ) a[j + i*nrows] = a[i + j*nrows]; } @@ -940,151 +945,151 @@ cu_multisymm( const struct multisymm_type* msdata ) extern "C" { -void spral_ssids_copy_ic(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* ind) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_copy_ic(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* ind) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_copy_ic< precision_ > + cu_copy_ic< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, ind ); } -void spral_ssids_copy_mc(cudaStream_t *stream, int nrows, int ncols, precision_* a, - int lda, precision_* b, int ldb, int* mask) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_copy_mc(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, rpc_* a, + ipc_ lda, rpc_* b, ipc_ ldb, ipc_* mask) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - 
cu_copy_mc< precision_ > + cu_copy_mc< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, mask ); } -void spral_ssids_multisymm(cudaStream_t *stream, int nblocks, +void spral_ssids_multisymm(cudaStream_t *stream, ipc_ nblocks, const struct multisymm_type* msdata) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,4,4); - cu_multisymm< precision_ ><<< grid, threads, 0, *stream >>>( msdata + i ); + cu_multisymm< rpc_ ><<< grid, threads, 0, *stream >>>( msdata + i ); } } -void spral_ssids_multicopy(cudaStream_t *stream, int nblocks, +void spral_ssids_multicopy(cudaStream_t *stream, ipc_ nblocks, const struct multinode_fact_type *ndata, const struct multireorder_data *rdata, - precision_* a, precision_* b, int* stat, int* ncb) { + rpc_* a, rpc_* b, ipc_* stat, ipc_* ncb) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); - cu_multicopy< precision_, BLOCK_SIZE, BLOCK_SIZE > + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); + cu_multicopy< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< nb, threads, 0, *stream >>> ( ndata, rdata + i, b, stat, ncb ); } } -void spral_ssids_multireorder(cudaStream_t *stream, int nblocks, +void spral_ssids_multireorder(cudaStream_t *stream, ipc_ nblocks, const struct multinode_fact_type *ndata, const struct multireorder_data *rdata, - precision_* c, int* stat, int* ind, int* index, int* ncb) { + rpc_* c, ipc_* stat, ipc_* ind, ipc_* index, ipc_* ncb) { dim3 threads(2*BLOCK_SIZE, 2*BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,1); - cu_multireorder< precision_, 2*BLOCK_SIZE, 2*BLOCK_SIZE > + cu_multireorder< rpc_, 2*BLOCK_SIZE, 2*BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( ndata, rdata + i, c, stat, ind, index, ncb ); } } // ncols <= 2*BLOCK_SIZE -void spral_ssids_reorder_cols2(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index, int mode ) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_cols2(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index, ipc_ mode ) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; dim3 grid(rb, 2); if ( ncols <= BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_cols2< precision_, BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_cols2< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } else if ( ncols <= 2*BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, 2*BLOCK_SIZE); - cu_reorder_cols2< precision_, BLOCK_SIZE, 2*BLOCK_SIZE > + cu_reorder_cols2< rpc_, BLOCK_SIZE, 2*BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } } -void spral_ssids_reorder_rows(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_rows(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 grid(1, cb); - int 
tx = min(nrows, 1024/BLOCK_SIZE); + ipc_ tx = min(nrows, 1024/BLOCK_SIZE); dim3 threads(tx, BLOCK_SIZE); - cu_reorder_rows< precision_ > + cu_reorder_rows< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } // nrows <= 2*BLOCK_SIZE -void spral_ssids_reorder_rows2(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index, int mode ) { - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_reorder_rows2(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index, ipc_ mode ) { + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 grid(cb, 2); if ( nrows <= BLOCK_SIZE ) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_rows2< precision_, BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_rows2< rpc_, BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } else if ( nrows <= 2*BLOCK_SIZE ) { dim3 threads(2*BLOCK_SIZE, BLOCK_SIZE); - cu_reorder_rows2< precision_, 2*BLOCK_SIZE, BLOCK_SIZE > + cu_reorder_rows2< rpc_, 2*BLOCK_SIZE, BLOCK_SIZE > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index, mode ); } } -void spral_ssids_swap_ni2Dm(cudaStream_t *stream, int nblocks, +void spral_ssids_swap_ni2Dm(cudaStream_t *stream, ipc_ nblocks, struct multiswap_type *swapdata) { dim3 threads(BLOCK_SIZE, BLOCK_SIZE); - for ( int i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { - int nb = min(MAX_CUDA_BLOCKS, nblocks - i); + for ( ipc_ i = 0; i < nblocks; i += MAX_CUDA_BLOCKS ) { + ipc_ nb = min(MAX_CUDA_BLOCKS, nblocks - i); dim3 grid(nb,8); cu_multiswap_ni2D_c - < precision_ > + < rpc_ > <<< grid, threads, 0, *stream >>> ( swapdata + i ); cu_multiswap_ni2D_r - < precision_ > + < rpc_ > <<< grid, threads, 0, *stream >>> ( swapdata + i ); } } -void spral_ssids_swap_ni2D_ic(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_swap_ni2D_ic(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_swap_ni2D_ic< precision_ > + cu_swap_ni2D_ic< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } -void spral_ssids_swap_ni2D_ir(cudaStream_t *stream, int nrows, int ncols, - precision_* a, int lda, precision_* b, int ldb, int* index) { - int rb = (nrows - 1)/BLOCK_SIZE + 1; - int cb = (ncols - 1)/BLOCK_SIZE + 1; +void spral_ssids_swap_ni2D_ir(cudaStream_t *stream, ipc_ nrows, ipc_ ncols, + rpc_* a, ipc_ lda, rpc_* b, ipc_ ldb, ipc_* index) { + ipc_ rb = (nrows - 1)/BLOCK_SIZE + 1; + ipc_ cb = (ncols - 1)/BLOCK_SIZE + 1; dim3 threads(BLOCK_SIZE, BLOCK_SIZE); dim3 grid(rb, cb); - cu_swap_ni2D_ir< precision_ > + cu_swap_ni2D_ir< rpc_ > <<< grid, threads, 0, *stream >>> ( nrows, ncols, a, lda, b, ldb, index ); } diff --git a/src/ssids/solve.cu b/src/ssids/solve.cu index 7ac4357a1d..3ad774eb45 100644 --- a/src/ssids/solve.cu +++ b/src/ssids/solve.cu @@ -6,13 +6,14 @@ * Jeremey Appleyard NVIDIA * * This code has not yet been publically released under any licence. 
+ * This version: GALAHAD 4.3 - 2024-02-03 AT 09:50 GMT */ #include #include "spral_cuda_cuda_check.h" +#include "ssids_rip.hxx" #ifdef SPRAL_SINGLE -#define precision_ float #define gather gather_single #define gemv_transpose_lookup gemv_transpose_lookup_single #define gemv_transpose_sps_rhs gemv_transpose_sps_rhs_single @@ -39,7 +40,6 @@ #define spral_ssids_run_fwd_solve_kernels spral_ssids_run_fwd_solve_kernels_single #define spral_ssids_run_slv_contrib_fwd spral_ssids_run_slv_contrib_fwd_single #else -#define precision_ double #define gather gather_double #define gemv_transpose_lookup gemv_transpose_lookup_double #define gemv_transpose_sps_rhs gemv_transpose_sps_rhs_double @@ -89,11 +89,11 @@ using namespace spral::ssids::gpu; namespace /* anon */ { /* Perform the assignment xdense(:) = xsparse( idx(:) ) */ -template -void __device__ gather(const int n, const int *const idx, const precision_ *const xsparse, - volatile precision_ *const xdense) { - int tid = threadsx*threadIdx.y + threadIdx.x; - for(int i=tid; i +void __device__ gather(const ipc_ n, const ipc_ *const idx, const rpc_ *const xsparse, + volatile rpc_ *const xdense) { + ipc_ tid = threadsx*threadIdx.y + threadIdx.x; + for(ipc_ i=tid; i +template __launch_bounds__(threadsx*threadsy, 6) void __global__ gemv_transpose_sps_rhs(struct gemv_transpose_lookup *lookup, - precision_ *x, precision_ *y + rpc_ *x, rpc_ *y ) { // Reuse shmem for two different purposes - __shared__ volatile precision_ shmem[maxn*threadsx]; - volatile precision_ *const partSum = shmem; - volatile precision_ *const xlocal = shmem; + __shared__ volatile rpc_ shmem[maxn*threadsx]; + volatile rpc_ *const partSum = shmem; + volatile rpc_ *const xlocal = shmem; - precision_ partSumReg[maxn / threadsy]; // Assumes neat division + rpc_ partSumReg[maxn / threadsy]; // Assumes neat division lookup += blockIdx.x; - int m = lookup->m; - int n = lookup->n; - const precision_ *a = lookup->a; - const int *rlist = lookup->rlist; - int lda = lookup->lda; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + const rpc_ *a = lookup->a; + const ipc_ *rlist = lookup->rlist; + ipc_ lda = lookup->lda; y += lookup->yoffset; - /* Read x(rlist(:)) into xlocal(:) */ + /* Read x(rlist(:)) ipc_o xlocal(:) */ gather (m, rlist, x, xlocal); __syncthreads(); /* Perform matrix-vector multiply with answer y in register that is then stored in partSum for later reduction. */ if(m==maxm) { - volatile precision_ *const xl = xlocal + threadIdx.x; + volatile rpc_ *const xl = xlocal + threadIdx.x; #pragma unroll - for(int iLoop=0; iLoop= threadsx ? j - threadsx : j); + for(ipc_ j=threadIdx.x; j= threadsx ? j - threadsx : j); val += partSum[i*threadsx+j2]; } y[i] = val; @@ -202,13 +202,13 @@ void __global__ gemv_transpose_sps_rhs(struct gemv_transpose_lookup *lookup, /***********************************************************************/ struct reducing_d_solve_lookup { - int first_idx; // Index of supernode for thread 0 of this block. - int m; // Number of columns in upd to reduce. - int n; // Number of rows THIS BLOCK is responisble for. - int ldupd; // Leading dimension of upd. - int updoffset; // Offset into upd for supernode. - const precision_ *d; - const int *perm; // Offset into perm for supernode. + ipc_ first_idx; // Index of supernode for thread 0 of this block. + ipc_ m; // Number of columns in upd to reduce. + ipc_ n; // Number of rows THIS BLOCK is responisble for. + ipc_ ldupd; // Leading dimension of upd. + ipc_ updoffset; // Offset into upd for supernode. 
+ const rpc_ *d; + const ipc_ *perm; // Offset into perm for supernode. }; /* This subroutine performs two unrelated tasks and subtracts the result of the @@ -222,40 +222,40 @@ struct reducing_d_solve_lookup { */ template void __global__ reducing_d_solve(struct reducing_d_solve_lookup *lookup, - precision_ *upd, const precision_ *x + rpc_ *upd, const rpc_ *x ) { /* Read details from lookup */ lookup += blockIdx.x; - int idx = lookup->first_idx + threadIdx.x; - int m = lookup->m; - int n = lookup->n; - int ldupd = lookup->ldupd; + ipc_ idx = lookup->first_idx + threadIdx.x; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + ipc_ ldupd = lookup->ldupd; upd += lookup->updoffset; - const precision_ *d = lookup->d; - const int *perm = lookup->perm; + const rpc_ *d = lookup->d; + const ipc_ *perm = lookup->perm; /* Don't do anything on threads past end of arrays */ if(threadIdx.x>=m) return; /* Task 1: Sum upd and negate */ - precision_ val = upd[idx]; - for(int j=1; j void __global__ d_solve(struct reducing_d_solve_lookup *lookup, - const precision_ *x, precision_ *y) { + const rpc_ *x, rpc_ *y) { /* Read details from lookup */ lookup += blockIdx.x; - int idx = lookup->first_idx + threadIdx.x; - int m = lookup->m; - const precision_ *d = lookup->d; - const int *perm = lookup->perm; + ipc_ idx = lookup->first_idx + threadIdx.x; + ipc_ m = lookup->m; + const rpc_ *d = lookup->d; + const ipc_ *perm = lookup->perm; /* Don't do anything on threads past end of arrays */ if(threadIdx.x>=m) return; /* D solve (note that D is actually stored as inverse already) */ - int rp = perm[idx]; - precision_ val; + ipc_ rp = perm[idx]; + rpc_ val; if(idx!=0 && d[2*idx-1] != 0) { /* second part of 2x2 */ - int rp2 = perm[idx-1]; + ipc_ rp2 = perm[idx-1]; val = d[2*idx-1] * x[rp2] + d[2*idx] * x[rp]; } else if (d[2*idx+1] != 0) { /* first part of 2x2 */ - int rp2 = perm[idx+1]; + ipc_ rp2 = perm[idx+1]; val = d[2*idx] * x[rp] + d[2*idx+1] * x[rp2]; } else { @@ -321,44 +321,44 @@ void __global__ d_solve(struct reducing_d_solve_lookup *lookup, /***********************************************************************/ struct scatter_lookup { - int n; - int src_offset; - const int *index; - int dest_offset; + ipc_ n; + ipc_ src_offset; + const ipc_ *index; + ipc_ dest_offset; }; /* This subroutine performs the scatter operation dest( index(:) ) = src(:) */ -void __global__ scatter(struct scatter_lookup *lookup, const precision_ *src, - precision_ *dest +void __global__ scatter(struct scatter_lookup *lookup, const rpc_ *src, + rpc_ *dest ) { lookup += blockIdx.x; if(threadIdx.x >= lookup->n) return; // Skip on out of range threads src += lookup->src_offset; - const int *index = lookup->index; + const ipc_ *index = lookup->index; dest += lookup->dest_offset; - int idx = index[threadIdx.x]; + ipc_ idx = index[threadIdx.x]; dest[idx] = src[threadIdx.x]; } /* This subroutine performs the scatter operation dest( index(:) ) += src(:) */ -void __global__ scatter_sum(struct scatter_lookup *lookup, const precision_ *src, - precision_ *dest +void __global__ scatter_sum(struct scatter_lookup *lookup, const rpc_ *src, + rpc_ *dest ) { lookup += blockIdx.x; if(threadIdx.x >= lookup->n) return; // Skip on out of range threads src += lookup->src_offset; - const int *index = lookup->index; + const ipc_ *index = lookup->index; dest += lookup->dest_offset; - int idx = index[threadIdx.x]; + ipc_ idx = index[threadIdx.x]; dest[idx] += src[threadIdx.x]; } @@ -368,10 +368,10 @@ void __global__ scatter_sum(struct scatter_lookup *lookup, const 
precision_ *src /***********************************************************************/ struct lookups_gpu_bwd { - int ngemv; - int nrds; - int ntrsv; - int nscatter; + ipc_ ngemv; + ipc_ nrds; + ipc_ ntrsv; + ipc_ nscatter; struct gemv_transpose_lookup *gemv; struct reducing_d_solve_lookup *rds; struct trsv_lookup *trsv; @@ -383,34 +383,34 @@ struct lookups_gpu_bwd { * Result y actually output as array with leading dimn m that must be summed * externally. */ -template -void __global__ simple_gemv(int m, int n, const precision_ *a, int lda, - const precision_ *x, precision_ *y) { +template +void __global__ simple_gemv(ipc_ m, ipc_ n, const rpc_ *a, ipc_ lda, + const rpc_ *x, rpc_ *y) { a += blockIdx.x*maxm + (blockIdx.y*maxn)*lda; x += blockIdx.y*maxn; y += m*blockIdx.y + maxm*blockIdx.x; - __shared__ volatile precision_ partSum[maxm*threadsy]; + __shared__ volatile rpc_ partSum[maxm*threadsy]; m = MIN(maxm, m-blockIdx.x*maxm); n = MIN(maxn, n-blockIdx.y*maxn); - volatile precision_ *const ps = partSum + maxm*threadIdx.y; - for(int j=threadIdx.x; j -void __global__ simple_gemv_lookup(const precision_ *x, precision_ *y, +template +void __global__ simple_gemv_lookup(const rpc_ *x, rpc_ *y, struct gemv_notrans_lookup *lookup) { lookup += blockIdx.x; - int m = lookup->m; - int n = lookup->n; - precision_ const* a = lookup->a; - int lda = lookup->lda; + ipc_ m = lookup->m; + ipc_ n = lookup->n; + rpc_ const* a = lookup->a; + ipc_ lda = lookup->lda; x += lookup->x_offset; y += lookup->y_offset; - __shared__ volatile precision_ partSum[maxm*threadsy]; + __shared__ volatile rpc_ partSum[maxm*threadsy]; - volatile precision_ *const ps = partSum + maxm*threadIdx.y; + volatile rpc_ *const ps = partSum + maxm*threadIdx.y; // Templated parameters for shortcut if (maxm <= threadsx) { ps[threadIdx.x] = 0; } else { - for(int j=threadIdx.x; j= numLookups) return; lookup += offset; - int m = lookup->m; + ipc_ m = lookup->m; if(threadIdx.x>=m) return; - int n = lookup->n; + ipc_ n = lookup->n; src += lookup->src_offset + threadIdx.x; - int ldsrc = lookup->ldsrc; - precision_ *d = dest[lookup->dest_idx] + lookup->dest_offset; + ipc_ ldsrc = lookup->ldsrc; + rpc_ *d = dest[lookup->dest_idx] + lookup->dest_offset; - precision_ val = 0; - for(int i=0; icp; - int blk = blkdata->blk; - int m = lookup->m; - int nelim = lookup->nelim; - precision_ *xparent = cvalues[lookup->cvparent]; - volatile const precision_ *xchild = cvalues[lookup->cvchild]; - const int * list = *(lookup->list); + ipc_ blk = blkdata->blk; + ipc_ m = lookup->m; + ipc_ nelim = lookup->nelim; + rpc_ *xparent = cvalues[lookup->cvparent]; + volatile const rpc_ *xchild = cvalues[lookup->cvchild]; + const ipc_ * list = *(lookup->list); xlocal += lookup->x_offset; // Wait for previous children to complete @@ -580,8 +580,8 @@ void __global__ assemble_lvl(struct assemble_lookup2 *lookup, struct assemble_bl xchild += blk*ASSEMBLE_NB; // Perform actual assembly - for(int i=threadIdx.x; isync_offset]), 1); + atomicAdd((ipc_*)&(sync[lookup->sync_offset]), 1); } } -void __global__ grabx(precision_ *xlocal, precision_ **xstack, const precision_ *x, +void __global__ grabx(rpc_ *xlocal, rpc_ **xstack, const rpc_ *x, struct assemble_lookup *lookup) { lookup += blockIdx.x; if(threadIdx.x>=lookup->m) return; - int xend = lookup->xend; - precision_ *contrib = + ipc_ xend = lookup->xend; + rpc_ *contrib = (threadIdx.x>=xend) ? 
xstack[lookup->contrib_idx]+lookup->contrib_offset : NULL; xlocal += lookup->x_offset; - int row = lookup->list[threadIdx.x]; + ipc_ row = lookup->list[threadIdx.x]; if(threadIdx.x0) { - for(int i=0; i>> (sync+2*i); CudaCheckError(); } - for(int i=0; inassemble; i+=65535) + for(ipc_ i=0; inassemble; i+=65535) grabx <<nassemble-i), ASSEMBLE_NB, 0, *stream>>> (xlocal_gpu, xstack_gpu, x_gpu, gpu->assemble+i); - cudaMemset(asm_sync, 0, (1+gpu->nasm_sync)*sizeof(int)); - for(int i=0; inasmblk; i+=65535) + cudaMemset(asm_sync, 0, (1+gpu->nasm_sync)*sizeof(ipc_)); + for(ipc_ i=0; inasmblk; i+=65535) assemble_lvl <<nasmblk-i), ASSEMBLE_NB, 0, *stream>>> (gpu->assemble2, gpu->asmblk, xlocal_gpu, &asm_sync[0], &asm_sync[1], cvalues_gpu); CudaCheckError(); if(gpu->ntrsv>0) { if(posdef) { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_ln_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (xlocal_gpu, sync, gpu->trsv+i); } else { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_ln_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (xlocal_gpu, sync, gpu->trsv+i); } CudaCheckError(); } if(gpu->ngemv>0) { - for(int i=0; ingemv; i+=65535) + for(ipc_ i=0; ingemv; i+=65535) simple_gemv_lookup <<ngemv-i), dim3(GEMV_THREADSX,GEMV_THREADSY), 0, *stream>>> @@ -699,15 +699,15 @@ void spral_ssids_run_fwd_solve_kernels(bool posdef, (work_gpu, cvalues_gpu, gpu->nreduce, gpu->reduce); CudaCheckError(); } - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, xlocal_gpu, x_gpu); CudaCheckError(); } -void spral_ssids_run_d_solve_kernel(precision_ *x_gpu, - precision_ *y_gpu, struct lookups_gpu_bwd *gpu, +void spral_ssids_run_d_solve_kernel(rpc_ *x_gpu, + rpc_ *y_gpu, struct lookups_gpu_bwd *gpu, const cudaStream_t *stream) { if(gpu->nrds>0) { @@ -720,18 +720,18 @@ void spral_ssids_run_d_solve_kernel(precision_ *x_gpu, } void spral_ssids_run_bwd_solve_kernels(bool dsolve, - bool unit_diagonal, precision_ *x_gpu, precision_ *work_gpu, - int nsync, int *sync_gpu, struct lookups_gpu_bwd *gpu, + bool unit_diagonal, rpc_ *x_gpu, rpc_ *work_gpu, + ipc_ nsync, ipc_ *sync_gpu, struct lookups_gpu_bwd *gpu, const cudaStream_t *stream) { /* === Kernel Launches === */ if(nsync>0) { - for(int i=0; i>> (sync_gpu+2*i); CudaCheckError(); } if(gpu->ngemv>0) { - for(int i=0; ingemv; i+=65535) + for(ipc_ i=0; ingemv; i+=65535) gemv_transpose_sps_rhs <<ngemv-i), dim3(TRSM_TR_THREADSX,TRSM_TR_THREADSY), 0, *stream>>> @@ -741,13 +741,13 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, if(gpu->nrds>0) { if(dsolve) { - for(int i=0; inrds; i+=65535) + for(ipc_ i=0; inrds; i+=65535) reducing_d_solve <<nrds-i), REDUCING_D_SOLVE_THREADS_PER_BLOCK, 0, *stream>>> (gpu->rds+i, work_gpu, x_gpu); } else { - for(int i=0; inrds; i+=65535) + for(ipc_ i=0; inrds; i+=65535) reducing_d_solve <<nrds-i), REDUCING_D_SOLVE_THREADS_PER_BLOCK, 0, *stream>>> @@ -758,15 +758,15 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, if(gpu->ntrsv>0) { if(unit_diagonal) { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_lt_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (gpu->trsv+i, work_gpu, sync_gpu); } else { - for(int i=0; intrsv; i+=65535) + for(ipc_ i=0; intrsv; i+=65535) trsv_lt_exec - + <<ntrsv-i), dim3(THREADSX_TASK,THREADSY_TASK), 0, *stream>>> (gpu->trsv+i, work_gpu, sync_gpu); } @@ -774,7 +774,7 @@ void 
spral_ssids_run_bwd_solve_kernels(bool dsolve, } if(gpu->nscatter>0) { - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, work_gpu, x_gpu); @@ -784,10 +784,10 @@ void spral_ssids_run_bwd_solve_kernels(bool dsolve, void spral_ssids_run_slv_contrib_fwd( struct lookup_contrib_fwd const* gpu, - precision_* x_gpu, precision_ const* xstack_gpu, + rpc_* x_gpu, rpc_ const* xstack_gpu, const cudaStream_t *stream) { if(gpu->nscatter>0) { - for(int i=0; inscatter; i+=65535) + for(ipc_ i=0; inscatter; i+=65535) scatter_sum <<nscatter-i), SCATTER_NB, 0, *stream>>> (gpu->scatter+i, xstack_gpu, x_gpu); diff --git a/src/ssids/syrk.cu b/src/ssids/syrk.cu index 59d7413429..235ee9f3d6 100644 --- a/src/ssids/syrk.cu +++ b/src/ssids/syrk.cu @@ -2,17 +2,18 @@ * Copyright (c) 2013 NVIDIA * Authors: Evgueni Ovtchinnikov (STFC) * Jeremy Appleyard (NVIDIA) + * This version: GALAHAD 4.3 - 2024-02-03 AT 09:40 GMT */ #include #include #include +#include "ssids_rip.hxx" #include "ssids_gpu_kernels_datatypes.h" #include "spral_cuda_cuda_check.h" #ifdef SPRAL_SINGLE -#define precision_ float #define loadDevToSmem_generic loadDevToSmem_generic_single #define multisyrk_type multisyrk_type_single #define multielm_data multielm_data_single @@ -23,7 +24,6 @@ #define spral_ssids_multidsyrk spral_ssids_multidsyrk_single #define spral_ssids_multidsyrk_low_col spral_ssids_multidsyrk_low_col_single #else -#define precision_ double #define loadDevToSmem_generic loadDevToSmem_generic_double #define multisyrk_type multisyrk_type_double #define multielm_data multielm_data_double @@ -50,18 +50,18 @@ namespace /* anon */ { -template< int WIDTH > +template< ipc_ WIDTH > inline __device__ void -loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile precision_ *const __restrict__ bs, - const precision_* __restrict__ a, const precision_* __restrict__ b, - int bx, int by, int offa, int lda, int ldb, - int n, int i, int k) +loadDevToSmem_generic( volatile rpc_ *const __restrict__ as, volatile rpc_ *const __restrict__ bs, + const rpc_* __restrict__ a, const rpc_* __restrict__ b, + ipc_ bx, ipc_ by, ipc_ offa, ipc_ lda, ipc_ ldb, + ipc_ n, ipc_ i, ipc_ k) { switch (WIDTH) { case 4: if ( i + 3 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -70,7 +70,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -81,7 +81,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else if ( i + 2 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -90,7 +90,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + 
(threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -101,7 +101,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else if ( i + 1 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; @@ -110,7 +110,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -121,7 +121,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } else { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = 0.0; @@ -130,7 +130,7 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = 0.0; @@ -144,14 +144,14 @@ loadDevToSmem_generic( volatile precision_ *const __restrict__ as, volatile prec case 2: if ( i + 1 < k ) { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = a[offa + x + (i + 1)*lda]; } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = b[offa + x + (i + 1)*ldb]; @@ -160,14 +160,14 @@ case 2: } else { if ( threadIdx.y < 4 ) { - int x = threadIdx.x + (threadIdx.y + bx*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y + bx*4)*8; if ( x < n ) { as[threadIdx.x + threadIdx.y*8 ] = a[offa + x + i*lda]; as[threadIdx.x + threadIdx.y*8 + 32] = 0.0; } } else { - int x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; + ipc_ x = threadIdx.x + (threadIdx.y - 4 + by*4)*8; if ( x < n ) { bs[threadIdx.x + (threadIdx.y - 4)*8 ] = b[offa + x + i*ldb]; bs[threadIdx.x + (threadIdx.y - 4)*8 + 32] = 0.0; @@ -182,14 +182,14 @@ case 2: } struct multisyrk_type { - int first; - precision_ *lval; - precision_ *ldval; + ipc_ first; + rpc_ *lval; + rpc_ *ldval; long offc; - int n; - int k; - int lda; - int ldb; + ipc_ n; + ipc_ k; + ipc_ lda; + ipc_ ldb; }; // multisyrk kernels below compute the low trangular part of a*b^T @@ -201,11 +201,12 @@ __launch_bounds__(64, 14) #endif __global__ void cu_multisyrk_lc_r4x4( - const struct multisyrk_type* msdata, int off, ELEMENT_TYPE* c + const struct multisyrk_type* msdata, ipc_ off, ELEMENT_TYPE* c ){ -// The number of elements we want in each shared memory buffer depends on the shared memory:register ratio -// SM 3.0+ has precision_ the number of registers per shared memory, so need half the shared memory here. 
+// The number of elements we want in each shared memory buffer depends on +// the shared memory:register ratio. SM 3.0+ has double the number of +// registers per shared memory, so need half the shared memory here. #if SM_3X #define SYRK_WIDTH 4 #define DOUBLE_BUFFERED 0 @@ -232,22 +233,22 @@ cu_multisyrk_lc_r4x4( #endif msdata += blockIdx.x; - int first = msdata->first; + ipc_ first = msdata->first; const ELEMENT_TYPE * __restrict__ a = msdata->lval; const ELEMENT_TYPE * __restrict__ b = msdata->ldval; - int offc = msdata->offc; - int n = msdata->n; - int k = msdata->k; - int lda = msdata->lda; - int ldb = msdata->ldb; + ipc_ offc = msdata->offc; + ipc_ n = msdata->n; + ipc_ k = msdata->k; + ipc_ lda = msdata->lda; + ipc_ ldb = msdata->ldb; if ( n < 1 ) return; - int bx, by; + ipc_ bx, by; { - int nb = (n - 1)/32 + 1; + ipc_ nb = (n - 1)/32 + 1; for ( bx = 0, by = 0; by < nb; by++ ) { if ( off + blockIdx.x - first - bx < nb - by ) { bx = off + blockIdx.x - first - bx + by; @@ -259,23 +260,23 @@ cu_multisyrk_lc_r4x4( #if (USE_DOUBLE2) double2 s[8]; - for ( int i = 0; i < 8; i++ ) { + for ( ipc_ i = 0; i < 8; i++ ) { s[i].x = 0.0; s[i].y = 0.0; } #else ELEMENT_TYPE s[16]; - for ( int i = 0; i < 16; i++ ) + for ( ipc_ i = 0; i < 16; i++ ) s[i] = 0.0; #endif #if (SYRK_WIDTH <= 2 && DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, 0, k ); #endif - for ( int i = 0; i < k; i += SYRK_WIDTH ) { + for ( ipc_ i = 0; i < k; i += SYRK_WIDTH ) { @@ -286,21 +287,21 @@ cu_multisyrk_lc_r4x4( // challenge to get it working without spilling. #if (DOUBLE_BUFFERED) if ( i + SYRK_WIDTH < k ) { - loadDevToSmem_generic( (volatile precision_*)as2, bs2, + loadDevToSmem_generic( (volatile rpc_*)as2, bs2, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); } #endif // (DOUBLE_BUFFERED) #if (SYRK_WIDTH > 2 || DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i, k ); #endif __syncthreads(); #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { #if (USE_DOUBLE2) s[iy*2 ].x += as[threadIdx.x + ix * 16 ].x*bs[threadIdx.y + 8*iy + ix * 32]; s[iy*2 ].y += as[threadIdx.x + ix * 16 ].y*bs[threadIdx.y + 8*iy + ix * 32]; @@ -324,13 +325,13 @@ cu_multisyrk_lc_r4x4( __syncthreads(); if ( i + SYRK_WIDTH < k ) { #if (SYRK_WIDTH <= 2) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, 0, lda, ldb, n, i + SYRK_WIDTH, k ); #endif } #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { #if (USE_DOUBLE2) s[iy*2 ].x += as2[threadIdx.x + ix * 16 ].x*bs2[threadIdx.y + 8*iy + ix * 32]; s[iy*2 ].y += as2[threadIdx.x + ix * 16 ].y*bs2[threadIdx.y + 8*iy + ix * 32]; @@ -351,10 +352,10 @@ cu_multisyrk_lc_r4x4( } #if (USE_DOUBLE2) - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 2; ix++ ) { - int x = threadIdx.x * 2 + ix*16 + bx*32; - int y = threadIdx.y + iy*8 + by*32; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 2; ix++ ) { + ipc_ x = threadIdx.x * 2 + ix*16 + bx*32; + ipc_ y = threadIdx.y + iy*8 + by*32; if ( x < n && y < n && y <=
x ) { c[offc + x + y*n] = -s[ix + iy*2].x; } @@ -366,38 +367,38 @@ cu_multisyrk_lc_r4x4( } } #else - int xMaxBase = (3 + bx*4)*8; - int yMaxBase = (3 + by*4)*8; + ipc_ xMaxBase = (3 + bx*4)*8; + ipc_ yMaxBase = (3 + by*4)*8; - int XNPass = xMaxBase + 8 < n; - int YNPass = yMaxBase + 8 < n; - int YXPass = yMaxBase + 8 <= xMaxBase; + ipc_ XNPass = xMaxBase + 8 < n; + ipc_ YNPass = yMaxBase + 8 < n; + ipc_ YXPass = yMaxBase + 8 <= xMaxBase; // This is only a small improvement (~1%) if (XNPass && YNPass && YXPass) { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; c[offc + x + y*n] = -s[ix + iy*4]; } } } else if (XNPass && YNPass) { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( y <= x ) c[offc + x + y*n] = -s[ix + iy*4]; } } } else { - for ( int iy = 0; iy < 4; iy++ ) { - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( x < n && y < n && y <= x ) c[offc + x + y*n] = -s[ix + iy*4]; } @@ -412,8 +413,8 @@ cu_multisyrk_lc_r4x4( } struct multielm_data { - int node; - int offb; + ipc_ node; + ipc_ offb; }; template< typename ELEMENT_TYPE > @@ -423,16 +424,16 @@ template< typename ELEMENT_TYPE > __global__ void cu_multisyrk_r4x4( bool posdef, - int* stat, + ipc_* stat, multielm_data* mdata, - int off, + ipc_ off, struct multinode_fact_type *ndatat ){ - int bx, by; - int n, m, k; - int offa, offc; - int lda, ldb; - int nb; + ipc_ bx, by; + ipc_ n, m, k; + ipc_ offa, offc; + ipc_ lda, ldb; + ipc_ nb; ELEMENT_TYPE s[16]; #if SM_3X #define SYRK_WIDTH 2 @@ -467,9 +468,9 @@ cu_multisyrk_r4x4( if ( by >= n || by >= m ) return; - const precision_ * __restrict__ a = ndatat->lval; - const precision_ * __restrict__ b = posdef ? ndatat->lval : ndatat->ldval; - precision_ * __restrict__ c = ndatat->lval; + const rpc_ * __restrict__ a = ndatat->lval; + const rpc_ * __restrict__ b = posdef ? 
ndatat->lval : ndatat->ldval; + rpc_ * __restrict__ c = ndatat->lval; offa = by + lda*n; offc = by + by*n; @@ -486,17 +487,17 @@ cu_multisyrk_r4x4( bx = by%nb; by = by/nb; - for ( int i = 0; i < 16; i++ ) { + for ( ipc_ i = 0; i < 16; i++ ) { s[i] = 0.0; } #if (DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, offa, lda, ldb, n, 0, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, offa, lda, ldb, n, 0, k ); #endif - for ( int i = 0; i < k; i += SYRK_WIDTH ) { + for ( ipc_ i = 0; i < k; i += SYRK_WIDTH ) { #if (!DOUBLE_BUFFERED) - loadDevToSmem_generic( (volatile precision_*)as, bs, a, b, bx, by, offa, lda, ldb, n, i, k ); + loadDevToSmem_generic( (volatile rpc_*)as, bs, a, b, bx, by, offa, lda, ldb, n, i, k ); #endif __syncthreads(); @@ -508,8 +509,8 @@ cu_multisyrk_r4x4( #endif #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 32 * ix ]*bs[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 1] += as[threadIdx.x + 32 * ix + 8 ]*bs[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 2] += as[threadIdx.x + 32 * ix + 16]*bs[threadIdx.y + 8*iy + 32 * ix]; @@ -529,8 +530,8 @@ cu_multisyrk_r4x4( } #pragma unroll - for ( int ix = 0; ix < SYRK_WIDTH; ix++) { - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ ix = 0; ix < SYRK_WIDTH; ix++) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as2[threadIdx.x + 32 * ix ]*bs2[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 1] += as2[threadIdx.x + 32 * ix + 8 ]*bs2[threadIdx.y + 8*iy + 32 * ix]; s[iy*4 + 2] += as2[threadIdx.x + 32 * ix + 16]*bs2[threadIdx.y + 8*iy + 32 * ix]; @@ -540,10 +541,10 @@ cu_multisyrk_r4x4( #endif } - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + bx*4)*8; - int y = threadIdx.y + (iy + by*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + bx*4)*8; + ipc_ y = threadIdx.y + (iy + by*4)*8; if ( x < n && y < m ) c[offc + x + y*lda] = c[offc + x + y*lda] - s[ix + iy*4]; } @@ -552,45 +553,45 @@ cu_multisyrk_r4x4( template< typename ELEMENT_TYPE > __global__ void cu_syrk_r4x4( - int n, int m, int k, - precision_ alpha, const precision_* a, int lda, const precision_* b, int ldb, - precision_ beta, precision_* c, int ldc + ipc_ n, ipc_ m, ipc_ k, + rpc_ alpha, const rpc_* a, ipc_ lda, const rpc_* b, ipc_ ldb, + rpc_ beta, rpc_* c, ipc_ ldc ){ ELEMENT_TYPE s[16]; __shared__ volatile ELEMENT_TYPE as[128], bs[128]; - for ( int i = 0; i < 16; i++ ) + for ( ipc_ i = 0; i < 16; i++ ) s[i] = 0; - for ( int i = 0; i < k; i += 4 ) { + for ( ipc_ i = 0; i < k; i += 4 ) { loadDevToSmem_generic< 4 >( as, bs, a, b, blockIdx.x, blockIdx.y, 0, lda, ldb, n, i, k ); __syncthreads(); - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x ]*bs[threadIdx.y + 8*iy]; s[iy*4 + 1] += as[threadIdx.x + 8 ]*bs[threadIdx.y + 8*iy]; s[iy*4 + 2] += as[threadIdx.x + 16]*bs[threadIdx.y + 8*iy]; s[iy*4 + 3] += as[threadIdx.x + 24]*bs[threadIdx.y + 8*iy]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 32]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 1] += as[threadIdx.x + 40]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 2] += as[threadIdx.x + 48]*bs[threadIdx.y + 8*iy + 32]; s[iy*4 + 3] += as[threadIdx.x + 56]*bs[threadIdx.y + 8*iy + 32]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 
0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 64]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 1] += as[threadIdx.x + 72]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 2] += as[threadIdx.x + 80]*bs[threadIdx.y + 8*iy + 64]; s[iy*4 + 3] += as[threadIdx.x + 88]*bs[threadIdx.y + 8*iy + 64]; } - for ( int iy = 0; iy < 4; iy++ ) { + for ( ipc_ iy = 0; iy < 4; iy++ ) { s[iy*4] += as[threadIdx.x + 96 ]*bs[threadIdx.y + 8*iy + 96]; s[iy*4 + 1] += as[threadIdx.x + 104]*bs[threadIdx.y + 8*iy + 96]; s[iy*4 + 2] += as[threadIdx.x + 112]*bs[threadIdx.y + 8*iy + 96]; @@ -601,19 +602,19 @@ cu_syrk_r4x4( } if ( beta ) { - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + blockIdx.x*4)*8; - int y = threadIdx.y + (iy + blockIdx.y*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + blockIdx.x*4)*8; + ipc_ y = threadIdx.y + (iy + blockIdx.y*4)*8; if ( x < n && y < m ) c[x + y*ldc] = beta*c[x + y*ldc] + alpha*s[ix + iy*4]; } } else { - for ( int iy = 0; iy < 4; iy++ ) - for ( int ix = 0; ix < 4; ix++ ) { - int x = threadIdx.x + (ix + blockIdx.x*4)*8; - int y = threadIdx.y + (iy + blockIdx.y*4)*8; + for ( ipc_ iy = 0; iy < 4; iy++ ) + for ( ipc_ ix = 0; ix < 4; ix++ ) { + ipc_ x = threadIdx.x + (ix + blockIdx.x*4)*8; + ipc_ y = threadIdx.y + (iy + blockIdx.y*4)*8; if ( x < n && y < m ) c[x + y*ldc] = alpha*s[ix + iy*4]; } @@ -629,36 +630,36 @@ cu_syrk_r4x4( extern "C" { -void spral_ssids_dsyrk(cudaStream_t *stream, int n, int m, int k, - precision_ alpha, const precision_* a, int lda, const precision_* b, - int ldb, precision_ beta, precision_* c, int ldc) { - int nx, ny; +void spral_ssids_dsyrk(cudaStream_t *stream, ipc_ n, ipc_ m, ipc_ k, + rpc_ alpha, const rpc_* a, ipc_ lda, const rpc_* b, + ipc_ ldb, rpc_ beta, rpc_* c, ipc_ ldc) { + ipc_ nx, ny; nx = (n - 1)/32 + 1; ny = (m - 1)/32 + 1; dim3 threads(8,8); dim3 grid(nx,ny); - cu_syrk_r4x4< precision_ > <<< grid, threads, 0, *stream >>> + cu_syrk_r4x4< rpc_ > <<< grid, threads, 0, *stream >>> ( n, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); } -void spral_ssids_multidsyrk(cudaStream_t *stream, bool posdef, int nb, - int* stat, struct multielm_data* mdata, +void spral_ssids_multidsyrk(cudaStream_t *stream, bool posdef, ipc_ nb, + ipc_* stat, struct multielm_data* mdata, struct multinode_fact_type *ndata) { dim3 threads(8,8); - for ( int i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { - int blocks = min(MAX_CUDA_BLOCKS, nb - i); - cu_multisyrk_r4x4< precision_ > + for ( ipc_ i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { + ipc_ blocks = min(MAX_CUDA_BLOCKS, nb - i); + cu_multisyrk_r4x4< rpc_ > <<< blocks, threads, 0, *stream >>> ( posdef, stat, mdata + i, i, ndata ); } } -void spral_ssids_multidsyrk_low_col(cudaStream_t *stream, int nb, - struct multisyrk_type* msdata, precision_* c) { +void spral_ssids_multidsyrk_low_col(cudaStream_t *stream, ipc_ nb, + struct multisyrk_type* msdata, rpc_* c) { dim3 threads(8,8); - for ( int i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { - int blocks = min(MAX_CUDA_BLOCKS, nb - i); - cu_multisyrk_lc_r4x4< precision_ > + for ( ipc_ i = 0; i < nb; i += MAX_CUDA_BLOCKS ) { + ipc_ blocks = min(MAX_CUDA_BLOCKS, nb - i); + cu_multisyrk_lc_r4x4< rpc_ > <<< blocks, threads, 0, *stream >>>( msdata + i, i, c ); } } diff --git a/src/ssids/wrappers.cxx b/src/ssids/wrappers.cxx index 813e005c2f..1222328e20 100644 --- a/src/ssids/wrappers.cxx +++ b/src/ssids/wrappers.cxx @@ -2,6 +2,7 @@ * \copyright 2016 The Science and Technology Facilities Council (STFC) * 
\licence BSD licence, see LICENCE file for details * \author Jonathan Hogg + * \version GALAHAD 4.3 - 2024-02-03 AT 11:30 GMT */ #include "ssids_cpu_kernels_wrappers.hxx" @@ -11,34 +12,34 @@ /* ================ SINGLE PRECISION WITH 64 BIT INTEGERS =================== */ -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 extern "C" { void spral_c_sgemm_64(char* transa, char* transb, - int64_t* m, int64_t* n, int64_t* k, - float* alpha, const float* a, int64_t* lda, - const float* b, int64_t* ldb, float *beta, - float* c, int64_t* ldc); + int64_t* m, int64_t* n, int64_t* k, + float* alpha, const float* a, int64_t* lda, + const float* b, int64_t* ldb, float *beta, + float* c, int64_t* ldc); void spral_c_spotrf_64(char *uplo, int64_t *n, float *a, - int64_t *lda, int64_t *info); + int64_t *lda, int64_t *info); void spral_c_ssytrf_64(char *uplo, int64_t *n, float *a, - int64_t *lda, int64_t *ipiv, float *work, - int64_t *lwork, int64_t *info); + int64_t *lda, int64_t *ipiv, float *work, + int64_t *lwork, int64_t *info); void spral_c_strsm_64(char *side, char *uplo, char *transa, - char *diag, int64_t *m, int64_t *n, - const float *alpha, const float *a, - int64_t *lda, float *b, int64_t *ldb); + char *diag, int64_t *m, int64_t *n, + const float *alpha, const float *a, + int64_t *lda, float *b, int64_t *ldb); void spral_c_ssyrk_64(char *uplo, char *trans, - int64_t *n, int64_t *k, float *alpha, - const float *a, int64_t *lda, float *beta, - float *c, int64_t *ldc); + int64_t *n, int64_t *k, float *alpha, + const float *a, int64_t *lda, float *beta, + float *c, int64_t *ldc); void spral_c_strsv_64(char *uplo, char *trans, char *diag, - int64_t *n, const float *a, int64_t *lda, - float *x, int64_t *incx); + int64_t *n, const float *a, int64_t *lda, + float *x, int64_t *incx); void spral_c_sgemv_64(char *trans, int64_t *m, int64_t *n, - const float* alpha, const float* a, - int64_t *lda, const float* x, int64_t* incx, - const float* beta, float* y, int64_t* incy); + const float* alpha, const float* a, + int64_t *lda, const float* x, int64_t* incx, + const float* beta, float* y, int64_t* incy); } namespace spral { namespace ssids { namespace cpu { @@ -46,10 +47,10 @@ namespace spral { namespace ssids { namespace cpu { /* _GEMM */ template <> void host_gemm_64(enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, float alpha, - const float* a, int64_t lda, const float* b, - int64_t ldb, float beta, float* c, int64_t ldc) { + enum spral::ssids::cpu::operation transb, + int64_t m, int64_t n, int64_t k, float alpha, + const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc) { char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char ftransb = (transb==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_sgemm_64(&ftransa, &ftransb, &m, &n, &k, &alpha, a, &lda, @@ -59,9 +60,9 @@ void host_gemm_64(enum spral::ssids::cpu::operation transa, /* _GEMV */ template <> void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, float alpha, const float* a, - int64_t lda, const float* x, int64_t incx, - float beta, float* y, int64_t incy) { + int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, const float* x, int64_t incx, + float beta, float* y, int64_t incy) { char ftrans = (trans==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; spral_c_sgemv_64(&ftrans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); @@ -70,7 +71,7 @@ void gemv_64(enum spral::ssids::cpu::operation trans, /* _POTRF */ template<> int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, float* a, int64_t lda) { + int64_t n, float* a, int64_t lda) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -85,8 +86,8 @@ int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYTRF - Bunch-Kaufman factorization */ template<> int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, float* a, int64_t lda, - int64_t *ipiv, float* work, int64_t lwork) { + int64_t n, float* a, int64_t lda, + int64_t *ipiv, float* work, int64_t lwork) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -101,9 +102,9 @@ int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYRK */ template <> void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, float alpha, const float* a, - int64_t lda, float beta, float* c, int64_t ldc) { + enum spral::ssids::cpu::operation trans, + int64_t n, int64_t k, float alpha, const float* a, + int64_t lda, float beta, float* c, int64_t ldc) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_ssyrk_64(&fuplo, &ftrans, &n, &k, &alpha, a, &lda, &beta, c, &ldc); @@ -112,10 +113,10 @@ void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSV */ template <> void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - enum spral::ssids::cpu::diagonal diag, - int64_t n, const float* a, int64_t lda, - float* x, int64_t incx) { + enum spral::ssids::cpu::operation trans, + enum spral::ssids::cpu::diagonal diag, + int64_t n, const float* a, int64_t lda, + float* x, int64_t incx) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char fdiag = (diag==spral::ssids::cpu::DIAG_UNIT) ? 'U' : 'N'; @@ -125,11 +126,11 @@ void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSM */ template <> void host_trsm_64(enum spral::ssids::cpu::side side, - enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, float alpha, const float* a, - int64_t lda, float* b, int64_t ldb) { + enum spral::ssids::cpu::fillmode uplo, + enum spral::ssids::cpu::operation transa, + enum spral::ssids::cpu::diagonal diag, + int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, float* b, int64_t ldb) { char fside = (side==spral::ssids::cpu::SIDE_LEFT) ? 'L' : 'R'; char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftransa = (transa==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; @@ -277,34 +278,34 @@ void host_trsm(enum spral::ssids::cpu::side side, /* ================ DOUBLE PRECISION WITH 64 BIT INTEGERS =================== */ -#ifdef SPRAL_64BIT_INTEGER +#ifdef INTEGER_64 extern "C" { void spral_c_dgemm_64(char* transa, char* transb, - int64_t* m, int64_t* n, int64_t* k, - double* alpha, const double* a, int64_t* lda, - const double* b, int64_t* ldb, double *beta, - double* c, int64_t* ldc); + int64_t* m, int64_t* n, int64_t* k, + double* alpha, const double* a, int64_t* lda, + const double* b, int64_t* ldb, double *beta, + double* c, int64_t* ldc); void spral_c_dpotrf_64(char *uplo, int64_t *n, double *a, - int64_t *lda, int64_t *info); + int64_t *lda, int64_t *info); void spral_c_dsytrf_64(char *uplo, int64_t *n, double *a, - int64_t *lda, int64_t *ipiv, double *work, - int64_t *lwork, int64_t *info); + int64_t *lda, int64_t *ipiv, double *work, + int64_t *lwork, int64_t *info); void spral_c_dtrsm_64(char *side, char *uplo, char *transa, - char *diag, int64_t *m, int64_t *n, - const double *alpha, const double *a, - int64_t *lda, double *b, int64_t *ldb); + char *diag, int64_t *m, int64_t *n, + const double *alpha, const double *a, + int64_t *lda, double *b, int64_t *ldb); void spral_c_dsyrk_64(char *uplo, char *trans, - int64_t *n, int64_t *k, double *alpha, - const double *a, int64_t *lda, double *beta, - double *c, int64_t *ldc); + int64_t *n, int64_t *k, double *alpha, + const double *a, int64_t *lda, double *beta, + double *c, int64_t *ldc); void spral_c_dtrsv_64(char *uplo, char *trans, char *diag, - int64_t *n, const double *a, int64_t *lda, - double *x, int64_t *incx); + int64_t *n, const double *a, int64_t *lda, + double *x, int64_t *incx); void spral_c_dgemv_64(char *trans, int64_t *m, int64_t *n, - const double* alpha, const double* a, - int64_t *lda, const double* x, int64_t* incx, - const double* beta, double* y, int64_t* incy); + const double* alpha, const double* a, + int64_t *lda, const double* x, int64_t* incx, + const double* beta, double* y, int64_t* incy); } namespace spral { namespace ssids { namespace cpu { @@ -312,10 +313,10 @@ namespace spral { namespace ssids { namespace cpu { /* _GEMM */ template <> void host_gemm_64(enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::operation transb, - int64_t m, int64_t n, int64_t k, double alpha, - const double* a, int64_t lda, const double* b, - int64_t ldb, double beta, double* c, int64_t ldc) { + enum spral::ssids::cpu::operation transb, + int64_t m, int64_t n, int64_t k, double alpha, + const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc) { char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char ftransb = (transb==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_dgemm_64(&ftransa, &ftransb, &m, &n, &k, &alpha, a, &lda, @@ -325,9 +326,9 @@ void host_gemm_64(enum spral::ssids::cpu::operation transa, /* _GEMV */ template <> void gemv_64(enum spral::ssids::cpu::operation trans, - int64_t m, int64_t n, double alpha, const double* a, - int64_t lda, const double* x, int64_t incx, - double beta, double* y, int64_t incy) { + int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy) { char ftrans = (trans==spral::ssids::cpu::OP_N) ? 
'N' : 'T'; spral_c_dgemv_64(&ftrans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); @@ -336,7 +337,7 @@ void gemv_64(enum spral::ssids::cpu::operation trans, /* _POTRF */ template<> int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, double* a, int64_t lda) { + int64_t n, double* a, int64_t lda) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -351,8 +352,9 @@ int64_t lapack_potrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYTRF - Bunch-Kaufman factorization */ template<> int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, - int64_t n, double* a, int64_t lda, int64_t *ipiv, - double* work, int64_t lwork) { + int64_t n, double* a, + int64_t lda, int64_t *ipiv, + double* work, int64_t lwork) { char fuplo; switch(uplo) { case spral::ssids::cpu::FILL_MODE_LWR: fuplo = 'L'; break; @@ -367,9 +369,9 @@ int64_t lapack_sytrf_64(enum spral::ssids::cpu::fillmode uplo, /* _SYRK */ template <> void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - int64_t n, int64_t k, double alpha, const double* a, - int64_t lda, double beta, double* c, int64_t ldc) { + enum spral::ssids::cpu::operation trans, + int64_t n, int64_t k, double alpha, const double* a, + int64_t lda, double beta, double* c, int64_t ldc) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; spral_c_dsyrk_64(&fuplo, &ftrans, &n, &k, &alpha, a, &lda, &beta, c, &ldc); @@ -378,10 +380,10 @@ void host_syrk_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSV */ template <> void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation trans, - enum spral::ssids::cpu::diagonal diag, - int64_t n, const double* a, int64_t lda, - double* x, int64_t incx) { + enum spral::ssids::cpu::operation trans, + enum spral::ssids::cpu::diagonal diag, + int64_t n, const double* a, int64_t lda, + double* x, int64_t incx) { char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftrans = (trans==spral::ssids::cpu::OP_N) ? 'N' : 'T'; char fdiag = (diag==spral::ssids::cpu::DIAG_UNIT) ? 'U' : 'N'; @@ -391,11 +393,11 @@ void host_trsv_64(enum spral::ssids::cpu::fillmode uplo, /* _TRSM */ template <> void host_trsm_64(enum spral::ssids::cpu::side side, - enum spral::ssids::cpu::fillmode uplo, - enum spral::ssids::cpu::operation transa, - enum spral::ssids::cpu::diagonal diag, - int64_t m, int64_t n, double alpha, const double* a, - int64_t lda, double* b, int64_t ldb) { + enum spral::ssids::cpu::fillmode uplo, + enum spral::ssids::cpu::operation transa, + enum spral::ssids::cpu::diagonal diag, + int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, double* b, int64_t ldb) { char fside = (side==spral::ssids::cpu::SIDE_LEFT) ? 'L' : 'R'; char fuplo = (uplo==spral::ssids::cpu::FILL_MODE_LWR) ? 'L' : 'U'; char ftransa = (transa==spral::ssids::cpu::OP_N) ? 'N' : 'T';