From b04701346a2a85dc9b4ab63e30f64f4a45888f46 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Thu, 2 Jan 2025 14:23:04 -0700 Subject: [PATCH] ops: embiggen the ops framework to take a count argument of type size_t. related to https://github.com/open-mpi/ompi/pull/12226 related to https://github.com/open-mpi/ompi/issues/9194 Signed-off-by: Howard Pritchard --- ompi/mca/op/aarch64/op_aarch64.h | 4 +- ompi/mca/op/aarch64/op_aarch64_component.c | 10 +- ompi/mca/op/aarch64/op_aarch64_functions.c | 28 ++--- ompi/mca/op/avx/op_avx.h | 4 +- ompi/mca/op/avx/op_avx_component.c | 10 +- ompi/mca/op/avx/op_avx_functions.c | 101 ++++++++++--------- ompi/mca/op/base/op_base_find_available.c | 14 +-- ompi/mca/op/base/op_base_frame.c | 8 +- ompi/mca/op/base/op_base_functions.c | 62 ++++++------ ompi/mca/op/base/op_base_op_select.c | 32 +++--- ompi/mca/op/example/op_example.h | 6 +- ompi/mca/op/example/op_example_component.c | 10 +- ompi/mca/op/example/op_example_module_bxor.c | 6 +- ompi/mca/op/example/op_example_module_max.c | 12 ++- ompi/mca/op/op.h | 68 +++++++------ ompi/mpi/c/op_create.c | 3 + ompi/op/op.c | 6 +- ompi/op/op.h | 73 ++++++-------- 18 files changed, 244 insertions(+), 213 deletions(-) diff --git a/ompi/mca/op/aarch64/op_aarch64.h b/ompi/mca/op/aarch64/op_aarch64.h index 22be89614c6..895cb15ec94 100644 --- a/ompi/mca/op/aarch64/op_aarch64.h +++ b/ompi/mca/op/aarch64/op_aarch64.h @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2019 Arm Ltd. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -31,7 +33,7 @@ BEGIN_C_DECLS */ typedef struct { /** The base op component struct */ - ompi_op_base_component_1_0_0_t super; + ompi_op_base_component_2_0_0_t super; /* What follows is aarch64-component-specific cached information. We tend to use this scheme (caching information on the aarch64 diff --git a/ompi/mca/op/aarch64/op_aarch64_component.c b/ompi/mca/op/aarch64/op_aarch64_component.c index 92bf4fa3656..1b581d111f2 100644 --- a/ompi/mca/op/aarch64/op_aarch64_component.c +++ b/ompi/mca/op/aarch64/op_aarch64_component.c @@ -6,6 +6,8 @@ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * Copyright (c) 2024 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -34,7 +36,7 @@ static int mca_op_aarch64_component_open(void); static int mca_op_aarch64_component_close(void); static int mca_op_aarch64_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple); -static struct ompi_op_base_module_1_0_0_t * +static struct ompi_op_base_module_2_0_0_t * mca_op_aarch64_component_op_query(struct ompi_op_t *op, int *priority); static int mca_op_aarch64_component_register(void); @@ -43,7 +45,7 @@ ompi_op_aarch64_component_t mca_op_aarch64_component = { information about the component itself */ { .opc_version = { - OMPI_OP_BASE_VERSION_1_0_0, + OMPI_OP_BASE_VERSION_2_0_0, .mca_component_name = "aarch64", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, @@ -164,7 +166,7 @@ ompi_op_aarch64_3buff_functions_sve[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TY /* * Query whether this component can be used for a specific op */ -static struct ompi_op_base_module_1_0_0_t * +static struct ompi_op_base_module_2_0_0_t * mca_op_aarch64_component_op_query(struct ompi_op_t *op, int *priority) { /* Sanity check -- although the framework should never invoke the @@ -242,5 +244,5 @@ static struct ompi_op_base_module_1_0_0_t * if (NULL != module) { *priority = 50; } - return (ompi_op_base_module_1_0_0_t *) module; + return (ompi_op_base_module_2_0_0_t *) module; } diff --git a/ompi/mca/op/aarch64/op_aarch64_functions.c b/ompi/mca/op/aarch64/op_aarch64_functions.c index 5515ddb9567..de017defbf3 100644 --- a/ompi/mca/op/aarch64/op_aarch64_functions.c +++ b/ompi/mca/op/aarch64/op_aarch64_functions.c @@ -6,6 +6,8 @@ * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * Copyright (c) 2024 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * * $COPYRIGHT$ * @@ -114,11 +116,11 @@ _Generic((*(out)), \ #if defined(GENERATE_NEON_CODE) #define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op) \ static void OP_CONCAT(ompi_op_aarch64_2buff_##name##_##type##type_size##_t, \ - APPEND)(const void *_in, void *_out, int *count, \ + APPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int left_over = *count; \ + size_t left_over = *count; \ type##type_size##_t *in = (type##type_size##_t *) _in, \ *out = (type##type_size##_t *) _out; \ OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##x##type_cnt##_t) vsrc, vdst; \ @@ -138,12 +140,12 @@ _Generic((*(out)), \ #elif defined(GENERATE_SVE_CODE) #define OP_AARCH64_FUNC(name, type_name, type_size, type_cnt, type, op) \ static void OP_CONCAT(ompi_op_aarch64_2buff_##name##_##type##type_size##_t, APPEND) \ - (const void *_in, void *_out, int *count, \ + (const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ const int types_per_step = svcnt(*((type##type_size##_t *) _in)); \ - const int cnt = *count; \ + const size_t cnt = *count; \ type##type_size##_t *in = (type##type_size##_t *) _in, \ *out = (type##type_size##_t *) _out; \ OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \ @@ -279,11 +281,11 @@ _Generic((*(out)), \ #if defined(GENERATE_NEON_CODE) #define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op) \ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND) \ - (const void *_in1, const void *_in2, void *_out, int *count, \ + (const void *_in1, const void *_in2, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int left_over = *count; \ + size_t left_over = *count; \ type##type_size##_t *in1 = (type##type_size##_t *) _in1, \ *in2 = (type##type_size##_t *) _in2, \ *out = (type##type_size##_t *) _out; \ @@ -304,17 +306,17 @@ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPE #elif defined(GENERATE_SVE_CODE) #define OP_AARCH64_FUNC_3BUFF(name, type_name, type_size, type_cnt, type, op) \ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPEND) \ - (const void *_in1, const void *_in2, void *_out, int *count, \ + (const void *_in1, const void *_in2, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ const int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \ type##type_size##_t *in1 = (type##type_size##_t *) _in1, \ *in2 = (type##type_size##_t *) _in2, \ *out = (type##type_size##_t *) _out; \ - const int cnt = *count; \ + const size_t cnt = *count; \ OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \ - for (int idx=0; idx < cnt; idx += types_per_step) { \ + for (size_t idx=0; idx < cnt; idx += types_per_step) { \ svbool_t pred = svwhilelt_b##type_size(idx, cnt); \ vsrc = svld1(pred, &in1[idx]); \ vdst = svld1(pred, &in2[idx]); \ diff --git a/ompi/mca/op/avx/op_avx.h b/ompi/mca/op/avx/op_avx.h index 3e0d0fd4620..ba625549a0f 100644 --- a/ompi/mca/op/avx/op_avx.h +++ b/ompi/mca/op/avx/op_avx.h @@ -2,6 +2,8 @@ * Copyright (c) 2019-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,7 +39,7 @@ BEGIN_C_DECLS */ typedef struct { /** The base op component struct */ - ompi_op_base_component_1_0_0_t super; + ompi_op_base_component_2_0_0_t super; /* What follows is avx-component-specific cached information. We tend to use this scheme (caching information on the avx diff --git a/ompi/mca/op/avx/op_avx_component.c b/ompi/mca/op/avx/op_avx_component.c index a2f01a373e2..d5f76b9375e 100644 --- a/ompi/mca/op/avx/op_avx_component.c +++ b/ompi/mca/op/avx/op_avx_component.c @@ -5,6 +5,8 @@ * Copyright (c) 2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -32,7 +34,7 @@ static int avx_component_open(void); static int avx_component_close(void); static int avx_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple); -static struct ompi_op_base_module_1_0_0_t * +static struct ompi_op_base_module_2_0_0_t * avx_component_op_query(struct ompi_op_t *op, int *priority); static int avx_component_register(void); @@ -131,7 +133,7 @@ static uint32_t has_intel_AVX_features(void) ompi_op_avx_component_t mca_op_avx_component = { { .opc_version = { - OMPI_OP_BASE_VERSION_1_0_0, + OMPI_OP_BASE_VERSION_2_0_0, .mca_component_name = "avx", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, @@ -250,7 +252,7 @@ avx_component_init_query(bool enable_progress_threads, /* * Query whether this component can be used for a specific op */ -static struct ompi_op_base_module_1_0_0_t* +static struct ompi_op_base_module_2_0_0_t* avx_component_op_query(struct ompi_op_t *op, int *priority) { ompi_op_base_module_t *module = NULL; @@ -325,5 +327,5 @@ avx_component_op_query(struct ompi_op_t *op, int *priority) if (NULL != module) { *priority = 50; } - return (ompi_op_base_module_1_0_0_t *) module; + return (ompi_op_base_module_2_0_0_t *) module; } diff --git a/ompi/mca/op/avx/op_avx_functions.c b/ompi/mca/op/avx/op_avx_functions.c index d97d2d96338..8113d094927 100644 --- a/ompi/mca/op/avx/op_avx_functions.c +++ b/ompi/mca/op/avx/op_avx_functions.c @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -189,7 +191,7 @@ #if __AVX512F__ #define OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (512 / 8) / sizeof(type); \ + size_t types_per_step = (512 / 8) / sizeof(type); \ for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ __m512i vecA = _mm512_loadu_si512((__m512*)in); \ in += types_per_step; \ @@ -211,7 +213,7 @@ #if __AVX__ #define OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \ + size_t types_per_step = (256 / 8) / sizeof(type); /* AVX2 */ \ for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ __m256i vecA = _mm256_loadu_si256((__m256i*)in); \ in += types_per_step; \ @@ -233,7 +235,7 @@ #if __SSE3__ #define OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ - int types_per_step = (128 / 8) / sizeof(type); \ + size_t types_per_step = (128 / 8) / sizeof(type); \ for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ __m128i vecA = _mm_lddqu_si128((__m128i*)in); \ in += types_per_step; \ @@ -251,17 +253,17 @@ #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ #define OP_AVX_FUNC(name, type_sign, type_size, type, op) \ -static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, int *count, \ +static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int left_over = *count; \ + size_t left_over = *count; \ type *in = (type*)_in, *out = (type*)_out; \ OP_AVX_AVX512_FUNC(name, type_sign, type_size, type, op); \ OP_AVX_AVX2_FUNC(name, type_sign, type_size, type, op); \ OP_AVX_SSE4_1_FUNC(name, type_sign, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(out[7], in[7]); \ case 7: out[6] = current_func(out[6], in[6]); \ @@ -282,7 +284,7 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in #if __AVX512BW__ && __AVX__ #define OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ + size_t types_per_step = (256 / 8) / sizeof(type); \ for (; left_over >= types_per_step; left_over -= types_per_step) { \ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in); \ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)out); \ @@ -309,15 +311,15 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in /* special case for int8 mul */ #define OP_AVX_MUL(name, type_sign, type_size, type, op) \ -static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_in, void *_out, int *count, \ +static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int left_over = *count; \ + size_t left_over = *count; \ type *in = (type*)_in, *out = (type*)_out; \ OP_AVX_AVX512_MUL(name, type_sign, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(out[7], in[7]); \ case 7: out[6] = current_func(out[6], in[6]); \ @@ -406,17 +408,17 @@ static void OP_CONCAT( ompi_op_avx_2buff_##name##_##type, PREPEND)(const void *_ #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ #define OP_AVX_BIT_FUNC(name, type_size, type, op) \ -static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, int *count, \ +static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step, left_over = *count; \ + size_t types_per_step, left_over = *count; \ type *in = (type*)_in, *out = (type*)_out; \ OP_AVX_AVX512_BIT_FUNC(name, type_size, type, op); \ OP_AVX_AVX2_BIT_FUNC(name, type_size, type, op); \ OP_AVX_SSE3_BIT_FUNC(name, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(out[7], in[7]); \ case 7: out[6] = current_func(out[6], in[6]); \ @@ -499,17 +501,18 @@ static void OP_CONCAT(ompi_op_avx_2buff_##name##_##type,PREPEND)(const void *_in #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ #define OP_AVX_FLOAT_FUNC(op) \ -static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, void *_out, int *count, \ +static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step, left_over = *count; \ + size_t types_per_step; \ + size_t left_over = *count; \ float *in = (float*)_in, *out = (float*)_out; \ OP_AVX_AVX512_FLOAT_FUNC(op); \ OP_AVX_AVX_FLOAT_FUNC(op); \ OP_AVX_SSE_FLOAT_FUNC(op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(out[7], in[7]); \ case 7: out[6] = current_func(out[6], in[6]); \ @@ -592,19 +595,19 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_float,PREPEND)(const void *_in, v #endif /* defined(OMPI_MCA_OP_HAVE_AVX) && (1 == OMPI_MCA_OP_HAVE_AVX) */ #define OP_AVX_DOUBLE_FUNC(op) \ -static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, void *_out, int *count, \ +static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step = (512 / 8) / sizeof(double); \ - int left_over = *count; \ + size_t types_per_step = (512 / 8) / sizeof(double); \ + size_t left_over = *count; \ double* in = (double*)_in; \ double* out = (double*)_out; \ OP_AVX_AVX512_DOUBLE_FUNC(op); \ OP_AVX_AVX_DOUBLE_FUNC(op); \ OP_AVX_SSE2_DOUBLE_FUNC(op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(out[7], in[7]); \ case 7: out[6] = current_func(out[6], in[6]); \ @@ -759,7 +762,7 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, #if __AVX512F__ #define OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG|OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (512 / 8) / sizeof(type); \ + size_t types_per_step = (512 / 8) / sizeof(type); \ for (; left_over >= types_per_step; left_over -= types_per_step) { \ __m512i vecA = _mm512_loadu_si512(in1); \ __m512i vecB = _mm512_loadu_si512(in2); \ @@ -782,7 +785,7 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, #if __AVX__ #define OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX2_FLAG | OMPI_OP_AVX_HAS_AVX_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ + size_t types_per_step = (256 / 8) / sizeof(type); \ for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ __m256i vecA = _mm256_loadu_si256((__m256i*)in1); \ __m256i vecB = _mm256_loadu_si256((__m256i*)in2); \ @@ -805,7 +808,7 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, #if __SSE3__ && __SSE2__ #define OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_SSE3_FLAG | OMPI_OP_AVX_HAS_SSE4_1_FLAG) ) { \ - int types_per_step = (128 / 8) / sizeof(type); \ + size_t types_per_step = (128 / 8) / sizeof(type); \ for( ; left_over >= types_per_step; left_over -= types_per_step ) { \ __m128i vecA = _mm_lddqu_si128((__m128i*)in1); \ __m128i vecB = _mm_lddqu_si128((__m128i*)in2); \ @@ -826,17 +829,17 @@ static void OP_CONCAT(ompi_op_avx_2buff_##op##_double,PREPEND)(const void *_in, #define OP_AVX_FUNC_3(name, type_sign, type_size, type, op) \ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * restrict _in1, \ const void * restrict _in2, \ - void * restrict _out, int *count, \ + void * restrict _out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \ - int left_over = *count; \ + size_t left_over = *count; \ OP_AVX_AVX512_FUNC_3(name, type_sign, type_size, type, op); \ OP_AVX_AVX2_FUNC_3(name, type_sign, type_size, type, op); \ OP_AVX_SSE4_1_FUNC_3(name, type_sign, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(in1[7], in2[7]); \ case 7: out[6] = current_func(in1[6], in2[6]); \ @@ -858,7 +861,7 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re #if __AVX512BW__ && __AVX__ #define OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op) \ if( OMPI_OP_AVX_HAS_FLAGS(OMPI_OP_AVX_HAS_AVX512F_FLAG | OMPI_OP_AVX_HAS_AVX512BW_FLAG) ) { \ - int types_per_step = (256 / 8) / sizeof(type); \ + size_t types_per_step = (256 / 8) / sizeof(type); \ for (; left_over >= types_per_step; left_over -= types_per_step) { \ __m256i vecA_tmp = _mm256_loadu_si256((__m256i*)in1); \ __m256i vecB_tmp = _mm256_loadu_si256((__m256i*)in2); \ @@ -888,15 +891,15 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re #define OP_AVX_MUL_3(name, type_sign, type_size, type, op) \ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * restrict _in1, \ const void * restrict _in2, \ - void * restrict _out, int *count, \ + void * restrict _out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \ - int left_over = *count; \ + size_t left_over = *count; \ OP_AVX_AVX512_MUL_3(name, type_sign, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(in1[7], in2[7]); \ case 7: out[6] = current_func(in1[6], in2[6]); \ @@ -984,17 +987,17 @@ static void OP_CONCAT(ompi_op_avx_3buff_##name##_##type,PREPEND)(const void * re #define OP_AVX_BIT_FUNC_3(name, type_size, type, op) \ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, const void *_in2, \ - void *_out, int *count, \ + void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step, left_over = *count; \ + size_t types_per_step, left_over = *count; \ type *in1 = (type*)_in1, *in2 = (type*)_in2, *out = (type*)_out; \ OP_AVX_AVX512_BIT_FUNC_3(name, type_size, type, op); \ OP_AVX_AVX2_BIT_FUNC_3(name, type_size, type, op); \ OP_AVX_SSE3_BIT_FUNC_3(name, type_size, type, op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(in1[7], in2[7]); \ case 7: out[6] = current_func(in1[6], in2[6]); \ @@ -1082,17 +1085,17 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_##type,PREPEND)(const void *_in1, #define OP_AVX_FLOAT_FUNC_3(op) \ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, const void *_in2, \ - void *_out, int *count, \ + void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step, left_over = *count; \ + size_t types_per_step, left_over = *count; \ float *in1 = (float*)_in1, *in2 = (float*)_in2, *out = (float*)_out; \ OP_AVX_AVX512_FLOAT_FUNC_3(op); \ OP_AVX_AVX_FLOAT_FUNC_3(op); \ OP_AVX_SSE_FLOAT_FUNC_3(op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(in1[7], in2[7]); \ case 7: out[6] = current_func(in1[6], in2[6]); \ @@ -1180,17 +1183,17 @@ static void OP_CONCAT(ompi_op_avx_3buff_##op##_float,PREPEND)(const void *_in1, #define OP_AVX_DOUBLE_FUNC_3(op) \ static void OP_CONCAT(ompi_op_avx_3buff_##op##_double,PREPEND)(const void *_in1, const void *_in2, \ - void *_out, int *count, \ + void *_out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int types_per_step, left_over = *count; \ + size_t types_per_step, left_over = *count; \ double *in1 = (double*)_in1, *in2 = (double*)_in2, *out = (double*)_out; \ OP_AVX_AVX512_DOUBLE_FUNC_3(op); \ OP_AVX_AVX_DOUBLE_FUNC_3(op); \ OP_AVX_SSE2_DOUBLE_FUNC_3(op); \ while( left_over > 0 ) { \ - int how_much = (left_over > 8) ? 8 : left_over; \ + size_t how_much = (left_over > 8) ? 8 : left_over; \ switch(how_much) { \ case 8: out[7] = current_func(in1[7], in2[7]); \ case 7: out[6] = current_func(in1[6], in2[6]); \ diff --git a/ompi/mca/op/base/op_base_find_available.c b/ompi/mca/op/base/op_base_find_available.c index 3fdaf86f18c..21dfe3dc6f5 100644 --- a/ompi/mca/op/base/op_base_find_available.c +++ b/ompi/mca/op/base/op_base_find_available.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,7 +44,7 @@ static int init_query(const mca_base_component_t * ls, bool enable_progress_threads, bool enable_mpi_threads); -static int init_query_1_0_0(const mca_base_component_t * ls, +static int init_query_2_0_0(const mca_base_component_t * ls, bool enable_progress_threads, bool enable_mpi_threads); @@ -104,10 +106,10 @@ static int init_query(const mca_base_component_t * c, /* This component has already been successfully opened. So now query it. */ - if (1 == c->mca_type_major_version && + if (2 == c->mca_type_major_version && 0 == c->mca_type_minor_version && 0 == c->mca_type_release_version) { - ret = init_query_1_0_0(c, enable_progress_threads, + ret = init_query_2_0_0(c, enable_progress_threads, enable_mpi_threads); } else { /* Unrecognized op API version */ @@ -141,12 +143,12 @@ static int init_query(const mca_base_component_t * c, /* * Query a specific component, op v2.0.0 */ -static int init_query_1_0_0(const mca_base_component_t * component, +static int init_query_2_0_0(const mca_base_component_t * component, bool enable_progress_threads, bool enable_mpi_threads) { - ompi_op_base_component_1_0_0_t *op = - (ompi_op_base_component_1_0_0_t *) component; + ompi_op_base_component_2_0_0_t *op = + (ompi_op_base_component_2_0_0_t *) component; return op->opc_init_query(enable_progress_threads, enable_mpi_threads); diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c index 90167300851..5f15b30a7ba 100644 --- a/ompi/mca/op/base/op_base_frame.c +++ b/ompi/mca/op/base/op_base_frame.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,7 +48,7 @@ static void module_constructor(ompi_op_base_module_t *m) memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns)); } -static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m) +static void module_constructor_2_0_0(ompi_op_base_module_2_0_0_t *m) { m->opm_enable = NULL; m->opm_op = NULL; @@ -56,8 +58,8 @@ static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m) OBJ_CLASS_INSTANCE(ompi_op_base_module_t, opal_object_t, module_constructor, NULL); -OBJ_CLASS_INSTANCE(ompi_op_base_module_1_0_0_t, opal_object_t, - module_constructor_1_0_0, NULL); +OBJ_CLASS_INSTANCE(ompi_op_base_module_2_0_0_t, opal_object_t, + module_constructor_2_0_0, NULL); MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, NULL, mca_op_base_static_components, 0); diff --git a/ompi/mca/op/base/op_base_functions.c b/ompi/mca/op/base/op_base_functions.c index 411059e9ccc..cc5cf0e7e65 100644 --- a/ompi/mca/op/base/op_base_functions.c +++ b/ompi/mca/op/base/op_base_functions.c @@ -14,6 +14,8 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,11 +40,11 @@ * This macro is for (out op in). */ #define OP_FUNC(name, type_name, type, op) \ - static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type *a = (type *) in; \ type *b = (type *) out; \ for (i = *count; i > 0; i--) { \ @@ -58,11 +60,11 @@ * This macro is for (out = op(out, in)) */ #define FUNC_FUNC(name, type_name, type) \ - static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type *a = (type *) in; \ type *b = (type *) out; \ for (i = *count; i > 0; i--) { \ @@ -86,11 +88,11 @@ } ompi_op_predefined_##type_name##_t; #define LOC_FUNC(name, type_name, op) \ - static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, int *count, \ + static void ompi_op_base_2buff_##name##_##type_name(const void *in, void *out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ ompi_op_predefined_##type_name##_t *a = (ompi_op_predefined_##type_name##_t*) in; \ ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ for (i = *count; i > 0; i--, ++a, ++b) { \ @@ -110,11 +112,11 @@ * not supports the corresponding complex number type. */ #define COMPLEX_SUM_FUNC(type_name, type) \ - static void ompi_op_base_2buff_sum_##type_name(const void *in, void *out, int *count, \ + static void ompi_op_base_2buff_sum_##type_name(const void *in, void *out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type (*a)[2] = (type (*)[2]) in; \ type (*b)[2] = (type (*)[2]) out; \ for (i = *count; i > 0; i--, ++a, ++b) { \ @@ -130,11 +132,11 @@ * not supports the corresponding complex number type. */ #define COMPLEX_PROD_FUNC(type_name, type) \ - static void ompi_op_base_2buff_prod_##type_name(const void *in, void *out, int *count, \ + static void ompi_op_base_2buff_prod_##type_name(const void *in, void *out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type (*a)[2] = (type (*)[2]) in; \ type (*b)[2] = (type (*)[2]) out; \ type c[2]; \ @@ -685,11 +687,11 @@ LOC_FUNC(minloc, long_double_int, <) */ #define OP_FUNC_3BUF(name, type_name, type, op) \ static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ - const void * restrict in2, void * restrict out, int *count, \ + const void * restrict in2, void * restrict out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type *a1 = (type *) in1; \ type *a2 = (type *) in2; \ type *b = (type *) out; \ @@ -707,11 +709,11 @@ LOC_FUNC(minloc, long_double_int, <) */ #define FUNC_FUNC_3BUF(name, type_name, type) \ static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ - const void * restrict in2, void * restrict out, int *count, \ + const void * restrict in2, void * restrict out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type *a1 = (type *) in1; \ type *a2 = (type *) in2; \ type *b = (type *) out; \ @@ -740,11 +742,11 @@ LOC_FUNC(minloc, long_double_int, <) #define LOC_FUNC_3BUF(name, type_name, op) \ static void ompi_op_base_3buff_##name##_##type_name(const void * restrict in1, \ - const void * restrict in2, void * restrict out, int *count, \ + const void * restrict in2, void * restrict out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ ompi_op_predefined_##type_name##_t *a1 = (ompi_op_predefined_##type_name##_t*) in1; \ ompi_op_predefined_##type_name##_t *a2 = (ompi_op_predefined_##type_name##_t*) in2; \ ompi_op_predefined_##type_name##_t *b = (ompi_op_predefined_##type_name##_t*) out; \ @@ -770,11 +772,11 @@ LOC_FUNC(minloc, long_double_int, <) */ #define COMPLEX_SUM_FUNC_3BUF(type_name, type) \ static void ompi_op_base_3buff_sum_##type_name(const void * restrict in1, \ - const void * restrict in2, void * restrict out, int *count, \ + const void * restrict in2, void * restrict out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type (*a1)[2] = (type (*)[2]) in1; \ type (*a2)[2] = (type (*)[2]) in2; \ type (*b)[2] = (type (*)[2]) out; \ @@ -792,11 +794,11 @@ LOC_FUNC(minloc, long_double_int, <) */ #define COMPLEX_PROD_FUNC_3BUF(type_name, type) \ static void ompi_op_base_3buff_prod_##type_name(const void * restrict in1, \ - const void * restrict in2, void * restrict out, int *count, \ + const void * restrict in2, void * restrict out, size_t *count, \ struct ompi_datatype_t **dtype, \ - struct ompi_op_base_module_1_0_0_t *module) \ + struct ompi_op_base_module_2_0_0_t *module) \ { \ - int i; \ + size_t i; \ type (*a1)[2] = (type (*)[2]) in1; \ type (*a2)[2] = (type (*)[2]) in2; \ type (*b)[2] = (type (*)[2]) out; \ diff --git a/ompi/mca/op/base/op_base_op_select.c b/ompi/mca/op/base/op_base_op_select.c index c032172bf19..6576102216f 100644 --- a/ompi/mca/op/base/op_base_op_select.c +++ b/ompi/mca/op/base/op_base_op_select.c @@ -16,6 +16,8 @@ * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2020-2024 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -52,7 +54,7 @@ typedef struct avail_op_t { opal_list_item_t super; int ao_priority; - ompi_op_base_module_1_0_0_t *ao_module; + ompi_op_base_module_2_0_0_t *ao_module; } avail_op_t; @@ -63,15 +65,15 @@ static opal_list_t *check_components(opal_list_t *components, ompi_op_t *op); static int check_one_component(ompi_op_t *op, const mca_base_component_t *component, - ompi_op_base_module_1_0_0_t **module); + ompi_op_base_module_2_0_0_t **module); static int query(const mca_base_component_t *component, ompi_op_t *op, int *priority, - ompi_op_base_module_1_0_0_t **module); + ompi_op_base_module_2_0_0_t **module); -static int query_1_0_0(const ompi_op_base_component_1_0_0_t *op_component, +static int query_2_0_0(const ompi_op_base_component_2_0_0_t *op_component, ompi_op_t *op, int *priority, - ompi_op_base_module_1_0_0_t **module); + ompi_op_base_module_2_0_0_t **module); /* * Stuff for the OBJ interface @@ -232,7 +234,7 @@ static opal_list_t *check_components(opal_list_t *components, int priority; mca_base_component_list_item_t *cli; const mca_base_component_t *component; - ompi_op_base_module_1_0_0_t *module; + ompi_op_base_module_2_0_0_t *module; opal_list_t *selectable; avail_op_t *avail; @@ -270,7 +272,7 @@ static opal_list_t *check_components(opal_list_t *components, */ static int check_one_component(ompi_op_t *op, const mca_base_component_t *component, - ompi_op_base_module_1_0_0_t **module) + ompi_op_base_module_2_0_0_t **module) { int err; int priority = -1; @@ -304,16 +306,16 @@ static int check_one_component(ompi_op_t *op, */ static int query(const mca_base_component_t *component, ompi_op_t *op, - int *priority, ompi_op_base_module_1_0_0_t **module) + int *priority, ompi_op_base_module_2_0_0_t **module) { *module = NULL; - if (1 == component->mca_type_major_version && + if (2 == component->mca_type_major_version && 0 == component->mca_type_minor_version && 0 == component->mca_type_release_version) { - const ompi_op_base_component_1_0_0_t *op100 = - (ompi_op_base_component_1_0_0_t *) component; + const ompi_op_base_component_2_0_0_t *op100 = + (ompi_op_base_component_2_0_0_t *) component; - return query_1_0_0(op100, op, priority, module); + return query_2_0_0(op100, op, priority, module); } /* Unknown op API version -- return error */ @@ -322,11 +324,11 @@ static int query(const mca_base_component_t *component, } -static int query_1_0_0(const ompi_op_base_component_1_0_0_t *component, +static int query_2_0_0(const ompi_op_base_component_2_0_0_t *component, ompi_op_t *op, int *priority, - ompi_op_base_module_1_0_0_t **module) + ompi_op_base_module_2_0_0_t **module) { - ompi_op_base_module_1_0_0_t *ret; + ompi_op_base_module_2_0_0_t *ret; /* There's currently no need for conversion */ diff --git a/ompi/mca/op/example/op_example.h b/ompi/mca/op/example/op_example.h index eaaba98534d..b835bc08eb8 100644 --- a/ompi/mca/op/example/op_example.h +++ b/ompi/mca/op/example/op_example.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -36,7 +38,7 @@ BEGIN_C_DECLS */ typedef struct { /** The base op component struct */ - ompi_op_base_component_1_0_0_t super; + ompi_op_base_component_2_0_0_t super; /* What follows is example-component-specific cached information. We tend to use this scheme (caching information on the example @@ -60,7 +62,7 @@ typedef struct { * on the example component. */ typedef struct { - ompi_op_base_module_1_0_0_t super; + ompi_op_base_module_2_0_0_t super; /* Just like the ompi_op_example_component_t, this struct is meant to cache information on a per-module basis. What follows are diff --git a/ompi/mca/op/example/op_example_component.c b/ompi/mca/op/example/op_example_component.c index af2ac8f4c46..fc5f9baafb5 100644 --- a/ompi/mca/op/example/op_example_component.c +++ b/ompi/mca/op/example/op_example_component.c @@ -14,6 +14,8 @@ * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +45,7 @@ static int example_component_open(void); static int example_component_close(void); static int example_component_init_query(bool enable_progress_threads, bool enable_mpi_thread_multiple); -static struct ompi_op_base_module_1_0_0_t * +static struct ompi_op_base_module_2_0_0_t * example_component_op_query(struct ompi_op_t *op, int *priority); static int example_component_register(void); @@ -52,7 +54,7 @@ ompi_op_example_component_t mca_op_example_component = { information about the component itself */ { .opc_version = { - OMPI_OP_BASE_VERSION_1_0_0, + OMPI_OP_BASE_VERSION_2_0_0, .mca_component_name = "example", MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, @@ -248,7 +250,7 @@ static int example_component_init_query(bool enable_progress_threads, /* * Query whether this component can be used for a specific op */ -static struct ompi_op_base_module_1_0_0_t * +static struct ompi_op_base_module_2_0_0_t * example_component_op_query(struct ompi_op_t *op, int *priority) { ompi_op_base_module_t *module = NULL; @@ -307,5 +309,5 @@ static struct ompi_op_base_module_1_0_0_t * if (NULL != module) { *priority = 50; } - return (ompi_op_base_module_1_0_0_t *) module; + return (ompi_op_base_module_2_0_0_t *) module; } diff --git a/ompi/mca/op/example/op_example_module_bxor.c b/ompi/mca/op/example/op_example_module_bxor.c index 23a90ede488..fe34e8af5c4 100644 --- a/ompi/mca/op/example/op_example_module_bxor.c +++ b/ompi/mca/op/example/op_example_module_bxor.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,7 +43,7 @@ * on the example component. */ typedef struct { - ompi_op_base_module_1_0_0_t super; + ompi_op_base_module_2_0_0_t super; /* Just like the ompi_op_example_component_t, this struct is meant to cache information on a per-module basis. What follows are @@ -109,7 +111,7 @@ static OBJ_CLASS_INSTANCE(module_bxor_t, /** * Bxor function for C int */ -static void bxor_int(void *in, void *out, int *count, +static void bxor_int(void *in, void *out, size_t *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_bxor_t *m = (module_bxor_t*) module; diff --git a/ompi/mca/op/example/op_example_module_max.c b/ompi/mca/op/example/op_example_module_max.c index 4c43ecf22a4..588d0982b35 100644 --- a/ompi/mca/op/example/op_example_module_max.c +++ b/ompi/mca/op/example/op_example_module_max.c @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -41,7 +43,7 @@ * on the example component. */ typedef struct { - ompi_op_base_module_1_0_0_t super; + ompi_op_base_module_2_0_0_t super; /* Just like the ompi_op_example_component_t, this struct is meant to cache information on a per-module basis. What follows are @@ -118,7 +120,7 @@ static OBJ_CLASS_INSTANCE(module_max_t, /** * Max function for C float */ -static void max_float(void *in, void *out, int *count, +static void max_float(void *in, void *out, size_t *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -152,7 +154,7 @@ static void max_float(void *in, void *out, int *count, /** * Max function for C double */ -static void max_double(void *in, void *out, int *count, +static void max_double(void *in, void *out, size_t *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -166,7 +168,7 @@ static void max_double(void *in, void *out, int *count, /** * Max function for Fortran REAL */ -static void max_real(void *in, void *out, int *count, +static void max_real(void *in, void *out, size_t *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { module_max_t *m = (module_max_t*) module; @@ -180,7 +182,7 @@ static void max_real(void *in, void *out, int *count, /** * Max function for Fortran DOUBLE PRECISION */ -static void max_double_precision(void *in, void *out, int *count, +static void max_double_precision(void *in, void *out, size_t *count, ompi_datatype_t **type, ompi_op_base_module_t *module) { diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h index 34d26376ab9..c45bb8c1f13 100644 --- a/ompi/mca/op/op.h +++ b/ompi/mca/op/op.h @@ -17,6 +17,8 @@ * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -248,9 +250,9 @@ enum { * Pre-declare this so that we can pass it as an argument to the * typedef'ed functions. */ -struct ompi_op_base_module_1_0_0_t; +struct ompi_op_base_module_2_0_0_t; -typedef struct ompi_op_base_module_1_0_0_t ompi_op_base_module_t; +typedef struct ompi_op_base_module_2_0_0_t ompi_op_base_module_t; /** * Typedef for 2-buffer op functions. @@ -260,22 +262,22 @@ typedef struct ompi_op_base_module_1_0_0_t ompi_op_base_module_t; * repeated code, but it's better this way (and this typedef will * never change, so there's not much of a maintenance worry). */ -typedef void (*ompi_op_base_handler_fn_1_0_0_t)(const void *, void *, int *, +typedef void (*ompi_op_base_handler_fn_2_0_0_t)(const void *, void *, size_t *, struct ompi_datatype_t **, - struct ompi_op_base_module_1_0_0_t *); + struct ompi_op_base_module_2_0_0_t *); -typedef ompi_op_base_handler_fn_1_0_0_t ompi_op_base_handler_fn_t; +typedef ompi_op_base_handler_fn_2_0_0_t ompi_op_base_handler_fn_t; /* * Typedef for 3-buffer (two input and one output) op functions. */ -typedef void (*ompi_op_base_3buff_handler_fn_1_0_0_t)(const void *, +typedef void (*ompi_op_base_3buff_handler_fn_2_0_0_t)(const void *, const void *, - void *, int *, + void *, size_t *, struct ompi_datatype_t **, - struct ompi_op_base_module_1_0_0_t *); + struct ompi_op_base_module_2_0_0_t *); -typedef ompi_op_base_3buff_handler_fn_1_0_0_t ompi_op_base_3buff_handler_fn_t; +typedef ompi_op_base_3buff_handler_fn_2_0_0_t ompi_op_base_3buff_handler_fn_t; /** * Op component initialization @@ -322,8 +324,8 @@ typedef int (*ompi_op_base_component_init_query_fn_t) * provide a module with the requested functionality or NULL if the * component should not be used on the given communicator. */ -typedef struct ompi_op_base_module_1_0_0_t * - (*ompi_op_base_component_op_query_1_0_0_fn_t) +typedef struct ompi_op_base_module_2_0_0_t * + (*ompi_op_base_component_op_query_2_0_0_fn_t) (struct ompi_op_t *op, int *priority); /** @@ -333,7 +335,7 @@ typedef struct ompi_op_base_module_1_0_0_t * * this structure, called mca_op_[component_name]_component, must * exist in any op component. */ -typedef struct ompi_op_base_component_1_0_0_t { +typedef struct ompi_op_base_component_2_0_0_t { /** Base component description */ mca_base_component_t opc_version; /** Base component data block */ @@ -342,14 +344,14 @@ typedef struct ompi_op_base_component_1_0_0_t { /** Component initialization function */ ompi_op_base_component_init_query_fn_t opc_init_query; /** Query whether component is usable for given op */ - ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query; -} ompi_op_base_component_1_0_0_t; + ompi_op_base_component_op_query_2_0_0_fn_t opc_op_query; +} ompi_op_base_component_2_0_0_t; /** Per guidance in mca.h, use the unversioned struct name if you just want to always keep up with the most recent version of the interface. */ -typedef struct ompi_op_base_component_1_0_0_t ompi_op_base_component_t; +typedef struct ompi_op_base_component_2_0_0_t ompi_op_base_component_t; /** * Module initialization function. Should return OPAL_SUCCESS if @@ -357,20 +359,20 @@ typedef struct ompi_op_base_component_1_0_0_t ompi_op_base_component_t; * if the module doesn't need to do anything between the component * query function and being invoked for MPI_Op operations. */ -typedef int (*ompi_op_base_module_enable_1_0_0_fn_t) - (struct ompi_op_base_module_1_0_0_t *module, +typedef int (*ompi_op_base_module_enable_2_0_0_fn_t) + (struct ompi_op_base_module_2_0_0_t *module, struct ompi_op_t *op); /** * Module struct */ -typedef struct ompi_op_base_module_1_0_0_t { +typedef struct ompi_op_base_module_2_0_0_t { /** Op modules all inherit from opal_object */ opal_object_t super; /** Enable function called when an op module is (possibly) going to be used for the given MPI_Op */ - ompi_op_base_module_enable_1_0_0_fn_t opm_enable; + ompi_op_base_module_enable_2_0_0_fn_t opm_enable; /** Just for reference -- a pointer to the MPI_Op that this module is being used for */ @@ -378,9 +380,9 @@ typedef struct ompi_op_base_module_1_0_0_t { /** Function pointers for all the different datatypes to be used with the MPI_Op that this module is used with */ - ompi_op_base_handler_fn_1_0_0_t opm_fns[OMPI_OP_BASE_TYPE_MAX]; - ompi_op_base_3buff_handler_fn_1_0_0_t opm_3buff_fns[OMPI_OP_BASE_TYPE_MAX]; -} ompi_op_base_module_1_0_0_t; + ompi_op_base_handler_fn_2_0_0_t opm_fns[OMPI_OP_BASE_TYPE_MAX]; + ompi_op_base_3buff_handler_fn_2_0_0_t opm_3buff_fns[OMPI_OP_BASE_TYPE_MAX]; +} ompi_op_base_module_2_0_0_t; /** * Declare the module as a class, unversioned @@ -390,37 +392,37 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_op_base_module_t); /** * Declare the module as a class, unversioned */ -OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_op_base_module_1_0_0_t); +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_op_base_module_2_0_0_t); /** * Struct that is used in op.h to hold all the function pointers and * pointers to the corresopnding modules (so that we can properly * RETAIN/RELEASE them) */ -typedef struct ompi_op_base_op_fns_1_0_0_t { - ompi_op_base_handler_fn_1_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; +typedef struct ompi_op_base_op_fns_2_0_0_t { + ompi_op_base_handler_fn_2_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; ompi_op_base_module_t *modules[OMPI_OP_BASE_TYPE_MAX]; -} ompi_op_base_op_fns_1_0_0_t; +} ompi_op_base_op_fns_2_0_0_t; -typedef ompi_op_base_op_fns_1_0_0_t ompi_op_base_op_fns_t; +typedef ompi_op_base_op_fns_2_0_0_t ompi_op_base_op_fns_t; /** * Struct that is used in op.h to hold all the function pointers and * pointers to the corresopnding modules (so that we can properly * RETAIN/RELEASE them) */ -typedef struct ompi_op_base_op_3buff_fns_1_0_0_t { - ompi_op_base_3buff_handler_fn_1_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; +typedef struct ompi_op_base_op_3buff_fns_2_0_0_t { + ompi_op_base_3buff_handler_fn_2_0_0_t fns[OMPI_OP_BASE_TYPE_MAX]; ompi_op_base_module_t *modules[OMPI_OP_BASE_TYPE_MAX]; -} ompi_op_base_op_3buff_fns_1_0_0_t; +} ompi_op_base_op_3buff_fns_2_0_0_t; -typedef ompi_op_base_op_3buff_fns_1_0_0_t ompi_op_base_op_3buff_fns_t; +typedef ompi_op_base_op_3buff_fns_2_0_0_t ompi_op_base_op_3buff_fns_t; /* * Macro for use in modules that are of type op v2.0.0 */ -#define OMPI_OP_BASE_VERSION_1_0_0 \ - OMPI_MCA_BASE_VERSION_2_1_0("op", 1, 0, 0) +#define OMPI_OP_BASE_VERSION_2_0_0 \ + OMPI_MCA_BASE_VERSION_2_1_0("op", 2, 0, 0) END_C_DECLS diff --git a/ompi/mpi/c/op_create.c b/ompi/mpi/c/op_create.c index 28e00222ad9..1fafbea899a 100644 --- a/ompi/mpi/c/op_create.c +++ b/ompi/mpi/c/op_create.c @@ -12,6 +12,8 @@ * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -57,6 +59,7 @@ int MPI_Op_create(MPI_User_function * function, int commute, MPI_Op * op) /* Create and cache the op. Sets a refcount of 1. */ *op = ompi_op_create_user(OPAL_INT_TO_BOOL(commute), + false, (ompi_op_fortran_handler_fn_t *) function); if (NULL == *op) { err = MPI_ERR_INTERN; diff --git a/ompi/op/op.c b/ompi/op/op.c index 3977fa8b97b..c800dc0a1cb 100644 --- a/ompi/op/op.c +++ b/ompi/op/op.c @@ -17,7 +17,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights + * Copyright (c) 2018-2025 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -353,6 +353,7 @@ static int ompi_op_finalize (void) * Create a new MPI_Op */ ompi_op_t *ompi_op_create_user(bool commute, + bool bigcount, ompi_op_fortran_handler_fn_t func) { ompi_op_t *new_op; @@ -382,6 +383,9 @@ ompi_op_t *ompi_op_create_user(bool commute, if (commute) { new_op->o_flags |= OMPI_OP_FLAGS_COMMUTE; } + if(bigcount) { + new_op->o_flags |= OMPI_OP_FLAGS_BIGCOUNT; + } opal_string_copy(new_op->o_name, "USER OP", sizeof(new_op->o_name)); new_op->o_name[sizeof(new_op->o_name) - 1] = '\0'; diff --git a/ompi/op/op.h b/ompi/op/op.h index f3cf5b53636..0189b0240e6 100644 --- a/ompi/op/op.h +++ b/ompi/op/op.h @@ -18,7 +18,7 @@ * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Triad National Security, LLC. All rights + * Copyright (c) 2018-2025 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -61,12 +61,16 @@ BEGIN_C_DECLS */ typedef void (ompi_op_c_handler_fn_t)(const void *, void *, int *, struct ompi_datatype_t **); +typedef void (ompi_op_c_handler_bc_fn_t)(const void *, void *, size_t *, + struct ompi_datatype_t **); /** * Typedef for fortran user-defined MPI_Ops. */ typedef void (ompi_op_fortran_handler_fn_t)(const void *, void *, MPI_Fint *, MPI_Fint *); +typedef void (ompi_op_fortran_handler_bc_fn_t)(const void *, void *, + size_t *, MPI_Fint *); /** * Typedef for Java op functions intercept (used for user-defined @@ -98,8 +102,8 @@ typedef void (ompi_op_java_handler_fn_t)(const void *, void *, int *, #define OMPI_OP_FLAGS_FLOAT_ASSOC 0x0020 /** Set if the callback function is communative */ #define OMPI_OP_FLAGS_COMMUTE 0x0040 - - +/** Set if the callback function is using bigcount */ +#define OMPI_OP_FLAGS_BIGCOUNT 0x0080 /* @@ -152,8 +156,12 @@ struct ompi_op_t { ompi_op_base_op_fns_t intrinsic; /** C handler function pointer */ ompi_op_c_handler_fn_t *c_fn; + /** C handler function pointer - bigcount*/ + ompi_op_c_handler_bc_fn_t *c_fn_bc; /** Fortran handler function pointer */ ompi_op_fortran_handler_fn_t *fort_fn; + /** Fortran handler function pointer - bigcount*/ + ompi_op_fortran_handler_bc_fn_t *fort_fn_bc; /** Java intercept function data */ struct { /* The OMPI C++ callback/intercept function */ @@ -333,6 +341,8 @@ int ompi_op_init(void); * * @param commute Boolean indicating whether the operation is * communative or not + * @param bigcount Boolean indicating whether or not the op is + * using the bigcount (MPI_Count) interface * @param func Function pointer of the error handler * * @returns op Pointer to the ompi_op_t that will be @@ -355,6 +365,7 @@ int ompi_op_init(void); * manually. */ ompi_op_t *ompi_op_create_user(bool commute, + bool bigcount, ompi_op_fortran_handler_fn_t func); /** @@ -507,36 +518,6 @@ static inline void ompi_op_reduce(ompi_op_t * op, const void *source, MPI_Fint f_dtype, f_count; int count = full_count; - /* - * If the full_count is > INT_MAX then we need to call the reduction op - * in iterations of counts <= INT_MAX since it has an `int *len` - * parameter. - * - * Note: When we add BigCount support then we can distinguish between - * a reduction operation with `int *len` and `MPI_Count *len`. At which - * point we can avoid this loop. - */ - if( OPAL_UNLIKELY(full_count > INT_MAX) ) { - size_t done_count = 0, shift; - int iter_count; - ptrdiff_t ext, lb; - - ompi_datatype_get_extent(dtype, &lb, &ext); - - while(done_count < full_count) { - if(done_count + INT_MAX > full_count) { - iter_count = full_count - done_count; - } else { - iter_count = INT_MAX; - } - shift = done_count * ext; - // Recurse one level in iterations of 'int' - ompi_op_reduce(op, (const char*)source + shift, (char*)target + shift, iter_count, dtype); - done_count += iter_count; - } - return; - } - /* * Call the reduction function. Two dimensions: a) if both the op * and the datatype are intrinsic, we have a series of predefined @@ -570,7 +551,7 @@ static inline void ompi_op_reduce(ompi_op_t * op, const void *source, dtype_id = ompi_op_ddt_map[dtype->id]; } op->o_func.intrinsic.fns[dtype_id](source, target, - &count, &dtype, + &full_count, &dtype, op->o_func.intrinsic.modules[dtype_id]); return; } @@ -578,8 +559,12 @@ static inline void ompi_op_reduce(ompi_op_t * op, const void *source, /* User-defined function */ if (0 != (op->o_flags & OMPI_OP_FLAGS_FORTRAN_FUNC)) { f_dtype = OMPI_INT_2_FINT(dtype->d_f_to_c_index); - f_count = OMPI_INT_2_FINT(count); - op->o_func.fort_fn(source, target, &f_count, &f_dtype); + if (0 == (op->o_flags & OMPI_OP_FLAGS_BIGCOUNT)) { + f_count = OMPI_INT_2_FINT(count); + op->o_func.fort_fn(source, target, &f_count, &f_dtype); + } else { + op->o_func.fort_fn_bc(source, target, &full_count, &f_dtype); + } return; } else if (0 != (op->o_flags & OMPI_OP_FLAGS_JAVA_FUNC)) { op->o_func.java_data.intercept_fn(source, target, &count, &dtype, @@ -588,15 +573,23 @@ static inline void ompi_op_reduce(ompi_op_t * op, const void *source, op->o_func.java_data.object); return; } - op->o_func.c_fn(source, target, &count, &dtype); + if (0 == (op->o_flags & OMPI_OP_FLAGS_BIGCOUNT)) { + op->o_func.c_fn(source, target, &count, &dtype); + } else { + op->o_func.c_fn_bc(source, target, &full_count, &dtype); + } return; } static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, void * restrict source2, - void * restrict result, int count, struct ompi_datatype_t *dtype) + void * restrict result, size_t count, struct ompi_datatype_t *dtype) { ompi_datatype_copy_content_same_ddt (dtype, count, (char*)result, (char*)source1); - op->o_func.c_fn (source2, result, &count, &dtype); + if (0 == (op->o_flags & OMPI_OP_FLAGS_BIGCOUNT)) { + op->o_func.c_fn (source2, result, (int *)&count, &dtype); + } else { + op->o_func.c_fn_bc (source2, result, &count, &dtype); + } } /** @@ -624,7 +617,7 @@ static inline void ompi_3buff_op_user (ompi_op_t *op, void * restrict source1, v */ static inline void ompi_3buff_op_reduce(ompi_op_t * op, void *source1, void *source2, void *target, - int count, ompi_datatype_t * dtype) + size_t count, ompi_datatype_t * dtype) { void *restrict src1; void *restrict src2;