From 129bc8ab660ab570ba6fa20d86eee2477e1cdb44 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 1 Dec 2022 09:21:15 +0100 Subject: [PATCH] fabtests/multinode: Skip tests for unsupported collective operations Some collective providers do not support all collective operations. In such a case unsupported operations will not affect the final test result. fi_query_collective() is used to check operation support. A new function, test_query(), call fi_query_collective() with parameters defined in extended coll_test structure. The test is silently skipped if fi_query_collective returns -FI_ENOSYS. Signed-off-by: Tomasz Gromadzki --- fabtests/multinode/include/coll_test.h | 20 +-- fabtests/multinode/src/core_coll.c | 228 +++++++++++++------------ 2 files changed, 129 insertions(+), 119 deletions(-) diff --git a/fabtests/multinode/include/coll_test.h b/fabtests/multinode/include/coll_test.h index 92eae06804d..98bc8c3fd77 100644 --- a/fabtests/multinode/include/coll_test.h +++ b/fabtests/multinode/include/coll_test.h @@ -31,15 +31,13 @@ */ #pragma once - - -typedef int (*coll_test_setup_t)(); -typedef int (*coll_test_run_t)(); -typedef void (*coll_test_teardown_t)(); - struct coll_test { - char *name; - coll_test_setup_t setup; - coll_test_run_t run; - coll_test_teardown_t teardown; -}; \ No newline at end of file + char *name; + int (*setup)(void); + int (*run)(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype); + int (*teardown)(void); + enum fi_collective_op coll_op; + enum fi_op op; + enum fi_datatype datatype; +}; diff --git a/fabtests/multinode/src/core_coll.c b/fabtests/multinode/src/core_coll.c index 535c7df2d29..b342a897a2b 100644 --- a/fabtests/multinode/src/core_coll.c +++ b/fabtests/multinode/src/core_coll.c @@ -150,19 +150,17 @@ static int coll_setup_w_start_addr_stride(int start_addr, int stride) err = fi_av_set(av, &av_set_attr, &av_set, NULL); if (err) - FT_DEBUG("av_set creation failed ret = %d\n", err); + FT_PRINTERR("fi_av_set", err); err = fi_av_set_addr(av_set, &world_addr); if (err) { - FT_DEBUG("failed to get collective addr = %d (%s)\n", err, - fi_strerror(err)); + FT_PRINTERR("failed to get collective addr - fi_av_set_addr", err); return err; } err = fi_join_collective(ep, world_addr, av_set, 0, &coll_mc, &done_flag); if (err) { - FT_DEBUG("collective join failed ret = %d (%s)\n", err, - fi_strerror(err)); + FT_PRINTERR("fi_join_collective", err); return err; } @@ -179,13 +177,18 @@ static int coll_setup_w_stride() return coll_setup_w_start_addr_stride(1, 2); } -static void coll_teardown() +static int coll_teardown() { + int ret; if (!is_my_rank_participating()) - return; + return FI_SUCCESS; - fi_close(&coll_mc->fid); - fi_close(&av_set->fid); + ret = fi_close(&coll_mc->fid); + if (ret) + fi_close(&av_set->fid); + else + ret = fi_close(&av_set->fid); + return ret; } static int join_test_run() @@ -193,34 +196,38 @@ static int join_test_run() return FI_SUCCESS; } -static int barrier_test_run() +static int test_query(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) { struct fi_collective_attr attr; - uint64_t done_flag; - int err; - attr.op = FI_NOOP; - attr.datatype = FI_VOID; + attr.op = op; + attr.datatype = datatype; attr.mode = 0; - err = fi_query_collective(domain, FI_BARRIER, &attr, 0); - if (err) { - FT_DEBUG("barrier collective not supported: %d (%s)\n", err, - fi_strerror(err)); - return err; - } + return fi_query_collective(domain, coll_op, &attr, 0); +} + +static int barrier_test_run(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) +{ + uint64_t done_flag; + int err; + + assert(coll_op == FI_BARRIER); coll_addr = fi_mc_addr(coll_mc); err = fi_barrier(ep, coll_addr, &done_flag); if (err) { - FT_DEBUG("collective barrier failed: %d (%s)\n", err, fi_strerror(err)); + FT_PRINTERR("collective barrier failed - fi_barrier", err); return err; } return wait_for_comp(&done_flag); } -static int sum_all_reduce_test_run() +static int sum_all_reduce_test_run(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) { uint64_t done_flag; uint64_t result = 0; @@ -229,9 +236,12 @@ static int sum_all_reduce_test_run() const uint64_t base_data_value = 1234; /* any arbitrary value != 0 */ size_t count = 1; uint64_t i; - struct fi_collective_attr attr; int err; + assert(coll_op == FI_ALLREDUCE); + assert(op == FI_SUM); + assert(datatype == FI_UINT64); + if (!is_my_rank_participating()) return FI_SUCCESS; @@ -239,16 +249,6 @@ static int sum_all_reduce_test_run() // verifiable data = base_data_value + pm_job.my_rank; - attr.op = FI_SUM; - attr.datatype = FI_UINT64; - attr.mode = 0; - err = fi_query_collective(domain, FI_ALLREDUCE, &attr, 0); - if (err) { - FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", - err, fi_strerror(err)); - return err; - } - for (i = av_set_attr.start_addr; i <= av_set_attr.end_addr; i += av_set_attr.stride) { @@ -256,11 +256,10 @@ static int sum_all_reduce_test_run() } coll_addr = fi_mc_addr(coll_mc); - err = fi_allreduce(ep, &data, count, NULL, &result, NULL, coll_addr, FI_UINT64, - FI_SUM, 0, &done_flag); + err = fi_allreduce(ep, &data, count, NULL, &result, NULL, coll_addr, + FI_UINT64, FI_SUM, 0, &done_flag); if (err) { - FT_DEBUG("collective allreduce failed: %d (%s)\n", - err, fi_strerror(err)); + FT_PRINTERR("collective allreduce failed - fi_allreduce", err); return err; } @@ -271,12 +270,13 @@ static int sum_all_reduce_test_run() if (result == expect_result) return FI_SUCCESS; - FT_DEBUG("allreduce failed; expect: %ld, actual: %ld\n", + FT_DEBUG("allreduce failed; expect: %ld, actual: %ld", expect_result, result); return -FI_ENOEQ; } -static int all_gather_test_run() +static int all_gather_test_run(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) { uint64_t done_flag; uint64_t *result; @@ -284,19 +284,10 @@ static int all_gather_test_run() uint64_t data = pm_job.my_rank; size_t count = 1; uint64_t i; - struct fi_collective_attr attr; int ret; - attr.op = FI_NOOP; - attr.datatype = FI_UINT64; - attr.mode = 0; - - ret = fi_query_collective(domain, FI_ALLGATHER, &attr, 0); - if (ret) { - FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", ret, - fi_strerror(ret)); - return ret; - } + assert(coll_op == FI_ALLGATHER); + assert(datatype == FI_UINT64); result = malloc(pm_job.num_ranks * sizeof(*expect_result)); if (!result) @@ -315,8 +306,7 @@ static int all_gather_test_run() ret = fi_allgather(ep, &data, count, NULL, result, NULL, coll_addr, FI_UINT64, 0, &done_flag); if (ret) { - FT_DEBUG("collective allreduce failed: %d (%s)\n", - ret, fi_strerror(ret)); + FT_PRINTERR("collective allreduce failed:", ret); goto out; } @@ -342,27 +332,19 @@ static int all_gather_test_run() return ret; } -static int scatter_test_run() +static int scatter_test_run(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) { uint64_t done_flag; uint64_t result; uint64_t *data; uint64_t i; - struct fi_collective_attr attr; fi_addr_t root = 0; size_t data_size = pm_job.num_ranks * sizeof(*data); - int ret; + int err; - attr.op = FI_NOOP; - attr.datatype = FI_UINT64; - attr.mode = 0; - - ret = fi_query_collective(domain, FI_SCATTER, &attr, 0); - if (ret) { - FT_DEBUG("Scatter collective not supported: %d (%s)\n", ret, - fi_strerror(ret)); - return ret; - } + assert(coll_op == FI_SCATTER); + assert(datatype == FI_UINT64); data = malloc(data_size); if (!data) @@ -374,55 +356,47 @@ static int scatter_test_run() coll_addr = fi_mc_addr(coll_mc); if (pm_job.my_rank == root) { - ret = fi_scatter(ep, data, 1, NULL, &result, NULL, coll_addr, + err = fi_scatter(ep, data, 1, NULL, &result, NULL, coll_addr, root, FI_UINT64, 0, &done_flag); } else { - ret = fi_scatter(ep, NULL, 1, NULL, &result, NULL, coll_addr, + err = fi_scatter(ep, NULL, 1, NULL, &result, NULL, coll_addr, root, FI_UINT64, 0, &done_flag); } - if (ret) { - FT_DEBUG("collective scatter failed: %d (%s)\n", ret, fi_strerror(ret)); + if (err) { + FT_PRINTERR("collective scatter failed - fi_scatter", err); goto out; } - ret = wait_for_comp(&done_flag); - if (ret) + err = wait_for_comp(&done_flag); + if (err) goto out; if (data[pm_job.my_rank] != result) { - FT_DEBUG("scatter failed; expect: %ld, actual: %ld\n", + FT_DEBUG("scatter failed; expect: %ld, actual: %ld", data[pm_job.my_rank], result); - ret = -1; + err = -1; goto out; } - ret = FI_SUCCESS; + err = FI_SUCCESS; out: free(data); - return ret; + return err; } -static int broadcast_test_run() +static int broadcast_test_run(enum fi_collective_op coll_op, enum fi_op op, + enum fi_datatype datatype) { uint64_t done_flag; uint64_t *result, *data; uint64_t i; - struct fi_collective_attr attr; fi_addr_t root = 0; size_t data_cnt = pm_job.num_ranks; int err; - attr.op = FI_NOOP; - attr.datatype = FI_UINT64; - attr.mode = 0; - - err = fi_query_collective(domain, FI_BROADCAST, &attr, 0); - if (err) { - FT_DEBUG("Broadcast collective not supported: %d (%s)\n", err, - fi_strerror(err)); - return err; - } + assert(coll_op == FI_BROADCAST); + assert(datatype == FI_UINT64); result = malloc(data_cnt * sizeof(*result)); if (!result) @@ -446,8 +420,7 @@ static int broadcast_test_run() root, FI_UINT64, 0, &done_flag); } if (err) { - FT_DEBUG("broadcast scatter failed: %d (%s)\n", - err, fi_strerror(err)); + FT_PRINTERR("broadcast scatter failed - fi_broadcast", err); goto out; } @@ -481,48 +454,71 @@ struct coll_test tests[] = { .name = "join_test", .setup = coll_setup, .run = join_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_BARRIER, + .op = FI_NOOP, + .datatype = FI_VOID, }, { .name = "barrier_test", .setup = coll_setup, .run = barrier_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_BARRIER, + .op = FI_NOOP, + .datatype = FI_VOID, }, { .name = "sum_all_reduce_test", .setup = coll_setup, .run = sum_all_reduce_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_ALLREDUCE, + .op = FI_SUM, + .datatype = FI_UINT64, }, { .name = "sum_all_reduce_w_stride_test", .setup = coll_setup_w_stride, .run = sum_all_reduce_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_ALLREDUCE, + .op = FI_SUM, + .datatype = FI_UINT64, }, { .name = "all_gather_test", .setup = coll_setup, .run = all_gather_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_ALLGATHER, + .op = FI_NOOP, + .datatype = FI_UINT64, }, { .name = "scatter_test", .setup = coll_setup, .run = scatter_test_run, - .teardown = coll_teardown + .teardown = coll_teardown, + .coll_op = FI_SCATTER, + .op = FI_NOOP, + .datatype = FI_UINT64 }, { .name = "broadcast_test", .setup = coll_setup, .run = broadcast_test_run, .teardown = coll_teardown, + .coll_op = FI_BROADCAST, + .op = FI_NOOP, + .datatype = FI_UINT64 + }, + { + .name = "empty_test_to_stop_the_sequence_of_execution", + .run = NULL, }, }; -const int NUM_TESTS = ARRAY_SIZE(tests); - static inline void setup_hints() { hints->ep_attr->type = FI_EP_RDM; @@ -610,29 +606,45 @@ static void pm_job_free_res() int multinode_run_tests(int argc, char **argv) { + struct coll_test *test; int ret = FI_SUCCESS; - int i; ret = multinode_setup_fabric(argc, argv); if (ret) return ret; - for (i = 0; i < NUM_TESTS && !ret; i++) { - FT_DEBUG("Running Test: %s \n", tests[i].name); + for (test = tests; test->run && !ret; test++) { + FT_DEBUG("Running Test: %s", test->name); + ret = test_query(test->coll_op, test->op, test->datatype); + if (ret) { + FT_DEBUG("Test skipped: operation %s not supported.", + test->name); + ret = FI_SUCCESS; + continue; + } - ret = tests[i].setup(); - FT_DEBUG("Setup Complete...\n"); - if (ret) + ret = test->setup(); + if (ret) { + FT_DEBUG("Setup Failed..."); goto out; + } + FT_DEBUG("Setup Complete..."); - ret = tests[i].run(); - if (ret) + ret = test->run(test->coll_op, test->op, test->datatype); + + if (ret) { + FT_DEBUG("Test Failed: %s", test->name); goto out; + } pm_barrier(); - tests[i].teardown(); - FT_DEBUG("Run Complete...\n"); - FT_DEBUG("Test Complete: %s \n", tests[i].name); + ret = test->teardown(); + if (ret) { + FT_DEBUG("Teardown Failed..."); + goto out; + } + FT_DEBUG("Run Complete..."); + FT_DEBUG("Test Complete: %s", tests->name); } out: